16.5. Feature Selection and Evaluation#

16.5.1. PCA#

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
%matplotlib inline 
# Seed NumPy's global random generator so the noise terms are reproducible.
# This has to be a call: assigning to np.random.seed would replace the
# seeding function with an integer and leave the generator unseeded.
np.random.seed(1)

N = 1000   # number of samples
fs = 500   # samples per unit of time (t = sample index / fs)
w = np.arange(1, N + 1) * 2 * np.pi / fs
t = np.arange(1, N + 1) / fs

# Two underlying source signals: a sine (5 cycles per unit time) and a
# sawtooth with width 0.5 (7 cycles per unit time).
x = 0.75 * np.sin(w * 5)
y = signal.sawtooth(w * 7, 0.5)

# Five observed channels of shape (1, N): four noisy mixtures of the two
# sources, plus one channel (d5) that is pure uniform noise.
d1 = 0.5 * y + 0.5 * x + 0.1 * np.random.rand(1, N)
d2 = 0.2 * y + 0.75 * x + 0.15 * np.random.rand(1, N)
d3 = 0.7 * y + 0.25 * x + 0.1 * np.random.rand(1, N)
d4 = -0.5 * y + 0.4 * x + 0.2 * np.random.rand(1, N)
d5 = 0.6 * np.random.rand(1, N)

# Remove the mean of every channel before the decomposition.
d1 = d1 - d1.mean()
d2 = d2 - d2.mean()
d3 = d3 - d3.mean()
d4 = d4 - d4.mean()
d5 = d5 - d5.mean()
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[1], line 3
      1 from IPython.core.interactiveshell import InteractiveShell
      2 InteractiveShell.ast_node_interactivity = "all"
----> 3 import numpy as np
      4 from scipy import signal
      5 import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'numpy'
# d1 has shape (1, N); transpose it so the samples run along the x axis.
plt.plot(d1.T)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[2], line 1
----> 1 plt.plot(d1.transpose())

NameError: name 'plt' is not defined
# Source signal x (the scaled sine) against the time axis t.
plt.plot(t, x)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[3], line 1
----> 1 plt.plot(t, x)

NameError: name 'plt' is not defined
# Source signal y (the sawtooth) against the time axis t.
plt.plot(t, y)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[4], line 1
----> 1 plt.plot(t, y)

NameError: name 'plt' is not defined
import numpy as np
# Stack the five (1, N) channels into one data matrix with a row per channel.
X = np.vstack((d1, d2, d3, d4, d5))
X
X.shape
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[5], line 1
----> 1 import numpy as np
      2 X = np.array([d1[0], d2[0], d3[0], d4[0], d5[0]])
      3 X

ModuleNotFoundError: No module named 'numpy'
# np.linalg.svd returns the right singular vectors as the ROWS of its third
# output (i.e. V transposed), unlike MATLAB's svd. Transpose it so that
# V[:, i] is the i-th principal component, which is how V is indexed in the
# cells that follow.
# full_matrices=False keeps only the components that have a singular value
# (one per row of X) instead of building a full N x N matrix.
U, S, Vh = np.linalg.svd(X, full_matrices=False)
V = Vh.T
S
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[6], line 1
----> 1 U,S,V = np.linalg.svd(X)
      2 S

NameError: name 'np' is not defined
# Scale each of the first five columns of V by the square root of the
# matching singular value (broadcast across the rows, in place).
V[:, :5] *= np.sqrt(S[:5])
# Squared singular values.
eigen = np.square(S)
eigen
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[7], line 2
      1 for i in range(5):
----> 2     V[:,i] = V[:,i] * np.sqrt(S[i])
      3 eigen = S**2
      4 eigen

NameError: name 'V' is not defined
# Divide the squared singular values by the number of samples N.
eigen = eigen/N
# Normalize so the entries sum to 1; each entry is then a fraction of the
# total (the division by N above cancels out in this ratio).
eigen = eigen/sum(eigen)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[8], line 1
----> 1 eigen = eigen/N
      2 eigen = eigen/sum(eigen)

NameError: name 'eigen' is not defined

16.5.1.1. Scree plot#

A scree plot shows the importance of each principal component, measured as the fraction of the total variance (information) in the data that the component accounts for.

# Normalized eigenvalue of each of the five components, numbered 1..5.
plt.plot(range(1,6), eigen)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[9], line 1
----> 1 plt.plot(range(1,6), eigen)

NameError: name 'plt' is not defined
# Column 0 of V (scaled earlier by sqrt(S[0])).
plt.plot(V[:,0])
plt.show()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[10], line 1
----> 1 plt.plot(V[:,0])
      2 plt.show()

NameError: name 'plt' is not defined
# Column 1 of V (scaled earlier by sqrt(S[1])).
plt.plot(V[:,1])
plt.show()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[11], line 1
----> 1 plt.plot(V[:,1])
      2 plt.show()

NameError: name 'plt' is not defined
# Column 2 of V (scaled earlier by sqrt(S[2])).
plt.plot(V[:,2])
plt.show()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[12], line 1
----> 1 plt.plot(V[:,2])
      2 plt.show()

NameError: name 'plt' is not defined

16.5.1.2. PCA on the IRIS Data#

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.decomposition import PCA

# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features.
y = iris.target

# Pad the axis limits by half a unit so no point sits on the frame.
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5

plt.figure(2, figsize=(8, 6))
plt.clf()

# Plot the training points, colored by class label
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Set1,
            edgecolor='k')
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')

plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())

# To get a better understanding of the interaction of the dimensions,
# plot the first three PCA dimensions
fig = plt.figure(1, figsize=(8, 6))
# Create the 3-D axes through the figure: constructing Axes3D(fig) directly
# no longer attaches the axes to the figure in current Matplotlib releases,
# which leaves the plot empty.
ax = fig.add_subplot(111, projection="3d", elev=-150, azim=110)
# PCA is fitted on all four iris features, not just the two plotted above.
X_reduced = PCA(n_components=3).fit_transform(iris.data)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=y,
           cmap=plt.cm.Set1, edgecolor='k', s=40)
ax.set_title("First three PCA directions")
# The w_xaxis / w_yaxis / w_zaxis aliases were removed from Matplotlib;
# xaxis / yaxis / zaxis are the supported attributes.
ax.set_xlabel("1st eigenvector")
ax.xaxis.set_ticklabels([])
ax.set_ylabel("2nd eigenvector")
ax.yaxis.set_ticklabels([])
ax.set_zlabel("3rd eigenvector")
ax.zaxis.set_ticklabels([])

plt.show()
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[13], line 1
----> 1 import matplotlib.pyplot as plt
      2 from mpl_toolkits.mplot3d import Axes3D
      3 from sklearn import datasets

ModuleNotFoundError: No module named 'matplotlib'
iris = datasets.load_iris()
# First 50 samples, all four features.
X = iris.data[:50, :]
# A second copy of the same features with small uniform noise added.
X2 = X + 0.05 * np.random.rand(50, 4)
# Original features in columns 0-3, noisy copies in columns 4-7.
X_combined = np.hstack((X, X2))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[14], line 1
----> 1 iris = datasets.load_iris()
      2 X = iris.data[:50,:] 
      3 X2 = X +0.05*np.random.rand(50,4)

NameError: name 'datasets' is not defined
# Column-wise mean of the eight combined features, before any scaling.
X_combined.mean(axis=0)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[15], line 1
----> 1 X_combined.mean(axis=0)

NameError: name 'X_combined' is not defined
from sklearn import preprocessing

# Standardize every column to zero mean and unit variance.
X_scaled = preprocessing.scale(X_combined)
# Check the result on the *scaled* data: the column means should now be ~0.
# (X_combined itself is left unchanged by preprocessing.scale.)
X_scaled.mean(axis=0)
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[16], line 1
----> 1 from sklearn import preprocessing
      2 X_scaled = preprocessing.scale(X_combined)
      3 X_combined.mean(axis=0)  

ModuleNotFoundError: No module named 'sklearn'
# Singular value decomposition of the standardized data matrix.
U,S,V = np.linalg.svd(X_scaled)
# Display the singular values.
S
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[17], line 1
----> 1 U,S,V = np.linalg.svd(X_scaled)
      2 S

NameError: name 'np' is not defined
# Squared singular values.
eigen = S**2
# 50 is the number of rows (samples) in the combined data matrix.
eigen = eigen/50
# Normalize so the entries sum to 1 (each is a fraction of the total).
eigen = eigen/sum(eigen)
# Round to two decimals for display.
eigen = np.round(eigen*100)/100
print(eigen)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[18], line 1
----> 1 eigen = S**2
      2 eigen = eigen/50
      3 eigen = eigen/sum(eigen)

NameError: name 'S' is not defined
# Sanity check that these four fractions add up to 1
# (presumably the leading values printed above - confirm).
sum([0.51, 0.26, 0.17, 0.06])
1.0
# Scree plot for the eight components of the combined data, numbered 1..8.
plt.plot(range(1, 9), eigen)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[20], line 1
----> 1 plt.plot(range(1, 9), eigen)

NameError: name 'plt' is not defined
# Fit a 3-component PCA on the standardized data and project it in one step.
X_reduced = PCA(n_components=3).fit_transform(X_scaled)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[21], line 1
----> 1 X_reduced = PCA(n_components=3).fit_transform(X_scaled)

NameError: name 'PCA' is not defined

16.5.2. K-Means Clustering#

# https://towardsdatascience.com/machine-learning-algorithms-part-9-k-means-example-in-python-f2ad05ed5203
# https://blog.floydhub.com/introduction-to-k-means-clustering-in-python-with-scikit-learn/
from sklearn.datasets import make_blobs
plt.title("Three blobs", fontsize='small')
# Generate 2-feature points around 3 centers; Y1 holds each point's blob label.
# random_state is fixed so the same points are generated on every run.
X1, Y1 = make_blobs(n_features=2, centers=3, random_state=10)
# Scatter the points, colored by their generating blob (Y1).
plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1,
            s=25, edgecolor='k')
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[22], line 3
      1 # https://towardsdatascience.com/machine-learning-algorithms-part-9-k-means-example-in-python-f2ad05ed5203
      2 # https://blog.floydhub.com/introduction-to-k-means-clustering-in-python-with-scikit-learn/
----> 3 from sklearn.datasets import make_blobs
      4 plt.title("Three blobs", fontsize='small')
      5 X1, Y1 = make_blobs(n_features=2, centers=3, random_state=10)

ModuleNotFoundError: No module named 'sklearn'

16.5.3. Using scikit-learn to perform K-Means clustering#

from sklearn.cluster import KMeans
    
# Specify the number of clusters (3) and fit the blob data X1
kmeans = KMeans(n_clusters=3, random_state=0).fit(X1)

# Plot the data points on a 2D plane, colored by their generating blob (Y1),
# not by the cluster K-Means assigned them to.
plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1,
            s=25, edgecolor='k')

# Overlay the fitted cluster centroids as red crosses.
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c='red', marker='x')

plt.title('Data points and cluster centroids')
plt.show()
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[23], line 1
----> 1 from sklearn.cluster import KMeans
      3 # Specify the number of clusters (3) and fit the data X
      4 kmeans = KMeans(n_clusters=3, random_state=0).fit(X1)

ModuleNotFoundError: No module named 'sklearn'

16.5.4. Evaluating Algorithms#

from sklearn.datasets import load_breast_cancer
# Keep the full dataset object around (its feature names are used later) and
# split out the feature matrix X and the class labels Y.
data = load_breast_cancer()
X, Y = data.data, data.target

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[24], line 1
----> 1 from sklearn.datasets import load_breast_cancer
      2 data = load_breast_cancer()
      3 X = data.data

ModuleNotFoundError: No module named 'sklearn'
from sklearn.model_selection import train_test_split
# Hold out 25% of the samples for testing; random_state fixes the split.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 0)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Fit the scaler on the training split only, then apply that same
# transformation to the test split.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, Y_train)
# Predicted class labels for the held-out samples.
Y_pred = classifier.predict(X_test)

# Fraction of test samples whose predicted label matches the true label.
accuracy = accuracy_score(Y_test, Y_pred)
accuracy
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[25], line 1
----> 1 from sklearn.model_selection import train_test_split
      2 X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 0)
      3 from sklearn.preprocessing import StandardScaler

ModuleNotFoundError: No module named 'sklearn'
cm = confusion_matrix(Y_test, Y_pred)
cm
# tn, fp, fn, tp = confusion_matrix(Y_test, Y_pred).ravel()
# Order of the raveled binary matrix: TN, FP, FN, TP
#
# The notes below are comments so that this code cell parses:
#
# - Sensitivity refers to a test's ability to designate an individual with
#   disease as positive. A highly sensitive test means that there are few
#   false negative results, and thus fewer cases of disease are missed.
# - The specificity of a test is its ability to designate an individual who
#   does not have a disease as negative.
# - https://en.wikipedia.org/wiki/Confusion_matrix
  Cell In[26], line 7
    - Sensitivity refers to a test's ability to designate an individual with disease as positive. A highly sensitive test means that there are few false negative results, and thus fewer cases of disease are missed.
                                  ^
SyntaxError: unterminated string literal (detected at line 7)
# Unpack the 2x2 binary confusion matrix into its four counts.
tn, fp, fn, tp = confusion_matrix(Y_test, Y_pred).ravel()
tn
fp
fn
tp

# Hypothetical imbalanced example: 380 benign (negative) and 20 malignant
# (positive) samples, with a classifier that labels every sample benign.
# NOTE(review): FP and FN were never assigned in the original notes; the
# values below are the only ones consistent with those class counts - confirm.
TN = 380
FP = 0
FN = 20
TP = 0
TN, FP, FN, TP

# scikit-learn lays the binary confusion matrix out as
#   [[True Negative, False Positive],
#    [False Negative, True Positive]]
# https://stackoverflow.com/questions/56078203/why-scikit-learn-confusion-matrix-is-reversed
# Text summary of per-class metrics for the current predictions in Y_pred.
results = classification_report(Y_test, Y_pred)
print(results)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[27], line 1
----> 1 results = classification_report(Y_test, Y_pred)
      2 print(results)

NameError: name 'classification_report' is not defined
# precision measures how accurate our positive predictions were
# precision = tp / (tp+fp)

# recall measures what fraction of the positives our model identified
# recall = tp / (tp+fn) -- same as sensitivity

16.5.4.1. Lots of Classifiers#

from sklearn.svm import SVC
# Support vector classifier with an RBF kernel, trained on the same split.
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
# Accuracy and confusion matrix on the held-out samples.
accuracy = accuracy_score(Y_test, Y_pred)
accuracy
cm = confusion_matrix(Y_test, Y_pred)
cm
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[28], line 1
----> 1 from sklearn.svm import SVC
      2 classifier = SVC(kernel = 'rbf', random_state = 0)
      3 classifier.fit(X_train, Y_train)

ModuleNotFoundError: No module named 'sklearn'
# Per-class metrics for the predictions currently stored in Y_pred.
results = classification_report(Y_test, Y_pred)
print(results)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[29], line 1
----> 1 results = classification_report(Y_test, Y_pred)
      2 print(results)

NameError: name 'classification_report' is not defined
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings, trained on the same split.
classifier = GaussianNB()
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
# Accuracy and confusion matrix on the held-out samples.
accuracy = accuracy_score(Y_test, Y_pred)
accuracy
cm = confusion_matrix(Y_test, Y_pred)
cm
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[30], line 1
----> 1 from sklearn.naive_bayes import GaussianNB
      2 classifier = GaussianNB()
      3 classifier.fit(X_train, Y_train)

ModuleNotFoundError: No module named 'sklearn'
# Per-class metrics for the predictions currently stored in Y_pred.
results = classification_report(Y_test, Y_pred)
print(results)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[31], line 1
----> 1 results = classification_report(Y_test, Y_pred)
      2 print(results)

NameError: name 'classification_report' is not defined
from sklearn.tree import DecisionTreeClassifier
# Decision tree that uses entropy as its split criterion.
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
# Accuracy and confusion matrix on the held-out samples.
accuracy = accuracy_score(Y_test, Y_pred)
accuracy
cm = confusion_matrix(Y_test, Y_pred)
cm
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[32], line 1
----> 1 from sklearn.tree import DecisionTreeClassifier
      2 classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
      3 classifier.fit(X_train, Y_train)

ModuleNotFoundError: No module named 'sklearn'
# Per-class metrics for the predictions currently stored in Y_pred.
results = classification_report(Y_test, Y_pred)
print(results)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[33], line 1
----> 1 results = classification_report(Y_test, Y_pred)
      2 print(results)

NameError: name 'classification_report' is not defined
from sklearn.ensemble import RandomForestClassifier
# Random forest of 10 trees, each split using the entropy criterion.
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
# Accuracy and confusion matrix on the held-out samples.
accuracy = accuracy_score(Y_test, Y_pred)
accuracy
cm = confusion_matrix(Y_test, Y_pred)
cm

from sklearn.metrics import fbeta_score
# F-beta score with beta=0.5, macro-averaged over the classes.
output = fbeta_score(Y_test, Y_pred, average='macro', beta=0.5)
output
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[34], line 1
----> 1 from sklearn.ensemble import RandomForestClassifier
      2 classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
      3 classifier.fit(X_train, Y_train)

ModuleNotFoundError: No module named 'sklearn'

16.5.4.1.1. Feature Importance#

# Pair every importance score with its feature name, then list the features
# from most to least important.
importances = zip(classifier.feature_importances_, data.feature_names)
for score, name in sorted(importances, reverse=True):
    print(round(score, 2), name)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[35], line 1
----> 1 for score, name in sorted(zip(classifier.feature_importances_, data.feature_names), reverse=True):
      2     print(round(score, 2), name)

NameError: name 'classifier' is not defined
# Per-class metrics for the predictions currently stored in Y_pred.
results = classification_report(Y_test, Y_pred)
print(results)
# Re-split the data, this time holding out 20% of the samples for testing.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)
# Fit a fresh scaler on the new training split and reuse it for the test split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Train logistic regression again on the new split.
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)

# Accuracy and confusion matrix on the new held-out samples.
accuracy = accuracy_score(Y_test, Y_pred)
accuracy
cm = confusion_matrix(Y_test, Y_pred)
cm
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[36], line 1
----> 1 X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)
      2 sc = StandardScaler()
      3 X_train = sc.fit_transform(X_train)

NameError: name 'train_test_split' is not defined
from sklearn.metrics import classification_report
# Per-class metrics for the predictions currently stored in Y_pred.
results = classification_report(Y_test, Y_pred)
print(results)
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[37], line 1
----> 1 from sklearn.metrics import classification_report
      2 results = classification_report(Y_test, Y_pred)
      3 print(results)

ModuleNotFoundError: No module named 'sklearn'
from sklearn import metrics
from sklearn.metrics import roc_auc_score

# ROC analysis needs a continuous score per sample so that the decision
# threshold can be swept. Hard 0/1 predictions give a single operating point
# and therefore a degenerate curve and a misleading AUC, so use the predicted
# probability of the positive class (column 1) instead of Y_pred.
Y_score = classifier.predict_proba(X_test)[:, 1]

# Area under the ROC curve.
roc_auc = roc_auc_score(Y_test, Y_score)

# False-positive rate and true-positive rate at every threshold.
fpr, tpr, thresholds = metrics.roc_curve(Y_test, Y_score)
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[38], line 1
----> 1 from sklearn.metrics import roc_auc_score
      3 roc_auc = roc_auc_score(Y_test, Y_pred)
      6 from sklearn import metrics

ModuleNotFoundError: No module named 'sklearn'
plt.figure()
lw = 2  # line width shared by both curves
# ROC curve of the classifier, with its AUC shown in the legend.
plt.plot(
    fpr,
    tpr,
    color="darkorange",
    lw=lw,
    label="ROC curve (area = %0.2f)" % roc_auc,
)
# Dashed diagonal from (0, 0) to (1, 1) as a reference line.
plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
plt.xlim([0.0, 1.0])
# Leave a little headroom above 1.0 on the y axis.
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic example")
plt.legend(loc="lower right")
plt.show()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[39], line 1
----> 1 plt.figure()
      2 lw = 2
      3 plt.plot(
      4     fpr,
      5     tpr,
   (...)      8     label="ROC curve (area = %0.2f)" % roc_auc,
      9 )

NameError: name 'plt' is not defined

16.5.5. Imbalanced Data sets#

16.5.6. Stratify#

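stratify – pass `stratify=Y` to `train_test_split` so that the training and test sets keep the same class proportions as the full data set.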

16.5.7. Vocabulary#

  • Supervised Learning

  • Unsupervised Learning

  • Classification

  • Prediction

  • Clustering

  • Cross-Validation

  • Dimensionality Reduction (curse of dimensionality)

  • Feature Selection

  • Accuracy

  • True Positive

  • True Negative

  • False Positive

  • False Negative

  • Confusion Matrix

  • Sensitivity

  • Specificity

  • Recall

  • Precision

  • F1-Score

  • Imbalanced Data

  • Area Under the Curve (Sensitivity/Specificity and Precision/Recall)

  • Underfitting

  • Overfitting – reduced via regularization (reduce degrees of freedom), early stopping (stop when validation error reaches minimum)

  • Bias – Error due to wrong assumptions, e.g., assuming the data is linear when it is actually nonlinear; high bias results in underfitting the training data

  • Variance – Excessive sensitivity to small variations in the training data; high variance results in overfitting.

  • Bias/Variance Trade-Off

  • Stratified Sampling