16.5. Feature Selection and Evaluation#

16.5.1. PCA#

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
%matplotlib inline 
np.random.seed(1)
N = 1000
fs = 500
w = np.arange(1,N+1) * 2 * np.pi/fs
t = np.arange(1,N+1)/fs
x = 0.75 * np.sin(w*5)          # sine component
y = signal.sawtooth(w*7, 0.5)   # triangle-wave component
# d1-d4 are different mixtures of x and y plus uniform noise; d5 is pure noise
d1 = 0.5*y   + 0.5*x   + 0.1*np.random.rand(1,N)
d2 = 0.2*y   + 0.75*x  + 0.15*np.random.rand(1,N)
d3 = 0.7*y   + 0.25*x  + 0.1*np.random.rand(1,N)
d4 = -0.5*y  + 0.4*x   + 0.2*np.random.rand(1,N)
d5 = 0.6*np.random.rand(1,N)


d1 = d1 - d1.mean()
d2 = d2 - d2.mean()
d3 = d3 - d3.mean()
d4 = d4 - d4.mean()
d5 = d5 - d5.mean()
plt.plot(d1.transpose())
plt.plot(t, x)
plt.plot(t, y)
import numpy as np
X = np.array([d1[0], d2[0], d3[0], d4[0], d5[0]])
X
X.shape
array([[-0.49577998, -0.43911388, -0.30707421, ..., -0.46031308,
        -0.46742218, -0.46980286],
       [-0.11763279, -0.16679998, -0.02423898, ..., -0.28749778,
        -0.21150511, -0.26889486],
       [-0.66356683, -0.5591988 , -0.50687389, ..., -0.60092988,
        -0.69128089, -0.69983561],
       [ 0.58933072,  0.53606769,  0.44182291, ...,  0.39421034,
         0.46428174,  0.44166366],
       [-0.19489226, -0.07663694,  0.23930929, ..., -0.10165508,
         0.17634231, -0.28770572]])
(5, 1000)
# numpy's svd returns V already transposed: the rows of Vt are the principal component time courses
U, S, Vt = np.linalg.svd(X, full_matrices=False)
S
array([20.88215566, 14.24379295,  5.6277403 ,  1.46925551,  0.98238138])
for i in range(5):
    Vt[i,:] = Vt[i,:] * np.sqrt(S[i])   # weight each component by the square root of its singular value
eigen = S**2
eigen
array([436.06442484, 202.88563767,  31.67146085,   2.15871176,
         0.96507318])
eigen = eigen/N
eigen = eigen/sum(eigen)

16.5.1.1. Scree plot#

A scree plot shows, for each principal component, how much of the total variance (information) in the data that component represents.

plt.plot(range(1,6), eigen)
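The same ratios can be read off scikit-learn's PCA as explained_variance_ratio_; a minimal cross-check sketch (scikit-learn expects samples in rows, so the channel-by-sample matrix X is transposed):

from sklearn.decomposition import PCA
pca = PCA(n_components=5)
pca.fit(X.T)                                        # rows = time samples, columns = channels
print(np.round(pca.explained_variance_ratio_, 2))   # should agree with eigen above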
plt.plot(Vt[0,:])   # first (strongest) principal component
plt.show()
plt.plot(Vt[1,:])   # second principal component
plt.show()
plt.plot(Vt[2,:])   # third principal component
plt.show()

16.5.1.2. PCA on the IRIS Data#

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.decomposition import PCA

# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features.
y = iris.target

x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5

plt.figure(2, figsize=(8, 6))
plt.clf()

# Plot the training points
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Set1,
            edgecolor='k')
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')

plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())

# To get a better understanding of how the dimensions interact,
# plot the first three PCA dimensions
fig = plt.figure(1, figsize=(8, 6))
ax = fig.add_subplot(projection='3d')
ax.view_init(elev=-150, azim=110)
X_reduced = PCA(n_components=3).fit_transform(iris.data)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=y,
           cmap=plt.cm.Set1, edgecolor='k', s=40)
ax.set_title("First three PCA directions")
ax.set_xlabel("1st eigenvector")
ax.set_ylabel("2nd eigenvector")
ax.set_zlabel("3rd eigenvector")
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])

plt.show()
iris = datasets.load_iris()
X = iris.data[:50,:]                    # first 50 samples
X2 = X + 0.05*np.random.rand(50,4)      # noisy copy of the same four features
X_combined = np.zeros((50,8))           # 8 features: 4 originals + 4 redundant copies
X_combined[:,0:4] = X
X_combined[:,4:] = X2
X_combined.mean(axis=0)
array([5.006     , 3.428     , 1.462     , 0.246     , 5.03084846,
       3.45157123, 1.4866681 , 0.27342048])
from sklearn import preprocessing
X_scaled = preprocessing.scale(X_combined)   # zero-mean, unit-variance copy
X_combined.mean(axis=0)   # the original data is unchanged; X_scaled.mean(axis=0) is ~0
array([5.006     , 3.428     , 1.462     , 0.246     , 5.03084846,
       3.45157123, 1.4866681 , 0.27342048])
U,S,V = np.linalg.svd(X_scaled)
S
array([14.39921277, 10.00322912,  8.15751845,  5.03811249,  0.668255  ,
        0.39902347,  0.20300037,  0.15297129])
eigen = S**2
eigen = eigen/50
eigen = eigen/sum(eigen)
eigen = np.round(eigen*100)/100
print(eigen)
[0.52 0.25 0.17 0.06 0.   0.   0.   0.  ]
sum([0.52, 0.25, 0.17, 0.06])
1.0
plt.plot(range(1, 9), eigen)
X_reduced = PCA(n_components=3).fit_transform(X_scaled)
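Since the last four columns of X_combined are noisy copies of the first four, nearly all of the variance should collapse onto a handful of components. A quick cross-check with scikit-learn's PCA on the same scaled data (the ratios should closely mirror the scree values printed above):

from sklearn.decomposition import PCA
pca = PCA().fit(X_scaled)
print(np.round(pca.explained_variance_ratio_, 2))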

16.5.2. K-Means Clustering#

# https://towardsdatascience.com/machine-learning-algorithms-part-9-k-means-example-in-python-f2ad05ed5203
# https://blog.floydhub.com/introduction-to-k-means-clustering-in-python-with-scikit-learn/
from sklearn.datasets import make_blobs
plt.title("Three blobs", fontsize='small')
X1, Y1 = make_blobs(n_features=2, centers=3, random_state=10)
plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1,
            s=25, edgecolor='k')

16.5.3. Using scikit-learn to perform K-Means clustering#

from sklearn.cluster import KMeans

# Specify the number of clusters (3) and fit the data X1
# (n_init is given explicitly for reproducibility across scikit-learn versions)
kmeans = KMeans(n_clusters=3, random_state=0, n_init=10).fit(X1)

# Plot the data points and the cluster centroids (kmeans.cluster_centers_) on a 2D plane
plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1,
            s=25, edgecolor='k')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c='red', marker='x')
plt.title('Data points and cluster centroids')
plt.show()
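kmeans.labels_ gives each point's cluster assignment and kmeans.inertia_ the within-cluster sum of squared distances. Plotting the inertia for several candidate values of k (the "elbow" heuristic) is one common sanity check on the choice of three clusters; a minimal sketch:

inertias = []
for k in range(1, 7):
    km = KMeans(n_clusters=k, random_state=0, n_init=10).fit(X1)
    inertias.append(km.inertia_)   # within-cluster sum of squares for this k
plt.plot(range(1, 7), inertias, marker='o')
plt.xlabel('Number of clusters k')
plt.ylabel('Inertia')
plt.show()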

16.5.4. Evaluating Algorithms#

from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X = data.data
Y = data.target

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 0)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)

accuracy = accuracy_score(Y_test, Y_pred)
accuracy
LogisticRegression(random_state=0)
0.958041958041958
cm = confusion_matrix(Y_test, Y_pred)
cm

- Sensitivity refers to a test's ability to designate an individual with disease as positive. A highly sensitive test means that there are few false negative results, and thus fewer cases of disease are missed. 
- The specificity of a test is its ability to designate an individual who does not have a disease as negative.
- https://en.wikipedia.org/wiki/Confusion_matrix
tn, fp, fn, tp = confusion_matrix(Y_test, Y_pred).ravel()
tn
fp
fn
tp
# Beware of accuracy on imbalanced data: with, say, 380 benign and 20 malignant
# samples, a model that labels everything benign gets TN = 380, TP = 0 and
# still reaches 95% accuracy while missing every malignant case.
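With the confusion-matrix entries unpacked, the sensitivity and specificity defined above follow directly (a minimal sketch, treating label 1 as the positive class):

sensitivity = tp / (tp + fn)   # true positive rate: fraction of actual positives caught
specificity = tn / (tn + fp)   # true negative rate: fraction of actual negatives correctly ruled out
print(sensitivity, specificity)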

scikit-learn places the true labels on the rows and the predicted labels on the columns (see https://stackoverflow.com/questions/56078203/why-scikit-learn-confusion-matrix-is-reversed), so for binary labels the matrix reads:

[[True Negative, False Positive]
 [False Negative, True Positive]]
results = classification_report(Y_test, Y_pred)
print(results)
              precision    recall  f1-score   support

           0       0.94      0.94      0.94        53
           1       0.97      0.97      0.97        90

    accuracy                           0.96       143
   macro avg       0.96      0.96      0.96       143
weighted avg       0.96      0.96      0.96       143
# precision measures how accurate our positive predictions were
# precision = tp / (tp+fp)

# recall measures what fraction of the positives our model identified
# recall = tp / (tp+fn) -- same as sensitivity
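These formulas can be checked against scikit-learn's own metrics (class 1 is the positive label, matching the report above); a minimal sketch using the tn, fp, fn, tp values computed earlier:

from sklearn.metrics import precision_score, recall_score
print(tp / (tp + fp), precision_score(Y_test, Y_pred))   # precision for class 1
print(tp / (tp + fn), recall_score(Y_test, Y_pred))      # recall (sensitivity) for class 1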

16.5.4.1. Lots of Classifiers#

from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
accuracy
cm = confusion_matrix(Y_test, Y_pred)
cm
SVC(random_state=0)
0.965034965034965
array([[50,  3],
       [ 2, 88]])
results = classification_report(Y_test, Y_pred)
print(results)
              precision    recall  f1-score   support

           0       0.96      0.94      0.95        53
           1       0.97      0.98      0.97        90

    accuracy                           0.97       143
   macro avg       0.96      0.96      0.96       143
weighted avg       0.96      0.97      0.96       143
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
accuracy
cm = confusion_matrix(Y_test, Y_pred)
cm
GaussianNB()
0.916083916083916
array([[47,  6],
       [ 6, 84]])
results = classification_report(Y_test, Y_pred)
print(results)
              precision    recall  f1-score   support

           0       0.89      0.89      0.89        53
           1       0.93      0.93      0.93        90

    accuracy                           0.92       143
   macro avg       0.91      0.91      0.91       143
weighted avg       0.92      0.92      0.92       143
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
accuracy
cm = confusion_matrix(Y_test, Y_pred)
cm
DecisionTreeClassifier(criterion='entropy', random_state=0)
0.958041958041958
array([[51,  2],
       [ 4, 86]])
results = classification_report(Y_test, Y_pred)
print(results)
              precision    recall  f1-score   support

           0       0.93      0.96      0.94        53
           1       0.98      0.96      0.97        90

    accuracy                           0.96       143
   macro avg       0.95      0.96      0.96       143
weighted avg       0.96      0.96      0.96       143
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
accuracy
cm = confusion_matrix(Y_test, Y_pred)
cm

from sklearn.metrics import fbeta_score
output = fbeta_score(Y_test, Y_pred, average='macro', beta=0.5)
output
RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)
0.972027972027972
array([[52,  1],
       [ 3, 87]])
np.float64(0.968271924154277)

16.5.4.1.1. Feature Importance#

for score, name in sorted(zip(classifier.feature_importances_, data.feature_names), reverse=True):
    print(round(score, 2), name)
0.14 mean concave points
0.14 worst perimeter
0.11 worst concave points
0.1 area error
0.08 worst area
0.06 mean concavity
0.05 radius error
0.05 mean area
0.04 worst radius
0.04 mean perimeter
0.03 mean texture
0.02 worst texture
0.02 worst fractal dimension
0.02 mean radius
0.01 perimeter error
0.01 worst concavity
0.01 concavity error
0.01 mean fractal dimension
0.01 mean smoothness
0.01 fractal dimension error
0.01 worst compactness
0.01 mean compactness
0.01 worst symmetry
0.01 concave points error
0.01 symmetry error
0.01 worst smoothness
0.0 compactness error
0.0 texture error
0.0 mean symmetry
0.0 smoothness error
results = classification_report(Y_test, Y_pred)
print(results)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)

accuracy = accuracy_score(Y_test, Y_pred)
accuracy
cm = confusion_matrix(Y_test, Y_pred)
cm
LogisticRegression(random_state=0)
0.9649122807017544
array([[45,  2],
       [ 2, 65]])
from sklearn.metrics import classification_report
results = classification_report(Y_test, Y_pred)
print(results)
              precision    recall  f1-score   support

           0       0.96      0.96      0.96        47
           1       0.97      0.97      0.97        67

    accuracy                           0.96       114
   macro avg       0.96      0.96      0.96       114
weighted avg       0.96      0.96      0.96       114
from sklearn.metrics import roc_auc_score

roc_auc = roc_auc_score(Y_test, Y_pred)


from sklearn import metrics

fpr, tpr, thresholds = metrics.roc_curve(Y_test, Y_pred)
plt.figure()
lw = 2
plt.plot(
    fpr,
    tpr,
    color="darkorange",
    lw=lw,
    label="ROC curve (area = %0.2f)" % roc_auc,
)
plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic example")
plt.legend(loc="lower right")
plt.show()
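The curve above is built from the hard 0/1 labels in Y_pred, so it has only one interior operating point. Feeding the classifier's predicted probabilities to roc_curve traces out the full curve; a minimal sketch reusing the fitted logistic regression:

Y_score = classifier.predict_proba(X_test)[:, 1]   # probability of the positive class
fpr, tpr, thresholds = metrics.roc_curve(Y_test, Y_score)
roc_auc = metrics.roc_auc_score(Y_test, Y_score)
plt.plot(fpr, tpr, color="darkorange", lw=2, label="ROC curve (area = %0.2f)" % roc_auc)
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")
plt.show()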

16.5.5. Imbalanced Data sets#
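When one class dominates, accuracy alone is a poor yardstick: a model that always predicts the majority class can look accurate while never detecting the minority class (compare the TN = 380, TP = 0 example above). A minimal sketch using scikit-learn's DummyClassifier as a majority-class baseline on the current train/test split:

from sklearn.dummy import DummyClassifier
baseline = DummyClassifier(strategy="most_frequent").fit(X_train, Y_train)
print(baseline.score(X_test, Y_test))                       # accuracy of always predicting the majority class
print(confusion_matrix(Y_test, baseline.predict(X_test)))   # it never predicts the minority class, so one column is all zeros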

16.5.6. Stratify#

Passing stratify=Y to train_test_split keeps the class proportions of Y the same in the training and test splits, which matters most when the classes are imbalanced.
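A minimal sketch on the breast cancer labels, checking that the proportions are preserved:

Xs_train, Xs_test, Ys_train, Ys_test = train_test_split(
    X, Y, test_size=0.25, random_state=0, stratify=Y)
print(np.bincount(Y) / len(Y))               # class proportions in the full data set
print(np.bincount(Ys_test) / len(Ys_test))   # test-set proportions match (up to rounding)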

16.5.7. Vocabulary#

  • Supervised Learning

  • Unsupervised Learning

  • Classification

  • Prediction

  • Clustering

  • Cross-Validation (see the sketch after this list)

  • Dimensionality Reduction (curse of dimensionality)

  • Feature Selection

  • Accuracy

  • True Positive

  • True Negative

  • False Positive

  • False Negative

  • Confusion Matrix

  • Sensitivity

  • Specificity

  • Recall

  • Precision

  • F1-Score

  • Imbalanced Data

  • Area Under the Curve (Sensitivity/Specificity and Precision/Recall)

  • Underfitting

  • Overfitting – reduced via regularization (restricting the model’s degrees of freedom) and early stopping (stop training when the validation error reaches its minimum)

  • Bias – Error due to wrong assumptions, e.g., assuming a linear relationship when it is nonlinear; a high-bias model underfits the training data

  • Variance – Excessive sensitivity to small variations in the training data; a high-variance model overfits.

  • Bias/Variance Trade-Off

  • Stratified Sampling
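As referenced from the Cross-Validation entry above: instead of trusting a single train/test split, k-fold cross-validation averages a score over several splits. A minimal sketch on the breast cancer data, with scaling wrapped in a pipeline so it is re-fit inside each fold:

from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

pipe = make_pipeline(StandardScaler(), LogisticRegression(random_state=0))
scores = cross_val_score(pipe, X, Y, cv=5)   # 5-fold cross-validated accuracy
print(scores.round(3), scores.mean().round(3))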