16.5. Feature Selection and Evaluation#
16.5.1. PCA#
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import numpy as np
from scipy import signal
import matplotlib.pyplot as plt
%matplotlib inline
# Seed NumPy's global RNG so the noise terms below are reproducible.
# (Assigning `np.random.seed = 1` would replace the function with an int
# and leave the generator unseeded.)
np.random.seed(1)

N = 1000  # number of samples
fs = 500  # samples per unit of t (presumably the sampling rate in Hz)
w = np.arange(1, N+1) * 2 * np.pi/fs
t = np.arange(1, N+1)/fs

# Two source signals: a sine with 5 cycles per unit of t and a
# symmetric (width=0.5) sawtooth, i.e. a triangle wave, with 7 cycles.
x = 0.75 * np.sin(w*5)
y = signal.sawtooth(w*7, 0.5)

# Five observed channels of shape (1, N): d1..d4 are different linear
# mixtures of x and y plus uniform noise, d5 is noise only.
d1 = 0.5*y + 0.5*x + 0.1*np.random.rand(1,N)
d2 = 0.2*y + 0.75*x + 0.15*np.random.rand(1,N)
d3 = 0.7*y + 0.25*x + 0.1*np.random.rand(1,N)
d4 = -0.5*y + 0.4*x + 0.2*np.random.rand(1,N)
d5 = 0.6*np.random.rand(1,N)

# Remove the mean of every channel.
d1 = d1 - d1.mean()
d2 = d2 - d2.mean()
d3 = d3 - d3.mean()
d4 = d4 - d4.mean()
d5 = d5 - d5.mean()
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[1], line 3
1 from IPython.core.interactiveshell import InteractiveShell
2 InteractiveShell.ast_node_interactivity = "all"
----> 3 import numpy as np
4 from scipy import signal
5 import matplotlib.pyplot as plt
ModuleNotFoundError: No module named 'numpy'
# d1 has shape (1, N); transposing to (N, 1) makes plot() draw it as one line.
plt.plot(d1.transpose())
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[2], line 1
----> 1 plt.plot(d1.transpose())
NameError: name 'plt' is not defined
# Sine source x against the time axis t.
plt.plot(t, x)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[3], line 1
----> 1 plt.plot(t, x)
NameError: name 'plt' is not defined
# Triangle-wave source y against the time axis t.
plt.plot(t, y)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[4], line 1
----> 1 plt.plot(t, y)
NameError: name 'plt' is not defined
import numpy as np
# Stack the five (1, N) channels row-wise into a single (5, N) data matrix.
X = np.vstack((d1, d2, d3, d4, d5))
X
X.shape
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[5], line 1
----> 1 import numpy as np
2 X = np.array([d1[0], d2[0], d3[0], d4[0], d5[0]])
3 X
ModuleNotFoundError: No module named 'numpy'
# np.linalg.svd returns (U, S, Vh): the right singular vectors are the ROWS
# of Vh. Transpose once so that column i of V is the i-th right singular
# vector, which is what the `V[:, i]` indexing used afterwards expects.
U, S, Vh = np.linalg.svd(X)
V = Vh.T
# Singular values, sorted in descending order.
S
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[6], line 1
----> 1 U,S,V = np.linalg.svd(X)
2 S
NameError: name 'np' is not defined
# Scale each of the first five columns of V by the square root of the
# matching singular value (broadcast over the rows of V).
n_components = 5
V[:, :n_components] = V[:, :n_components] * np.sqrt(S[:n_components])
# Squared singular values.
eigen = np.square(S)
eigen
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[7], line 2
1 for i in range(5):
----> 2 V[:,i] = V[:,i] * np.sqrt(S[i])
3 eigen = S**2
4 eigen
NameError: name 'V' is not defined
# Divide by the number of samples, then normalise so the entries sum to one.
per_sample = eigen / N
eigen = per_sample / sum(per_sample)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[8], line 1
----> 1 eigen = eigen/N
2 eigen = eigen/sum(eigen)
NameError: name 'eigen' is not defined
16.5.1.1. Scree plot#
A scree plot shows, for each principal component, the fraction of the total variance (information) that the component accounts for; it is therefore a measure of that component's importance.
# Scree plot: component number (1-based) against its normalised eigenvalue.
# The x range is derived from eigen so it always matches its length.
plt.plot(range(1, len(eigen) + 1), eigen)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[9], line 1
----> 1 plt.plot(range(1,6), eigen)
NameError: name 'plt' is not defined
# Column 0 of V, i.e. the column that was scaled by sqrt(S[0]).
plt.plot(V[:,0])
plt.show()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[10], line 1
----> 1 plt.plot(V[:,0])
2 plt.show()
NameError: name 'plt' is not defined
# Column 1 of V, i.e. the column that was scaled by sqrt(S[1]).
plt.plot(V[:,1])
plt.show()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[11], line 1
----> 1 plt.plot(V[:,1])
2 plt.show()
NameError: name 'plt' is not defined
# Column 2 of V, i.e. the column that was scaled by sqrt(S[2]).
plt.plot(V[:,2])
plt.show()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[12], line 1
----> 1 plt.plot(V[:,2])
2 plt.show()
NameError: name 'plt' is not defined
16.5.1.2. PCA on the IRIS Data#
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.decomposition import PCA
# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features.
y = iris.target

# Axis limits with a 0.5 margin around the data.
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5

plt.figure(2, figsize=(8, 6))
plt.clf()

# Plot the training points
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Set1,
            edgecolor='k')
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')

plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())

# To get a better understanding of the interaction of the dimensions,
# plot the first three PCA dimensions
fig = plt.figure(1, figsize=(8, 6))
# Create the 3D axes through the figure: in current Matplotlib a directly
# constructed Axes3D(fig) is no longer attached to the figure, which would
# leave the plot empty.
ax = fig.add_subplot(111, projection="3d", elev=-150, azim=110)
X_reduced = PCA(n_components=3).fit_transform(iris.data)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=y,
           cmap=plt.cm.Set1, edgecolor='k', s=40)
ax.set_title("First three PCA directions")
# Use ax.xaxis/yaxis/zaxis: the old w_xaxis/w_yaxis/w_zaxis aliases were
# removed from Matplotlib.
ax.set_xlabel("1st eigenvector")
ax.xaxis.set_ticklabels([])
ax.set_ylabel("2nd eigenvector")
ax.yaxis.set_ticklabels([])
ax.set_zlabel("3rd eigenvector")
ax.zaxis.set_ticklabels([])

plt.show()
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[13], line 1
----> 1 import matplotlib.pyplot as plt
2 from mpl_toolkits.mplot3d import Axes3D
3 from sklearn import datasets
ModuleNotFoundError: No module named 'matplotlib'
iris = datasets.load_iris()
# Keep the first 50 samples (all four features).
X = iris.data[:50,:]
# Noisy copy of the same features: uniform noise in [0, 0.05).
X2 = X + 0.05*np.random.rand(50,4)
# Original and noisy features side by side: shape (50, 8).
X_combined = np.hstack((X, X2))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[14], line 1
----> 1 iris = datasets.load_iris()
2 X = iris.data[:50,:]
3 X2 = X +0.05*np.random.rand(50,4)
NameError: name 'datasets' is not defined
# Per-column mean of the combined (unscaled) features.
X_combined.mean(axis=0)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[15], line 1
----> 1 X_combined.mean(axis=0)
NameError: name 'X_combined' is not defined
from sklearn import preprocessing
# Standardise every column (zero mean, unit variance).
X_scaled = preprocessing.scale(X_combined)
# Check the result of scaling: the column means of X_scaled should be ~0.
# (Displaying X_combined.mean here would only repeat the unscaled means.)
X_scaled.mean(axis=0)
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[16], line 1
----> 1 from sklearn import preprocessing
2 X_scaled = preprocessing.scale(X_combined)
3 X_combined.mean(axis=0)
ModuleNotFoundError: No module named 'sklearn'
# SVD of the standardised data. Note that np.linalg.svd returns the right
# singular vectors as rows (Vh), so the array bound to V here is Vh.
U,S,V = np.linalg.svd(X_scaled)
# Singular values, sorted in descending order.
S
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[17], line 1
----> 1 U,S,V = np.linalg.svd(X_scaled)
2 S
NameError: name 'np' is not defined
# Squared singular values divided by 50 (the number of rows of X_combined).
variance = np.square(S) / 50
# Normalise to fractions of the total, then round to two decimals.
eigen = variance / sum(variance)
eigen = np.round(eigen * 100) / 100
print(eigen)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[18], line 1
----> 1 eigen = S**2
2 eigen = eigen/50
3 eigen = eigen/sum(eigen)
NameError: name 'S' is not defined
# Sanity check that the listed fractions add up to one (presumably the
# non-zero entries of the rounded spectrum printed above — confirm).
sum([0.51, 0.26, 0.17, 0.06])
1.0
# Scree plot: component number (1-based) against its normalised eigenvalue.
# The x range is derived from eigen so it always matches its length.
plt.plot(range(1, len(eigen) + 1), eigen)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[20], line 1
----> 1 plt.plot(range(1, 9), eigen)
NameError: name 'plt' is not defined
# Project the standardised features onto their first three principal components.
pca = PCA(n_components=3)
X_reduced = pca.fit_transform(X_scaled)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[21], line 1
----> 1 X_reduced = PCA(n_components=3).fit_transform(X_scaled)
NameError: name 'PCA' is not defined
16.5.2. K-Means Clustering#
# https://towardsdatascience.com/machine-learning-algorithms-part-9-k-means-example-in-python-f2ad05ed5203
# https://blog.floydhub.com/introduction-to-k-means-clustering-in-python-with-scikit-learn/
from sklearn.datasets import make_blobs
# Title for the scatter plot drawn below.
plt.title("Three blobs", fontsize='small')
# Sample 2-feature points around 3 centers (fixed random_state for
# repeatability); Y1 is used below as the per-point color.
X1, Y1 = make_blobs(n_features=2, centers=3, random_state=10)
plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1,
s=25, edgecolor='k')
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[22], line 3
1 # https://towardsdatascience.com/machine-learning-algorithms-part-9-k-means-example-in-python-f2ad05ed5203
2 # https://blog.floydhub.com/introduction-to-k-means-clustering-in-python-with-scikit-learn/
----> 3 from sklearn.datasets import make_blobs
4 plt.title("Three blobs", fontsize='small')
5 X1, Y1 = make_blobs(n_features=2, centers=3, random_state=10)
ModuleNotFoundError: No module named 'sklearn'
16.5.3. Using scikit-learn to perform K-Means clustering#
from sklearn.cluster import KMeans
# Fit a 3-cluster K-Means model to the blob data X1.
kmeans = KMeans(n_clusters=3, random_state=0).fit(X1)

# Get the cluster centroids (one row per cluster).
centroids = kmeans.cluster_centers_

# Draw the data points, colored by Y1, and mark each centroid with a red x.
plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1,
            s=25, edgecolor='k')
plt.scatter(centroids[:, 0], centroids[:, 1], c='red', marker='x')
plt.title('Data points and cluster centroids')
plt.show()
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[23], line 1
----> 1 from sklearn.cluster import KMeans
3 # Specify the number of clusters (3) and fit the data X
4 kmeans = KMeans(n_clusters=3, random_state=0).fit(X1)
ModuleNotFoundError: No module named 'sklearn'
16.5.3.1. Some links#
https://jakevdp.github.io/PythonDataScienceHandbook/05.07-support-vector-machines.html https://scikit-learn.org/stable/auto_examples/linear_model/plot_ransac.html http://www.cse.psu.edu/~rtc12/CSE486/lecture15.pdf https://towardsdatascience.com/accuracy-precision-recall-or-f1-331fb37c5cb9
16.5.4. Evaluating Algorithms#
from sklearn.datasets import load_breast_cancer
# Load the data set and split it into the feature matrix X and the targets Y.
data = load_breast_cancer()
X, Y = data.data, data.target
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[24], line 1
----> 1 from sklearn.datasets import load_breast_cancer
2 data = load_breast_cancer()
3 X = data.data
ModuleNotFoundError: No module named 'sklearn'
from sklearn.model_selection import train_test_split
# Hold out 25% of the samples for testing; fixed random_state for repeatability.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 0)
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply the same
# transformation to the test split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, Y_train)
# Predicted class label for every test sample.
Y_pred = classifier.predict(X_test)
# Fraction of test samples classified correctly.
accuracy = accuracy_score(Y_test, Y_pred)
accuracy
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[25], line 1
----> 1 from sklearn.model_selection import train_test_split
2 X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 0)
3 from sklearn.preprocessing import StandardScaler
ModuleNotFoundError: No module named 'sklearn'
# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(Y_test, Y_pred)
cm
# tn, fp, fn, tp = confusion_matrix(Y_test, Y_pred).ravel()
# TN, FP, FN, TP
#
# The notes below are kept as comments; as bare text inside the code cell
# they raised a SyntaxError.
# - Sensitivity refers to a test's ability to designate an individual with disease as positive. A highly sensitive test means that there are few false negative results, and thus fewer cases of disease are missed.
# - The specificity of a test is its ability to designate an individual who does not have a disease as negative.
# - https://en.wikipedia.org/wiki/Confusion_matrix
Cell In[26], line 7
- Sensitivity refers to a test's ability to designate an individual with disease as positive. A highly sensitive test means that there are few false negative results, and thus fewer cases of disease are missed.
^
SyntaxError: unterminated string literal (detected at line 7)
tn, fp, fn, tp = confusion_matrix(Y_test, Y_pred).ravel()
tn
fp
fn
tp
# 380 b
# 20 m
TN = 380
TP = 0
TN, FP, FN, TP
https://stackoverflow.com/questions/56078203/why-scikit-learn-confusion-matrix-is-reversed
[[True Negative, False Positive]
[False Negative, True Positive]]
# Text summary of per-class metrics for the current Y_pred.
results = classification_report(Y_test, Y_pred)
print(results)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[27], line 1
----> 1 results = classification_report(Y_test, Y_pred)
2 print(results)
NameError: name 'classification_report' is not defined
# precision measures how accurate our positive predictions were
# precision = tp / (tp+fp)
# recall measures what fraction of the positives our model identified
# recall = tp / (tp+fn) -- same as sensitivity
16.5.4.1. Lots of Classifiers#
from sklearn.svm import SVC
# Support vector classifier with an RBF kernel, fitted on the training split.
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
# Fraction of test samples classified correctly.
accuracy = accuracy_score(Y_test, Y_pred)
accuracy
# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(Y_test, Y_pred)
cm
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[28], line 1
----> 1 from sklearn.svm import SVC
2 classifier = SVC(kernel = 'rbf', random_state = 0)
3 classifier.fit(X_train, Y_train)
ModuleNotFoundError: No module named 'sklearn'
# Text summary of per-class metrics for the current Y_pred.
results = classification_report(Y_test, Y_pred)
print(results)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[29], line 1
----> 1 results = classification_report(Y_test, Y_pred)
2 print(results)
NameError: name 'classification_report' is not defined
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes classifier, fitted on the training split.
classifier = GaussianNB()
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
# Fraction of test samples classified correctly.
accuracy = accuracy_score(Y_test, Y_pred)
accuracy
# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(Y_test, Y_pred)
cm
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[30], line 1
----> 1 from sklearn.naive_bayes import GaussianNB
2 classifier = GaussianNB()
3 classifier.fit(X_train, Y_train)
ModuleNotFoundError: No module named 'sklearn'
# Text summary of per-class metrics for the current Y_pred.
results = classification_report(Y_test, Y_pred)
print(results)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[31], line 1
----> 1 results = classification_report(Y_test, Y_pred)
2 print(results)
NameError: name 'classification_report' is not defined
from sklearn.tree import DecisionTreeClassifier
# Decision tree using the entropy split criterion, fitted on the training split.
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
# Fraction of test samples classified correctly.
accuracy = accuracy_score(Y_test, Y_pred)
accuracy
# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(Y_test, Y_pred)
cm
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[32], line 1
----> 1 from sklearn.tree import DecisionTreeClassifier
2 classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
3 classifier.fit(X_train, Y_train)
ModuleNotFoundError: No module named 'sklearn'
# Text summary of per-class metrics for the current Y_pred.
results = classification_report(Y_test, Y_pred)
print(results)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[33], line 1
----> 1 results = classification_report(Y_test, Y_pred)
2 print(results)
NameError: name 'classification_report' is not defined
from sklearn.ensemble import RandomForestClassifier
# Random forest of 10 entropy-criterion trees, fitted on the training split.
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
# Fraction of test samples classified correctly.
accuracy = accuracy_score(Y_test, Y_pred)
accuracy
# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(Y_test, Y_pred)
cm
from sklearn.metrics import fbeta_score
# F-beta score with beta=0.5 (weights precision more than recall),
# averaged over the classes without weighting ('macro').
output = fbeta_score(Y_test, Y_pred, average='macro', beta=0.5)
output
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[34], line 1
----> 1 from sklearn.ensemble import RandomForestClassifier
2 classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
3 classifier.fit(X_train, Y_train)
ModuleNotFoundError: No module named 'sklearn'
16.5.4.1.1. Feature Importance#
# Pair every importance score with its feature name and list them from the
# most to the least important.
ranked = sorted(zip(classifier.feature_importances_, data.feature_names), reverse=True)
for score, name in ranked:
    print(round(score, 2), name)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[35], line 1
----> 1 for score, name in sorted(zip(classifier.feature_importances_, data.feature_names), reverse=True):
2 print(round(score, 2), name)
NameError: name 'classifier' is not defined
# Text summary of per-class metrics for the current Y_pred.
results = classification_report(Y_test, Y_pred)
print(results)
# New split holding out 20% of the samples for testing.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)
# Fit the scaler on the training split only, then apply it to the test split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
# Fraction of test samples classified correctly.
accuracy = accuracy_score(Y_test, Y_pred)
accuracy
# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(Y_test, Y_pred)
cm
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[36], line 1
----> 1 X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)
2 sc = StandardScaler()
3 X_train = sc.fit_transform(X_train)
NameError: name 'train_test_split' is not defined
from sklearn.metrics import classification_report
# Text summary of per-class metrics for the current Y_pred.
results = classification_report(Y_test, Y_pred)
print(results)
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[37], line 1
----> 1 from sklearn.metrics import classification_report
2 results = classification_report(Y_test, Y_pred)
3 print(results)
ModuleNotFoundError: No module named 'sklearn'
from sklearn.metrics import roc_auc_score
from sklearn import metrics

# ROC analysis needs a continuous score per sample. Hard 0/1 predictions
# give only a single operating point, so the "curve" degenerates to three
# points and the AUC is understated. Use the predicted probability of the
# positive class instead.
# NOTE: requires a classifier exposing predict_proba (LogisticRegression does).
Y_score = classifier.predict_proba(X_test)[:, 1]

roc_auc = roc_auc_score(Y_test, Y_score)
fpr, tpr, thresholds = metrics.roc_curve(Y_test, Y_score)
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[38], line 1
----> 1 from sklearn.metrics import roc_auc_score
3 roc_auc = roc_auc_score(Y_test, Y_pred)
6 from sklearn import metrics
ModuleNotFoundError: No module named 'sklearn'
# Line width shared by both curves, and the legend text carrying the AUC.
lw = 2
roc_label = "ROC curve (area = %0.2f)" % roc_auc

plt.figure()
# ROC curve of the classifier.
plt.plot(fpr, tpr, color="darkorange", lw=lw, label=roc_label)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic example")
plt.legend(loc="lower right")
plt.show()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[39], line 1
----> 1 plt.figure()
2 lw = 2
3 plt.plot(
4 fpr,
5 tpr,
(...) 8 label="ROC curve (area = %0.2f)" % roc_auc,
9 )
NameError: name 'plt' is not defined
16.5.5. Imbalanced Data sets#
For imbalanced data sets, the area under the precision/recall curve is more informative than the area under the ROC (sensitivity/specificity) curve.
https://www.kaggle.com/code/vedbharti/classification-precision-recall-vs-roc-plot
16.5.6. Stratify#
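Stratify: when splitting the data (e.g., with the `stratify` argument of `train_test_split`), keep the class proportions in the training and test sets the same as in the full data set.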
16.5.7. Vocabulary#
Supervised Learning
Unsupervised Learning
Classification
Prediction
Clustering
Cross-Validation
Dimensionality Reduction (curse of dimensionality)
Feature Selection
Accuracy
True Positive
True Negative
False Positive
False Negative
Confusion Matrix
Sensitivity
Specificity
Recall
Precision
F1-Score
Imbalanced Data
Area Under the Curve (Sensitivity/Specificity and Precision/Recall)
Underfitting
Overfitting – reduced via regularization (reduce degrees of freedom), early stopping (stop when validation error reaches minimum)
Bias – Error due to wrong assumptions, e.g., assuming the data is linear when it is non-linear; a high-bias model underfits the training data.
Variance – Excessive sensitivity to small variations in the training data; a high-variance model overfits the training data.
Bias/Variance Trade-Off
Stratified Sampling