16.4. Select and Train a Model#
16.4.1. Splitting data into Train and Test#
import numpy as np
from sklearn.model_selection import train_test_split
# Toy dataset: the integers 0..9 arranged as 5 samples x 2 features,
# paired with one binary label per sample.
X, y = np.arange(10).reshape((5, 2)), [0, 1, 0, 0, 1]
X
list(y)
# X -- feature matrix (one row per sample)
# y -- labels (one entry per row of X)
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[1], line 1
----> 1 import numpy as np
2 from sklearn.model_selection import train_test_split
3 X, y = np.arange(10).reshape((5, 2)), [0, 1, 0, 0, 1]
ModuleNotFoundError: No module named 'numpy'
# Hold out 20% of the samples for testing. No random_state is passed, so the
# shuffle (and therefore the split) can differ on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train
y_train
X_test
y_test
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[2], line 1
----> 1 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
3 X_train
5 y_train
NameError: name 'train_test_split' is not defined
# Hold out 40% for testing; the fixed random_state makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.40, random_state=43)
X_train
y_train
X_test
y_test
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[3], line 1
----> 1 X_train, X_test, y_train, y_test = train_test_split(
2 X, y, test_size=0.40, random_state=43)
4 X_train
6 y_train
NameError: name 'train_test_split' is not defined
# Same idea with a different test fraction (33%) and a different fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42)
X_train
y_train
X_test
y_test
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[4], line 1
----> 1 X_train, X_test, y_train, y_test = train_test_split(
2 X, y, test_size=0.33, random_state=42)
4 X_train
6 y_train
NameError: name 'train_test_split' is not defined
import random
# Draw a fresh seed in [1, 10000] on every execution, so each run of this
# cell can produce a different split.
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=random.randint(1, 10000))
X_train
y_train
X_test
y_test
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[5], line 2
1 import random
----> 2 X_train, X_test, y_train, y_test = train_test_split(
3 X, y, test_size=0.33, random_state=random.randint(1, 10000))
5 X_train
7 y_train
NameError: name 'train_test_split' is not defined
import random
# Identical code to the previous cell: a new seed is drawn each time it runs.
# NOTE(review): presumably repeated to show that the split changes between
# runs -- confirm the duplication is intentional.
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=random.randint(1, 10000))
X_train
y_train
X_test
y_test
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[6], line 2
1 import random
----> 2 X_train, X_test, y_train, y_test = train_test_split(
3 X, y, test_size=0.33, random_state=random.randint(1, 10000))
5 X_train
7 y_train
NameError: name 'train_test_split' is not defined
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
# Load scikit-learn's bundled diabetes dataset.
diabetes = datasets.load_diabetes()
diabetes.data.shape
# Column names that go with the columns of diabetes.data.
feature_names = diabetes.feature_names
feature_names
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[7], line 1
----> 1 import pandas as pd
2 from sklearn import datasets, linear_model
3 from sklearn.model_selection import train_test_split
ModuleNotFoundError: No module named 'pandas'
# Wrap the feature matrix in a DataFrame with named columns; the regression
# target is kept separately in y.
df = pd.DataFrame(diabetes.data, columns=feature_names)
y = diabetes.target
df
y
df.shape
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[8], line 1
----> 1 df = pd.DataFrame(diabetes.data, columns=feature_names)
2 y = diabetes.target
3 df
NameError: name 'pd' is not defined
import random
# 80/20 train/test split of the diabetes data, seeded with a freshly drawn
# random integer so the split varies from run to run.
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=random.randint(1, 10000))
X_train
y_train
X_test
y_test
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[9], line 2
1 import random
----> 2 X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=random.randint(1, 10000))
4 X_train
6 y_train
NameError: name 'train_test_split' is not defined
# Inspect the sizes of the two splits: the features and the targets of each
# split should contain the same number of samples.
X_train.shape
len(y_train)
X_test.shape
len(y_test)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[10], line 1
----> 1 X_train.shape
3 len(y_train)
5 X_test.shape
NameError: name 'X_train' is not defined
16.4.2. Linear Regression Example#
Linear regression is used to predict a numerical value.
import numpy as np
from sklearn.linear_model import LinearRegression

# Four training samples with two features each.
X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
# Targets are generated exactly from y = 1 * x_0 + 2 * x_1 + 3 (no noise).
y = X @ np.array([1, 2]) + 3

reg = LinearRegression()
reg.fit(X, y)
reg.score(X, y)
# Fitted model has the form y = coef_[0] * x_0 + coef_[1] * x_1 + intercept_.
reg.coef_
reg.intercept_
# Predict the target for a new sample with x_0 = 3 and x_1 = 5.
reg.predict(np.array([[3, 5]]))
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[11], line 1
----> 1 import numpy as np
2 from sklearn.linear_model import LinearRegression
3 X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
ModuleNotFoundError: No module named 'numpy'
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import r2_score
# Read the salary dataset; the relative path is resolved against the current
# working directory.
dataset=pd.read_csv('Salary_Data.csv')
dataset.head()
dataset.shape
dataset
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[12], line 1
----> 1 import numpy as np
2 import pandas as pd
3 from sklearn.model_selection import train_test_split
ModuleNotFoundError: No module named 'numpy'
16.4.2.1. Selecting the data#
# Features: every column except the last one (kept 2-D).
# Target: the column at position 1 (1-D).
# NOTE(review): the two selections only line up if the CSV has exactly two
# columns (presumably years of experience, then salary) -- confirm.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 1].values
X
y
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[13], line 1
----> 1 X = dataset.iloc[:, :-1].values
2 y = dataset.iloc[:, 1].values
3 X
NameError: name 'dataset' is not defined
16.4.2.2. Split the data#
from sklearn.model_selection import train_test_split
# 80/20 train/test split; no random_state is passed, so results can vary
# between runs.
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.20)
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[14], line 1
----> 1 from sklearn.model_selection import train_test_split
2 X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.20)
ModuleNotFoundError: No module named 'sklearn'
16.4.2.3. Train and Test#
# Fit an ordinary least-squares model on the training split and evaluate it
# on the held-out test split.
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
y_pred
y_test
# Root-mean-squared error on the test split. The `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so the square root is taken explicitly; this works on every version.
metrics.mean_squared_error(y_test, y_pred) ** 0.5
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[15], line 1
----> 1 from sklearn.linear_model import LinearRegression
2 regressor=LinearRegression()
3 regressor.fit(X_train,y_train)
ModuleNotFoundError: No module named 'sklearn'
# Coefficient of determination (R^2) of the predictions against the test targets.
r2_score(y_test, y_pred)
# Fitted weight(s) and intercept of the regressor.
regressor.coef_
regressor.intercept_
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[16], line 1
----> 1 r2_score(y_test, y_pred)
2 regressor.coef_
3 regressor.intercept_
NameError: name 'r2_score' is not defined
# Training split: observed points in red, the model's predictions for the
# same inputs drawn as a blue line.
plt.scatter(X_train, y_train, color='red')
plt.plot(X_train, regressor.predict(X_train), color='blue')
plt.title('Salary VS Experience (Training Data)')
plt.xlabel('Years of experience')  # fixed typo: was 'experiene'
plt.ylabel('Salary')
plt.show()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[17], line 1
----> 1 plt.scatter(X_train,y_train,color='red')
2 plt.plot(X_train,regressor.predict(X_train),color='blue')
3 plt.title('Salary VS Experience (Training Data)')
NameError: name 'plt' is not defined
# Test split: observed points in red, the model's predictions for the same
# inputs drawn as a blue line.
plt.scatter(X_test, y_test, color='red')
plt.plot(X_test, regressor.predict(X_test), color='blue')
plt.title('Salary VS Experience (Test Data)')
plt.xlabel('Years of experience')  # fixed typo: was 'experiene'
plt.ylabel('Salary')
plt.show()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[18], line 1
----> 1 plt.scatter(X_test,y_test,color='red')
2 plt.plot(X_test,regressor.predict(X_test),color='blue')
3 plt.title('Salary VS Experience (Test Data)');
NameError: name 'plt' is not defined
16.4.2.4. Apply DecisionTreeRegressor#
Decision trees are well suited to finding complex nonlinear relationships in the data.
# Same workflow as the linear model, but with a decision tree regressor.
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
# Features: every column except the last; target: the column at position 1.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 1].values
# 80/20 split; unseeded, so the split can differ between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
regressor = DecisionTreeRegressor()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
y_pred
y_test
# Root-mean-squared error on the test split. `squared=False` was deprecated
# in scikit-learn 1.4 and removed in 1.6, so take the square root explicitly.
metrics.mean_squared_error(y_test, y_pred) ** 0.5
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[19], line 1
----> 1 X = dataset.iloc[:, :-1].values
2 y = dataset.iloc[:, 1].values
3 from sklearn.model_selection import train_test_split
NameError: name 'dataset' is not defined
16.4.2.5. Cross-Validation#
# Cross-validate a scaled linear regression on the full salary dataset.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler  # was used without being imported
# Features: every column except the last; target: the column at position 1.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 1].values
# Standardize the features, then fit the linear model, as a single estimator.
preprocessing = make_pipeline(StandardScaler())
reg_pipeline = make_pipeline(preprocessing, LinearRegression())
reg_pipeline.fit(X, y)
y_pred = reg_pipeline.predict(X)
# In-sample RMSE (the model is scored on the data it was fitted on). The
# original referenced an undefined name `pred` and passed `squared=False`,
# which was removed in scikit-learn 1.6.
metrics.mean_squared_error(y, y_pred) ** 0.5
# 10-fold cross-validation. The scorer returns the *negated* RMSE (scikit-learn
# scorers follow a "greater is better" convention), so flip the sign to get
# actual RMSE values.
reg_rmses = -cross_val_score(reg_pipeline, X, y, scoring="neg_root_mean_squared_error", cv=10)
# Summarize the per-fold RMSEs, mirroring the decision-tree cell.
pd.Series(reg_rmses).describe()
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[20], line 1
----> 1 from sklearn.pipeline import make_pipeline
2 X = dataset.iloc[:, :-1].values
3 y = dataset.iloc[:, 1].values
ModuleNotFoundError: No module named 'sklearn'
# Cross-validate a decision tree regressor on the full salary dataset.
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler  # was used without being imported
from sklearn.tree import DecisionTreeRegressor
# Features: every column except the last; target: the column at position 1.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 1].values
preprocessing = make_pipeline(StandardScaler())
tree_pipeline = make_pipeline(preprocessing, DecisionTreeRegressor())
# 10-fold cross-validation. The scorer returns the *negated* RMSE, so flip
# the sign to get actual (non-negative) RMSE values before summarizing.
tree_rmses = -cross_val_score(tree_pipeline, X, y, scoring="neg_root_mean_squared_error", cv=10)
pd.Series(tree_rmses).describe()
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[21], line 1
----> 1 from sklearn.pipeline import make_pipeline
2 X = dataset.iloc[:, :-1].values
3 y = dataset.iloc[:, 1].values
ModuleNotFoundError: No module named 'sklearn'
16.4.3. Classification Example#
# Imports for this cell and for the classification cells that follow.
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Load the iris dataset.
iris = load_iris()

# Column positions of the two features shown on the x and y axes.
x_index = 0
y_index = 1

# Colorbar tick formatter: maps a numeric class id to its species name.
formatter = plt.FuncFormatter(lambda tick, *_: iris.target_names[int(tick)])

plt.figure(figsize=(5, 4))
# One point per sample, colored by class id.
plt.scatter(
    iris.data[:, x_index],
    iris.data[:, y_index],
    c=iris.target,
)
plt.colorbar(ticks=[0, 1, 2], format=formatter)
plt.xlabel(iris.feature_names[x_index])
plt.ylabel(iris.feature_names[y_index])
plt.tight_layout()
plt.show()
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[22], line 1
----> 1 from sklearn.datasets import load_iris
2 from sklearn.model_selection import train_test_split
3 from sklearn import metrics
ModuleNotFoundError: No module named 'sklearn'
# Feature matrix and class labels of the iris dataset.
X = iris.data
Y = iris.target
# Hold out 20% for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test=train_test_split(X, Y, test_size=0.2, random_state=0)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[23], line 1
----> 1 X = iris.data
2 Y = iris.target
3 X_train, X_test, y_train, y_test=train_test_split(X, Y, test_size=0.2, random_state=0)
NameError: name 'iris' is not defined
# Train a Gaussian naive Bayes classifier on the training split and predict
# the classes of the held-out samples.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
# Accuracy on the test split, expressed as a percentage.
100*metrics.accuracy_score(y_test, y_pred)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[24], line 1
----> 1 gnb = GaussianNB()
2 gnb.fit(X_train, y_train)
3 y_pred = gnb.predict(X_test)
NameError: name 'GaussianNB' is not defined