from cProfile import label
import imp
from re import L
from sklearn.datasets import make_regression
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import PolynomialFeatures
from cProfile import label
import imp
from re import L
from sklearn.datasets import make_regression
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import PolynomialFeatures
#Pre-processing Mettre en forme les données
#Les données sont triées, normalisées
#Encodage Transformer des données qualitatives sous la forme de mot en valeur numérique
#Normalisation (même échelle)
#Imputation
#Sélection
#Extration
#Les fonctions Transformer de sklearn
#1. Encodage
#Encodage LabelEncoder et LabelBinarizer
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, OrdinalEncoder, OneHotEncoder
y = np.array(['chat', 'chien', 'chat', 'oiseau'])
encoder = LabelEncoder()
encoder.fit_transform(y)
encoder.inverse_transform(np.array([0, 0, 2]))
encoder = LabelBinarizer()
encoder.fit_transform(y)
#Encodage Ordinal et Encodage OneHot
X = np.array([['chat', 'poils'],
['chien', 'poils'],
['chat', 'poils'],
['oiseau', 'plumes']])
encoder = OrdinalEncoder()
encoder.fit_transform(X)
#pour ne pas pénaliser le modèle, il faut utiliser
#l'encodage ci-dessous
encoder = OneHotEncoder(sparse=False)
encoder.fit_transform(X)
#2. Normalisation
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
#MinMaxScaler compris entre 0 et 1
#sensinle aux valeurs abérantes (out layer)
X_minmax = MinMaxScaler().fit_transform(X)
plt.scatter(X[:, 2], X[:, 3])
plt.scatter(X_minmax[:, 2], X_minmax[:, 3])
#StandardScaler sensinle aux valeurs abérantes
X_stdscl = StandardScaler().fit_transform(X)
plt.scatter(X[:, 2], X[:, 3])
plt.scatter(X_stdscl[:, 2], X_stdscl[:, 3])
#RobustScaler
X_robust = RobustScaler().fit_transform(X)
plt.scatter(X[:, 2], X[:, 3])
plt.scatter(X_robust[:, 2], X_robust[:, 3])
#3. Polynomial Features
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
m = 100
X = np.linspace(0, 4, m).reshape((m, 1))
y = X**2 + 5*np.cos(X) + np.random.randn(m, 1)
model = LinearRegression().fit(X, y)
y_pred = model.predict(X)
plt.scatter(X, y)
plt.plot(X, y_pred, c='r', lw=3)
X_poly = PolynomialFeatures(3).fit_transform(X)
model = LinearRegression().fit(X_poly, y)
y_pred = model.predict(X_poly)
plt.scatter(X, y)
plt.plot(X, y_pred, c='r', lw=3)
#4. Discretisation
from sklearn.preprocessing import Binarizer, KBinsDiscretizer
X = np.linspace(0, 5, 10).reshape((10, 1))
np.hstack((X, Binarizer(threshold=3).fit_transform(X)))
KBinsDiscretizer(n_bins=6).fit_transform(X).toarray()
#5. Pipelines
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
model = make_pipeline(StandardScaler(), SGDClassifier())
model.fit(X_train, y_train)
model.score(X_test, y_test)
from sklearn.model_selection import GridSearchCV
model = make_pipeline(PolynomialFeatures(),
StandardScaler(),
SGDClassifier(random_state=0))
params = {
'polynomialfeatures__degree':[2, 3, 4],
'sgdclassifier__penalty':['l1', 'l2']
}
grid = GridSearchCV(model, param_grid=params, cv=4)
grid.fit(X_train, y_train)
print(grid.best_params_)
score = grid.score(X_test,y_test)
print(score)
Open In Colab
Pipelines Composites
Exemple d'utilisation de make_coloumn_selector et make_column_transformer
#pipeline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.impute import SimpleImputer
import seaborn as sns
titanic = sns.load_dataset('titanic')
print(titanic)
y = titanic['survived']
X = titanic.drop('survived', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
numerical_features = ['pclass', 'age', 'fare']
categorical_features = ['sex', 'deck', 'alone']
numerical_pipeline = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())
categorical_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder())
preprocessor = make_column_transformer((numerical_pipeline, numerical_features),
(categorical_pipeline, categorical_features))
model = make_pipeline(preprocessor, SGDClassifier())
model.fit(X_train, y_train)
score = model.score(X_test, y_test)
print(score)
#Source = https://www.youtube.com/watch?v=OGWwzm304Xs