from cProfile import label
import imp
from re import L
from sklearn.datasets import make_regression
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import PolynomialFeatures
from cProfile import label
import imp
from re import L
from sklearn.datasets import make_regression
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import PolynomialFeatures

#Pre-processing Mettre en forme les données
#Les données sont triées, normalisées

#Encodage Transformer des données qualitatives sous la forme de mot en valeur numérique
#Normalisation (même échelle)
#Imputation
#Sélection
#Extraction

#Les fonctions Transformer de sklearn

#1. Encodage
#Encodage LabelEncoder et LabelBinarizer
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, OrdinalEncoder, OneHotEncoder
# Label (target) vectors are encoded with LabelEncoder / LabelBinarizer.
y = np.array(['chat', 'chien', 'chat', 'oiseau'])

# LabelEncoder maps each class name to an integer code.
label_enc = LabelEncoder()
label_enc.fit_transform(y)

# inverse_transform recovers the original class names from integer codes.
label_enc.inverse_transform(np.array([0, 0, 2]))

# LabelBinarizer produces a binary indicator (one-hot) matrix instead.
binarizer_enc = LabelBinarizer()
binarizer_enc.fit_transform(y)


# Feature matrices (X) are encoded with OrdinalEncoder / OneHotEncoder.
X = np.array([['chat', 'poils'],
              ['chien', 'poils'],
              ['chat', 'poils'],
              ['oiseau', 'plumes']])

# OrdinalEncoder maps each category of each column to an integer code.
encoder = OrdinalEncoder()
encoder.fit_transform(X)

# Ordinal codes impose an artificial ordering between categories, which
# can bias the model; one-hot encoding avoids that.
# FIX: the `sparse` keyword was deprecated in scikit-learn 1.2 and removed
# in 1.4 -- the current name is `sparse_output`.
encoder = OneHotEncoder(sparse_output=False)
encoder.fit_transform(X)


#2. Normalisation
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data

# MinMaxScaler rescales every feature into the [0, 1] range.
# Sensitive to outliers (one extreme value squashes everything else).
minmax = MinMaxScaler()
X_mm = minmax.fit_transform(X)

# Overlay raw vs. scaled values of the last two iris features.
plt.scatter(X[:, 2], X[:, 3])
plt.scatter(X_mm[:, 2], X_mm[:, 3])


# StandardScaler centers each feature (mean 0) and scales to unit
# variance; it is also sensitive to outliers.
std = StandardScaler()
X_std = std.fit_transform(X)

plt.scatter(X[:, 2], X[:, 3])
plt.scatter(X_std[:, 2], X_std[:, 3])


# RobustScaler uses the median and IQR, making it resistant to outliers.
robust = RobustScaler()
X_rob = robust.fit_transform(X)

plt.scatter(X[:, 2], X[:, 3])
plt.scatter(X_rob[:, 2], X_rob[:, 3])


#3. Polynomial Features
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
# Build a noisy non-linear 1-D dataset: y = x^2 + 5*cos(x) + gaussian noise.
m = 100
X = np.linspace(0, 4, m).reshape((m, 1))
y = X**2 + 5*np.cos(X) + np.random.randn(m, 1)

# A plain linear model underfits this curve.
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)

plt.scatter(X, y)
plt.plot(X, y_pred, c='r', lw=3)


# Expanding X with polynomial terms (degree 3) lets the same linear
# model capture the curvature.
poly = PolynomialFeatures(3)
X_poly = poly.fit_transform(X)

model = LinearRegression()
model.fit(X_poly, y)
y_pred = model.predict(X_poly)

plt.scatter(X, y)
plt.plot(X, y_pred, c='r', lw=3)


#4. Discretisation
from sklearn.preprocessing import Binarizer, KBinsDiscretizer
X = np.linspace(0, 5, 10).reshape((10, 1))

# Binarizer: 0 below the threshold, 1 above; hstack pairs each value
# with its binary code for display.
np.hstack((X, Binarizer(threshold=3).fit_transform(X)))

# KBinsDiscretizer splits the range into 6 bins; the default encoding
# is one-hot and sparse, hence the toarray() to view it densely.
KBinsDiscretizer(n_bins=6).fit_transform(X).toarray()


#5. Pipelines
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
X = iris.data
y = iris.target

# Hold out a test split, then chain scaling + classifier into a single
# estimator so the scaler is fit only on the training data.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
model = Pipeline([('scaler', StandardScaler()),
                  ('classifier', SGDClassifier())])

model.fit(X_train, y_train)
model.score(X_test, y_test)


from sklearn.model_selection import GridSearchCV
# Tune the pipeline with a cross-validated grid search. Parameter keys
# follow the '<step>__<param>' convention; the step names come from
# make_pipeline (lower-cased class names), so the key strings must match.
model = make_pipeline(PolynomialFeatures(),
                      StandardScaler(),
                      SGDClassifier(random_state=0))

params = {
    'polynomialfeatures__degree': [2, 3, 4],
    'sgdclassifier__penalty': ['l1', 'l2'],
}

grid = GridSearchCV(model, param_grid=params, cv=4)
grid.fit(X_train, y_train)
print(grid.best_params_)

score = grid.score(X_test, y_test)
print(score)
# (Open In Colab)
# Composite pipelines
# Example using make_column_selector and make_column_transformer

#pipeline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.impute import SimpleImputer
import seaborn as sns

# Load the Titanic dataset and separate the target from the features.
titanic = sns.load_dataset('titanic')
print(titanic)

y = titanic['survived']
X = titanic.drop('survived', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Column groups routed to dedicated preprocessing sub-pipelines.
numerical_features = ['pclass', 'age', 'fare']
categorical_features = ['sex', 'deck', 'alone']

# Numeric columns: impute missing values with the mean, then standardize.
numerical_pipeline = make_pipeline(SimpleImputer(strategy='mean'),
                                   StandardScaler())
# Categorical columns: impute with the most frequent value, then one-hot.
# FIX: handle_unknown='ignore' so a category present only in the test
# split (possible after the random split) does not crash score()/predict().
categorical_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'),
                                     OneHotEncoder(handle_unknown='ignore'))

# Apply each sub-pipeline to its columns, then feed the classifier.
preprocessor = make_column_transformer((numerical_pipeline, numerical_features),
                                       (categorical_pipeline, categorical_features))
model = make_pipeline(preprocessor, SGDClassifier())

model.fit(X_train, y_train)
score = model.score(X_test, y_test)
print(score)





#Source = https://www.youtube.com/watch?v=OGWwzm304Xs

# (Website scrape residue, kept as a comment so the file parses:)
# En poursuivant votre navigation sur mon site, vous acceptez l'utilisation des Cookies et autres traceurs pour realiser des statistiques de visites et enregistrer sur votre machine vos activites pedagogiques. En savoir plus.