To train the model, the data is split into two sets: a train set and a test set.
########
import numpy as np                 # needed later for np.arange / np.linspace
import matplotlib.pyplot as plt    # needed for the plotting calls below
from sklearn.datasets import load_iris

iris = load_iris()
X = iris.data
y = iris.target
print(X.shape)
#plt.scatter(X[:,0],X[:,1],c=y,alpha=0.8)
#plt.show()
from sklearn.model_selection import train_test_split
# The model must be evaluated on data it has never seen
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=1)
model.fit(X_train,y_train)
score = model.score(X_train,y_train)
print("Train",score)
score = model.score(X_test,y_test)
print("Test",score)
# With n_neighbors=1 the train score is typically perfect (each training point is its
# own nearest neighbor) while the test score is lower: a sign of overfitting
# Improve the result by tuning the hyperparameters
# Test set - Validation set - Train set
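# A minimal sketch of an explicit hold-out validation set; the names X_tr, X_val,
# y_tr, y_val and val_model are introduced here for illustration and are not part
# of the original code. The train set is simply split a second time.
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2)
val_model = KNeighborsClassifier(n_neighbors=1).fit(X_tr, y_tr)
print("Validation", val_model.score(X_val, y_val))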
# Cross-validation
from sklearn.model_selection import cross_val_score
cross = cross_val_score(KNeighborsClassifier(),X_train,y_train,cv=5,scoring='accuracy')
print(cross)
# Average the 5 fold scores to compare models on a single number
cross = cross_val_score(KNeighborsClassifier(),X_train,y_train,cv=5,scoring='accuracy').mean()
print(cross)
# Vary the hyperparameter n_neighbors
val_scores = []
for k in range(1, 50):
    score = cross_val_score(KNeighborsClassifier(n_neighbors=k), X_train, y_train, cv=5, scoring='accuracy').mean()
    val_scores.append(score)
#plt.plot(val_scores)
#plt.show()
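# Sketch: pick the k with the highest averaged cross-validation score. The name
# best_k is introduced here for illustration and is not in the original code.
best_k = np.argmax(val_scores) + 1   # +1 because the loop starts at k=1
print("best k from the loop:", best_k)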
# Same idea without an explicit for loop
# Works with a single hyperparameter
from sklearn.model_selection import validation_curve
model = KNeighborsClassifier()
k = np.arange(1,50)
train_score, val_score = validation_curve(model, X_train, y_train,
                                          param_name='n_neighbors', param_range=k, cv=5)
print(val_score.mean(axis=1))
#plt.plot(k,val_score.mean(axis=1),label="validation")
#plt.plot(k,train_score.mean(axis=1),label="train")
#plt.ylabel("score")
#plt.xlabel("n_neighbors")
#plt.legend()
#plt.show()
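# Same selection idea applied to the validation_curve output; best_k_curve is an
# illustrative name, not from the original code.
best_k_curve = k[val_score.mean(axis=1).argmax()]
print("best n_neighbors from validation_curve:", best_k_curve)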
# GridSearchCV
# Works with n hyperparameters
from sklearn.model_selection import GridSearchCV
param_grid = {'n_neighbors': np.arange(1, 20),
              'metric': ['euclidean', 'manhattan']}
grid = GridSearchCV(KNeighborsClassifier(),param_grid=param_grid,cv=5)
grid.fit(X_train,y_train)
# Best cross-validation score
print(grid.best_score_)
# Best hyperparameters
print(grid.best_params_)
# Keep the best model found by the grid search
model = grid.best_estimator_
# Evaluate on the held-out test data
print(model.score(X_test,y_test))
# Another evaluation method: the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, model.predict(X_test))
# Returns a matrix showing where the classification errors are (rows = true classes, columns = predicted classes)
print(cm)
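# As a complementary metric, sklearn's classification_report gives precision,
# recall and F1 per class (a standard sklearn function, shown here as a sketch).
from sklearn.metrics import classification_report
print(classification_report(y_test, model.predict(X_test)))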
# Learning curves
# Check whether the model learns better when given more data
from sklearn.model_selection import learning_curve
N,train_score,val_score = learning_curve(model,X_train,y_train,train_sizes=np.linspace(0.2,1.0,5),cv=5)
print(N)
plt.plot(N, train_score.mean(axis=1), label='train')
plt.plot(N, val_score.mean(axis=1), label='validation')
plt.xlabel('train_sizes (number of training samples)')
plt.legend()
plt.show()
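# Interpretation note (not from the original): if both curves plateau and converge,
# adding more data is unlikely to help much; a persistent gap between the train and
# validation curves usually indicates overfitting.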