Gaussian Simulations
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal
from matplotlib.colors import ListedColormap
mycolormap = ListedColormap(['#FF0000', '#0000FF'])
Exchoice = 1  # Choose which example to run
##############################################
# Data set generation using scipy.stats.multivariate_normal
#############################################
def generate_data(n1, n2, mu1, cov1, mu2, cov2):
"""
Generates simulated data from two multivariate normal distributions.
Parameters:
- n1: Size of sample 1
- n2: Size of sample 2
- mu1: Mean vector for group 1 (in 2 dimensions)
- cov1: Covariance matrix for group 1 (in 2 dimensions)
- mu2: Mean vector for group 2 (in 2 dimensions)
- cov2: Covariance matrix for group 2 (in 2 dimensions)
Returns:
- X: Concatenated data matrix for both groups (n1 + n2, 2)
- Y: Associated class vector (0 for group 1, 1 for group 2)
"""
# Generate data for each group
xG1 = multivariate_normal(mean=mu1, cov=cov1).rvs(n1)
xG2 = multivariate_normal(mean=mu2, cov=cov2).rvs(n2)
# Concatenate data from both groups
X = np.concatenate((xG1, xG2), axis=0)
# Create class vector (labels)
Y = np.array([0] * n1 + [1] * n2)
return X, Y
n1, n2 = 50, 150
mu1 = [2, 2]             # Mean of group 1
cov1 = [[1, 0], [0, 1]]  # Covariance matrix of group 1 (independent components)
mu2 = [0, 0]             # Mean of group 2
cov2 = [[1, 0], [0, 2]]  # Covariance matrix of group 2 (independent components, different variances)
X, Y = generate_data(n1, n2, mu1, cov1, mu2, cov2)
# Visualize the generated data
plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=mycolormap)
plt.title('Raw data')
plt.grid()
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()
Simulation of example 2 (side-by-side Gaussians), then display of the posterior probability

- $P(X = x \mid Y = 1)$ is a bivariate Gaussian with mean vector $\mu_1 = (\mu_{11}, \mu_{12})$ and covariance matrix $\Sigma_1$.
- The posterior probability $P(Y = 1 \mid X = x)$ can be computed with Bayes' theorem:

$$ P(Y = 1 \mid X = x) = \frac{P(X = x \mid Y = 1)\, P(Y = 1)}{P(X = x)} $$

where

$$ P(X = x) = P(X = x \mid Y = 1)\, P(Y = 1) + P(X = x \mid Y = 0)\, P(Y = 0). $$

This expresses the posterior probability in terms of the class-conditional densities and the prior probabilities.
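Before wrapping this in a class, here is a minimal numerical sketch of the formula, reusing the simulation parameters above with equal priors; the evaluation point is illustrative only and, following the classifier defined below, the mu1 group plays the role of class 1.
import numpy as np
from scipy.stats import multivariate_normal
# Same parameters as in the simulation above; equal priors P(Y=1) = P(Y=0) = 0.5
mu1, cov1 = [2, 2], [[1, 0], [0, 1]]
mu2, cov2 = [0, 0], [[1, 0], [0, 2]]
p1 = p0 = 0.5
x = np.array([1.0, 1.0])                       # illustrative evaluation point
px_y1 = multivariate_normal(mu1, cov1).pdf(x)  # P(X=x | Y=1)
px_y0 = multivariate_normal(mu2, cov2).pdf(x)  # P(X=x | Y=0)
posterior = px_y1 * p1 / (px_y1 * p1 + px_y0 * p0)  # Bayes' theorem
print("P(Y=1 | X=x) =", posterior)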
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal
class Bayes_Classifier:
def __init__(self, mu1, cov1, mu2, cov2, p1=0.5):
"""
Initializes the Bayes Classifier with parameters of two bivariate Gaussian distributions.
Parameters:
- mu1: Mean vector for class 1 (in 2 dimensions)
- cov1: Covariance matrix for class 1 (in 2 dimensions)
- mu2: Mean vector for class 0 (in 2 dimensions)
- cov2: Covariance matrix for class 0 (in 2 dimensions)
- p1: Prior probability of class 1 (default is 0.5)
"""
self.mu1 = mu1
self.cov1 = cov1
self.mu2 = mu2
self.cov2 = cov2
self.p1 = p1
self.p2 = 1 - p1
self.rv1 = multivariate_normal(mu1, cov1, allow_singular=True)
self.rv2 = multivariate_normal(mu2, cov2, allow_singular=True)
def predict_proba(self, X):
"""
Predicts the posterior probability of class 1 for the given data points.
Parameters:
- X: Data points (n_samples, 2)
Returns:
- p1_x: Posterior probability of class 1 for each data point
"""
# Calculate likelihoods P(X | Y=1) and P(X | Y=0)
px_1 = self.rv1.pdf(X)
px_2 = self.rv2.pdf(X)
# Total density P(X=x)
px = px_1 * self.p1 + px_2 * self.p2
# Posterior probability P(Y=1 | X=x) using Bayes' theorem
p1_x = (px_1 * self.p1) / px
return p1_x
def display_posterior_density(X, Y, classifier):
"""
Displays the posterior density P(Y=1 | X=x) using a contour plot.
Parameters:
- X: Data points (n_samples, 2)
- Y: Class labels (n_samples,)
- classifier: Bayes_Classifier object used to predict the posterior probability
"""
# Define the grid for evaluating the posterior probability
min_x1, min_x2 = np.min(X, axis=0) - 1
max_x1, max_x2 = np.max(X, axis=0) + 1
# Generate a grid of 2D points (mesh)
x1, x2 = np.mgrid[min_x1:max_x1:.01, min_x2:max_x2:.01]
pos = np.dstack((x1, x2))
    # Predict the posterior probability for each grid point.
    # Bayes_Classifier returns a 1-D array of P(Y=1 | X=x); scikit-learn estimators
    # return an (n_samples, n_classes) array, in which case we keep the first column
    # (the probability of classifier.classes_[0]).
    p1_x = classifier.predict_proba(pos.reshape(-1, 2))
    if p1_x.ndim > 1 and p1_x.shape[1] > 1:
        p1_x = p1_x[:, 0]
    p1_x = p1_x.reshape(x1.shape)
# Display the posterior density in 2D
plt.contourf(x1, x2, p1_x, levels=20, cmap='coolwarm')
plt.colorbar(label='P(Y=1 | X=x)')
plt.contour(x1, x2, p1_x, levels=[0.5], colors='black', linewidths=2)
    plt.scatter(X[:, 0], X[:, 1], marker='o', c=Y, cmap='coolwarm', label='Data Points')
    # Customize the plot
    plt.title("Posterior Density P(Y=1 | X=x) with Data Points")
    plt.xlabel("x1")
    plt.ylabel("x2")
plt.legend()
# Show the plot
plt.show()
# Example usage
n1, n2 = 200, 200
mu1 = [2, 2]
cov1 = [[1, 0], [0, 1]]
mu2 = [0, 0]
cov2 = [[1, 0], [0, 2]]
X, Y = generate_data(n1, n2, mu1, cov1, mu2, cov2)
classifier = Bayes_Classifier(mu1, cov1, mu2, cov2)
# Display posterior density
display_posterior_density(X, Y, classifier)
My naive Bayes classifier
import numpy as np

class BayesNaifGaussien:
    def __init__(self):
        self.mu_0 = None     # Mean for class 0
        self.mu_1 = None     # Mean for class 1
        self.sigma_0 = None  # Variance for class 0
        self.sigma_1 = None  # Variance for class 1
        self.p_y0 = None     # Prior probability of Y=0
        self.p_y1 = None     # Prior probability of Y=1

    def fit(self, X, Y):
        """Estimates the model parameters from the data X and Y."""
        # Split the data by class
        X_0 = X[Y == 0]  # Class 0 data
        X_1 = X[Y == 1]  # Class 1 data
        # Per-variable means and variances for each class (naive independence assumption)
        self.mu_0 = np.mean(X_0, axis=0)
        self.mu_1 = np.mean(X_1, axis=0)
        self.sigma_0 = np.var(X_0, axis=0)
        self.sigma_1 = np.var(X_1, axis=0)
        # Prior probabilities P(Y=0) and P(Y=1)
        self.p_y0 = len(X_0) / len(X)
        self.p_y1 = len(X_1) / len(X)

    def gaussienne(self, x, mu, sigma):
        """Univariate Gaussian density evaluated element-wise at x."""
        return (1 / np.sqrt(2 * np.pi * sigma)) * np.exp(-0.5 * ((x - mu) ** 2) / sigma)

    def predict_proba(self, X):
        """Computes the posterior probabilities P(Y=1 | X) for each observation in X."""
        proba_y1_given_x = []
        for x in X:
            # Compute P(X | Y=1) and P(X | Y=0) as products over the variables
            p_x_given_y1 = np.prod(self.gaussienne(x, self.mu_1, self.sigma_1))
            p_x_given_y0 = np.prod(self.gaussienne(x, self.mu_0, self.sigma_0))
            # Total density P(X=x)
            p_x = p_x_given_y1 * self.p_y1 + p_x_given_y0 * self.p_y0
            # Posterior probability P(Y=1 | X=x) (Bayes' theorem)
            p_y1_x = (p_x_given_y1 * self.p_y1) / p_x
            proba_y1_given_x.append(p_y1_x)
        return np.array(proba_y1_given_x)

    def predict(self, X):
        """Predicts the class (0 or 1) of each observation from P(Y=1 | X)."""
        proba_y1 = self.predict_proba(X)
        return (proba_y1 >= 0.5).astype(int)
# Example usage:
# Generate random data (2 variables)
# np.random.seed(42)
# X = np.random.randn(100, 2)            # 100 observations, 2 variables
# Y = np.random.choice([0, 1], size=100) # Random classes 0 or 1
# Initialize and train the model
model = BayesNaifGaussien()
model.fit(X, Y)
# Predict the posterior probabilities P(Y=1 | X)
proba_y1_given_x = model.predict_proba(X)
# Print the probabilities and the model parameters
print("Probabilities P(Y=1 | X):", proba_y1_given_x)
print("Mean class 0:", model.mu_0)
print("Mean class 1:", model.mu_1)
print("Variance class 0:", model.sigma_0)
print("Variance class 1:", model.sigma_1)
# Report the performance
error = (model.predict(X) != Y).sum() / len(Y)
print("Empirical error", error)
Probabilities P(Y=1 | X): [4.77129432e-02 1.88198612e-01 1.19806441e-02 ... 8.42356808e-01 9.95216255e-01]
Mean class 0: [2.04616058 1.91278979]
Mean class 1: [-0.07510028 -0.14203197]
Variance class 0: [0.97936836 0.97850395]
Variance class 1: [0.89293017 2.05696981]
Empirical error 0.0675
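As a sanity check of the hand-written implementation, a minimal sketch (assuming X, Y and the fitted model above are still in memory) comparing its posterior probabilities with scikit-learn's GaussianNB; the two should agree up to GaussianNB's tiny variance smoothing.
from sklearn.naive_bayes import GaussianNB
import numpy as np
# Fit scikit-learn's Gaussian naive Bayes on the same data
sk_gnb = GaussianNB().fit(X, Y)
# Column 1 of predict_proba is P(Y=1 | X); compare with our predict_proba
diff = np.abs(sk_gnb.predict_proba(X)[:, 1] - model.predict_proba(X))
print("Max absolute difference:", diff.max())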
Training and test error for the naive Bayes classifier
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=1/3, random_state=3)
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnbmod = gnb.fit(X_train, y_train)
y_pred_test = gnbmod.predict(X_test)
y_pred_train = gnbmod.predict(X_train)
E_test=(y_test != y_pred_test).sum()/len(y_test)
E_train=(y_train != y_pred_train).sum()/len(y_train)
print("Error on the test set=",E_test)
print("Error on the train set=",E_train)
Error on the test set= 0.05970149253731343
Error on the train set= 0.07142857142857142
ROC curve and AUC under 6-fold stratified cross-validation
# MM 2022 study
y = Y
#############################################
# Cross-validated ROC curves
############################################
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.metrics import auc
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import StratifiedKFold
n_samples, n_features = X.shape
# Run classifier with cross-validation and plot ROC curves
cv = StratifiedKFold(n_splits=6)
classifier = GaussianNB()
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)
fig, ax = plt.subplots()
for i, (train, test) in enumerate(cv.split(X, y)):
    y_train = y[train]
    y_test = y[test]
    classifier.fit(X[train], y_train)
    viz = RocCurveDisplay.from_estimator(
        classifier,                    # the fitted model
        X[test],                       # test data for this fold
        y_test,                        # test labels for this fold
        name='ROC fold {}'.format(i),  # name of the curve for each fold
        alpha=0.3,                     # transparency
        lw=1,                          # line width
        ax=ax                          # axes on which to draw the curve
    )
interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
interp_tpr[0] = 0.0
tprs.append(interp_tpr)
aucs.append(viz.roc_auc)
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
label='Chance', alpha=.8)
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(mean_fpr, mean_tpr, color='b',
label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
lw=2, alpha=.8)
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
label=r'$\pm$ 1 std. dev.')
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
title="Receiver operating characteristic example")
ax.legend(loc="lower right")
plt.show()
Linear discriminant analysis
# Example usage
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score
# Generate data
n1, n2 = 200, 200
mu1 = [2, 2]
cov1 = [[1, 0], [0, 1]]
mu2 = [0, 0]
cov2 = [[1, 0], [0, 2]]
X, Y = generate_data(n1, n2, mu1, cov1, mu2, cov2)
# Split the data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
# Train LDA model
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, Y_train)
# Predict on test set
Y_pred = lda.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(Y_test, Y_pred)
print(f"LDA Classification Accuracy: {accuracy:.2f}")
# Display posterior density
display_posterior_density(X, Y, lda)
LDA Classification Accuracy: 0.93
We can see the linear decision boundary, which follows from LDA's assumption that both classes share the same covariance matrix; the sketch below reads the boundary equation off the fitted model.
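A minimal sketch (assuming the lda object fitted above) printing that linear boundary:
# For a binary LDA, the decision boundary is the line coef_ . x + intercept_ = 0
w = lda.coef_[0]
b = lda.intercept_[0]
print(f"Boundary: {w[0]:.3f}*x1 + {w[1]:.3f}*x2 + {b:.3f} = 0")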
Quadratic discriminant analysis

The covariance matrices are now left unconstrained, so the decision boundary is a quadratic curve, close to the boundary obtained with the naive Bayes classifier; the short derivation below makes the curve explicit for the simulation parameters.
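To make the curved boundary explicit, here is a short derivation with the simulation parameters $\mu_1 = (2, 2)$, $\Sigma_1 = I$, $\mu_2 = (0, 0)$, $\Sigma_2 = \mathrm{diag}(1, 2)$ and equal priors. The boundary is the set of points where the two class log-densities are equal:

$$ -\tfrac{1}{2}\left[(x_1 - 2)^2 + (x_2 - 2)^2\right] = -\tfrac{1}{2}\log 2 - \tfrac{1}{2}\left[x_1^2 + \tfrac{x_2^2}{2}\right], $$

which simplifies to

$$ x_1 = \frac{x_2^2}{8} - x_2 + 2 - \frac{\log 2}{4}. $$

The $x_1^2$ terms cancel, so the exact boundary is a parabola; QDA estimates this quadratic curve from the training sample.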
# Example usage with QDA
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score
# Generate data
n1, n2 = 200, 200
mu1 = [2, 2]
cov1 = [[1, 0], [0, 1]]
mu2 = [0, 0]
cov2 = [[1, 0], [0, 2]]
X, Y = generate_data(n1, n2, mu1, cov1, mu2, cov2)
# Split the data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
# Train QDA model
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, Y_train)
# Predict on test set
Y_pred = qda.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(Y_test, Y_pred)
print(f"QDA Classification Accuracy: {accuracy:.2f}")
# Display posterior density
display_posterior_density(X, Y, qda)
QDA Classification Accuracy: 0.93
k-nearest neighbors
# Example usage with KNN
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# Generate data
n1, n2 = 200, 200
mu1 = [2, 2]
cov1 = [[1, 0], [0, 1]]
mu2 = [0, 0]
cov2 = [[1, 0], [0, 2]]
X, Y = generate_data(n1, n2, mu1, cov1, mu2, cov2)
# Split the data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
# Train KNN model
knn = KNeighborsClassifier(n_neighbors=5) # You can adjust the number of neighbors (k)
knn.fit(X_train, Y_train)
# Predict on test set
Y_pred = knn.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(Y_test, Y_pred)
print(f"KNN Classification Accuracy: {accuracy:.2f}")
# Display posterior density
display_posterior_density(X, Y, knn)
KNN Classification Accuracy: 0.85
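The value k = 5 above was fixed arbitrarily; a minimal sketch (hypothetical, reusing the X_train, Y_train split above) for choosing k by cross-validation with GridSearchCV:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
# Search a small grid of neighborhood sizes with 5-fold cross-validation
param_grid = {'n_neighbors': [1, 3, 5, 7, 9, 11, 15, 21]}
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid.fit(X_train, Y_train)
print("Best k:", grid.best_params_['n_neighbors'])
print("Cross-validated accuracy:", grid.best_score_)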
Logistic regression
# Example usage with Logistic Regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Generate data
n1, n2 = 200, 200
mu1 = [2, 2]
cov1 = [[1, 0], [0, 1]]
mu2 = [0, 0]
cov2 = [[1, 0], [0, 2]]
X, Y = generate_data(n1, n2, mu1, cov1, mu2, cov2)
# Split the data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
# Train Logistic Regression model
log_reg = LogisticRegression()
log_reg.fit(X_train, Y_train)
# Predict on test set
Y_pred = log_reg.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Logistic Regression Classification Accuracy: {accuracy:.2f}")
# Display posterior density
display_posterior_density(X, Y, log_reg)
Logistic Regression Classification Accuracy: 0.88
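Logistic regression models the posterior directly through a linear log-odds. A minimal sketch (assuming the log_reg model fitted above) printing the estimated relationship:
# Logistic regression assumes log( P(Y=1|x) / P(Y=0|x) ) = b0 + b1*x1 + b2*x2
b0 = log_reg.intercept_[0]
b1, b2 = log_reg.coef_[0]
print(f"log-odds(Y=1 | x) = {b0:.3f} + {b1:.3f}*x1 + {b2:.3f}*x2")
# The posterior at a point x is then 1 / (1 + exp(-(b0 + b1*x1 + b2*x2)))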
Comparing the methods by AUC and classification error

Note the large differences between the training and the test set.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.datasets import make_classification
# Generate simulated data
n1, n2 = 200, 200
X, Y = make_classification(n_samples=n1 + n2, n_features=2, n_informative=2, n_redundant=0, random_state=42)
# Split the data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
# Initialize the models
models = {
'Naive Bayes': GaussianNB(),
'LDA': LinearDiscriminantAnalysis(),
'Logistic Regression': LogisticRegression(),
'QDA': QuadraticDiscriminantAnalysis(),
'KNN': KNeighborsClassifier(n_neighbors=5)
}
# Function computing the AUC with 10-fold cross-validation, separately on the train and the test set
def calculate_auc_cv(model, X_train, Y_train, X_test, Y_test):
    # The built-in "roc_auc" scorer ranks the predicted probabilities (or decision scores),
    # which is what the AUC requires; make_scorer(roc_auc_score) would score hard 0/1 predictions
    train_scores = cross_val_score(model, X_train, Y_train, cv=10, scoring="roc_auc")
    test_scores = cross_val_score(model, X_test, Y_test, cv=10, scoring="roc_auc")
    return train_scores, test_scores
# Store the AUC results
results_train = []
results_test = []
model_names = []
# Compute the AUC for each model
for name, model in models.items():
train_auc, test_auc = calculate_auc_cv(model, X_train, Y_train, X_test, Y_test)
results_train.append(train_auc)
results_test.append(test_auc)
model_names.append(name)
# Create the boxplots of the train and test AUCs
fig, ax = plt.subplots(figsize=(12, 6))
# Side-by-side positions for the boxplots
positions_train = np.arange(1, len(models) * 2, 2)     # positions of the train boxplots
positions_test = np.arange(2, len(models) * 2 + 1, 2)  # positions of the test boxplots
# Boxplot of the AUC on the training data (blue)
bp_train = ax.boxplot(results_train, positions=positions_train, widths=0.6, patch_artist=True,
                      boxprops=dict(facecolor="lightblue"), medianprops=dict(color="blue"))
# Boxplot of the AUC on the test data (orange)
bp_test = ax.boxplot(results_test, positions=positions_test, widths=0.6, patch_artist=True,
                     boxprops=dict(facecolor="orange"), medianprops=dict(color="red"))
# Add a legend for the boxplots
ax.legend([bp_train["boxes"][0], bp_test["boxes"][0]], ['Train', 'Test'], loc='upper right')
# Adjust the labels and the title
ax.set_xticks(np.arange(1.5, len(models) * 2, 2))
ax.set_xticklabels(model_names)
ax.set_title("Model comparison - AUC on train and test (10-fold CV)")
ax.set_ylabel("AUC Score")
plt.tight_layout()
plt.show()
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import make_scorer
from sklearn.datasets import make_classification
# Generate simulated data
n1, n2 = 200, 200
X, Y = make_classification(n_samples=n1 + n2, n_features=2, n_informative=2, n_redundant=0, random_state=42)
# Split the data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
# Initialize the models
models = {
'Naive Bayes': GaussianNB(),
'LDA': LinearDiscriminantAnalysis(),
'Logistic Regression': LogisticRegression(),
'QDA': QuadraticDiscriminantAnalysis(),
'KNN': KNeighborsClassifier(n_neighbors=5)
}
# Function computing the classification error with 10-fold cross-validation, separately on the train and the test set
def calculate_error_cv(model, X_train, Y_train, X_test, Y_test):
    scorer = make_scorer(lambda y_true, y_pred: 1 - np.mean(y_true == y_pred))  # misclassification rate
    train_errors = cross_val_score(model, X_train, Y_train, cv=10, scoring=scorer)
    test_errors = cross_val_score(model, X_test, Y_test, cv=10, scoring=scorer)
    return train_errors, test_errors
# Store the error results
results_train = []
results_test = []
model_names = []
# Compute the error for each model
for name, model in models.items():
train_error, test_error = calculate_error_cv(model, X_train, Y_train, X_test, Y_test)
results_train.append(train_error)
results_test.append(test_error)
model_names.append(name)
# Create the boxplots of the train and test errors
fig, ax = plt.subplots(figsize=(12, 6))
# Side-by-side positions for the boxplots
positions_train = np.arange(1, len(models) * 2, 2)     # positions of the train boxplots
positions_test = np.arange(2, len(models) * 2 + 1, 2)  # positions of the test boxplots
# Boxplot of the error on the training data (blue)
bp_train = ax.boxplot(results_train, positions=positions_train, widths=0.6, patch_artist=True,
                      boxprops=dict(facecolor="lightblue"), medianprops=dict(color="blue"))
# Boxplot of the error on the test data (orange)
bp_test = ax.boxplot(results_test, positions=positions_test, widths=0.6, patch_artist=True,
                     boxprops=dict(facecolor="orange"), medianprops=dict(color="red"))
# Add a legend for the boxplots
ax.legend([bp_train["boxes"][0], bp_test["boxes"][0]], ['Train', 'Test'], loc='upper right')
# Adjust the labels and the title
ax.set_xticks(np.arange(1.5, len(models) * 2, 2))
ax.set_xticklabels(model_names)
ax.set_title("Model comparison - Classification error on train and test (10-fold CV)")
ax.set_ylabel("Classification error")
plt.tight_layout()
plt.show()