# coding: utf-8
"""Face recognition with eigenfaces (PCA) and an RBF-kernel SVM.

Steps performed by :func:`main`:

1. Load the "Labeled Faces in the Wild" people dataset, keeping only people
   with at least 70 pictures and resizing every picture by a factor of 0.4.
2. Split the samples into a training set (75%) and a test set (25%).
3. Fit a whitened, randomized PCA with 150 components on the training set
   and project both sets onto it.
4. Grid-search the ``C`` and ``gamma`` parameters of an RBF-kernel SVM on the
   projected training data.
5. Print a classification report and a confusion matrix for the test set,
   then plot a gallery of predictions and a gallery of eigenfaces.
"""
import logging  # progress messages on stdout
from time import time  # used to time the individual steps


def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    """Plot a gallery of portraits on a ``n_row`` x ``n_col`` grid.

    Args:
        images: Indexable collection of images; each item must be
            reshapeable to ``(h, w)``.
        titles: Indexable collection of titles, one per image.
        h: Image height in pixels.
        w: Image width in pixels.
        n_row: Number of rows in the grid.
        n_col: Number of columns in the grid.
    """
    # Imported locally so the module can be imported without matplotlib.
    import matplotlib.pyplot as plt

    # Create the figure that serves as the background for all sub-plots.
    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    # Never index past the data actually supplied: the grid may be larger
    # than the number of available images or titles.
    n_plots = min(n_row * n_col, len(images), len(titles))
    for i in range(n_plots):
        plt.subplot(n_row, n_col, i + 1)
        plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
        plt.title(titles[i], size=12)
        plt.xticks(())
        plt.yticks(())


def title(y_pred, y_test, target_names, i):
    """Build the plot title comparing predicted and true name of sample *i*.

    Only the last whitespace-separated word of each name is shown
    (e.g. ``"George W Bush"`` becomes ``"Bush"``).

    Args:
        y_pred: Predicted class indices.
        y_test: True class indices.
        target_names: Class index -> full name lookup.
        i: Index of the sample to describe.

    Returns:
        A two-line string ``'predicted: <name>\\ntrue: <name>'``.
    """
    pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1]
    true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
    return 'predicted: %s\ntrue: %s' % (pred_name, true_name)


def main():
    """Run the full eigenfaces + SVM face-recognition pipeline."""
    # Third-party dependencies are imported here so that the helper
    # functions above stay importable without them being installed.
    import matplotlib.pyplot as plt
    from sklearn.datasets import fetch_lfw_people
    # RandomizedPCA was removed from scikit-learn; PCA with
    # svd_solver='randomized' is its replacement.
    from sklearn.decomposition import PCA
    from sklearn.metrics import classification_report
    from sklearn.metrics import confusion_matrix
    # sklearn.cross_validation and sklearn.grid_search were removed; both
    # utilities now live in sklearn.model_selection.
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC

    print(__doc__)

    # Display progress logs on stdout.
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

    # Download the dataset (if not cached yet) and load it.
    lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

    # --- Data inspection -------------------------------------------------
    # Number of images and their height / width.
    n_samples, h, w = lfw_people.images.shape

    # X is the feature matrix: one row per sample, one column per feature.
    X = lfw_people.data
    n_features = X.shape[1]

    # y holds the class label (person identity) of every sample.
    y = lfw_people.target
    target_names = lfw_people.target_names
    n_classes = target_names.shape[0]

    print("Total dataset size:")
    print("n_samples:%d" % n_samples)
    print("n_features:%d" % n_features)
    print("n_classes:%d" % n_classes)

    # --- Train / test split ---------------------------------------------
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25)

    # --- Dimensionality reduction ---------------------------------------
    # The raw feature dimension is high, so project onto the top principal
    # components ("eigenfaces") before classifying.
    n_components = 150

    print("Extracting the top %d eigenfaces from %d faces"
          % (n_components, X_train.shape[0]))
    t0 = time()
    pca = PCA(n_components=n_components, svd_solver='randomized',
              whiten=True).fit(X_train)
    print("done in %0.3fs" % (time() - t0))

    # Each principal component reshaped to image size is one eigenface.
    eigenfaces = pca.components_.reshape((n_components, h, w))

    print("Projecting the input data on the eigenfaces orthonormal basis")
    t0 = time()
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    print("done in %0.3fs" % (time() - t0))

    # --- Classifier training --------------------------------------------
    print("Fitting the classifier to the training set")
    t0 = time()
    # C is the SVM regularization parameter and gamma the RBF kernel
    # coefficient. The best values are not known in advance, so every
    # combination below is tried and the best-scoring one is kept.
    param_grid = {
        'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
    }
    # class_weight='auto' was removed from scikit-learn; 'balanced' is the
    # replacement.
    clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'),
                       param_grid)
    clf = clf.fit(X_train_pca, y_train)
    print("done in %0.3fs" % (time() - t0))
    print("Best estimator found by grid search:")
    print(clf.best_estimator_)

    # --- Evaluation on the test set -------------------------------------
    print("Predicting people's names on the test set")
    t0 = time()
    y_pred = clf.predict(X_test_pca)
    print("done in %0.3fs" % (time() - t0))

    print(classification_report(y_test, y_pred, target_names=target_names))
    print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))

    # --- Visualization ---------------------------------------------------
    # Gallery of test faces titled with predicted vs. true name.
    prediction_titles = [title(y_pred, y_test, target_names, i)
                         for i in range(y_pred.shape[0])]
    plot_gallery(X_test, prediction_titles, h, w)

    # Gallery of the leading eigenfaces, i.e. what the extracted components
    # look like when rendered as images.
    eigenface_titles = ['eigenface %d' % i
                        for i in range(eigenfaces.shape[0])]
    plot_gallery(eigenfaces, eigenface_titles, h, w)

    plt.show()


if __name__ == "__main__":
    main()