I wrote a simple multi-class classification program: given a dataset, it runs 10-fold cross-validation on it and outputs the loss as well as the classification results.
The most important function is def do_cross_validation(dict_feature_list, y_list, num_features, num_fold, clf). Its parameters are:
dict_feature_list is a list of Python dicts; each element of the list represents one sample (e.g., a document), and each <k, v> pair in a dict maps a feature k to its value v (a toy example of this format follows the list).
y_list holds the sample labels.
num_features is the dimensionality of the dataset.
num_fold is the number of cross-validation folds; 10 means 10-fold cross-validation.
clf is the classification algorithm (the classifier object to be trained).
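For concreteness, here is a tiny made-up input in this format (the names mirror the function's parameters; three samples, three features, two classes, which is far too small for a real 10-fold run, but it shows the shape of the data):

dict_feature_list = [{0: 1.0, 2: 0.5},    # sample 0: feature 0 = 1.0, feature 2 = 0.5
                     {1: 2.0},            # sample 1: only feature 1 is non-zero
                     {0: 0.3, 1: 1.5}]    # sample 2
y_list = [0, 1, 0]    # one label per sample
num_features = 3      # features are indexed 0, 1, 2
num_fold = 10         # 10-fold needs at least 10 samples per class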
For the cross-validation itself, the key line is skf = StratifiedKFold(y, n_folds=num_fold, shuffle=False, random_state=None). This method splits the dataset in a stratified fashion according to the class distribution, so that the class distribution within each fold stays as close as possible to that of the original dataset. After all, a trained model is best applied to data that follows the same distribution as its training data. A for loop, for i, (train_index, test_index) in enumerate(skf):, then iterates over the ten train/test splits. At the end, the code uses sklearn.metrics to compute the loss; you could simply use classification_report instead, but here I compute precision, recall, and f1-score under the micro, macro, and weighted averaging schemes.
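To see the stratification at work, here is a minimal self-contained sketch with made-up labels, assuming the same pre-0.18 scikit-learn API as the listing below (newer releases moved StratifiedKFold to sklearn.model_selection with a different calling convention):

import numpy as np
from sklearn.cross_validation import StratifiedKFold

y = np.array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2])
skf = StratifiedKFold(y, n_folds=2, shuffle=False, random_state=None)
for i, (train_index, test_index) in enumerate(skf):
    # each test fold holds roughly the same class mix as y itself
    print i, np.bincount(y[test_index])    # expected: 2 of each class per fold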
The function def make_2d_matrix_to_dict_lst(matrix) exists purely to test the code: it converts a dense matrix into a list of dicts.
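A quick usage sketch, relying on the definition in the full listing below (the input matrix is made up):

m = [[1, 0, 2],
     [0, 0, 3]]
print make_2d_matrix_to_dict_lst(m)
# expected: [{0: 1, 2: 2}, {2: 3}] -- zeros are dropped, column indices become keys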
The function def dict_lst_to_coo_sparse_matrix(dict_lst, num_features) converts a list of dicts into a sparse matrix, which saves a great deal of memory, especially for text classification.
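And the reverse direction, again relying on the listing below; dict keys become column indices in the resulting COO matrix:

d_lst = [{0: 1, 2: 2}, {2: 3}]
sm = dict_lst_to_coo_sparse_matrix(d_lst, 3)
print sm.toarray()
# expected:
# [[1 0 2]
#  [0 0 3]]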
In actual use, I took the easy route and chose the one-vs-rest multi-class strategy, with logistic regression as the base algorithm: clf = OneVsRestClassifier(LogisticRegression()).
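As a sanity check on what OneVsRestClassifier actually does, a minimal sketch on made-up dense data; it fits one binary logistic regression per class:

import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

X = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.], [2., 2.], [2., 3.]])
y = np.array([0, 0, 1, 1, 2, 2])
clf = OneVsRestClassifier(LogisticRegression())
clf.fit(X, y)
print len(clf.estimators_)    # 3 classes -> 3 underlying binary classifiers
print clf.predict(X)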
# -*- coding: utf-8 -*-
from sklearn.cross_validation import StratifiedKFold
from sklearn import metrics
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn import datasets
from sklearn.linear_model import LogisticRegression

#transfer a python dict list to scipy COO sparse matrix
#dict_lst: [{a:b},{a:b,c:d}], each dict is the feature set of an instance
#num_features: the total number of features in dataset
def dict_lst_to_coo_sparse_matrix(dict_lst, num_features):
    from scipy.sparse import coo_matrix
    import numpy as np
    n_doc = len(dict_lst)
    #find non-zero elements
    row_vec = []
    col_vec = []
    data_vec = []
    for d_index in range(len(dict_lst)):
        for k in dict_lst[d_index]:
            row_vec.append(d_index)
            col_vec.append(k)
            data_vec.append(dict_lst[d_index][k])
    row_vec = np.array(row_vec)
    col_vec = np.array(col_vec)
    data_vec = np.array(data_vec)
    return coo_matrix((data_vec, (row_vec, col_vec)), shape=(n_doc, num_features))

#transfer a dense 2d matrix to dict lst
def make_2d_matrix_to_dict_lst(matrix):
    lst = []
    for row in matrix:
        d = {}
        for j in range(len(row)):
            if row[j] != 0:
                d[j] = row[j]
        lst.append(d)
    return lst

#base experimental code
def do_cross_validation(dict_feature_list, y_list, num_features, num_fold, clf):
    X = dict_feature_list    #instance set
    y = np.array(y_list)     #label set
    ids = np.arange(len(X))  #instance id set
    id2result = {}
    loss_lst = []
    predicted_lst = []
    #make cross validation set
    skf = StratifiedKFold(y, n_folds=num_fold, shuffle=False, random_state=None)
    for i, (train_index, test_index) in enumerate(skf):
        #split dataset into train and test
        y_train = y[train_index]
        id_train = ids[train_index]
        X_train = []
        for t in train_index:
            X_train.append(X[t])
        y_test = y[test_index]
        id_test = ids[test_index]
        X_test = []
        for t in test_index:
            X_test.append(X[t])
        #make sparse representation
        sparse_X_train = dict_lst_to_coo_sparse_matrix(X_train, num_features)
        sparse_X_test = dict_lst_to_coo_sparse_matrix(X_test, num_features)
        #train a classifier on the training set
        clf.fit(sparse_X_train, y_train)
        #do prediction on the test set
        predicted_labels = clf.predict(sparse_X_test)
        #store results for later comparison
        for index in range(len(id_test)):
            id2result[id_test[index]] = (y_test[index], predicted_labels[index])
        #compute loss
        macro_pr = metrics.precision_score(y_test, predicted_labels, pos_label=None, average='macro')
        macro_re = metrics.recall_score(y_test, predicted_labels, pos_label=None, average='macro')
        macro_f1 = metrics.f1_score(y_test, predicted_labels, pos_label=None, average='macro')
        micro_pr = metrics.precision_score(y_test, predicted_labels, pos_label=None, average='micro')
        micro_re = metrics.recall_score(y_test, predicted_labels, pos_label=None, average='micro')
        micro_f1 = metrics.f1_score(y_test, predicted_labels, pos_label=None, average='micro')
        weighted_pr = metrics.precision_score(y_test, predicted_labels, pos_label=None, average='weighted')
        weighted_re = metrics.recall_score(y_test, predicted_labels, pos_label=None, average='weighted')
        weighted_f1 = metrics.f1_score(y_test, predicted_labels, pos_label=None, average='weighted')
        loss_lst.append((macro_pr, macro_re, macro_f1, micro_pr, micro_re, micro_f1, weighted_pr, weighted_re, weighted_f1))
    return loss_lst, id2result

#load digit recognition dataset
digits = datasets.load_digits()
X = digits.data
y = digits.target
num_features = len(X[0])
#make dict lst features
feature_lst = make_2d_matrix_to_dict_lst(X)
clf = OneVsRestClassifier(LogisticRegression())
loss_lst, id2result = do_cross_validation(feature_lst, y, num_features, 10, clf)
for loss in loss_lst:
    print ['%.3f' % r for r in loss]