1. 安装kaggle,获取数据源
pip install kaggle
将数据下载到目录 /data/data-test/digit_recognize/ 下
cd /data/data-test/digit_recognize/
kaggle competitions download -c digit-recognizer
2. 安装anaconda3作为python3环境,自带sklearn、pandas、numpy等常用工具包
3. 代码实现
import pickle

import pandas as pd

# File locations: everything lives under one project directory.
project_path = '/data/data-test/digit_recognize/'
clf_file = project_path + 'knn.pickle'


def get_data_chunk(file_name, chunk_size=100000):
    """Read a CSV file in fixed-size chunks and return one combined DataFrame.

    The original author's note says the file is too large to read in one
    go, so it is consumed ``chunk_size`` rows at a time. The running chunk
    count is printed as a progress indicator.

    Args:
        file_name: Path of the CSV file to read.
        chunk_size: Number of rows to read per chunk.

    Returns:
        A DataFrame holding every row of the file, re-indexed from 0.
    """
    chunks = []
    # Iterating the reader yields successive chunks and stops by itself at
    # end of file, so no manual StopIteration handling is needed.
    for chunk in pd.read_csv(file_name, chunksize=chunk_size):
        chunks.append(chunk)
        print(len(chunks))
    print("Iteration is stopped.")
    return pd.concat(chunks, ignore_index=True)


def save_clf(clf_s, path=None):
    """Pickle the classifier ``clf_s`` to ``path`` (default: ``clf_file``)."""
    target = clf_file if path is None else path
    with open(target, 'wb') as clf_f:
        pickle.dump(clf_s, clf_f)


def get_clf(path=None):
    """Load and return the classifier pickled at ``path`` (default: ``clf_file``).

    NOTE: unpickling executes arbitrary code from the file; only load
    pickle files that this script produced itself.
    """
    source = clf_file if path is None else path
    with open(source, 'rb') as clf_f:
        return pickle.load(clf_f)


def select_best_k(k_scores, default=5):
    """Return the ``k`` with the highest score.

    Args:
        k_scores: Iterable of ``(k, score)`` pairs.
        default: Value returned when no pair has a comparable score
            (empty input, or every score is NaN).

    Returns:
        The ``k`` whose score is strictly greatest; on ties the earliest
        ``k`` in iteration order wins.
    """
    best_k = default
    best_score = float("-inf")
    for k, score in k_scores:
        if score > best_score:
            # Track the best score as well as its k; comparing against a
            # value that is never updated would simply keep the last k.
            best_k, best_score = k, score
    return best_k


def _split_image_ids(test_data):
    """Separate the image-id column (if any) from the feature columns.

    Returns:
        ``(image_ids, features)``. When the frame has no ``imageId``
        column (matched case-insensitively), ids are generated as
        1..len(test_data) in row order.
    """
    id_columns = [c for c in test_data.columns if str(c).lower() == "imageid"]
    if id_columns:
        # The id must not be fed to the classifier as if it were a feature.
        return test_data[id_columns[0]], test_data.drop(columns=id_columns)
    return list(range(1, len(test_data) + 1)), test_data


# Predict labels for the test data set.
def predict():
    """Predict labels for ``test.csv`` and write them to ``res.csv``.

    Loads the pickled classifier, predicts one label per test row and
    writes a two-column CSV (``ImageId``, ``Label``) into ``project_path``.
    ``ImageId`` is the header used by Kaggle's sample submission file.
    """
    knn_clf = get_clf()
    test_data = get_data_chunk(project_path + "test.csv")
    image_ids, features = _split_image_ids(test_data)
    df = pd.DataFrame({
        "ImageId": image_ids,
        "Label": knn_clf.predict(features),
    })
    df.to_csv(project_path + 'res.csv', index=False)


def train():
    """Pick the best ``k`` by cross-validation, fit a KNN model and save it.

    Every ``k`` in 5..14 is scored with 2-fold cross-validated accuracy on
    ``train.csv``; the best-scoring ``k`` is then used to fit a classifier
    on the full training set, which is pickled via ``save_clf``.
    """
    # scikit-learn is imported here so that the data and pickle helpers in
    # this module stay importable on machines without it.
    from sklearn.model_selection import cross_val_score
    from sklearn.neighbors import KNeighborsClassifier

    train_data = get_data_chunk(project_path + "train.csv")
    train_data.info()  # info() prints its own report and returns None
    print(train_data)
    labels = train_data['label']
    x = train_data.drop(columns=['label'])

    k_scores = []
    for k in range(5, 15):
        candidate = KNeighborsClassifier(n_neighbors=k)
        # cv=2: two-fold cross-validation
        scores = cross_val_score(candidate, x, labels, cv=2, scoring='accuracy')
        mean = scores.mean()
        print(k, mean)
        k_scores.append((k, mean))

    max_k = select_best_k(k_scores)
    print("maxK=", max_k)

    # Train the final model with the selected k.
    clf = KNeighborsClassifier(n_neighbors=max_k)
    clf.fit(x, labels)
    # Persist the fitted model to the pickle file.
    save_clf(clf)


if __name__ == '__main__':
    train()
    predict()