虽然可以把 text 中的数据全部量化,但仍然需要把 text 转成 numpy 数组的形式(这一点必须掌握)。
在将数据输入到分类器之前,必须将待处理数据的格式改变为分类器能够接受的格式。
数据规范化、数据归一化、数据算法化、输出偏差分析
代码:
# -*- coding:utf-8 -*-
from numpy import *


def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line must hold at least three numeric feature columns
    followed (as the last column) by an integer class label.

    Args:
        filename: Path of the tab-separated text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: ``returnMat`` is an
        ``(n, 3)`` float array built from the first three columns of each
        line, and ``classLabelVector`` is the list of ``n`` integer labels
        taken from the last column.

    Raises:
        ValueError: If a feature or label field cannot be parsed as a number,
            or a line has fewer than three feature fields.
    """
    # Read the file once and close it deterministically; blank lines (for
    # example a trailing newline at the end of the file) are skipped.
    with open(filename) as fr:
        rows = [line.strip() for line in fr]
    rows = [row for row in rows if row]

    returnMat = zeros((len(rows), 3))  # one row of three features per record
    classLabelVector = []
    for index, row in enumerate(rows):
        listFromLine = row.split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector


# Everything is numeric: the like/dislike levels are encoded as 1, 2, 3.
datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')

import matplotlib
import matplotlib.pyplot as plt

# matplotlib is the best-known Python plotting library. It offers a
# MATLAB-like command API that suits interactive plotting, and it can also be
# embedded in GUI applications as a plotting widget.
fig = plt.figure()
ax = fig.add_subplot(111)
# Marker size and marker colour both scale with the class label, so the
# three classes are visually distinct.
ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2],
           15.0 * array(datingLabels), 15.0 * array(datingLabels))
plt.show()
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` with min-max normalisation.

    Each value becomes ``(value - column_min) / (column_max - column_min)``.

    Args:
        dataSet: 2-D numpy array; minima and maxima are taken along axis 0,
            i.e. per column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the normalised array, the
        per-column ``max - min`` and the per-column minimum.

    Note:
        A column whose maximum equals its minimum has a range of zero, so the
        division for that column is a division by zero.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Broadcasting applies the per-column vectors to every row, which is what
    # tiling them to the full matrix height would do.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals


normMat, ranges, minVals = autoNorm((datingDataMat))

import operator


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Args:
        inX: Feature vector to classify.
        dataSet: 2-D numpy array of training samples, one per row.
        labels: Sequence of class labels, ``labels[i]`` belonging to row ``i``
            of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label with the most votes among the ``k`` closest rows (Euclidean
        distance).

    Raises:
        IndexError: If ``k`` is larger than the number of rows in ``dataSet``
            or no vote was cast (``k == 0``).
    """
    # Euclidean distance from inX to every training row (broadcast over rows).
    diffMat = inX - dataSet
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    sortedDistIndicies = distances.argsort()

    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # dict.items() exists on both Python 2 and 3 (iteritems() is Python 2 only).
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def datingClassTest():
    """Hold out the first 10% of the dating data and print the kNN error rate.

    Loads ``datingTestSet2.txt``, normalises it, classifies each of the first
    ``count`` rows against the remaining rows with ``k = 4`` and prints every
    prediction followed by the overall error rate.
    """
    hoRatio = 0.10
    ErrorCount = 0.0
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    count = int(m * hoRatio)  # must be an integer to be usable as an index
    for i in range(count):
        # normMat[i, :]         -> test sample: the i-th normalised row
        # normMat[count:m, :]   -> training samples (m - count rows)
        # datingLabels[count:m] -> labels of the training samples
        # 4                     -> k, the number of neighbours
        classifierResult = classify0(normMat[i, :], normMat[count:m, :],
                                     datingLabels[count:m], 4)
        # A single pre-formatted string prints the same on Python 2 and 3.
        print("the classifier came back with:%d,the real answer is :%d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            ErrorCount += 1.0
    print("the total error rate is :%f" % (ErrorCount / float(count)))


def classifyPerson():
    """Ask for three feature values on stdin and print the predicted class.

    The answers are normalised with the minimum and range of the training
    data before being classified with ``k = 3``.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    # raw_input() is Python 2 only; fall back to input() on Python 3.
    try:
        readLine = raw_input
    except NameError:
        readLine = input
    # float() fixes the type of each answer.
    percentTats = float(readLine(
        "percentage of time spent playing video games?"))
    ffMiles = float(readLine("frequent flier miles earned per year?"))
    iceCream = float(readLine("liters of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix(("datingTestSet2.txt"))
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # Pack the answers into an array so they can be normalised element-wise.
    inArr = array([ffMiles, percentTats, iceCream])
    classifierResult = classify0((inArr - minVals) / ranges, normMat,
                                 datingLabels, 3)
    # Labels are 1-based, resultList is 0-based.
    print("You will probably like this person: %s"
          % resultList[classifierResult - 1])