set<X1, X2, …, Xn> 为已知类别的数据集,预测点 Xt 的类别:
(1)计算 set 中每个点与 Xt 之间的距离
(2)按距离递增的次序排序
(3)选取距离最小的前 k 个点
(4)确定前 k 个点所在类别的出现频率
(5)返回出现频率最高的类别作为 Xt 的预测结果
from numpy import *
import operator


def createDataSet():
    """Return a tiny hard-coded training set for trying out classify0.

    Returns:
        A tuple ``(group, labels)``: ``group`` is a 4x2 array of sample
        points and ``labels`` is a list with one class name ('A' or 'B')
        per row of ``group``.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


# kNN
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest samples.

    Args:
        inX: the point to classify, one value per feature.
        dataSet: 2-D array-like of known samples, one sample per row.
        labels: sequence of class labels, ``labels[i]`` belongs to row i
            of ``dataSet``.
        k: number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the k nearest samples.

    Raises:
        IndexError: if ``k`` is less than 1 or larger than the number of
            samples.
    """
    dataSet = asarray(dataSet)
    # Broadcasting subtracts inX from every row of dataSet.
    diffMat = asarray(inX) - dataSet
    sqDiffMat = diffMat ** 2
    sqDistances = sqDiffMat.sum(axis=1)  # axis=1 sums along each row
    distance = sqDistances ** 0.5  # Euclidean distance to every sample
    # argsort returns the row indices ordered by increasing distance.
    sortedDisInd = distance.argsort()
    classCount = {}
    for i in range(k):
        votelabel = labels[sortedDisInd[i]]
        # dict.get returns the default (0) when the label was not seen yet.
        classCount[votelabel] = classCount.get(votelabel, 0) + 1
    # items() exists on both Python 2 and 3 (iteritems() is Python 2 only).
    # itemgetter(1) sorts the (label, count) pairs by count; several fields
    # could be given, e.g. itemgetter(1, 2), for a multi-level sort.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sortedClassCount[0][0]
下载地址:http://pan.baidu.com/s/1c0NeKCg
数据格式(每行一个样本,字段之间以制表符 '\t' 分隔):[frequent flier miles earned per year]'\t'[percentage of time spent playing video games]'\t'[liters of ice cream consumed per year]'\t'[类别标签:1 = not at all,2 = in small doses,3 = in large doses]
# Load data
def file2matrix(filename):
    """Parse a tab-separated sample file into a feature matrix and labels.

    Every non-blank line must hold at least three numeric feature columns
    followed by an integer class label in the last column.

    Args:
        filename: path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: ``returnMat`` is an
        (n, 3) float array with the first three columns of each line and
        ``classLabelVector`` is a list of the n integer labels.

    Raises:
        ValueError: if a feature or label cannot be converted to a number.
        IndexError: if a line has fewer than three columns.
    """
    # The with-block guarantees the file is closed even if parsing fails.
    with open(filename) as fr:
        rows = []
        for line in fr:
            line = line.strip()
            if line:  # skip blank lines, e.g. a trailing newline
                rows.append(line.split('\t'))

    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, listFormLine in enumerate(rows):
        for x in range(0, 3):
            returnMat[index, x] = float(listFormLine[x])
        classLabelVector.append(int(listFormLine[-1]))  # -1 is the last column
    return returnMat, classLabelVector
import matplotlib
import matplotlib.pyplot as plt
from numpy import array

import kNN

# Scatter-plot the second and third feature columns of the dating data,
# with the marker size scaled by the class label of each sample.
datingDataMat, datingLabels = kNN.file2matrix('datingTestSet.txt')
fig = plt.figure()  # figure() creates a drawing object
# 111 means a 1x1 grid, first cell. An argument of 349 would split the
# canvas into 3 rows and 4 columns and draw in the 9th cell, counting
# left to right, top to bottom.
ax = fig.add_subplot(111)

# scatter(x, y, s, ...): x and y are the point coordinates and s is the
# marker size. marker=(5, 1) asks for a 5-sided marker in style 1 (star);
# style 0 is a regular polygon and style 2 an asterisk. alpha is the
# transparency of the markers.
ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2],
           15.0 * array(datingLabels), marker=(5, 1), alpha=0.5)
plt.show()
由于各个特征值的取值范围不同,它们对计算结果的影响程度也不同。因此需要对特征值进行归一化,把每个特征值的取值范围统一到 [0,1]:
range = MaxVal - MinVal
normVal = (rawVal - MinVal) / (MaxVal - MinVal)
# Normalize feature values
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` to the range [0, 1].

    Each value is mapped to ``(value - column_min) / (column_max -
    column_min)``. A column whose values are all equal has a range of 0;
    it is mapped to all zeros instead of dividing by zero.

    Args:
        dataSet: 2-D array-like of samples, one sample per row.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the normalized float
        array, the per-column ``max - min`` and the per-column minimum.
        ``ranges`` is returned as computed, so it contains 0 for constant
        columns.
    """
    # Work on floats so integer input is not truncated by the division.
    dataSet = asarray(dataSet, dtype=float)
    minVals = dataSet.min(0)  # axis 0: minimum of every column
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they normalize to 0 rather than NaN.
    safeRanges = where(ranges == 0, 1.0, ranges)
    # Broadcasting applies the per-column values to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
用 10% 的数据作为测试输入,其余 90% 作为已知类别的样本集合
def datingClassTest(hoRatio=0.10, filename='datingTestSet.txt', k=3):
    """Measure the kNN error rate on a hold-out slice of the dating data.

    The first ``hoRatio`` fraction of the normalized samples is classified
    against the remaining samples, and every prediction is printed next to
    the real label.

    Args:
        hoRatio: fraction of the samples (taken from the start of the file)
            used as test vectors.
        filename: path of the tab-separated data file to load.
        k: number of neighbours passed to classify0.

    Returns:
        The error rate, i.e. misclassified test vectors divided by the
        number of test vectors (0.0 when there are no test vectors).
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows [numTestVecs, m) are the known set; row i is the query.
        classifierResult = classify0(normMat[i, :],
                                     normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m],
                                     k)
        # A single parenthesised argument prints the same on Python 2 and 3.
        print("back %d ,real %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    # Avoid dividing by zero when the hold-out slice is empty.
    errorRate = errorCount / float(numTestVecs) if numTestVecs else 0.0
    print("range is %f" % errorRate)
    return errorRate
# Dating-site test function
def classifyPerson():
    """Ask for one person's three feature values and print the prediction.

    Reads the three values from standard input, normalizes them with the
    minimum and range of the data in 'datingTestSet.txt' and classifies
    them with classify0 using k=3.

    Raises:
        ValueError: if an entered value cannot be converted to float.
    """
    # raw_input only exists on Python 2; Python 3 renamed it to input.
    try:
        readLine = raw_input
    except NameError:
        readLine = input

    resultList = ['not at all', 'in small doses', 'in large dose']
    percentTats = float(readLine("per of time spent playing video games?"))
    ffMiles = float(readLine("fre flier miles earned per year?"))
    iceCream = float(readLine("liters of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix('datingTestSet.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # Same column order as the data file: miles, game time, ice cream.
    inArr = array([ffMiles, percentTats, iceCream])
    classifierResult = classify0((inArr - minVals) / ranges,
                                 normMat, datingLabels, 3)
    # Labels are 1-based, resultList is 0-based.
    print("You will probably like this person : %s"
          % resultList[classifierResult - 1])