k近邻法实际上利用训练数据集对特征向量空间进行划分,并作为其分类的“模型”。
欧氏距离(欧几里得距离):
# -*- coding:utf-8 -*-
import numpy as np
import operator


def createDataset():
    """Build a toy training set: four 2-D feature vectors with labels.

    Returns:
        group: (4, 2) ndarray — four samples, two features each.
        labels: tuple of class labels, one per row of ``group``.
    """
    # Four samples, two features each.
    group = np.array([[5, 115], [7, 106], [56, 11], [66, 9]])
    # Class label for each sample, in row order.
    labels = ('动做片', '动做片', '爱情片', '爱情片')
    return group, labels


def classify(intX, dataSet, labels, k):
    """k-nearest-neighbor (KNN) classifier.

    Args:
        intX: feature vector of the sample to classify.
        dataSet: (n, m) ndarray of training feature vectors;
            ``shape[0]`` is the number of rows (samples).
        labels: sequence of n class labels matching the rows of ``dataSet``.
        k: number of nearest neighbors that vote.

    Returns:
        The majority label among the k nearest training samples.
    """
    # Euclidean distance from intX to every training row: subtract,
    # square, sum per row, then square-root.  Broadcasting subtracts
    # intX from each row directly, replacing the original
    # np.tile(intX, (n, 1)) copy with an equivalent, copy-free step.
    diffMat = np.asarray(intX) - dataSet
    sqdifMax = diffMat ** 2
    seqDistances = sqdifMax.sum(axis=1)
    distances = seqDistances ** 0.5
    print("distances:", distances)
    # Indices that would sort the distances in ascending order.
    sortDistance = distances.argsort()
    print("sortDistance:", sortDistance)
    # Tally the class labels of the k nearest neighbors.
    # dict.get(key, 0) returns 0 the first time a label is seen.
    classCount = {}
    for i in range(k):
        voteLabel = labels[sortDistance[i]]
        s = "第{}个voteLabel={}".format(i, voteLabel)
        print(s)
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1
    # Sort (label, count) pairs by count, descending
    # (itemgetter(1) keys on the count; itemgetter(0) would key on
    # the label).  Ties keep first-seen order: Python's sort is stable.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    # e.g. sortedClassCount = [('动做片', 2), ('爱情片', 1)]
    print("sortedClassCount:")
    print(sortedClassCount)
    return sortedClassCount[0][0]


if __name__ == '__main__':
    group, labels = createDataset()
    test = [20, 101]
    test_class = classify(test, group, labels, 3)
    print(test_class)