基于用户的协同过滤算法
基于用户的协同过滤算法是推荐系统中最古老的算法,能够说这个算法的诞生标志了推荐系统的诞生。该算法在1992年被提出,并应用于邮件过滤系统,1994年被GroupLens用于新闻过滤。
在一个在线个性化推荐系统中,当一个用户A须要个性化推荐时,能够先找到和他有类似兴趣的其余用户,而后把那些用户喜欢的而用户A没有接触过的物品推荐给A。这种方法称为基于用户的协同过滤算法。
给定用户u和用户v,令N(u)表示用户u曾经有过正反馈的物品集合,经过余弦类似度计算用户的类似度:

$$w_{uv} = \frac{|N(u) \cap N(v)|}{\sqrt{|N(u)|\,|N(v)|}}$$

因为不少用户相互之间并无对一样的物品产生过行为,即 $N(u) \cap N(v) = \emptyset$,所以能够先计算 $|N(u) \cap N(v)| \neq 0$ 的用户对(u,v)。为此,能够首先创建物品到用户的倒查表,对于每一个物品保存对该物品产生过行为的用户列表。令稀疏矩阵 $C[u][v] = |N(u) \cap N(v)|$,假设用户u和用户v同时属于倒查表中K个物品对应的用户列表,就有 $C[u][v] = K$(即用户u和v对相同物品产生正反馈的物品数),从而能够扫描倒查表中每一个物品对应的用户列表,将用户列表中的两两用户对应的 $C[u][v]$ 加1,最终就能够得到全部用户之间不为0的 $C[u][v]$(也就是余弦类似度的分子)。
获得用户之间的兴趣类似度以后,基于用户的协同过滤算法(User Based Collaborative Filtering)会给用户推荐和他兴趣最类似的K个用户喜欢的物品。以下公式度量了UserCF算法中用户u对物品i的感兴趣程度:

$$p(u,i) = \sum_{v \in S(u,K) \cap N(i)} w_{uv} r_{vi}$$

其中,S(u,K)包含和用户u兴趣类似度最接近的K个用户集合,N(i)是对物品i有过行为的用户集合,$w_{uv}$ 是用户u和用户v的兴趣类似度,$r_{vi}$ 表明用户v对物品i的兴趣,由于使用的是单一行为的隐反馈数据,所以 $r_{vi}$ 为1。
根据以上思路,使用Python实现UserCF算法的代码以下:
import math
import random
class UserBasedCF:
    """User-based collaborative filtering (UserCF) recommender.

    Loads (user, item, rating) triples from ``datafile``, splits them into
    train/test dictionaries keyed by user, computes user-user cosine
    similarity on the implicit feedback, and recommends items liked by the
    most similar users.
    """

    def __init__(self, datafile=None):
        self.datafile = datafile
        self.readData()
        self.splitData(3, 47)

    def readData(self, datafile=None):
        """Read whitespace-separated ``user item rating timestamp`` lines.

        Fills ``self.data`` with (userid, itemid, int(rating)) tuples.
        """
        self.datafile = datafile or self.datafile
        self.data = []
        # bug fix: use a context manager so the file handle is always
        # closed (the original left it open)
        with open(self.datafile) as f:
            for line in f:
                userid, itemid, record, _ = line.split()
                self.data.append((userid, itemid, int(record)))

    def splitData(self, k, seed, data=None, M=8):
        """Split the data set into test/train with ratio 1:(M-1).

        A record goes to the test set when its pseudo-random bucket in
        [0, M-1] equals ``k``; ``seed`` makes the split reproducible.
        Both ``self.testdata`` and ``self.traindata`` map
        user -> {item: record}.
        """
        self.testdata = {}
        self.traindata = {}
        data = data or self.data
        random.seed(seed)
        # bug fix: iterate the ``data`` argument (the original ignored it
        # and always read self.data)
        for user, item, record in data:
            # bug fix: randint(0, M) yields M+1 buckets, giving a 1:M
            # split; randint(0, M-1) matches the documented 1:(M-1) ratio
            if random.randint(0, M - 1) == k:
                self.testdata.setdefault(user, {})[item] = record
            else:
                self.traindata.setdefault(user, {})[item] = record

    def userSimilarityBest(self, train=None):
        """Compute cosine similarity for every pair of co-rating users.

        Builds an item -> users inverted table first, so only user pairs
        that share at least one item are ever touched.  The result is
        stored in ``self.userSimBest`` as {u: {v: w_uv}} with
        w_uv = |N(u) & N(v)| / sqrt(|N(u)| * |N(v)|).
        """
        train = train or self.traindata
        self.userSimBest = dict()
        # inverted table: item -> set of users who rated it
        item_users = dict()
        for u, items in train.items():
            for i in items.keys():
                item_users.setdefault(i, set())
                item_users[i].add(u)
        user_item_count = dict()  # |N(u)|
        count = dict()            # co-rated counts |N(u) & N(v)|
        for item, users in item_users.items():
            for u in users:
                user_item_count.setdefault(u, 0)
                user_item_count[u] += 1
                for v in users:
                    if u == v:
                        continue
                    count.setdefault(u, {})
                    count[u].setdefault(v, 0)
                    count[u][v] += 1
        for u, related_users in count.items():
            self.userSimBest.setdefault(u, dict())
            for v, cuv in related_users.items():
                self.userSimBest[u][v] = cuv / math.sqrt(
                    user_item_count[u] * user_item_count[v] * 1.0)

    def recommend(self, user, train=None, k=8, nitem=40):
        """Return the top ``nitem`` items for ``user``.

        Scores items liked by the ``k`` most similar users; items the
        user already interacted with are skipped.
        """
        train = train or self.traindata
        rank = dict()
        interacted_items = train.get(user, {})
        # robustness fix: a user absent from the similarity table now gets
        # an empty recommendation instead of raising KeyError
        sims = self.userSimBest.get(user, {})
        for v, wuv in sorted(sims.items(), key=lambda x: x[1],
                             reverse=True)[0:k]:
            for i, rvi in train[v].items():
                if i in interacted_items:
                    continue
                rank.setdefault(i, 0)
                # implicit feedback: r_vi is 1, so only w_uv accumulates
                rank[i] += wuv
        return dict(sorted(rank.items(), key=lambda x: x[1],
                           reverse=True)[0:nitem])

    def recallAndPrecision(self, train=None, test=None, k=8, nitem=10):
        """Return (recall, precision) of top-``nitem`` recommendations
        measured against the test set."""
        train = train or self.traindata
        test = test or self.testdata
        hit = 0
        recall = 0
        precision = 0
        for user in train.keys():
            tu = test.get(user, {})
            rank = self.recommend(user, train=train, k=k, nitem=nitem)
            for item, _ in rank.items():
                if item in tu:
                    hit += 1
            recall += len(tu)
            precision += nitem
        # robustness fix: avoid ZeroDivisionError on an empty evaluation
        recall_rate = hit / (recall * 1.0) if recall else 0.0
        precision_rate = hit / (precision * 1.0) if precision else 0.0
        return (recall_rate, precision_rate)

    def coverage(self, train=None, test=None, k=8, nitem=10):
        """Fraction of all train items that appear in some recommendation."""
        train = train or self.traindata
        test = test or self.testdata
        recommend_items = set()
        all_items = set()
        for user in train.keys():
            for item in train[user].keys():
                all_items.add(item)
            rank = self.recommend(user, train, k=k, nitem=nitem)
            for item, _ in rank.items():
                recommend_items.add(item)
        # robustness fix: empty training data yields 0 instead of a crash
        if not all_items:
            return 0.0
        return len(recommend_items) / (len(all_items) * 1.0)

    def popularity(self, train=None, test=None, k=8, nitem=10):
        """Average log-popularity of recommended items (novelty measure:
        lower means more novel recommendations)."""
        train = train or self.traindata
        test = test or self.testdata
        item_popularity = dict()
        for user, items in train.items():
            for item in items.keys():
                item_popularity.setdefault(item, 0)
                item_popularity[item] += 1
        ret = 0
        n = 0
        for user in train.keys():
            rank = self.recommend(user, train, k=k, nitem=nitem)
            for item, _ in rank.items():
                # log damping so blockbuster items do not dominate the mean
                ret += math.log(1 + item_popularity[item])
                n += 1
        # robustness fix: no recommendations -> 0.0, not ZeroDivisionError
        return ret / (n * 1.0) if n else 0.0
def testUserBasedCF():
    """Evaluate UserCF on the MovieLens ``u.data`` file for several K
    and print recall/precision/coverage/popularity for each."""
    cf = UserBasedCF('u.data')
    cf.userSimilarityBest()
    # fix: print as a function call (valid on Python 2.6+ and required on 3)
    print("%3s%20s%20s%20s%20s" % ('K', "recall", 'precision', 'coverage', 'popularity'))
    for k in [5, 10, 20, 40, 80, 160]:
        recall, precision = cf.recallAndPrecision(k=k)
        coverage = cf.coverage(k=k)
        popularity = cf.popularity(k=k)
        print("%3d%19.3f%%%19.3f%%%19.3f%%%20.3f" % (
            k, recall * 100, precision * 100, coverage * 100, popularity))
if __name__ == "__main__":
    # entry point: run the UserCF evaluation when executed as a script
    testUserBasedCF()
基于物品的协同过滤算法(Item-Based Collaborative Filtering)是目前业界应用最多的算法,亚马逊、Netflix、Hulu、YouTube都采用该算法作为其基础推荐算法。
基于用户的协同过滤算法有一些缺点:随着网站的用户数目愈来愈大,计算用户兴趣类似度矩阵将愈来愈困难,其运算时间复杂度和空间复杂度的增加和用户数的增加近似平方关系。而且,基于用户的协同过滤算法很难对推荐结果作出解释。所以亚马逊提出了基于物品的协同过滤算法。
基于物品的协同过滤算法给用户推荐那些和他们以前喜欢的物品类似的物品。不过ItemCF算法并不利用物品的内容属性计算物品之间的类似度,它主要经过分析用户的行为记录计算物品之间的类似度,也就是说物品A和物品B具备很大的类似度是由于喜欢物品A的用户大都也喜欢物品B(这一点也是基于物品的协同过滤算法和基于内容的推荐算法最主要的区别)。同时,基于物品的协同过滤算法能够利用用户的历史行为给推荐结果提供推荐解释,用于解释的物品都是用户以前喜欢的或者购买的物品。
“Customers Who Bought This Item Also Bought”(亚马逊显示相关物品推荐时的标题),从这句话的定义出发,利用如下公式定义物品之间的类似度:

$$w_{ij} = \frac{|N(i) \cap N(j)|}{\sqrt{|N(i)|\,|N(j)|}}$$

其中N(i)是喜欢物品i的用户数,分子是同时喜欢物品i和物品j的用户数。分母中的 $\sqrt{|N(j)|}$ 惩罚了热门物品j的权重,减轻了热门物品会和不少物品类似的可能性(不然 $w_{ij}$ 的值会很大,接近于1)。这个公式说明两个物品产生类似度是由于它们共同被不少用户喜欢,也就是说每一个用户均可以经过他们的历史兴趣列表给物品“贡献”类似度。
和UserCF算法相似,用ItemCF算法计算物品类似度时也能够首先创建用户-物品倒排表,即对每一个用户创建一个包含他喜欢的物品的列表,而后对于每一个用户,将他物品列表中的物品两两在共现矩阵C中加1,最终就能够获得全部物品之间不为0的 $C[i][j]$,也就是公式中的分子。
在获得物品之间的类似度后,ItemCF经过以下公式计算用户u对一个物品i的兴趣:

$$p(u,i) = \sum_{j \in S(i,K) \cap N(u)} w_{ji} r_{uj}$$

其中,N(u)是用户喜欢的物品集合,S(i,K)是和物品i最类似的K个物品的集合,$w_{ji}$ 是物品j和物品i的类似度,$r_{uj}$ 是用户u对物品j的兴趣,对于隐反馈数据集,若是用户u对物品j有过行为,便可令 $r_{uj}$ 为1。
根据以上思路,使用Python实现ItemCF算法的代码以下:
import math
import random


class ItemBasedCF:
    """Item-based collaborative filtering (ItemCF) recommender.

    Loads (user, item, rating) triples from ``datafile``, splits them into
    train/test dictionaries keyed by user, computes item-item cosine
    similarity on the implicit feedback, and recommends items similar to
    those the user already likes.
    """

    def __init__(self, datafile=None):
        self.datafile = datafile
        self.readData()
        self.splitData(3, 47)

    def readData(self, datafile=None):
        """Read at most 100,000 ``user item rating timestamp`` lines
        into ``self.data`` as (userid, itemid, int(rating)) tuples."""
        self.datafile = datafile or self.datafile
        self.data = []
        # bug fix: the original shadowed the ``file`` builtin and never
        # closed the handle; a context manager does both correctly
        with open(self.datafile, 'r') as f:
            # cap the input size to keep the O(n^2)-ish similarity
            # computation tractable
            for line in f.readlines()[0:100 * 1000]:
                userid, itemid, record, _ = line.split()
                self.data.append((userid, itemid, int(record)))

    def splitData(self, k, seed, data=None, M=8):
        """Split the data set into test/train with ratio 1:(M-1),
        keyed by user.

        Bug fixes versus the original:
        - both dicts were keyed by ITEM (``testdata[item][user]``) while
          every consumer (ItemSimilarity, recommend, the evaluation
          methods) treats them as user -> {item: record};
        - the bucket count was hard-coded as randint(0, 7), ignoring the
          ``M`` parameter (randint(0, M-1) is equivalent for M=8);
        - the ``data`` argument was ignored in favour of self.data.
        """
        self.testdata = {}
        self.traindata = {}
        data = data or self.data
        random.seed(seed)
        for user, item, record in data:
            if random.randint(0, M - 1) == k:
                self.testdata.setdefault(user, {})[item] = record
            else:
                self.traindata.setdefault(user, {})[item] = record

    def ItemSimilarity(self, train=None):
        """Compute item-item cosine similarity on implicit feedback.

        w_ij = |N(i) & N(j)| / sqrt(|N(i)| * |N(j)|), where N(i) is the
        set of users who rated item i.  Stored in ``self.itemSim`` as
        {i: {j: w_ij}}.
        """
        train = train or self.traindata
        self.itemSim = dict()
        item_user_count = dict()  # |N(i)|: number of users who like item i
        count = dict()            # co-occurrence counts |N(i) & N(j)|
        for user, items in train.items():
            for i in items.keys():
                item_user_count.setdefault(i, 0)
                item_user_count[i] += 1
                for j in items.keys():
                    if i == j:
                        continue
                    count.setdefault(i, {})
                    count[i].setdefault(j, 0)
                    count[i][j] += 1
        for i, related_items in count.items():
            self.itemSim.setdefault(i, dict())
            for j, cij in related_items.items():
                self.itemSim[i][j] = cij / math.sqrt(
                    item_user_count[i] * item_user_count[j] * 1.0)

    def recommend(self, user, train=None, k=8, nitem=40):
        """Return the top ``nitem`` items for ``user``, scoring the ``k``
        most similar items to each item the user already likes.
        Already-liked items are skipped."""
        train = train or self.traindata
        rank = dict()
        ru = train.get(user, {})
        for i, pi in ru.items():
            # robustness fix: an item with no similarity entry (e.g. seen
            # only in the test split) is skipped instead of raising KeyError
            for j, wj in sorted(self.itemSim.get(i, {}).items(),
                                key=lambda x: x[1], reverse=True)[0:k]:
                if j in ru:
                    continue
                rank.setdefault(j, 0)
                # implicit feedback: r_ui is 1, so only similarity adds up
                rank[j] += wj
        return dict(sorted(rank.items(), key=lambda x: x[1],
                           reverse=True)[0:nitem])

    def recallAndPrecision(self, train=None, test=None, k=8, nitem=10):
        """Return (recall, precision) of top-``nitem`` recommendations
        measured against the test set."""
        train = train or self.traindata
        test = test or self.testdata
        hit = 0
        recall = 0
        precision = 0
        for user in train.keys():
            tu = test.get(user, {})
            rank = self.recommend(user, train=train, k=k, nitem=nitem)
            for item, _ in rank.items():
                if item in tu:
                    hit += 1
            recall += len(tu)
            precision += nitem
        # robustness fix: avoid ZeroDivisionError on an empty evaluation
        recall_rate = hit / (recall * 1.0) if recall else 0.0
        precision_rate = hit / (precision * 1.0) if precision else 0.0
        return (recall_rate, precision_rate)

    def coverage(self, train=None, test=None, k=8, nitem=10):
        """Fraction of all train items that appear in some recommendation."""
        train = train or self.traindata
        test = test or self.testdata
        recommend_items = set()
        all_items = set()
        for user in train.keys():
            for item in train[user].keys():
                all_items.add(item)
            rank = self.recommend(user, train, k=k, nitem=nitem)
            for item, _ in rank.items():
                recommend_items.add(item)
        # robustness fix: empty training data yields 0 instead of a crash
        if not all_items:
            return 0.0
        return len(recommend_items) / (len(all_items) * 1.0)

    def popularity(self, train=None, test=None, k=8, nitem=10):
        """Average log-popularity of recommended items (novelty measure:
        lower means more novel recommendations)."""
        train = train or self.traindata
        test = test or self.testdata
        item_popularity = dict()
        for user, items in train.items():
            for item in items.keys():
                item_popularity.setdefault(item, 0)
                item_popularity[item] += 1
        ret = 0
        n = 0
        for user in train.keys():
            rank = self.recommend(user, train, k=k, nitem=nitem)
            for item, _ in rank.items():
                # log damping so blockbuster items do not dominate the mean
                ret += math.log(1 + item_popularity[item])
                n += 1
        # robustness fix: no recommendations -> 0.0, not ZeroDivisionError
        return ret / (n * 1.0) if n else 0.0
def testRecommend():
    """Recommend k=3 items for user 345 and print each recommendation
    score next to the user's test-set rating (0 if unrated)."""
    ubcf = ItemBasedCF('u.data')
    ubcf.readData()
    ubcf.splitData(4, 100)
    ubcf.ItemSimilarity()
    user = "345"
    rank = ubcf.recommend(user, k=3)
    for i, rvi in rank.items():
        items = ubcf.testdata.get(user, {})
        record = items.get(i, 0)
        # fix: print as a function call (valid on Python 2.6+ and 3)
        print("%5s: %.4f--%.4f" % (i, rvi, record))
def testItemBasedCF():
    """Evaluate ItemCF on the MovieLens ``u.data`` file for several K
    and print recall/precision/coverage/popularity for each."""
    cf = ItemBasedCF('u.data')
    cf.ItemSimilarity()
    # fix: print as a function call (valid on Python 2.6+ and required on 3)
    print("%3s%20s%20s%20s%20s" % ('K', "recall", 'precision', 'coverage', 'popularity'))
    for k in [5, 10, 20, 40, 80, 160]:
        recall, precision = cf.recallAndPrecision(k=k)
        coverage = cf.coverage(k=k)
        popularity = cf.popularity(k=k)
        print("%3d%19.3f%%%19.3f%%%19.3f%%%20.3f" % (
            k, recall * 100, precision * 100, coverage * 100, popularity))
if __name__ == "__main__":
    # entry point: run the ItemCF evaluation when executed as a script
    testItemBasedCF()
UserCF给用户推荐那些和他有共同兴趣爱好的用户喜欢的物品,而ItemCF给用户推荐那些和他以前喜欢的物品相似的物品。从这个原理能够看到,UserCF的推荐结果着重于反映和用户兴趣类似的小群体的热点,而ItemCF的推荐结果着重于维系用户的历史兴趣。UserCF的推荐更社会化,反映了用户所在的小型兴趣群体中物品的热门程度,而ItemCF的推荐更加个性化,反映了用户本身的兴趣传承。同时,从技术上来讲,UserCF须要维护一个用户类似度的矩阵,而ItemCF须要维护一个物品类似度矩阵。从存储的角度来讲,若是用户不少,那么维护用户兴趣类似度矩阵须要很大的空间,同理,若是物品不少,维护物品类似度矩阵代价较大。
对于UserCF和ItemCF,咱们采用http://www.grouplens.org/node/73 的数据集进行测试,使用准确率/召回率、覆盖率和流行度对实验结果进行评测。
对用户u推荐N个物品R(u),令用户u在测试集上喜欢的物品集合为T(u),则:
召回率描述有多少比例的用户-物品评分记录包含在最终的推荐列表中,而准确率描述最终的推荐列表中有多少比例是发生过的用户-物品评分记录。
覆盖率表示最终的推荐列表中包含多大比例的物品,若是全部的物品都被推荐给至少一个用户,那么覆盖率就是100%。
最后还须要评测推荐的新颖度,这里用推荐列表中物品的平均流行度度量推荐结果的新颖度,若是推荐出的物品都很热门,说明推荐的新颖度较低,不然说明推荐结果比较新颖。
图1 UserCF实验结果
图2 ItemCF实验结果
对于以上UserCF和ItemCF的实验结果能够看出,推荐系统的精度指标(准确率和召回率)并不和参数k成线性关系。推荐结果的精度对k也不是特别敏感,只要选在必定的区域内,就能够得到不错的精度。
对于覆盖率而言,k越大则推荐结果的覆盖率越低,覆盖率的下降是由于流行度的增长,随着流行度增长,推荐算法愈来愈倾向于推荐热门的物品,这是由于k决定了推荐算法在作推荐时参考多少和你兴趣类似的其余用户的兴趣,若是k越大,参考的人或者物品越多,结果就愈来愈趋*于全局热门的物品。