import math

import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


# Tokenize with jieba and rejoin with spaces so sklearn's
# vectorizers can split the text on whitespace.
def jieba_function(sent):
    return ' '.join(jieba.cut(sent))


# Cosine similarity between two equal-length count/TF-IDF vectors.
def count_cos_similarity(vec_1, vec_2):
    if len(vec_1) != len(vec_2):
        return 0.0
    num = sum(vec_1[i] * vec_2[i] for i in range(len(vec_2)))
    den1 = math.sqrt(sum(pow(number, 2) for number in vec_1))
    den2 = math.sqrt(sum(pow(number, 2) for number in vec_2))
    # Guard against an all-zero vector; the original isnan() check never
    # fired, because 0 / 0.0 raises ZeroDivisionError rather than return NaN.
    if den1 == 0 or den2 == 0:
        return 0.0
    return num / (den1 * den2)


# Vectorize the two input strings with raw term counts and print
# the cosine similarity between them.
def tf(sent1, sent2):
    sent1 = jieba_function(sent1)
    sent2 = jieba_function(sent2)
    count_vec = CountVectorizer()
    sentences = [sent1, sent2]
    # Fit once instead of three times, as the original did.
    vectors = count_vec.fit_transform(sentences).toarray()
    print('sentences', sentences)
    print('vector', vectors)  # the count-vectorized representation
    # The vocabulary, i.e. what each vector dimension means
    # (get_feature_names() was removed in scikit-learn 1.2).
    print('cut_word', count_vec.get_feature_names_out())
    # Both rows share the same vocabulary, so the vectors have equal length.
    vec_1, vec_2 = vectors[0], vectors[1]
    similarity = count_cos_similarity(vec_1, vec_2)
    print('count_cos_similarity', similarity)


# Same comparison, but with TF-IDF weights instead of raw counts.
def tfidf(sent1, sent2):
    sent1 = jieba_function(sent1)
    sent2 = jieba_function(sent2)
    tfidf_vec = TfidfVectorizer()
    vectors = tfidf_vec.fit_transform([sent1, sent2]).toarray()
    return count_cos_similarity(vectors[0], vectors[1])


if __name__ == '__main__':
    sent1 = '我喜欢看电视也喜欢看电影,'
    sent2 = '我不喜欢看电视也不喜欢看电影'
    print('<<<<tf<<<<<<<')
    tf(sent1, sent2)
    print('<<<<tfidf<<<<<<<')
    # The original discarded tfidf()'s return value; print it instead.
    print('tfidf_cos_similarity', tfidf(sent1, sent2))
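
# --- Optional alternative (a sketch, not part of the original script) ---
# scikit-learn also ships a vectorized cosine_similarity that operates
# directly on the sparse matrix returned by fit_transform, making the manual
# dot product above unnecessary. The helper name `sklearn_cosine` below is
# introduced here purely for illustration.
from sklearn.metrics.pairwise import cosine_similarity


def sklearn_cosine(sent1, sent2):
    # Fit TF-IDF on the two tokenized sentences in one pass.
    matrix = TfidfVectorizer().fit_transform(
        [jieba_function(sent1), jieba_function(sent2)])
    # cosine_similarity(X) returns a 2x2 matrix; entry [0, 1] compares the pair.
    return cosine_similarity(matrix)[0, 1]

# Example usage: print('sklearn_cosine', sklearn_cosine(sent1, sent2))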