To retrieve documents about "布尔检索" (Boolean retrieval) or "几率检索" (probabilistic retrieval) but not "向量检索" (vector retrieval), the corresponding query expression is Q = 检索 and (布尔 or 几率) not 向量. On the index-term vector (检索, 布尔, 几率, 向量), Q can take the values (1,1,0,0), (1,0,1,0), or (1,1,1,0). If a document Dj's vector equals any one of these, the query and the document are considered similar. This similarity is itself Boolean: sim(Q, Dj) can only be 0 or 1.
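As a minimal sketch of this Boolean matching (the function and variable names are illustrative, not from the scripts below), the query can be written directly as a 0/1 function over a document's term-presence vector:

```python
# Index-term order: (检索, 布尔, 几率, 向量)
def sim(doc_vec):
    jiansuo, buer, jilv, xiangliang = doc_vec
    # Q = 检索 and (布尔 or 几率) and not 向量
    return int(jiansuo and (buer or jilv) and not xiangliang)

for v in [(1, 1, 0, 0), (1, 0, 1, 0), (1, 1, 1, 0), (1, 0, 0, 1)]:
    print(v, sim(v))  # the first three match (1), the last does not (0)
```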
Suppose that in a web page of 1,000 words in total, the words "原子能" (atomic energy), "的" ("of"), and "应用" (application) appear 2, 35, and 5 times respectively. Their term frequencies (TF) are then 0.002, 0.035, and 0.005. Adding these three numbers, the sum 0.042 is a simple measure of the relevance between this page and the query "原子能的应用" (applications of atomic energy).
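A one-line check of this arithmetic, using the numbers from the example above:

```python
counts = {"原子能": 2, "的": 35, "应用": 5}
tf = {w: c / 1000 for w, c in counts.items()}  # term frequency = count / page length
print(tf)                # {'原子能': 0.002, '的': 0.035, '应用': 0.005}
print(sum(tf.values()))  # 0.042
```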
The stronger a word's power to predict the topic, the larger its weight should be, and vice versa. If we see the word "原子能" in a web page, we learn at least something about its topic; if we see "应用" once, we still know essentially nothing about the topic. Therefore "原子能" should carry a larger weight than "应用".
Words that should be discarded (stop words) should have a weight of zero.
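This intuition is exactly what IDF captures: with N total documents and df(w) documents containing w, IDF(w) = log(N / df(w)), and the page's score becomes a TF-IDF sum instead of a plain TF sum. A sketch with made-up document frequencies (the N and df numbers below are illustrative assumptions, not measurements):

```python
import math

N = 1_000_000_000                  # assumed total number of indexed pages
df = {"原子能": 2_000_000,          # rare, topic-bearing word
      "应用": 500_000_000,          # common, weakly topical word
      "的": 1_000_000_000}          # appears everywhere -> IDF = log(1) = 0
tf = {"原子能": 0.002, "应用": 0.005, "的": 0.035}

score = sum(tf[w] * math.log(N / df[w]) for w in tf)
print(score)  # "的" contributes nothing; "原子能" dominates the score
```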
```python
def regularization(s):
    """Split a Boolean query string into a token list and a term -> 0/1 dict."""
    ss = s.split(' ')
    expression = []
    target = {}
    for i in ss:
        if i != "and" and i != "or" and i != "not" and i != "(" and i != ")":
            if i[0] == "(":
                expression.append("(")
                expression.append(i[1:])
                target[i[1:]] = 0
            elif i[-1] == ")":
                expression.append(i[:-1])
                expression.append(")")
                target[i[:-1]] = 0
            else:
                expression.append(i)
                target[i] = 0
        else:
            expression.append(i)
    return target, expression


def analysis(line):
    """Extract the word list of one document line from the tagged corpus."""
    output = []
    # strip the trailing newline
    t_line = line.strip('\n')
    # split on spaces into word/tag tokens
    words = t_line.split(' ')
    for word in words[1:]:
        if word == "":
            continue
        # split the POS tag from the word on '/'
        t_word = word.split('/')
        # drop a leading '[' (start of a bracketed compound)
        tf_word = t_word[0].split('[')
        if len(tf_word) == 2:
            f_word = tf_word[1]
        else:
            f_word = t_word[0]
        # keep each word once
        if f_word not in output:
            output.append(f_word)
    # re-scan for [bracketed] compounds and add each as a single word
    big_word1 = t_line.split('[')
    for i in range(1, len(big_word1)):
        big_word2 = big_word1[i].split(']')[0]
        words = big_word2.split(' ')
        big_word = ""
        for word in words:
            t_word = word.split('/')
            big_word = big_word + t_word[0]
        if big_word not in output:
            output.append(big_word)
    return output


def getValue(target, reg):
    """Evaluate the Boolean expression: infix -> reverse Polish notation -> 0/1."""
    # shunting-yard conversion, precedence: not > and > or
    RPN = []
    stack = ["#"]
    for i in reg:
        if i in target.keys():
            RPN.append(target[i])
        elif i == "(":
            stack.append(i)
        elif i == ")":
            while stack[-1] != "(":
                RPN.append(stack.pop())
            stack.pop()
        elif i == "not":
            while stack[-1] == "not":
                RPN.append(stack.pop())
            stack.append(i)
        elif i == "and":
            while stack[-1] == "not" or stack[-1] == "and":
                RPN.append(stack.pop())
            stack.append(i)
        else:  # "or"
            while stack[-1] == "not" or stack[-1] == "and" or stack[-1] == "or":
                RPN.append(stack.pop())
            stack.append(i)
    while len(stack) != 1:
        RPN.append(stack.pop())
    # evaluate the RPN
    ans = []
    for i in RPN:
        if i == 0 or i == 1:
            ans.append(i)
        elif i == "not":
            ans.append(1 ^ ans.pop())
        elif i == "and":
            op1 = ans.pop()
            op2 = ans.pop()
            ans.append(op1 and op2)
        elif i == "or":
            op1 = ans.pop()
            op2 = ans.pop()
            ans.append(op1 or op2)
    return ans[0]


if __name__ == '__main__':
    booltext = input("输入布尔表达式:")
    target, reg = regularization(booltext)
    key_target = target.keys()
    num = 0
    with open('语料库.txt', mode='r', encoding='UTF-8') as f:
        for line in f.readlines():
            if num >= 10:
                break
            # reset all term flags for this document
            for i in key_target:
                target[i] = 0
            if line is not None and line != "\n":
                output = analysis(line)
                for i in key_target:
                    if i in output:
                        target[i] = 1
                # print the first ten matching documents
                if getValue(target, reg):
                    print(line)
                    num = num + 1
```
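A quick hypothetical check of the expression machinery above, continuing with the two functions just defined and without reading the corpus:

```python
target, reg = regularization("布尔 and not 向量")
target["布尔"] = 1   # pretend the document contains 布尔
target["向量"] = 0   # ... and does not contain 向量
print(getValue(target, reg))  # 1 -> the document matches
```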
TF-IDF Model
getWeight.py (computes the weights in advance)
```python
import sys

output = {}
with open('语料库.txt', mode='r', encoding='UTF-8') as f:
    for line in f.readlines():
        if line is not None and line != "\n":
            t_line = line.strip('\n')
            words = t_line.split(' ')
            word_w = []
            for word in words[1:]:
                if word == "":
                    continue
                t_word = word.split('/')
                # drop a leading '[' (start of a bracketed compound)
                tf_word = t_word[0].split('[')
                if len(tf_word) == 2:
                    f_word = tf_word[1]
                else:
                    f_word = t_word[0]
                # count each word at most once per document
                if f_word not in word_w:
                    word_w.append(f_word)
            for f_word in word_w:
                if f_word in output.keys():
                    output[f_word] = output[f_word] + 1
                else:
                    output[f_word] = 1

# write words in ascending document frequency, one "word: count" per line
with open('outputWeight.txt', mode='w', encoding='UTF-8') as f:
    while output:
        minNum = sys.maxsize
        minName = ""
        for key, values in output.items():
            if values < minNum:
                minNum = values
                minName = key
        f.write(minName + ": " + str(minNum) + "\n")
        del output[minName]
```
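The selection-style write loop above is O(n²) in vocabulary size. An equivalent sketch producing the same output format (ascending document frequency) with collections.Counter, assuming the same corpus layout and only roughly approximating the '[' handling:

```python
from collections import Counter

df = Counter()
with open('语料库.txt', encoding='UTF-8') as f:
    for line in f:
        if line.strip():
            # one document per line; a set so each word counts once per document
            tokens = {w.split('/')[0].lstrip('[') for w in line.split()[1:] if w}
            df.update(tokens)

with open('outputWeight.txt', 'w', encoding='UTF-8') as out:
    for word, n in sorted(df.items(), key=lambda kv: kv[1]):
        out.write(f"{word}: {n}\n")
```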
TF-IDF.py
```python
import math


def analysis(line):
    """Extract the word list of one document line from the tagged corpus."""
    output = []
    # strip the trailing newline and split into word/tag tokens
    t_line = line.strip('\n')
    words = t_line.split(' ')
    for word in words[1:]:
        if word == "":
            continue
        # split the POS tag from the word on '/'
        t_word = word.split('/')
        # drop a leading '[' (start of a bracketed compound)
        tf_word = t_word[0].split('[')
        if len(tf_word) == 2:
            f_word = tf_word[1]
        else:
            f_word = t_word[0]
        if f_word not in output:
            output.append(f_word)
    # re-scan for [bracketed] compounds and add each as a single word
    big_word1 = t_line.split('[')
    for i in range(1, len(big_word1)):
        big_word2 = big_word1[i].split(']')[0]
        words = big_word2.split(' ')
        big_word = ""
        for word in words:
            t_word = word.split('/')
            big_word = big_word + t_word[0]
        if big_word not in output:
            output.append(big_word)
    return output


def getW():
    """Load the precomputed document frequencies from outputWeight.txt."""
    word_list = {}
    with open('outputWeight.txt', mode='r', encoding='UTF-8') as f:
        for line in f.readlines():
            if line is not None:
                word = line.split(':')
                word_list[word[0]] = word[1]
    return word_list


def BMM(origin_sentence):
    """Backward maximum matching segmentation against the output.txt dictionary."""
    MAX_WORD = 19
    word_list = []
    with open('output.txt', mode='r', encoding='UTF-8') as f:
        for line in f.readlines():
            if line is not None:
                word = line.split(':')
                word_list.append(word[0])
    ans_word = []
    while len(origin_sentence) != 0:
        len_word = MAX_WORD
        while len_word > 0:
            # take the longest suffix found in the dictionary, then cut it off
            if origin_sentence[-len_word:] in word_list:
                ans_word.append(origin_sentence[-len_word:])
                len_sentence = len(origin_sentence)
                origin_sentence = origin_sentence[0:len_sentence - len_word]
                break
            # not in the dictionary: try a shorter suffix
            else:
                len_word = len_word - 1
        # no dictionary match: keep the single character as a word on its own
        if len_word == 0:
            if origin_sentence[-1:] != ' ':
                ans_word.append(origin_sentence[-1:])
            len_sentence = len(origin_sentence)
            origin_sentence = origin_sentence[0:len_sentence - 1]
    return ans_word


if __name__ == '__main__':
    w = getW()
    sentence = input("输入短语:")
    words = BMM(sentence)
    ans = []
    # count the total number of documents (one document per line)
    count = 0
    for index, line in enumerate(open('语料库.txt', 'r', encoding='UTF-8')):
        count += 1
    with open('语料库.txt', mode='r', encoding='UTF-8') as f:
        for line in f.readlines():
            score = 0
            if line is not None and line != "\n":
                out = analysis(line)
                for word in words:
                    # TF-IDF: tf * log(N / df); skip words missing from the table
                    if word in w:
                        score = score + out.count(word) / len(out) * math.log(count * 1.0 / int(w[word]))
                ans.append((line, score))
    # print the ten highest-scoring documents (or fewer)
    new_ans = sorted(ans, key=lambda a: a[1], reverse=True)
    for i in range(min(10, len(new_ans))):
        print(new_ans[i])
```
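The scoring line in the main loop is the standard TF-IDF sum. Factored out as a standalone function for clarity (the function name and the sample numbers below are hypothetical, not part of the scripts above):

```python
import math

def tfidf_score(doc_tokens, query_words, n_docs, df):
    """score(d) = sum over query words w of tf(w, d) * log(N / df(w))."""
    score = 0.0
    for w in query_words:
        if w in df:  # ignore words missing from the precomputed table
            score += doc_tokens.count(w) / len(doc_tokens) * math.log(n_docs / df[w])
    return score

doc = ["原子能", "的", "应用", "很", "广"]
print(tfidf_score(doc, ["原子能", "应用"], n_docs=100, df={"原子能": 5, "应用": 40}))
```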