一个搜索引擎由搜索器、索引器、检索器和用户接口四个部分组成正则表达式
搜索器就是爬虫(scrawler),爬出的内容送给索引器生成索引(Index)存储在内部数据库。用户经过用户接口发出询问(query),询问解析后送达检索器,检索器高效检索后,将结果返回给用户。算法
如下5个文件为爬取的搜索样本。数据库
# # 1.txt # I have a dream that my four little children will one day live in a nation where they will not be judged by the color of their skin but by the content of their character. I have a dream today. # # 2.txt # I have a dream that one day down in Alabama, with its vicious racists, . . . one day right there in Alabama little black boys and black girls will be able to join hands with little white boys and white girls as sisters and brothers. I have a dream today. # # 3.txt # I have a dream that one day every valley shall be exalted, every hill and mountain shall be made low, the rough places will be made plain, and the crooked places will be made straight, and the glory of the Lord shall be revealed, and all flesh shall see it together. # # 4.txt # This is our hope. . . With this faith we will be able to hew out of the mountain of despair a stone of hope. With this faith we will be able to transform the jangling discords of our nation into a beautiful symphony of brotherhood. With this faith we will be able to work together, to pray together, to struggle together, to go to jail together, to stand up for freedom together, knowing that we will be free one day. . . . # # 5.txt # And when this happens, and when we allow freedom ring, when we let it ring from every village and every hamlet, from every state and every city, we will be able to speed up that day when all of God's children, black men and white men, Jews and Gentiles, Protestants and Catholics, will be able to join hands and sing in the words of the old Negro spiritual: "Free at last! Free at last! Thank God Almighty, we are free at last!"
class SearchEngineBase(object): def __init__(self): pass #将文件做为id,与内容一块儿送到process_corpus def add_corpus(self, file_path): with open(file_path, 'r') as fin: text = fin.read() self.process_corpus(file_path, text) #索引器 将文件路径做为id,将处理的内容做为索引存储 def process_corpus(self, id, text): raise Exception('process_corpus not implemented.') #检索器 def search(self, query): raise Exception('search not implemented.') #多态 def main(search_engine): for file_path in ['1.txt', '2.txt', '3.txt', '4.txt', '5.txt']: search_engine.add_corpus(file_path) while True: query = input() results = search_engine.search(query) print('found {} result(s):'.format(len(results))) for result in results: print(result) class SimpleEngine(SearchEngineBase): def __init__(self): super(SimpleEngine, self).__init__() self.__id_to_texts = dict() def process_corpus(self, id, text): self.__id_to_texts[id] = text def search(self, query): results = [] for id, text in self.__id_to_texts.items(): if query in text: results.append(id) return results search_engine = SimpleEngine() main(search_engine) ########## 输出 ########## # simple # found 0 result(s): # whe # found 2 result(s): # 1.txt # 5.txt
import re class BOWEngine(SearchEngineBase): def __init__(self): super(BOWEngine, self).__init__() self.__id_to_words = dict() def process_corpus(self, id, text): self.__id_to_words[id] = self.parse_text_to_words(text) def search(self, query): query_words = self.parse_text_to_words(query) results = [] for id, words in self.__id_to_words.items(): if self.query_match(query_words, words): results.append(id) return results @staticmethod def query_match(query_words, words): for query_word in query_words: if query_word not in words: return False return True #for query_word in query_words: # return False if query_word not in words else True #result = filter(lambda x:x not in words,query_words) #return False if (len(list(result)) > 0) else True @staticmethod def parse_text_to_words(text): # 使用正则表达式去除标点符号和换行符 text = re.sub(r'[^\w ]', ' ', text) # 转为小写 text = text.lower() # 生成全部单词的列表 word_list = text.split(' ') # 去除空白单词 word_list = filter(None, word_list) # 返回单词的 set return set(word_list) search_engine = BOWEngine() main(search_engine) ########## 输出 ########## # i have a dream # found 3 result(s): # 1.txt # 2.txt # 3.txt # freedom children # found 1 result(s): # 5.txt
缺点:每次search仍是须要遍历全部文档缓存
import re class BOWInvertedIndexEngine(SearchEngineBase): def __init__(self): super(BOWInvertedIndexEngine, self).__init__() self.inverted_index = dict() #生成索引 word -> id def process_corpus(self, id, text): words = self.parse_text_to_words(text) for word in words: if word not in self.inverted_index: self.inverted_index[word] = [] self.inverted_index[word].append(id) #{'little':['1.txt','2.txt'],...} def search(self, query): query_words = list(self.parse_text_to_words(query)) query_words_index = list() for query_word in query_words: query_words_index.append(0) # 若是某一个查询单词的倒序索引为空,咱们就马上返回 for query_word in query_words: if query_word not in self.inverted_index: return [] result = [] while True: # 首先,得到当前状态下全部倒序索引的 index current_ids = [] for idx, query_word in enumerate(query_words): current_index = query_words_index[idx] current_inverted_list = self.inverted_index[query_word] #['1.txt','2.txt'] # 已经遍历到了某一个倒序索引的末尾,结束 search if current_index >= len(current_inverted_list): return result current_ids.append(current_inverted_list[current_index]) # 而后,若是 current_ids 的全部元素都同样,那么代表这个单词在这个元素对应的文档中都出现了 if all(x == current_ids[0] for x in current_ids): result.append(current_ids[0]) query_words_index = [x + 1 for x in query_words_index] continue # 若是不是,咱们就把最小的元素加一 min_val = min(current_ids) min_val_pos = current_ids.index(min_val) query_words_index[min_val_pos] += 1 @staticmethod def parse_text_to_words(text): # 使用正则表达式去除标点符号和换行符 text = re.sub(r'[^\w ]', ' ', text) # 转为小写 text = text.lower() # 生成全部单词的列表 word_list = text.split(' ') # 去除空白单词 word_list = filter(None, word_list) # 返回单词的 set return set(word_list) search_engine = BOWInvertedIndexEngine() main(search_engine) ########## 输出 ########## # little # found 2 result(s): # 1.txt # 2.txt # little vicious # found 1 result(s): # 2.txt
import pylru class LRUCache(object): def __init__(self, size=32): self.cache = pylru.lrucache(size) def has(self, key): return key in self.cache def get(self, key): return self.cache[key] def set(self, key, value): self.cache[key] = value class BOWInvertedIndexEngineWithCache(BOWInvertedIndexEngine, LRUCache): def __init__(self): super(BOWInvertedIndexEngineWithCache, self).__init__() LRUCache.__init__(self) def search(self, query): if self.has(query): print('cache hit!') return self.get(query) result = super(BOWInvertedIndexEngineWithCache, self).search(query) self.set(query, result) return result search_engine = BOWInvertedIndexEngineWithCache() main(search_engine) ########## 输出 ########## # little # found 2 result(s): # 1.txt # 2.txt # little # cache hit! # found 2 result(s): # 1.txt # 2.txt
注意BOWInvertedIndexEngineWithCache继承了两个类。app