搜索引擎–基于Django/Scrapy/ElasticSearch的搜索引擎的实现

主机环境：Ubuntu 13.04
Python版本：2.7.4
Django版本：1.5.4
Scrapy版本：0.18.2
ElasticSearch版本：0.90.5

原创作品，转载请标明：http://blog.geekcome.com/archives/138

闲来无聊，查看了搜索引擎相关的基本知识，通过搜集资料，了解了搜索引擎所需要的基本子系统：爬取子系统、索引服务子系统、Web请求和应答子系统。而后通过学习基本的开源框架文档，集成的项目已经PUSH到GitHub。

首先查看基于开源的Scrapy爬虫框架编写的一个爬虫，爬取校园网的内容（主要是免流量）

 
       01 
       #!/usr/bin/env python 
      
       02 
       #-*- coding:utf-8 -*- 
      
       03 
       #from urlparse import urljoin 
      
       04 
       from scrapy.utils.url import urljoin_rfc 
      
       05 
       from scrapy.spider import BaseSpider 
      
       06 
       from scrapy.selector import HtmlXPathSelector 
      
       07 
       from scrapy.http import Request 
      
       08 
         
       09 
       from scrapy.exceptions import DropItem 
      
       10 
         
       11 
       from mymodules.items import Website 
      
       12 
         
       13 
       import urllib 
      
       14 
       import re 
      
       15 
         
       16 
       class Xidian_Spider(BaseSpider):
           """Spider that crawls the xidian.edu.cn sites.

           For every fetched page ``parse`` yields one ``Website`` item holding the
           page URL, its title and the text found under ``<body>``, followed by one
           ``Request`` (handled by ``parse`` again) for each followable link.
           """

           name = "xidian_spider"
           start_urls = [
               "http://www.xidian.edu.cn",
               #"http://rs.xidian.edu.cn/forum.php",
           ]

           # Link filters, compiled once for the class instead of once per link.
           # Links to static/binary resources (also matches docx, pptx, xlsx, ...).
           _SKIPPED_SUFFIX = re.compile(
               r'.+\.((jpg)|(ico)|(rar)|(zip)|(doc)|(ppt)|(xls)|(css)|(exe)|(pdf))x?$')
           # Pseudo links that can never be fetched.
           _SKIPPED_PREFIX = re.compile(r'^((javascript:)|(openapi)).+')

           def __init__(self, *args, **kwargs):
               """Initialise the base spider, then set ``allowed_domains``.

               Arguments are forwarded untouched to ``BaseSpider.__init__`` so the
               spider can still be created with no arguments, as before.
               """
               super(Xidian_Spider, self).__init__(*args, **kwargs)
               self.allowed_domains = ['xidian.edu.cn']

           def parse(self, response):
               """Yield the ``Website`` item of *response*, then a ``Request`` per link.

               The item is yielded first; afterwards every ``href`` attribute of the
               page is filtered, resolved against the page URL and scheduled with
               ``self.parse`` as callback.
               """
               hxs = HtmlXPathSelector(response)

               hrefs = hxs.select('//@href').extract()

               #if not self.gethostname(response.url) in self.allowed_domains:
               #    self.allowed_domains.append(self.gethostname(response.url))

               item = Website()
               item['url'] = response.url
               # A page without a <title> gets an empty title instead of raising
               # IndexError and losing the whole page (item and links).
               titles = hxs.select('/html/head/title/text()').extract()
               item['title'] = titles[0] if titles else u''

               # FIXME: this XPath selects every text node under <body>, including
               # inline JavaScript code.
               texts = hxs.select('/html/body//*/text()').extract()
               # Each stripped fragment is followed by a single space.
               item['content'] = ''.join(text.strip() + ' ' for text in texts)

               yield item

               for href in hrefs:
                   href = href.strip()
                   if not href:
                       continue

                   utf8_href = href.encode('utf-8')
                   if self._SKIPPED_SUFFIX.match(utf8_href):
                       continue
                   if self._SKIPPED_PREFIX.match(utf8_href):
                       continue

                   # Resolve the link against the current page so that relative,
                   # root-relative ("/x") and absolute links are all handled the
                   # same way.
                   weburl = urljoin_rfc(response.url, href, response.encoding)

                   # Only web pages can be crawled (drops mailto:, ftp:, ...).
                   if not weburl.startswith(('http://', 'https://')):
                       continue

                   # Remove "../" segments that are still left after resolution
                   # (references pointing above the site root).
                   weburl = re.sub(r'/\.\./\.\./', r'/', weburl)
                   weburl = re.sub(r'/\.\./', r'/', weburl)

                   yield Request(weburl, callback=self.parse)

           def gethostname(self, res_url):
               """Return the host part of *res_url* as split off by ``urllib.splithost``."""
               proto, rest = urllib.splittype(res_url)
               host, rest = urllib.splithost(rest)
               return host

爬取获得的ITEM会交给PIPELINE处理。

这里的PipeLine做了去重处理，不能简单地放在内存中，因此使用的是Bloom Filter算法，这里直接安装了Python开源库中的pybloomfilter（有时间研究一下）

 
       01 
       class DuplicatesPipeline(object):
           """Item pipeline that drops items whose URL was already seen.

           URLs are remembered in a Bloom filter. Items with a new URL are logged to
           the ``visitedsites`` file (one "url<TAB>title" line each) and handed to
           the search index; items with a known URL are dropped.
           """

           def __init__(self):
               """Open the Bloom filter, the visited-sites log and the search index."""
               self._released = False
               self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
               self.f_write = open('visitedsites','w')
               self.si = SearchIndex()
               self.si.SearchInit()

           def process_item(self, item, spider):
               """Index *item* if its URL is new, otherwise raise ``DropItem``.

               Returns the unchanged item so later pipelines can process it.
               """
               print('************%d pages visited!*****************' % len(self.bf))
               if self.bf.add(item['url']):  # True if the URL was already in the filter
                   raise DropItem("Duplicate item found: %s" % item)

               self.save_to_file(item['url'], item['title'])
               self.si.AddIndex(item)
               return item

           def save_to_file(self,url,utitle):
               """Append one "url<TAB>title" line to the visited-sites log.

               *utitle* is encoded to UTF-8 before it is written.
               """
               self.f_write.write(url)
               self.f_write.write('\t')
               self.f_write.write(utitle.encode('utf-8'))
               self.f_write.write('\n')

           def close_spider(self, spider):
               """Scrapy hook called when the spider closes: release all resources.

               Doing the cleanup here makes sure the log is flushed and the index is
               finalised at a well-defined moment instead of whenever (if ever) the
               object is garbage collected.
               """
               self._release()

           def _release(self):
               """Close the log file and finish the index; only the first call acts."""
               if getattr(self, '_released', False):
                   return
               self._released = True

               # getattr guards: __init__ may have failed before these were set.
               f_write = getattr(self, 'f_write', None)
               if f_write is not None:
                   f_write.close()
               si = getattr(self, 'si', None)
               if si is not None:
                   si.IndexDone()

           def __del__(self):
               """Fallback cleanup in case ``close_spider`` was never called."""
               self._release()

该类中的SearchIndex是ElasticSearch创建索引的类。定义如下：

 
       01 
       #!/usr/bin/env python 
      
       02 
       #-*- coding:utf-8-*- 
      
       03 
       import os 
      
       04 
       import sys 
      
       05 
       from pyes import * 
      
       06 
       from mymodules.items import Website 
      
       07 
       INDEX_NAME='xidian_spider' 
      
       08 
         
       09 
       class SearchIndex(object): 
      
       10 
         
       11 
           def SearchInit(self): 
      
       12 
               self.conn = ES('127.0.0.1:9200', timeout=3.5)#Connect to ES 
      
       13 
               try: 
      
       14 
                   self.conn.delete_index(INDEX_NAME) 
      
       15 
                   #pass 
      
       16 
               except: 
      
       17 
                   pass 
      
       18 
               self.conn.create_index(INDEX_NAME)#Create a new INDEX 
      
       19 
         
       20 
               #Define the structure of the data format 
      
       21 
               mapping = {u'content': {'boost': 1.0, 
      
       22 
                                 'index': 'analyzed', 
      
       23 
                                 'store': 'yes', 
      
       24 
                                 'type': u'string', 
      
       25 
                                 "indexAnalyzer":"ik", 
      
       26 
                                 "searchAnalyzer":"ik", 
      
       27 
                                 "term_vector" : "with_positions_offsets"}, 
      
       28 
                         u'title': {'boost': 1.0, 
      
       29 
                                    'index': 'analyzed', 
      
       30 
                                    'store': 'yes', 
      
       31 
                                    'type': u'string', 
      
       32 
                                    "indexAnalyzer":"ik", 
      
       33 
                                    "searchAnalyzer":"ik", 
      
       34 
                                    "term_vector" : "with_positions_offsets"}, 
      
       35 
                         u'url': {'boost': 1.0, 
      
       36 
                                    'index': 'analyzed', 
      
       37 
                                    'store': 'yes', 
      
       38 
                                    'type': u'string', 
      
       39 
                                    #"indexAnalyzer":"ik", 
      
       40 
                                    #"searchAnalyzer":"ik", 
      
       41 
                                    "term_vector" : "with_positions_offsets"}, 
      
       42 
               } 
      
       43 
         
       44 
               self.conn.put_mapping("searchEngine-type", {'properties':mapping}, [INDEX_NAME])#Define the type 
      
       45 
         
       46 
           def AddIndex(self,item): 
      
       47 
         
       48 
               print 'Adding Index item URL %s'% item['title'].encode('utf-8') 
      
       49 
               self.conn.index({'title':item['title'].encode('utf-8'), \ 
      
       50 
                       'url':item['url'].encode('utf-8'),\