搜索引擎–基于Django/Scrapy/ElasticSearch的搜索引擎的实现

  • 主机环境:Ubuntu 13.04
  • Python版本:2.7.4
  • Django版本:1.5.4
  • Scrapy版本:0.18.2
  • ElasticSearch版本:0.90.5

原创作品,转载请标明:http://blog.geekcome.com/archives/138

闲来无聊,查看了搜索引擎相关的基本知识,通过搜集资料,了解了搜索引擎所需要的基本子系统:爬取子系统、索引服务子系统、Web请求和应答子系统。然后通过学习基本的开源框架文档,把这些子系统集成为一个项目,该项目已经PUSH到GitHub。

首先查看基于开源的Scrapy爬虫框架编写的一个爬虫,爬取校园网的内容(主要是免流量)

01 #!/usr/bin/env python
02 #-*- coding:utf-8 -*-
03 #from urlparse import urljoin
04 from scrapy.utils.url import urljoin_rfc
05 from scrapy.spider import BaseSpider
06 from scrapy.selector import HtmlXPathSelector
07 from scrapy.http import Request
08  
09 from scrapy.exceptions import DropItem
10  
11 from mymodules.items import Website
12  
13 import urllib
14 import re
15  
16 class Xidian_Spider(BaseSpider):
17     name = "xidian_spider"
18     start_urls = [
19        "http://www.xidian.edu.cn",
20        #"http://rs.xidian.edu.cn/forum.php",
21  
22     ]
23  
24     def __init__(self):
25         """init the allowed_domain"""
26         self.allowed_domains = ['xidian.edu.cn']
27  
28     def parse(self, response):
29         """In this parse,we use double yeild to return the item or Request"""
30         hxs = HtmlXPathSelector(response)
31  
32         refer_websites = hxs.select('//@href').extract()
33  
34         #if not self.gethostname(response.url) in self.allowed_domains:
35         #    self.allowed_domains.append(self.gethostname(response.url))
36  
37         item = Website()
38         item['url'] = response.url
39         item['title'] = hxs.select('/html/head/title/text()').extract()[0]
40  
41         """FIXME:This XPath select all the elements,include the javascript code.BAD!!"""
42         str = ''
43         list = hxs.select('/html/body//*/text()').extract()
44         for s in list:
45             str += s.strip()
46             str += ' '
47  
48         item['content'] = str
49  
50         yield item
51  
52         for weburl in refer_websites:
53  
54             utf8_url = weburl.encode('utf-8')
55  
56             """The following regex to match the prefix and postfix of urls"""
57             postfix = re.compile(r'.+\.((jpg)|(ico)|(rar)|(zip)|(doc)|(ppt)|(xls)|(css)|(exe)|(pdf))x?$')
58             prefix = re.compile(r'^((javascript:)|(openapi)).+')
59  
60             if postfix.match(utf8_url):
61                 continue
62             if prefix.match(utf8_url):
63                 continue
64             if not utf8_url.startswith('http://'):
65                 #weburl = urljoin_rfc(response.url, weburl, response.encoding)
66                 weburl = 'http://'+self.gethostname(response.url)+'/'+weburl
67  
68             weburl = re.sub(r'/\.\./\.\./',r'/',weburl)
69             weburl = re.sub(r'/\.\./',r'/',weburl)
70  
71             yield Request(weburl, callback=self.parse)
72  
73     def gethostname(self, res_url):
74         """get the host name of a url"""
75         proto, rest = urllib.splittype(res_url)
76         host, rest = urllib.splithost(rest)
77         return host

爬取获得的ITEM会交给PIPELINE处理。

这里的Pipeline做了去重处理。已访问的URL不能简单地全部放在内存中,因此使用的是Bloom Filter算法,这里直接安装了Python开源库中的pybloomfilter(有时间研究一下)

01 class DuplicatesPipeline(object):
02  
03     def __init__(self):
04         self.bf = BloomFilter(10000000, 0.01, 'filter.bloom')
05         self.f_write = open('visitedsites','w')
06         self.si = SearchIndex()
07         self.si.SearchInit()
08  
09     def process_item(self, item, spider):
10         print '************%d pages visited!*****************' %len(self.bf)
11         if self.bf.add(item['url']):#True if item in the BF
12             raise DropItem("Duplicate item found: %s" % item)
13         else:
14             #print '%d pages visited!'% len(self.url_seen)
15             self.save_to_file(item['url'],item['title'])
16             self.si.AddIndex(item)
17             return item
18  
19     def save_to_file(self,url,utitle):
20         self.f_write.write(url)
21         self.f_write.write('\t')
22         self.f_write.write(utitle.encode('utf-8'))
23         self.f_write.write('\n')
24  
25     def __del__(self):
26         """docstring for __del__"""
27         self.f_write.close()
28         self.si.IndexDone()

该类中的SearchIndex是用ElasticSearch创建索引的类。定义如下:

01 #!/usr/bin/env python
02 #-*- coding:utf-8-*-
03 import os
04 import sys
05 from pyes import *
06 from mymodules.items import Website
07 INDEX_NAME='xidian_spider'
08  
09 class SearchIndex(object):
10  
11     def SearchInit(self):
12         self.conn = ES('127.0.0.1:9200', timeout=3.5)#Connect to ES
13         try:
14             self.conn.delete_index(INDEX_NAME)
15             #pass
16         except:
17             pass
18         self.conn.create_index(INDEX_NAME)#Create a new INDEX
19  
20         #Define the structure of the data format
21         mapping = {u'content': {'boost': 1.0,
22                           'index': 'analyzed',
23                           'store': 'yes',
24                           'type': u'string',
25                           "indexAnalyzer":"ik",
26                           "searchAnalyzer":"ik",
27                           "term_vector": "with_positions_offsets"},
28                   u'title': {'boost': 1.0,
29                              'index': 'analyzed',
30                              'store': 'yes',
31                              'type': u'string',
32                              "indexAnalyzer":"ik",
33                              "searchAnalyzer":"ik",
34                              "term_vector": "with_positions_offsets"},
35                   u'url': {'boost': 1.0,
36                              'index': 'analyzed',
37                              'store': 'yes',
38                              'type': u'string',
39                              #"indexAnalyzer":"ik",
40                              #"searchAnalyzer":"ik",
41                              "term_vector": "with_positions_offsets"},
42         }
43  
44         self.conn.put_mapping("searchEngine-type", {'properties':mapping}, [INDEX_NAME])#Define the type
45  
46     def AddIndex(self,item):
47  
48         print 'Adding Index item URL %s'% item['title'].encode('utf-8')
49         self.conn.index({'title':item['title'].encode('utf-8'), \
50                 'url':item['url'].encode('utf-8'),\
相关文章
相关标签/搜索