Previously, data collection came with two requirements: URL deduplication and incremental crawling (only newly added URLs should be requested, otherwise the load on the crawled site's servers keeps growing). The initial idea was simply to use a Redis set for URL deduplication, but later in development the incremental-crawling part got solved along the way. The main code is posted below.
class InsertRedis(object):
    def __init__(self):
        self.Redis = RedisOpera('insert')

    def process_item(self, item, spider):
        self.Redis.write(item['url'])
        return item
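For this pipeline to run it has to be registered in the project's settings.py. A minimal sketch, assuming the project package is newscrawl and the class lives in newscrawl/pipelines.py (both the module path and the priority value are assumptions here):

# settings.py -- enable the Redis-writing item pipeline
ITEM_PIPELINES = {
    'newscrawl.pipelines.InsertRedis': 300,
}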
Note: the concrete Redis operations themselves are not covered here; they are attached at the end of this post.
from scrapy.exceptions import IgnoreRequest


class IngoreRequestMiddleware(object):
    def __init__(self):
        self.Redis = RedisOpera('query')

    def process_request(self, request, spider):
        # Drop any request whose URL is already recorded in Redis
        if self.Redis.query(request.url):
            raise IgnoreRequest("IgnoreRequest : %s" % request.url)
        else:
            return None
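Likewise, the downloader middleware only takes effect once it is enabled in settings.py. A minimal sketch, assuming it is defined in newscrawl/middlewares.py (module path and priority are assumptions):

# settings.py -- enable the URL-filtering downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'newscrawl.middlewares.IngoreRequestMiddleware': 543,
}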
def start_requests(self):
    # page_num is the pagination parameter
    yield FormRequest('https://www.demo.org/vuldb/vulnerabilities?page=' + str(self.page_num),
                      callback=self.parse_page)

def parse_page(self, response):
    urls = response.xpath('//tbody/tr').extract()
    for url in urls:
        request_url = Selector(text=url).xpath('//td[@class="vul-title-wrapper"]/a/@href').extract()[0]
        if re.search('/vuldb/ssvid-\d+', request_url):
            yield FormRequest('https://www.demo.org' + request_url.strip(),
                              callback=self.parse_item, dont_filter=False)
    if len(urls) == 20:
        # a full page of 20 rows means there may be a next page
        self.page_num += 1

def parse_item(self, response):
    item = WebcrawlItem()
    self.count += 1
    item['url'] = response.url
    yield item
    # keep walking the listing pages
    yield FormRequest('https://www.demo.org/vuldb/vulnerabilities?page=' + str(self.page_num),
                      callback=self.parse_page)
In the third snippet, parse_item() calls back into parse_page(). If the Redis database holds no URLs at all, the spider keeps paging through the entire site. But if we already finished a full crawl at some earlier point and restart the program to pick up newly added data, each URL is first checked against Redis: as soon as a duplicate URL shows up, parse_page() no longer reaches parse_item() (URL deduplication), so the yield FormRequest('https://www.demo.org/vuldb/vu...' + str(self.page_num), callback=self.parse_page) inside parse_item() is not executed either, and the program drops out of the loop and finishes.
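The whole incremental behaviour rests on Redis set membership: writing the same URL twice is harmless, and a membership check tells the middleware whether to drop a request. A minimal sketch of that primitive with redis-py, assuming a local Redis on the default port and using the 'publicurls' set name from util.py below (the URL is made up):

import redis

r = redis.Redis(host='localhost', port=6379, db=0)

url = 'https://www.demo.org/vuldb/ssvid-12345'
print r.sismember('publicurls', url)   # False on a fresh crawl -> the request goes through
r.sadd('publicurls', url)              # the pipeline records the URL once the item is scraped
print r.sismember('publicurls', url)   # True on the next run -> middleware raises IgnoreRequest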
The Redis-related operations are attached below.
1 redisopera.py
# -*- coding: utf-8 -*-
import redis
import time

from scrapy import log

from newscrawl.util import RedisCollection


class RedisOpera:
    def __init__(self, stat):
        log.msg('init redis %s connection!!!!!!!!!!!!!!!!!!!!!!!!!' % stat, log.INFO)
        self.r = redis.Redis(host='localhost', port=6379, db=0)

    def write(self, values):
        # print self.r.keys('*')
        collectionname = RedisCollection(values).getCollectionName()
        self.r.sadd(collectionname, values)

    def query(self, values):
        collectionname = RedisCollection(values).getCollectionName()
        return self.r.sismember(collectionname, values)
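A quick usage sketch of RedisOpera outside of Scrapy, assuming a Redis server is reachable on localhost:6379 and the class is importable; the URLs are made up:

# hypothetical standalone check, not part of the spider
writer = RedisOpera('insert')
reader = RedisOpera('query')

writer.write('https://www.demo.org/vuldb/ssvid-12345')
print reader.query('https://www.demo.org/vuldb/ssvid-12345')  # True  -> will be ignored next run
print reader.query('https://www.demo.org/vuldb/ssvid-99999')  # False -> still needs crawling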
2 util.py
# -*- coding: utf-8 -*-
import re

from scrapy import log


class RedisCollection(object):
    def __init__(self, OneUrl):
        self.collectionname = OneUrl

    def getCollectionName(self):
        # URLs matching a known site keyword get their own set;
        # everything else goes into the shared 'publicurls' set
        if self.IndexAllUrls() is not None:
            name = self.IndexAllUrls()
        else:
            name = 'publicurls'
        # log.msg("the collections name is %s" % name, log.INFO)
        return name

    def IndexAllUrls(self):
        allurls = ['wooyun', 'freebuf']
        result = None
        for keyword in allurls:
            if re.findall(keyword, self.collectionname):
                result = keyword
                break
        return result
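RedisCollection simply routes a URL to a Redis set name: URLs containing 'wooyun' or 'freebuf' get their own set, everything else falls into 'publicurls'. A small sketch of the mapping, with illustrative URLs:

print RedisCollection('http://www.wooyun.org/bugs/wooyun-2015-012345').getCollectionName()   # 'wooyun'
print RedisCollection('http://www.freebuf.com/articles/web/12345.html').getCollectionName()  # 'freebuf'
print RedisCollection('https://www.demo.org/vuldb/ssvid-12345').getCollectionName()          # 'publicurls'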