使用扩展 + spider_idle 信号关闭爬虫。
启用扩展(settings.py):
# Scrapy extension registry: maps an extension's import path to its load
# order; mapping a path to None disables that extension.
EXTENSIONS = {
    # 'scrapy.extensions.telnet.TelnetConsole': None,
    # Idle-shutdown extension defined in extention_my.py.
    'extention_my.RedisSpiderSmartIdleClosedExensions': 300,
}
额外配置参数(conf.py):
# Master switch: when falsy, the extension's from_crawler() raises NotConfigured.
MYEXT_ENABLED = True
# The spider is closed once the run of consecutive idle signals exceeds this count.
IDLE_NUMBER = 5
扩展类:
extention_my.py
#coding:utf-8
"""
----------------------------------------
description:
author: sss
date:
----------------------------------------
change:
----------------------------------------
"""
__author__ = 'sss'
import time
from scrapy import signals
from scrapy.exceptions import NotConfigured
from utils.mylogger import mylogger

# Module-level logger shared by the extension's signal handlers.
# NOTE(review): `mylogger` is a project helper; its `.logger` attribute is
# presumably a standard logging.Logger - confirm against utils/mylogger.py.
logger_c = mylogger(__name__)
logger_m = logger_c.logger
class RedisSpiderSmartIdleClosedExensions(object):
    """Close a spider after a run of consecutive ``spider_idle`` signals.

    Every ``spider_idle`` signal is timestamped. Timestamps that follow each
    other closely form an idle "streak"; a large gap between two signals
    means the spider did some work in between, so the streak restarts. Once
    the streak grows past ``idle_number`` entries the spider is closed.
    """

    # Two idle signals further apart than this many seconds are treated as
    # separate streaks (the spider had work to do in between).
    # NOTE(review): the original comment mentions 5 seconds while the code
    # compared against 6; the code's value is kept - confirm the intent.
    IDLE_GAP_SECONDS = 6

    def __init__(self, idle_number, crawler):
        """Store the crawler and the idle limit, and reset the counters.

        Args:
            idle_number: length of the idle streak that must be exceeded
                before the spider is closed.
            crawler: the Scrapy crawler this extension is attached to.
        """
        self.crawler = crawler
        self.idle_number = idle_number
        self.idle_list = []   # timestamps of the current idle streak
        self.idle_count = 0   # total idle signals seen, never reset

    @classmethod
    def from_crawler(cls, crawler):
        """Build the extension and connect it to the crawler's signals.

        Configuration is read from the project's ``conf`` module at call time.

        Raises:
            NotConfigured: if ``conf.MYEXT_ENABLED`` is falsy.
        """
        # Check first whether the extension should be enabled at all.
        from conf import MYEXT_ENABLED
        if not MYEXT_ENABLED:
            raise NotConfigured

        # Idle-streak limit from the config; no default is applied here.
        from conf import IDLE_NUMBER as idle_number

        ext = cls(idle_number, crawler)

        # Wire each signal to the handler of the same name.
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(ext.spider_idle, signal=signals.spider_idle)

        return ext

    def spider_opened(self, spider):
        """Log the configured idle limit when the spider opens."""
        logger_m.info("opened spider %s redis spider Idle, Continuous idle limit: %d", spider.name, self.idle_number)

    def spider_closed(self, spider):
        """Log the total idle count and the final streak length on close."""
        logger_m.info("closed spider %s, idle count %d , Continuous idle count %d",
                      spider.name, self.idle_count, len(self.idle_list))

    def spider_idle(self, spider):
        """Record one idle signal and close the spider if the streak is too long."""
        self.idle_count += 1
        # Remember when this idle signal fired.
        self.idle_list.append(time.time())
        streak = self.idle_list

        # A large gap between the last two signals means the spider was busy
        # in between: restart the streak from the newest timestamp. The check
        # starts at two entries so the very first pair is compared as well.
        if len(streak) >= 2 and streak[-1] - streak[-2] > self.IDLE_GAP_SECONDS:
            self.idle_list = [streak[-1]]
        elif len(streak) > self.idle_number:
            # The streak exceeded the configured limit: shut the spider down.
            logger_m.info(
                '\n continued idle number exceed %s Times'
                '\n meet the idle shutdown conditions, will close the reptile operation'
                '\n idle start time: %s, close spider time: %s',
                self.idle_number, streak[0], streak[-1])
            # NOTE(review): this reason string matches the name used by
            # Scrapy's page-count close condition, which is misleading for an
            # idle shutdown; kept unchanged because consumers of the close
            # reason may rely on it.
            self.crawler.engine.close_spider(spider, 'closespider_pagecount')
其它没有什么特别之处,主要是关闭条件判断逻辑的设计。