from_crawler(crawler, *args, **kwargs)
# from_crawler runs before __init__. It is typically used to read database
# connection attributes from the project settings and pass them to the
# constructor — a slightly simpler pattern, as shown below.
class DatabasePipeline:

    @classmethod
    def from_crawler(cls, crawler):
        """Alternate constructor: pull connection settings from the crawler.

        Each key below must match a name defined in settings.py.
        """
        HOST = crawler.settings.get('HOST')
        PORT = crawler.settings.get('PORT')
        USER = crawler.settings.get('USER')
        PWD = crawler.settings.get('PWD')
        DB = crawler.settings.get('DB')
        TABLE = crawler.settings.get('TABLE')
        return cls(HOST, PORT, USER, PWD, DB, TABLE)

    def __init__(self, HOST, PORT, USER, PWD, DB, TABLE):
        # Plain attribute storage — one look and you get it.
        self.HOST = HOST
        self.PORT = PORT
        self.USER = USER
        self.PWD = PWD
        self.DB = DB
        self.TABLE = TABLE
start_requests(self)
:该方法用来发起第一个Requests请求,且必须返回一个可迭代的对象。它在爬虫程序打开时就被Scrapy调用,Scrapy只调用它一次。举例:
若是不写start_requests
方法:它会把start_urls
中的两个网址都发送出去
import scrapy


class BaiduSpider(scrapy.Spider):
    """Demo spider: without a start_requests() override, Scrapy issues one
    request for every URL listed in start_urls, so parse() runs per response.
    """

    name = 'test'
    # allowed_domains must contain bare domains, not full URLs — a URL here
    # would make the offsite middleware filter out every request.
    allowed_domains = ['httpbin.org']
    start_urls = ['http://httpbin.org/get', 'http://httpbin.org/get']

    def parse(self, response):
        # Default callback — invoked once per downloaded response.
        print('接受一次')
若是写start_requests
方法:它会把咱们指定的Request对象发送出去,发送必须以迭代器
的形式输出
parse(self, response)
:这是默认的回调函数
log(self, message, level=logging.DEBUG, **kw)
:定义日志级别
close(self, reason)
:关闭爬虫程序时执行