Scrapy is a fast, high-level screen-scraping and web-crawling framework written in Python, used to crawl websites and extract structured data from their pages. It has a wide range of uses, including data mining, monitoring, and automated testing.
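As a quick orientation (my addition, not part of the original post), a minimal spider looks roughly like this; the spider name and CSS selectors target the standard Scrapy tutorial site and are illustrative only:

```python
import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # extract structured data from the page
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
            }
```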
```python
request: scrapy.http.request.Request

# HtmlResponse inherits from TextResponse, which in turn inherits from Response
response: scrapy.http.response.html.HtmlResponse
response: scrapy.http.response.text.TextResponse
response: scrapy.http.response.Response
```
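A quick runtime check of that hierarchy (a sketch of a spider callback; assumes the request returned an HTML page):

```python
from scrapy.http import Response, TextResponse, HtmlResponse

# method of a spider
def parse(self, response):
    # for an HTML page the MRO is HtmlResponse -> TextResponse -> Response -> ...
    self.logger.debug([c.__name__ for c in type(response).__mro__])
    assert isinstance(response, (HtmlResponse, TextResponse, Response))
```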
```python
# Iterate over all settings from inside a spider; nested BaseSettings
# objects are printed one level deeper.
for k in self.settings:
    print(k, self.settings.get(k))
    if isinstance(self.settings.get(k), scrapy.settings.BaseSettings):
        for kk in self.settings.get(k):
            print('\t', kk, self.settings.get(k).get(kk))
```
(How to get the number of requests in queue in scrapy?)
```python
# scrapy.core.scheduler.Scheduler

# in a spider
len(self.crawler.engine.slot.scheduler)

# in a pipeline
len(spider.crawler.engine.slot.scheduler)
```
```python
# scrapy.core.engine.Slot.inprogress is just a set (of in-flight requests)

# in a spider
len(self.crawler.engine.slot.inprogress)

# in a pipeline
len(spider.crawler.engine.slot.inprogress)
```
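As a usage sketch (not from the original), both counters can be logged together from a spider callback to watch the crawl drain; the attribute paths are the ones shown above and may differ across Scrapy versions:

```python
# method of a spider
def parse(self, response):
    engine = self.crawler.engine
    # requests still waiting in the scheduler queue
    queued = len(engine.slot.scheduler)
    # requests currently being downloaded or processed
    in_progress = len(engine.slot.inprogress)
    self.logger.debug('queued=%d in_progress=%d', queued, in_progress)
```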
(How to get the pipeline object in Scrapy spider)
```python
import pymongo
from scrapy import Spider
from scrapy.utils.project import get_project_settings

settings = get_project_settings()

# Pipeline
class MongoDBPipeline(object):
    def __init__(self, mongodb_db=None, mongodb_collection=None):
        self.connection = pymongo.MongoClient(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])

    def get_date(self):
        pass

    def open_spider(self, spider):
        # hand the pipeline instance to the spider
        spider.myPipeline = self

    def process_item(self, item, spider):
        pass

# Spider
class MySpider(Spider):
    name = 'myspider'

    def __init__(self):
        self.myPipeline = None

    def start_requests(self):
        # the spider can store data directly through the pipeline
        self.myPipeline.process_item(item, self)

    def parse(self, response):
        self.myPipeline.get_date()
```
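An alternative wiring (my addition, not from the original answer) is to let Scrapy inject the settings through `from_crawler` instead of reading a module-level settings object; item pipelines support this hook:

```python
import pymongo


class MongoDBPipeline(object):
    def __init__(self, server, port):
        self.connection = pymongo.MongoClient(server, port)

    @classmethod
    def from_crawler(cls, crawler):
        # MONGODB_SERVER / MONGODB_PORT are assumed to be defined in settings.py
        return cls(
            crawler.settings.get('MONGODB_SERVER'),
            crawler.settings.getint('MONGODB_PORT'),
        )

    def open_spider(self, spider):
        spider.myPipeline = self
```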
(Multiple cookie sessions per spider)
```python
# Scrapy supports keeping multiple cookie sessions per spider through the
# cookiejar Request meta key. By default it uses a single cookie jar
# (session), but you can pass an identifier to use several.
for i, url in enumerate(urls):
    yield scrapy.Request("http://www.example.com", meta={'cookiejar': i},
                         callback=self.parse_page)

# Keep in mind that the cookiejar meta key is not "sticky": you need to keep
# passing it along on subsequent requests.
def parse_page(self, response):
    # do some processing
    return scrapy.Request("http://www.example.com/otherpage",
                          meta={'cookiejar': response.meta['cookiejar']},
                          callback=self.parse_other_page)
```
Closing spider (finished)
```python
# scrapy.core.engine.ExecutionEngine
def spider_is_idle(self, spider):
    if not self.scraper.slot.is_idle():
        # scraper is not idle
        return False

    if self.downloader.active:
        # downloader has pending requests
        return False

    if self.slot.start_requests is not None:
        # not all start requests are handled
        return False

    if self.slot.scheduler.has_pending_requests():
        # scheduler has pending requests
        return False

    return True

# print the same conditions from inside a spider
self.logger.debug('engine.scraper.slot.is_idle: %s' % repr(self.crawler.engine.scraper.slot.is_idle()))
self.logger.debug('\tengine.scraper.slot.active: %s' % repr(self.crawler.engine.scraper.slot.active))
self.logger.debug('\tengine.scraper.slot.queue: %s' % repr(self.crawler.engine.scraper.slot.queue))
self.logger.debug('engine.downloader.active: %s' % repr(self.crawler.engine.downloader.active))
self.logger.debug('engine.slot.start_requests: %s' % repr(self.crawler.engine.slot.start_requests))
self.logger.debug('engine.slot.scheduler.has_pending_requests: %s' % repr(self.crawler.engine.slot.scheduler.has_pending_requests()))
```
(Scrapy: How to manually insert a request from a spider_idle event callback?)
```python
import scrapy
from scrapy.spiders import Spider as BaseSpider  # BaseSpider is the pre-1.0 name


class FooSpider(BaseSpider):
    yet = False

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        from_crawler = super(FooSpider, cls).from_crawler
        spider = from_crawler(crawler, *args, **kwargs)
        # call self.idle every time the spider goes idle
        crawler.signals.connect(spider.idle, signal=scrapy.signals.spider_idle)
        return spider

    def idle(self):
        if not self.yet:
            # schedule one extra request the first time the spider idles
            self.crawler.engine.crawl(self.create_request(), self)
            self.yet = True
```
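The answer leaves `create_request` undefined; a hypothetical version (URL and callback are illustrative) could simply build the extra request. Raising `DontCloseSpider` from a `spider_idle` handler is the documented way to stop Scrapy from shutting the spider down on that idle cycle:

```python
import scrapy
from scrapy.exceptions import DontCloseSpider

# Additional methods of FooSpider (sketch, not part of the original answer).
def create_request(self):
    # illustrative URL; build whatever request you still need here
    return scrapy.Request('http://www.example.com/extra', callback=self.parse)

def idle(self):
    if not self.yet:
        self.crawler.engine.crawl(self.create_request(), self)
        self.yet = True
        # tell Scrapy not to close the spider on this idle cycle
        raise DontCloseSpider
```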
Default: False
| value | non-200 response | timeout |
| --- | --- | --- |
| True | callback | errback |
| False | errback | errback |
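My reading of the table is the per-request `handle_httpstatus_all` meta key (and the project-wide `HTTPERROR_ALLOW_ALL` setting, which also defaults to False). A sketch of how the two rows behave when an errback is attached; the URL and spider name are illustrative:

```python
import scrapy
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import TimeoutError


class StatusSpider(scrapy.Spider):
    name = 'status_demo'

    def start_requests(self):
        yield scrapy.Request(
            'http://httpbin.org/status/500',
            callback=self.parse,
            errback=self.on_error,
            meta={'handle_httpstatus_all': True},  # True row: non-200 -> callback
        )

    def parse(self, response):
        self.logger.info('callback got HTTP %s', response.status)

    def on_error(self, failure):
        # False row and timeouts end up here
        if failure.check(HttpError):
            self.logger.info('errback: non-200 response %s',
                             failure.value.response.status)
        elif failure.check(TimeoutError):
            self.logger.info('errback: download timed out')
```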
walker: the new diagram looks like just a refinement of the old one, with no substantive difference.
This article comes from walker snapshot.