pip3 install Scrapy
process_request(request, spider)
process_response(request, response, spider)
process_exception(request, exception, spider)
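These three methods are the hooks a Downloader Middleware can implement. As a minimal sketch (the RandomUserAgentMiddleware name and the User-Agent list are illustrative, not from the original notes), a middleware that rotates the User-Agent header could look like this:

import random

class RandomUserAgentMiddleware(object):
    # hypothetical middleware: sets a random User-Agent on every request
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
    ]

    def process_request(self, request, spider):
        # called for every request before it reaches the downloader
        request.headers['User-Agent'] = random.choice(self.user_agents)
        return None  # None means: continue processing this request

    def process_response(self, request, response, spider):
        # called with every response before it reaches the spider
        return response

    def process_exception(self, request, exception, spider):
        # called when the downloader or process_request raises an exception
        return None

To take effect it would be registered under DOWNLOADER_MIDDLEWARES in settings.py.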
process_spider_input(response, spider)
process_spider_output(response, result, spider)
process_spider_exception(response, exception, spider)
process_start_requests(start_requests, spider)
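These four hooks belong to Spider Middleware. A pass-through skeleton (class name is illustrative) shows where each one fires:

class PassThroughSpiderMiddleware(object):
    # hypothetical spider middleware that forwards everything unchanged
    def process_spider_input(self, response, spider):
        # called for every response before it is handed to the spider
        return None

    def process_spider_output(self, response, result, spider):
        # called with the items/requests the spider yields; must return an iterable
        for item_or_request in result:
            yield item_or_request

    def process_spider_exception(self, response, exception, spider):
        # called when the spider (or process_spider_input) raises an exception
        return None

    def process_start_requests(self, start_requests, spider):
        # called with the spider's start requests; must return an iterable of Requests
        for request in start_requests:
            yield request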
open_spider(spider)
close_spider(spider)
from_crawler(cls, crawler)
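open_spider(), close_spider() and from_crawler() are the optional Item Pipeline methods besides process_item(). As a rough sketch, a pipeline that stores items in MongoDB, assuming pymongo is installed and MONGO_URI / MONGO_DB are defined in settings.py:

import pymongo

class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # build the pipeline from the crawler's settings
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'),
        )

    def open_spider(self, spider):
        # called when the spider opens: connect to the database
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # insert each item into a collection named after the item class
        self.db[item.__class__.__name__].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        # called when the spider closes: release the connection
        self.client.close()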
from scrapy import Selector

body = '...'
selector = Selector(text=body)
title = selector.xpath('//title/text()').extract_first()  # extract the content of the title tag
print(title)
Pass the text parameter when constructing to build a Selector object; content can then be extracted with methods such as xpath() and css().
result = response.selector.xpath('//a')
result
result.xpath('.//a[@href="image1.html"]/text()').extract_first()  # XPath
result.css('a[@href="image1.html"]::text').extract_first()  # CSS
scrapy startproject tutorial
tutorial/
    scrapy.cfg
    tutorial/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            ...
scrapy genspider quotes quotes.toscrape.com
import scrapy

class QuoteItem(scrapy.Item):
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
scrapy crawl quotes
quotes = response.css('.quote')  # select each quote block
for quote in quotes:
    item = QuoteItem()
    item['text'] = quote.css('.text::text').extract_first()
    item['author'] = quote.css('.author::text').extract_first()
    item['tags'] = quote.css('.tags .tag::text').extract()
    yield item
scrapy shell quotes.toscrape.com  # interact with the response from the command line
next = response.css('.pager .next a::attr(href)').extract_first()
url = response.urljoin(next)
yield scrapy.Request(url=url, callback=self.parse)
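Combining the extraction and pagination snippets, the whole spider might look roughly like this; the class skeleton follows the genspider template, and the import path assumes the default project layout:

import scrapy
from tutorial.items import QuoteItem

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # extract every quote block on the page
        for quote in response.css('.quote'):
            item = QuoteItem()
            item['text'] = quote.css('.text::text').extract_first()
            item['author'] = quote.css('.author::text').extract_first()
            item['tags'] = quote.css('.tags .tag::text').extract()
            yield item

        # follow the "next" link until there are no more pages
        next = response.css('.pager .next a::attr(href)').extract_first()
        if next:
            yield scrapy.Request(url=response.urljoin(next), callback=self.parse)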
scrapy crawl quotes -o quotes.json  # the output can also be saved in other formats such as CSV or XML
from scrapy.exceptions import DropItem

class TextPipeline(object):
    def __init__(self):
        self.limit = 50

    def process_item(self, item, spider):
        if item['text']:
            if len(item['text']) > self.limit:
                item['text'] = item['text'][0:self.limit].rstrip() + '...'  # truncate text longer than 50 characters
            return item
        else:
            raise DropItem('Missing Text')
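For Scrapy to run the pipeline it has to be registered in settings.py; the path below assumes the pipeline lives in tutorial/pipelines.py, and 300 is just an example priority (lower numbers run earlier):

# settings.py
ITEM_PIPELINES = {
    'tutorial.pipelines.TextPipeline': 300,
}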
## This series contains my study notes for 《Python3爬虫开发实战》. The posts in the series are listed below:
(0) Learning roadmap
(1) Development environment setup
(2) Web scraping basics
(3) Using the basic libraries
(4) Using the parsing libraries
(5) Data storage
More to come...