import scrapy


class QuotesSpider(scrapy.Spider):
    """Crawl quotes.toscrape.com page by page and emit one item per quote.

    Each item is a dict with the keys ``text``, ``author`` and ``tags``.
    Every value is the result of ``.extract()``, i.e. a list of all strings
    matched by the corresponding XPath expression.
    """

    name = "quotes"

    def start_requests(self):
        """Yield the initial request(s) that seed the crawl.

        Only the first listing page is requested here; further pages are
        reached by following the "next" link in :meth:`parse`.
        """
        seed_urls = (
            'http://quotes.toscrape.com/page/1/',
            # 'http://quotes.toscrape.com/page/2/',
        )
        for seed in seed_urls:
            yield scrapy.Request(url=seed, callback=self.parse)

    def parse(self, response):
        """Extract every quote on the page, then follow the "next" link.

        Args:
            response: The downloaded listing page.

        Yields:
            One dict per quote block, followed by a request for the next
            page when the page contains a "next" link.
        """
        for quote_node in response.xpath('//div[@class="quote"]'):
            # A leading "." keeps each query relative to the current quote
            # block instead of searching the whole document again.
            text = quote_node.xpath('.//span[@class="text"]/text()').extract()
            author = quote_node.xpath('.//small[@class="author"]/text()').extract()
            tags = quote_node.xpath('./div/meta/@content').extract()
            yield {
                'text': text,
                'author': author,
                'tags': tags,
            }

        next_href = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_href is None:
            # No "next" link on this page: nothing more to schedule.
            return

        # The href is relative; resolve it against the current page URL.
        yield scrapy.Request(response.urljoin(next_href), callback=self.parse)
知识点:python
1. scrapy 中,xpath 如何在循环中访问当前节点下的内容:路径以 './/' 开头,例如 quote.xpath('.//span[@class="text"]/text()')
2. 当前循环节点下的值也能够按照从当前节点出发的完整路径(以 './' 开头的相对路径)逐级获取,例如 './div/meta/@content'
3. 如何基于当前页面的 url 得到完整链接:response.urljoin(url) 会以当前页面的 url 为基准,把相对链接拼接成绝对 url