Requirement: scrape the movie data from Douban Movie Top 250 (https://movie.douban.com/top250):
the title (title), rating (score), movie details (content), and one-line summary (info).
1. Analyze the page and locate each field with XPath (the expressions can be verified interactively, as shown right after this list)
XPath for the title: //div[@class="info"]//span[@class="title"][1]/text()
XPath for the movie details: //div[@class="info"]//div[@class="bd"]/p[1]
XPath for the rating: //div[@class="info"]//div[@class="star"]/span[2]/text()
XPath for the summary: //div[@class="info"]//span[@class="inq"]/text()
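Before writing any spider code, these expressions can be checked with scrapy shell. Douban tends to reject Scrapy's default User-Agent, so a browser-like one is passed via -s (the UA string here is only an example):

scrapy shell -s USER_AGENT="Mozilla/5.0" "https://movie.douban.com/top250"
>>> response.xpath('//div[@class="info"]//span[@class="title"][1]/text()').extract_first()
>>> response.xpath('//div[@class="info"]//div[@class="star"]/span[2]/text()').extract_first()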
2. Create the project and write items.py
Create the project: scrapy startproject douban
Enter the project directory: cd douban
Generate the spider: scrapy genspider movetop250 douban.com
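After these three commands the project layout looks roughly like this (main.py is the launcher script added by hand in the next step):

douban/
├── scrapy.cfg
├── main.py              # added manually below
└── douban/
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── movetop250.py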
1. The launcher script --- main.py
from scrapy import cmdline

cmdline.execute("scrapy crawl movetop250".split())
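Running python main.py from the directory that contains scrapy.cfg is then equivalent to running scrapy crawl movetop250 there. If the cmdline helper is not wanted, an in-process alternative (a sketch using Scrapy's public CrawlerProcess API) looks like this:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# load settings.py and run the spider inside this process
process = CrawlerProcess(get_project_settings())
process.crawl("movetop250")  # the spider's name attribute, not the file name
process.start()              # blocks until the crawl finishes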
2.items.py
import scrapy


class DoubanItem(scrapy.Item):
    # movie title
    title = scrapy.Field()
    # rating
    score = scrapy.Field()
    # movie details (director, cast, year, ...)
    content = scrapy.Field()
    # one-line summary
    info = scrapy.Field()
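A scrapy.Item behaves like a dict restricted to its declared fields, which is exactly what the pipelines below rely on when they call dict(item). A quick illustration:

from douban.items import DoubanItem

item = DoubanItem(title="...", score="9.0")
item["info"] = "..."
print(dict(item))      # a plain dict holding the assigned fields
# item["year"] = 2024  # would raise KeyError: only declared Fields are accepted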
3.movetop250.py
import scrapy

from douban.items import DoubanItem


class Movetop250Spider(scrapy.Spider):
    name = 'movetop250'
    allowed_domains = ['douban.com']
    offset = 0
    url = "https://movie.douban.com/top250?start="
    start_urls = [url + str(offset)]

    def parse(self, response):
        moves = response.xpath('//div[@class="info"]')
        for move in moves:
            item = DoubanItem()
            # movie title
            title = move.xpath('.//span[@class="title"][1]/text()').extract()[0]
            # details such as director and cast; p[1] holds several text nodes, so join them all
            content = move.xpath('.//div[@class="bd"]/p[1]/text()').extract()
            content = "".join(content).strip()
            # rating
            score = move.xpath('.//div[@class="star"]/span[2]/text()').extract()[0]
            # one-line summary; a few entries have none, so fall back to an empty string
            info = move.xpath('.//span[@class="inq"]/text()').extract()
            info = info[0] if info else ""
            item["title"] = title
            item["content"] = content
            item["score"] = score
            item["info"] = info
            yield item
        # request the next page until start=225, the last page of the Top 250
        if self.offset < 225:
            self.offset += 25
            url = self.url + str(self.offset)
            yield scrapy.Request(url, callback=self.parse)
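The offset arithmetic relies on the Top 250 being exactly ten pages of 25 results. A more generic alternative, sketched here on the assumption that the pager keeps its "next" link inside a span with class "next" (worth re-checking against the live markup), follows the link instead of computing URLs:

# at the end of parse(), instead of the offset bookkeeping:
next_href = response.xpath('//span[@class="next"]/a/@href').extract_first()
if next_href:
    # response.follow resolves the relative href against the current URL
    yield response.follow(next_href, callback=self.parse)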
4.pipelines.py
import json

import pymongo


class DoubanMongodbPipeline(object):
    # scrapy.conf was removed from newer Scrapy releases, so the settings are
    # taken from the crawler via from_crawler instead
    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def __init__(self, settings):
        print("=====start=====")
        host = settings["MONGO_HOST"]
        port = settings["MONGO_PORT"]
        dbname = settings["MONGO_DBNAME"]
        sheetname = settings["MONGO_SHEETNAME"]
        print("host==", host)
        print("port==", port)
        print("dbname==", dbname)
        print("sheetname==", sheetname)
        # create the client
        client = pymongo.MongoClient(host=host, port=port)
        # get or create the database
        mydb = client[dbname]
        # get or create the collection
        self.post = mydb[sheetname]

    def process_item(self, item, spider):
        # insert_one replaces pymongo's deprecated insert()
        self.post.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        # this pipeline opens no file, so there is nothing to close here
        print("======end======")


class DoubanPipeline(object):
    def __init__(self):
        print("=====start=====")
        self.file = open("movetop250.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # write each item as one line of JSON
        dict_json = dict(item)
        json_str = json.dumps(dict_json, ensure_ascii=False) + "\n"
        self.file.write(json_str)
        return item

    def close_spider(self, spider):
        print("======close_spider======")
        self.file.close()
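Once a crawl has finished, the JSON-lines file written by DoubanPipeline is easy to spot-check; a minimal sketch (movetop250.json is created in the directory the crawl was started from):

import json

with open("movetop250.json", encoding="utf-8") as f:
    records = [json.loads(line) for line in f]

print(len(records))  # should reach 250 once every page has been crawled
print(records[0]["title"], records[0]["score"])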
5.settings.py
BOT_NAME = 'douban'

SPIDER_MODULES = ['douban.spiders']
NEWSPIDER_MODULE = 'douban.spiders'

# practice crawlers like this one usually do not obey robots.txt
ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                  ' Chrome/67.0.3396.99 Safari/537.36',
}

ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 301,
    'douban.pipelines.DoubanMongodbPipeline': 300,
}

# logging
LOG_FILE = "douban.log"
LOG_LEVEL = "DEBUG"

# user agent
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 " \
             "Safari/537.36"

# MongoDB connection settings
# mongo host
MONGO_HOST = "127.0.0.1"
# mongo port
MONGO_PORT = 27017
# database the data is stored in
MONGO_DBNAME = "douban"
# collection the data is stored in
MONGO_SHEETNAME = "movetop250"

# Disable cookies (enabled by default)
COOKIES_ENABLED = False
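With the crawl done, the MongoDB side can be verified with a few lines of pymongo against the local instance configured above:

import pymongo

client = pymongo.MongoClient(host="127.0.0.1", port=27017)
collection = client["douban"]["movetop250"]

print(collection.count_documents({}))  # number of stored movies
for doc in collection.find().limit(3):
    print(doc["title"], doc["score"])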