Scrapy routes whatever the spider's generator yields by type: if `yield` produces a Request object, it goes back to the scheduler to be downloaded; if it produces an Item object, it is handed to the item pipeline (a spider sketch showing both cases follows the extraction example below). The fields in this example are extracted with CSS selectors.
Define the item class in the items.py file:
```python
import scrapy

class MyscrapyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    price = scrapy.Field()
    prostatus = scrapy.Field()
```
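A declared Item behaves like a dict with a fixed schema: declared fields accept dict-style access, while assigning a field that was never declared raises a KeyError. A quick illustrative snippet (not from the original post):

```python
item = MyscrapyItem(title='demo', price='9.9')
item['prostatus'] = 'in stock'  # declared fields accept dict-style assignment
print(dict(item))               # convert to a plain dict, e.g. before a database insert
# item['color'] = 'red'         # KeyError: 'color' was never declared as a Field
```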
In the spider, instantiate the item, fill its declared fields, and yield it:

```python
from myscrapy.items import MyscrapyItem

def get_info(self, response):
    elements_list = response.css('.product')
    for element in elements_list:
        # CSS selectors: ::attr(...) reads an attribute, ::text reads the node text
        title = element.css('.productTitle a::attr(title)').extract_first()
        price = element.css('.productPrice em::attr(title)').extract_first()
        prostatus = element.css('.productStatus em::text').extract_first()
        item = MyscrapyItem()    # instantiate an item object
        item['title'] = title    # fill in the declared fields
        item['price'] = price
        item['prostatus'] = prostatus
        yield item
```
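To make the routing described at the top concrete, here is a minimal sketch of a complete spider around this callback. The spider name, start URL, and next-page selector are placeholders invented for illustration; the routing behavior itself is standard Scrapy.

```python
import scrapy
from myscrapy.items import MyscrapyItem

class ProductSpider(scrapy.Spider):            # hypothetical spider around the example
    name = 'products'
    start_urls = ['https://example.com/list']  # placeholder URL

    def parse(self, response):
        for element in response.css('.product'):
            item = MyscrapyItem()
            item['title'] = element.css('.productTitle a::attr(title)').extract_first()
            yield item                         # an Item is handed to the item pipeline

        next_page = response.css('.next a::attr(href)').extract_first()  # placeholder selector
        if next_page:
            # a Request goes back to the scheduler to be downloaded,
            # so both object types can be mixed freely in one callback
            yield response.follow(next_page, callback=self.parse)
```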
Enable the pipeline in settings.py:

```python
ITEM_PIPELINES = {
    'myscrapy.pipelines.MyscrapyPipeline': 300,    # lower number = higher priority
    # 'myscrapy.pipelines.MyscrapyPipeline1': 500,
}
# same ordering rule as for middleware
```
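The pipeline below reads its MongoDB connection parameters from settings.py through `from_crawler`. The names HOST, PORT, USER, PWD, DB, and TABLE are custom keys chosen by this example, not built-in Scrapy settings; a sketch of the corresponding entries, with placeholder values for a local MongoDB instance:

```python
# settings.py -- custom keys consumed by MyscrapyPipeline.from_crawler
HOST = '127.0.0.1'
PORT = 27017
USER = 'root'        # placeholder credentials
PWD = 'secret'
DB = 'myscrapy'      # target database name
TABLE = 'products'   # target collection name
```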
The pipeline itself lives in pipelines.py:

```python
# Two of its methods are used very often:
#   open_spider(self, spider):  runs when the spider starts; typically used to open a database connection
#   close_spider(self, spider): runs when the spider finishes; typically used to close it
# A simple MongoDB example:
from pymongo import MongoClient

class MyscrapyPipeline(object):
    def __init__(self, HOST, PORT, USER, PWD, DB, TABLE):
        self.HOST = HOST
        self.PORT = PORT
        self.USER = USER
        self.PWD = PWD
        self.DB = DB
        self.TABLE = TABLE

    # called by Scrapy before __init__; it builds the instance from the crawler settings
    @classmethod
    def from_crawler(cls, crawler):
        HOST = crawler.settings.get('HOST')  # crawler.settings exposes every name in the settings file
        PORT = crawler.settings.get('PORT')
        USER = crawler.settings.get('USER')
        PWD = crawler.settings.get('PWD')
        DB = crawler.settings.get('DB')
        TABLE = crawler.settings.get('TABLE')
        return cls(HOST, PORT, USER, PWD, DB, TABLE)

    def open_spider(self, spider):
        self.client = MongoClient(host=self.HOST, port=self.PORT,
                                  username=self.USER, password=self.PWD)
        print('Connected to the database')

    def close_spider(self, spider):
        self.client.close()
        print('Database connection closed')

    def process_item(self, item, spider):
        self.client[self.DB][self.TABLE].insert_one(dict(item))
        return item
```
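Because each `process_item` must return the item for the next pipeline in the priority chain, a pipeline can also act as a filter: raising DropItem stops an item from reaching lower-priority pipelines. A hedged sketch (the class name and validation rule are invented for illustration; `DropItem` itself is Scrapy's real exception for this purpose):

```python
from scrapy.exceptions import DropItem

class PriceRequiredPipeline(object):  # hypothetical pipeline; register it with a number < 300
    def process_item(self, item, spider):
        if not item.get('price'):
            # the item never reaches MyscrapyPipeline (priority 300)
            raise DropItem('missing price: %s' % item.get('title'))
        return item                   # returned items continue down the chain
```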