Python version: 3.7
IDE: PyCharm
Environment: Windows 10
Framework: Scrapy
Website: https://www.ammmi.com
GitHub: https://github.com/daorenfeixueyuhua/PythonStudy.git
Project name: ammmi
Page analysis:

```python
base_url = 'https://www.ammmi.com/category/dongmantupian'

# Links to the detail pages of one image category (from a listing page)
x = response.xpath('//div[@class="mi_cont "]//ul/li/h3/a/@href').extract()

# Next listing page: base_url + '/page/' + page_num

# Category (title) of a set of images
x = response.xpath('//h2[@class="titletips"]/text()').extract_first()

# Image download URL
x = response.xpath('//a[@class="NavLinks"]/@href').extract_first()

# Naming the image
picture_name = x[58:-4]
web_name = 'ammmi'
save_directory = '../resource/' + web_name + '/' + title + '/' + picture_name + '.jpg'

# Item design
# Item(title, picture_name, download_url, save_directory)
```
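These selectors can be checked interactively before writing the spider, for example with scrapy shell (the exact output depends on the live site):

```
scrapy shell "https://www.ammmi.com/category/dongmantupian"
>>> response.xpath('//div[@class="mi_cont "]//ul/li/h3/a/@href').extract()[:3]
>>> response.xpath('//h2[@class="titletips"]/text()').extract_first()
```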
items.py:

```python
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class AmmmiItem(scrapy.Item):
    # Image title (the category name)
    title = scrapy.Field()
    # Image file name
    picture_name = scrapy.Field()
    # Image download URL
    image_url = scrapy.Field()
    # Path the image is saved to
    image_path = scrapy.Field()
```
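Scrapy items behave like dicts, so filling one is straightforward; a minimal sketch with placeholder values (not real data from the site):

```python
from ammmi.items import AmmmiItem

item = AmmmiItem()
item['title'] = 'some-category'                          # placeholder title
item['picture_name'] = 'some-image'                      # placeholder name
item['image_url'] = 'https://www.ammmi.com/example.jpg'  # placeholder URL
print(dict(item))  # items convert cleanly to plain dicts
```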
The spider:

```python
import scrapy

from ammmi.items import AmmmiItem


class AmmmiSpider(scrapy.Spider):
    # Spider name (used by `scrapy crawl`)
    name = 'ammmi'
    # Domains the spider is allowed to crawl
    allowed_domains = ['www.ammmi.com']
    count = 0
    page_num = 1
    web_name = 'ammmi'
    base_url = 'https://www.ammmi.com/category/dongmantupian'
    start_urls = ['https://www.ammmi.com/category/dongmantupian']

    def parse(self, response):
        # Image URLs on a detail page
        img_list = response.xpath('//a[@class="NavLinks"]/@href').extract()
        if img_list:
            for img in img_list:
                item = AmmmiItem()
                item['title'] = response.xpath('//h2[@class="titletips"]/text()').extract_first()
                item['picture_name'] = img[58:-4]
                item['image_url'] = img
                self.count += 1
                # if self.count >= 50:
                #     return
                yield item

        # Links to the detail pages of one listing page
        page_list = response.xpath('//div[@class="mi_cont "]//ul/li/h3/a/@href').extract()
        if len(page_list) == 0:
            return
        for page in page_list:
            # Follow each detail page of this category
            yield scrapy.Request(page, callback=self.parse)

        # Cap the number of listing pages so the crawl terminates
        if self.page_num >= 30:
            return
        self.page_num += 1
        # Next listing page
        yield scrapy.Request(self.base_url + '/page/' + str(self.page_num), callback=self.parse)
```
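With the spider in place, it is started from the project root in the usual Scrapy way:

```
scrapy crawl ammmi
```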
pipelines.py:

```python
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import os

import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.project import get_project_settings


class AmmmiPipeline(object):
    def process_item(self, item, spider):
        # An earlier manual-download attempt with requests, kept for reference:
        # r = requests.get(item['download_url'])
        # path = '../resource/'
        # if 'resource' not in os.listdir('../'):
        #     os.mkdir(path)
        # dirs = os.listdir(path)
        # if item['title'] not in dirs:
        #     os.mkdir('../resource/' + item['title'])
        # with open(item['save_directory'], 'wb') as f:
        #     f.write(r.content)
        return item


class AmmmiImagesPipeline(ImagesPipeline):
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        yield scrapy.Request(item['image_url'])

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        # if not image_paths:
        #     raise DropItem("Item contains no images")
        # if item['title'] not in os.listdir(self.IMAGES_STORE):
        #     os.mkdir(self.IMAGES_STORE + '/' + item['title'])
        # Rename the downloaded file
        # os.rename(self.IMAGES_STORE + "/" + image_paths[0],
        #           self.IMAGES_STORE + "/" + item['title'] + '/' + item['picture_name'] + '.jpg')
        # Record the new image path on the item
        # item['image_path'] = self.IMAGES_STORE + "/" + item['title'] + '/' + item['picture_name']
        # item['image_paths'] = image_paths
        return item
```
Note: uncommenting the commented-out part of item_completed saves the images into per-title directories, but unfortunately there is a bug here: only one image ends up being saved. (If anyone figures out the cause, please let me know; many thanks!)
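One way to sidestep the rename altogether, a sketch I have not verified against this site: override file_path so the pipeline writes each image to its per-title location in the first place, passing the needed fields along via request.meta (the meta keys here are my own naming):

```python
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class AmmmiImagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        # Carry the fields that file_path() needs along with each request
        yield scrapy.Request(item['image_url'],
                             meta={'title': item['title'],
                                   'picture_name': item['picture_name']})

    def file_path(self, request, response=None, info=None):
        # Save as <IMAGES_STORE>/<title>/<picture_name>.jpg directly,
        # so item_completed no longer has to move files around
        return '%s/%s.jpg' % (request.meta['title'], request.meta['picture_name'])
```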
settings.py:

```python
BOT_NAME = 'ammmi'

SPIDER_MODULES = ['ammmi.spiders']
NEWSPIDER_MODULE = 'ammmi.spiders'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'ammmi.pipelines.AmmmiPipeline': 300,
    'ammmi.pipelines.AmmmiImagesPipeline': 300,
}

IMAGES_STORE = "../Images"
```
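A relative IMAGES_STORE is resolved against whatever directory scrapy is launched from; if that causes surprises, an absolute path anchored at the settings module is one option (a sketch, assuming settings.py sits inside the ammmi package):

```python
import os

# Resolve ../Images relative to settings.py instead of the working directory
IMAGES_STORE = os.path.abspath(
    os.path.join(os.path.dirname(__file__), '..', 'Images'))
```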
If you are interested, feel free to clone the project from GitHub and run it yourself!
My abilities are limited, so please bear with any shortcomings.
Thanks!