Mobile App Packet-Capture Crawler
items.py

import scrapy

class DouyuspiderItem(scrapy.Item):
    name = scrapy.Field()        # name used for the saved photo
    imagesUrls = scrapy.Field()  # URL of the photo
    imagesPath = scrapy.Field()  # local path the photo is saved to
spiders/douyu.py

import scrapy
import json
from douyuSpider.items import DouyuspiderItem

class DouyuSpider(scrapy.Spider):
    name = "douyu"
    allowed_domains = ["capi.douyucdn.cn"]

    offset = 0
    url = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
    start_urls = [url + str(offset)]

    def parse(self, response):
        # Parse the JSON response and take the "data" list
        data = json.loads(response.text)["data"]
        for each in data:
            item = DouyuspiderItem()
            item["name"] = each["nickname"]
            item["imagesUrls"] = each["vertical_src"]
            yield item

        # Request the next page of the API
        self.offset += 20
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
settings.py

ITEM_PIPELINES = {'douyuSpider.pipelines.ImagesPipeline': 1}

# Where the images are stored; referenced later in pipelines.py
IMAGES_STORE = "/Users/Power/lesson_python/douyuSpider/Images"

# User-Agent of the Douyu mobile app
USER_AGENT = 'DYZB/2.290 (iPhone; iOS 9.3.4; Scale/2.00)'
pipelines.py

import os
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.project import get_project_settings

class ImagesPipeline(ImagesPipeline):
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        image_url = item["imagesUrls"]
        yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        # Standard pattern: collect the paths of the successfully downloaded
        # images (see the ImagesPipeline source code for the details)
        image_path = [x["path"] for ok, x in results if ok]
        # Rename the downloaded file after the streamer and record its path
        os.rename(self.IMAGES_STORE + "/" + image_path[0],
                  self.IMAGES_STORE + "/" + item["name"] + ".jpg")
        item["imagesPath"] = self.IMAGES_STORE + "/" + item["name"]
        return item

get_media_requests generates a Request object for every image URL. Its output becomes the results argument of item_completed: a list of (success, image_info_or_failure) tuples. When success is True, image_info_or_failure is a dict with three keys: url, path and checksum.
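As a minimal illustration of that results structure (the url, path and checksum values below are made-up placeholders, not real downloads), the same list comprehension keeps only the paths of the successful items:

# Hypothetical `results` value passed to item_completed(); the url/path/checksum
# strings are placeholders for illustration only.
results = [
    (True, {"url": "http://example.com/a.jpg",
            "path": "full/0a1b2c.jpg",
            "checksum": "d41d8cd98f00b204e9800998ecf8427e"}),
    (False, Exception("download failed")),   # a failed download
]

# Same list comprehension as in item_completed(): keep only successful paths
image_path = [x["path"] for ok, x in results if ok]
print image_path   # ['full/0a1b2c.jpg']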
main.py

from scrapy import cmdline
cmdline.execute('scrapy crawl douyu'.split())
Target site: http://wz.sun0769.com/index.php/question/questionType?type=4
Crawl each complaint post's number, URL, title, and body content.
items.py

import scrapy

class DongguanItem(scrapy.Item):
    # Title of each post
    title = scrapy.Field()
    # Number of each post
    number = scrapy.Field()
    # Text content of each post
    content = scrapy.Field()
    # URL of each post
    url = scrapy.Field()
spiders/sun.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from dongguan.items import DongguanItem
import time

class SunSpider(CrawlSpider):
    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']

    # Matching rule for every listing page
    pagelink = LinkExtractor(allow=('type=4'))
    # Matching rule for every post
    contentlink = LinkExtractor(allow=r'/html/question/\d+/\d+.shtml')

    rules = [
        # Special case for this site: every page link has to be rewritten by
        # deal_links before it can be followed
        Rule(pagelink, process_links="deal_links", follow=True),
        Rule(contentlink, callback='parse_item')
    ]

    # Rewrite every link on the page: turn 'Type&type=4?page=xxx' into
    # 'Type?type=4&page=xxx' (equivalently, 'Type&page=xxx?type=4' into
    # 'Type?page=xxx&type=4'); otherwise the request cannot be sent.
    def deal_links(self, links):
        for link in links:
            link.url = link.url.replace("?", "&").replace("Type&", "Type?")
            print link.url
        return links

    def parse_item(self, response):
        print response.url
        item = DongguanItem()
        # Title
        item['title'] = response.xpath('//div[contains(@class, "pagecenter p3")]//strong/text()').extract()[0]
        # Number: the last field of the title
        item['number'] = item['title'].split(' ')[-1].split(":")[-1]
        # Content: first try the layout used when the post contains images
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        # If that is empty, fall back to the layout used for text-only posts
        if len(content) == 0:
            content = response.xpath('//div[@class="c1 text14_2"]/text()').extract()
        # content is a list; join it into one string and strip surrounding whitespace
        item['content'] = "".join(content).strip()
        # URL
        item['url'] = response.url
        yield item
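To see what deal_links actually does to a link, here is a quick standalone check of the same two replace calls; the input URL is only an assumed example of the malformed links the listing pages produce:

# Assumed example of a malformed link as emitted by the listing pages
bad = "http://wz.sun0769.com/index.php/question/questionType&type=4?page=30"

# Same chained replaces as in SunSpider.deal_links
fixed = bad.replace("?", "&").replace("Type&", "Type?")
print fixed   # http://wz.sun0769.com/index.php/question/questionType?type=4&page=30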
pipelines.py

# -*- coding: utf-8 -*-
# codecs lets us open the file with an explicit encoding
import codecs
import json

class JsonWriterPipeline(object):

    def __init__(self):
        # Open a write-only file with UTF-8 encoding
        self.filename = codecs.open('sunwz.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        content = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.filename.write(content)
        return item

    def close_spider(self, spider):
        self.filename.close()
settings.py

ITEM_PIPELINES = {
    'dongguan.pipelines.JsonWriterPipeline': 300,
}

# Log file name and log level
LOG_FILE = "dg.log"
LOG_LEVEL = "DEBUG"
main.py

from scrapy import cmdline
cmdline.execute('scrapy crawl sun'.split())
Run the program: py2 main.py
Crawl the Sina news navigation page: every top-level category, every sub-category, the child links under each sub-category, and the news content on the pages those child links point to.
Demo screenshot: (image not included here)
items.py

import scrapy
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

class SinaItem(scrapy.Item):
    # Title and URL of the top-level category
    parentTitle = scrapy.Field()
    parentUrls = scrapy.Field()

    # Title and URL of the sub-category
    subTitle = scrapy.Field()
    subUrls = scrapy.Field()

    # Directory where the sub-category is stored
    subFilename = scrapy.Field()

    # Child links under the sub-category
    sonUrls = scrapy.Field()

    # Article headline and body
    head = scrapy.Field()
    content = scrapy.Field()
spiders/sina.py

# -*- coding: utf-8 -*-
from Sina.items import SinaItem
import scrapy
import os
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

class SinaSpider(scrapy.Spider):
    name = "sina"
    allowed_domains = ["sina.com.cn"]
    start_urls = ["http://news.sina.com.cn/guide/"]

    def parse(self, response):
        items = []
        # URLs and titles of all top-level categories
        parentUrls = response.xpath('//div[@id="tab01"]/div/h3/a/@href').extract()
        parentTitle = response.xpath('//div[@id="tab01"]/div/h3/a/text()').extract()

        # URLs and titles of all sub-categories
        subUrls = response.xpath('//div[@id="tab01"]/div/ul/li/a/@href').extract()
        subTitle = response.xpath('//div[@id="tab01"]/div/ul/li/a/text()').extract()

        # Loop over all top-level categories
        for i in range(0, len(parentTitle)):
            # Directory path and name for this top-level category
            parentFilename = "./Data/" + parentTitle[i]
            # Create the directory if it does not exist yet
            if not os.path.exists(parentFilename):
                os.makedirs(parentFilename)

            # Loop over all sub-categories
            for j in range(0, len(subUrls)):
                item = SinaItem()

                # Store the top-level category's title and URL
                item['parentTitle'] = parentTitle[i]
                item['parentUrls'] = parentUrls[i]

                # Check whether the sub-category URL starts with the parent URL,
                # e.g. sports.sina.com.cn and sports.sina.com.cn/nba
                if_belong = subUrls[j].startswith(item['parentUrls'])

                # If it belongs to this top-level category, store it under the parent directory
                if if_belong:
                    subFilename = parentFilename + '/' + subTitle[j]
                    # Create the directory if it does not exist yet
                    if not os.path.exists(subFilename):
                        os.makedirs(subFilename)
                    # Store the sub-category's URL, title and directory
                    item['subUrls'] = subUrls[j]
                    item['subTitle'] = subTitle[j]
                    item['subFilename'] = subFilename
                    items.append(item)

        # Send a request for every sub-category URL; the response plus the meta
        # data is handed to the second_parse callback
        for item in items:
            yield scrapy.Request(url=item['subUrls'], meta={'meta_1': item}, callback=self.second_parse)

    # Recursive requests for the sub-category pages
    def second_parse(self, response):
        # Pull the meta data out of the response
        meta_1 = response.meta['meta_1']

        # All child links on the sub-category page
        sonUrls = response.xpath('//a/@href').extract()

        items = []
        for i in range(0, len(sonUrls)):
            # Keep only links that end with .shtml and start with the parent URL
            if_belong = sonUrls[i].endswith('.shtml') and sonUrls[i].startswith(meta_1['parentUrls'])

            # If it belongs to this category, collect all fields into one item for passing on
            if if_belong:
                item = SinaItem()
                item['parentTitle'] = meta_1['parentTitle']
                item['parentUrls'] = meta_1['parentUrls']
                item['subUrls'] = meta_1['subUrls']
                item['subTitle'] = meta_1['subTitle']
                item['subFilename'] = meta_1['subFilename']
                item['sonUrls'] = sonUrls[i]
                items.append(item)

        # Send a request for every child link; the response plus the meta data
        # is handed to the detail_parse callback
        for item in items:
            yield scrapy.Request(url=item['sonUrls'], meta={'meta_2': item}, callback=self.detail_parse)

    # Parse the article page: headline and body
    def detail_parse(self, response):
        item = response.meta['meta_2']
        content = ""
        head = response.xpath('//h1[@id="main_title"]/text()').extract()
        content_list = response.xpath('//div[@id="artibody"]/p/text()').extract()

        # Join the text of all <p> tags into one string
        for content_one in content_list:
            content += content_one

        item['head'] = head[0] if head else ""
        item['content'] = content

        yield item
pipelines.py

from scrapy import signals
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

class SinaPipeline(object):
    def process_item(self, item, spider):
        sonUrls = item['sonUrls']

        # File name: the middle part of the child URL with 'http://' and '.shtml'
        # stripped off and '/' replaced by '_', saved as a .txt file
        # (see the short check after the settings block below)
        filename = sonUrls[7:-6].replace('/', '_')
        filename += ".txt"

        fp = open(item['subFilename'] + '/' + filename, 'w')
        fp.write(item['content'])
        fp.close()

        return item

settings.py

BOT_NAME = 'Sina'

SPIDER_MODULES = ['Sina.spiders']
NEWSPIDER_MODULE = 'Sina.spiders'

ITEM_PIPELINES = {
    'Sina.pipelines.SinaPipeline': 300,
}

LOG_LEVEL = 'DEBUG'
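As a quick sanity check of the sonUrls[7:-6] filename rule in SinaPipeline above (the article URL below is only an assumed example of the sina.com.cn URL shape), the slice drops the 7-character 'http://' prefix and the 6-character '.shtml' suffix:

# Assumed example of a child article URL
sonUrl = "http://news.sina.com.cn/c/nd/2017-01-01/doc-ifxz.shtml"

# Same transformation as in SinaPipeline.process_item
filename = sonUrl[7:-6].replace('/', '_') + ".txt"
print filename   # news.sina.com.cn_c_nd_2017-01-01_doc-ifxz.txt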
main.py

from scrapy import cmdline
cmdline.execute('scrapy crawl sina'.split())
Run the program: py2 main.py
items.py

import scrapy

class CoserItem(scrapy.Item):
    url = scrapy.Field()
    name = scrapy.Field()
    info = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
spiders/coser.py

# -*- coding: utf-8 -*-
from scrapy.selector import Selector
import scrapy
from scrapy.contrib.loader import ItemLoader
from Cosplay.items import CoserItem

class CoserSpider(scrapy.Spider):
    name = "coser"
    allowed_domains = ["bcy.net"]
    start_urls = (
        'http://bcy.net/cn125101',
        'http://bcy.net/cn126487',
        'http://bcy.net/cn126173',
    )

    def parse(self, response):
        sel = Selector(response)

        # Follow the link of every work listed on the author's page
        for link in sel.xpath("//ul[@class='js-articles l-works']/li[@class='l-work--big']/article[@class='work work--second-created']/h2[@class='work__title']/a/@href").extract():
            link = 'http://bcy.net%s' % link
            request = scrapy.Request(link, callback=self.parse_item)
            yield request

    def parse_item(self, response):
        # Use an ItemLoader to collect the fields
        l = ItemLoader(item=CoserItem(), response=response)
        l.add_xpath('name', "//h1[@class='js-post-title']/text()")
        l.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")

        # Strip the '/w650' thumbnail suffix to get the full-size image URLs
        urls = l.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
        urls = [url.replace('/w650', '') for url in urls]
        l.add_value('image_urls', urls)
        l.add_value('url', response.url)

        return l.load_item()
pipelines.py

import requests
from Cosplay import settings
import os

class ImageDownloadPipeline(object):
    def process_item(self, item, spider):
        if 'image_urls' in item:
            images = []

            # One sub-directory per spider under IMAGES_STORE
            dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)

            for image_url in item['image_urls']:
                # Build a file name from the URL path segments
                us = image_url.split('/')[3:]
                image_file_name = '_'.join(us)
                file_path = '%s/%s' % (dir_path, image_file_name)
                images.append(file_path)

                # Skip files that have already been downloaded
                if os.path.exists(file_path):
                    continue

                # Stream the image to disk in 1 KB blocks
                with open(file_path, 'wb') as handle:
                    response = requests.get(image_url, stream=True)
                    for block in response.iter_content(1024):
                        if not block:
                            break
                        handle.write(block)

            item['images'] = images
        return item
settings.py

ITEM_PIPELINES = {'Cosplay.pipelines.ImageDownloadPipeline': 1}

IMAGES_STORE = '../Images'

DOWNLOAD_DELAY = 0.25    # 250 ms of delay between requests
main.py

from scrapy import cmdline
cmdline.execute('scrapy crawl coser'.split())
Run the program: py2 main.py
Crawl the movie data of the Douban Movie Top 250 (movie.douban.com/top250) and store it in MongoDB.
items.py

import scrapy

class DoubanspiderItem(scrapy.Item):
    # Movie title
    title = scrapy.Field()
    # Movie rating
    score = scrapy.Field()
    # Movie details
    content = scrapy.Field()
    # Short quote / description
    info = scrapy.Field()
spiders/douban.py

import scrapy
from doubanSpider.items import DoubanspiderItem

class DoubanSpider(scrapy.Spider):
    name = "douban"
    allowed_domains = ["movie.douban.com"]

    start = 0
    url = 'https://movie.douban.com/top250?start='
    end = '&filter='
    start_urls = [url + str(start) + end]

    def parse(self, response):
        movies = response.xpath("//div[@class='info']")
        for each in movies:
            item = DoubanspiderItem()

            title = each.xpath('div[@class="hd"]/a/span[@class="title"]/text()').extract()
            content = each.xpath('div[@class="bd"]/p/text()').extract()
            score = each.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()').extract()
            info = each.xpath('div[@class="bd"]/p[@class="quote"]/span/text()').extract()

            item['title'] = title[0]
            # Join all elements of the content list into one string, separated by ';'
            item['content'] = ';'.join(content)
            item['score'] = score[0]
            item['info'] = info[0]
            # Hand the item over to the pipeline
            yield item

        # Request the next page (25 movies per page, up to start=250)
        if self.start <= 225:
            self.start += 25
            yield scrapy.Request(self.url + str(self.start) + self.end, callback=self.parse)
pipelines.py

from scrapy.conf import settings
import pymongo

class DoubanspiderPipeline(object):
    def __init__(self):
        # Read the host name, port and database name from settings
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbname = settings['MONGODB_DBNAME']

        # pymongo.MongoClient(host, port) creates the MongoDB connection
        client = pymongo.MongoClient(host=host, port=port)

        # Select the database
        mdb = client[dbname]

        # Select the collection the data will be stored in
        self.post = mdb[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        data = dict(item)
        # Insert the data into the collection
        self.post.insert(data)
        return item

settings.py

BOT_NAME = 'doubanSpider'

SPIDER_MODULES = ['doubanSpider.spiders']
NEWSPIDER_MODULE = 'doubanSpider.spiders'

ITEM_PIPELINES = {
    'doubanSpider.pipelines.DoubanspiderPipeline': 300
}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'

# MongoDB host (loopback address 127.0.0.1)
MONGODB_HOST = '127.0.0.1'
# Port, default 27017
MONGODB_PORT = 27017
# Database name
MONGODB_DBNAME = 'DouBan'
# Collection the data will be stored in
MONGODB_DOCNAME = 'DouBanMovies'
Running MongoDB involves two commands: mongod is the MongoDB database server process itself, and mongo is the command-line shell client.

sudo mongod    # start the database service first, then run the Scrapy crawler
sudo mongo     # start the database shell

Useful commands inside the mongo shell:

# Show the current database
> db

# List all databases
> show dbs

# Switch to the DouBan database
> use DouBan

# List all collections
> show collections

# Show the documents in the collection
> db.DouBanMovies.find()
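The same data can also be checked from Python instead of the mongo shell. A minimal pymongo sketch, assuming the MONGODB_* settings shown above:

import pymongo

# Connection parameters taken from the settings.py shown above
client = pymongo.MongoClient(host='127.0.0.1', port=27017)
collection = client['DouBan']['DouBanMovies']

# Print a few of the stored movies to confirm the pipeline inserted them
for doc in collection.find().limit(3):
    print doc.get('title'), doc.get('score')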
When simulating a login, the COOKIES_ENABLED setting (the Cookies middleware) in settings.py must be enabled: either set COOKIES_ENABLED = True, or leave the line commented out (# COOKIES_ENABLED = False), since the middleware is on by default.
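A minimal settings.py fragment for this, with the optional COOKIES_DEBUG switch (a standard Scrapy setting, not part of the original projects above) turned on so the Cookie headers being sent and received show up in the log:

# settings.py (sketch): keep the Cookies middleware enabled for login spiders
COOKIES_ENABLED = True    # or simply leave "# COOKIES_ENABLED = False" commented out

# Optional: log every Cookie sent with requests and Set-Cookie received in
# responses, which makes it easy to confirm the session is being kept
COOKIES_DEBUG = True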
Any request that needs to carry POST data can be sent this way, with scrapy.FormRequest. In the example below, the POSTed data is an account name and password:
# -*- coding: utf-8 -*-
import scrapy

class Renren1Spider(scrapy.Spider):
    name = "renren1"
    allowed_domains = ["renren.com"]

    def start_requests(self):
        url = 'http://www.renren.com/PLogin.do'
        # FormRequest is Scrapy's way of sending a POST request
        yield scrapy.FormRequest(
            url=url,
            formdata={"email": "mr_mao_hacker@163.com", "password": "axxxxxxxe"},
            callback=self.parse_page)

    def parse_page(self, response):
        with open("mao2.html", "w") as filename:
            filename.write(response.body)
The orthodox way to simulate a login:
First send a GET request for the login page and extract the parameters the login requires (for example, the _xsrf token on Zhihu's login page).
Then POST those parameters to the server together with the account name and password to complete the login.
# -*- coding: utf-8 -*-
import scrapy

class Renren2Spider(scrapy.Spider):
    name = "renren2"
    allowed_domains = ["renren.com"]
    start_urls = (
        "http://www.renren.com/PLogin.do",
    )

    # Handle the response of the login URL in start_urls and extract the
    # parameters the login needs (if any)
    def parse(self, response):
        # Extract the parameters needed for the login
        # _xsrf = response.xpath("//_xsrf").extract()[0]

        # Send the form data and hand the result to the given callback
        yield scrapy.FormRequest.from_response(
            response,
            formdata={"email": "mr_mao_hacker@163.com", "password": "axxxxxxxe"},  # , "_xsrf": _xsrf
            callback=self.parse_page
        )

    # Logged in; now visit a page that requires login
    def parse_page(self, response):
        url = "http://www.renren.com/422167102/profile"
        yield scrapy.Request(url, callback=self.parse_newpage)

    # Handle the response content
    def parse_newpage(self, response):
        with open("xiao.html", "w") as filename:
            filename.write(response.body)
If nothing else works, you can simulate the login this way, by sending the cookies of a session that is already logged in. It is a bit more tedious, but the success rate is essentially 100%:
# -*- coding: utf-8 -*-
import scrapy

class RenrenSpider(scrapy.Spider):
    name = "renren"
    allowed_domains = ["renren.com"]
    start_urls = (
        'http://www.renren.com/111111',
        'http://www.renren.com/222222',
        'http://www.renren.com/333333',
    )

    # Cookies copied from a browser session that is already logged in
    cookies = {
        "anonymid": "ixrna3fysufnwv",
        "_r01_": "1",
        "ap": "327550029",
        "JSESSIONID": "abciwg61A_RvtaRS3GjOv",
        "depovince": "GW",
        "springskin": "set",
        "jebe_key": "f6fb270b-d06d-42e6-8b53-e67c3156aa7e%7Cc13c37f53bca9e1e7132d4b58ce00fa3%7C1484060607478%7C1%7C1486198628950",
        "t": "691808127750a83d33704a565d8340ae9",
        "societyguester": "691808127750a83d33704a565d8340ae9",
        "id": "327550029",
        "xnsid": "f42b25cf",
        "loginfrom": "syshome"
    }

    # Override the Spider class's start_requests method so every request
    # carries the cookie values
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.FormRequest(url, cookies=self.cookies, callback=self.parse_page)

    # Handle the response content
    def parse_page(self, response):
        print "===========" + response.url
        with open("deng.html", "w") as filename:
            filename.write(response.body)
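The cookies dict above is usually assembled by hand from the browser's developer tools or a packet capture. A small helper sketch for turning a raw Cookie header string into that dict shape (cookie_header below is an assumed example built from the same names as the dict above, not real session data):

# Assumed raw Cookie header as copied from the browser or a Fiddler capture
cookie_header = "anonymid=ixrna3fysufnwv; _r01_=1; id=327550029"

# Split "name=value; name=value" pairs into the dict format used by the spider
cookies = dict(
    pair.strip().split("=", 1)
    for pair in cookie_header.split(";")
    if "=" in pair
)
print cookies   # {'anonymid': 'ixrna3fysufnwv', '_r01_': '1', 'id': '327550029'} (key order may vary)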
With the Fiddler packet-capture tool you can capture the phone's network traffic, provided the phone and the computer are on the same local network (Wi-Fi or hotspot). Then configure the following:
Configure Fiddler:
1. Open Fiddler's settings.
2. Under Connections, enable "Allow remote computers to connect", confirm, and restart Fiddler.
Configure the Android device:
1. At the command prompt, run ipconfig to find the computer's IP address.
2. On the Android device, open Settings -> WLAN, find the network you want to connect to, long-press it, choose "Modify network", and in the dialog that pops up tick "Show advanced options".
3. Under "Proxy", choose "Manual"; enter the computer's IP address as the proxy host name and 8888 as the proxy port, then tap "Save".
4. Open a browser on the Android device and visit a web page; the full requests and responses now show up in Fiddler.
The overall process for an iPhone is much the same; only the phone-side settings differ:
iPhone: tap Settings > WLAN > your wireless network > HTTP Proxy > Manual:
Proxy address (the computer's IP): 192.168.xx.xxx
Port: 8888
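Once an app's API endpoint has been captured this way, it can be replayed from the desktop to confirm it works outside the app. A minimal sketch using the Douyu endpoint and mobile User-Agent already shown earlier in this article (requests is the same library used in the Cosplay image pipeline):

import json
import requests

# API endpoint and mobile User-Agent captured from the Douyu app
# (see the Douyu spider and its settings.py at the top of this article)
url = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset=0"
headers = {"User-Agent": "DYZB/2.290 (iPhone; iOS 9.3.4; Scale/2.00)"}

response = requests.get(url, headers=headers)
data = json.loads(response.text)["data"]

# Print a couple of the fields the spider stores, to confirm the capture is usable
for room in data[:3]:
    print room["nickname"], room["vertical_src"]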