注意:模拟登录时,必须保证 settings.py 里的 COOKIES_ENABLED(Cookies 中间件)处于开启状态,即设置
COOKIES_ENABLED = True
或保持 # COOKIES_ENABLED = False 这一行为注释状态
只要是需要提供 POST 数据的,就可以用这种方法。下面示例里 POST 的数据是账户密码:
可以重写 start_requests(self) 方法,并用
yield scrapy.FormRequest(url, formdata, callback)
发送 POST 请求;这样 Scrapy 就不再请求 start_urls
class mySpider(scrapy.Spider):
    """Log in to renren.com by POSTing the credentials straight to the
    form's action URL; start_requests() replaces the usual start_urls
    bootstrap, so start_urls is not requested at all."""
    # start_urls = ["http://www.example.com/"]

    def start_requests(self):
        # Form action URL extracted from the login page's <form> markup.
        login_url = 'http://www.renren.com/PLogin.do'
        # FormRequest is Scrapy's way of sending a POST request.
        yield scrapy.FormRequest(
            url=login_url,
            formdata={"email": "mr_mao_hacker@163.com", "password": "axxxxxxxe"},
            callback=self.parse_page,
        )

    def parse_page(self, response):
        # do something — business logic goes here.
        pass
正统模拟登录方法:
- 首先发送登录页面的 GET 请求,获取页面里登录必需的参数(比如知乎登录界面的 _xsrf)
- 然后和账户密码一起 POST 到服务器,登录成功
- 使用 FormRequest.from_response() 方法模拟用户登录
import scrapy


class LoginSpider(scrapy.Spider):
    """Log in via FormRequest.from_response(), which pre-fills the hidden
    form fields (e.g. CSRF tokens) scraped from the login page before
    POSTing the credentials."""
    name = 'example.com'
    start_urls = ['http://www.example.com/users/login.php']

    def parse(self, response):
        # The login page response carries the <form>; from_response() merges
        # our credentials with the form's pre-filled hidden inputs.
        return scrapy.FormRequest.from_response(
            response,
            formdata={'username': 'john', 'password': 'secret'},
            callback=self.after_login
        )

    def after_login(self, response):
        # Check login succeeded before going on.
        # BUG FIX: response.body is bytes in Python 3 — membership test with
        # a str literal raises TypeError, so compare against a bytes literal.
        if b"authentication failed" in response.body:
            # BUG FIX: the old scrapy.log module (and the `log` name used
            # here) no longer exists; use the spider's built-in logger.
            self.logger.error("Login failed")
            return
        # continue scraping with authenticated session...
模拟浏览器登录
start_requests() 方法可以返回请求作为爬虫的起始请求,其作用相当于 start_urls:start_requests() 返回的请求会替代 start_urls 里的请求
Request() 发送 GET 请求,可以设置 url、cookie、回调函数
FormRequest.from_response() 进行表单 POST 提交,第一个必需参数是上一次响应(携带 cookie)的 response 对象,其余参数有 cookie、url、表单内容等
import scrapy

# Orthodox login flow:
#   1. GET the login page to pick up the parameters the login requires
#      (e.g. zhihu's _xsrf token).
#   2. POST them to the server together with the account credentials.


# Second standard approach (fragment: `self` implies these methods are
# meant to live on a Spider subclass).
def parse(self, response):
    # Dump the login page source before submitting the form.
    print(response.body.decode('utf-8'), "@@" * 40)
    login_form = {
        "email": "18588403840",
        "origURL": "http://www.renren.com/422167102/profile",
        "domain": "renren.com",
        "key_id": "1",
        "captcha_type": "web_login",
        "password": "97bfc03b0eec4df7c76eaec10cd08ea57b01eefd0c0ffd4c0e5061ebd66460d9",
        "rkey": "26615a8e93fee56fc1fb3d679afa3cc4",
        "f": ""
    }
    yield scrapy.FormRequest.from_response(response,
                                           formdata=login_form,
                                           dont_filter=True,
                                           headers=self.headers,
                                           callback=self.get_page)


def get_page(self, response):
    # Landed after the login POST; dump it and fetch a profile page.
    print("===================", response.url)
    print(response.body.decode('utf-8'))
    profile_url = "http://www.renren.com/353111356/profile"
    yield scrapy.Request(profile_url, callback=self.get_info)


def get_info(self, response):
    # Print the authenticated profile page.
    print('*******' * 30)
    print(response.body.decode('utf-8'))
yield Request() 可以将一个新的请求返回给爬虫执行
在发送请求时对 cookie 的操作:meta={'cookiejar':1} 表示开启 cookie 记录,首次请求时写在 Request() 里;meta={'cookiejar':response.meta['cookiejar']} 表示使用上一次 response 的 cookie,写在 FormRequest.from_response() 里完成 POST 授权;meta={'cookiejar':True} 表示使用授权后的 cookie 访问需要登录才能查看的页面
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
# BUG FIX: HtmlResponse and Request were used in _requests_to_follow
# without ever being imported, which raised NameError at runtime.
from scrapy.http import HtmlResponse, Request


class MyrenSpider(CrawlSpider):
    """Renren crawler that logs in first and threads one cookiejar through
    every subsequent request, including CrawlSpider rule-followed ones."""
    name = 'myren'
    allowed_domains = ['renren.com']
    start_urls = ["http://www.renren.com/353111356/profile"]
    # Raw string so the \d escape is not mangled by the Python lexer.
    rules = [Rule(LinkExtractor(allow=(r'(\d+)/profile')), callback='get_info', follow=True)]
    headers = {
        "Accept": "*/*",
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
    }

    def start_requests(self):
        # meta={'cookiejar': 1} opens a fresh cookie jar for this session.
        yield scrapy.Request(url="http://www.renren.com/",
                             meta={'cookiejar': 1},
                             callback=self.post_login)

    # Second standard approach.
    def post_login(self, response):
        # Reuse the cookiejar opened in start_requests; that earlier
        # meta={'cookiejar': 1} must have been set for this to work.
        yield scrapy.FormRequest.from_response(response,
                                               url="http://www.renren.com/PLogin.do",
                                               meta={'cookiejar': response.meta['cookiejar']},
                                               headers=self.headers,
                                               formdata={
                                                   "email": "18588403840",
                                                   "password": "Changeme_123"
                                               },
                                               dont_filter=True,
                                               callback=self.after_login)

    def after_login(self, response):
        # Re-issue the start URLs now that the jar holds the auth cookies.
        for url in self.start_urls:
            # yield self.make_requests_from_url(url)
            yield scrapy.Request(url, meta={'cookiejar': response.meta['cookiejar']})

    def get_info(self, response):
        print('*******' * 30)
        print(response.body.decode('utf-8'))

    def _requests_to_follow(self, response):
        """Override: same as CrawlSpider's, but propagates the cookiejar
        into every rule-followed request."""
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = Request(url=link.url, callback=self._response_downloaded)
                # Rewritten line: carry the session's cookiejar along.
                r.meta.update(rule=n, link_text=link.text, cookiejar=response.meta['cookiejar'])
                yield rule.process_request(r)
如果实在没办法了,可以用这种方法模拟登录:虽然麻烦一点,但是成功率 100%
ChangeCookies 将cookie解析成字典形式
class transCookie:
    """Parse a cookie header string copied from the browser into a dict
    that Scrapy's `cookies=` request argument accepts."""

    def __init__(self, cookie):
        # Raw "name1=value1; name2=value2; ..." string copied from the browser.
        self.cookie = cookie

    def stringToDict(self):
        '''
        Convert the browser-copied cookie string into a dict Scrapy can use.
        :return: dict mapping cookie names to cookie values
        '''
        itemDict = {}
        for item in self.cookie.split(';'):
            item = item.strip()
            if not item:
                # Tolerate empty fragments (e.g. a trailing ';'), which made
                # the original split('=')[1] raise IndexError.
                continue
            # BUG FIX: split on the FIRST '=' only — cookie values may
            # themselves contain '=' (base64 padding, etc.); the original
            # split('=')[1] silently truncated such values.
            key, sep, value = item.partition('=')
            if sep:
                itemDict[key.strip()] = value
        return itemDict


if __name__ == "__main__":
    cookie = "你的cookie"
    trans = transCookie(cookie)
    print(trans.stringToDict())
将解析好的cookie格式放入请求
# -*- coding: utf-8 -*-
import scrapy


class RenrenSpider(scrapy.Spider):
    """Log in by attaching a cookie dict copied from a logged-in browser
    session to the initial requests (start_requests is overridden)."""
    name = "renren"
    allowed_domains = ["renren.com"]
    # Start URL list — every page requires an authenticated session.
    start_urls = [
        'http://www.renren.com/111111',
        'http://www.renren.com/222222',
        'http://www.renren.com/333333',
    ]
    # Cookie values copied out of a logged-in browser session.
    cookies = {
        "anonymid": "ixrna3fysufnwv",
        "_r01_": "1",
        "ap": "327550029",
        "JSESSIONID": "abciwg61A_RvtaRS3GjOv",
        "depovince": "GW",
        "springskin": "set",
        "jebe_key": "f6fb270b-d06d-42e6-8b53-e67c3156aa7e%7Cc13c37f53bca9e1e7132d4b58ce00fa3%7C1484060607478%7C1%7C1486198628950",
        "t": "691808127750a83d33704a565d8340ae9",
        "societyguester": "691808127750a83d33704a565d8340ae9",
        "id": "327550029",
        "xnsid": "f42b25cf",
        "loginfrom": "syshome"
    }

    # Override Spider.start_requests to send the requests with the cookies.
    def start_requests(self):
        # BUG FIX: the original referenced an undefined name `url` and built
        # only one request — issue one cookie-laden request per start URL.
        return [scrapy.FormRequest(url, cookies=self.cookies, callback=self.parse)
                for url in self.start_urls]

    # Handle the response content.
    def parse(self, response):
        # BUG FIX: Python 3 print() (the original used a Python 2 print
        # statement), and binary mode since response.body is bytes.
        print("===========" + response.url)
        with open("deng.html", "wb") as filename:
            filename.write(response.body)
1 spider.browser.page_source 获取响应的源代码
2 session.get(request.url).text 获取响应的源代码
3 requests采用session管理cookie
4 urllib 采用cookieJar管理cookie
模拟登陆淘宝
class TaobaoSpider(scrapy.Spider):
    """Taobao spider: a downloader middleware drives a real browser through
    the login page and hands the session state back via `self.browser` /
    `self.cookies`; this spider only prints what it receives."""
    name = 'mytaobao'
    allowed_domains = ['taobao.com']
    start_urls = [
        'https://login.m.taobao.com/login.htm',
        "http://h5.m.taobao.com/mlapp/olist.html?spm=a2141.7756461.2.6",
    ]

    def __init__(self):
        # Both attributes are populated later by the login middleware.
        self.cookies = None
        self.browser = None
        super(TaobaoSpider, self).__init__()  # delegate to the parent class

    def parse(self, response):
        # Print the URL and the page source of each response.
        print(response.url)
        print(response.body.decode("utf-8", "ignore"))
#中间件middleware 自定义LoginMiddleware登陆 from scrapy import signals from selenium import webdriver from scrapy.http import HtmlResponse # 网页响应 import requests import time class LoginMiddleware(object): ''' 找到password username输入框并send_keys 点击登陆并抓取cookie,spider.browser.get_cookies() 返回页面信息,HtmlResponse ''' def process_request(self, request, spider): if spider.name == "mytaobao": # 指定仅仅处理这个名称的爬虫 if request.url.find("login") != -1: # 判断是否登录页面 mobilesetting = {"deviceName": "iPhone 6 Plus"} options = webdriver.ChromeOptions() # 浏览器选项 options.add_experimental_option("mobileEmulation", mobilesetting) # 模拟手机 spider.browser = webdriver.Chrome(chrome_options=options) # 建立一个浏览器对象 spider.browser.set_window_size(400, 800) # 配置手机大小 spider.browser.get(request.url) # 爬虫访问连接 time.sleep(3) #必需要睡下由于考虑到输入:用户名密码 要时间 print("login访问", request.url) username = spider.browser.find_element_by_id("username") password = spider.browser.find_element_by_id("password") time.sleep(1) username.send_keys("2403239393@qq.com") # 帐户 time.sleep(2) password.send_keys("bama100") # 密码 time.sleep(2) spider.browser.find_element_by_id("btn-submit").click() time.sleep(4) spider.cookies = spider.browser.get_cookies() # 抓取所有的cookie # spider.browser.close() return HtmlResponse(url=spider.browser.current_url, # 当前链接 body=spider.browser.page_source, # 源代码 encoding="utf-8") # 返回页面信息 else:#登陆后则执行 ''' 1 采用requests.session保存cookie 2 设置cookie session.cookie.set(name,value) 3 清空headers session.headers.clear() 4 发起get请求 session.get(url) ''' print("request 访问") session = requests.session() # 会话 for cookie in spider.cookies: session.cookies.set(cookie['name'], cookie["value"]) session.headers.clear() # 清空头 newpage = session.get(request.url) print("---------------------") print(request.url) print("---------------------") print(newpage.text) print("---------------------") # 页面 time.sleep(3) return HtmlResponse(url=request.url, # 当前链接 body=newpage.text, # 源代码 encoding="utf-8") # 返回页面信息