之前介绍过通过requests的session会话模拟登录;必须使用session,因为涉及到验证码和xsrf的
写入cookie验证的问题;在scrapy中不需担忧此问题,因为Request会保证这是一个会话,并且自动传递cookies
原理相同;由于验证码识别的问题,这里先使用cookie模拟登录。
# -*- coding: utf-8 -*-

import scrapy
import json
import re


class ZhihuSpider(scrapy.Spider):
    """Zhihu spider that logs in by replaying browser cookies plus the _xsrf token.

    Flow: start_requests() fetches the sign-in page with cookies copied from a
    logged-in browser -> login() extracts the _xsrf token and POSTs the login
    form -> check_login() verifies the JSON response and, on success, yields
    requests for start_urls which Scrapy dispatches to parse().
    """

    name = "zhihu"
    allowed_domains = ["zhihu.com"]
    start_urls = ['http://www.zhihu.com/']

    # Request headers; Zhihu rejects requests without a browser-like User-Agent.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Host": "www.zhihu.com",
        "Referer": "https://www.zhihu.com/",
    }

    # Session cookies copied from an already-logged-in browser session.
    cookies = {
        "d_c0": "",
        "l_cap_id": "",
        "r_cap_id": "",
        "cap_id": "",
        "_zap": "",
        "__utmc": "",
        "__utmb": "",
        "__utmv": "",
        "__utma": "",
        "__utmz": "5",
        "q_c1": "",
    }

    def start_requests(self):
        """Issue the initial request for the sign-in page.

        Must carry the copied cookies. A plain list (not a generator) is
        returned because only this single page needs to be fetched, and
        Scrapy accepts any iterable of Requests. The response (which
        contains the _xsrf token) is handed to login().
        """
        return [scrapy.Request(
            url="https://www.zhihu.com/#signin",
            cookies=self.cookies,
            headers=self.headers,
            callback=self.login,
        )]

    def login(self, response):
        """Extract the _xsrf token from the sign-in page and submit the login form.

        FormRequest stays within the same session (like a requests.Session),
        so the cookies set so far are carried along automatically. The result
        is checked by check_login().
        """
        match_obj = re.match('.*name="_xsrf" value="(.*?)"', response.text, re.DOTALL)
        # BUG FIX: the original assigned `xsrf` only inside the `if` branch but
        # used it unconditionally afterwards, raising NameError when the page
        # layout changes and the token is not found. Bail out explicitly instead.
        if not match_obj:
            self.logger.error("Could not find _xsrf token on the sign-in page")
            return []
        xsrf = match_obj.group(1)

        url = "https://www.zhihu.com/login/phone_num"
        data = {
            "_xsrf": xsrf,
            'remember_me': 'true',
            "password": "",
            "phone_num": "",
        }

        # Carry the freshly obtained xsrf token in the cookie jar as well.
        self.cookies["_xsrf"] = xsrf
        return [scrapy.FormRequest(
            url=url,
            headers=self.headers,
            formdata=data,
            callback=self.check_login,
        )]

    def check_login(self, response):
        """Verify the login JSON response; on success, crawl start_urls.

        Successful requests fall through to the default parse() callback.
        """
        # BUG FIX: the original called json.load(response.text) — json.load
        # expects a file-like object and fails on a str; json.loads parses a
        # JSON string.
        text_json = json.loads(response.text)
        # "\u767b\u5f55\u6210\u529f" is the escaped form of the Chinese
        # phrase for "login successful" returned by the endpoint.
        if "msg" in text_json and text_json["msg"] == "\u767b\u5f55\u6210\u529f":
            for urls in self.start_urls:
                # dont_filter: start_urls may already be in the dupe filter.
                yield scrapy.Request(url=urls, dont_filter=True, headers=self.headers)

    def parse(self, response):
        """Default callback for the post-login crawl; not yet implemented."""
        pass