day 112天,爬虫(拉勾网,抖音,GitHub)次日

提前准备工作、安装准备工作(day3用)

 1. 安装scrapy
https://www.cnblogs.com/wupeiqi/articles/6229292.html

a. 下载twisted 
http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted

b. 安装wheel 
pip3 install wheel

c. 安装twisted 

pip3 install Twisted‑18.7.0‑cp36‑cp36m‑win_amd64.whl

d. 安装pywin32
pip3 install pywin32


e. 安装scrapy 
pip3 install scrapy 
 
 
https://www.cnblogs.com/wupeiqi/articles/6283017.html   武沛齐老师博客。

 

1、访问登陆界面

# ============================ Step 1: fetch the login page ============================
import requests

# Plain GET of the login page. The response body (r1.text) and the cookies the
# server sets (r1_cookie_dict) are both kept for the following login request.
r1 = requests.get(
    url="https://passport.lagou.com/login/login.html",
    headers={
        # Adjacent string literals are joined at compile time. The previous
        # backslash continuation inside a single literal also embedded the
        # next line's leading spaces into the header value.
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36"
        ),
    },
)
r1_cookie_dict = r1.cookies.get_dict()
print(r1.text)
print("r1-cookie:===>", r1_cookie_dict)

  打印结果:html

 

 

2、登陆界面,登陆成功

# ============================ Step 2: log in to Lagou ============================

import re

# The login page embeds the anti-forgery values as JavaScript assignments.
# Raw strings keep the patterns literal; the non-greedy group stops at the
# first closing quote instead of the last one on the line.
token_match = re.search(r"X_Anti_Forge_Token = '(.*?)';", r1.text)
code_match = re.search(r"X_Anti_Forge_Code = '(.*?)';", r1.text)
if token_match is None or code_match is None:
    # Fail with a readable message instead of an IndexError on an empty list.
    raise ValueError(
        "X_Anti_Forge_Token / X_Anti_Forge_Code not found in the login page"
    )
token = token_match.group(1)
code = code_match.group(1)
print(token)
print(code)

r2 = requests.post(
    url="https://passport.lagou.com/login/login.json",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Connection": "keep-alive",
        # No hard-coded Content-Length: requests derives it from the encoded
        # form body, so a fixed "111" can only be misleading.
        # The Origin value must not contain spaces.
        "Origin": "https://passport.lagou.com",
        "Referer": "https://passport.lagou.com/login/login.html",
        # NOTE(review): the "Anit" spelling is kept as-is (the later PUT
        # request spells it the same way); presumably this is what the
        # server expects - confirm before "correcting" it.
        "X-Anit-Forge-Code": code,
        "X-Anit-Forge-Token": token,
    },
    # Form fields of the login request; username is left blank in the notes.
    data={
        "isValidate": "true",
        "username": "",
        "password": "4d541689997b5ff6ac90a350b5dd6693",
        "request_form_verifyCode": "",
        "submit": "",
    },
    # Send back the cookies received with the login page.
    cookies=r1_cookie_dict,
)


print(r2.text)

打印结果:

 

3、登陆邀请(invitation)界面

 

import requests

# First attempt: request the invitation page directly, sending only the
# cookies collected from the very first request.
r3 = requests.get(
    url="https://www.lagou.com/mycenter/invitation.html",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
        "Host": "www.lagou.com",
        "Upgrade-Insecure-Requests": "1",
    },
    # These cookies are not sufficient; the printed page is not the expected one.
    # (The comment used to sit on the same line as the closing parenthesis and
    # swallowed it, which made the call a syntax error.)
    cookies=r1_cookie_dict,
)
print(r3.text)

打印结果:

登录失败。因此第三步的流程不对,应该尝试其他流程。

重新登录后会发现有如下请求:

 重定向到新的网站

  

 

 又重定向新的网址

又重定向新的网址

重定向后:

 

 

 

 第三步; grant 登陆

 

 

import requests

# Step 3: request the service-ticket grant page with the cookies from step 1.
r3 = requests.get(
    url="https://passport.lagou.com/grantServiceTicket/grant.html",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
        "Host": "passport.lagou.com",
        "Upgrade-Insecure-Requests": "1",
        "Connection": "keep-alive",
        "Referer": "https://passport.lagou.com/login/login.html?ts=1532004536388&serviceId=lagou&service=https%253A%252F%252Fwww.lagou.com%252F&action=login&signature=F241DF2A40C183BA91C33BA6604912F0",
    },
    cookies=r1_cookie_dict,
    # Turn automatic redirect following off so this response itself can be
    # inspected. (The comment used to share a line with the closing
    # parenthesis and swallowed it, which made the call a syntax error.)
    allow_redirects=False,
)
r3_cookie_dict = r3.cookies.get_dict()
print(r3.text)
print(r3.cookies.get_dict())

 

  打印结果:

 

 

 

第四步, 发送action 请求

# ============================ Step 4: send the "action" request ============================
import requests

# Build the shared cookie dict that the later steps read and extend.
# It was previously referenced under a different, never-defined name
# (all_cookies_dict) and "updated" with no argument, which raises NameError.
all_cookie_dict = {}
all_cookie_dict.update(r1_cookie_dict)
all_cookie_dict.update(r3_cookie_dict)

r4 = requests.get(
    # Take the target from the grant response instead of a pasted URL with a
    # fixed ticket value, the same way the next step reads r4's Location.
    # NOTE(review): presumably the grant response's Location header is the
    # "?action=grantST&ticket=..." address - confirm against a live capture.
    url=r3.headers['Location'],
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
        "Host": "www.lagou.com",
        "Referer": "https://passport.lagou.com/login/login.html?ts=1532005741245&serviceId=lagou&service=https%253A%252F%252Fwww.lagou.com%252F&action=login&signature=ED6DE46236FC2638697A5ECC080822F7",
    },
    cookies=all_cookie_dict,
    # Keep redirects manual so the Location header of this response stays readable.
    allow_redirects=False,
)
r4_cookie_dict = r4.cookies.get_dict()
# Carry this response's cookies forward, as the later steps do with theirs.
all_cookie_dict.update(r4_cookie_dict)
print("r4===>", r4.text)

  

 

第五次请求 

 

# ##################################### Step 5: obtain the authentication info #####################################
# Request the address named in r4's Location response header, again without
# letting requests follow any further redirect on its own.
step5_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    "Referer": "https://passport.lagou.com/login/login.html",
    "Host": "www.lagou.com",
    "Upgrade-Insecure-Requests": "1",
}
r5 = requests.get(
    r4.headers["Location"],
    headers=step5_headers,
    cookies=all_cookie_dict,
    allow_redirects=False,
)

# Merge the cookies set by this response into the shared cookie dict.
r5_cookie_dict = r5.cookies.get_dict()
all_cookie_dict.update(r5_cookie_dict)

# Show where this response redirects to next.
print(r5.headers["Location"])

  

第六次请求

# ##################################### Step 6: personal invitations #####################################
# Fetch the invitation page with the accumulated cookies; redirects are left
# at the library default for this request.
step6_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    "Host": "www.lagou.com",
    "Upgrade-Insecure-Requests": "1",
    "Pragma": "no-cache",
}
r = requests.get(
    "https://www.lagou.com/mycenter/invitation.html",
    headers=step6_headers,
    cookies=all_cookie_dict,
)

# Prints True when the text 'wupeiqi' occurs in the returned page.
page_text = r.text
print('wupeiqi' in page_text)

最后登陆成功:

 

第七步:

# ##################################### Step 7 #####################################
# Requests the address found in a previous response's Location header, with
# automatic redirects disabled, then merges the cookies set by this response
# into all_cookie_dict.
# NOTE(review): `r6` is not defined anywhere in the visible code - the step-6
# request is bound to the name `r` and is sent with redirects enabled. Confirm
# which response is meant before running this step.
r7 = requests.get(
    url=r6.headers['Location'],
    headers={
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'Referer':'https://passport.lagou.com/login/login.html',
        'Host':'www.lagou.com',
        'Upgrade-Insecure-Requests':'1',
    },
    cookies=all_cookie_dict,
    allow_redirects=False

)
r7_cookie_dict = r7.cookies.get_dict()
all_cookie_dict.update(r7_cookie_dict)

  

 第九步(原文此处缺少第八步的代码):

# ##################################### Step 9: view my info #####################################
# Sends a PUT request with a JSON body of profile fields to the account
# endpoint, using the accumulated cookies, and prints the response text.
# NOTE(review): the heading says "view", but the call is a PUT carrying
# userName/sex/portrait/positionName/introduce - it looks like an update;
# confirm the intent.
# NOTE(review): `r8_response_json` is not defined in the visible code;
# presumably it is the parsed JSON of a step-8 response that supplies
# 'submitCode' and 'submitToken' - confirm and restore that step.

r9 = requests.put(
    url='https://gate.lagou.com/v1/neirong/account/users/0/',
    headers={
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'Host':'gate.lagou.com',
        'Origin':'https://account.lagou.com',
        'Referer':'https://account.lagou.com/v2/account/userinfo.html',
        'X-L-REQ-HEADER':'{deviceType:1}',
        'X-Anit-Forge-Code':r8_response_json.get('submitCode'),
        'X-Anit-Forge-Token':r8_response_json.get('submitToken'),
        'Content-Type':'application/json;charset=UTF-8',
    },
    json={"userName":"wupeiqi999","sex":"MALE","portrait":"images/myresume/default_headpic.png","positionName":"...","introduce":"...."},
    cookies=all_cookie_dict
)

print(r9.text)

  

 

爬虫 抖音视频

 

3. requests模块
参数:
url
params
headers
cookies
data
示例:
# Form submission example: the dict passed as data= is sent as
# user=alex&pwd=sb. The module is `requests`, the keys must be string
# literals, and a url is required ('xxx' is a placeholder).
requests.post(
    url='xxx',
    data={
        'user': 'alex',
        'pwd': 'sb',
    },
)

user=alex&pwd=sb

chrome: formdata
json
示例:
# JSON submission example: the dict passed as json= is sent as
# '{"user":"alex","pwd":"sb"}'. The module is `requests`, the keys must be
# string literals, and a url is required ('xxx' is a placeholder).
requests.post(
    url='xxx',
    json={
        'user': 'alex',
        'pwd': 'sb',
    },
)

'{"user":"alex","pwd":"sb"}'

chrome: request payload

 

 

s10day112 

内容回顾:
	第一部分:爬虫相关
		1. 谈谈你对http协议的理解?
			规范:
				1. Http请求收发数据的格式
					GET /index/ HTTP/1.1\r\nHost: xxx.com\r\n\r\n
					POST /index/ HTTP/1.1\r\nHost: xxx.com\r\n\r\nuser=xxx
					
				2. 短链接(无状态)
					一次请求一次响应以后,就断开链接
					
				3. 基于TCP协议之上
					# Sketch only: the socket is assumed to be connected to the server already.
					sk = socket()
					# Lines end with CRLF escapes (\r\n, not /r/n), the version token is
					# HTTP/1.1, and send() takes bytes rather than str.
					sk.send(b'GET /index/ HTTP/1.1\r\nHost: xxx.com\r\n\r\n')
			常见请求头有哪些?
				host 
				content-type 
				user-agent
				cookies
				referer,上一次请求地址
			常见的请求方法有哪些?
				GET
				POST 
				DELETE
				PUT 
				PATCH
				OPTIONS
		2. requests 
			用于伪造浏览器发送请求
			参数:
				- url 
				- headers
				- data 
				- cookies 
			响应:
				- content 
				- text 
				- encoding='gbk'
				- headers 
				- cookies.get_dict()
				
		3. bs  
			用于解析HTML格式的字符串 
			方法和属性:
				- find 
				- find_all 
				- attrs
				- get 
				- text 
				
		4. 套路 
			- 汽车之家
			- 抽屉新闻:携带user-agent 
			- 登陆抽屉:第一次访问保留cookie,登陆时需要再次携带;
			- 自动登陆github:获取csrf_token,到底携带哪一个cookie

			
			补充:自动登陆github
			
	第二部分:路飞相关
		1. 公司的组织架构?
			开发:
				- 村长
				- 前端姑娘
				- 涛
				- 云(产品+开发)
			UI:1人
			测试:1人
			运维:1人
			运营:2人
			销售:3人
			班主任:1人
			全职助教:2人
			人事/财务:老男孩共享
			
		2. 项目架构
			- 管理后台(1)
				- 权限
				- xadmin
			- 导师后台(1)
				- 权限
				- xadmin 
			- 主站(1+1+0.5+1)
				- restful api 
				- vue.js 
			
			如今开发:题库系统

		3. 涉及技术点:
			- django 
			- django rest framework 
			- vue.js 
			- 跨域cors
			- redis 
			- 支付宝支付
			- 视频播放
				- CC视频
				- 保利 
			- 微信消息推送
				- 已认证的服务号
				- 发送模板消息 
			- content-type 
		
		
今日内容:
	- 拉勾网
	- 抖音 
	- requests
	- bs4 
	- 初识scrapy框架
	
	
内容详细:
	1.拉勾网
		- Token和Code存在页面上,自定义请求头上
		- 重定向:
			- 响应头的Location中获取要重定向的地址
			- 自己去处理
		- 请求发送时需要携带上次请求的code和token 
		
		原则:
			- 完全模拟浏览器的行为
	
	2. 爬抖音视频 
	
	3. requests模块 
		参数:	
			url 
			params 
			headers 
			cookies 
			data 
				示例:
					# Form submission example: the dict passed as data= is sent as
					# user=alex&pwd=sb. The module is `requests`, the keys must be string
					# literals, and a url is required ('xxx' is a placeholder).
					requests.post(
						url='xxx',
						data={
							'user': 'alex',
							'pwd': 'sb',
						},
					)
					
					user=alex&pwd=sb 
				
				chrome: formdata
			json 
				示例:
					# JSON submission example: the dict passed as json= is sent as
					# '{"user":"alex","pwd":"sb"}'. The module is `requests`, the keys must
					# be string literals, and a url is required ('xxx' is a placeholder).
					requests.post(
						url='xxx',
						json={
							'user': 'alex',
							'pwd': 'sb',
						},
					)
					
					'{"user":"alex","pwd":"sb"}'
	
				chrome: request payload 
			allow_redirects
			stream
	
			files 
				# Upload example: the open file object is passed under the form field
				# name 'f1'. The with-block closes the file once the request is done;
				# the bare open() call left the handle open.
				with open('readme', 'rb') as f1:
					requests.post(
						url='xxx',
						files={
							'f1': f1
						}
					)
			
			auth
				from requests.auth import HTTPBasicAuth, HTTPDigestAuth

				# Build the basic-auth credentials object first, then hand it to the request.
				basic_credentials = HTTPBasicAuth('admin', 'admin')
				ret = requests.get('https://api.github.com/user', auth=basic_credentials)
				print(ret.text)
	
			timeout 
				# Same URL requested twice: once with a single timeout value, once with a 2-tuple.
				# NOTE(review): presumably the tuple is (connect timeout, read timeout) - confirm in the requests docs.
				timeout_demo_url = 'http://google.com/'
				ret = requests.get(timeout_demo_url, timeout=1)

				ret = requests.get(timeout_demo_url, timeout=(5, 1))
			proxies
				# Proxy example: map each URL scheme to the proxy that should carry it.
				proxies = {
					"http": "61.172.249.96:80",
					"https": "http://61.185.219.126:3128",
				}
				# Alternative kept from the notes, keyed by a host URL instead of a scheme:
				# proxies = {'http://10.20.1.128': 'http://10.10.1.10:5323'}

				ret = requests.get("https://www.proxy360.cn/Proxy", proxies=proxies)
				print(ret.headers)


				# Same idea with proxy credentials supplied through an auth object.
				from requests.auth import HTTPProxyAuth
				auth = HTTPProxyAuth('username', 'mypassword')

				# Use the mapping defined above; the name `proxyDict` was never defined
				# and raised NameError.
				r = requests.get("http://www.google.com", proxies=proxies, auth=auth)
				
			证书相关:
				cert
				verify
				
		session:自动管理cookie和headers(不建议使用)
			import requests

			# One Session object is reused for all three requests below.
			session = requests.Session()

			# 1) Visit a page first.
			i1 = session.get(url="http://dig.chouti.com/help/service")

			# 2) Log in with the form fields.
			login_form = {
				'phone': "8615131255089",
				'password': "xxooxxoo",
				'oneMonth': "",
			}
			i2 = session.post(url="http://dig.chouti.com/login", data=login_form)

			# 3) Vote on a link and show the response.
			vote_url = "http://dig.chouti.com/link/vote?linksId=8589523"
			i3 = session.post(url=vote_url)
			print(i3.text)
				
	4. bs4 

		参考示例:https://www.cnblogs.com/wupeiqi/articles/6283017.html
	
	
预习:
	1. 安装scrapy 
		https://www.cnblogs.com/wupeiqi/articles/6229292.html
		
		a. 下载twisted 
			http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
		
		b. 安装wheel 
			pip3 install wheel
			
		c. 安装twisted 
			
			pip3 install Twisted‑18.7.0‑cp36‑cp36m‑win_amd64.whl
			
		d. 安装pywin32
			pip3 install pywin32
			
			
		e. 安装scrapy 
			pip3 install scrapy 

  

 

 

requests.get(url, params=None, **kwargs)
requests.post(url, data=None, json=None, **kwargs)
requests.put(url, data=None, **kwargs)
requests.head(url, **kwargs)
requests.delete(url, **kwargs)
requests.patch(url, data=None, **kwargs)
requests.options(url, **kwargs) 
相关文章
相关标签/搜索