# ============================ Step 1: visit the login page ============================
import requests

r1 = requests.get(
    url="https://passport.lagou.com/login/login.html",
    headers={
        # Adjacent string literals are concatenated, so the long value can be
        # wrapped without the stray backslash the original left inside it.
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36"
        ),
    },
)
# Cookies set by the login page, kept as a plain dict for the next request.
r1_cookie_dict = r1.cookies.get_dict()
print(r1.text)
print("r1-cookie:===>", r1_cookie_dict)
打印结果:html
# ======================= Step 2: log in to Lagou =======================
import re

# The login page assigns the anti-forgery token and code to JavaScript
# variables. The non-greedy group stops at the first closing quote, so a
# later "';" on the same line cannot be pulled into the capture.
token_match = re.search(r"X_Anti_Forge_Token = '(.*?)';", r1.text)
code_match = re.search(r"X_Anti_Forge_Code = '(.*?)';", r1.text)
if token_match is None or code_match is None:
    # Fail with a readable message instead of a bare IndexError.
    raise ValueError(
        "X_Anti_Forge_Token / X_Anti_Forge_Code not found in the login page"
    )
token = token_match.group(1)
code = code_match.group(1)
print(token)
print(code)

r2 = requests.post(
    url="https://passport.lagou.com/login/login.json",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Connection": "keep-alive",
        # Content-Length is left for requests to derive from the encoded form
        # body; the original hard-coded "111".
        # The original value contained spaces ("https: // ..."), which is not
        # a valid origin.
        "Origin": "https://passport.lagou.com",
        "Referer": "https://passport.lagou.com/login/login.html",
        # NOTE(review): the "Anit" spelling is kept exactly as in the original
        # header names - presumably what the server expects; confirm.
        "X-Anit-Forge-Code": code,
        "X-Anit-Forge-Token": token,
    },
    data={
        "isValidate": "true",
        "username": "",
        "password": "4d541689997b5ff6ac90a350b5dd6693",
        "request_form_verifyCode": "",
        "submit": "",
    },
    # Send back the cookies received with the login page.
    cookies=r1_cookie_dict,
)
print(r2.text)
打印结果前端
# Request the invitation page, sending only the cookies from the login page.
import requests

r3 = requests.get(
    url="https://www.lagou.com/mycenter/invitation.html",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
        "Host": "www.lagou.com",
        "Upgrade-Insecure-Requests": "1",
    },
    # The cookies are not correct; what gets printed is not correct either.
    cookies=r1_cookie_dict,
)
print(r3.text)
# ======================= Step 3: request the service ticket =======================
import requests

# Start the shared cookie jar that the later steps read. It combines the
# cookies from the login page with the ones set by the login response.
# NOTE(review): assumes the login response r2 sets the cookies this endpoint
# needs; the original sent only r1_cookie_dict - confirm.
all_cookie_dict = dict(r1_cookie_dict)
all_cookie_dict.update(r2.cookies.get_dict())

r3 = requests.get(
    url="https://passport.lagou.com/grantServiceTicket/grant.html",
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
        "Host": "passport.lagou.com",
        "Upgrade-Insecure-Requests": "1",
        "Connection": "keep-alive",
        "Referer": "https://passport.lagou.com/login/login.html?ts=1532004536388&serviceId=lagou&service=https%253A%252F%252Fwww.lagou.com%252F&action=login&signature=F241DF2A40C183BA91C33BA6604912F0",
    },
    cookies=all_cookie_dict,
    allow_redirects=False,  # turn redirects off so the response can be inspected
)
r3_cookie_dict = r3.cookies.get_dict()
all_cookie_dict.update(r3_cookie_dict)
print(r3.text)
print(r3.cookies.get_dict())
打印结果:vue
# ======================= Step 4: send the action request =======================
import requests

# Collect every cookie received so far under the name the later steps use
# (all_cookie_dict). The original referred to "all_cookies_dict" and called
# .update() with no argument, which merges nothing.
all_cookie_dict = {}
all_cookie_dict.update(r1_cookie_dict)
all_cookie_dict.update(r2.cookies.get_dict())
all_cookie_dict.update(r3_cookie_dict)

r4 = requests.get(
    # The original hard-coded a URL carrying one specific ticket value
    # (ST-f6c6...). Read the target from the previous response instead.
    # NOTE(review): assumes r3 is the redirect response whose Location header
    # is the grantST URL - confirm.
    url=r3.headers["Location"],
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
        "Host": "www.lagou.com",
        "Referer": "https://passport.lagou.com/login/login.html?ts=1532005741245&serviceId=lagou&service=https%253A%252F%252Fwww.lagou.com%252F&action=login&signature=ED6DE46236FC2638697A5ECC080822F7",
    },
    cookies=all_cookie_dict,
    allow_redirects=False,
)
r4_cookie_dict = r4.cookies.get_dict()
# Keep the jar current, as the later steps do with their own cookies.
all_cookie_dict.update(r4_cookie_dict)
print("r4===>", r4.text)
# ##################################### Step 5: fetch the authentication info #####################################
# Request the address named in r4's Location header, without letting requests
# follow any further redirect.
r5 = requests.get(
    url=r4.headers['Location'],
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'Referer': 'https://passport.lagou.com/login/login.html',
        'Host': 'www.lagou.com',
        'Upgrade-Insecure-Requests': '1',
    },
    cookies=all_cookie_dict,
    allow_redirects=False,
)
# Fold the cookies from this response into the shared jar.
r5_cookie_dict = r5.cookies.get_dict()
all_cookie_dict.update(r5_cookie_dict)
print(r5.headers['Location'])
第六次请求
# ##################################### Step 6: personal invitations #####################################
# Load the invitation page with the accumulated cookies.
r = requests.get(
    url='https://www.lagou.com/mycenter/invitation.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'Host': 'www.lagou.com',
        'Upgrade-Insecure-Requests': '1',
        'Pragma': 'no-cache',
    },
    cookies=all_cookie_dict,
)
# Prints True when the text 'wupeiqi' occurs in the returned page
# (presumably the logged-in account's name - confirm).
print('wupeiqi' in r.text)
# ##################################### Step 7 #####################################
# NOTE(review): r6 is not assigned anywhere in the visible source (step 6
# stores its response in `r`) - confirm where r6 is meant to come from.
r7 = requests.get(
    url=r6.headers['Location'],
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'Referer': 'https://passport.lagou.com/login/login.html',
        'Host': 'www.lagou.com',
        'Upgrade-Insecure-Requests': '1',
    },
    cookies=all_cookie_dict,
    allow_redirects=False,
)
# Fold the cookies from this response into the shared jar.
r7_cookie_dict = r7.cookies.get_dict()
all_cookie_dict.update(r7_cookie_dict)
# ##################################### Step 9: view my information #####################################
# Sends a PUT with a JSON body of profile fields to the account endpoint.
# NOTE(review): r8_response_json is not assigned anywhere in the visible
# source (there is no step 8) - confirm where submitCode / submitToken come from.
r9 = requests.put(
    url='https://gate.lagou.com/v1/neirong/account/users/0/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'Host': 'gate.lagou.com',
        'Origin': 'https://account.lagou.com',
        'Referer': 'https://account.lagou.com/v2/account/userinfo.html',
        'X-L-REQ-HEADER': '{deviceType:1}',
        'X-Anit-Forge-Code': r8_response_json.get('submitCode'),
        'X-Anit-Forge-Token': r8_response_json.get('submitToken'),
        'Content-Type': 'application/json;charset=UTF-8',
    },
    json={
        "userName": "wupeiqi999",
        "sex": "MALE",
        "portrait": "images/myresume/default_headpic.png",
        "positionName": "...",
        "introduce": "....",
    },
    cookies=all_cookie_dict,
)
print(r9.text)
3. requests模块
参数:
url
params
headers
cookies
data
示例:
requests.post(
    data={
        'user': 'alex',
        'pwd': 'sb'
    }
)
user=alex&pwd=sb
chrome: formdata
json
示例:
requests.post(
    json={
        'user': 'alex',
        'pwd': 'sb'
    }
)
'{"user":"alex","pwd":"sb"}'
chrome: request payload
s10day112
内容回顾:
第一部分:爬虫相关
1. 谈谈你对http协议的理解?
规范:
1. Http请求收发数据的格式
GET /index/ HTTP/1.1\r\nHost: xxx.com\r\n\r\n
POST /index/ HTTP/1.1\r\nHost: xxx.com\r\n\r\nuser=xxx
2. 短连接(无状态)
一次请求一次响应以后,就断开连接
3. 基于TCP协议之上
sk = socket()
sk.send('GET /index/ HTTP/1.1\r\nHost: xxx.com\r\n\r\n')
常见请求头有哪些?
host
content-type
user-agent
cookies
referer,上一次请求地址
常见的请求方法有哪些?
GET
POST
DELETE
PUT
PATCH
OPTIONS
2. requests
用于伪造浏览器发送请求
参数:
- url
- headers
- data
- cookies
响应:
- content
- text
- encoding='gbk'
- headers
- cookies.get_dict()
3. bs
用于解析HTML格式的字符串
方法和属性:
- find
- find_all
- attrs
- get
- text
4. 套路
- 汽车之家
- 抽屉新闻:携带user-agent
- 登陆抽屉:第一次访问保留cookie,登陆时需要再次携带;
- 自动登陆github:获取csrf_token,到底携带哪一个cookie
补充:自动登陆github
第二部分:路飞相关
1. 公司的组织架构?
开发:
- 村长
- 前端姑娘
- 涛
- 云(产品+开发)
UI:1人
测试:1人
运维:1人
运营:2人
销售:3人
班主任:1人
全职助教:2人
人事/财务:老男孩共享
2. 项目架构
- 管理后台(1)
- 权限
- xadmin
- 导师后台(1)
- 权限
- xadmin
- 主站(1+1+0.5+1)
- restful api
- vue.js
如今开发:题库系统
3. 涉及技术点:
- django
- django rest framework
- vue.js
- 跨域cors
- redis
- 支付宝支付
- 视频播放
- CC视频
- 保利
- 微信消息推送
- 已认证的服务号
- 发送模板消息
- content-type
今日内容:
- 拉勾网
- 抖音
- requests
- bs4
- 初识scrapy框架
内容详细:
1.拉勾网
- Token和Code存在页面上,自定义请求头上
- 重定向:
- 响应头的Location中获取要重定向的地址
- 自己去处理
- 请求发送时需要携带上次请求的code和token
原则:
- 彻底模拟浏览器的行为
2. 爬抖音视频
3. requests模块
参数:
url
params
headers
cookies
data
示例:
requests.post(
    data={
        'user': 'alex',
        'pwd': 'sb'
    }
)
user=alex&pwd=sb
chrome: formdata
json
示例:
requests.post(
    json={
        'user': 'alex',
        'pwd': 'sb'
    }
)
'{"user":"alex","pwd":"sb"}'
chrome: request payload
allow_redirects
stream
files
# Upload the local file 'readme' under the form field name 'f1'. The with-block
# closes the file handle once the request has been sent; the original opened
# it inline and never closed it.
with open('readme', 'rb') as upload:
    requests.post(
        url='xxx',
        files={
            'f1': upload,
        },
    )
auth
from requests.auth import HTTPBasicAuth, HTTPDigestAuth
ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('admin', 'admin'))
print(ret.text)
timeout
ret = requests.get('http://google.com/', timeout=1)
ret = requests.get('http://google.com/', timeout=(5, 1))
proxies
# Map each URL scheme to the proxy that should carry it. Each proxy address
# carries its own scheme; the original "http" entry had none.
proxies = {
    "http": "http://61.172.249.96:80",
    "https": "http://61.185.219.126:3128",
}
# Alternative form that keys the proxy on scheme://host:
# proxies = {'http://10.20.1.128': 'http://10.10.1.10:5323'}
ret = requests.get("https://www.proxy360.cn/Proxy", proxies=proxies)
print(ret.headers)

# Same proxies, this time with proxy credentials.
from requests.auth import HTTPProxyAuth

auth = HTTPProxyAuth('username', 'mypassword')
# The original passed an undefined name (proxyDict); use the dict defined above.
r = requests.get("http://www.google.com", proxies=proxies, auth=auth)
证书相关:
cert
verify
session:自动管理cookie和headers(不建议使用)
import requests

# All three requests go through the same Session object.
session = requests.Session()

# 1. Plain GET of the help page.
i1 = session.get(url="http://dig.chouti.com/help/service")

# 2. Log in with form-encoded credentials.
i2 = session.post(
    url="http://dig.chouti.com/login",
    data={
        'phone': "8615131255089",
        'password': "xxooxxoo",
        'oneMonth': "",
    },
)

# 3. POST to the vote URL for one link.
i3 = session.post(url="http://dig.chouti.com/link/vote?linksId=8589523")
print(i3.text)
4. bs4
参考示例:https://www.cnblogs.com/wupeiqi/articles/6283017.html
预习:
1. 安装scrapy
https://www.cnblogs.com/wupeiqi/articles/6229292.html
a. 下载twisted
http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
b. 安装wheel
pip3 install wheel
c. 安装twisted
pip3 install Twisted-18.7.0-cp36-cp36m-win_amd64.whl
d. 安装pywin32
pip3 install pywin32
e. 安装scrapy
pip3 install scrapy
requests.get(url, params=None, **kwargs)
requests.post(url, data=None, json=None, **kwargs)
requests.put(url, data=None, **kwargs)
requests.head(url, **kwargs)
requests.delete(url, **kwargs)
requests.patch(url, data=None, **kwargs)
requests.options(url, **kwargs)