python爬虫requests的使用

时间 2019-11-12

原文原文链接

1 发送get请求获取页面html

 # Section 1: fetch a page with GET and read the body as text in two ways.
import requests

# 1. Address of the page to crawl.
url = 'http://www.baidu.com'
# 2. Send the GET request and keep the response.
response = requests.get(url=url)
# 3. Read the response body as text -- two approaches.
# response.content is bytes; decode() with no argument decodes it as UTF-8.
html1 = response.content.decode()
print(html1)

# response.text chooses a decoding automatically and can come out garbled,
# so set response.encoding before reading it.
response.encoding = 'utf8'
html2 = response.text
print(html2)

2 发送post请求获取页面

 # Section 2: fetch a page with POST and read the body as text in two ways.
import requests

# 1. Address of the page to crawl.
url = 'http://www.baidu.com'
# 2. Send the POST request and keep the response.
response = requests.post(url=url)
# 3. Read the response body as text -- two approaches.
# response.content is bytes; decode() with no argument decodes it as UTF-8.
html1 = response.content.decode()
print(html1)

# response.text chooses a decoding automatically and can come out garbled,
# so set response.encoding before reading it.
response.encoding = 'utf8'
html2 = response.text
print(html2)

3 伪装浏览器，携带请求头

 # Section 3: send custom request headers so the request looks like a browser.
import requests

# Request headers; the User-Agent value is copied from a desktop Chrome build.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    # A cookie can be passed separately, or placed here in the headers:
    # 'Cookie': '<cookie string goes here>',
}
# 1. Address of the page to crawl.
url = 'http://www.baidu.com'
# 2. Send the GET request with the custom headers and keep the response.
response = requests.get(url=url, headers=headers)
# 3. Read the response body as text.
# response.content is bytes; decode() with no argument decodes it as UTF-8.
html = response.content.decode()
print(html)

4 携带数据（比如发送请求去登录）

 # Section 4: send key/value data with a request (e.g. a login form).
import requests

# Data to send. This must be a dict of key: value pairs; fill in the fields
# the target endpoint expects.
data = {
    # 'key': 'value',
}
# 1. Address of the page to crawl.
url = 'http://www.baidu.com'
# 2. Send the request and keep the response.
# For GET, pass the dict as params: it is appended to the URL as
# key=value&key=value.
response = requests.get(url=url, params=data)
# For POST, pass the dict as data: it is carried in the request body.
# This assignment replaces the GET response above.
response = requests.post(url=url, data=data)
# 3. Read the response body as text.
# response.content is bytes; decode() with no argument decodes it as UTF-8.
html = response.content.decode()
print(html)

5 代理

# Section 5: route the request through a proxy server.
import requests

# Proxy servers, keyed by protocol; each value is the proxy's address and port.
# The entry used for a request is picked by whether the request is http or
# https.
proxies = {
    'http': 'http://127.0.0.1:80',
    'https': 'https://127.0.0.1:80',
}
# 1. Address of the page to crawl.
url = 'http://www.baidu.com'
# 2. Send the GET request through the proxy and keep the response.
response = requests.get(url=url, proxies=proxies)
# 3. Read the response body as text.
# response.content is bytes; decode() with no argument decodes it as UTF-8.
html = response.content.decode()
print(html)

6 携带cookie

 # Section 6: send cookies, either as a dict or inside the request headers.
import requests

# Option A: cookies as a dict passed through the cookies= argument.
cookies = {
    # 'key': 'value',
}

# Option B: the cookie string placed directly in the request headers.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    # 'Cookie': '<cookie string goes here>',
}

# 1. Address of the page to crawl.
url = 'http://www.baidu.com'
# 2. Send the GET request and keep the response (option A shown; option B
#    would pass headers=headers instead of cookies=cookies).
response = requests.get(url=url, cookies=cookies)
# 3. Read the response body as text.
# response.content is bytes; decode() with no argument decodes it as UTF-8.
html = response.content.decode()
print(html)

7 保持session 帮我们保存response中的session

 # Section 7: reuse one session object across requests.
import requests

# A session object sends requests for us; it is used the same way as the
# top-level requests functions.
session = requests.session()

url = 'http://www.baidu.com'
# Send the request through the session.
response = session.get(url=url)
# Read the page body; decode() with no argument decodes the bytes as UTF-8.
html = response.content.decode()
print(html)
# Inspect the cookies carried by this response.
print(response.cookies)

8 设置连接超时时间

 # Section 8: set a timeout on the request.
import requests

# A session object sends requests for us; it is used the same way as the
# top-level requests functions.
session = requests.session()

url = 'http://www.baidu.com'
# Send the request through the session with a 3-second timeout.
response = session.get(url=url, timeout=3)
# Read the page body; decode() with no argument decodes the bytes as UTF-8.
html = response.content.decode()
print(html)
# Inspect the cookies carried by this response.
print(response.cookies)

9 设置ssl校验 对方https协议合法性是否忽略

 # Section 9: turn off SSL certificate verification.
import requests

# A session object sends requests for us; it is used the same way as the
# top-level requests functions.
session = requests.session()

url = 'http://www.baidu.com'
# verify=False skips SSL verification, so the request continues even when the
# peer's https certificate is not valid.
# NOTE(review): verification only applies to https URLs; the example URL here
# is plain http, so swap in an https address to see the effect.
response = session.get(url=url, verify=False)
# Read the page body; decode() with no argument decodes the bytes as UTF-8.
html = response.content.decode()
print(html)
# Inspect the cookies carried by this response.
print(response.cookies)

10 重新连接次数

 # Section 10: retry a request a limited number of times.
import requests
from retrying import retry


@retry(stop_max_attempt_number=3)  # stop retrying after 3 attempts
def get(url):
    """Fetch *url* with GET and return the body decoded as UTF-8.

    The request uses a 3-second timeout. The ``retry`` decorator calls the
    function again when it raises, up to the configured attempt limit.
    """
    response = requests.get(url=url, timeout=3)
    return response.content.decode()


url = 'http://www.baidu.com'
html = get(url)
print(html)