When getting started with web scraping, the requests module comes up constantly. Using it well requires some familiarity with HTTP, HTTPS, and how the browser issues requests; once you understand the browser's request process and what a crawler essentially does, the rest is much easier to pick up.
```python
# GET request
response = requests.get(url, headers=headers)

# GET request with query-string parameters
response = requests.get(url, params=kw, headers=headers)

# POST request
response = requests.post(url, data=data, headers=headers)
```
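Putting that together, here is a minimal, self-contained sketch of a GET request; the URL and the User-Agent string are only placeholders, so swap in your own target and headers:

```python
import requests

# Placeholder target and headers; replace with the site you are scraping
url = "http://www.baidu.com"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

response = requests.get(url, headers=headers)
print(response.status_code)        # HTTP status code, e.g. 200
print(response.content.decode())   # response body decoded from bytes to text
```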
```python
# Create a session instance
session = requests.session()

# GET request through the session
response = session.get(url, headers=headers)

# POST request through the session
response = session.post(post_url, data=post_data, headers=headers)
```
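The point of a session is that it keeps cookies between requests. A short sketch using httpbin.org (the URLs are only for illustration): the first request sets a cookie, and the session automatically sends it with the second request.

```python
import requests

session = requests.session()

# httpbin.org stores the cookie we ask it to set
session.get("http://httpbin.org/cookies/set/sessioncookie/123456789")

# The session sends the stored cookie along with this request
response = session.get("http://httpbin.org/cookies")
print(response.text)   # expected: {"cookies": {"sessioncookie": "123456789"}}
```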
```python
cookies = {"cookie name": "cookie value"}
response = requests.get(url, headers=headers, cookies=cookies)
```
A dictionary comprehension works like a list comprehension and is a convenient way to build such a dict:

```python
>>> {i: i + 10 for i in range(10)}
{0: 10, 1: 11, 2: 12, 3: 13, 4: 14, 5: 15, 6: 16, 7: 17, 8: 18, 9: 19}
>>> {i: i + 10 for i in range(10) if i % 2 == 0}
{0: 10, 2: 12, 4: 14, 6: 16, 8: 18}
```
```python
# Build a dict of cookies from the raw cookie string
# Note: {i for i in cookie.split("; ")} would produce a set, not a dict
cookie_dict = {i.split("=")[0]: i.split("=")[1] for i in cookie.split("; ")}
response = requests.get(url, headers=headers, cookies=cookie_dict)
```
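For example, with a made-up cookie string in the usual "name1=value1; name2=value2" format copied from the browser's developer tools, the comprehension above produces:

```python
# Made-up cookie string for illustration only
cookie = "BDORZ=27315; H_PS_PSSID=1437_21094"

cookie_dict = {i.split("=")[0]: i.split("=")[1] for i in cookie.split("; ")}
print(cookie_dict)   # {'BDORZ': '27315', 'H_PS_PSSID': '1437_21094'}
```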
```python
# coding="utf-8"
import requests

url = "http://www.baidu.com"
response = requests.get(url)
print(type(response.cookies))
# Output:
# <class 'requests.cookies.RequestsCookieJar'>

# Convert the CookieJar object to a dict
cookies = requests.utils.dict_from_cookiejar(response.cookies)
print(cookies)
# Output:
# {'BDORZ': '27315'}
```
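requests.utils also offers the reverse helper, cookiejar_from_dict. A short sketch of round-tripping the dict from the example above:

```python
# Convert a plain dict back into a CookieJar
cookie_jar = requests.utils.cookiejar_from_dict({"BDORZ": "27315"})
print(type(cookie_jar))   # <class 'requests.cookies.RequestsCookieJar'>
```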
Method 1: the timeout parameter
```python
response = requests.get(url, timeout=3)
```
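If the server does not respond within 3 seconds, requests raises an exception that you can catch; a minimal sketch (the URL is a placeholder):

```python
import requests

try:
    response = requests.get("http://www.baidu.com", timeout=3)
except requests.exceptions.Timeout:
    print("The request timed out after 3 seconds")
```

Method 2: pair the timeout with the retrying module, so a slow or failed request is automatically retried a few times before giving up.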
```python
import requests
from retrying import retry

@retry(stop_max_attempt_number=3)  # try the request at most 3 times
def _parse_url(url):
    """The leading underscore marks this helper as internal to the current module."""
    response = requests.get(url, headers=headers, timeout=3)
    assert response.status_code == 200
    return response
```
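One way to use this helper is to wrap it so that callers get None instead of an exception once all three attempts fail. This is only a sketch: parse_url and the URL are illustrative names, and it assumes headers is defined as in the earlier snippets.

```python
def parse_url(url):
    """Return the decoded page text, or None if all three attempts fail."""
    try:
        response = _parse_url(url)
        html = response.content.decode()
    except Exception:
        html = None
    return html

if __name__ == "__main__":
    print(parse_url("http://www.baidu.com"))
```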