Requests 唯一的一个非转基因的 Python HTTP 库,人类可以安全享用。
警告:非专业使用其他 HTTP 库会导致危险的副作用,包括:安全缺陷症、冗余代码症、重新发明轮子症、啃文档症、抑郁、头疼、甚至死亡。
因为在使用urllib模块的时候,会有诸多不便之处,总结如下:
使用requests模块:
如何使用requests模块
# One helper exists per HTTP verb; requests.get() and requests.post() are
# the two used most often.
import requests

form_payload = {'key': 'value'}

r = requests.get('https://api.github.com/events')
r = requests.post('http://httpbin.org/post', data=form_payload)
r = requests.put('http://httpbin.org/put', data=form_payload)
r = requests.delete('http://httpbin.org/delete')
r = requests.head('http://httpbin.org/get')
r = requests.options('http://httpbin.org/get')
基本请求
import requests

# Fetch the page with a plain GET and print the decoded body.
page = requests.get('http://dig.chouti.com/')
print(page.text)
带参数的GET请求->params
import requests
from urllib.parse import urlencode

# Pretend to be a browser in the request headers; per the original note,
# Baidu will not return the normal page content otherwise.
browser_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
}

# --- Variant 1: build the GET query string by hand -------------------------
response = requests.get(
    'https://www.baidu.com/s?wd=python&pn=1',
    headers=browser_headers,
)
print(response.text)

# A keyword containing Chinese or other special characters has to be
# URL-encoded before it can be placed in the URL.
wd = 'egon老师'
encode_res = urlencode({'k': wd}, encoding='utf-8')
keyword = encode_res.split('=')[1]  # keep only the encoded value, drop "k="
print(keyword)

# Then splice the encoded keyword into the URL.
url = 'https://www.baidu.com/s?wd=%s&pn=1' % keyword
response = requests.get(url, headers=browser_headers)
res1 = response.text

# --- Variant 2: let requests do it through the params argument -------------
# The manual steps above are what params= does for you (the original author
# notes that it boils down to urlencode as well).
pn = 1
query = {
    'wd': wd,
    'pn': pn,
}
response = requests.get(
    'https://www.baidu.com/s',
    params=query,
    headers=browser_headers,
)
res2 = response.text

# Verification: open a.html and b.html, the page contents should be the same.
for file_name, html in (('a.html', res1), ('b.html', res2)):
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(html)
带参数的GET请求->headers
# 通常我们在发送请求时都需要带上请求头,请求头是将自身伪装成浏览器的关键,常见的有用的请求头如下 Host Referer # 大型网站通常都会根据该参数判断请求的来源 User-Agent # 客户端 Cookie # Cookie信息虽然包含在请求头里,但requests模块有单独的参数来处理它,headers={}内就不要放它了
# Adding headers: servers inspect the request headers and may refuse a
# request that does not carry them (for example https://www.zhihu.com/explore).
import requests

# Without any custom header. The status is printed explicitly: a bare
# `response.status_code` expression only shows a value inside the REPL.
response = requests.get('https://www.zhihu.com/explore')
print(response.status_code)  # the original author observed 500 here

# With a hand-crafted User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36',
}
response = requests.get('https://www.zhihu.com/explore', headers=headers)
print(response.status_code)  # the original author observed 200 here
带参数的GET请求->cookies
# Log in to GitHub in a browser, copy the cookies out of the browser, and
# from then on the cookie alone is enough to act as the logged-in user; no
# username or password has to be entered.
# NOTE(security): the account name and password that used to be written in
# this comment were removed; credentials must not be kept in source. The
# session cookie below is a secret of the same kind and should be supplied
# from outside the file in real code.
import requests

cookies = {
    'user_session': 'wGMHFJKgDcmRIVvcA14_Wrt_3xaUyJNsBnPbYzEL6L0bHcfc',
}

# Per the original note, GitHub does not restrict request headers, so no
# custom User-Agent is needed here; other sites may still require one.
response = requests.get('https://github.com/settings/emails', cookies=cookies)

# The settings page lists the account's e-mail address when the cookie is
# accepted (the original author observed True).
print('378533872@qq.com' in response.text)
介绍
# GET请求 HTTP默认的请求方法就是GET * 没有请求体 * 数据必须在1K以内! * GET请求数据会暴露在浏览器的地址栏中 GET请求常用的操作: 1. 在浏览器的地址栏中直接给出URL,那么就一定是GET请求 2. 点击页面上的超链接也一定是GET请求 3. 提交表单时,表单默认使用GET请求,但可以设置为POST # POST请求 (1). 数据不会出现在地址栏中 (2). 数据的大小没有上限 (3). 有请求体 (4). 请求体中如果存在中文,会使用URL编码! #!!!requests.post()用法与requests.get()完全一致,特殊的是requests.post()有一个data参数,用来存放请求体数据
发送post请求,模拟浏览器的登陆行为
# 对于登陆来说,应该输错用户名或密码然后分析抓包流程,用脑子想想,输对了浏览器就跳转了,还分析个毛线,累死你也找不到包
# Automatic GitHub login, handling the cookie information by hand.
'''
1. Target-site analysis
    Open https://github.com/login in a browser,
    enter a wrong account/password and capture the traffic.
    The login turns out to be a POST to https://github.com/session,
    the request headers carry a cookie,
    and the request body contains:
        commit:Sign in
        utf8:✓
        authenticity_token:lbI8IJCwGslZS8qJPnof5e7ZkCoSoMn6jmDTsL1r/m06NLyIbw7vCrpwrFAPzHMep3Tmf/TSJVoXWrvDZaVwxQ==
        login:egonlin
        password:123

2. Flow analysis
    First GET https://github.com/login to obtain the initial cookie and the authenticity_token.
    Then POST to https://github.com/session, sending the initial cookie and the
    request body (authenticity_token, user name, password, ...).
    Finally pick up the logged-in cookie.

    ps: when a site submits the password in encrypted form, enter a wrong account
    with the right password and read the encrypted password from the browser;
    GitHub submits the password in plain text.
'''
import re

import requests

# Request 1: load the login page to get the initial (not yet authorised)
# cookie and the CSRF token embedded in the form.
login_page = requests.get('https://github.com/login')
initial_cookie = login_page.cookies.get_dict()
token_matches = re.findall(
    r'name="authenticity_token".*?value="(.*?)"',
    login_page.text,
)
authenticity_token = token_matches[0]

# Request 2: POST the account and password together with the initial cookie
# and the token.
form = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': authenticity_token,
    'login': '317828332@qq.com',
    'password': 'alex3714',
}
login_result = requests.post(
    'https://github.com/session',
    data=form,
    cookies=initial_cookie,
)
login_cookie = login_result.cookies.get_dict()

# Request 3: from now on login_cookie is all that is needed, e.g. to open a
# personal settings page.
settings_page = requests.get(
    'https://github.com/settings/emails',
    cookies=login_cookie,
)
print('317828332@qq.com' in settings_page.text)  # the original author observed True
# A requests session stores the cookies of every response and sends them
# with later requests, so no manual cookie bookkeeping is needed.
import re

import requests

# The session is used as a context manager so that it is closed once the
# three requests are done (the original never closed it).
with requests.Session() as session:
    # Request 1: load the login page and pull the CSRF token out of the form.
    r1 = session.get('https://github.com/login')
    authenticity_token = re.findall(
        r'name="authenticity_token".*?value="(.*?)"',
        r1.text,
    )[0]

    # Request 2: submit the login form; the session attaches its cookies.
    data = {
        'commit': 'Sign in',
        'utf8': '✓',
        'authenticity_token': authenticity_token,
        'login': '767124330@qq.com',
        'password': 'jiumo123',
    }
    r2 = session.post('https://github.com/session', data=data)

    # Request 3: open a page that requires the logged-in state.
    r3 = session.get('https://github.com/settings/emails')

    # NOTE(review): the address checked here differs from the login address
    # used above; confirm which address the settings page is expected to show.
    print('317828332@qq.com' in r3.text)  # the original author observed True
补充
# No request header specified: with data= the default Content-Type is
# application/x-www-form-urlencoded.
form_body = {'xxx': 'yyy'}
requests.post(url='xxxxxxxx', data=form_body)

# If the Content-Type is overridden to application/json while the values are
# still passed through data=, the server side cannot read the values.
json_header = {'content-type': 'application/json'}
requests.post(url='', data={'': 1}, headers=json_header)

# Passing the values through json= instead: the default Content-Type is
# application/json.
requests.post(url='', json={'': 1})
response属性
import requests
from contextlib import closing

resp = requests.get('http://www.jianshu.com')

# Attributes and accessors of the response object, printed one by one.
print(resp.text)
print(resp.content)

print(resp.status_code)
print(resp.headers)
print(resp.cookies)
print(resp.cookies.get_dict())
print(resp.cookies.items())

print(resp.url)
print(resp.history)

print(resp.encoding)

# Closing a response: response.close(). contextlib.closing() calls close()
# when the with-block exits.
with closing(requests.get('xxx', stream=True)) as response:
    for line in response.iter_content():
        pass
编码问题
# Encoding issues
import requests

news_page = requests.get('http://www.autohome.com/news')

# Per the original note: this site returns gb2312-encoded content while
# requests falls back to ISO-8859-1, so the Chinese text is garbled unless
# the encoding is set to gbk by enabling the following line.
# news_page.encoding='gbk'
print(news_page.text)
获取二进制数据
import requests

image_url = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1509868306530&di=712e4ef3ab258b36e9f4b48e85a81c9d&imgtype=0&src=http%3A%2F%2Fc.hiphotos.baidu.com%2Fimage%2Fpic%2Fitem%2F11385343fbf2b211e1fb58a1c08065380dd78e0c.jpg'

# response.content holds the raw bytes of the body, which is what an image
# file needs; write it in binary mode.
image_response = requests.get(image_url)
raw_bytes = image_response.content

with open('a.jpg', 'wb') as f:
    f.write(raw_bytes)
# stream=True: fetch the body piece by piece. For a large download (say a
# 100 GB video) reading response.content and writing it in one go would
# hold the whole file in memory.
import requests

video_url = 'https://gss3.baidu.com/6LZ0ej3k1Qd3ote6lo7D0j9wehsv/tieba-smallvideo-transcode/1767502_56ec685f9c7ec542eeaf6eac93a65dc7_6fe25cd1347c_3.mp4'

# iter_content() defaults to chunk_size=1, i.e. one byte per iteration and
# per write; an explicit chunk size keeps memory bounded without that cost.
chunk_size = 64 * 1024

# Using the response as a context manager releases the streamed connection
# even if writing the file fails part-way.
with requests.get(video_url, stream=True) as response:
    with open('b.mp4', 'wb') as f:
        for chunk in response.iter_content(chunk_size=chunk_size):
            f.write(chunk)
解析json
# Parsing JSON
import json

import requests

reply = requests.get('http://httpbin.org/get')

# The long way round: decode the text body with the json module.
res1 = json.loads(reply.text)

# The short way: let the response object decode the JSON itself.
res2 = reply.json()

print(res1 == res2)  # the original author observed True
import requests
import os

# Search keyword, read from the user.
word = input('enter a word you want to search:')

# Custom request headers.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
}

# Target url.
url = 'https://www.sogou.com/web'

# Query-string parameters of the GET request. The original defined this
# dict as `prams` but passed `param`, which raised NameError.
param = {
    'query': word,
    'ie': 'utf-8',
}

# Send the request. The custom headers are passed along now; the original
# built them but never used them.
response = requests.get(url=url, params=param, headers=headers)

# Save the response body.
page_text = response.text
with open('./sougou.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
import requests
import os

url = 'https://accounts.douban.com/login'

# Form fields of the login request.
# NOTE(security): the account and password are hard-coded here; real code
# should read them from outside the source file.
data = {
    "source": "movie",
    "redir": "https://movie.douban.com/",
    "form_email": "15027900535",
    "form_password": "bobo@15027900535",
    "login": "登陆",
}

# Custom request headers.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
}

# The custom headers are passed along now; the original built them but
# posted without them.
response = requests.post(url=url, data=data, headers=headers)

# Save the page returned by the login request.
page_text = response.text
with open('./douban111.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
# -*- coding:utf-8 -*-
import requests
import urllib.request

if __name__ == "__main__":
    # URL of the ajax GET request (found by capturing the browser traffic).
    url = 'https://movie.douban.com/j/chart/top_list?'

    # Custom request headers; they must be wrapped in a dict. Only the
    # User-Agent is set here, other headers can be customised the same way.
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
    headers = {'User-Agent': user_agent}

    # Parameters carried by the GET request (taken from the capture tool).
    param = dict(
        type='5',
        interval_id='100:90',
        action='',
        start='0',
        limit='20',
    )

    # Send the GET request and keep the response object.
    response = requests.get(url=url, headers=headers, params=param)

    # The response body is a JSON string.
    print(response.text)
# -*- coding:utf-8 -*-
import requests
import urllib.request

if __name__ == "__main__":
    # URL of the ajax POST request (found by capturing the browser traffic).
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'

    # Custom request headers; they must be wrapped in a dict. Only the
    # User-Agent is set here, other headers can be customised the same way.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    }

    # Form fields carried by the POST request (taken from the capture tool).
    data = {
        'cname': '',
        'pid': '',
        'keyword': '北京',
        'pageIndex': '1',
        'pageSize': '10',
    }

    # Send the POST request. The original called requests.get() with data=,
    # which issues a GET instead of the form POST this endpoint is captured as.
    response = requests.post(url=url, headers=headers, data=data)

    # The response body is a JSON string.
    print(response.text)
import requests
from fake_useragent import UserAgent

# Pick a random User-Agent string for all requests of this run.
ua = UserAgent(use_cache_server=False, verify_ssl=False).random
headers = {'User-Agent': ua}

url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
pageNum = 3

for page in range(3, 5):
    # Form fields of the list request for this page.
    data = {
        'on': 'true',
        'page': str(page),
        'pageSize': '15',
        'productName': '',
        'conditionType': '1',
        'applyname': '',
        'applysn': '',
    }
    json_text = requests.post(url=url, data=data, headers=headers).json()

    # Collect the ID of every list entry; it is the key for the detail
    # (second-level) request. The other fields of an entry (EPS_NAME,
    # PRODUCT_SN, QF_MANAGER_NAME, XC_DATE, XK_DATE) can be obtained from
    # the detail page, so they are not read here.
    all_id_list = [entry['ID'] for entry in json_text['list']]

    # The detail url is an ajax POST request.
    post_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
    for record_id in all_id_list:
        response = requests.post(
            url=post_url,
            data={'id': record_id},
            headers=headers,
        )

        # The endpoint answers either with text or with JSON; the
        # Content-Type header tells which one came back, and only the JSON
        # answers are parsed.
        if response.headers['Content-Type'] != 'application/json;charset=UTF-8':
            continue

        json_text = response.json()
        print(json_text['businessPerson'])