urllib

Notes from learning the urllib library. urllib is a package that collects several modules for working with URLs:

urllib.request : for opening and reading URLs
urllib.error : contains the exceptions raised by urllib.request
urllib.parse : for parsing URLs
urllib.robotparser : for parsing robots.txt files (see the sketch after this list)
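urllib.robotparser is not demonstrated elsewhere in these notes, so here is a minimal sketch; the example.com URL and path are placeholders, not taken from the original post:

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://www.example.com/robots.txt')   # placeholder URL
rp.read()                                         # fetch and parse robots.txt
# may a crawler with any User-Agent ('*') fetch this path?
print(rp.can_fetch('*', 'http://www.example.com/index.html'))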
urllib.request
urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)
url : the URL to open
data : the data submitted with a POST request
timeout : the timeout, in seconds, for accessing the site

Basic usage of a GET request
import urllib.request

response = urllib.request.urlopen('http://www.baidu.com')
# response.read() returns bytes
print(response.read().decode('utf-8'))
POST request

# 1. Prepare the data
import urllib.parse
data = bytes(urllib.parse.urlencode({'hello': 'word'}), encoding='utf-8')
# result after encoding: b'hello=word'

# 2. Send the request with urllib.request.urlopen()
import urllib.request
response = urllib.request.urlopen('http://www.httpbin.org/post', data=data)
response._method   # the request method: 'POST'
response.url       # the request URL: 'http://www.httpbin.org/post'
import urllib.request

response = urllib.request.urlopen('http://httpbin.org/get', timeout=1)
print(response.read())

import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
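urllib.error also defines HTTPError, a subclass of URLError raised for non-2xx responses. A minimal sketch of handling both, assuming httpbin's /status/404 endpoint is reachable (this example is not from the original post):

import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('http://httpbin.org/status/404')
except urllib.error.HTTPError as e:   # HTTP status errors (404, 500, ...)
    print(e.code, e.reason)
except urllib.error.URLError as e:    # network-level failures (DNS, refused connection, timeout)
    print(e.reason)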
Response
print(dir(response))
['__abstractmethods__', '__class__', '__del__', '__delattr__', '__dict__', '__dir__', '__doc__', '__enter__', '__eq__', '__exit__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__next__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '_abc_cache', '_abc_negative_cache', '_abc_negative_cache_version', '_abc_registry', '_checkClosed', '_checkReadable', '_checkSeekable', '_checkWritable', '_check_close', '_close_conn', '_get_chunk_left', '_method', '_peek_chunked', '_read1_chunked', '_read_and_discard_trailer', '_read_next_chunk_size', '_read_status', '_readall_chunked', '_readinto_chunked', '_safe_read', '_safe_readinto', 'begin', 'chunk_left', 'chunked', 'close', 'closed', 'code', 'debuglevel', 'detach', 'fileno', 'flush', 'fp', 'getcode', 'getheader', 'getheaders', 'geturl', 'headers', 'info', 'isatty', 'isclosed', 'length', 'msg', 'peek', 'read', 'read1', 'readable', 'readinto', 'readinto1', 'readline', 'readlines', 'reason', 'seek', 'seekable', 'status', 'tell', 'truncate', 'url', 'version', 'will_close', 'writable', 'write', 'writelines']
response.status        # 200
response.getcode()     # 200
response.code          # 200
response.url           # 'http://www.baidu.com'
response._method       # 'GET'
response.getheaders()
[('Access-Control-Allow-Credentials', 'true'),
 ('Access-Control-Allow-Origin', '*'),
 ('Content-Type', 'application/json'),
 ('Date', 'Fri, 14 Jun 2019 02:33:18 GMT'),
 ('Referrer-Policy', 'no-referrer-when-downgrade'),
 ('Server', 'nginx'),
 ('X-Content-Type-Options', 'nosniff'),
 ('X-Frame-Options', 'DENY'),
 ('X-XSS-Protection', '1; mode=block'),
 ('Content-Length', '226'),
 ('Connection', 'Close')]
response.getheader('Server')   # 'nginx'
Request
Source code
class Request:
    def __init__(self, url, data=None, headers={},
                 origin_req_host=None, unverifiable=False,
                 method=None):
        ...

In [3]: url = 'https://www.baidu.com/'

In [4]: req = urllib.request.Request(url=url)

In [5]: dir(req)
Out[5]:
['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_data', '_full_url', '_parse', '_tunnel_host', 'add_header', 'add_unredirected_header', 'data', 'fragment', 'full_url', 'get_full_url', 'get_header', 'get_method', 'has_header', 'has_proxy', 'header_items', 'headers', 'host', 'origin_req_host', 'remove_header', 'selector', 'set_proxy', 'type', 'unredirected_hdrs', 'unverifiable']
data : the POST data to send
headers={} : builds the request headers

import urllib.request
import urllib.parse

url = 'http://httpbin.org/post'

# request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
headers['Host'] = 'httpbin.org'

# data must be bytes
dict = {'name': 'Germey'}
data = urllib.parse.urlencode(dict).encode('utf-8')

# instantiate the request object with these arguments
request = urllib.request.Request(url=url, data=data, headers=headers, method='POST')
print(request)
# <urllib.request.Request object at 0x000002404A9689E8>

##############################################################
# The code above only builds the request object; nothing has been sent yet
##############################################################

# send the request object and get back the response object
response = urllib.request.urlopen(request)
print(response)
# <http.client.HTTPResponse object at 0x000002404AFBC358>
# add headers
from urllib import request, parse

url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Host': 'httpbin.org'
}
# build the POST form data
dict = {'name': 'Germey'}
data = bytes(parse.urlencode(dict), encoding='utf8')

req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
Adding headers with the add_header method

import urllib.request

req = urllib.request.Request('http://www.example.com/')
req.add_header('Referer', 'http://www.python.org/')
# Customize the default User-Agent header value:
req.add_header('User-Agent', 'urllib-example/0.1 (Contact: . . .)')
r = urllib.request.urlopen(req)
Cookie

import http.cookiejar, urllib.request

cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("http://www.baidu.com")
for item in cookie:
    print(item.name + "=" + item.value)

# save the cookies as a text file
import http.cookiejar, urllib.request

filename = "cookie.txt"
# several formats are available
## format 1
cookie = http.cookiejar.MozillaCookieJar(filename)
## format 2
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("http://www.baidu.com")
cookie.save(ignore_discard=True, ignore_expires=True)   # write the cookies to cookie.txt

# load the cookies back with the matching class
import http.cookiejar, urllib.request

cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open("http://www.baidu.com")
Proxy

from urllib import request

url = 'http://httpbin.org/ip'
proxy = {'http': '218.18.232.26:80', 'https': '218.18.232.26:80'}
# create the proxy handler
proxies = request.ProxyHandler(proxy)
# create the opener object
opener = request.build_opener(proxies)
resp = opener.open(url)
print(resp.read().decode())
urllib.parse
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes",
           "DefragResult", "ParseResult", "SplitResult",
           "DefragResultBytes", "ParseResultBytes", "SplitResultBytes"]
urlparse
urlunparse
quote/quote_plus
unquote/unquote_plus
urljoin
urlencode
parse_qs/parse_qsl
URL parsing with urlparse
In [8]: from urllib.parse import urlparse

In [9]: o = urlparse('https://docs.python.org/3/library/urllib.parse.html')
'''
Splits the URL into six parts and returns a 6-item tuple of strings:
    scheme   : protocol
    netloc   : network location
    path     : path
    params   : parameters
    query    : query string
    fragment : fragment
The output is shown below.
'''

In [10]: o
Out[10]: ParseResult(scheme='https', netloc='docs.python.org', path='/3/library/urllib.parse.html', params='', query='', fragment='')

In [11]: dir(o)
Out[11]:
['__add__', '__class__', '__contains__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getnewargs__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__rmul__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '_asdict', '_encoded_counterpart', '_fields', '_hostinfo', '_make', '_replace', '_source', '_userinfo', 'count', 'encode', 'fragment', 'geturl', 'hostname', 'index', 'netloc', 'params', 'password', 'path', 'port', 'query', 'scheme', 'username']

In [12]: o.path
Out[12]: '/3/library/urllib.parse.html'

In [13]: o.scheme
Out[13]: 'https'

In [14]: o.geturl()
Out[14]: 'https://docs.python.org/3/library/urllib.parse.html'

from urllib import parse

url = "https://docs.python.org/3.5/library/urllib.parse.html?highlight=parse#module-urllib.parse"
result = parse.urlparse(url)
print(result.query)                   # the query-string part of the result
print(parse.parse_qs(result.query))   # convert the query string to a dict
print(parse.parse_qsl(result.query))  # convert the query string to a list of tuples
URL unparsing with urlunparse
In [15]: o
Out[15]: ParseResult(scheme='https', netloc='docs.python.org', path='/3/library/urllib.parse.html', params='', query='', fragment='')

In [16]: from urllib.parse import urlunparse

In [17]: urlunparse(o)
Out[17]: 'https://docs.python.org/3/library/urllib.parse.html'

# a plain 6-item list works too: list(o)
In [18]: urlunparse(list(o))
Out[18]: 'https://docs.python.org/3/library/urllib.parse.html'
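urljoin, listed among the functions above, is not shown anywhere in the original note; a minimal sketch of combining a base URL with a relative reference (the URLs here are only examples):

from urllib.parse import urljoin

# the second argument replaces the last path segment of the base URL
print(urljoin('https://docs.python.org/3/library/urllib.parse.html', 'urllib.request.html'))
# https://docs.python.org/3/library/urllib.request.html

# an absolute reference overrides the base entirely
print(urljoin('https://docs.python.org/3/library/', 'https://www.python.org/'))
# https://www.python.org/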
Query-string parsing with parse_qs / parse_qsl
# parse_qs and parse_qsl are imported from urllib.parse
In [52]: parse_qs('https://i.cnblogs.com/EditPosts.aspx?opt=1')
Out[52]: {'https://i.cnblogs.com/EditPosts.aspx?opt': ['1']}

In [53]: parse_qsl('https://i.cnblogs.com/EditPosts.aspx?opt=1')
Out[53]: [('https://i.cnblogs.com/EditPosts.aspx?opt', '1')]
URL encoding with quote / unquote
Help on function quote in module urllib.parse:

quote(string, safe='/', encoding=None, errors=None)
    quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects. encoding and errors
    must not be specified if string is a bytes object.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).

In [26]: search = '搜索内容'

In [27]: quote(search)
Out[27]: '%E6%90%9C%E7%B4%A2%E5%86%85%E5%AE%B9'
URL decoding with unquote / unquote_plus
In [41]: from urllib import parse

In [42]: parse.quote('a&b/c')
Out[42]: 'a%26b/c'      # the slash is not encoded

In [43]: parse.quote_plus('a&b/c')
Out[43]: 'a%26b%2Fc'    # the slash is encoded as well
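The transcript above only contrasts quote and quote_plus; a minimal sketch of decoding those values back with unquote and unquote_plus (not in the original note):

from urllib import parse

print(parse.unquote('a%26b%2Fc'))         # a&b/c
print(parse.unquote_plus('a%26b%2Fc'))    # a&b/c
print(parse.unquote_plus('hello+world'))  # hello world  (unquote_plus also turns '+' into a space)
print(parse.unquote('%E6%90%9C%E7%B4%A2%E5%86%85%E5%AE%B9'))  # 搜索内容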
Building query strings with urlencode
In [44]: query = {
    ...:     'name': 'Lee',
    ...:     'age': 19,
    ...: }

In [45]: type(query)
Out[45]: dict

In [46]: parse.urlencode(query)
Out[46]: 'name=Lee&age=19'
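When a value is itself a sequence, urlencode accepts doseq=True to emit one key=value pair per element; a minimal sketch (not in the original note):

from urllib import parse

query = {'name': 'Lee', 'tag': ['python', 'urllib']}
print(parse.urlencode(query, doseq=True))
# name=Lee&tag=python&tag=urllib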
Sending a GET request

>>> import urllib.request
>>> import urllib.parse
>>> params = urllib.parse.urlencode({'spam': 1, 'eggs': 2, 'bacon': 0})
>>> url = "http://www.musi-cal.com/cgi-bin/query?%s" % params
>>> with urllib.request.urlopen(url) as f:
...     print(f.read().decode('utf-8'))
Sending a POST request

>>> import urllib.request
>>> import urllib.parse
>>> data = urllib.parse.urlencode({'spam': 1, 'eggs': 2, 'bacon': 0})
>>> data = data.encode('ascii')
>>> with urllib.request.urlopen("http://requestb.in/xrbl82xr", data) as f:
...     print(f.read().decode('utf-8'))
References

Official documentation: https://docs.python.org/3/library/urllib.request.html
GitHub source: https://github.com/python/cpython/blob/3.7/Lib/urllib/request.py
Blogger Coder: https://www.cnblogs.com/zhaof/p/6910871.html
Blogger Hoptop: https://www.jianshu.com/u/9ea40b5f607a
Blogger 支付宝: http://www.pianshen.com/article/2667231307/
脚本之家: https://www.jb51.net/article/128540.htm