urllib is Python's built-in HTTP request library, used to send a Request. It mainly contains four basic modules: urllib.request, urllib.error, urllib.parse, and urllib.robotparser.
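To make the division of labor concrete, here is a minimal sketch importing each submodule (the role comments summarize the standard library documentation):

```python
import urllib.request      # open URLs and send requests
import urllib.error        # exceptions raised by urllib.request
import urllib.parse        # split, join, and encode URLs
import urllib.robotparser  # parse robots.txt files
```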
Although urllib is part of the standard library, it still has to be imported. After importing, you can call urllib.request.urlopen() to send a Request to the server directly. When the Request carries data, it is a POST request; otherwise it is a GET request. The detailed code is as follows:

```python
# Signature of urlopen(); in practice mostly the first three parameters are used:
# urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None,
#                        cadefault=False, context=None)

# GET request
import urllib.request  # import the library

response = urllib.request.urlopen('http://www.baidu.com')  # send the request
print(response.read().decode('utf-8'))
# If the common encodings still fail to decode the page, check the page source:
# the charset attribute in the first line of <head> usually names the encoding.

# POST request
import urllib.parse
import urllib.request

data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf8')  # POST carries a data payload that GET lacks
response = urllib.request.urlopen('http://httpbin.org/post', data=data)
print(response.read())

# Setting a timeout
import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)  # give the server 0.1 s to respond
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):  # check the error type
        print('TIME OUT')
```
urlopen() can send a Request, but it does not directly support further configuration such as custom request headers. In that case, first declare a Request object, fill in the relevant information, and finally pass the Request object to urlopen():
```python
from urllib import request, parse  # import the needed modules

url = 'http://httpbin.org/post'  # target URL
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Host': 'httpbin.org'
}  # request headers
form = {
    'name': 'Germey'
}  # form data
data = bytes(parse.urlencode(form), encoding='utf8')  # encode the form data as a byte stream
req = request.Request(url=url, data=data, headers=headers, method='POST')  # build the Request object
# If req lacks a header, urllib also provides the add_header method:
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
response = request.urlopen(req)  # pass the Request to urlopen
print(response.read().decode('utf-8'))
```
From the response the server sends back, we can obtain its type, status code, and response headers.
```python
import urllib.request

response = urllib.request.urlopen('https://www.python.org')
print(type(response))                    # type of the response object
print(response.status)                   # status code
print(response.getheaders())             # all response headers
print(response.getheader('Server'))      # a single response header
print(response.read().decode('utf-8'))   # the response body
```
Beyond ordinary Request content, urllib provides many extra features, which are usually implemented with handlers.
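For example, a handler can supply HTTP basic-auth credentials before opening a page. Below is a minimal sketch; the endpoint, user, and password are placeholders for illustration, not from the original text:

```python
from urllib.request import (HTTPPasswordMgrWithDefaultRealm,
                            HTTPBasicAuthHandler, build_opener)

# Hypothetical protected endpoint and credentials, for illustration only
url = 'http://httpbin.org/basic-auth/user/passwd'
password_mgr = HTTPPasswordMgrWithDefaultRealm()
password_mgr.add_password(None, url, 'user', 'passwd')  # None = any realm

auth_handler = HTTPBasicAuthHandler(password_mgr)
opener = build_opener(auth_handler)  # handlers are chained into an opener
response = opener.open(url)
print(response.read().decode('utf-8'))
```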
To set a proxy, first create a ProxyHandler, build it into an opener, and open the page with the open() method. Internally, the urlopen() used above does the same thing: it builds an opener and then calls open() to fetch the page.
```python
import urllib.request

proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743'
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('http://httpbin.org/get')
print(response.read())
```
Cookies maintain the logged-in state of a page and are used to crawl sites that require login. Common ways to handle cookies look like this:
```python
import http.cookiejar, urllib.request

cookie = http.cookiejar.CookieJar()  # first create a CookieJar
handler = urllib.request.HTTPCookieProcessor(cookie)  # let a handler manage the cookies
opener = urllib.request.build_opener(handler)  # build the opener
response = opener.open('http://www.baidu.com')  # open the page
for item in cookie:
    print(item.name + "=" + item.value)  # print the cookie values

# Save cookies to a file in Mozilla (Firefox) format
import http.cookiejar, urllib.request

filename = "cookie.txt"
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)  # save the cookies

# Load cookies from the file. The jar class must match the format the file was
# saved in (LWPCookieJar stores cookies in another, LWP, format); since the
# file above was saved in Mozilla format, load it with MozillaCookieJar.
import http.cookiejar, urllib.request

cookie = http.cookiejar.MozillaCookieJar()
cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)  # load the cookies
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))
```
urllib defines only two error classes, URLError and HTTPError; HTTPError is a subclass of URLError, so catch it first. Errors are usually handled with try-except, which catches the error and lets you check its type:
```python
from urllib import request, error

try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.URLError as e:
    print(e.reason)

# Catch HTTPError first, then fall back to URLError
from urllib import request, error

try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as e:
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request Successfully')

# e.reason can itself be an exception object, e.g. socket.timeout
import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('https://www.baidu.com', timeout=0.01)
except urllib.error.URLError as e:
    print(type(e.reason))
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')
```
urllib.parse is like a toolbox packed with URL-handling utilities.
```python
# Get the components of a URL
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(result), result)

# Supply default URL info; if the URL already contains it, the default is ignored
from urllib.parse import urlparse

result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme='https')
print(result)

# You can change how the URL is split by declaring that a component does not exist
from urllib.parse import urlparse

result = urlparse('http://www.baidu.com/index.html;user?id=5#comment', allow_fragments=False)  # treat fragments as nonexistent
print(result)
```
```python
# urlunparse assembles a URL from its six components
from urllib.parse import urlunparse

data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
print(urlunparse(data))
```
```python
# urljoin resolves a (possibly relative) URL against a base URL;
# components present in the second argument take precedence
from urllib.parse import urljoin

print(urljoin('http://www.baidu.com', 'FAQ.html'))
print(urljoin('http://www.baidu.com', 'https://cuiqingcai.com/FAQ.html'))
print(urljoin('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html'))
print(urljoin('http://www.baidu.com/about.html', 'https://cuiqingcai.com/FAQ.html?question=2'))
print(urljoin('http://www.baidu.com?wd=abc', 'https://cuiqingcai.com/index.php'))
print(urljoin('http://www.baidu.com', '?category=2#comment'))
print(urljoin('www.baidu.com', '?category=2#comment'))
print(urljoin('www.baidu.com#comment', '?category=2'))
```
```python
# urlencode serializes a dict into a query string
from urllib.parse import urlencode

params = {
    'name': 'germey',
    'age': 22
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)  # concatenate to build the final URL
print(url)
```
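Not covered above, but from the same toolbox: quote() and unquote() convert non-ASCII text to and from URL-safe percent-encoding. A minimal sketch (the search URL is just an illustration):

```python
from urllib.parse import quote, unquote

# Percent-encode a non-ASCII keyword so it can be embedded in a URL
keyword = '壁纸'
url = 'https://www.baidu.com/s?wd=' + quote(keyword)
print(url)  # https://www.baidu.com/s?wd=%E5%A3%81%E7%BA%B8

# unquote reverses the encoding
print(unquote(url))
```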