#!/usr/bin/env python
# coding=utf-8
"""Notes on, and a small demo of, the ``urllib2`` module.

urllib2 can loosely be thought of as an enhanced version of urllib, but
because urllib provides functions that urllib2 lacks, urllib2 cannot fully
replace it. Neither module can substitute for the other; they have to be
used together.

Differences between urllib and urllib2:

* urllib2 can modify request headers through ``Request``, which means it can
  pose as a browser by changing the User-Agent.
* urllib provides ``urlencode`` for encoding parameters; when simulating a
  login that needs encoded parameters, only urllib can do it.
* urllib provides a series of functions such as ``urlretrieve`` and ``quote``
  that urllib2 does not have.

Reference
=========

urllib2 module: https://docs.python.org/2/library/urllib2.html
    urllib2.urlopen(url[, data[, timeout[, cafile[, capath[, cadefault[, context]]]]])
    urllib2.install_opener(opener)
    urllib2.build_opener([handler, ...])
    exception urllib2.URLError
    exception urllib2.HTTPError

Request class:
    class urllib2.Request(url[, data][, headers][, origin_req_host][, unverifiable])
    Request.add_data(data)
    Request.get_method()
    Request.has_data()
    Request.get_data()
    Request.add_header(key, val)
    Request.add_unredirected_header(key, header)
    Request.has_header(header)
    Request.get_full_url()
    Request.get_type()
    Request.get_host()
    Request.get_selector()
    Request.get_header(header_name, default=None)
    Request.header_items()
    Request.set_proxy(host, type)
    Request.get_origin_req_host()
    Request.is_unverifiable()

OpenerDirector class:
    class urllib2.OpenerDirector
    OpenerDirector.add_handler(handler)
    OpenerDirector.open(url[, data][, timeout])
    OpenerDirector.error(proto[, arg[, ...]])

BaseHandler class:
    class urllib2.BaseHandler
    BaseHandler.add_parent(director)
    BaseHandler.close()
    BaseHandler.parent
    BaseHandler.default_open(req)
    BaseHandler.protocol_open(req)
    BaseHandler.unknown_open(req)
    BaseHandler.http_error_default(req, fp, code, msg, hdrs)
    BaseHandler.http_error_nnn(req, fp, code, msg, hdrs)
    BaseHandler.protocol_request(req)
    BaseHandler.protocol_response(req, response)

HTTPDefaultErrorHandler class:
    class urllib2.HTTPDefaultErrorHandler

HTTPRedirectHandler class:
    class urllib2.HTTPRedirectHandler
    HTTPRedirectHandler.redirect_request(req, fp, code, msg, hdrs, newurl)
    HTTPRedirectHandler.http_error_301(req, fp, code, msg, hdrs)
    HTTPRedirectHandler.http_error_302(req, fp, code, msg, hdrs)
    HTTPRedirectHandler.http_error_303(req, fp, code, msg, hdrs)
    HTTPRedirectHandler.http_error_307(req, fp, code, msg, hdrs)

HTTPCookieProcessor class:
    class urllib2.HTTPCookieProcessor([cookiejar])
    HTTPCookieProcessor.cookiejar

ProxyHandler class:
    class urllib2.ProxyHandler([proxies])
    ProxyHandler.protocol_open(request)

HTTPPasswordMgr class:
    class urllib2.HTTPPasswordMgr
    HTTPPasswordMgr.add_password(realm, uri, user, passwd)
    HTTPPasswordMgr.find_user_password(realm, authuri)

HTTPPasswordMgrWithDefaultRealm class:
    class urllib2.HTTPPasswordMgrWithDefaultRealm

AbstractBasicAuthHandler class:
    class urllib2.AbstractBasicAuthHandler([password_mgr])
    AbstractBasicAuthHandler.http_error_auth_reqed(authreq, host, req, headers)

HTTPBasicAuthHandler class:
    class urllib2.HTTPBasicAuthHandler([password_mgr])
    HTTPBasicAuthHandler.http_error_401(req, fp, code, msg, hdrs)

ProxyBasicAuthHandler class:
    class urllib2.ProxyBasicAuthHandler([password_mgr])
    ProxyBasicAuthHandler.http_error_407(req, fp, code, msg, hdrs)

AbstractDigestAuthHandler class:
    class urllib2.AbstractDigestAuthHandler([password_mgr])
    AbstractDigestAuthHandler.http_error_auth_reqed(authreq, host, req, headers)

HTTPDigestAuthHandler class:
    class urllib2.HTTPDigestAuthHandler([password_mgr])
    HTTPDigestAuthHandler.http_error_401(req, fp, code, msg, hdrs)

ProxyDigestAuthHandler class:
    class urllib2.ProxyDigestAuthHandler([password_mgr])
    ProxyDigestAuthHandler.http_error_407(req, fp, code, msg, hdrs)

HTTPHandler class:
    class urllib2.HTTPHandler
    HTTPHandler.http_open(req)

HTTPSHandler class:
    class urllib2.HTTPSHandler([debuglevel[, context]])
    HTTPSHandler.https_open(req)

FileHandler class:
    class urllib2.FileHandler
    FileHandler.file_open(req)

FTPHandler class:
    class urllib2.FTPHandler
    FTPHandler.ftp_open(req)

CacheFTPHandler class:
    class urllib2.CacheFTPHandler
    CacheFTPHandler.setTimeout(t)
    CacheFTPHandler.setMaxConns(m)

UnknownHandler class:
    class urllib2.UnknownHandler
    UnknownHandler.unknown_open()

HTTPErrorProcessor class:
    class urllib2.HTTPErrorProcessor
    HTTPErrorProcessor.http_response()
    HTTPErrorProcessor.https_response()
"""

from contextlib import closing

try:
    import urllib2
except ImportError:
    # Python 3 has no urllib2 module; urlopen and Request live in
    # urllib.request with the same call signatures used below.
    import urllib.request as urllib2


def test_urllib2(url='https://www.baidu.com', timeout=10):
    """Fetch ``url`` twice with urllib2 and print what comes back.

    The first request uses urllib2's default headers and prints the first
    100 bytes of the body. The second request sends a Firefox User-Agent
    through a ``Request`` object and prints the full body, the response
    code, the URL and the response headers.

    Args:
        url: Address to fetch. Defaults to the page the demo originally
            hard-coded.
        timeout: Seconds to wait on each request, so an unreachable host
            cannot block the demo forever.

    Raises:
        urllib2.URLError: Propagated from ``urlopen`` when a request fails.
    """
    # NOTE: every print() below takes exactly one argument, so the output is
    # the same under Python 2's print statement and Python 3's print function.

    # Fetch the page and show the first 100 bytes.
    # closing() guarantees the connection is released; Python 2 response
    # objects are not context managers on their own.
    with closing(urllib2.urlopen(url, timeout=timeout)) as f:
        print(f.read(100))

    # Set the request headers.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0'}
    # Build a Request object carrying those headers.
    req = urllib2.Request(url=url, headers=headers)
    # Open the Request object to receive the page.
    with closing(urllib2.urlopen(req, timeout=timeout)) as resp:
        # Read the page body.
        html = resp.read()
        print('*' * 200)
        print(html)
        print('*' * 200)
        print(resp.getcode())  # response code
        print(resp.geturl())  # URL
        print(resp.info())  # response headers


if __name__ == '__main__':
    test_urllib2()
# Source code is available on GitHub: https://github.com/gkimeeq/PythonLearning