Introduction
Python's standard library provides urllib, urllib2, httplib and other modules for making HTTP requests, but their APIs are clunky. They were built for another era and another internet, and they demand an enormous amount of work, including overriding various methods, to accomplish even the simplest tasks.
Requests is an Apache2-licensed HTTP library written in Python. It is a high-level wrapper around Python's built-in modules, which makes issuing network requests from Python far more pleasant; with Requests you can easily do anything a browser can do.
The essence of a crawler: imitate the behavior of a browser and scrape information from web pages.
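For instance, a minimal sketch of "imitating the browser": send the kind of User-Agent header a real browser sends, so the server treats the script like an ordinary visitor (httpbin.org is used here only as an echo service, not part of the original notes).

import requests

# Pretend to be a regular desktop browser; many sites reject the default
# python-requests User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
}
ret = requests.get('http://httpbin.org/get', headers=headers)
print(ret.status_code)
print(ret.text)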
Request methods

1. GET requests

# 1. Without parameters
import requests

ret = requests.get('https://github.com/timeline.json')
print(ret.url)
print(ret.text)

# 2. With parameters
import requests

payload = {'key1': 'value1', 'key2': 'value2'}
ret = requests.get("http://httpbin.org/get", params=payload)
print(ret.url)
print(ret.text)
2. POST requests
# 1. Basic POST example
import requests

payload = {'key1': 'value1', 'key2': 'value2'}
ret = requests.post("http://httpbin.org/post", data=payload)
print(ret.text)

# 2. Sending request headers and data
import requests
import json

url = 'https://api.github.com/some/endpoint'
payload = {'some': 'data'}
headers = {'content-type': 'application/json'}

ret = requests.post(url, data=json.dumps(payload), headers=headers)
print(ret.text)
print(ret.cookies)
3. Other requests
requests.get(url, params=None, **kwargs)
requests.post(url, data=None, json=None, **kwargs)
requests.put(url, data=None, **kwargs)
requests.head(url, **kwargs)
requests.delete(url, **kwargs)
requests.patch(url, data=None, **kwargs)
requests.options(url, **kwargs)

# All of the methods above are built on top of this one
requests.request(method, url, **kwargs)
Request parameters
1  url
2  headers
3  cookies
4  params
5  data — the request body
       requests.post(
           ...,
           data={'user': 'alex', 'pwd': '123'}
       )
       # Raw request, roughly:
       # POST /index HTTP/1.1\r\nhost:c1.com\r\n\r\nuser=alex&pwd=123

6  json — the request body, serialized as JSON
       requests.post(
           ...,
           json={'user': 'alex', 'pwd': '123'}
       )
       # Raw request, roughly:
       # POST /index HTTP/1.1\r\nhost:c1.com\r\nContent-Type:application/json\r\n\r\n{"user": "alex", "pwd": "123"}

7  proxies — proxy servers
       # Without authentication
       proxie_dict = {
           "http": "61.172.249.96:80",
           "https": "http://61.185.219.126:3128",
       }
       ret = requests.get("https://www.proxy360.cn/Proxy", proxies=proxie_dict)

       # With proxy authentication
       from requests.auth import HTTPProxyAuth

       proxyDict = {
           'http': '77.75.105.165',
           'https': '77.75.106.165'
       }
       auth = HTTPProxyAuth('username', 'password')

       r = requests.get("http://www.google.com", data={'xxx': 'ffff'}, proxies=proxyDict, auth=auth)
       print(r.text)

8  files — file upload
       file_dict = {
           'f1': open('xxxx.log', 'rb')
       }
       requests.request(
           method='POST',
           url='http://127.0.0.1:8000/test/',
           files=file_dict
       )

9  auth — authentication
       Internally, the username and password are encoded and placed in a request header sent to the backend:
       - "user:password"
       - base64("user:password")
       - "Basic base64("user:password")"
       - request header: Authorization: "Basic base64("user:password")"

       from requests.auth import HTTPBasicAuth, HTTPDigestAuth

       ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('wupeiqi', 'sdfasdfasdf'))
       print(ret.text)

10 timeout — request timeout
       # ret = requests.get('http://google.com/', timeout=1)
       # print(ret)

       # ret = requests.get('http://google.com/', timeout=(5, 1))   # (connect timeout, read timeout)
       # print(ret)

11 allow_redirects — whether to follow redirects
       ret = requests.get('http://127.0.0.1:8000/test/', allow_redirects=False)
       print(ret.text)

12 stream — streaming download of large responses
       from contextlib import closing
       with closing(requests.get('http://httpbin.org/get', stream=True)) as r1:
           # Process the response here.
           for i in r1.iter_content():
               print(i)

13 cert — client-side certificate
       - Baidu, Tencent, etc. => no certificate needed (the system takes care of it)
       - Custom certificate:
         requests.get('http://127.0.0.1:8000/test/', cert="xxxx/xxx/xxx.pem")
         requests.get('http://127.0.0.1:8000/test/', cert=("xxxx/xxx/xxx.pem", "xxx.xxx.xx.key"))

14 verify — whether to verify the server's SSL certificate (verify=False skips verification; see the sketch after this list)
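As a small illustration of the last item, here is a hedged sketch of verify in both directions; the local URL and the CA-bundle path are placeholders, not part of the original notes.

import requests

# verify=False skips TLS certificate verification (handy against a
# self-signed certificate during development); urllib3 will usually emit an
# InsecureRequestWarning. The URL is only a placeholder.
ret = requests.get('https://127.0.0.1:8000/test/', verify=False)
print(ret.status_code)

# Alternatively, point verify at a CA bundle to validate against a custom CA
# (the path below is hypothetical).
# ret = requests.get('https://127.0.0.1:8000/test/', verify='xxxx/xxx/ca.pem')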
About the auth parameter
auth — authentication: the browser's pop-up login dialog.
Internally, the username and password are encoded and placed in a request header sent to the backend:
- "user:password"
- base64("user:password")
- "Basic base64("user:password")"
- request header: Authorization: "Basic base64("user:password")"

requests' HTTPBasicAuth does all of the above for you:

from requests.auth import HTTPBasicAuth, HTTPDigestAuth

ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('wupeiqi', 'sdfasdfasdf'))
print(ret.text)
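To make the steps above concrete, here is a minimal sketch that builds the Authorization header by hand; it should produce the same header HTTPBasicAuth generates (the credentials are the placeholders from the example above).

import base64

import requests

user, password = 'wupeiqi', 'sdfasdfasdf'   # placeholder credentials

# "user:password" -> base64("user:password") -> "Basic base64(...)"
token = base64.b64encode('{}:{}'.format(user, password).encode('utf-8')).decode('ascii')
headers = {'Authorization': 'Basic ' + token}

ret = requests.get('https://api.github.com/user', headers=headers)
print(ret.status_code)   # 401 unless the credentials are real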
Parameter examples:

def param_method_url():
    ret = requests.request(method='get', url='http://127.0.0.1:8000/test/')
    ret = requests.request(method='post', url='http://127.0.0.1:8000/test/')
import requests

requests.get(url='http://127.0.0.1:8000/test/',
             params={'k1': 'v1', 'k2': 'v2'})
# Essentially the same as requests.get(url='xxxxx?k1=v1&k2=v2')
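A quick way to see this equivalence is to print the final URL that requests builds; httpbin.org is used here only as a convenient echo service.

import requests

ret = requests.get('http://httpbin.org/get', params={'k1': 'v1', 'k2': 'v2'})
# The params dict is URL-encoded and appended to the query string.
print(ret.url)   # http://httpbin.org/get?k1=v1&k2=v2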
# data can be a dict
# data can be a string
# data can be bytes
# data can be a file object

# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  data={'k1': 'v1', 'k2': '水电费'})

# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  data="k1=v1; k2=v2; k3=v3; k3=v4"
#                  )

# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  data="k1=v1;k2=v2;k3=v3;k3=v4",
#                  headers={'Content-Type': 'application/x-www-form-urlencoded'}
#                  )

# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  data=open('data_file.py', mode='r', encoding='utf-8'),  # file content: k1=v1;k2=v2;k3=v3;k3=v4
#                  headers={'Content-Type': 'application/x-www-form-urlencoded'}
#                  )
# If the request body should be a JSON payload, pass it via json=
requests.request(method='POST',
                 url='http://127.0.0.1:8000/test/',
                 json={'k1': 'v1', 'k2': '水电费'})
ret1 = requests.get(
    url='https://dig.chouti.com/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }
)
ret1_cookies = ret1.cookies.get_dict()
# ret1.cookies is the cookie jar returned for this URL;
# get_dict() turns it into a plain dict.
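Those cookies can then be handed back to the server on a later request through the cookies parameter, which is what the login examples further down rely on. A minimal sketch, using httpbin's cookie endpoints as a stand-in server:

import requests

# The 302 response from /cookies/set carries the Set-Cookie header,
# so allow_redirects=False lets us read the cookie from this response.
ret1 = requests.get('http://httpbin.org/cookies/set?sid=abc123', allow_redirects=False)
saved = ret1.cookies.get_dict()   # {'sid': 'abc123'}

# Send the saved cookies along with the next request.
ret2 = requests.get('http://httpbin.org/cookies', cookies=saved)
print(ret2.text)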
# Send request headers to the server
requests.request(method='POST',
                 url='http://127.0.0.1:8000/test/',
                 json={'k1': 'v1', 'k2': '水电费'},
                 headers={'Content-Type': 'application/x-www-form-urlencoded'}
                 )
# Which headers are required depends on the server.
# Send a file
# file_dict = {
#     'f1': open('readme', 'rb')
# }
# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  files=file_dict)

# Send a file with a custom file name
# file_dict = {
#     'f1': ('test.txt', open('readme', 'rb'))
# }
# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  files=file_dict)

# Send a file with a custom file name and literal content
# file_dict = {
#     'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf")
# }
# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  files=file_dict)

# Send a file with a custom file name, content, content type and extra headers
# file_dict = {
#     'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf", 'application/text', {'k1': '0'})
# }
# requests.request(method='POST',
#                  url='http://127.0.0.1:8000/test/',
#                  files=file_dict)

pass
# Set a timeout; if the request takes longer than this, stop waiting.
# ret = requests.get('http://google.com/', timeout=1)
# print(ret)

# ret = requests.get('http://google.com/', timeout=(5, 1))   # (connect timeout, read timeout)
# print(ret)
pass
# Whether to follow redirects; defaults to True
ret = requests.get('http://127.0.0.1:8000/test/', allow_redirects=False)
print(ret.text)
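When redirects are not followed, the useful information is usually the status code and the Location header rather than the body; a small sketch, using httpbin's redirect endpoint as a stand-in:

import requests

ret = requests.get('http://httpbin.org/redirect/1', allow_redirects=False)
print(ret.status_code)               # 302
print(ret.headers.get('Location'))   # where the server wants to send us
print(ret.history)                   # empty, because nothing was followed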
BeautifulSoup

BeautifulSoup is a module that takes an HTML or XML string, parses it, and then lets you use the methods it provides to quickly locate specific elements, which makes finding elements in HTML or XML documents straightforward.
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
asdf
    <div class="title">
        <b>The Dormouse's story总共</b>
        <h1>f</h1>
    </div>
<div class="story">Once upon a time there were three little sisters; and their names were
    <a class="sister0" id="link1">Els<span>f</span>ie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</div>
ad<br/>sf
<p class="story">...</p>
</body>
</html>
"""

soup = BeautifulSoup(html_doc, 'html.parser')

# Find the first a tag
tag1 = soup.find(name='a')
# Find all a tags
tag2 = soup.find_all(name='a')
# Find the tag with id=link2
tag3 = soup.select('#link2')
Installation:
pip3 install beautifulsoup4
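Usage skeleton — the method examples below all assume a soup object built roughly like this (with the full html_doc from the example above):

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
    ...
</body>
</html>
"""

soup = BeautifulSoup(html_doc, 'html.parser')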
1. name — the tag's name

# tag = soup.find('a')
# name = tag.name  # get
# print(name)
# tag.name = 'span'  # set
# print(soup)
2. attrs — the tag's attributes

# tag = soup.find('a')
# attrs = tag.attrs  # get
# print(attrs)
# tag.attrs = {'ik': 123}  # set
# tag.attrs['id'] = 'iiiii'  # set
# print(soup)
3. children — all direct child tags

# body = soup.find('body')
# v = body.children
4. descendants — all descendant tags (children, grandchildren, ...)

# body = soup.find('body')
# v = body.descendants
5. clear — empty out all of the tag's children (the tag itself is kept)

# tag = soup.find('body')
# tag.clear()
# print(soup)
6. decompose — remove the tag and everything inside it from the tree (destroying them)

# body = soup.find('body')
# body.decompose()
# print(soup)
7. extract — remove the tag and everything inside it from the tree, and return what was removed

# body = soup.find('body')
# v = body.extract()
# print(soup)
8. decode — serialize to a string (including the current tag); decode_contents (excluding the current tag)

# body = soup.find('body')
# v = body.decode()
# v = body.decode_contents()
# print(v)
9. encode — serialize to bytes (including the current tag); encode_contents (excluding the current tag)

# body = soup.find('body')
# v = body.encode()
# v = body.encode_contents()
# print(v)
10. find — get the first matching tag

# tag = soup.find('a')
# print(tag)
# tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tag)
11. find_all — get all matching tags

# tags = soup.find_all('a')
# print(tags)

# tags = soup.find_all('a', limit=1)
# print(tags)

# tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# # tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tags)

# ####### Lists #######
# v = soup.find_all(name=['a', 'div'])
# print(v)

# v = soup.find_all(class_=['sister0', 'sister'])
# print(v)

# v = soup.find_all(text=['Tillie'])
# print(v, type(v[0]))

# v = soup.find_all(id=['link1', 'link2'])
# print(v)

# v = soup.find_all(href=['link1', 'link2'])
# print(v)

# ####### Regular expressions #######
import re
# rep = re.compile('p')
# rep = re.compile('^p')
# v = soup.find_all(name=rep)
# print(v)

# rep = re.compile('sister.*')
# v = soup.find_all(class_=rep)
# print(v)

# rep = re.compile('http://www.oldboy.com/static/.*')
# v = soup.find_all(href=rep)
# print(v)

# ####### Filtering with a function #######
# def func(tag):
#     return tag.has_attr('class') and tag.has_attr('id')
# v = soup.find_all(name=func)
# print(v)

# ## get — read a tag attribute
# tag = soup.find('a')
# v = tag.get('id')
# print(v)
12. has_attr — check whether the tag has a given attribute

# tag = soup.find('a')
# v = tag.has_attr('id')
# print(v)
13. get_text — get the text inside the tag

# tag = soup.find('a')
# v = tag.get_text('id')   # the argument is used as a separator between text fragments
# print(v)
14. index — the position of a child within a tag

# tag = soup.find('body')
# v = tag.index(tag.find('div'))
# print(v)

# tag = soup.find('body')
# for i, v in enumerate(tag):
#     print(i, v)
15. is_empty_element — whether the tag is an empty (void / self-closing) tag,
i.e. one of: 'br', 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base'

# tag = soup.find('br')
# v = tag.is_empty_element
# print(v)
16. Related tags of the current tag

# soup.next
# soup.next_element
# soup.next_elements
# soup.next_sibling
# soup.next_siblings

# tag.previous
# tag.previous_element
# tag.previous_elements
# tag.previous_sibling
# tag.previous_siblings

# tag.parent
# tag.parents
17. Searching a tag's related tags

# tag.find_next(...)
# tag.find_all_next(...)
# tag.find_next_sibling(...)
# tag.find_next_siblings(...)

# tag.find_previous(...)
# tag.find_all_previous(...)
# tag.find_previous_sibling(...)
# tag.find_previous_siblings(...)

# tag.find_parent(...)
# tag.find_parents(...)

# These take the same parameters as find_all
18. select, select_one — CSS selectors

soup.select("title")
soup.select("p nth-of-type(3)")
soup.select("body a")
soup.select("html head title")
tag = soup.select("span,a")
soup.select("head > title")
soup.select("p > a")
soup.select("p > a:nth-of-type(2)")
soup.select("p > #link1")
soup.select("body > a")
soup.select("#link1 ~ .sister")
soup.select("#link1 + .sister")
soup.select(".sister")
soup.select("[class~=sister]")
soup.select("#link1")
soup.select("a#link2")
soup.select('a[href]')
soup.select('a[href="http://example.com/elsie"]')
soup.select('a[href^="http://example.com/"]')
soup.select('a[href$="tillie"]')
soup.select('a[href*=".com/el"]')

from bs4.element import Tag

def default_candidate_generator(tag):
    for child in tag.descendants:
        if not isinstance(child, Tag):
            continue
        if not child.has_attr('href'):
            continue
        yield child

tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator)
print(type(tags), tags)

from bs4.element import Tag

def default_candidate_generator(tag):
    for child in tag.descendants:
        if not isinstance(child, Tag):
            continue
        if not child.has_attr('href'):
            continue
        yield child

tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator, limit=1)
print(type(tags), tags)
19. Tag content

# tag = soup.find('span')
# print(tag.string)           # get
# tag.string = 'new content'  # set
# print(soup)

# tag = soup.find('body')
# print(tag.string)
# tag.string = 'xxx'
# print(soup)

# tag = soup.find('body')
# v = tag.stripped_strings   # recursively collect the text of all inner tags
# print(v)
20. append — append a tag inside the current tag

# tag = soup.find('body')
# tag.append(soup.find('a'))
# print(soup)
#
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# tag.append(obj)
# print(soup)
21. insert — insert a tag at a given position inside the current tag

# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# tag.insert(2, obj)
# print(soup)
22. insert_after, insert_before — insert after or before the current tag

# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# # tag.insert_before(obj)
# tag.insert_after(obj)
# print(soup)
23. replace_with — replace the current tag with another tag

# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('div')
# tag.replace_with(obj)
# print(soup)
24. Creating relationships between tags

# tag = soup.find('div')
# a = soup.find('a')
# tag.setup(previous_sibling=a)
# print(tag.previous_sibling)
25. wrap — wrap the current tag inside the given tag

# from bs4.element import Tag
# obj1 = Tag(name='div', attrs={'id': 'it'})
# obj1.string = '我是一个新来的'
#
# tag = soup.find('a')
# v = tag.wrap(obj1)
# print(soup)

# tag = soup.find('a')
# v = tag.wrap(soup.find('p'))
# print(soup)
26. unwrap — remove the current tag but keep what it wraps

# tag = soup.find('a')
# v = tag.unwrap()
# print(soup)
More options in the official documentation: http://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/
A batch of "auto-login" examples

抽屉新热榜 (dig.chouti.com):

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests

# ############## Approach 1 ##############
"""
# ## 1. First request any page to obtain a cookie
i1 = requests.get(url="http://dig.chouti.com/help/service")
i1_cookies = i1.cookies.get_dict()

# ## 2. Log in, carrying the previous cookie; the backend authorizes the gpsd value inside it
i2 = requests.post(
    url="http://dig.chouti.com/login",
    data={
        'phone': "8615131255089",
        'password': "xxooxxoo",
        'oneMonth': ""
    },
    cookies=i1_cookies
)

# ## 3. Upvote (only the already-authorized gpsd cookie is needed)
gpsd = i1_cookies['gpsd']
i3 = requests.post(
    url="http://dig.chouti.com/link/vote?linksId=8589523",
    cookies={'gpsd': gpsd}
)
print(i3.text)
"""

# ############## Approach 2 ##############
"""
import requests

session = requests.Session()
i1 = session.get(url="http://dig.chouti.com/help/service")
i2 = session.post(
    url="http://dig.chouti.com/login",
    data={
        'phone': "8615131255089",
        'password': "xxooxxoo",
        'oneMonth': ""
    }
)
i3 = session.post(
    url="http://dig.chouti.com/link/vote?linksId=8589523"
)
print(i3.text)
"""
GitHub:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup

# ############## Approach 1 ##############
#
# # 1. Request the login page and extract the authenticity_token
# i1 = requests.get('https://github.com/login')
# soup1 = BeautifulSoup(i1.text, features='lxml')
# tag = soup1.find(name='input', attrs={'name': 'authenticity_token'})
# authenticity_token = tag.get('value')
# c1 = i1.cookies.get_dict()
# i1.close()
#
# # 2. Submit the username and password together with the authenticity_token
# form_data = {
#     "authenticity_token": authenticity_token,
#     "utf8": "",
#     "commit": "Sign in",
#     "login": "wupeiqi@live.com",
#     'password': 'xxoo'
# }
#
# i2 = requests.post('https://github.com/session', data=form_data, cookies=c1)
# c2 = i2.cookies.get_dict()
# c1.update(c2)
# i3 = requests.get('https://github.com/settings/repositories', cookies=c1)
#
# soup3 = BeautifulSoup(i3.text, features='lxml')
# list_group = soup3.find(name='div', class_='listgroup')
#
# from bs4.element import Tag
#
# for child in list_group.children:
#     if isinstance(child, Tag):
#         project_tag = child.find(name='a', class_='mr-1')
#         size_tag = child.find(name='small')
#         temp = "Project: %s(%s); path: %s" % (project_tag.get('href'), size_tag.string, project_tag.string, )
#         print(temp)

# ############## Approach 2 ##############
# session = requests.Session()
# # 1. Request the login page and extract the authenticity_token
# i1 = session.get('https://github.com/login')
# soup1 = BeautifulSoup(i1.text, features='lxml')
# tag = soup1.find(name='input', attrs={'name': 'authenticity_token'})
# authenticity_token = tag.get('value')
# c1 = i1.cookies.get_dict()
# i1.close()
#
# # 2. Submit the username and password together with the authenticity_token
# form_data = {
#     "authenticity_token": authenticity_token,
#     "utf8": "",
#     "commit": "Sign in",
#     "login": "wupeiqi@live.com",
#     'password': 'xxoo'
# }
#
# i2 = session.post('https://github.com/session', data=form_data)
# c2 = i2.cookies.get_dict()
# c1.update(c2)
# i3 = session.get('https://github.com/settings/repositories')
#
# soup3 = BeautifulSoup(i3.text, features='lxml')
# list_group = soup3.find(name='div', class_='listgroup')
#
# from bs4.element import Tag
#
# for child in list_group.children:
#     if isinstance(child, Tag):
#         project_tag = child.find(name='a', class_='mr-1')
#         size_tag = child.find(name='small')
#         temp = "Project: %s(%s); path: %s" % (project_tag.get('href'), size_tag.string, project_tag.string, )
#         print(temp)
知乎 (zhihu.com):

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import time

import requests
from bs4 import BeautifulSoup

session = requests.Session()

i1 = session.get(
    url='https://www.zhihu.com/#signin',
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)

soup1 = BeautifulSoup(i1.text, 'lxml')
xsrf_tag = soup1.find(name='input', attrs={'name': '_xsrf'})
xsrf = xsrf_tag.get('value')

current_time = time.time()
i2 = session.get(
    url='https://www.zhihu.com/captcha.gif',
    params={'r': current_time, 'type': 'login'},
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    })

with open('zhihu.gif', 'wb') as f:
    f.write(i2.content)

captcha = input('Open zhihu.gif, then type the captcha you see: ')
form_data = {
    "_xsrf": xsrf,
    'password': 'xxooxxoo',
    "captcha": captcha,
    'email': '424662508@qq.com'
}

i3 = session.post(
    url='https://www.zhihu.com/login/email',
    data=form_data,
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)

i4 = session.get(
    url='https://www.zhihu.com/settings/profile',
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    }
)

soup4 = BeautifulSoup(i4.text, 'lxml')
tag = soup4.find(id='rename-section')
nick_name = tag.find('span', class_='name').string
print(nick_name)
博客园 (cnblogs.com):

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
import json
import base64

import rsa
import requests


def js_encrypt(text):
    b64der = 'MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQCp0wHYbg/NOPO3nzMD3dndwS0MccuMeXCHgVlGOoYyFwLdS24Im2e7YyhB0wrUsyYf0/nhzCzBK8ZC9eCWqd0aHbdgOQT6CuFQBMjbyGYvlVYU2ZP7kG9Ft6YV6oc9ambuO7nPZh+bvXH0zDKfi02prknrScAKC0XhadTHT3Al0QIDAQAB'
    der = base64.standard_b64decode(b64der)

    pk = rsa.PublicKey.load_pkcs1_openssl_der(der)
    v1 = rsa.encrypt(bytes(text, 'utf8'), pk)
    value = base64.encodebytes(v1).replace(b'\n', b'')
    value = value.decode('utf8')

    return value


session = requests.Session()

i1 = session.get('https://passport.cnblogs.com/user/signin')
rep = re.compile("'VerificationToken': '(.*)'")
v = re.search(rep, i1.text)
verification_token = v.group(1)

form_data = {
    'input1': js_encrypt('wptawy'),
    'input2': js_encrypt('asdfasdf'),
    'remember': False
}

i2 = session.post(url='https://passport.cnblogs.com/user/signin',
                  data=json.dumps(form_data),
                  headers={
                      'Content-Type': 'application/json; charset=UTF-8',
                      'X-Requested-With': 'XMLHttpRequest',
                      'VerificationToken': verification_token}
                  )

i3 = session.get(url='https://i.cnblogs.com/EditDiary.aspx')
print(i3.text)
拉勾网 (lagou.com):

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re

import requests

all_cookie = {}

# ############### 1. Fetch the login page ###############
r1 = requests.get(
    url='https://passport.lagou.com/login/login.html',
    headers={
        'Host': 'passport.lagou.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    }
)
all_cookie.update(r1.cookies.get_dict())

X_Anti_Forge_Token = re.findall(r"window.X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
X_Anti_Forge_Code = re.findall(r"window.X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]

# ############### 2. Log in with username and password ###############
r2 = requests.post(
    url='https://passport.lagou.com/login/login.json',
    headers={
        'Host': 'passport.lagou.com',
        'Referer': 'https://passport.lagou.com/login/login.html',
        'X-Anit-Forge-Code': X_Anti_Forge_Code,
        'X-Anit-Forge-Token': X_Anti_Forge_Token,
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    },
    data={
        'isValidate': True,
        'username': '15131255089',
        'password': 'ab18d270d7126ea65915cc22c0d',
        'request_form_verifyCode': '',
        'submit': '',
    },
    cookies=r1.cookies.get_dict()
)
all_cookie.update(r2.cookies.get_dict())

# ############### 3. Obtain the service ticket ###############
r3 = requests.get(
    url='https://passport.lagou.com/grantServiceTicket/grant.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    allow_redirects=False,
    cookies=all_cookie
)
all_cookie.update(r3.cookies.get_dict())

# ############### 4. Authenticate by following the redirect chain ###############
r4 = requests.get(
    url=r3.headers['Location'],
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    allow_redirects=False,
    cookies=all_cookie
)
all_cookie.update(r4.cookies.get_dict())

r5 = requests.get(
    url=r4.headers['Location'],
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    allow_redirects=False,
    cookies=all_cookie
)
all_cookie.update(r5.cookies.get_dict())

r6 = requests.get(
    url=r5.headers['Location'],
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    allow_redirects=False,
    cookies=all_cookie
)
all_cookie.update(r6.cookies.get_dict())

r7 = requests.get(
    url=r6.headers['Location'],
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    allow_redirects=False,
    cookies=all_cookie
)
all_cookie.update(r7.cookies.get_dict())

# ############### 5. View the personal page ###############
r5 = requests.get(
    url='https://www.lagou.com/resume/myresume.html',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    cookies=all_cookie
)
print('武沛齐' in r5.text)

# ############### 6. Fetch account info ###############
r6 = requests.get(
    url='https://gate.lagou.com/v1/neirong/account/users/0/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'X-L-REQ-HEADER': "{deviceType:1}",
        'Origin': 'https://account.lagou.com',
        'Host': 'gate.lagou.com',
    },
    cookies=all_cookie
)
r6_json = r6.json()
all_cookie.update(r6.cookies.get_dict())

# ############### 7. Update personal info ###############
r7 = requests.put(
    url='https://gate.lagou.com/v1/neirong/account/users/0/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Origin': 'https://account.lagou.com',
        'Host': 'gate.lagou.com',
        'X-Anit-Forge-Code': r6_json['submitCode'],
        'X-Anit-Forge-Token': r6_json['submitToken'],
        'X-L-REQ-HEADER': "{deviceType:1}",
    },
    cookies=all_cookie,
    json={"userName": "wupeiqi888", "sex": "MALE", "portrait": "images/myresume/default_headpic.png",
          "positionName": '...', "introduce": '....'}
)
print(r7.text)
A practical BeautifulSoup use case — a whitelist-based XSS filter:

from bs4 import BeautifulSoup


class XSSFilter(object):
    __instance = None

    def __init__(self):
        # XSS whitelist: allowed tags and, for each tag, the allowed attributes
        self.valid_tags = {
            "font": ['color', 'size', 'face', 'style'],
            'b': [],
            'div': [],
            "span": [],
            "table": [
                'border', 'cellspacing', 'cellpadding'
            ],
            'th': [
                'colspan', 'rowspan'
            ],
            'td': [
                'colspan', 'rowspan'
            ],
            "a": ['href', 'target', 'name'],
            "img": ['src', 'alt', 'title'],
            'p': ['align'],
            "pre": ['class'],
            "hr": ['class'],
            'strong': []
        }

    def __new__(cls, *args, **kwargs):
        # Singleton: only one filter instance is ever created
        if not cls.__instance:
            obj = object.__new__(cls)
            cls.__instance = obj
        return cls.__instance

    def process(self, content):
        soup = BeautifulSoup(content, 'html.parser')
        # Walk over every HTML tag
        for tag in soup.find_all():
            # If the tag name is not whitelisted, hide it and drop its contents
            if tag.name not in self.valid_tags:
                tag.hidden = True
                if tag.name not in ['html', 'body']:
                    tag.hidden = True
                    tag.clear()
                continue
            # Attribute whitelist for the current tag
            attr_rules = self.valid_tags[tag.name]
            keys = list(tag.attrs.keys())
            for key in keys:
                if key not in attr_rules:
                    del tag[key]

        return soup.decode()  # the filtered content


content = """
<p class='c1' id='i1'>
    asdfaa<span style="font-family:NSimSun;" class='c1'>sdf<a>a</a>sdf</span>sdf
</p>
<p>
    <strong class='c2' id='i2'>asdf</strong>
    <script>alert(123)</script>
</p>
<h2>
    asdf
</h2>
"""

content = XSSFilter().process(content)
print('content', content)