BeautifulSoup is a flexible and convenient web page parsing library. It is efficient, supports multiple parsers, and lets you extract information from a page without writing regular expressions.
Parser | Usage | Advantages | Disadvantages |
---|---|---|---|
Python standard library | BeautifulSoup(markup, 'html.parser') | Built into Python; moderate speed; good document error tolerance | Versions before Python 2.7.3 / 3.2.2 tolerate malformed documents poorly |
lxml HTML parser | BeautifulSoup(markup, 'lxml') | Fast; good document error tolerance | Requires the C library (lxml) |
lxml XML parser | BeautifulSoup(markup, 'xml') | Fast; the only parser that supports XML | Requires the C library (lxml) |
html5lib | BeautifulSoup(markup, 'html5lib') | Best error tolerance; parses documents the way a browser does; generates HTML5-format output | Slow; does not rely on external extensions |
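As a quick illustration (a minimal sketch; it assumes lxml and html5lib have been installed with pip, and the markup string is made up for the demo), the same broken markup is repaired differently depending on which parser name is passed as the second argument:

from bs4 import BeautifulSoup

markup = "<p>Hello<p>World"  # deliberately malformed: unclosed <p> tags

# 'html.parser' needs no extra install; 'lxml' and 'html5lib' are assumed installed
for parser in ('html.parser', 'lxml', 'html5lib'):
    soup = BeautifulSoup(markup, parser)
    print(parser, '->', str(soup))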
html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, "lxml")
print(soup.prettify())
print(soup.title.string)
html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.title)
print(type(soup.title))
print(soup.head)
print(soup.p)
html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.title.name)
html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.p.attrs['name'])
print(soup.p['name'])
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsio" class="sister" id="link1"></a>
<a href="http://example.com/elsio" class="sister" id="link2">Lacle</a>
<a href="http://example.com/elsio" class="sister" id="link3">Tittle</a>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.p.string)
html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.head.title.string)  # the key point: nodes can be accessed in a nested, chained way
html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.p.contents)
html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.p.children)
for i, child in enumerate(soup.p.children):  # all direct child nodes
    print(i, child)
html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.p.descendants)
for i, child in enumerate(soup.p.descendants):  # all descendant nodes
    print(i, child)
html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.a.parent)  # the parent node
html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(list(enumerate(soup.a.parents)))
html = """ <html><head><title>The Dommouse's story</title></head> <body> <p class="title" name="dromouse"> <b> The Dirmouse's story</b></p> <p class="story"> Onece upon a time there were three little sisters; and their names were <a href = "http://example.com/elsio" class = "sister" id="link1"></a> <a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a> <a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a> and they lived at the bottom of a well.</p> <p class="story> ...</p> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(list(enumerate(soup.a.next_siblings)))
print(list(enumerate(soup.a.previous_siblings)))
find_all(name, attrs, recursive, text, **kwargs)
Searches the document by tag name, attributes, or text content.
html = """ <html><head><title>The Dommouse's story</title></head> <body> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ui class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ui> </div> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.find_all('ul'))
html = """ <html><head><title>The Dommouse's story</title></head> <body> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ui class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ui> </div> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
for ul in soup.find_all('ul'):
    print(ul.find_all('li'))  # find_all can be called on a tag to search within it
html = """ <html><head><title>The Dommouse's story</title></head> <body> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ui class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ui> </div> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.find_all(attrs={'id': 'list-1'}))
print(soup.find_all(attrs={'name': 'elements'}))
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.find_all(id='list-1'))
print(soup.find_all(class_='element'))
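One parameter from the find_all signature above, text, is not exercised in these snippets. A minimal sketch (the one-line markup here is made up for the demo) shows that matching on text content returns the matching strings rather than the tags:

from bs4 import BeautifulSoup as bs

html = '<ul><li class="element">Foo</li><li class="element">Bar</li></ul>'
soup = bs(html, 'lxml')
print(soup.find_all(text='Foo'))  # ['Foo'] – the string itself, not the <li> tag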
find(name, attrs, recursive, text, **kwargs)
find returns the first matching element; find_all returns all matching elements.
html = """ <html><head><title>The Dommouse's story</title></head> <body> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ui class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ui> </div> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.find('ul'))
print(soup.find('page'))
find_parents() and find_parent()
find_next_siblings() and find_next_sibling()
find_previous_siblings() and find_previous_sibling()
find_all_next() and find_next()
find_all_previous() and find_previous()
These mirror find_all()/find() but search ancestors, siblings, or the nodes after/before the current node; a small sketch follows this list.
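A minimal sketch of two of these methods on a made-up fragment (the remaining pairs follow the same find/find_all pattern, just searching in a different direction):

from bs4 import BeautifulSoup as bs

html = '<div id="box"><p id="first">one</p><p id="second">two</p></div>'
soup = bs(html, 'lxml')

first = soup.find('p', id='first')
print(first.find_parent('div'))       # the enclosing <div id="box">
print(first.find_next_sibling('p'))   # the following <p id="second">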
Passing a CSS selector directly to select() is all it takes to make a selection.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.select('.panel-body'))
print(soup.select('ul li'))
print(soup.select('#list-2 .element'))
html = """ <html><head><title>The Dommouse's story</title></head> <body> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ui class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ui> </div> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
for ul in soup.select('ul'):
    print(ul.select('li'))
html = """ <html><head><title>The Dommouse's story</title></head> <body> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ui class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ui> </div> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
for ul in soup.select('ul'):
    print(ul['id'])
    print(ul.attrs['id'])
html = """ <html><head><title>The Dommouse's story</title></head> <body> <div class="panel-body"> <ul class="list" id="list-1"> <li class="element">Foo</li> <li class="element">Bar</li> <li class="element">Jay</li> </ul> <ui class="list list-small" id="list-2"> <li class="element">Foo</li> <li class="element">Bar</li> </ui> </div> </body> </html> """
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
for ui in soup.select("li"):
print(ui.get_text())
pyquery is a powerful and flexible web page parsing library. If you already know jQuery, you can pick up pyquery very quickly and skip writing fiddly regular expressions.
pip install pyquery
html=""" <div> <ul> <li class="item-0">first item</li> <li class='item-1'><a href="link2.html">second item</a></li> <li class='item-0 active'><a href="link3.html"><span class="bold">third item</span></a></li> <li class='item-1 active'>< a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> """
from pyquery import PyQuery as pq
doc = pq(html)
print(doc("li"))
from pyquery import PyQuery as pq
doc = pq(url='http://www.baidu.com')
print(doc('head'))
from pyquery import PyQuery as pq
doc = pq(filename="demo.html")
print(doc('li'))
html=""" <div id="container"> <ul> <li class="item-0">first item</li> <li class='item-1'><a href="link2.html">second item</a></li> <li class='item-0 active'><a href="link3.html"><span class="bold">third item</span></a></li> <li class='item-1 active'>< a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> """
from pyquery import PyQuery as pq
doc = pq(html)
print(doc("#container ul .item-0"))
find函数 find("li")
children()孩子节点
parent() 父元素
parents() 祖先节点
siblings() 全部兄弟元素
items() 全部元素
attr(name) 属性
text() 文本
html() 获取html内容
addClass(name) 添加css class
removeClass(name) 移除css class
attr("name", "link") 修改属性值
css("font-size", "14px") 设置css值
item.remove() 移除元素
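A minimal sketch of a few of the calls listed above on a made-up fragment (attribute and class changes mutate the parsed document in place):

from pyquery import PyQuery as pq

doc = pq('<div><ul><li class="item-0">first</li>'
         '<li class="item-1"><a href="link2.html">second</a></li></ul></div>')

li = doc('.item-1')
print(li.parent())                 # the enclosing <ul>
print(li.siblings())               # the other <li>
print(li.find('a').attr('href'))   # link2.html
print(li.text())                   # second

li.addClass('active')              # in-place modification
li.find('a').attr('href', 'changed.html')
print(doc.html())                  # the <ul> now reflects both changes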
pyquery.readthedocs.io
Selenium is an automated testing tool. It supports many browsers and can drive them through a sequence of operations; in crawling it is mainly used to handle pages rendered with JavaScript.
pip install selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
browser = webdriver.Chrome()
try:
    browser.get("https://www.baidu.com")
    input = browser.find_element_by_id("kw")   # the search box
    input.send_keys("Python")
    input.send_keys(Keys.ENTER)
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))  # wait for the results to render
    print(browser.current_url)
    print(browser.get_cookies())
    print(browser.page_source)
finally:
    browser.close()
from selenium import webdriver
browser = webdriver.Chrome()
browser = webdriver.Firefox()
browser = webdriver.Edge()
browser = webdriver.PhantomJS()
browser = webdriver.Safari()
from selenium import webdriver
browser = webdriver.Chrome()
browser.get("https://www.taobao.com")
print(browser.page_source)
browser.close()
Finding a single element
from selenium import webdriver
browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
input_first = browser.find_element_by_id('q')
input_second = browser.find_element_by_css_selector('#q')
input_third = browser.find_element_by_xpath('//*[@id="q"]')
print(input_first, input_second, input_third)
from selenium import webdriver
from selenium.webdriver.common.by import By
browser = webdriver.Chrome()
browser.get("https://www.taobao.com")
input_first = browser.find_element(By.ID, 'q')
print(input_first)
browser.close()
find_elements_by_css_selector
find_elements
These are the multiple-element counterparts of the lookups above and return lists; a short sketch follows.
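A minimal sketch of the multiple-element variants (the '.service-bd li' selector is only an assumption about Taobao's sidebar markup and may need adjusting):

from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
browser.get('https://www.taobao.com')

# both calls return a list of WebElement objects
lis = browser.find_elements_by_css_selector('.service-bd li')         # assumed selector
lis_again = browser.find_elements(By.CSS_SELECTOR, '.service-bd li')  # generic form
print(len(lis), lis[:3])

browser.close()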
from selenium import webdriver
import time
browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
input = browser.find_element_by_id('q')  # the search box
input.send_keys('iPhone')
time.sleep(1)
input.clear()
input.send_keys('iPad')
button = browser.find_element_by_class_name('btn-search')
button.click()
More operations: selenium-python.readthedocs.io/api.html
Actions are appended to an action chain and then executed in sequence.
from selenium import webdriver
from selenium.webdriver import ActionChains
browser = webdriver.Chrome()
url = 'http://www.r'  # truncated in the source; substitute a page that embeds a draggable/droppable demo inside an iframe
browser.get(url)
browser.switch_to_frame('iframeResult')
source = browser.find_element_by_css_selector('#draggable')
target = browser.find_element_by_css_selector('#droppable')
actions = ActionChains(browser)
actions.drag_and_drop(source, target)   # attach the drag-and-drop action to the chain
actions.perform()                       # execute the chained actions in order