用标题中的四种方式解析网页,比较其解析速度。虽然比较结果的具体数值与电脑配置、Python 版本都有关系,但整体差异不会很大。
下面是我个人的测试结果:lxml xpath 最快,bs4 最慢。
==== Python version: 3.6.5 (v3.6.5:f59c0932b4, Mar 28 2018, 17:00:18) [MSC v.1900 64 bit (AMD64)] ===== ==== Total trials: 10000 ===== bs4 total time: 5.5 pq total time: 0.9 lxml (cssselect) total time: 0.8 lxml (xpath) total time: 0.5 regex total time: 1.1 (doesn't find all p)
以下是测试代码:
# -*- coding: utf-8 -*-
"""
Benchmark several ways of selecting every <p> element from one HTML page.

The page is downloaded once; each approach then repeats the same selection
``num`` times and the elapsed time per approach is printed.

@Datetime: 2019/3/13
@Author: Zhang Yafei
"""
import re
import sys
import time

import requests
from bs4 import BeautifulSoup as bs
from lxml.html import fromstring
from pyquery import PyQuery as pq

headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/68.0.3440.106 Safari/537.36'
    )
}


def Timer():
    """Generate the seconds elapsed since the previous ``next()`` call.

    The generator body only starts running on the first ``next()``, so that
    first value is close to zero and merely starts the clock. Every later
    ``next()`` yields the time that has passed since the preceding one.

    Yields:
        float: elapsed seconds, measured with ``time.perf_counter`` (a
        monotonic, high-resolution clock intended for interval timing).
    """
    previous = time.perf_counter()
    while True:
        current = time.perf_counter()
        yield current - previous
        previous = current


# ################# start request #################
timer = Timer()
url = "https://www.python.org/"
# A timeout keeps the benchmark from hanging forever on a stalled connection.
html = requests.get(url, headers=headers, timeout=30).text
num = 10000
print('\n==== Python version: %s =====' % sys.version)
print('\n==== Total trials: %s =====' % num)
next(timer)  # start the clock; the value yielded here is discarded

# NOTE: each measurement below covers the one-off document parse as well as
# the ``num`` repeated selections, because the clock runs between next() calls.

# ################# bs4 #########################
soup = bs(html, 'lxml')
for _ in range(num):
    paragraphs = soup.find_all('p')
t = next(timer)
print('bs4 total time: %.1f' % t)

# ################ pyquery #######################
d = pq(html)
for _ in range(num):
    paragraphs = d('p')
t = next(timer)
print('pq total time: %.1f' % t)

# ############### lxml css #########################
tree = fromstring(html)
for _ in range(num):
    paragraphs = tree.cssselect('p')
t = next(timer)
print('lxml (cssselect) total time: %.1f' % t)

# ############## lxml xpath #######################
tree = fromstring(html)
for _ in range(num):
    paragraphs = tree.xpath('.//p')
t = next(timer)
print('lxml (xpath) total time: %.1f' % t)

# ############### re ##########################
# Match an opening <p> tag with optional attributes, then lazily up to the
# next </p>. DOTALL lets a paragraph span several lines. A regex still cannot
# cope with unclosed or oddly nested markup the way a real parser does, which
# is why the caveat stays in the printed message.
p_tag_re = re.compile(r'<p(?:\s[^>]*)?>.*?</p>', re.DOTALL)
for _ in range(num):
    paragraphs = p_tag_re.findall(html)
t = next(timer)
print('regex total time: %.1f (doesn\'t find all p)\n' % t)
测试代码二:
# -*- coding: utf-8 -*-
"""
Benchmark several ways of selecting every <p> element from one HTML page,
timing each approach with a decorator.

@Datetime: 2019/3/13
@Author: Zhang Yafei
"""
import functools
import re
import sys
import time

import requests
from bs4 import BeautifulSoup as bs
from lxml.html import fromstring
from pyquery import PyQuery as pq


def timeit(fun):
    """Wrap *fun* so that every call prints its elapsed time.

    Args:
        fun: the callable to time.

    Returns:
        A wrapper that forwards all arguments to *fun*, prints the elapsed
        seconds (six decimal places) after it returns, and passes the
        return value of *fun* through unchanged.
    """
    @functools.wraps(fun)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic and high-resolution, unlike the wall
        # clock returned by time.time().
        start_time = time.perf_counter()
        res = fun(*args, **kwargs)
        print('运行时间为%.6f' % (time.perf_counter() - start_time))
        return res
    return wrapper


@timeit  # equivalent to: time1 = timeit(time1)
def time1(n):
    """Return ``[0, 2, 4, ...]`` with *n* items; a small demo target for ``timeit``."""
    return [i * 2 for i in range(n)]


# ################# start request #################
url = "https://www.taobao.com/"
# A timeout keeps the benchmark from hanging forever on a stalled connection.
html = requests.get(url, timeout=30).text
num = 10000
print('\n==== Python version: %s =====' % sys.version)
print('\n==== Total trials: %s =====' % num)

# Opening <p> tag with optional attributes, then lazily up to the next </p>.
# DOTALL lets a paragraph span several lines.
P_TAG_RE = re.compile(r'<p(?:\s[^>]*)?>.*?</p>', re.DOTALL)


@timeit
def bs4_test():
    """Parse the page with BeautifulSoup once, then select all <p> ``num`` times."""
    soup = bs(html, 'lxml')
    for _ in range(num):
        paragraphs = soup.find_all('p')
    print('bs4 total time:')


@timeit
def pq_test():
    """Parse the page with PyQuery once, then select all <p> ``num`` times."""
    d = pq(html)
    for _ in range(num):
        paragraphs = d('p')
    print('pq total time:')


@timeit
def lxml_css():
    """Parse the page with lxml once, then run the CSS selector ``num`` times."""
    tree = fromstring(html)
    for _ in range(num):
        paragraphs = tree.cssselect('p')
    print('lxml (cssselect) total time:')


@timeit
def lxml_xpath():
    """Parse the page with lxml once, then run the XPath query ``num`` times."""
    tree = fromstring(html)
    for _ in range(num):
        paragraphs = tree.xpath('.//p')
    print('lxml (xpath) total time:')


@timeit
def re_test():
    """Run the <p> regular expression over the raw HTML ``num`` times."""
    for _ in range(num):
        paragraphs = P_TAG_RE.findall(html)
    print('regex total time:')


if __name__ == '__main__':
    bs4_test()
    pq_test()
    lxml_css()
    lxml_xpath()
    re_test()
测试结果:
==== Python version: 3.6.5 (v3.6.5:f59c0932b4, Mar 28 2018, 17:00:18) [MSC v.1900 64 bit (AMD64)] ===== ==== Total trials: 10000 ===== bs4 total time: 运行时间为9.049424 pq total time: 运行时间为0.899639 lxml (cssselect) total time: 运行时间为0.841596 lxml (xpath) total time: 运行时间为0.619440 regex total time: 运行时间为1.207861