Building on the earlier crawler code, we wrap it into functions and add multithreading.
Previous code: http://www.javashuo.com/article/p-gazlsdjj-do.html
from concurrent import futures
This is the module to import for multithreading.
ex = futures.ThreadPoolExecutor(max_workers=22)  # set the number of worker threads
ex.submit(func, *args)  # the function to run, followed by the arguments it needs
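Before the full crawler below, here is a minimal, self-contained sketch of the call pattern; the download_page function and the pool size of 5 are made up purely for illustration:

from concurrent import futures
import requests

def download_page(url):
    # hypothetical worker: fetch one page and report its size
    resp = requests.get(url)
    return len(resp.content)

ex = futures.ThreadPoolExecutor(max_workers=5)              # a pool of 5 worker threads
task = ex.submit(download_page, 'http://www.doutula.com/')  # schedule the call; returns a Future
print(task.result())   # result() blocks until the worker thread finishes
ex.shutdown()          # wait for every submitted task before exiting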
import os
import requests
from lxml.html import etree
from concurrent import futures  # multithreading

url = 'http://www.doutula.com/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}

def img_url_lis(url):
    # Fetch one list page and return the image URLs on it
    response = requests.get(url, headers=headers)
    response.encoding = 'utf8'
    response_html = etree.HTML(response.text)
    img_url_lis = response_html.xpath('.//img/@data-original')
    return img_url_lis

# Create the image folder
img_file_path = os.path.join(os.path.dirname(__file__), 'img')
if not os.path.exists(img_file_path):  # create the folder if it does not exist yet
    os.mkdir(img_file_path)
    print(img_file_path)

def dump_one_img(url):
    # Download a single image and save it under its original file name
    name = str(url).split('/')[-1]
    response = requests.get(url, headers=headers)
    img_path = os.path.join(img_file_path, name)
    with open(img_path, 'wb') as fw:
        fw.write(response.content)

def dump_imgs(urls: list):
    ex = futures.ThreadPoolExecutor(max_workers=22)  # create the thread pool once, not per URL
    for url in urls:
        ex.submit(dump_one_img, url)  # the function, then its argument
        # dump_one_img(url)  # single-threaded alternative

def run():
    count = 1
    while True:
        if count == 10:  # skip page 10
            count += 1
            continue
        lis = img_url_lis(f'http://www.doutula.com/article/list/?page={count}')
        if len(lis) == 0:
            print(count)
            break
        dump_imgs(lis)
        print(f'page {count} done')
        count += 1

if __name__ == '__main__':
    run()
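One thing to keep in mind: submit() only schedules a task and returns immediately, so dump_imgs may return before every image is on disk. If you want to wait for a page's downloads to finish before moving on, one possible variant (same dump_one_img as above; the pool size of 22 is kept from the original) is:

def dump_imgs(urls: list):
    # Using the executor as a context manager makes it wait for all tasks on exit
    with futures.ThreadPoolExecutor(max_workers=22) as ex:
        tasks = [ex.submit(dump_one_img, url) for url in urls]
        for task in futures.as_completed(tasks):
            task.result()  # re-raises any exception raised inside the worker thread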
With the thread pool in place, many images can be crawled much faster.