简易多线程爬虫框架

时间 2019-12-18

标签简易多线程爬虫框架栏目 Java 繁體版

原文原文链接

本文首发于知乎

本文使用多线程实现一个简易爬虫框架，让我们只需要关注网页的解析，不用自己设置多线程、队列等事情。调用形式类似scrapy，但诸多功能还不完善，所以称为简易爬虫框架。

这个框架实现了Spider类，让我们只需要写出下面的代码，即可多线程运行爬虫：

class DouBan(Spider):
    """Example spider for the Douban Top 250 movie list.

    Walks the paginated list, follows the first five movies of every page
    and yields one dict per movie with its title, runtime and release date.
    """

    def __init__(self):
        super(DouBan, self).__init__()
        self.start_url = 'https://movie.douban.com/top250'
        self.filename = 'douban.json'  # override the default
        self.output_result = False
        self.thread_num = 10

    def start_requests(self):  # override the default seed generator
        yield (self.start_url, self.parse_first)

    def _fetch_soup(self, url):
        """Download *url* and return it parsed, or None if the request failed."""
        try:
            # Without a timeout a stalled connection would block this worker forever.
            r = requests.get(url, timeout=10)
        except requests.RequestException as exc:
            print('request failed', url, repr(exc))
            return None
        return BeautifulSoup(r.content, 'lxml')

    def parse_first(self, url):
        """Parse one list page.

        Yields (url, callback) tuples: one per followed movie, plus one for
        the next list page when there is one.
        """
        soup = self._fetch_soup(url)
        if soup is None:
            # The pagination chain is broken, so no further URLs will be queued.
            self.running = False
            return

        # Only the first five movies of each page are followed.
        for movie in soup.find_all('div', class_='info')[:5]:
            detail_url = movie.find('div', class_='hd').a['href']
            yield (detail_url, self.parse_second)

        # The "next" span is absent on error/blocked pages and has no <a>
        # on the last page; both cases end the pagination.
        next_span = soup.find('span', class_='next')
        nextpage = next_span.a if next_span is not None else None
        if nextpage is not None:
            nexturl = self.start_url + nextpage['href']
            yield (nexturl, self.parse_first)
        else:
            # Reaching this point means no more URLs will be added to the queue.
            self.running = False

    def parse_second(self, url):
        """Parse one movie page and yield a dict with title, duration and time.

        A field is None when its element is missing from the page.
        """
        soup = self._fetch_soup(url)
        if soup is None:
            return
        mydict = {}
        title = soup.find('span', property='v:itemreviewed')
        mydict['title'] = title.text if title else None
        duration = soup.find('span', property='v:runtime')
        mydict['duration'] = duration.text if duration else None
        # Named "release" so the local does not shadow the time module.
        release = soup.find('span', property='v:initialReleaseDate')
        mydict['time'] = release.text if release else None
        yield mydict


if __name__ == '__main__':
    # Build the example spider and start crawling right away.
    DouBan().run()
复制代码

可以看到这个使用方式和scrapy非常相似：

继承类，只需要写解析函数（由于是简易框架，所以还需要写请求函数）
用yield返回数据或者新的请求及回调函数
自动多线程（scrapy是异步）
运行方式都一样，只要调用run
可以设置是否存储到文件等，只是没有考虑可扩展性（数据库等）

下面我们来说一说它是怎么实现的。

我们可以对比下面两个版本，一个是上一篇文章中的使用方法，另一个是进行了一些修改，将一些功能抽象出来，以便扩展功能。

上一篇文章版本的代码请读者自行点击链接去看，下面是修改后的版本代码。

import requests
import time
import threading
from queue import Queue, Empty
import json
from bs4 import BeautifulSoup

def run_time(func):
    """Decorator that prints how long each call to *func* took.

    The wrapped function's return value is passed through unchanged and its
    name and docstring are preserved.
    """
    # Imported locally so the decorator does not depend on a file-level import.
    import functools

    @functools.wraps(func)
    def wrapper(*args, **kw):
        # perf_counter is monotonic, so clock adjustments cannot skew the result.
        start = time.perf_counter()
        result = func(*args, **kw)
        end = time.perf_counter()
        print('running', end-start, 's')
        return result
    return wrapper


class Spider():
    """Self-contained multi-threaded crawler for the Douban Top 250 list.

    Parse callbacks are generators that yield either a (url, callback) tuple,
    which is queued for crawling, or a dict, which is collected in ``data``.
    """

    def __init__(self):
        self.start_url = 'https://movie.douban.com/top250'
        self.qtasks = Queue()  # pending (url, callback) tuples
        self.data = list()  # scraped item dicts
        self.thread_num = 5  # number of worker threads
        # Kept for backward compatibility: parse_first still clears it on the
        # last page, but workers now stop through _finished instead.
        self.running = True
        # Set by run() once every queued task has been processed.
        self._finished = threading.Event()

    def start_requests(self):
        """Yield the initial (url, callback) tasks."""
        yield (self.start_url, self.parse_first)

    def parse_first(self, url):
        """Parse one list page.

        Yields (url, callback) tuples: one per followed movie, plus one for
        the next list page when there is one.
        """
        # Without a timeout a stalled connection would block the crawl forever.
        r = requests.get(url, timeout=10)
        soup = BeautifulSoup(r.content, 'lxml')

        # Only the first five movies of each page are followed.
        for movie in soup.find_all('div', class_='info')[:5]:
            detail_url = movie.find('div', class_='hd').a['href']
            yield (detail_url, self.parse_second)

        # The "next" span is absent on error/blocked pages and has no <a>
        # on the last page; both cases end the pagination.
        next_span = soup.find('span', class_='next')
        nextpage = next_span.a if next_span is not None else None
        if nextpage is not None:
            nexturl = self.start_url + nextpage['href']
            yield (nexturl, self.parse_first)
        else:
            self.running = False

    def parse_second(self, url):
        """Parse one movie page and yield a dict with title, duration and time.

        A field is None when its element is missing from the page.
        """
        r = requests.get(url, timeout=10)
        soup = BeautifulSoup(r.content, 'lxml')
        mydict = {}
        title = soup.find('span', property='v:itemreviewed')
        mydict['title'] = title.text if title else None
        duration = soup.find('span', property='v:runtime')
        mydict['duration'] = duration.text if duration else None
        # Named "release" so the local does not shadow the time module.
        release = soup.find('span', property='v:initialReleaseDate')
        mydict['time'] = release.text if release else None
        yield mydict

    def start_req(self):
        """Put every task produced by start_requests() on the queue."""
        for task in self.start_requests():
            self.qtasks.put(task)

    def parses(self):
        """Worker loop: process queued tasks until run() signals completion.

        A failing callback is reported and skipped, so one bad page neither
        kills the worker nor stalls the crawl.
        """
        name = threading.current_thread().name
        while not self._finished.is_set():
            try:
                task = self.qtasks.get(timeout=3)
            except Empty:
                print('{}: Timeout occurred'.format(name))
                continue
            try:
                url, func = task
                print('crawling', url)
                for item in func(url):
                    if isinstance(item, tuple):
                        self.qtasks.put(item)
                    elif isinstance(item, dict):
                        self.data.append(item)
                    else:
                        raise TypeError('parse functions have to yield url-function tuple or data dict')
            except Exception as exc:
                # Boundary of the worker thread: report the error and move on.
                print('{}: task {!r} failed: {!r}'.format(name, task, exc))
            finally:
                # Exactly one task_done() per get(), otherwise join() never returns.
                self.qtasks.task_done()
        print(name, 'finished')

    @run_time
    def run(self, filename=False):
        """Crawl everything reachable from start_requests().

        Args:
            filename: path of a JSON file to write the collected items to;
                a falsy value (the default) skips writing.
        """
        self._finished.clear()
        seeder = threading.Thread(target=self.start_req)
        workers = [threading.Thread(target=self.parses)
                   for _ in range(self.thread_num)]
        for th in [seeder] + workers:
            th.start()

        # All seeds must be queued before an empty queue can mean "done".
        seeder.join()
        if workers:
            # Returns once every queued task, including follow-ups, is processed.
            self.qtasks.join()
        self._finished.set()
        self.running = False
        for th in workers:
            th.join()

        if filename:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(self.data, f, ensure_ascii=False, indent=4)

        print('Data crawling is finished.')

if __name__ == '__main__':
    # Crawl with the default settings and save the items to frame.json.
    crawler = Spider()
    crawler.run(filename='frame.json')
复制代码

这个改进的主要思路如下：

我们希望写解析函数时，像scrapy一样，用yield返回待抓取的URL和它对应的解析函数，于是就做了一个包含(URL，解析函数)的元组队列，之后只要不断从队列中获取元素，用函数解析url即可，这个提取的过程使用多线程
yield可以返回两种类型数据，一种是元组（URL，解析函数），一种是字典（即我们要的数据），通过判断分别加入不同队列中。元组队列是不断消耗和增添的过程，而字典队列是一直增加，最后再一起输出到文件中
在queue.get时，加入了timeout参数并做异常处理，保证每一个线程都能结束

这里其实没有特别的知识，也不需要解释很多，读者自己复制代码到文本文件里对比就知道了。

然后框架的形式就是从第二种中，剥离一些通用的设定，让用户自定义每个爬虫独特的部分，完整代码如下（本文开头的代码就是下面这块代码的后半部分）：

import requests
import time
import threading
from queue import Queue, Empty
import json
from bs4 import BeautifulSoup

def run_time(func):
    """Decorator that prints how long each call to *func* took.

    The wrapped function's return value is passed through unchanged and its
    name and docstring are preserved.
    """
    # Imported locally so the decorator does not depend on a file-level import.
    import functools

    @functools.wraps(func)
    def wrapper(*args, **kw):
        # perf_counter is monotonic, so clock adjustments cannot skew the result.
        start = time.perf_counter()
        result = func(*args, **kw)
        end = time.perf_counter()
        print('running', end-start, 's')
        return result
    return wrapper


class Spider():
    """Base class of the simple multi-threaded crawler framework.

    Subclasses set ``start_url`` and either define ``parse`` or override
    ``start_requests``. Parse callbacks are generators that yield either a
    (url, callback) tuple, which is queued for crawling, or a dict, which is
    collected in ``data``.
    """

    def __init__(self):
        self.qtasks = Queue()  # pending (url, callback) tuples
        self.data = list()  # scraped item dicts
        self.thread_num = 5  # number of worker threads
        # Kept for backward compatibility: subclasses may still clear it, but
        # workers now stop through _finished instead.
        self.running = True
        self.filename = False  # falsy: do not write the items to a file
        self.output_result = True  # print every scraped item
        # Set by run() once every queued task has been processed.
        self._finished = threading.Event()

    def start_requests(self):
        """Yield the initial (url, callback) tasks."""
        yield (self.start_url, self.parse)

    def start_req(self):
        """Put every task produced by start_requests() on the queue."""
        for task in self.start_requests():
            self.qtasks.put(task)

    def parses(self):
        """Worker loop: process queued tasks until run() signals completion.

        A failing callback is reported and skipped, so one bad page neither
        kills the worker nor stalls the crawl.
        """
        name = threading.current_thread().name
        while not self._finished.is_set():
            try:
                task = self.qtasks.get(timeout=3)
            except Empty:
                print('{}: Timeout occurred'.format(name))
                continue
            try:
                url, func = task
                print('crawling', url)
                for item in func(url):
                    if isinstance(item, tuple):
                        self.qtasks.put(item)
                    elif isinstance(item, dict):
                        if self.output_result:
                            print(item)
                        self.data.append(item)
                    else:
                        raise TypeError('parse functions have to yield url-function tuple or data dict')
            except Exception as exc:
                # Boundary of the worker thread: report the error and move on.
                print('{}: task {!r} failed: {!r}'.format(name, task, exc))
            finally:
                # Exactly one task_done() per get(), otherwise join() never returns.
                self.qtasks.task_done()
        print(name, 'finished')

    @run_time
    def run(self):
        """Crawl everything reachable from start_requests().

        When ``filename`` is truthy the collected items are written to that
        path as JSON after the crawl.
        """
        self._finished.clear()
        seeder = threading.Thread(target=self.start_req)
        workers = [threading.Thread(target=self.parses)
                   for _ in range(self.thread_num)]
        for th in [seeder] + workers:
            th.start()

        # All seeds must be queued before an empty queue can mean "done".
        seeder.join()
        if workers:
            # Returns once every queued task, including follow-ups, is processed.
            self.qtasks.join()
        self._finished.set()
        self.running = False
        for th in workers:
            th.join()

        if self.filename:
            with open(self.filename, 'w', encoding='utf-8') as f:
                json.dump(self.data, f, ensure_ascii=False, indent=4)

        print('Data crawling is finished.')



class DouBan(Spider):
    """Example spider for the Douban Top 250 movie list.

    Walks the paginated list, follows the first five movies of every page
    and yields one dict per movie with its title, runtime and release date.
    """

    def __init__(self):
        super(DouBan, self).__init__()
        self.start_url = 'https://movie.douban.com/top250'
        self.filename = 'douban.json'  # override the default
        self.output_result = False
        self.thread_num = 10

    def start_requests(self):  # override the default seed generator
        yield (self.start_url, self.parse_first)

    def _fetch_soup(self, url):
        """Download *url* and return it parsed, or None if the request failed."""
        try:
            # Without a timeout a stalled connection would block this worker forever.
            r = requests.get(url, timeout=10)
        except requests.RequestException as exc:
            print('request failed', url, repr(exc))
            return None
        return BeautifulSoup(r.content, 'lxml')

    def parse_first(self, url):
        """Parse one list page.

        Yields (url, callback) tuples: one per followed movie, plus one for
        the next list page when there is one.
        """
        soup = self._fetch_soup(url)
        if soup is None:
            # The pagination chain is broken, so no further URLs will be queued.
            self.running = False
            return

        # Only the first five movies of each page are followed.
        for movie in soup.find_all('div', class_='info')[:5]:
            detail_url = movie.find('div', class_='hd').a['href']
            yield (detail_url, self.parse_second)

        # The "next" span is absent on error/blocked pages and has no <a>
        # on the last page; both cases end the pagination.
        next_span = soup.find('span', class_='next')
        nextpage = next_span.a if next_span is not None else None
        if nextpage is not None:
            nexturl = self.start_url + nextpage['href']
            yield (nexturl, self.parse_first)
        else:
            # Reaching this point means no more URLs will be added to the queue.
            self.running = False

    def parse_second(self, url):
        """Parse one movie page and yield a dict with title, duration and time.

        A field is None when its element is missing from the page.
        """
        soup = self._fetch_soup(url)
        if soup is None:
            return
        mydict = {}
        title = soup.find('span', property='v:itemreviewed')
        mydict['title'] = title.text if title else None
        duration = soup.find('span', property='v:runtime')
        mydict['duration'] = duration.text if duration else None
        # Named "release" so the local does not shadow the time module.
        release = soup.find('span', property='v:initialReleaseDate')
        mydict['time'] = release.text if release else None
        yield mydict


if __name__ == '__main__':
    # Build the example spider and start crawling right away.
    DouBan().run()
复制代码

我们这样剥离之后，就只需要写后半部分的代码，只关心网页的解析，不用考虑多线程的实现了。

欢迎关注我的知乎专栏

专栏主页：python编程

版本说明：软件及包版本说明