from urllib import request


def f(url):
    """Fetch *url* over HTTP and save the response body to ``url.html``.

    Also prints the request line and how many bytes were received.
    """
    print("GET:%s" % url)
    # Perform the request; ``data`` is the raw page body as bytes.
    resp = request.urlopen(url)
    data = resp.read()
    # ``with`` guarantees the file is closed even if write() raises,
    # and using a distinct name avoids shadowing the function ``f``
    # with a file handle inside its own body.
    with open("url.html", "wb") as out:
        out.write(data)
    print('%d bytes received from %s.' % (len(data), url))


# Page to fetch.
f("http://www.cnblogs.com/alex3714/articles/5248247.html")
from gevent import monkey

# Mark every blocking stdlib I/O call so gevent can switch greenlets on it
# (equivalent to inserting gevent.sleep at each wait).  patch_all() must run
# BEFORE other modules are imported so they pick up the patched socket.
monkey.patch_all()

import time
from urllib import request

import gevent


def f(url):
    """Fetch *url* and print how many bytes were received."""
    print("GET:%s" % url)
    # Perform the request; ``data`` is the raw page body as bytes.
    resp = request.urlopen(url)
    data = resp.read()
    print('%d bytes received from %s.' % (len(data), url))


# --------------------------- serial ---------------------------- #
# URLs to benchmark against.
urls = ['https://www.python.org/',
        'https://www.yahoo.com/',
        'https://github.com/']

# Time the sequential fetches.
time_start = time.time()
for url in urls:
    f(url)
print("同步cost", time.time() - time_start)

# ------------------------- concurrent -------------------------- #
# Time the same fetches running concurrently in greenlets.
async_time_start = time.time()
# Spawn one greenlet per URL — reuse ``urls`` instead of repeating the
# literals — and block until all of them have finished.
gevent.joinall([gevent.spawn(f, url) for url in urls])
print("异步cost", time.time() - async_time_start)