What the code below implements: find the films in the [最新电影] (latest movies) category whose Douban or IMDb rating is at least 8.0.
Step 1: the basic implementation with requests
# _*_ coding:utf-8 _*_
# _author:khal_Cgg
import requests
from bs4 import BeautifulSoup
import re

req = {}  # the dict of qualifying films returned at the end

def re_func(text):  # every response is passed through this function to check the criteria
    global req
    soup = BeautifulSoup(text, "lxml")                                   # build the BeautifulSoup tree
    table_list = soup.find_all(name="table", attrs={"class": "tbspan"})  # each film sits in its own table
    for one_table in table_list:                                         # loop over every table
        soup2 = BeautifulSoup(str(one_table), "lxml")
        first_a = soup2.find(name="a")                                   # the first <a> tag is the film category
        if first_a.string == "[最新电影]":                                # only "latest movies" are checked further
            film_name = soup2.find_all(name="a")[1]
            td = soup2.find_all(name="td")[5]
            re_aim = td.string
            gaga = re.findall(r"豆瓣评分 (\d\.\d)|IMDb评分 (\d\.\d)", re_aim)  # extract the rating
            if gaga:                                                     # check whether the rating meets the threshold
                douban = gaga[0][0]
                if douban:
                    douban = float(douban)
                    if douban >= 8.0:
                        req[film_name.string] = {}
                        req[film_name.string]["豆瓣得分"] = douban        # qualifying film, record it
                else:
                    imdb = gaga[0][1]
                    if imdb:
                        imdb = float(imdb)
                        if imdb >= 8.0:
                            req[film_name.string] = {}
                            req[film_name.string]["IMDb得分"] = imdb
    return req

base_url = "http://www.ygdy8.net/html/gndy/china/list_4_%s.html"
for i in range(1, 20):  # generate the list-page urls one by one and request them
    print('++++++++++', i)
    user = base_url % (i)
    print(user)
    r1 = requests.get(user)
    r1.encoding = "gbk"
    re_func(r1.text)
print(req)
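For reference, the req dict printed at the end has this shape; the titles and scores below are invented placeholders, only the structure follows from the code:

# illustrative structure only, not real crawl output
req = {
    "电影A": {"豆瓣得分": 8.3},
    "电影B": {"IMDb得分": 8.5},
}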
The code above crawls in the simplest possible way, using nothing but the requests module (plus BeautifulSoup for parsing).
Drawback: the list pages are requested strictly one after another, and each requests.get blocks until its response arrives, so crawling all the pages is slow.
Step 2: asynchronous optimization with gevent
# _*_ coding:utf-8 _*_
from gevent import monkey
monkey.patch_all()  # patch blocking IO before requests is imported, otherwise nothing runs concurrently
import requests
from bs4 import BeautifulSoup
import re, gevent

req = {}

class Singleton(object):
    # always hand back the same instance when the class is instantiated
    def __new__(cls, *args, **kwargs):
        if not hasattr(cls, '_instance'):
            orig = super(Singleton, cls)
            cls._instance = orig.__new__(cls)
        return cls._instance

class re_func(Singleton):
    def __init__(self, text):
        global req
        soup = BeautifulSoup(text, "lxml")
        table_list = soup.find_all(name="table", attrs={"class": "tbspan"})
        for one_table in table_list:
            soup2 = BeautifulSoup(str(one_table), "lxml")
            first_a = soup2.find(name="a")
            if first_a.string == "[最新电影]":
                film_name = soup2.find_all(name="a")[1]
                td = soup2.find_all(name="td")[5]
                re_aim = td.string
                gaga = re.findall(r"豆瓣评分 (\d\.\d)|IMDb评分 (\d\.\d)", re_aim)
                if gaga:
                    douban = gaga[0][0]
                    if douban:
                        douban = float(douban)
                        if douban >= 8.0:
                            req[film_name.string] = {}
                            req[film_name.string]["豆瓣得分"] = douban
                    else:
                        imdb = gaga[0][1]
                        if imdb:
                            imdb = float(imdb)
                            if imdb >= 8.0:
                                req[film_name.string] = {}
                                req[film_name.string]["IMDb得分"] = imdb

base_url = "http://www.ygdy8.net/html/gndy/china/list_4_%s.html"

def start(i):
    print('++++++++++', i)
    user = base_url % (i)
    print(user)
    r1 = requests.get(user)
    r1.encoding = "gbk"
    re_func(r1.text)

threads = [gevent.spawn(start, i) for i in range(1, 88)]  # one greenlet per list page
gevent.joinall(threads)
print(req)
Advantage: the pages are fetched concurrently in greenlets rather than one at a time, so the whole crawl finishes far sooner.
Result: it shows that Chinese films are still just as bad.
Step 3: rewriting the crawler with Scrapy
After downloading and installing Scrapy, pick a directory and run it from the command line:
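The original post does not show the actual commands; the standard Scrapy workflow would look roughly like this (the project name dytt_project is an assumption, while the spider name dytt and the domain match the code further down):

scrapy startproject dytt_project      # create a new Scrapy project (project name is hypothetical)
cd dytt_project
scrapy genspider dytt ygdy8.net       # generate the skeleton of the dytt spider
scrapy crawl dytt                     # run the spider once it is written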
settings parameters:
ROBOTSTXT_OBEY = False — if the crawl fails because requests are being filtered, set this to False so Scrapy stops obeying robots.txt
DEPTH_LIMIT = 2 — allow a link-following depth of 2, i.e. recurse only one level beyond the start page (both options go into the project's settings.py, sketched below)
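A minimal settings.py excerpt with just these two options would look like this:

# settings.py (excerpt) -- only the two options discussed above
ROBOTSTXT_OBEY = False   # stop obeying robots.txt so the requests are not filtered out
DEPTH_LIMIT = 2          # follow links at most two levels deep from the start request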
The spider code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
import re
from bs4 import BeautifulSoup


class DyttSpider(scrapy.Spider):
    name = "dytt"
    allowed_domains = ["ygdy8.net"]
    start_urls = ['http://www.ygdy8.net/html/gndy/china/index.html']
    visited_set = set()
    req = {}
    encoding = 'gbk'

    def take_page(self, page_list):
        # build the absolute URL of each list page from its relative href (e.g. list_4_2.html)
        page_urls = []
        for biaoqian in page_list:
            href = biaoqian.attrs["href"]
            new_url = self.start_urls[0].replace("index.html", href, 1)
            page_urls.append(new_url)
        print(page_urls)
        return page_urls

    def re_func(self, text):  # every response is passed through this function to check the criteria
        soup = BeautifulSoup(text, "lxml")                                   # build the BeautifulSoup tree
        div_x = soup.find_all(name="div", attrs={"class": "x"})[1]           # the block that holds the page links
        soupX = BeautifulSoup(str(div_x), "lxml")
        page_list = soupX.find_all(name="a", limit=6)
        url_loop = self.take_page(page_list)
        table_list = soup.find_all(name="table", attrs={"class": "tbspan"})  # each film sits in its own table
        for one_table in table_list:                                         # loop over every table
            soup2 = BeautifulSoup(str(one_table), "lxml")
            first_a = soup2.find(name="a")                                   # the first <a> tag is the film category
            if first_a.string == "[最新电影]":                                # only "latest movies" are checked further
                film_name = soup2.find_all(name="a")[1]
                td = soup2.find_all(name="td")[5]
                re_aim = td.string
                gaga = re.findall(r"豆瓣评分 (\d\.\d)|IMDb评分 (\d\.\d)", re_aim)  # extract the rating
                if gaga:                                                     # check whether the rating meets the threshold
                    douban = gaga[0][0]
                    if douban:
                        douban = float(douban)
                        if douban >= 8.0:
                            self.req[film_name.string] = {}
                            self.req[film_name.string]["豆瓣得分"] = douban   # qualifying film, record it
                            print(self.req)
                    else:
                        imdb = gaga[0][1]
                        if imdb:
                            imdb = float(imdb)
                            if imdb >= 8.0:
                                self.req[film_name.string] = {}
                                self.req[film_name.string]["IMDb得分"] = imdb
                                print(self.req)
        return url_loop

    def parse(self, response):
        print(response.url)
        self.visited_set.add(response.url)
        page_list = self.re_func(response.body.decode('gbk', errors='ignore'))  # the site serves GBK pages
        for url in page_list:
            if url in self.visited_set:
                pass
            else:
                obj = Request(url=url, method='GET', callback=self.parse, encoding='gbk')
                yield obj
The encoding issues in this code are not fully solved: the pages are served as GBK, and the spider simply decodes the raw body by hand, ignoring any bytes it cannot decode.
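One way to sidestep the manual GBK handling (a sketch, not the author's fix) is to let Scrapy decode the page itself: TextResponse.text returns the body decoded with the charset taken from the HTTP headers or the page's <meta> tag, so parse can hand a ready-made str to re_func:

# minimal sketch, assuming the site declares its charset correctly (it serves gb2312/GBK pages)
def parse(self, response):
    self.visited_set.add(response.url)
    page_list = self.re_func(response.text)   # already decoded by Scrapy, no manual .decode('gbk')
    for url in page_list:
        if url not in self.visited_set:
            yield Request(url=url, callback=self.parse)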
Example (a second spider, used below to compare return and yield under DEPTH_LIMIT):
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request

count = 0

class XiaohuarSpider(scrapy.Spider):
    name = "xiaohuar"
    allowed_domains = ["xiaohuar.com"]
    start_urls = ['http://www.xiaohuar.com/list-1-0.html']
    visited_set = set()

    def parse(self, response):
        self.visited_set.add(response.url)
        # 1. scrape every entry on the current page:
        #    grab the divs whose class is "item masonry_brick"
        hxs = HtmlXPathSelector(response)
        item_list = hxs.select('//div[@class="item masonry_brick"]')
        for item in item_list:
            v = item.select('.//span[@class="price"]/text()').extract_first()
            print(v)
            global count
            count += 1
            print(count)
        # 2. collect the pagination links http://www.xiaohuar.com/list-1-\d+.html on this page
        # page_list = hxs.select('//a[@href="http://www.xiaohuar.com/list-1-1.html"]')
        page_list = hxs.select(r'//a[re:test(@href, "http://www.xiaohuar.com/list-1-\d+.html")]/@href').extract()
        for url in page_list:
            if url in self.visited_set:
                pass
            else:
                obj = Request(url=url, method='GET', callback=self.parse)
                print(self.parse)
                return obj
                # return ends this call of parse, so the next page needs a fresh call (the loop only
                # ever runs once) and DEPTH_LIMIT bites. yield, by contrast, suspends the function here;
                # when the next request is needed the same for loop resumes, so all page links on page 1
                # (pages 1-16) are produced before the first call finishes, and a DEPTH_LIMIT of 3 is
                # enough to cover all 42 pages.
This example is a good way to understand the difference between return and yield and what DEPTH_LIMIT actually does.
From experiment: with DEPTH_LIMIT >= 3 the yield version crawls all 42 pages, while the return version only gets as many pages as the limit value.
Conclusion: DEPTH_LIMIT does not count pages directly; it limits the depth of the request chain, and every Request produced while parsing a response sits one level deeper than that response. Because return ends parse after producing a single Request, each new page is one level deeper than the previous one, so the return version can only reach as many pages as the limit allows. Without a DEPTH_LIMIT it does eventually crawl every page, but since the pages are requested strictly one after another (one fresh call to parse per page) it is much slower than the yield version.
yield behaves differently: after yielding, the first call to parse has not finished. When Scrapy needs the next request it resumes that call's for loop, so every page link found on the first page (pages 1-16) is scheduled at the same depth before the first call completes, and one more level reaches the remaining pages. That is why a DEPTH_LIMIT of 3 or more is enough to cover all 42 pages.
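To make this concrete, here is a minimal sketch (not from the original post) contrasting the two styles in one hypothetical spider; the spider name and the extract_page_links helper are made up, the start URL and pagination pattern mirror the xiaohuar example above, and the printed depth is the value Scrapy's DepthMiddleware stores in request.meta:

import scrapy
from scrapy.http import Request


class DepthDemoSpider(scrapy.Spider):
    """Hypothetical spider, only here to contrast return and yield under DEPTH_LIMIT."""
    name = "depth_demo"
    allowed_domains = ["xiaohuar.com"]
    start_urls = ['http://www.xiaohuar.com/list-1-0.html']
    visited_set = set()

    def extract_page_links(self, response):
        # hypothetical helper: the pagination URLs found on the current page
        return response.xpath(r'//a[re:test(@href, "http://www.xiaohuar.com/list-1-\d+.html")]/@href').extract()

    # Variant 1: return -- each call produces exactly one follow-up Request, so page N
    # is always N levels deep and the crawl stops as soon as DEPTH_LIMIT is reached.
    def parse(self, response):
        self.visited_set.add(response.url)
        print('depth =', response.meta.get('depth'))   # set by DepthMiddleware (None for the very first response)
        for url in self.extract_page_links(response):
            if url not in self.visited_set:
                return Request(url=url, callback=self.parse)   # ends this call immediately

    # Variant 2: yield -- the call is only suspended, so every page link found on this
    # response is scheduled at the same depth (this response's depth + 1). Point the
    # callbacks at this method instead of parse to see all pages crawled with DEPTH_LIMIT = 3.
    def parse_with_yield(self, response):
        self.visited_set.add(response.url)
        print('depth =', response.meta.get('depth'))
        for url in self.extract_page_links(response):
            if url not in self.visited_set:
                yield Request(url=url, callback=self.parse_with_yield)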