Continuing from the previous post, "Scraping Douban Group Data with Scrapy (Part 1)": http://my.oschina.net/chengye/blog/124157
1. Import another of Scrapy's predefined spiders, CrawlSpider.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
2. Define a new class, GroupSpider, based on CrawlSpider, and add the corresponding crawl rules.
class GroupSpider(CrawlSpider):
    name = "Group"
    allowed_domains = ["douban.com"]
    start_urls = [
        "http://www.douban.com/group/explore?tag=%E8%B4%AD%E7%89%A9",
        "http://www.douban.com/group/explore?tag=%E7%94%9F%E6%B4%BB",
        "http://www.douban.com/group/explore?tag=%E7%A4%BE%E4%BC%9A",
        "http://www.douban.com/group/explore?tag=%E8%89%BA%E6%9C%AF",
        "http://www.douban.com/group/explore?tag=%E5%AD%A6%E6%9C%AF",
        "http://www.douban.com/group/explore?tag=%E6%83%85%E6%84%9F",
        "http://www.douban.com/group/explore?tag=%E9%97%B2%E8%81%8A",
        "http://www.douban.com/group/explore?tag=%E5%85%B4%E8%B6%A3"
    ]

    rules = [
        Rule(SgmlLinkExtractor(allow=('/group/[^/]+/$', )), callback='parse_group_home_page', process_request='add_cookie'),
        Rule(SgmlLinkExtractor(allow=('/group/explore\?tag', )), follow=True, process_request='add_cookie'),
    ]

start_urls predefines all of Douban's group category pages; the spider sets out from these pages to find groups.
The rules definition is the most important piece of a CrawlSpider. You can read it as: when the spider encounters a page of a given type, this is how it should be handled.
For example, the following rule handles pages whose URL ends in /group/XXXX/, calling parse_group_home_page as the handler, and invoking add_cookie before the request is sent to attach cookie information.
Rule(SgmlLinkExtractor(allow=('/group/[^/]+/$', )), callback='parse_group_home_page', process_request='add_cookie'),

As another example, the following rule fetches the page and automatically extracts its links for the next round of crawling, but does not otherwise process the page content.
Rule(SgmlLinkExtractor(allow=('/group/explore\?tag', )), follow=True, process_request='add_cookie'),
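
One CrawlSpider detail worth noting (not spelled out in the original post): when a Rule has a callback, follow defaults to False, and when it has none, follow defaults to True; if several rules match the same link, only the first matching rule in the list is applied, so the order of the rules list matters.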
Define the following function, and, as described above, add process_request='add_cookie' to the Rule definitions.
def add_cookie(self, request):
    # request.replace() returns a new Request rather than modifying this
    # one in place, so return the replacement.
    return request.replace(cookies=[
        {'name': 'COOKIE_NAME', 'value': 'VALUE', 'domain': '.douban.com', 'path': '/'},
    ])

Websites generally use cookies on the client side to keep the user's session information, so attaching cookie information lets the spider scrape data as if it were a logged-in user.
First, you can try scraping with a logged-in user's cookies attached. Even if the pages you scrape are public, sending cookies may keep the spider from being banned at the application layer. I haven't actually verified this, but it certainly does no harm.
Second, even as an authorized user, your IP may still get banned if you visit too frequently, so you generally need to have the spider rest 1-2 seconds between requests.
Finally, configure the User-Agent, and rotate through different User-Agent strings where possible (a rotation sketch follows the settings below).
In the Scrapy project's settings.py, add the following settings:
DOWNLOAD_DELAY = 2
RANDOMIZE_DOWNLOAD_DELAY = True
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'
COOKIES_ENABLED = True
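
The USER_AGENT setting above pins one User-Agent for every request. To actually rotate agents as suggested earlier, one option is a small downloader middleware. This is a minimal sketch of my own, not from the original project; the module, class, and list names are made up:

# middlewares.py -- minimal User-Agent rotation sketch (hypothetical names).
# Enable it in settings.py with, e.g.:
#   DOWNLOADER_MIDDLEWARES = {'douban.middlewares.RandomUserAgentMiddleware': 400}
import random

USER_AGENTS = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64; rv:21.0) Gecko/20100101 Firefox/21.0',
]

class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        # Replace the User-Agent header with a randomly chosen one;
        # returning None lets Scrapy continue processing the request.
        request.headers['User-Agent'] = random.choice(USER_AGENTS)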
================
And that's it: the spider that scrapes Douban group pages is done. Next, you can follow the same pattern to define a spider that scrapes the group discussion pages, then set the spiders loose to crawl! Have Fun!
For reference, the complete spider:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item
from douban.items import DoubanItem
import re

class GroupSpider(CrawlSpider):
    name = "Group"
    allowed_domains = ["douban.com"]
    start_urls = [
        "http://www.douban.com/group/explore?tag=%E8%B4%AD%E7%89%A9",
        "http://www.douban.com/group/explore?tag=%E7%94%9F%E6%B4%BB",
        "http://www.douban.com/group/explore?tag=%E7%A4%BE%E4%BC%9A",
        "http://www.douban.com/group/explore?tag=%E8%89%BA%E6%9C%AF",
        "http://www.douban.com/group/explore?tag=%E5%AD%A6%E6%9C%AF",
        "http://www.douban.com/group/explore?tag=%E6%83%85%E6%84%9F",
        "http://www.douban.com/group/explore?tag=%E9%97%B2%E8%81%8A",
        "http://www.douban.com/group/explore?tag=%E5%85%B4%E8%B6%A3"
    ]

    rules = [
        Rule(SgmlLinkExtractor(allow=('/group/[^/]+/$', )), callback='parse_group_home_page', process_request='add_cookie'),
        # Rule(SgmlLinkExtractor(allow=('/group/[^/]+/discussion\?start\=(\d{1,4})$', )), callback='parse_group_topic_list', process_request='add_cookie'),
        Rule(SgmlLinkExtractor(allow=('/group/explore\?tag', )), follow=True, process_request='add_cookie'),
    ]

    def __get_id_from_group_url(self, url):
        # Extract the group id from a URL like http://www.douban.com/group/<id>/
        m = re.search("^http://www.douban.com/group/([^/]+)/$", url)
        if m:
            return m.group(1)
        else:
            return 0

    def add_cookie(self, request):
        # request.replace() returns a new Request; return it so the
        # cookies actually take effect.
        return request.replace(cookies=[
        ])

    def parse_group_topic_list(self, response):
        self.log("Fetch group topic list page: %s" % response.url)
        pass

    def parse_group_home_page(self, response):
        self.log("Fetch group home page: %s" % response.url)
        hxs = HtmlXPathSelector(response)
        item = DoubanItem()

        # get group name
        item['groupName'] = hxs.select('//h1/text()').re("^\s+(.*)\s+$")[0]

        # get group id
        item['groupURL'] = response.url
        groupid = self.__get_id_from_group_url(response.url)

        # get group members number
        members_url = "http://www.douban.com/group/%s/members" % groupid
        members_text = hxs.select('//a[contains(@href, "%s")]/text()' % members_url).re("\((\d+)\)")
        item['totalNumber'] = members_text[0]

        # get relative groups
        item['RelativeGroups'] = []
        groups = hxs.select('//div[contains(@class, "group-list-item")]')
        for group in groups:
            url = group.select('div[contains(@class, "title")]/a/@href').extract()[0]
            item['RelativeGroups'].append(url)
        #item['RelativeGroups'] = ','.join(relative_groups)
        return item
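
With everything in place, the spider can be started from the project root with scrapy crawl Group; to dump the scraped items, scrapy crawl Group -o groups.json should work (older Scrapy releases may also want an explicit -t json to pick the feed format).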