## Getting started: usage examples
```shell
# Show help
scrapy --help

# Show the Scrapy version along with its component versions
scrapy version -v

# Create a project
scrapy startproject <project_name>

# Create a Spider (a project can contain several, but their names must be unique)
scrapy genspider <name> <domain>
scrapy genspider aaa aaa.com
scrapy genspider bbb bbb.com

# List all Spiders in the project
scrapy list

# Open a URL in the browser to see the page as Scrapy fetches it
scrapy view http://www.baidu.com

# Parse a fixed URL with the project's parse method, usually for testing
scrapy parse http://www.baidu.com

# shell can be run outside of any project (see the example below)
scrapy shell

# runspider runs a single spider file on its own
scrapy runspider aaaaa.py

# bench runs a quick benchmark
scrapy bench
```
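For a quick feel of the shell mentioned above: starting it with a URL drops you into an interactive console where `response` is already populated (the URLs here are just examples, not part of the original):

```python
# Started from the command line with: scrapy shell http://www.baidu.com
response.status                             # HTTP status of the fetched page
response.xpath('//title/text()').extract()  # query the page with XPath
fetch('http://www.baidu.com/s?wd=scrapy')   # fetch another URL without leaving the shell
```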
## Spider

Attributes:

- name: the spider's name; must be unique
- allowed_domains: the domains the spider is allowed to crawl
- start_urls: the initial URLs
- custom_settings: per-spider settings that override the global settings
- crawler: the Crawler this spider is bound to
- settings: a Settings instance holding all of the project's configuration values
- logger: a logger instance

Methods:

- from_crawler(crawler, *args, **kwargs): class method used to create spiders
- start_requests(): generates the initial requests
- make_requests_from_url(url): builds a Request from a URL
- parse(response): parses the page content
- log(message[, level, component]): writes a log entry; prefer the logger attribute instead, e.g. self.logger.info("visited success")
- closed(reason): called when the spider is closed
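To tie these together, a minimal sketch of a spider using several of the attributes and methods above (the domain, URL, and setting are placeholders, not part of the original example):

```python
import scrapy

class DemoSpider(scrapy.Spider):
    name = 'demo'                            # must be unique within the project
    allowed_domains = ['example.com']        # placeholder domain
    start_urls = ['http://example.com']      # initial URLs
    custom_settings = {'DOWNLOAD_DELAY': 1}  # overrides the global settings

    def parse(self, response):
        # extract data from the response here
        self.logger.info("visited success")  # log via the logger attribute

    def closed(self, reason):
        self.logger.info("spider closed: %s", reason)
```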
## CrawlSpider

The most commonly used spider, for crawling ordinary web pages. It adds two members (a sketch follows the list below):

- rules: crawling rules that define how links are followed and which parse callback handles each link
- parse_start_url(response): parses the responses of the initial URLs

Scrapy also ships with feed-oriented spiders: XMLFeedSpider, CSVFeedSpider, and SitemapSpider.
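A minimal CrawlSpider sketch showing both members, assuming the modern import paths and made-up link patterns:

```python
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class DemoCrawlSpider(CrawlSpider):
    name = 'demo_crawl'
    allowed_domains = ['example.com']  # placeholder domain
    start_urls = ['http://example.com']

    # rules: keep following pagination links, parse detail pages with parse_item
    rules = (
        Rule(LinkExtractor(allow=r'/page/\d+'), follow=True),
        Rule(LinkExtractor(allow=r'/item/\d+'), callback='parse_item'),
    )

    def parse_start_url(self, response):
        # called with the responses of the initial URLs
        self.logger.info("start url: %s", response.url)

    def parse_item(self, response):
        # placeholder callback: extract fields here
        pass
```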
## Selector

There are many libraries for parsing web pages, such as BeautifulSoup and lxml, but Scrapy uses its own Selector by default, which is comparatively convenient.

```python
from scrapy.selector import Selector
from scrapy.http import HtmlResponse

# Instantiate from text
body = '<html><body><span>good</span></body></html>'
Selector(text=body).xpath('//span/text()').extract()

# Instantiate from a response
response = HtmlResponse(url='http://example.com', body=body, encoding='utf-8')
Selector(response=response).xpath('//span/text()').extract()
```
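Selectors can also be chained, and CSS queries are supported alongside XPath; a small illustration with made-up HTML:

```python
from scrapy.selector import Selector

body = '<ul><li><a href="/a">A</a></li><li><a href="/b">B</a></li></ul>'
sel = Selector(text=body)

# Chaining: narrow down to each <li>, then query relative to it
for li in sel.xpath('//li'):
    print(li.xpath('a/@href').extract())

# The same attribute query, written as a CSS selector
print(sel.css('li a::attr(href)').extract())
```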
## Items

Create the project and the Spider:

```shell
# Create the project
scrapy startproject tutorial
# Create the Spider
scrapy genspider pm25 pm25.in
```

Define the Items:

```python
import scrapy

class Pm25CityItem(scrapy.Item):
    city_name = scrapy.Field()    # name of the city
    home_link = scrapy.Field()    # link to the city's data page
    city_pinyin = scrapy.Field()  # pinyin spelling of the city name
```

Flesh out the Spider:

```python
import scrapy
from tutorial.items import Pm25CityItem

class Pm25Spider(scrapy.Spider):
    name = "pm25"
    allowed_domains = ["pm25.in"]
    start_urls = ['http://www.pm25.in']

    def parse(self, response):
        sel = scrapy.Selector(response)
        citys = sel.xpath("//div[@class='all']/div[@class='bottom']/ul[@class='unstyled']/div[2]/li")
        city_items = []
        for city in citys:
            city_item = Pm25CityItem()
            href = ''.join(city.xpath('a/@href').extract()).strip()
            # byte-encode for the Python 2 MySQLdb pipeline below
            city_item['city_name'] = ''.join(city.xpath('a/text()').extract()).strip().encode("UTF-8")
            city_item['home_link'] = 'http://www.pm25.in' + href
            city_item['city_pinyin'] = href.split('/')[1]
            city_items.append(city_item)
        return city_items
```
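Before wiring up database storage, the extraction itself can be spot-checked by dumping the items to a file with Scrapy's built-in feed export:

```shell
scrapy crawl pm25 -o cities.json
```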
Configure the MySQL data source in settings.py:

```python
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'test'    # database name
MYSQL_USER = 'root'      # database user
MYSQL_PASSWD = '123456'  # database password
MYSQL_PORT = 3306        # database port
```

Enable the MySQL storage pipeline:

```python
ITEM_PIPELINES = {
    'tutorial.pipelines.MySQLStoreDataPipeline': 300,  # save items to the database
}
```

Store the data (tutorial/pipelines.py):

```python
from scrapy import log
from twisted.enterprise import adbapi
import MySQLdb
import MySQLdb.cursors

class MySQLStoreDataPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.save_city, item)
        query.addErrback(self.handle_error)
        return item

    # Insert the city into tbl_all_city, skipping pinyins that already exist
    def save_city(self, conn, item):
        conn.execute("""
            select 1 from tbl_all_city where city_pinyin = %s
        """, (item['city_pinyin'],))
        ret0 = conn.fetchone()
        if not ret0:
            ret1 = conn.execute("""
                insert into tbl_all_city(city_pinyin, city_name, home_link)
                values(%s, %s, %s)
            """, (item['city_pinyin'], item['city_name'], item['home_link'],))
            log.msg('save to tbl_all_city: %s' % ret1, level=log.INFO)

    # Error handling
    def handle_error(self, e):
        log.err(e)
```

Run the crawler:

```shell
scrapy crawl pm25
```
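The pipeline assumes a tbl_all_city table already exists; a schema along these lines would satisfy the queries above (the column names come from the code, the types and sizes are assumptions):

```sql
CREATE TABLE tbl_all_city (
    id          INT AUTO_INCREMENT PRIMARY KEY,
    city_pinyin VARCHAR(64)  NOT NULL,  -- dedup key used in save_city
    city_name   VARCHAR(64)  NOT NULL,
    home_link   VARCHAR(255) NOT NULL
) DEFAULT CHARSET=utf8;
```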