scrapy startproject daili_ips
......
cd daili_ips/
# spider name and allowed domain
scrapy genspider xici xicidaili.com
In [1]: import requests

In [2]: r = requests.get('http://www.xicidaili.com/nn/1')

In [3]: r.status_code
Out[3]: 500
The request returns 500; my guess is that this is caused by the missing User-Agent header.
In [4]: headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}

In [5]: r = requests.get('http://www.xicidaili.com/nn/1', headers=headers)

In [6]: r.status_code
Out[6]: 200
Now the response comes back normally. So in the generated settings.py, uncomment the USER_AGENT setting and give it the same value:

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
In items.py, define the fields to store (these are the columns that will later be written to MySQL):
import scrapy


class DailiIpsItem(scrapy.Item):
    ip = scrapy.Field()
    port = scrapy.Field()
    position = scrapy.Field()
    type = scrapy.Field()
    speed = scrapy.Field()
    last_check_time = scrapy.Field()
# -*- coding: utf-8 -*-
import scrapy

from daili_ips.items import DailiIpsItem


class XiciSpider(scrapy.Spider):
    name = "xici"
    allowed_domains = ["xicidaili.com"]
    start_urls = (
        'http://www.xicidaili.com/',
    )

    def start_requests(self):
        res = []
        for i in range(1, 2):
            url = 'http://www.xicidaili.com/nn/%d' % i
            req = scrapy.Request(url)
            # collect a request for every page URL
            res.append(req)
        return res

    def parse(self, response):
        table = response.xpath('//table[@id="ip_list"]')[0]
        trs = table.xpath('.//tr')[1:]  # relative XPath, skip the header row
        items = []
        for tr in trs:
            pre_item = DailiIpsItem()
            pre_item['ip'] = tr.xpath('td[2]/text()').extract()[0]
            pre_item['port'] = tr.xpath('td[3]/text()').extract()[0]
            pre_item['position'] = tr.xpath('string(td[4])').extract()[0].strip()
            pre_item['type'] = tr.xpath('td[6]/text()').extract()[0]
            pre_item['speed'] = tr.xpath('td[7]/div/@title').re(r'\d+\.\d*')[0]
            pre_item['last_check_time'] = tr.xpath('td[10]/text()').extract()[0]
            items.append(pre_item)
        return items
While writing the spider, you can use the command-line tool scrapy shell <url> to test the XPath expressions for the data you want to extract; this is much more efficient than re-running the whole crawl.
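For example, after running scrapy shell 'http://www.xicidaili.com/nn/1' from inside the project directory (so the USER_AGENT from settings.py is applied), the row XPaths can be tried out interactively. A rough sketch; the actual output depends on the live page:

>>> table = response.xpath('//table[@id="ip_list"]')[0]
>>> tr = table.xpath('.//tr')[1]                  # first data row, header skipped
>>> tr.xpath('td[2]/text()').extract()            # should be the IP address
>>> tr.xpath('td[7]/div/@title').re(r'\d+\.\d*')  # the speed value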
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import MySQLdb


class DailiIpsPipeline(object):
    # this method must return a dict or an item object carrying the data
    def process_item(self, item, spider):
        DBS = spider.settings.get('DBS')
        con = MySQLdb.connect(**DBS)
        # make the MySQL connection use the utf8 character set
        con.set_character_set('utf8')
        cur = con.cursor()
        insert_sql = (
            "insert into proxy (ip, port, position, type, speed, last_check_time) "
            "values (%s,%s,%s,%s,%s,%s);"
        )
        values = (item['ip'], item['port'], item['position'], item['type'],
                  item['speed'], item['last_check_time'])
        # insert the row into the database
        try:
            cur.execute(insert_sql, values)
        except Exception as e:
            print "insert failed:", e
            con.rollback()
        else:
            con.commit()
        cur.close()
        con.close()
        return item
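The pipeline reads its connection parameters from a DBS setting, which is not shown in this post. A minimal sketch of what it could look like in settings.py; the host, user, password and database name are placeholders, and the keys simply mirror the keyword arguments of MySQLdb.connect():

# connection parameters consumed by MySQLdb.connect(**DBS) in the pipeline
DBS = {
    'host': '127.0.0.1',         # placeholder: your MySQL host
    'user': 'root',              # placeholder: your MySQL user
    'passwd': 'your_password',   # placeholder
    'db': 'daili_ips',           # placeholder: database that holds the proxy table
    'port': 3306,
}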
Note:
When I first wrote this I did not add the line con.set_character_set('utf8'), and the crawl failed with:

UnicodeEncodeError: 'latin-1' codec can't encode character
But I had already set the character set to utf8 when creating the table. After looking it up, it turns out that MySQLdb by default tries to convert everything to the latin1 character set, so the fix is to set the charset of the connection and the cursor to the encoding you actually want:
con = MySQLdb.connect(...)
# set the connection encoding
con.set_character_set('utf8')
cur = con.cursor()
# set the cursor encoding
cur.execute('SET NAMES utf8;')
cur.execute('SET CHARACTER SET utf8;')
cur.execute('SET CHARACTER_SET_CONNECTION=utf8;')
In my tests, setting only the connection (con) encoding was already enough to avoid the error, which is why the pipeline above does not set the cursor encoding.
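A possible shortcut, not used in this post: as far as I know MySQLdb.connect() also accepts a charset argument, so the encoding can be set at connect time instead of calling set_character_set() afterwards:

# alternative sketch: set the connection charset directly when connecting
con = MySQLdb.connect(host='127.0.0.1', user='root', passwd='your_password',
                      db='daili_ips', charset='utf8')   # placeholder credentials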
mysql> create table proxy(
    -> id int primary key auto_increment,
    -> ip varchar(20),
    -> port varchar(20),
    -> position varchar(20),
    -> type varchar(20),
    -> speed varchar(20),
    -> last_check_time varchar(20)
    -> )charset=utf8;
Query OK, 0 rows affected (0.01 sec)
Next, edit the settings.py file again; the item pipeline setting is commented out by default:
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'daili_ips.pipelines.SomePipeline': 300,
#}
Change it to:
ITEM_PIPELINES = {
    'daili_ips.pipelines.DailiIpsPipeline': 300,
}
The number after each pipeline is usually kept within 0-1000; when several pipelines are enabled it controls the execution order, and the pipeline with the smaller number runs first.
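For example, a hypothetical second pipeline (DuplicatesPipeline is only an illustrative name, it does not exist in this project) would run before the MySQL pipeline if it is given a smaller number:

ITEM_PIPELINES = {
    'daili_ips.pipelines.DuplicatesPipeline': 100,   # hypothetical: runs first
    'daili_ips.pipelines.DailiIpsPipeline': 300,     # then the MySQL pipeline
}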
Finally, run the spider:

scrapy crawl xici
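After the crawl finishes, a quick sanity check is to count the rows that were inserted. A sketch using the same placeholder credentials as above:

# count the proxies the spider stored
import MySQLdb

con = MySQLdb.connect(host='127.0.0.1', user='root', passwd='your_password',
                      db='daili_ips', charset='utf8')   # placeholder credentials
cur = con.cursor()
cur.execute('select count(*) from proxy;')
print cur.fetchone()[0]
cur.close()
con.close()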