There are plenty of tutorials for this if you search online, so I won't repeat them here. Everything below assumes Python and Scrapy are already installed. The MySQL driver also needs to be installed in advance:
Run the following in a cmd terminal:
pip install PyMySQL
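To confirm the driver installed correctly, a quick one-line check from the same terminal:

python -c "import pymysql; print(pymysql.VERSION)"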
Then enter the following in the cmd terminal:
scrapy startproject Hospital
The result is as follows:
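For reference, the skeleton that scrapy startproject generates looks roughly like this (the exact files vary slightly between Scrapy versions):

Hospital/
    scrapy.cfg
    Hospital/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py

Next, declare the fields to collect in Hospital/items.py: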
import scrapy

class HospitalItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()          # name
    organization = scrapy.Field()  # type of institution
    address = scrapy.Field()       # address
    phone = scrapy.Field()         # phone number
    guahao = scrapy.Field()        # registration
    inspect = scrapy.Field()       # examinations
    check = scrapy.Field()         # physical check-ups
    transport = scrapy.Field()     # transportation
    yinshi = scrapy.Field()        # meals
    zhuxiu = scrapy.Field()        # accommodation
    zhuyuan = scrapy.Field()       # hospitalization
    carparking = scrapy.Field()    # parking
    charge = scrapy.Field()        # fees
    notice = scrapy.Field()        # admission requirements
    drug = scrapy.Field()          # medicine pickup
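A scrapy.Item behaves like a dict restricted to the declared fields, which both the spider and the pipeline below rely on; a small illustration:

item = HospitalItem()
item['name'] = u'某医院'       # declared fields assign like dict keys
print(item.get('phone', ''))  # .get() returns the default for fields not yet set
# item['intro'] = '...'       # would raise KeyError: 'intro' is not a declared field

That last property matters: the spider must only reference fields that actually appear in HospitalItem.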
Once the project is created, open it and move on to writing the spider.
First, import the relevant libraries. Since I ultimately store the scraped data in MySQL, a MySQL module has to be imported. Python 3 differs from Python 2 here: Python 2 used MySQLdb, while in Python 3 the import is import pymysql.
# -*- coding: utf-8 -*-
import scrapy
from Hospital.items import HospitalItem
from scrapy.conf import settings  # Scrapy settings accessor (see the note below)
from datetime import datetime     # date handling
import pymysql as mdb             # MySQL driver
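One caveat: from scrapy.conf import settings only works on older Scrapy releases; it was deprecated and later removed. If that import fails on your version, the replacement I believe is equivalent is:

from scrapy.utils.project import get_project_settings

settings = get_project_settings()  # supports the same settings['MYSQL_HOST'] style of access used below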
Next comes the main code block of the spider:
class YanglaoSpider(scrapy.Spider):
    name = "hospital"                # spider name
    allowed_domains = ["360jk.com"]  # once set, the spider only follows URLs under this domain
    start_urls = (
        'http://www.360jk.com/jibing/gxy/yiyuan?&level_id=0',
    )

    def __init__(self):
        # ========== initialisation: (re)create today's table
        now = datetime.now()  # current date
        today = str(now.year) + str(now.month).zfill(2) + str(now.day).zfill(2)  # date as a string, e.g. 20160101
        con = mdb.connect(host=settings['MYSQL_HOST'], user=settings['MYSQL_USER'],
                          passwd=settings['MYSQL_PASS'], db=settings['MYSQL_DB'], charset='utf8')
        cur = con.cursor()  # open the connection and get a cursor
        cur.execute("""DROP TABLE IF EXISTS `hospitals_%s`;""" % today)  # drop hospitals_<date> if it already exists
        cur.execute("""
            CREATE TABLE `hospitals_%s` (
                `id` int(11) NOT NULL AUTO_INCREMENT,
                `名称` varchar(15) DEFAULT NULL,
                `机构类型` varchar(20) DEFAULT NULL,
                `地址` varchar(50) DEFAULT NULL,
                `phone` varchar(30) DEFAULT NULL,
                `挂号` varchar(1000) DEFAULT NULL,
                `检查` varchar(1000) DEFAULT NULL,
                `体检` varchar(1000) DEFAULT NULL,
                `交通` varchar(100) DEFAULT NULL,
                `饮食` varchar(1000) DEFAULT NULL,
                `住宿` varchar(1000) DEFAULT NULL,
                `住院` varchar(1000) DEFAULT NULL,
                `停车` varchar(1000) DEFAULT NULL,
                `收费` varchar(1000) DEFAULT NULL,
                `入住须知` varchar(2000) DEFAULT NULL,
                `取药` varchar(1000) DEFAULT NULL,
                PRIMARY KEY (`id`)
            ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET='utf8';
        """ % today)  # create the hospitals_<date> table
        # a `简介` varchar(6000) column (hospital introduction) is left disabled here
        con.close()

    def parse(self, response):
        # ========== collect every hospital link on the listing page, and hand each response to parse_page
        for url in response.css('body > div.main > div > div.clearfix > div.main_part > div:nth-child(1) > div.box_body > ul > li > div.info_area > figure > div > a').xpath('@href').extract():
            yield scrapy.Request(response.urljoin(url), callback=self.parse_page)
            # break
        # ========== grab the next-page link at the bottom of the page and feed it back into parse
        next_url = response.css('body > div.main > div > div.clearfix > div.main_part > div:nth-child(1) > div.box_body > div.paginations > div > b.page_num > a').xpath('@href').extract()
        # the next-page link is the second to last of the extracted links, hence [-2]
        next_url = next_url[-2] if len(next_url) > 1 else next_url[0]
        yield scrapy.Request(response.urljoin(next_url), callback=self.parse)

    def parse_page(self, response):
        item = HospitalItem()  # create the item
        # give every field an empty default so later code never hits an unset key
        # (note: 'intro' is not declared in HospitalItem, so it must not be assigned here)
        item['name'], item['organization'], item['address'], item['phone'], \
            item['guahao'], item['inspect'], item['check'], item['transport'], \
            item['yinshi'], item['zhuxiu'], item['zhuyuan'], item['carparking'], \
            item['charge'], item['notice'], item['drug'] = ('',) * 15
        # ========== map the section ids on the page to item fields
        dic = {
            'guahao': 'guahao',       # registration
            'jiancha': 'inspect',     # examinations
            'tijian': 'check',        # physical check-ups
            'jiaotong': 'transport',  # transportation
            'tingche': 'carparking',  # parking
            'yinshi': 'yinshi',       # meals
            'zhuxiu': 'zhuxiu',       # accommodation
            'zhuyuan': 'zhuyuan',     # hospitalization
            'jiaofei': 'charge',      # fees
            # NOTE: a duplicate 'zhuyuan' key mapped to 'notice' would silently
            # overwrite the entry above; the page id of the admission-notice
            # section needs to be checked on the actual site
            'quyao': 'drug',          # medicine pickup
        }
        item['name'] = ''.join([i.strip() for i in response.css('#container_max > div.main_max > div:nth-child(2) > div.head_hospital_name > span.title::text').extract()])
        item['organization'] = ''.join([i.strip() for i in response.css('#container_max > div.main_max > div:nth-child(2) > div.head_hospital_name > span.tag::text').extract()])
        item['address'] = ''.join([i.strip() for i in response.css('#container_max > div.main_max > div:nth-child(2) > div.hospital_box > dl > dd:nth-child(4)::text').extract()])
        item['phone'] = ''.join([i.strip() for i in response.css('#container_max > div.main_max > div:nth-child(2) > div.hospital_box > dl > dd:nth-child(6)::text').extract()])
        for i in response.css('#container_max > div.main_max > div.article.bottom_20 > div.hospital_jygl.fold_box'):
            # join all the text inside the section, then split at the full-width colon "："
            text = ''.join([j.strip() for j in i.css('div.jygl_content.fold_body > p > span::text').extract()]).split(u'\uff1a')
            # if the split yields two parts, keep the second, e.g. "名称：养老院" -> "养老院"
            text = text[1] if len(text) == 2 else ''.join(text)
            try:
                # Python 3: the extracted id is already str, no .encode('utf-8') needed
                key = dic[i.xpath('@id').extract_first(default='')]
                item[key] = text  # the dict values match the item field names exactly
            except Exception:
                pass  # unknown section id: skip it
        return item
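The most fragile part of parse_page is the full-width-colon split, so here is that logic in isolation with a made-up sample string (the sample text is hypothetical; the logic is exactly the one above):

# -*- coding: utf-8 -*-
text = u'收费标准：按物价部门核定标准执行'.split(u'\uff1a')  # u'\uff1a' is the full-width colon "："
text = text[1] if len(text) == 2 else ''.join(text)  # keep what follows the label, if it split in two
print(text)  # prints: 按物价部门核定标准执行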
The item pipeline (pipelines.py) receives each item and writes it into MySQL:

# -*- coding: utf-8 -*-
from datetime import datetime
from twisted.enterprise import adbapi  # Twisted's asynchronous DB connection pool


class HospitalPipeline(object):

    def __init__(self, dbpool):
        # ========== initialisation
        self.dbpool = dbpool  # the connection (thread) pool
        now = datetime.now()  # today's date
        self.add_date = str(now.year) + str(now.month).zfill(2) + str(now.day).zfill(2)  # date as a string: 20160101

    @classmethod
    def from_settings(cls, settings):
        # ========== read the MySQL credentials from settings.py, forcing utf-8 so inserts don't fail
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DB'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASS'],
            charset='utf8',
            use_unicode=True,
        )
        # build a Twisted adbapi pool: runInteraction() below requires one,
        # and from_settings has to return the pipeline instance itself
        dbpool = adbapi.ConnectionPool('pymysql', **dbargs)
        return cls(dbpool)

    def process_item(self, item, spider):
        # ========== handle the item
        if item.get('name', '') != '':  # an item carrying a "name" field belongs to this spider
            query = ("insert IGNORE into `hospitals_" + self.add_date + "` "
                     "(名称,机构类型,地址,phone,挂号,检查,体检,交通,饮食,住宿,住院,停车,收费,入住须知,取药) "
                     "values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
            data = (item['name'], item['organization'], item['address'], item['phone'],
                    item['guahao'], item['inspect'], item['check'], item['transport'],
                    item['yinshi'], item['zhuxiu'], item['zhuyuan'], item['carparking'],
                    item['charge'], item['notice'], item['drug'])
            # run the db query in the thread pool; query and data are passed as
            # arguments rather than stored on self, so concurrent items cannot
            # overwrite each other
            d = self.dbpool.runInteraction(self._do_upsert, query, data)
            # d.addErrback(self._handle_error, item, spider)
            # at the end return the item in case of success or failure
            d.addBoth(lambda _: item)
            # returning the deferred instead of the item makes the engine process the
            # next item (according to CONCURRENT_ITEMS) after this operation finishes
            return d
        return item

    def _do_upsert(self, conn, query, data):
        """Perform the insert (duplicates are ignored by INSERT IGNORE)."""
        try:
            conn.execute(query, data)  # execute the MySQL statement
        except Exception as e:
            print('error========================================', e)
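Before running a full crawl, the INSERT can be tested by hand over a plain pymysql connection (a minimal sketch: the date suffix, password and sample values are placeholders, and the hospitals_20160101 table must already exist):

import pymysql

con = pymysql.connect(host='127.0.0.1', user='root', passwd='******',
                      db='gaoxueya', charset='utf8')
cur = con.cursor()
cur.execute("insert IGNORE into `hospitals_20160101` (名称,机构类型) values (%s,%s)",
            (u'测试医院', u'综合医院'))
con.commit()  # pymysql does not autocommit by default
con.close()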
The last step is mainly adding the local MySQL connection details to settings.py:
BOT_NAME = 'Hospital'

SPIDER_MODULES = ['Hospital.spiders']
NEWSPIDER_MODULE = 'Hospital.spiders'

COOKIES_ENABLED = True  # cookie switch; set to False to disable cookies and reduce the risk of being banned

MYSQL_HOST = '127.0.0.1'  # MySQL host
MYSQL_PORT = '3306'       # MySQL port
MYSQL_USER = 'root'       # MySQL user name
MYSQL_PASS = '******'     # MySQL password
MYSQL_DB = 'gaoxueya'     # MySQL database name

ITEM_PIPELINES = {
    'Hospital.pipelines.HospitalPipeline': 100,  # enable HospitalPipeline
}
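With items.py, the spider, pipelines.py and settings.py all in place, start the crawl from the project root using the spider name defined earlier:

scrapy crawl hospital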
The final rows written to the database look like this: