来自传智播客的爬虫学习视频。
1.建立一个新的爬虫:
scrapy genspider tencent "tencent.com"
2.编写 items.py
获取职位名称、详细信息等字段:
class TencentItem(scrapy.Item):
    """Container for one job posting scraped from the listing table."""

    # Job title (link text of the first table cell).
    name = scrapy.Field()
    # Link to the posting's detail page (href of the first table cell).
    detailLink = scrapy.Field()
    # Text of the second table cell.
    positionInfo = scrapy.Field()
    # Text of the third table cell (number of openings).
    peopleNumber = scrapy.Field()
    # Text of the fourth table cell (work location).
    workLocation = scrapy.Field()
    # Text of the fifth table cell (publish time).
    publishTime = scrapy.Field()
3.编写 tencent.py
# tencent.py
import re

import scrapy

from mySpider.items import TencentItem

# Matches the paging offset carried in the listing URL, e.g. "start=0".
_START_PARAM = re.compile(r'start=(\d+)')

# Each follow-up request advances the offset by this many rows.
_PAGE_STEP = 10


def _first_or_empty(row, query):
    """Return the first value ``query`` selects under ``row``, or u'' if none.

    Indexing ``extract()[0]`` directly raises IndexError whenever a table
    cell is empty, which would abort parsing of the whole page.
    """
    values = row.xpath(query).extract()
    return values[0] if values else u''


class TencentSpider(scrapy.Spider):
    """Crawl the job listing pages and yield one TencentItem per table row."""

    name = "tencent"
    allowed_domains = ["hr.tencent.com"]
    start_urls = [
        "http://hr.tencent.com/position.php?&start=0#a"
    ]

    def parse(self, response):
        """Yield an item for every matched row, then request the next page.

        Args:
            response: the downloaded listing page.

        Yields:
            TencentItem objects whose field values are UTF-8 encoded byte
            strings, followed by at most one scrapy.Request for the next
            page (only when this page produced at least one row and the URL
            carries a ``start=`` offset).
        """
        found_row = False
        for each in response.xpath('//*[@class="even"]'):
            found_row = True
            item = TencentItem()
            # Values are encoded to UTF-8 here, as in the original code, so
            # the item handed to the pipeline keeps the same shape.
            item['name'] = _first_or_empty(
                each, './td[1]/a/text()').encode('utf-8')
            item['detailLink'] = _first_or_empty(
                each, './td[1]/a/@href').encode('utf-8')
            item['positionInfo'] = _first_or_empty(
                each, './td[2]/text()').encode('utf-8')
            item['peopleNumber'] = _first_or_empty(
                each, './td[3]/text()').encode('utf-8')
            item['workLocation'] = _first_or_empty(
                each, './td[4]/text()').encode('utf-8')
            item['publishTime'] = _first_or_empty(
                each, './td[5]/text()').encode('utf-8')
            # Hand the scraped data to the item pipeline.
            yield item

        if not found_row:
            # A page without rows ends the crawl, exactly as before: the
            # original only issued follow-up requests from inside the loop.
            return

        offset = _START_PARAM.search(response.url)
        if offset is None:
            # No paging offset in the URL, so there is nothing to advance.
            return

        # Queue the next page once per page (the original yielded the same
        # request again for every row) and rewrite only the "start=" value
        # instead of every run of digits in the URL.
        next_start = int(offset.group(1)) + _PAGE_STEP
        next_url = _START_PARAM.sub(
            'start=%d' % next_start, response.url, count=1)
        yield scrapy.Request(next_url, callback=self.parse)
4.编写 pipelines.py 文件
import json


def _as_text(value):
    """Decode a UTF-8 byte string to text; return any other value unchanged."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


class TencentJsonPipeline(object):
    """Write every scraped item to ``tencent.json`` as one JSON object per line."""

    def __init__(self):
        # Binary mode: each line is encoded to UTF-8 explicitly before it is
        # written, so the output does not depend on the platform's default
        # text encoding.
        self.file = open('tencent.json', 'wb')

    def process_item(self, item, spider):
        """Serialize ``item`` as a single JSON line and append it to the file.

        Args:
            item: a mapping-like item; byte-string values are treated as
                UTF-8 and decoded so that they are JSON-serializable.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, unchanged.
        """
        record = dict(
            (key, _as_text(value)) for key, value in dict(item).items()
        )
        content = json.dumps(record, ensure_ascii=False) + "\n"
        # json.dumps returns text here; the file is binary, so encode once
        # at the I/O boundary. The isinstance guard keeps Python 2 working,
        # where an all-ASCII result is already a byte string.
        if not isinstance(content, bytes):
            content = content.encode('utf-8')
        self.file.write(content)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.file.close()
5.在 settings.py 里设置 ITEM_PIPELINES
# Item pipelines enabled for this project, mapped to their order value.
# Only the Tencent JSON pipeline is active; the other entries stay disabled.
ITEM_PIPELINES = {
    # 'mySpider.pipelines.SomePipeline': 300,
    # "mySpider.pipelines.ItcastJsonPipeline": 300,
    "mySpider.pipelines.TencentJsonPipeline": 300,
}
6.执行爬虫:scrapy crawl tencent
url