scrapy 知乎的模拟登录及抓取用户数据

时间 2019-11-19

原文原文链接

最近看了python的scrapy 框架并用其抓取了部分知乎用户数据，代码主要是集中在知乎登录和抓取时候的逻辑处理上。html

一、首先进入知乎登录页面zhihu.com/#sigin上，用xpath提取_xsrf参数，获取验证码的部分url，完整的url是由当前的时间戳和type参数构成，利用获得的url造成response，在函数handle_captcha对验证码提取并提示在终端输入验证码，最后再将登录的url、cookie、用户帐号、密码什么的from进去就能够登录成功了。下面是代码：node

# _*_coding:utf-8_*_

from scrapy.spider import CrawlSpider
from scrapy.http import Request, FormRequest
from scrapy.selector import Selector
from zhihu2 import config
from PIL import Image
import time
import json
import re
from zhihu2 import items


class ZhiHu_spider(CrawlSpider):
    name = 'zhihu2'
    allowed_domain = ['https://www.zhihu.com']

    def __init__(self, *args, **kwargs):
        super(ZhiHu_spider, self).__init__(*args, **kwargs)
        self.xsrf = ''
        self.headers = config.headers

    def start_requests(self):
        yield Request(
            'http://www.zhihu.com/#signin',
            meta={
                'cookiejar': 1
            },
            callback=self.post_login
        )

    def post_login(self, response):
        print 'parper login in '
        sel = Selector(response)
        self.xsrf = sel.xpath('//input[@name="_xsrf"]/@value').extract()[0]

        #验证码的获取 没有自动识别 识别率过低 因此手打
        str_time = str(time.time() * 1000)
        cap_url = 'https://www.zhihu.com/captcha.gif?r=' + str_time + '&type=login'
        print cap_url
        yield Request(
            cap_url,
            meta={'cookiejar': response.meta['cookiejar'],
                  '_xsrf': self.xsrf,
                  },

            headers=self.headers,
            callback=self.handle_captcha

        )

    def handle_captcha(self, response):
        with open('E:\\myscrapy\\captcha.gif', 'wb') as gif:
            gif.write(response.body)
        gif.close()
        Im = Image.open('E:\\myscrapy\\captcha.gif')
        Im.show()
        captcha = raw_input('enter your captcha:')

        yield FormRequest(
            'http://www.zhihu.com/login/phone_num', #s手机号登录， 对应的能够换成邮箱
            method='POST',
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            formdata={
                '_xsrf': self.xsrf,
                'password': '密码',
                'remember_me': 'true',
                'phone_num': '帐号',
                'captcha': captcha
            },
            callback=self.after_login,

        )

二、下面是登录以后获取关注人的信息，因为知乎第一次只会显示20个关注人，剩下的要post数据到www.zhihu.com/node/ProfileFolloweesList2python

才能又获取20个，因此在这要获取每一个人的关注人数并与20作对比。web

 # 获取我的主页
    def after_login(self, response):
        print response.body
        print 'login success'
        yield Request(
            'https://www.zhihu.com/people/你的id须要填写, #本身主页的网址 由于我没获取id 因此要输入本身主页的网址
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            callback=self.parse_people,
        ) 
  #获取关注人url
    def parse_people(self, response):
        # print 'ready'
        sel = Selector(response)
        follow_url = sel.xpath('//a[@class="item"]/@href').extract_first()
        if follow_url:
            compete_url = 'https://www.zhihu.com' + follow_url

            yield Request(
                compete_url,
                meta={
                    'cookiejar': response.meta['cookiejar'],
                      },
                headers=self.headers,
                callback=self.person_info,
        )

    #处理关注人的url 并获取信息
    def person_info(self, response):
        item = items.Zhihu2Item()
        count = 20
        sel = Selector(response)

        nikname = sel.xpath('//div[@class="title-section"]/a[@class="name"]/text()').extract_first()
        location = sel.xpath('//span[@class="location item"]/@title').extract_first()
        business = sel.xpath('//span[@class="business item"]/@title').extract_first()
        education = sel.xpath('//span[@class="education item"]/@title').extract_first()
        education_extra = sel.xpath('//span[@class="education-extra item"]/@title').extract_first()
        sex = sel.xpath('//span[@class="item gender"]/i/@class').extract_first().split('-')[-1]
        agree = sel.xpath('//span[@class="zm-profile-header-user-agree"]/strong/text()').extract_first()
        thanks = sel.xpath('//span[@class="zm-profile-header-user-thanks"]/strong/text()').extract_first()

        config.try_none(nikname)
        config.try_none(location)
        config.try_none(business)
        config.try_none(education)
        config.try_none(education_extra)
        config.try_none(sex)
        config.try_none(agree)
        config.try_none(thanks)

        peo_num = sel.xpath('/html/body/div[3]/div[2]/div[1]/a[1]/strong/text()').extract_first()
        item['nikname'] = nikname
        item['business'] = business
        item['education_extra'] = education_extra
        item['location'] = location
        item['education'] =education
        item['sex'] = sex
        item['agree'] = agree
        item['thanks'] = thanks

        if peo_num:

            people_urls = sel.xpath('//a[@class="zg-link author-link"]/@href').extract()
            for people_url in people_urls:
             yield Request(
                people_url,
                meta={'cookiejar': response.meta['cookiejar']},
                headers=self.headers,
                callback=self.person_info
            )

            peo_params = sel.xpath('//div[@class="zh-general-list clearfix"]/@data-init').extract_first()
            if peo_params:
                try:
                    values = json.loads(str(peo_params))
                except ValueError, e:
                    print e.message
                params = {}
                params['offset'] = 20
                params['order_by'] = 'created'
                params['hash_id'] = values['params']['hash_id']

                if count < peo_num:
                    params['offset'] = count
                    yield FormRequest(
                        'https://www.zhihu.com/node/ProfileFolloweesListV2',
                        method='POST',
                        meta={'cookiejar': response.meta['cookiejar']},
                        headers=self.headers,
                        formdata={
                            'method': 'next',
                            'params': json.dumps(params),
                            '_xsrf': self.xsrf,
                        },
                        callback=self.foolows_V2
                    )
                    count += 20
                else:
                    num = peo_num / 20
                    params['offset'] = num
                yield FormRequest(
                    'https://www.zhihu.com/node/ProfileFolloweesListV2',
                    method='POST',
                    meta={'cookiejar': response.meta['cookiejar']},
                    headers=self.headers,
                    formdata={
                        'method': 'next',
                        'params': json.dumps(params),
                        '_xsrf': self.xsrf,
                    },
                    callback=self.foolows_V2
                )

View Code

三、从上面url的response获取关注人的url,获得的url 交由parse_people函数处理，parse_people函数的response交由person_info函数处理，因此就造成了一个循环，不断的有url被提取，也不断的有数据被提取出来，下面是parse_people函数的代码：json

    def foolows_V2(self, response):
        p = re.compile(r'href="https://www\.zhihu\.com/people/(.*?)"')

        aa = json.loads(response.body)['msg']
        for item in aa:
            peo = p.search(item).group(1)
            followes_url = 'https://www.zhihu.com/people/' + str(peo)
            yield Request(
                followes_url,
                meta={'cookiejar': response.meta['cookiejar']},
                headers=self.headers,
                callback=self.parse_people
            )

View Code

下面是一些配置信息：cookie

cofig.pyapp

#_*_coding:utf-8_*_

from settings import USER_AGENT

headers = {

    'Host': 'www.zhihu.com',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Origin': 'https://www.zhihu.com',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': USER_AGENT,
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Referer': 'https://www.zhihu.com/',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
}


def try_none(tag):
    try:
        tag
    except:
        tag = 'none'
    return tag

View Code

items.py:框架

from scrapy import Item, Field


class Zhihu2Item(Item):
    nikname = Field()
    location = Field()
    business = Field()
    education = Field()
    education_extra = Field()
    sex = Field()
    thanks = Field()
    agree = Field()

View Code

代码没有维护已爬取的url和带爬取的url的重复，可能会致使重复抓取，代码的优化也挺烂的。但愿大神们多给点意见，若是代码有错误，但愿提出，以避免给别人误导。dom