# Qunar (去哪儿)

'''
爬取去哪儿网全部城市自由行数据
爬取速度过快时会引起服务器返回错误
反反爬策略:设置cookies池和ip代理池以及延长爬虫休眠时间
'''
import requests
import time
from urllib.parse import quote
from multiprocessing import Pool
import pymongo
import random

def begin():
    """
    Yield every departure city listed by Qunar's departure-city endpoint.

    The HTTP request is issued lazily, on the first iteration of the
    generator. The JSON payload's ``data`` field is iterated for its keys,
    and each key's value is iterated in turn (presumably groups of city
    names -- confirm against the live API).

    :return: generator of departure cities
    """
    depurl = 'https://touch.dujia.qunar.com/depCities.qunar'
    grouped = requests.get(depurl).json()['data']
    for group_key in grouped:
        # each group holds the departure cities filed under that key
        yield from grouped[group_key]

def main(dep):
    """
    Collect the destinations recommended for one departure city and crawl them.

    The destination names are de-duplicated (the endpoint repeats some of
    them) while keeping first-seen order, then handed to ``get`` together
    with the departure city.

    :param dep: departure city
    :return: None
    """
    # non-ASCII city names must be percent-encoded before going into the URL
    encoded_dep = quote(dep)
    desurl = ('https://touch.dujia.qunar.com/golfz/sight/arriveRecommend'
              '?dep={}&exclude=&extensionImg=255,175').format(encoded_dep)
    # random pause to stay under the server's rate limiting
    time.sleep(random.randint(2, 4))
    payload = requests.get(desurl).json()

    # flatten data -> subModules -> items -> query
    queries = (
        entry['query']
        for module in payload['data']
        for sub_module in module['subModules']
        for entry in sub_module['items']
    )
    destinations = []
    for destination in queries:
        if destination in destinations:
            continue  # already recorded; the endpoint lists duplicates
        destinations.append(destination)
    get(destinations, dep)

def _result_url(dep, item, offset):
    """Build the search URL for the page of up to 28 results starting at ``offset``."""
    return ('https://touch.dujia.qunar.com/list?modules=list%2CbookingInfo%2CactivityDetail&dep={}&query={}'
            '&dappDealTrace=false&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C&cfrom=zyx&'
            'it=FreetripTouchin&date=&configDepNew=&needNoResult=true&originalquery={}&limit={},28&'
            'includeAD=true&qsact=search').format(quote(dep), quote(item), quote(item), offset)


def _fetch_page(dep, item, offset, headers):
    """Sleep 2-4 seconds, then request one result page and return its decoded JSON."""
    time.sleep(random.randint(2, 4))
    return requests.get(_result_url(dep, item, offset), headers=headers).json()


def _to_record(dep, item, entry):
    """Map one search-result entry to the document that is stored in MongoDB."""
    return {
        '时间': time.strftime('%Y-%m-%d'),
        '出发地': dep,
        '目的地': item,
        '价格': entry['price'],
        '天数': entry['accomInclude'],
        '亮点': entry['brightspots'],
        '出行工具': entry['backtraffic'],
        '类别': entry['ttsRouteType']
    }


def get(array,dep):
    """
    Crawl the free-travel search results from ``dep`` to every destination.

    For each destination the first page is fetched to read the total route
    count, then the results are walked in pages of 28. Every entry of every
    page is printed and stored through ``savetomongo``. A failure while
    handling one destination (network error, invalid JSON, missing field,
    storage error) is reported with ``print`` and that destination is
    skipped; the remaining destinations are still processed.

    :param array: list of destination cities
    :param dep: departure city
    :return: None
    """
    for item in array:
        # browser-like headers to get past the site's anti-crawler checks
        headers = {
            'cookie':'QN99=8770; QN1=eIQjmVtYQgbBDaEiPevvAg==; csrfToken=zKMVroGqYK6fdBphXg8rqQ3MpcaiZ7TZ; QN269=AA9586A58FEC11E88A24FA163E233FC1; QN601=3f55b4673bbd18ac3206bfea7c5996d3; QunarGlobal=10.86.213.148_6291bf49_164d0ba9dbf_-1a4d|1532510727219; _i=RBTKSaIAM3KBlurx6OwRjfuQ8pEx; QN300=auto_4e0d874a; QN163=0; QN6=auto_4e0d874a; QN48=tc_427b9f2555dccb4c_164d9787381_d960; _RSG=Ue4lzWGVuXAKnGpozKI.OB; _RDG=28c738c8ddc979203b2642a9f86b2ac273; _RGUID=a8787d08-3dbc-4a1e-b63e-494f72cd0c54; QN205=auto_4e0d874a; QN234=home_free_t; _vi=Xan8_FldA2NGBwqzRSKDNIYHisxd4ARxiomsg1mowQsC4OV3wCXnooJECkbZWsL9_3XGq9mmj5lTyMlGPRfgZD0jC_eS-Vas8fJyOdtOVO02USpBUqqwRZ1LfhiofVGvkPVi9NW0omogB1BkpWCaX2atkxba7uWItHjFuSd5R2NK; QN162=%E6%B7%B1%E5%9C%B3; QN233=FreetripTouchin; DJ12=eyJxIjoi5p2t5bee6Ieq55Sx6KGMIiwic3UiOiI4MDU5MjU4OTIiLCJkIjoi5rex5ZyzIiwiZSI6IkEiLCJsIjoiMCwyOCIsInRzIjoiZGQxNDZmZWYtMWY2NC00N2U5LWIyNjAtMTY0ODE2ZTlmYmQ0In0; _RF1=113.110.176.137; _pk_ref.1.8600=%5B%22%22%2C%22%22%2C1533395038%2C%22http%3A%2F%2Ftouch.qunar.com%2F%22%5D; _pk_ses.1.8600=*; _pk_id.1.8600=92302397325aca81.1533353790.5.1533395068.1533392908.; QN243=168',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'Referer': 'https://touch.dujia.qunar.com/p/list?cfrom=zyx&dep={}&query={}&it=FreetripTouchin&et=home_free_t'.format(quote(dep),quote(item))
        }

        # Fault tolerance: the JSON may lack expected fields and the server
        # may answer with an error page when crawled too fast. Only this
        # destination is given up on; the loop moves on to the next one.
        try:
            first_page = _fetch_page(dep, item, 0, headers)
            routecount = int(first_page['data']['limit']['routeCount'])
            for limit in range(0, routecount, 28):
                # the page at offset 0 is already in hand; do not request it twice
                page = first_page if limit == 0 else _fetch_page(dep, item, limit, headers)
                # store every entry of the page, not only the first one
                for entry in page['data']['list']['results']:
                    result = _to_record(dep, item, entry)
                    print(result)
                    savetomongo(result)
                time.sleep(1)
        except Exception as err:
            # Exception (not a bare except) so Ctrl-C / SystemExit still stop the crawl
            print('skipping {} -> {}: {!r}'.format(dep, item, err))
            continue

# Connection to the local MongoDB server.
# NOTE(review): this client is created at import time, i.e. before Pool()
# starts its worker processes; PyMongo documents MongoClient as not
# fork-safe -- consider creating the client inside each worker. Confirm.
client = pymongo.MongoClient(
    host='127.0.0.1',
    port=27017,
)
# Database that receives the crawl results.
db = client['qunar']
# Name of the collection inside ``db`` that records are written to.
collection = 'travel'


def savetomongo(result):
    """
    Store one crawl record in MongoDB.

    The record is written to ``db[collection]``, both of which are
    module-level names.

    :param result: dict describing one departure-to-destination search result
    :return: None
    """
    # Collection.insert() is deprecated since PyMongo 3.0 and removed in 4.0;
    # insert_one() is the supported way to insert a single document.
    db[collection].insert_one(result)


if __name__ == '__main__':
    deps = begin()
    try:
        # Pool() starts worker *processes* (one per CPU by default), not
        # threads. The with-block shuts the pool down once map() has
        # returned, or if map() raises.
        with Pool() as pool:
            # map() needs a concrete sequence, so the generator is drained first
            pool.map(main, list(deps))
    finally:
        # release the MongoDB connection even when the crawl fails
        client.close()