Python爬取去哪儿网旅游景点信息并入库保存到MongoDB数据库操作

1、网页分析
1.在浏览器访问:http://touch.qunar.com/
这里写图片描述
网页的排版会变,但是衣服变了身体还是那个身体,按F12分析一波。
这里写图片描述
点击主页的“自由行”,鼠标点击搜索栏,按F12观察网页的标签,发现在JS中有我们需要的网页请求,所有的旅游地区都保存在JSON格式的列表里。
2.查看Headers
这里写图片描述
我们需要的HTTP请求为:https://touch.dujia.qunar.com/golfz/sight/arriveRecommend?dep=%E4%B8%8A%E6%B5%B7&exclude=&extensionImg=255,175&callback=jsonp_1535868021986_30812;通过分析发现,“dep”为地点(经过URL编码的城市名,例如这里的 %E4%B8%8A%E6%B5%B7 即“上海”)。接下来我们点击网页上的“丽江”来继续分析网页。
这里写图片描述
这次我们所要查看的是旅游城市所在地的所有旅游景点,所以通过查找发现所需内容在XHR中,并且我们需要将网页HTTP请求复制下来进行处理。
这里写图片描述
链接比较复杂,但是仔细查看就会发现,所要查找的内容依旧在“dep”关键字中,只不过被URL编码所替换,如果有需要可以下载转码工具对其转码。
2、实战
首先我们写一个简单版,对此我就不多啰嗦,关键地方我会在代码中给予注释。

# coding=utf-8
# au: Luo
# data:2018.08.30
import  urllib.request
import  requests # 爬虫requests库,很是强大好用
import  time
import pymongo # pymongo库,不懂能够查看上一篇博客
# Endpoint that returns the list of departure cities as JSON.
url = 'https://touch.dujia.qunar.com/depCities.qunar'

# MongoDB handles: local server -> database 'qunar' -> collection 'aunar_zyx'.
client = pymongo.MongoClient(host='localhost', port=27017)
book_qunar = client['qunar']
sheet_qunar_zyx = book_qunar['aunar_zyx']

# Request headers sent with the crawl requests. Per the author's experiments
# the cookie is mandatory: without it, later requests are answered with an
# "illegal request" message.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, sdch',
    'accept-language': 'zh-CN,zh;q=0.8',
    'cache-control': 'max-age=0',
    'upgrade-insecure-requests': '1',
    'user-agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/49.0.2623.75 Safari/537.36'
    ),
    # Session cookie captured from a browser; adjacent literals are joined
    # into one header value.
    'cookie': (
        'QN300=organic; QN1=eIQjm1uFGUGmVZroEV65Ag==; QN277=organic; '
        'csrfToken=jlP8SVo79YjStBWaCDZMTzfRdk8vhrer; QN269=D27F47D0AAA6'
        '11E8BB00FA163EF78B12; QN57=15354494109820.5188963907166866; '
        'i=VInJOQycvl1TIZs3ZNjHGo-NWE6q; _vi=wntLd-u3hhb623qZeAtQj5Re8O'
        'a8V_UFxv73OyS0PeOdzXMoC1PScrp4BJxGp_XlCJszxevRwtpqQ9XqrSDXGRtPO'
        '2F53lXYVNmkniXsuS4XTTttFgFbhwQ9vP-d0pQNBRMzPvI6WYZPvXzM_cpoVqVIm'
        'f_zF1VHFiXdrLBoyuEx; QN58=1535618769166%7C1535618769166%7C1; QN48'
        '=tc_363b9bcfab7eadcd_1658a8854b4_34a6; QN267=078324963ae656e5e; _'
        'RF1=116.228.53.168; _RSG=pf4OU7iqgs4zEmiQzYI7tA; _RDG=2822ea67142a'
        'a520563dbbaa984354cd9a; _RGUID=4daad2c1-de60-4a73-adc7-4a2f5ab303b'
        'd; PHPSESSID=sao7528ocg7qp4041ubv5f94a1; QN234=home_free_t; _pk_ref'
        '.1.8600=%5B%22%22%2C%22%22%2C1535700974%2C%22http%3A%2F%2Ftouch.quna'
        'r.com%2F%22%5D; _pk_id.1.8600=f10de5ca53398f77.1535627735.5.153570134'
        '6.1535699139.; _pk_ses.1.8600=*; QN243=137; QN205=organic; QN233=dujia_hy_destination'
    ),
}
# Fetch the index of departure cities. The loops below treat the JSON 'data'
# field as a mapping whose values are iterables of city-name strings.
strhtml = requests.get(url, headers=headers)
dep_dict = strhtml.json()

# Product-list endpoint. 'query' and 'originalquery' both carry the
# destination; 'limit' is "<offset>,28".
list_url = (
    'https://touch.dujia.qunar.com/list?modules=list%2CbookingInfo%2CactivityDetail'
    '&dep={dep}&query={query}&dappDealTrace=false'
    '&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C'
    '&cfrom=zyx&it=dujia_hy_destination&date=&configDepNew=&needNoResult=true'
    '&originalquery={query}&limit={offset},28&includeAD=true&qsact=search'
)

for dep_item in dep_dict['data']:
    for dep in dep_dict['data'][dep_item]:
        print(dep)
        dep_quoted = urllib.request.quote(dep)

        # Collect the distinct destination queries recommended for this
        # departure city, keeping first-seen order.
        a = []
        url = (
            'https://touch.dujia.qunar.com/golfz/sight/arriveRecommend'
            '?dep={}&exclude=&extensionImg=255,175'.format(dep_quoted)
        )
        time.sleep(1)  # throttle: one request per second
        # Send the same headers (incl. cookie) as every other request; the
        # original omitted them here although the cookie is noted as required.
        strhtml_1 = requests.get(url, headers=headers)
        arrive_dict = strhtml_1.json()
        for arr_item in arrive_dict['data']:
            for arr_item_1 in arr_item['subModules']:
                for query in arr_item_1['items']:
                    if query['query'] not in a:
                        a.append(query['query'])
        print(a)

        for item in a:
            print(item)
            item_quoted = urllib.request.quote(item)

            # Probe request: read the total number of routes for THIS
            # destination. (The original formatted the URL with the stale
            # loop variable query['query'], i.e. the last recommendation
            # seen, so the count belonged to the wrong destination.)
            url = list_url.format(dep=dep_quoted, query=item_quoted, offset=0)
            time.sleep(1)
            strhtml_2 = requests.get(url, headers=headers)
            routeCount = int(strhtml_2.json()['data']['limit']['routeCount'])

            # Page through the results and store each raw page.
            # NOTE(review): the offset advances by 20 while the URL requests
            # 28 rows per page, so consecutive pages may overlap - confirm
            # against the API before changing either number.
            for limit in range(0, routeCount, 20):
                url = list_url.format(dep=dep_quoted, query=item_quoted, offset=limit)
                time.sleep(1)
                page = requests.get(url, headers=headers)
                result = {
                    'date': time.strftime('%Y-%m-%d'),  # local date of the crawl
                    'dep': dep,
                    'arrive': item,
                    'limit': limit,
                    'result': page.json(),
                }
                sheet_qunar_zyx.insert_one(result)

代码运行结果:
这里写图片描述
数据库保存结果:
这里写图片描述
到此,爬虫工作就结束啦,我也在学习着将数据进行可视化,后续可能会更新这个版本的升级版,一步步进阶。

———————————————————————————————————————上传一个升级版:

# coding=utf-8
# au: Luo
# data:2018.09.02
import  urllib.request
import  requests
import  time
import pymongo

# MongoDB handles: local server -> database 'qunar' -> collection 'aunar_zyx'.
client = pymongo.MongoClient(host='localhost', port=27017)
book_qunar = client['qunar']
sheet_qunar_zyx = book_qunar['aunar_zyx']

# Request headers shared by every HTTP call; includes a browser session cookie.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, sdch',
    'accept-language': 'zh-CN,zh;q=0.8',
    'cache-control': 'max-age=0',
    'upgrade-insecure-requests': '1',
    'user-agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/49.0.2623.75 Safari/537.36'
    ),
    # Adjacent literals are joined into one header value.
    'cookie': (
        'QN300=organic; QN1=eIQjm1uFGUGmVZroEV65Ag==; QN277=organic; '
        'csrfToken=jlP8SVo79YjStBWaCDZMTzfRdk8vhrer; QN269=D27F47D0AAA6'
        '11E8BB00FA163EF78B12; QN57=15354494109820.5188963907166866; '
        'i=VInJOQycvl1TIZs3ZNjHGo-NWE6q; _vi=wntLd-u3hhb623qZeAtQj5Re8O'
        'a8V_UFxv73OyS0PeOdzXMoC1PScrp4BJxGp_XlCJszxevRwtpqQ9XqrSDXGRtPO'
        '2F53lXYVNmkniXsuS4XTTttFgFbhwQ9vP-d0pQNBRMzPvI6WYZPvXzM_cpoVqVIm'
        'f_zF1VHFiXdrLBoyuEx; QN58=1535618769166%7C1535618769166%7C1; QN48'
        '=tc_363b9bcfab7eadcd_1658a8854b4_34a6; QN267=078324963ae656e5e; _'
        'RF1=116.228.53.168; _RSG=pf4OU7iqgs4zEmiQzYI7tA; _RDG=2822ea67142a'
        'a520563dbbaa984354cd9a; _RGUID=4daad2c1-de60-4a73-adc7-4a2f5ab303b'
        'd; PHPSESSID=sao7528ocg7qp4041ubv5f94a1; QN234=home_free_t; _pk_ref'
        '.1.8600=%5B%22%22%2C%22%22%2C1535700974%2C%22http%3A%2F%2Ftouch.quna'
        'r.com%2F%22%5D; _pk_id.1.8600=f10de5ca53398f77.1535627735.5.153570134'
        '6.1535699139.; _pk_ses.1.8600=*; QN243=137; QN205=organic; QN233=dujia_hy_destination'
    ),
}
def _list_url(dep, item, offset):
    """Return the product-list URL for `dep` -> `item` starting at row `offset`."""
    return (
        'https://touch.dujia.qunar.com/list?modules=list%2CbookingInfo%2CactivityDetail'
        '&dep={dep}&query={query}&dappDealTrace=false'
        '&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C'
        '&cfrom=zyx&it=dujia_hy_destination&date=&configDepNew=&needNoResult=true'
        '&originalquery={query}&limit={offset},28&includeAD=true&qsact=search'
    ).format(
        dep=urllib.request.quote(dep),
        query=urllib.request.quote(item),
        offset=offset,
    )


def get_list(dep, item):
    """Crawl every result page for one departure/destination pair and store it.

    A first request reads ``data.limit.routeCount`` from the response; the
    result pages are then fetched at offsets 0, 20, 40, ... below that count,
    and each raw JSON page is inserted into ``sheet_qunar_zyx`` together with
    the crawl date, the departure city, the destination and the offset.

    Args:
        dep: Departure city name (unquoted).
        item: Destination query string (unquoted).

    Raises:
        KeyError / TypeError / ValueError: if the probe response lacks a usable
            ``data.limit.routeCount`` value.
    """
    # The original built this probe URL from the global loop variable
    # `query` instead of the `item` argument, so the count could belong to a
    # different destination (and the call failed with NameError whenever no
    # such global existed).
    time.sleep(1)  # extra throttle on top of the pause inside get_json
    routeCount = int(get_json(_list_url(dep, item, 0))['data']['limit']['routeCount'])

    # NOTE(review): the offset advances by 20 while the URL requests 28 rows
    # per page, so consecutive pages may overlap - confirm against the API.
    for limit in range(0, routeCount, 20):
        url = _list_url(dep, item, limit)
        time.sleep(1)
        result = {
            'date': time.strftime('%Y-%m-%d'),  # local date of the crawl
            'dep': dep,
            'arrive': item,
            'limit': limit,
            'result': get_json(url),
        }
        sheet_qunar_zyx.insert_one(result)

def get_json(url, timeout=10):
    """GET `url` with the shared request headers and return the decoded JSON body.

    Sleeps one second after the request so successive calls are throttled.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely and a single stalled
            connection would hang the whole crawl.

    Returns:
        The parsed JSON payload.

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
        ValueError: if the response body is not valid JSON.
    """
    strhtml = requests.get(url, headers=headers, timeout=timeout)
    time.sleep(1)
    return strhtml.json()

if __name__ == "__main__":
    # Fetch the departure-city index; 'data' is iterated as a mapping whose
    # values are iterables of city names.
    url = "https://touch.dujia.qunar.com/depCities.qunar"
    dep_dic = get_json(url)
    for dep_item in dep_dic['data']:
        for dep in dep_dic['data'][dep_item]:
            # Distinct destination queries recommended for this departure
            # city, in first-seen order.
            a = []
            url = (
                'https://touch.dujia.qunar.com/golfz/sight/arriveRecommend'
                '?dep={}&exclude=&extensionImg=255,175'
                .format(urllib.request.quote(dep))
            )
            arrive_dict = get_json(url)
            for arr_item in arrive_dict['data']:
                for arr_item_1 in arr_item['subModules']:
                    for query in arr_item_1['items']:
                        if query['query'] not in a:
                            a.append(query['query'])
            # Crawl each destination exactly once, after ALL modules have been
            # scanned. The original ran this loop inside the `arr_item` loop,
            # so destinations collected from earlier modules were re-crawled
            # (and re-inserted into MongoDB) once per additional module.
            for item in a:
                get_list(dep, item)