抗击新冠病毒(3)-探索在线数据资源

# default_exp digdata
# 上面一行用于nbdev中声明本模块的名称。必须是notebook的第一个Cell的第一行。

digdata

#hide
from nbdev.showdoc import *
#export 
from bs4 import BeautifulSoup
from parser import * #regex_parser
import re
import json
import time
import logging
import datetime
import requests
import pprint

获取网页数据

#export 
# Page that is downloaded below. The commented-out URL is an alternative
# endpoint kept for reference.
#url = "https://3g.dxy.cn/newh5/view/pneumonia"
url = "https://ncov.dxy.cn/ncovh5/view/pneumonia?from=singlemessage&isappinstalled=0"
# Send a desktop-browser user agent with every request made by the session.
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'
}
#export 
session = requests.session()
session.headers.update(headers)
# requests waits forever unless a timeout is given; bound the wait so a stalled
# connection cannot hang the import of this module.
r = session.get(url, timeout=30)
# Fail here with a clear HTTP error instead of later, when parsing an error page.
r.raise_for_status()
#export 
#pprint.pprint(r.text)
#export 
#soup = BeautifulSoup(r.content, 'lxml')
#soup

提取特定的数据域

# export 
# Four data sets are extracted: overall status, per-province status, per-city
# status within each province, and news.
# Parse the downloaded page first: every search below reads `soup`, and without
# this assignment the name is undefined (NameError).
soup = BeautifulSoup(r.content, 'lxml')
# Each search stringifies the <script> tag with the given id and captures the
# JSON text inside it. The results are regex match objects (or None if the
# pattern does not match); the JSON text is available via .group(0).
overall_information = re.search(r'\{("id".*?)\}', str(soup.find('script', attrs={'id': 'getStatisticsService'})))
province_information = re.search(r'\[(.*?)\]', str(soup.find('script', attrs={'id': 'getListByCountryTypeService1'})))
area_information = re.search(r'\[(.*)\]', str(soup.find('script', attrs={'id': 'getAreaStat'})))
news_information = re.search(r'\[(.*?)\]', str(soup.find('script', attrs={'id': 'getTimelineService'})))

一、整体状况

#pprint.pprint(overall_information.string)
#overall_information.group(0)
#jsall = json.loads(overall_information.group(0))

def overall_parser(overall_information):
    """Decode and clean the overall-statistics record.

    Args:
        overall_information: A regex match object whose ``group(0)`` is the
            JSON text of a single object.

    Returns:
        dict: The decoded record without the ``id``, ``createTime``,
        ``modifyTime``, ``imgUrl`` and ``deleted`` fields, and with
        ``countRemark`` (when present) stripped of spaces and its sections
        separated by full-width commas.
    """
    record = json.loads(overall_information.group(0))

    # Drop bookkeeping fields; tolerate records that lack some of them.
    for key in ('id', 'createTime', 'modifyTime', 'imgUrl', 'deleted'):
        record.pop(key, None)

    remark = record.get('countRemark')
    if isinstance(remark, str):
        # Mark the section boundaries with a full-width comma first, then
        # remove every remaining space.
        record['countRemark'] = (
            remark.replace(' 疑似', ',疑似')
            .replace(' 治愈', ',治愈')
            .replace(' 死亡', ',死亡')
            .replace(' ', '')
        )
    return record

#overall_information = json.loads(overall_information.group(0))

二、分省状况

#provinces = json.loads(province_information.group(0))
#provinces

def province_parser(province_information):
    """Decode and clean the per-province records.

    Args:
        province_information: A regex match object whose ``group(0)`` is the
            JSON text of a list of province objects.

    Returns:
        list: One cleaned record per province, in the original order.
    """
    provinces = json.loads(province_information.group(0))
    crawl_timestamp = ""
    parsed = []
    for province in provinces:
        province.pop('id')
        province['comment'] = province['comment'].replace(' ', '')
        province['crawlTime'] = crawl_timestamp
        #province['country'] = country_type.get(province['countryType'])
        province['tags'] = province['tags'].replace(' ', '')
        # NOTE(review): regex_parser comes from the project's `parser` module;
        # whether it returns a new record or edits `province` in place is not
        # visible here, so both cases are handled.
        result = regex_parser(content=province, key='tags')
        parsed.append(province if result is None else result)
    return parsed
#for province in provinces:
#    print(province['id'],'\t',province['provinceShortName'],'\t',province['tags'])

三、省内各市县状况

#area_information.string
# Decode the per-province records and print a header row followed by one
# summary row per province.
area = json.loads(area_information.group(0))
print("省份\t确诊\t疑似\t治愈\t死亡")
for a in area:
    print(
        a['provinceName'],
        a['confirmedCount'],
        a['suspectedCount'],
        a['curedCount'],
        a['deadCount'],
        sep=' \t ',
    )

按省提取城市状况

# Start with the city list of the first province; the loop below rebinds
# `cities` for every province in turn.
cities = area[0]['cities']
print("城市\t确诊\t疑似\t治愈\t死亡")
for p in area:
    cities = p['cities']
    # Province summary row, framed by separator lines.
    print("===================================")
    print(
        p['provinceName'],
        p['confirmedCount'],
        p['suspectedCount'],
        p['curedCount'],
        p['deadCount'],
        sep=' \t ',
    )
    print("-----------------------------------")
    # One row per city of this province.
    for c in cities:
        print(
            c['cityName'],
            c['confirmedCount'],
            c['suspectedCount'],
            c['curedCount'],
            c['deadCount'],
            sep=' \t ',
        )

四、新闻列表

# Decode the news records and print id, source and title for each entry,
# with surrounding whitespace stripped from the two text fields.
news = json.loads(news_information.group(0))
for n in news:
    print(n['id'], n['infoSource'].strip(), n['title'].strip(), sep=' \t ')

nbdev 实用工具

# Convert the notebooks into Python *.py code, saved in the sub-directory named after the project.

from nbdev.export import *
notebook2script()
Converted 00_digdata.ipynb.
Converted 01_getdata.ipynb.
Converted 10_charts.ipynb.
Converted 10_china.ipynb.
Converted index.ipynb.
# Print the built-in help text for notebook2script.
help(notebook2script)
Help on function notebook2script in module nbdev.export:

notebook2script(fname=None, silent=False, to_dict=False)
    Convert notebooks matching `fname` to modules