机器学习参考篇: python+sklearn+kaggle机器学习
用python+sklearn(机器学习)实现天气预报数据 数据
用python+sklearn(机器学习)实现天气预报 准备
用python+sklearn(机器学习)实现天气预报 模型和使用html5
# 爬取数据连接 url = "http://www.meteomanz.com/sy2?l=1&cou=2250&ind=59287&d1=" + str(week_ago.day).zfill(2) + "&m1=" + str(week_ago.month).zfill(2) + "&y1=" + str(week_ago.month) + "&d2=" + str(week_pre.day - years[0]).zfill(2) + "&m2=" + str(week_pre.month).zfill(2) + "&y2=" + str(week_pre.year - years[1])
改为:
# 爬取数据连接 url = "http://www.meteomanz.com/sy2?l=1&cou=2250&ind=59287&d1=" + str(week_ago.day).zfill(2) + "&m1=" + str( week_ago.month).zfill(2) + "&y1=" + str(week_ago.year - years[0]) + "&d2=" + str(week_pre.day).zfill(2) + "&m2=" + str(week_pre.month).zfill(2) + "&y2=" + str(week_pre.year - years[1])
在上一篇教程里咱们已经知道了数据来源网页的规则,因此这一篇就讲如何用爬虫获取数据,以及机器学习的数据预处理阶段。
爬虫这方面可以参考我以前的一篇文章。
首先咱们主要要爬取去年今日的半个月前到去年今日的数据,而根据上一篇咱们得出的网址规则,咱们可以得到如下链接(PS:真正的链接里是没有换行的):
http://www.meteomanz.com/sy2?l=1&cou=2250&ind=59287 &d1=去年今日的半个月前的日 &m1=去年今日的半个月前的月份 &y1=去年年份 &d2=去年今日的日 &m2=去年今日的月份 &y2=去年年份
而为何是取去年、时间要取半个月呢?由于去年的天气环境相比于前年或者更久以前,和咱们如今的天气条件更类似,能够减小偏差;取半个月而不是一个星期,是由于更多的数据量能够减小偏差;不取一个月则是由于网站的限制,并且在实验中也会增长少许的偏差。因此最终取用了去年和半个月的时间。
若是咱们是只测今天这一次上面的网址就能够人工填写,可是若是咱们要作不用人工填就要用datetime
这个python库
如下:
import datetime as DT

# Take the current timestamp once so both offsets are measured from the same instant.
today = DT.datetime.now()
# Start of the window: b[0] days before today.
days_back = DT.timedelta(days=b[0])
week_ago = (today - days_back).date()
# End of the window: b[1] days after today.
days_ahead = DT.timedelta(days=b[1])
week_pre = (today + days_ahead).date()
咱们传入b = [15, 0]
,就能够获取上个半月的日期在week_ago
里,今天的日期在week_pre
里
因此,能够用这一行构建须要的网址:
# Link to scrape: d1/m1/y1 describe the start date, d2/m2/y2 the end date.
# Day and month are zero-padded to two digits; each year is shifted back by
# the matching entry of `years`.
url = (
    "http://www.meteomanz.com/sy2?l=1&cou=2250&ind=59287"
    f"&d1={week_ago.day:02d}&m1={week_ago.month:02d}&y1={week_ago.year - years[0]}"
    f"&d2={week_pre.day:02d}&m2={week_pre.month:02d}&y2={week_pre.year - years[1]}"
)
其中.zfill(2)
是指填充2位,好比若是是1就返回01,若是是12就返回12
有了网址,接下来就是用爬虫爬取网页,而后分析网页元素,取出里面的数据。
首先先写爬虫部分,这部分很简单,写了个GetData
class
# -*- coding: utf-8 -*-
# @Time: 2020/12/16
# @Author: Eritque arcus
# @File: GetData.py
# Purpose: download a web page
import urllib3

# Browser-like request headers used when the caller does not supply its own.
_DEFAULT_HEADERS = {
    'Connection': 'Keep-Alive',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,'
              '*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, '
                  'like Gecko) Chrome/87.0.4280.66 Mobile Safari/537.36 ',
    'Host': 'www.meteomanz.com'
}


class GetData:
    """Fetch the raw body of a single URL with an HTTP GET request."""

    # Class-level defaults kept for backward compatibility; every instance
    # overwrites both in __init__.
    url = ""
    headers = ""

    def __init__(self, url, header="", timeout=30.0):
        """
        :param url: address to fetch
        :param header: request headers; the empty string (default) selects the built-in header set
        :param timeout: seconds to wait for the server before giving up
        """
        self.url = url
        self.timeout = timeout
        if header == "":
            # Copy so that changes made through one instance never leak into the shared default.
            self.headers = dict(_DEFAULT_HEADERS)
        else:
            self.headers = header

    def Get(self):
        """
        :return: body of the page at ``self.url`` as bytes
        :raises urllib3.exceptions.HTTPError: if the server answers with a 4xx/5xx status
        """
        http = urllib3.PoolManager()
        # Without an explicit timeout an unresponsive server would block forever.
        response = http.request('GET', self.url, headers=self.headers, timeout=self.timeout)
        if response.status >= 400:
            # Fail here instead of handing an error page to the HTML parser.
            raise urllib3.exceptions.HTTPError(
                "GET %s failed with HTTP status %d" % (self.url, response.status))
        return response.data
本处用了urllib3库和GET方式,其中headers是请求头,这部分能够按F12调出开发者工具,在Network那一栏点击任意一个请求,往下滑就能看到,也能够直接用我的。请求头主要是http协议里的东西,想要了解能够自行搜索。
本处使用了BeautifulSoup
库
g = GetData(url).Get()
# Parse the page with BeautifulSoup
soup = BeautifulSoup(g, "html5lib")
# Take the <tbody> element
tb = soup.find(name="tbody")
# Take every <tr> inside it
past_tr = tb.find_all(name="tr")
for tr in past_tr:
    # Take every <td> of this row
    text = tr.find_all(name="td")
    flag = False
    for i, cell in enumerate(text):
        if i == 0:
            text[i] = cell.a.string
            # Site bug: a request that spans two months also returns a "day 0" row
            # for the month, e.g. 00/11/2020 (day/month/year). The date does not
            # exist and the row is empty, so it is flagged to be dropped.
            if "00/" in text[i]:
                flag = True
        elif i == 8:
            # Strip the "/8" suffix, which is only a display format of the page
            text[i] = cell.string.replace("/8", "")
        elif i == 5:
            # Strip the Hpa unit
            text[i] = cell.string.replace(" Hpa", "")
        elif i == 6:
            # Strip the degree sign and the bracketed compass letters from the wind
            # direction. The pattern is a character class, so it removes each listed
            # character individually.
            text[i] = re.sub(u"[º(.*?|N|W|E|S)]", "", cell.string)
        else:
            # Plain text content of the cell
            text[i] = cell.string
        # Missing data is filled with 2 (crude approach); doing so gave MAE=3.6021.
        # Compare the whole field: a substring replace would also rewrite the minus
        # sign of a negative reading ("-3.5" -> "23.5").
        if text[i] in ("-", "Tr"):
            text[i] = "2"
若是有什么不清楚的评论里答复。
import csv

# Open the output file; the with-block closes it even if a write fails.
# newline='' leaves line-ending handling to the csv module.
with open(c, 'w', encoding='utf-8', newline='') as f:
    # Build a csv writer on top of the file object
    csv_writer = csv.writer(f)
    # Write one row: the `text` list
    csv_writer.writerow(text)
Write.py
# -*- coding: utf-8 -*-
# @Time: 2020/12/16
# @Author: Eritque arcus
# @File: Write.py
import re
from bs4 import BeautifulSoup
from GetData import GetData
import datetime as DT
import csv

# Header row of the generated CSV; one name per table column that is kept.
CSV_HEADER = ["Time", "Ave_t", "Max_t", "Min_t", "Prec", "SLpress", "Winddir", "Windsp", "Cloud"]

# Value written for a missing ("-") or "Tr" field.
# Filling with 2 is a crude approach; doing so gave MAE=3.6021.
MISSING_VALUE = "2"

# Character class (not a group): removes each of these characters individually,
# which strips the degree sign, the brackets and the compass letters from the
# wind-direction cell.
_WIND_NOISE = re.compile(u"[º(.*?|N|W|E|S)]")


def a(t):
    """Return ``t`` with every " - " replaced by "0"."""
    return t.replace(" - ", "0")


def _build_url(years, b, station_id):
    """Build the meteomanz query URL for the requested date window."""
    today = DT.datetime.now()
    # Start of the window: b[0] days before today
    week_ago = (today - DT.timedelta(days=b[0])).date()
    # End of the window: b[1] days after today
    week_pre = (today + DT.timedelta(days=b[1])).date()
    return (
        f"http://www.meteomanz.com/sy2?l=1&cou=2250&ind={station_id}"
        f"&d1={week_ago.day:02d}&m1={week_ago.month:02d}&y1={week_ago.year - years[0]}"
        f"&d2={week_pre.day:02d}&m2={week_pre.month:02d}&y2={week_pre.year - years[1]}"
    )


def _clean_row(cells):
    """Turn the <td> cells of one table row into CSV fields.

    :return: (fields, skip); ``skip`` is True for the bogus "day 0" row
    """
    fields = []
    skip = False
    # Only the columns named in CSV_HEADER are kept.
    for i, cell in enumerate(cells[:len(CSV_HEADER)]):
        if i == 0:
            value = cell.a.string
            # Site bug: it also returns a "day 0" row for the month,
            # e.g. 00/11/2020, which has to be dropped.
            if "00/" in value:
                skip = True
        elif i == 8:
            # Strip the "/8" suffix, which is only a display format of the page
            value = cell.string.replace("/8", "")
        elif i == 5:
            # Strip the unit
            value = cell.string.replace(" Hpa", "")
        elif i == 6:
            value = _WIND_NOISE.sub("", cell.string)
        else:
            # Plain text content of the cell
            value = cell.string
        # Whole-field comparison, so a negative number keeps its minus sign.
        if value in ("-", "Tr"):
            value = MISSING_VALUE
        fields.append(value)
    return fields, skip


# Purpose: write the csv
def write(years, b, c, station_id="59287"):
    """
    Download one date window of daily weather data and save it as a CSV file.

    :param years: [years to subtract from the start date, years to subtract from the end date]
    :param b: [days before today for the start date, days after today for the end date]
    :param c: csv file name
    :param station_id: station id used in the URL; 59287 is Guangzhou, 54857 is Qingdao
    :return: None
    :raises ValueError: if the downloaded page contains no <tbody>
    """
    url = _build_url(years, b, station_id)
    # Show the address the data set is fetched from
    print(url)
    g = GetData(url).Get()
    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(g, "html5lib")
    tb = soup.find(name="tbody")
    if tb is None:
        raise ValueError("no <tbody> found in the page returned by " + url)
    past_tr = tb.find_all(name="tr")

    # The file is opened only after the download succeeded, so a failed fetch
    # does not truncate an existing CSV; the with-block always closes it.
    with open(c, 'w', encoding='utf-8', newline='') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(CSV_HEADER)
        for tr in past_tr:
            cells = tr.find_all(name="td")
            if not cells:
                # A row without <td> cells carries no data.
                continue
            fields, skip = _clean_row(cells)
            if not skip:
                csv_writer.writerow(fields)


# Alias for callers that use the capitalised name.
Write = write
GetData.py
# -*- coding: utf-8 -*-
# @Time: 2020/12/16
# @Author: Eritque arcus
# @File: GetData.py
# Purpose: download a web page
import urllib3

# Browser-like request headers used when the caller does not supply its own.
_DEFAULT_HEADERS = {
    'Connection': 'Keep-Alive',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,'
              '*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, '
                  'like Gecko) Chrome/87.0.4280.66 Mobile Safari/537.36 ',
    'Host': 'www.meteomanz.com'
}


class GetData:
    """Fetch the raw body of a single URL with an HTTP GET request."""

    # Class-level defaults kept for backward compatibility; every instance
    # overwrites both in __init__.
    url = ""
    headers = ""

    def __init__(self, url, header="", timeout=30.0):
        """
        :param url: address to fetch
        :param header: request headers; the empty string (default) selects the built-in header set
        :param timeout: seconds to wait for the server before giving up
        """
        self.url = url
        self.timeout = timeout
        if header == "":
            # Copy so that changes made through one instance never leak into the shared default.
            self.headers = dict(_DEFAULT_HEADERS)
        else:
            self.headers = header

    def Get(self):
        """
        :return: body of the page at ``self.url`` as bytes
        :raises urllib3.exceptions.HTTPError: if the server answers with a 4xx/5xx status
        """
        http = urllib3.PoolManager()
        # Without an explicit timeout an unresponsive server would block forever.
        response = http.request('GET', self.url, headers=self.headers, timeout=self.timeout)
        if response.status >= 400:
            # Fail here instead of handing an error page to the HTML parser.
            raise urllib3.exceptions.HTTPError(
                "GET %s failed with HTTP status %d" % (self.url, response.status))
        return response.data
到时候就能够直接用一行命令取得天气数据了,例如下面是取去年今日的20天前到去年今日的天气数据
# Use data from recent years as the training set.
# E.g. [1, 1], [20, 0] covers the 20 days leading up to today's date one year
# ago (2019 when this was written).
# Write the csv; the function defined in Write.py is named `write`.
write([1, 1], [20, 0], "weather_train_train.csv")
结果以下
weather_train_train.csv
Time,Ave_t,Max_t,Min_t,Prec,SLpress,Winddir,Windsp,Cloud 07/12/2019,14.8,20.8,8.8,0.0,1026.3,331,11,0 06/12/2019,15.2,19.8,10.7,0.0,1026.6,344,15,0 05/12/2019,14.5,20.4,8.6,2,1026.2,346,13,8 04/12/2019,13.8,20.4,7.1,0.0,1024.7,335,16,2 03/12/2019,13.0,18.9,7.1,0.0,1024.8,330,10,0 02/12/2019,18.2,24.9,11.5,0.0,1024.8,347,18,3 01/12/2019,18.1,24.9,11.4,0.0,1020.9,332,16,1 30/11/2019,17.5,23.6,11.4,0.0,1020.5,352,8,3 29/11/2019,15.8,20.1,11.5,0.0,1023.6,349,11,4 28/11/2019,20.4,27.1,13.8,0.0,1024.5,337,19,3 27/11/2019,21.9,27.1,16.6,0.0,1021.3,336,12,0 26/11/2019,22.2,28.4,16.1,0.0,1021.1,356,6,6 25/11/2019,22.2,29.3,15.2,0.0,1020.8,344,13,3 24/11/2019,21.4,29.3,13.6,0.0,1018.5,346,5,0 23/11/2019,20.7,28.4,13.0,0.0,1017.2,352,5,1 22/11/2019,19.6,27.6,11.6,0.0,1017.3,331,6,0 21/11/2019,18.4,25.1,11.6,0.0,1019.1,323,9,1 20/11/2019,18.3,24.2,12.4,0.0,1020.3,338,7,0 19/11/2019,19.1,25.4,12.8,0.0,1020.5,342,11,0 18/11/2019,22.2,28.8,15.7,0.0,1018.8,342,17,0 17/11/2019,22.2,28.8,15.7,0.0,1015.2,358,7,3
若是要把上面的数据做为数据集训练,咱们还须要作些数据的预处理,由于有些状况下咱们获得的数据会有残缺,这种状况咱们就要选择抛弃那一列,或者用均值或其余方法填充缺乏的数据。
由于在我已经决定把数据里丢失的项所有取2了,因此下面我会列出可能的解决方法而不使用。
新建个ProcessData.py
里创建ProcessData
方法以得到数据
# -*- coding: utf-8 -*-
# @Time: 2020/12/16
# @Author: Eritque arcus
# @File: ProcessData.py
from Write import write
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import seaborn as sns
import matplotlib.pyplot as plt


def _read_weather_csv(path):
    """Load one CSV produced by ``write``, indexed by its day-first "Time" column."""
    # Dates are written as dd/mm/yyyy; without dayfirst "07/12/2019" would be read as July 12.
    return pd.read_csv(path, index_col="Time", parse_dates=True, dayfirst=True)


def _as_frame(values, like):
    """Wrap an imputed array in a DataFrame that carries the column names of ``like``."""
    return pd.DataFrame(values, columns=like.columns)


# Purpose: data preprocessing
def ProcessData():
    """
    Download the three weather CSVs, split the training data and impute missing values.

    :return: [X_train: training features,
              X_valid: validation split of the training features,
              y_train: training targets,
              y_valid: validation split of the training targets,
              imputed_X_test: features to predict from]
    """
    # Use data from recent years as the training set.
    # E.g. [1, 1], [20, 0] covers the 20 days leading up to today's date one year ago.
    # Write the csv files.
    write([1, 1], [20, 0], "weather_train_train.csv")
    write([1, 1], [0, 20], "weather_train_valid.csv")
    write([0, 0], [20, 0], "weather_test.csv")
    X_test = _read_weather_csv("weather_test.csv")
    # Read the training features and training targets
    X = _read_weather_csv("weather_train_train.csv")
    y = _read_weather_csv("weather_train_valid.csv")
    # Dropping every column whose data is entirely missing raised MAE to 3.7, so it is not used
    # dxtcol = [col for col in X_test.columns
    #           if X_test[col].isnull().all()]
    # dxcol = [col for col in X.columns
    #          if X[col].isnull().all()]
    # dycol = [col for col in y.columns
    #          if y[col].isnull().all()]
    # for a1 in [dxtcol, dxcol, dycol]:
    #     for a2 in a1:
    #         if a2 in X_test.columns:
    #             X_test = X_test.drop(a2, axis=1)
    #         if a2 in X.columns:
    #             X = X.drop(a2, axis=1)
    #         if a2 in y.columns:
    #             y = y.drop(a2, axis=1)
    # Normalisation / standardisation cannot be undone afterwards, so it is not used
    # scaler = preprocessing.StandardScaler()
    # pars = [cols for cols in X.columns if cols != "Time"]
    # for data in [X, y, X_test]:
    #     for par in pars:
    #         data[par] = scaler.fit_transform(data[par].values.reshape(-1, 1))
    #         # temp = scaler.fit(data[par].values.reshape(-1, 1))
    #         # data[par] = scaler.fit_transform(data[par].values.reshape(-1, 1), temp)
    # Fill missing values with the column mean (the default SimpleImputer strategy).
    # Features and targets get separate imputers so that fitting one never
    # overwrites the statistics of the other.
    x_imputer = SimpleImputer()
    y_imputer = SimpleImputer()
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
    imputed_X_train = _as_frame(x_imputer.fit_transform(X_train), X_train)
    imputed_X_valid = _as_frame(x_imputer.transform(X_valid), X_valid)
    imputed_y_train = _as_frame(y_imputer.fit_transform(y_train), y_train)
    imputed_y_valid = _as_frame(y_imputer.transform(y_valid), y_valid)
    # The prediction features reuse the statistics learned from the training
    # features and keep the same column names as the other frames.
    imputed_X_test = _as_frame(x_imputer.transform(X_test), X_test)
    # Line plots
    # sns.lineplot(data=X)
    # plt.show()
    # sns.lineplot(data=y)
    # plt.show()
    # sns.lineplot(data=X_test)
    # plt.show()
    # Return the split data sets
    return [imputed_X_train, imputed_X_valid, imputed_y_train, imputed_y_valid, imputed_X_test]