In [201]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import warnings
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import datasets
from pandas.plotting import parallel_coordinates
plt.rcParams['font.sans-serif'] = ['SimHei'] # use the SimHei font so Chinese text can be displayed in plots
plt.rcParams['axes.unicode_minus']=False # display minus signs in plots
warnings.filterwarnings("ignore") # do not show warnings
In [202]:
# Absolute, machine-specific location of the sales workbook.
sale_xls_path = 'C:\\Users\\91333\\Documents\\semester6\\data science\\第2周数据集:1-sale.xls'

# Read the workbook and use its '日期' (date) column as the row index.
sale_data = pd.read_excel(sale_xls_path, index_col='日期')
1) 前后五行
In [203]:
# First five rows (head() returns five rows by default).
sale_data.head()
Out[203]:
销量 | |
---|---|
日期 | |
2015-03-01 | 51.0 |
2015-02-28 | 2618.2 |
2015-02-27 | 2608.4 |
2015-02-26 | 2651.9 |
2015-02-25 | 3442.1 |
In [204]:
# Last five rows (tail() returns five rows by default).
sale_data.tail()
Out[204]:
销量 | |
---|---|
日期 | |
2014-08-06 | 2915.8 |
2014-08-05 | 2618.1 |
2014-08-04 | 2993.0 |
2014-08-03 | 3436.4 |
2014-08-02 | 2261.7 |
2) index of columns
In [205]:
# Column labels of the frame; the index column ('日期') is not part of them.
sale_data.columns
Out[205]:
Index(['销量'], dtype='object')
3) descriptive statistics
In [206]:
# Summary statistics, transposed so each column of the data becomes one row.
sale_data.describe().transpose()
Out[206]:
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
销量 | 200.0 | 2755.2147 | 751.029772 | 22.0 | 2451.975 | 2655.85 | 3026.125 | 9106.44 |
4) time-series plot
In [207]:
# Time-series line plot of the sales column against the date index.
# Title and axis labels are set first so the figure can be read on its own
# and the cell still ends with (and displays) the plt.plot(...) result.
plt.title('sales over time')
plt.xlabel('date')
plt.ylabel('sales')
plt.plot(sale_data, linewidth=1)
Out[207]:
[<matplotlib.lines.Line2D at 0x299198d0780>]
5) others
In [208]:
# (number of rows, number of columns) of the raw frame.
sale_data.shape
Out[208]:
(201, 1)
In [209]:
# Index type/range, per-column dtype and non-null count; a non-null count
# below the number of entries indicates missing values.
sale_data.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 201 entries, 2015-03-01 to 2014-08-02 Data columns (total 1 columns): 销量 200 non-null float64 dtypes: float64(1) memory usage: 3.1 KB
根据上面的描述性统计量,最大值远远大于均值,很有可能存在异常点,下面使用分位数的原理利用盒图识别和剔除异常点。
1) spot outliers and draw boxplot
In [210]:
# Draw a box plot of the sales data and label every outlier (flier) with its value.
plt.figure()
plt.title('boxplot of sales', fontsize=15)

# return_type='dict' exposes the matplotlib artists; `p` is reused by the
# outlier-removal cell, so its name must stay as is.
p = sale_data.boxplot(return_type='dict', showmeans=True, sym='*')

# Flier coordinates of the first (only) box. Sort x and y together into new
# arrays instead of calling y.sort(), which reorders the array returned by
# the artist in place and leaves x in its original order.
flier_x = np.asarray(p['fliers'][0].get_xdata())
flier_y = np.asarray(p['fliers'][0].get_ydata())
order = np.argsort(flier_y)
x = flier_x[order]
y = flier_y[order]

# Annotate each outlier with its value.
for i in range(len(x)):
    # Default: place the label slightly to the right of the marker.
    shift = 0.1
    if i > 0:
        gap = y[i] - y[i - 1]
        # The closer this flier is to the previous one, the further left its
        # label is pushed so neighbouring labels do not overlap. Two equal
        # fliers (gap == 0) would divide by zero, so they keep the default.
        if gap != 0:
            shift = 0.05 - 1.1 / gap
    plt.annotate(y[i], xy=(x[i], y[i]), xytext=(x[i] + shift, y[i]))
In the box plot above, outliers are drawn as stars and annotated with their values.
2) remove outliers
In [211]:
# Remove every row whose value was drawn as a flier (outlier) in the box plot.
# A single boolean mask is built against the original frame and applied once,
# so the mask always has the same length as the frame being filtered (the
# previous per-flier filtering indexed the shrinking frame with a mask built
# from the full frame). Rows with NaN are never flagged and are kept.
is_outlier = np.zeros(len(sale_data), dtype=bool)
for index, value in enumerate(sale_data.columns):
    # Flier artists are in column order: p['fliers'][index] belongs to column `value`.
    fliers_value_list = p['fliers'][index].get_ydata()
    is_outlier |= sale_data[value].isin(fliers_value_list).values
sale_data2 = sale_data[~is_outlier]
In [212]:
# (rows, columns) left after the outlier rows were filtered out.
sale_data2.shape
Out[212]:
(193, 1)
193 samples remain after the outliers are removed.
1) detect missing value
In [213]:
# Show the rows that contain at least one missing value. any(axis=1) yields
# one boolean per row, so a row appears once no matter how many of its cells
# are NaN (a 2-D `.values == True` mask repeats it once per NaN cell).
sale_data2[sale_data2.isnull().any(axis=1)]
Out[213]:
销量 | |
---|---|
日期 | |
2015-02-14 | NaN |
2) handle missing value
A. deletion
In [214]:
# Option A: drop the rows with missing values, then count the remaining
# missing values per column to confirm there are none.
sale_data3 = sale_data2.dropna()
sale_data3.isnull().sum()
Out[214]:
销量 0 dtype: int64
B. filling with mean value
In [215]:
# Option B: replace missing values with the column mean of the outlier-free data.
sale_data4 = sale_data2.fillna(sale_data2.mean())
In [216]:
# Show the mean-filled values at the rows that were missing before filling.
# The row-wise mask lists each such row once.
sale_data4[sale_data2.isnull().any(axis=1)]
Out[216]:
销量 | |
---|---|
日期 | |
2015-02-14 | 2740.654167 |
C. filling with mode
In [217]:
# Option C starts from the outlier-free frame; this binds a second name to
# the same object, it does not copy it.
sale_data5 = sale_data2
# List the most frequent value(s); more than one row means the mode is tied.
sale_data5.mode()
Out[217]:
销量 | |
---|---|
0 | 2618.2 |
1 | 2620.2 |
存在两个众数,取平均。
In [218]:
# Option C: fill missing values with the mode; when several modes tie, use
# their average. The fill value is computed from sale_data2 rather than from
# sale_data5 itself, so re-running this cell always gives the same result.
sale_data5 = sale_data2.fillna(sale_data2.mode().mean())
# Show the filled values at the rows that were missing before filling.
sale_data5[sale_data2.isnull().any(axis=1)]
Out[218]:
销量 | |
---|---|
日期 | |
2015-02-14 | 2619.2 |
D. filling with median
In [219]:
# Option D: fill missing values with the column median of the outlier-free data.
sale_data6 = sale_data2.fillna(sale_data2.median())
# Show the filled values at the rows that were missing before filling; the
# row-wise mask lists each such row once.
sale_data6[sale_data2.isnull().any(axis=1)]
Out[219]:
销量 | |
---|---|
日期 | |
2015-02-14 | 2655.85 |