pandas数据处理进阶

时间 2019-11-10

标签：pandas 数据处理进阶　栏目：大数据

原文原文链接

　　1、pandas的统计分析

　　一、关于pandas 的数值统计(统计detail 中的单价的相关指标)

import pandas as pd

# Load the order-detail workbook into a DataFrame.
detail = pd.read_excel("./meal_order_detail.xlsx")

# Basic structure of the table: contents, column labels, shape and dtypes.
print("detail :\n", detail)
print("detail 的列索引名称:\n", detail.columns)
print("detail 的形状:\n", detail.shape)
print("detail 数据类型:\n", detail.dtypes)

# Slice the two numeric columns once instead of re-slicing for every statistic.
amounts = detail.loc[:, 'amounts']
counts = detail.loc[:, 'counts']

# Location statistics of `amounts`.
print("amounts 的最大值：\n", amounts.max())
print("amounts 的最小值：\n", amounts.min())
print("amounts 的均值：\n", amounts.mean())
print("amounts 的中位数：\n", amounts.median())
print("amounts 的方差：\n", amounts.var())
print("amounts 的describe：\n", amounts.describe())

# describe() on a two-column selection summarises both columns side by side.
print("counts 和 amounts 的describe：\n", detail.loc[:, ['counts', 'amounts']].describe())
print("amounts 的describe：\n", amounts.describe())
print("counts 的describe：\n", counts.describe())

# Range (max - min). Series.ptp() was removed in pandas 1.0, so compute it
# from max() and min() directly.
print("amounts 的极差：\n", amounts.max() - amounts.min())
print("amounts 的标准差：\n", amounts.std())

# mode() returns a Series rather than a scalar because several values can tie.
print("amounts 的众数：\n", amounts.mode())
print("counts 的众数：\n", counts.mode())

# count() ignores missing values.
print("amounts 的非空值的数目：\n", amounts.count())

# idxmax()/idxmin() return the index *label* of the extreme value
# (the label-based counterparts of np.argmax / np.argmin).
print("amounts 的最大值的位置：\n", amounts.idxmax())
print("amounts 的最小值的位置：\n", amounts.idxmin())

　　二、pandas对于非数值型数据的统计分析

　　(1)对于dataframe转化数据类型，其他类型转化为object类型

# Convert `amounts` to object dtype.
# Assign with detail[col] = ... rather than detail.loc[:, col] = ...: the
# .loc form writes the new values into the existing column and, on
# pandas >= 2.0, keeps the old dtype, so the conversion would be lost.
detail['amounts'] = detail['amounts'].astype('object')

　　(2)类别型数据

# Convert `amounts` to a categorical column, then describe it.
# Plain column assignment replaces the column (and its dtype); assigning
# through detail.loc[:, col] would write in place and keep the old dtype
# on pandas >= 2.0.
detail['amounts'] = detail['amounts'].astype('category')

# For categorical data describe() reports count / unique / top / freq
# instead of the numeric summary.
print("统计类别型数据的describe指标:\n", detail.loc[:, 'amounts'].describe())

　　(3)统计实例

## Which dish is the most popular, and how many portions of it were sold?

# Case 1: plain rice counts as a dish.
# Use plain column assignment so the column really becomes categorical;
# detail.loc[:, col] = ... writes in place and can keep the old dtype.
detail['dishes_name'] = detail['dishes_name'].astype('category')
print("按照dishes_name统计描述信息：\n", detail.loc[:, 'dishes_name'].describe())

# Case 2: plain rice does not count -- remove those rows, then describe again.
# DataFrame.drop removes rows by index label, so build a boolean mask for the
# rice rows first and read their labels from the filtered frame.
bool_id = detail.loc[:, 'dishes_name'] == '白饭/大碗'
index = detail.loc[bool_id, :].index
detail.drop(labels=index, axis=0, inplace=True)

# The column is already categorical, so a second astype('category') would do
# nothing; instead discard the category that no longer has any rows.
detail['dishes_name'] = detail['dishes_name'].cat.remove_unused_categories()
print("按照dishes_name统计描述信息：\n", detail.loc[:, 'dishes_name'].describe())

# Which order contains the most line items, and how many?
# Treat order_id as categorical so describe() reports the most frequent id
# (top) and how often it occurs (freq).
detail['order_id'] = detail['order_id'].astype("category")
print("按照order_id统计描述信息为:\n", detail.loc[:, 'order_id'].describe())

　　2、pandas时间数据

　　datetime64[ns] ---numpy 里面的时间点类

　　Timestamp ---pandas 默认的时间点类型----封装了datetime64[ns]

　　DatetimeIndex ---pandas 默认支持的时间序列结构

　　一、可以通过 pd.to_datetime 将时间点数据转化为pandas默认支持的时间点数据

# Parse a single date string; a scalar input yields a pandas Timestamp.
res = pd.to_datetime("2016/01/01")

print("res:\n", res)
print("res 的类型：\n", type(res))

　　二、时间序列转化 --可以通过pd.to_datetime 或者pd.DatetimeIndex将时间序列转化为pandas默认支持的时间序列结构

# A list of date strings becomes a DatetimeIndex, whether it is parsed with
# pd.to_datetime or passed to the DatetimeIndex constructor directly.
res = pd.to_datetime(['2016-01-01', '2016-01-01', '2016-01-01', '2011-01-01'])
res1 = pd.DatetimeIndex(['2016-01-01', '2016-01-02', '2016-02-05', '2011-09-01'])

print("res:\n", res)
print("res 的类型：\n", type(res))
print("res1:\n", res1)
print("res1 的类型：\n", type(res1))

　　三、获取时间序列的属性

import pandas as pd

# Load the order-detail workbook.
detail = pd.read_excel("./meal_order_detail.xlsx")
print("detail 的列索引名称:\n", detail.columns)
print("detail 的形状:\n", detail.shape)
print("*" * 80)

# Show the raw place_order_time column before conversion.
print(detail.loc[:, 'place_order_time'])

# Convert the column to pandas datetimes. Plain column assignment is used so
# the column actually becomes datetime64; detail.loc[:, col] = ... writes in
# place and can leave the old dtype, which would break the .dt accessor below.
detail['place_order_time'] = pd.to_datetime(detail['place_order_time'])
print("*" * 80)

# The .dt accessor exposes each datetime component for the whole column;
# .tolist() keeps the printed results as plain Python lists.
order_time = detail['place_order_time'].dt

year = order_time.year.tolist()
print("年：\n", year)

month = order_time.month.tolist()
print("月：\n", month)

day = order_time.day.tolist()
print("日：\n", day)

quarter = order_time.quarter.tolist()
print("季度：\n", quarter)

# Day of week as an integer, Monday=0 ... Sunday=6.
# On a single Timestamp `weekday` is a method, so `ts.weekday` without the
# call yields bound-method objects instead of numbers.
weekday = order_time.weekday.tolist()
print("周几：\n", weekday)

# Day of week as a name. Timestamp.weekday_name was removed in pandas 1.0;
# day_name() is its replacement.
weekday_name = order_time.day_name().tolist()
print("周几：\n", weekday_name)

is_leap_year = order_time.is_leap_year.tolist()
print("是否闰年：\n", is_leap_year)

　　四、时间加减

import pandas as pd

# A Timestamp can be shifted by adding a Timedelta.
res = pd.to_datetime("2016-01-01")

print("res:\n", res)
print("res 的类型：\n", type(res))
print("时间推后一天：\n", res + pd.Timedelta(days=1))
print("时间推后一小时：\n", res + pd.Timedelta(hours=1))

# Shift a whole column by one day and store the result as a new column.
# NOTE(review): relies on place_order_time already holding datetimes
# (converted in an earlier snippet) -- confirm before running standalone.
detail['place_over_time'] = detail['place_order_time'] + pd.Timedelta(days=1)
print(detail)

## Difference between two points in time
# Subtracting one Timestamp from another yields a Timedelta.
res = pd.to_datetime('2019-10-9') - pd.to_datetime('1996-11-07')
print(res)

　　五、获取pandas的Timestamp所能表示的最早和最晚时间点

# Earliest and latest instants a pandas Timestamp can represent. These are
# class attributes of pd.Timestamp, not properties of the local machine.
print(pd.Timestamp.min)
print(pd.Timestamp.max)

　　3、分组聚合

import pandas as pd
import numpy as np

# Load the user table.
users = pd.read_excel("./users.xlsx")
print("users:\n", users)
print("users 的列索引：\n", users.columns)
print("users 的数据类型：\n", users.dtypes)

# Group-and-aggregate: average age per group.
# `by` names the grouping column(s); it may be one column or a list.
#   one key, one statistic column:
#     res = users.groupby(by='ORGANIZE_NAME')['age'].mean()
#   one key, several statistic columns:
#     res = users.groupby(by='ORGANIZE_NAME')[['age', 'USER_ID']].mean()
# Here: several keys, one statistic column.
res = users.groupby(by=['ORGANIZE_NAME', 'poo', 'sex'])['age'].mean()
print(res)

# agg() applies different statistics to different columns.
# Statistics are given by name ('mean', 'max', ...): passing NumPy callables
# such as np.mean is deprecated since pandas 2.1, and the string form keeps
# the same results.

# Mean of age and maximum of USER_ID in one call.
print(users.agg({'age': 'mean', 'USER_ID': 'max'}))

# Both sum and mean for each of age and USER_ID.
print(users[['age', 'USER_ID']].agg(['sum', 'mean']))

# A different number of statistics per column.
print(users.agg({'age': 'min', 'USER_ID': ['mean', 'sum']}))

def hh(x):
    """Return ``x + 1`` (demo function for use with apply/transform)."""
    return x + 1

# Apply a custom function to the data. Equivalent alternatives:
# res = users['age'].apply(hh)
# res = users[['age', 'USER_ID']].apply(lambda x: x + 1)
res = users['age'].transform(lambda x: x + 1)

# Original author's note: this style cannot express a computation that
# combines several columns -- the function only receives the 'age' values.
print(res)

　　4、透视表与交叉表

import pandas as pd

# Load the order-detail workbook.
detail = pd.read_excel("./meal_order_detail.xlsx")
print("detail :\n", detail)
print("detail 的列名：\n", detail.columns)
print("detail 的数据类型：\n", detail.dtypes)

# The day-of-month can only be read from a real datetime column. Plain column
# assignment is used so the dtype actually changes; detail.loc[:, col] = ...
# writes in place and can keep the old dtype.
detail['place_order_time'] = pd.to_datetime(detail['place_order_time'])
detail['day'] = detail['place_order_time'].dt.day

# A pivot table is a richer form of group-and-aggregate:
#   data    -- the DataFrame to summarise
#   values  -- the column being aggregated (the quantity of interest)
#   index   -- keys that define the row groups
#   columns -- keys that define the column groups
#   aggfunc -- the statistic computed for each cell
# Example (disabled):
# res = pd.pivot_table(data=detail[['amounts', 'order_id', 'counts', 'dishes_name', 'day']],
#                      values='amounts', columns=['day', 'counts'],
#                      index=['order_id', 'dishes_name'], aggfunc='mean', margins=True)
# print(res)
# res.to_excel("./hh.xlsx")

# A cross tabulation is a lighter pivot table. With only index and columns
# it counts how often each pair of values occurs together:
# res = pd.crosstab(index=detail['counts'], columns=detail['amounts'])

# `values` and `aggfunc` must be supplied together.
res = pd.crosstab(index=detail['order_id'], columns=detail['counts'],
                  values=detail['amounts'], aggfunc='mean')
print(res)

　　5、案例

　　一、营业额案例

import pandas as pd

# Load the order-detail workbook (it contains a timestamp column).
detail = pd.read_excel("./meal_order_detail.xlsx")
print("detail :\n", detail)
print("detail 的列名：\n", detail.columns)
print("detail 的数据类型：\n", detail.dtypes)

# Revenue of each line item = counts * amounts, stored as a new column.
detail['pay'] = detail['counts'] * detail['amounts']

# Extract the day-of-month. Plain column assignment is used so the column
# really becomes datetime64; detail.loc[:, col] = ... writes in place and can
# keep the old dtype.
detail['place_order_time'] = pd.to_datetime(detail['place_order_time'])
detail['day'] = detail['place_order_time'].dt.day

# Total revenue per day.
res = detail.groupby(by='day')['pay'].sum()
print(res)

# Re-wrap the per-day totals as a one-column DataFrame.
# NOTE(review): the column label 'monty' looks like a typo for 'money';
# kept unchanged because it is part of the output.
df = pd.DataFrame(res.values, columns=['monty'], index=res.index)
print(df)
print(type(df))

　　二、连锁超市案例

import pandas as pd

# Load the order table.
# 'ansi' is only available as a codec name on Windows (alias of 'mbcs', the
# system code page); elsewhere it raises LookupError. 'gbk' is used instead.
# NOTE(review): assumes the CSV was written with the Simplified-Chinese
# Windows code page (cp936/GBK) -- confirm against the actual file.
order = pd.read_csv("./order.csv", encoding='gbk')
print("order:\n", order)
print("order 的列索引：\n", order.columns)

# Q1: which product categories sell best?
# Keep only rows with a positive sales quantity; the rest are treated as
# abnormal records.
bool_id = order.loc[:, '销量'] > 0

# .copy() makes `data` independent of `order`, so later in-place edits and
# column assignments on `data` do not trigger SettingWithCopyWarning.
data = order.loc[bool_id, :].copy()
print(data.shape)
print("*" * 80)

　　# 删除异常

　　# bool_id = order.loc[:,'销量'] <= 0

　　# index = order.loc[bool_id,:].index

　　# data = order.drop(labels=index,axis=0,inplace=False)

　　# 按照类别进行分组，统计销量的和

　　# 进行dataframe或者series的值排序

　　# 如果是series，sort_values()直接按照series的值进行排序

　　# 如果是df，那么需要指定按照哪一列进行排序，by=列名

　　# 默认是升序ascending=True

　　# ascending=False 降序

　　# res = data.groupby(by='类别ID')['销量'].sum().sort_values(ascending=False)

　　# print(res)

　　# 二、哪些商品比较畅销?

　　# 分组聚合实现

　　# res = data.groupby(by='商品ID')['销量'].sum().sort_values(ascending=False).head(10)

　　# print(res)

　　# 透视表实现

　　# res = pd.pivot_table(data=data.loc[:, ['商品ID', '销量']], index='商品ID', values='销量', aggfunc='sum').sort_values(by='销量',

　　# ascending=False).head(

　　# 10)

　　# print(res)

　　# 三、求不同门店的销售额占比

　　# 提示：订单中没有销售额字段，所以需要新增一个销售额字段。增加字段后按照门店编号进行分组，然后计算占比。

　　# # 先计算销售额

　　# data.loc[:,'销售额'] = data.loc[:,'单价'] * data.loc[:,'销量']

　　# # 按照门店编号进行分组统计销售额的sum

　　# res = data.groupby(by='门店编号')['销售额'].sum()

　　# # print(res)

　　# # 计算全部的销售额总和

　　# all_ = res.sum()

　　# # print(all_)

　　# per_ = res / all_

　　# print("各个门店的销售额占比为：\n",per_.apply(lambda x:format(x,".2%")))

　　# a = 100.105

　　# print("%.2f"%a)

　　# print("{}%".format(2.0))

　　# 匿名函数

　　# print(lambda x:x+5) #

　　# def add(x):

　　# # return x+5

# Q4: during which hours does the supermarket see the most customers?
# Hint from the exercise: the order table stores date and time together, so
# the hour has to be extracted; the number of distinct order IDs per hour is
# used as the measure of customer traffic.

# Keep one row per order. `subset` names the column(s) that define a
# duplicate (pass a list for several columns). Rebinding to a fresh copy,
# instead of mutating a filtered slice in place, avoids
# SettingWithCopyWarning on the assignments below.
data = data.drop_duplicates(subset='订单ID').copy()

# Convert the transaction time to pandas datetimes. Plain column assignment
# is used so the dtype actually changes and the .dt accessor works.
data['成交时间'] = pd.to_datetime(data['成交时间'])

# Add the hour of each transaction as a new column.
data['hour'] = data['成交时间'].dt.hour

# Orders per hour, busiest hour first.
res = data.groupby(by='hour')['订单ID'].count().sort_values(ascending=False)
print(res)