pandas进阶02--高级应用篇

Pandas进阶02–高级应用

用到的工具为:jupyter
开发工具版本:python3

目录介绍:
一、pandas高级应用–数据合并
二、pandas高级应用–数据重塑和旋转
三、pandas高级应用–数据转化、清除重复数据
四、pandas高级应用–数据替换
五、pandas高级应用–数据拆分

下一篇地址:https://blog.csdn.net/sinat_30353259/article/details/80804935

一、pandas高级应用–数据合并

from pandas import Series,DataFrame
import pandas as pd
import numpy as np
from numpy import nan as NA


df1 = DataFrame({"key":["b","b","a","c","a","a","b"],
                "data1":range(7)})
df2 = DataFrame({"key":["a","b","d"],
                "data2":range(3)})

df1
打印结果:
  data1 key
0   0   b
1   1   b
2   2   a
3   3   c
4   4   a
5   5   a
6   6   b

df2
打印结果:
  data2 key
0   0   a
1   1   b
2   2   d

# 指定key这一列取交集
pd.merge(df1,df2,on='key')
打印结果:
  data1 key data2
0   0   b   1
1   1   b   1
2   6   b   1
3   2   a   0
4   4   a   0
5   5   a   0


#如果两个DataFrame中用于连接的列名不相同,如何取交集
df3 = DataFrame({"lkey":["b","b","a","c","a","a","b"],
                "data1":range(7)})
df4 = DataFrame({"rkey":["a","b","d"],
                "data2":range(3)})

pd.merge(df3,df4,left_on='lkey',right_on='rkey')
打印结果:
 data1 lkey data2 rkey
0   0   b   1   b
1   1   b   1   b
2   6   b   1   b
3   2   a   0   a
4   4   a   0   a
5   5   a   0   a


#merge方法的各类链接方法
#左链接
#右链接
#内链接
#外链接

#外链接 --> 并集
pd.merge(df1,df2,on='key',how='outer') 
打印结果:
  data1 key data2
0   0.0 b   1.0
1   1.0 b   1.0
2   6.0 b   1.0
3   2.0 a   0.0
4   4.0 a   0.0
5   5.0 a   0.0
6   3.0 c   NaN
7   NaN d   2.0

# 左链接 --> 以merge的左边数据集为标准,右边只取和左边有关联的,没关联的用NaN值填充
pd.merge(df1,df2,on='key',how='left')
打印结果:
  data1 key data2
0   0   b   1.0
1   1   b   1.0
2   2   a   0.0
3   3   c   NaN
4   4   a   0.0
5   5   a   0.0
6   6   b   1.0

# 右链接 --> 以merge的右边数据集为标准,左边只取和右边有关联的,没关联的用NaN值填充
pd.merge(df1,df2,on='key',how='right')
打印结果:
  data1 key data2
0   0.0  b  1
1   1.0  b  1
2   6.0  b  1
3   2.0  a  0
4   4.0  a  0
5   5.0  a  0
6   NaN  d  2


# Series数据的链接
s1 = Series([0,1],index=["a","b"])
s2 = Series([2,3,4],index=["c","d","e"])
s3 = Series([5,6],index=["f","g"])
result = pd.concat([s1,s2,s3])
result
打印结果:
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64


#将多个Series拼接成一个DataFrame,即一个Series就是DataFrame的一列数据
df_concat = pd.concat([s1,s2,s3],axis=1)
df_concat
    0   1   2
a   0.0 NaN NaN
b   1.0 NaN NaN
c   NaN 2.0 NaN
d   NaN 3.0 NaN
e   NaN 4.0 NaN
f   NaN NaN 5.0
g   NaN NaN 6.0


s4 = pd.concat([s1*5,s3]) #乘法是直接将Series的值进行乘法操作
s4
打印结果:
a    0
b    5
f    5
g    6
dtype: int64

pd.concat([s1,s4],axis=1)
打印结果:
    0   1
a   0.0 0
b   1.0 5
f   NaN 5
g   NaN 6


# inner取交集
pd.concat([s1,s4],axis=1,join='inner')
打印结果:
    0   1
a   0   0
b   1   5


# 利用concat生成层次化索引数据结构
result = pd.concat([s1,s2,s3],keys=["one","two","three"])
result
打印结果:
one    a    0
       b    1
two    c    2
       d    3
       e    4
three  f    5
       g    6
dtype: int64


#获取指定的数据
result["one"]['a']
打印结果:
0


#合并重叠数据
a = Series([NA,2.5,NA,3.5,4.5,NA],index=list("fedcba"))
b = Series(np.arange(len(a)),dtype=np.float64,index=list("fedcba"))
pd.concat([a,b])
打印结果:
f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    5.0
dtype: float64


#用其中一个Series中的数据给另外一个Series中的数据作为补丁
resultB = b[:-2]
resultB 
打印结果:
f    0.0
e    1.0
d    2.0
c    3.0
dtype: float64


resultA = a[2:]
resultA
打印结果:
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64


resultB.combine_first(resultA)
打印结果:
a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64


# DataFrame利用combine_first进行数据补丁操作
df1 = DataFrame({"a":[1,NA,5,NA],
                "b":[NA,2,NA,6],
                "c":range(2,18,4)})
df2 = DataFrame({"a":[5,4,NA,3,7],
                "b":[NA,3,4,6,8]})
df1
打印结果:
    a   b   c
0   1.0 NaN 2
1   NaN 2.0 6
2   5.0 NaN 10
3   NaN 6.0 14

df2
打印结果:
    a   b
0   5.0 NaN
1   4.0 3.0
2   NaN 4.0
3   3.0 6.0
4   7.0 8.0


# 用df2的数据为df1中的数据打补丁
df1.combine_first(df2)
打印结果:
    a   b   c
0   1.0 NaN 2.0
1   4.0 2.0 6.0
2   5.0 4.0 10.0
3   3.0 6.0 14.0
4   7.0 8.0 NaN

二、pandas高级应用–数据重塑和旋转

from pandas import Series,DataFrame
import pandas as pd
import numpy as np
from numpy import nan as NA

#建立层次化索引
data = Series(np.random.randn(10),index= [list("aaabbbccdd"),[1,2,3,1,2,3,1,2,2,3]])
data
打印结果:
a  1    0.535624
   2   -0.886595
   3   -0.434961
b  1    0.709035
   2    0.837770
   3    0.065979
c  1   -0.542920
   2    1.250756
d  2    0.466432
   3   -1.113291
dtype: float64


# 将行索引(index)转换到列索引上(columns)
result = data.unstack()
result
打印结果:
        1           2           3
a   0.535624    -0.886595   -0.434961
b   0.709035    0.837770    0.065979
c   -0.542920   1.250756    NaN
d   NaN 0.466432    -1.113291


# 将列索引(columns)转换到行索引(index)
result.stack()
打印结果:
a  1    0.535624
   2   -0.886595
   3   -0.434961
b  1    0.709035
   2    0.837770
   3    0.065979
c  1   -0.542920
   2    1.250756
d  2    0.466432
   3   -1.113291
dtype: float64


# DataFrame 中的行索引和列索引的重塑和转换
data = DataFrame(np.arange(6).reshape(2,3),
                index=pd.Index(["上海","北京"],name="省份"),
                columns=pd.Index([2011,2012,2013],name="年份"))
data
打印结果:
年份  2011    2012    2013
省份          
上海  0   1   2
北京  3   4   5


#将DataFrame的列索引转化到行索引
result = data.stack()
result
打印结果:
省份  年份  
上海  2011    0
    2012    1
    2013    2
北京  2011    3
    2012    4
    2013    5
dtype: int32


#将DataFrame的行索引转化为列索引 
#unstack()默认转换的最内层的层次化索引
result.unstack()
打印结果:
年份  2011    2012    2013
省份          
上海  0   1   2
北京  3   4   5


#第一种方法,转换的时候,指定层次化索引的名称
result.unstack("省份")
打印结果:
省份  上海  北京
年份      
2011    0   3
2012    1   4
2013    2   5


#第二种方法,转换的时候,指定层次化索引的级别编号:0是最外层(省份),1是内层(年份)
result.unstack(1)
打印结果:
年份  2011    2012    2013
省份          
上海  0   1   2
北京  3   4   5


#在对DataFrame进行unstack操作时,作为旋转轴的级别将会成为结果中的最低级别
data = DataFrame(np.arange(6).reshape(2,3),
                 index=pd.Index(["Ohio","Colorado"],name="state"),
                 columns=pd.Index(["one","two","three"],name="nu mbers"))
data
打印结果:
numbers one two three
state           
Ohio        0   1   2
Colorado    3   4   5

result = data.stack()
result
打印结果:
state     nu mbers
Ohio      one         0
          two         1
          three       2
Colorado  one         3
          two         4
          three       5
dtype: int32


df = DataFrame({"left":result,
               "right":result+5},
              columns=pd.Index(["left","right"],name="side"))
df
打印结果:
side                left  right
state    nu mbers
Ohio     one           0      5
         two           1      6
         three         2      7
Colorado one           3      8
         two           4      9
         three         5     10


result = df.unstack("state")
result
打印结果:
side    left                right
state   Ohio    Colorado    Ohio    Colorado
nu mbers                
one     0           3       5           8
two     1           4       6           9
three   2           5       7           10


s1=Series([0,1,2,3],index=list("abcd"))
s2 = Series([4,5,6],index=list("cde"))
#将s1和s2拼接成一个具备层次化索引的Series
result = pd.concat([s1,s2],keys=["one","two"])
result
打印结果:
one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64


#将结果中的行索引变成列索引
tempResult = result.unstack(1)
tempResult
打印结果:
    a   b   c   d   e
one 0.0 1.0 2.0 3.0 NaN
two NaN NaN 4.0 5.0 6.0


#全部还原,空值用NaN填充
tempResult.stack(dropna=False)
打印结果:
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64

三、pandas高级应用–数据转化、清除重复数据

from pandas import Series,DataFrame
import pandas as pd
import numpy as np
from numpy import nan as NA


data = DataFrame({"k1":["one"]*3+["two"]*4,
                "k2":[1,1,2,3,3,4,4]})
data
打印结果:
    k1  k2
0   one 1
1   one 1
2   one 2
3   two 3
4   two 3
5   two 4
6   two 4   

#第一种方法,去重
#检测DataFrame中的每行数据是否为重复数据行
mask = data.duplicated()
mask
打印结果:
0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

#通过布尔索引(对mask取反)去除重复的数据
data[~mask]
打印结果:
    k1  k2
0   one 1
2   one 2
3   two 3
5   two 4

#第二种方法:去重
#通过DataFrame内置的drop_duplicates()方法去除重复的数据行.
#去除
data.drop_duplicates()
打印结果:
    k1  k2
0   one 1
2   one 2
3   two 3
5   two 4


data["v1"] = range(7)
data
打印结果:
    k1  k2  v1
0   one 1   0
1   one 1   1
2   one 2   2
3   two 3   3
4   two 3   4
5   two 4   5
6   two 4   6


# 只以k1这一列为标准去重
data.drop_duplicates(["k1"])
打印结果:
    k1  k2  v1
0   one 1   0
3   two 3   3


#通过指定keep参数,指定需要保留的特定重复数据
#keep="first" 保留重复数据第一次出现的那一行
#keep="last" 保留重复数据最后一次出现的那一行
#keep=False 只要有重复数据,就全部丢掉
data.drop_duplicates(["k1"],keep="last")
打印结果:
    k1  k2  v1
2   one 2   2
6   two 4   6


data=DataFrame({'food':['bacon','pulled pork','bacon','Pastrami',
                       'corned beef','Bacon','pastrami','honey ham','nova lox'],
               'ounces':[4,3,12,6,7.5,8,3,5,6]})
data
打印结果:
    food    ounces
0   bacon   4.0
1   pulled pork 3.0
2   bacon   12.0
3   Pastrami    6.0
4   corned beef 7.5
5   Bacon   8.0
6   pastrami    3.0
7   honey ham   5.0
8   nova lox    6.0


#定义一个字典,反映每一种食物所属的动物
meat_to_animal={
    'bacon':'pig',
    'pulled pork':'pig',
    'pastrami':'cow',
    'corned beef':'cow',
    'honey ham':'pig',
    'nova lox':'salmon'
}
meat_to_animal
打印结果:
{'bacon': 'pig',
 'corned beef': 'cow',
 'honey ham': 'pig',
 'nova lox': 'salmon',
 'pastrami': 'cow',
 'pulled pork': 'pig'}


data["animal"]=data["food"].map(str.lower).map(meat_to_animal)
data
打印结果:
    food    ounces  animal
0   bacon   4.0 pig
1   pulled pork 3.0 pig
2   bacon   12.0    pig
3   Pastrami    6.0 cow
4   corned beef 7.5 cow
5   Bacon   8.0 pig
6   pastrami    3.0 cow
7   honey ham   5.0 pig
8   nova lox    6.0 salmon


#使用lambda匿名函数
data["animal"] = data['food'].map(lambda x: meat_to_animal[x.lower()])
data
打印结果:
food    ounces  animal
0   bacon   4.0 pig
1   pulled pork 3.0 pig
2   bacon   12.0    pig
3   Pastrami    6.0 cow
4   corned beef 7.5 cow
5   Bacon   8.0 pig
6   pastrami    3.0 cow
7   honey ham   5.0 pig
8   nova lox    6.0 salmon

四、pandas高级应用–数据替换

from pandas import Series,DataFrame
import pandas as pd
import numpy as np
from numpy import nan as NA


series = Series([1,-999,2,-999,-1000,3])
series
打印结果:
0       1
1    -999
2       2
3    -999
4   -1000
5       3
dtype: int64


#单个数据替换
series.replace(-999,NA)
打印结果:
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64


#多个数据替换
series.replace([-999,-1000],NA)
打印结果:
0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64


#replace方法传入字典,针对不同的值,进行不同的替换
#第一种方法
series.replace({-999:NA,-1000:0})
打印结果:
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64


#第二种方法
series.replace([-999,-1000],[NA,0])
打印结果:
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

五、pandas高级应用–数据拆分

from pandas import Series,DataFrame
import pandas as pd
import numpy as np
from numpy import nan as NA
from matplotlib import pyplot as plt


age = [20,22,25,27,21,23,37,31,61,45,41,32]
#将全部年龄进行分组
bins = [18,25,35,60,100]
#使用pandas中的cut对年龄数据进行分组
cats = pd.cut(age,bins)
cats
打印结果:
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]


#调用pd.value_counts方法统计每一个区间段的人数
pd.value_counts(cats)
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64


#每个年龄所属区间的编号(即该区间在Categories中的位置)
cats.codes
打印结果:
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)


#为分类出每一组年龄加上标签
group_names = ["Youth","YouthAdult","MiddleAged","senior"]
#用group_names中的值,把区间替换
personType = pd.cut(age,bins,labels=group_names)
personType
打印结果:
[Youth, Youth, Youth, YouthAdult, Youth, ..., YouthAdult, senior, MiddleAged, MiddleAged, YouthAdult]
Length: 12
Categories (4, object): [Youth < YouthAdult < MiddleAged < senior]


# 用一个直方图进行简单展现
plt.hist(personType)
(array([3., 0., 0., 5., 0., 0., 3., 0., 0., 1.]),
 array([0. , 0.3, 0.6, 0.9, 1.2, 1.5, 1.8, 2.1, 2.4, 2.7, 3. ]),
 <a list of 10 Patch objects>)

这里写图片描述