python科学计算-pandas

时间 2019-11-21
原文链接
Pandas是基于NumPy开发的、专门用于数据分析的开源Python库。

import numpy as np
import pandas as pd

# Sample data reused by the examples that follow.
l = [0, 1, 2, 3, 4]
np01 = np.array(l)      # ndarray built from the list above
np02 = np.arange(10)    # ndarray holding 0..9
d01 = dict(Michael=95, Bob=75, Tracy=85)
# Four equally long lists keyed by the names 'a'..'d'.
d02 = dict(
    a=[1, 2, 3, 4],
    b=[5, 6, 7, 8],
    c=[9, 10, 11, 12],
    d=[13, 14, 15, 16],
)

# 1. One-dimensional data structure: Series
## 1.1 Built from a list
s_l = pd.Series(data=l)  # no explicit index is passed here
s_l_index = pd.Series(
    data=l,
    index=['bj', 'sh', 'gz', 'sz', 'qd'],  # explicit labels for the entries
)

## 1.2 Built from NumPy ndarrays
s_np01, s_np02 = pd.Series(data=np01), pd.Series(data=np02)

## 1.3 Built from a dict
s_d01 = pd.Series(data=d01)

# 2. Two-dimensional data structure: DataFrame
## 2.1 Built from a dict
df_d02 = pd.DataFrame(
    data=d02,
    index=['one', 'two', 'three', 'four'],
)

## 2.2 Read from a file
# Assumes a price.csv file is present; the path is relative ('./').
df_csv = pd.read_csv('./price.csv')

## 2.3 Built from a two-dimensional NumPy array
# Alternative row labels: dates = pd.date_range('20130101', periods=6)
dates = list('abcdef')
df = pd.DataFrame(
    data=np.random.standard_normal(size=(6, 4)),  # 6 rows x 4 columns of random draws
    index=dates,
    columns=list('ABCD'),
)

# 3. DataFrame attributes
shape, ndim = df.shape, df.ndim          # dimensions tuple and number of axes
index, columns = df.index, df.columns    # row labels and column labels
values = df.values                       # the frame's data without its labels

# 4. Indexing a DataFrame
# 4.1 Direct selection with df[]
a1 = df[1:3] # half-open, 0-based slice: positions 1 and 2, i.e. the 2nd and 3rd rows
a2 = df[:3]  # the first 3 rows

a3 = df['A'] # column A
a4 = df[['A','C']]   # columns A and C

a5 = df[df['A']>0]   # rows whose value in column A is greater than 0

# 4.2 Label-based selection with df.loc[]
a6 = df.loc['a', :]  # row labelled 'a', all columns
a7 = df.loc['a','A']  # the single value at row 'a', column 'A'
a8 = df.loc['a':'d', :]  # rows 'a' through 'd' (label slices include the end label), all columns
a9 = df.loc[['a','d'], ['B','C']]  # rows 'a' and 'd', columns 'B' and 'C'

# 4.3 Position-based selection with df.iloc[]
a10 = df.iloc[1:4,1:4]  # 0-based positions 1, 2, 3 on both axes, i.e. the 2nd-4th rows and 2nd-4th columns
a11 = df.iloc[[1,4],1:3]    # row positions 1 and 4 (2nd and 5th rows), column positions 1 and 2 (2nd and 3rd columns)

# 5. Panel
# Panel data fetched on the JoinQuant platform: row labels are timestamps,
# column labels are the price fields, and one more axis holds the stock codes.
# NOTE(review): get_price is neither defined nor imported in this file;
# presumably the JoinQuant environment provides it -- confirm before running
# this elsewhere. Also confirm the returned type on current pandas versions,
# since the Panel class is no longer part of recent pandas releases.
panel = get_price(
    ['000001.XSHE', '000002.XSHE'],
    start_date='2016-07-12',
    end_date='2016-07-15',
    frequency='daily',
    fields=['open', 'high', 'low', 'close'],
)