pandas模块（数据分析）------dataframe

时间 2019-12-02

原文原文链接

DataFrame

DataFrame是一个表格型的数据结构，含有一组有序的列，是一个二维结构。

DataFrame可以被看作是由Series组成的字典，并且共用一个索引。

1、生成方式

import numpy as np
import pandas as pd

a=pd.DataFrame({'one':pd.Series([1,2,3],index=['a','b','c']), 'two':pd.Series([1,2,3,4],index=['b','a','c','d'])})
a

可以看出a由one和two两个Series组成，并且共用一组索引a,b,c,d

# 字典方式建立

b=pd.DataFrame({"today":[12,43,23,123],"tomory":[23,45,34,23]})
b

# 自定义索引

c=pd.DataFrame({"today":[12,43,23,123],"tomory":[23,45,34,23]},index=list("abcd"))
c

2、csv文件读取与写入

df = pd.read_csv("d:/601318.csv")
df

2470 rows × 8 columns

x=open("d:/601318.csv")
df=pd.read_csv(x)
df

2470 rows × 8 columns

 1 #  保存到文件
 2 df.to_csv("d:/new.csv")
 3 
 4 
 5 # index                 获取行索引
 6 df.index
 7 
 8 RangeIndex(start=0, stop=2470, step=1)
 9 
10 a.index
11 
12 Index(['a', 'b', 'c', 'd'], dtype='object')
13 
14 
15 # 返回列索引
16 df.columns
17 
18 Index(['id', 'date', 'open', 'close', 'high', 'low', 'volume', 'code'], dtype='object')
19 
20 
21 #  values  返回二维数组
22 df.values
23 
24 array([
25         [0, '2007/3/1', 22.074, ..., 20.22, 1977633.51, 601318],
26         [1, '2007/3/2', 20.75, ..., 20.256, 425048.32, 601318],
27         [2, '2007/3/5', 20.3, ..., 19.218, 419196.74, 601318],
28         ..., 
29         [2467, '2017/7/28', 52.2, ..., 51.8, 491294.0, 601318],
30         [2468, '2017/7/31', 51.88, ..., 51.41, 616005.0, 601318],
31         [2469, '2017/8/1', 52.2, ..., 52.2, 1147936.0, 601318]
32         ], 
33         dtype=object)
34 
35 
36 # 倒置  行和列交换
37 
38 a.T

#  describe 按列打印一些统计信息

df.describe()

#  df 的columns 和index都有name属性

# 上面的数据中index的name还没有值，可以设置一个
df.index.name='indexname'
df

2470 rows × 8 columns

#获取第一列的name
df.columns[0]
'id'


df.columns[1]
'date'


#  给列重命名，并没有修改原数据，下面是返回的新数据
df.rename(columns={"close":"newclose","low":"newlow"})

2470 rows × 8 columns

3、索引和切片

df[0]

    ---------------------------------------------------------------------------

    KeyError                                  Traceback (most recent call last)

    d:\program files (x86)\python35\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
       2441             try:
    -> 2442                 return self._engine.get_loc(key)
       2443             except KeyError:


    pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5280)()


    pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5126)()


    pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20523)()


    pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20477)()


    KeyError: 0


    During handling of the above exception, another exception occurred:


    KeyError                                  Traceback (most recent call last)

    <ipython-input-18-9ae93f22b889> in <module>()
    ----> 1 df[0]


    d:\program files (x86)\python35\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
       1962             return self._getitem_multilevel(key)
       1963         else:
    -> 1964             return self._getitem_column(key)
       1965 
       1966     def _getitem_column(self, key):


    d:\program files (x86)\python35\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
       1969         # get column
       1970         if self.columns.is_unique:
    -> 1971             return self._get_item_cache(key)
       1972 
       1973         # duplicate columns & possible reduce dimensionality


    d:\program files (x86)\python35\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
       1643         res = cache.get(item)
       1644         if res is None:
    -> 1645             values = self._data.get(item)
       1646             res = self._box_item_values(item, values)
       1647             cache[item] = res


    d:\program files (x86)\python35\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
       3588 
       3589             if not isnull(item):
    -> 3590                 loc = self.items.get_loc(item)
       3591             else:
       3592                 indexer = np.arange(len(self.items))[isnull(self.items)]


    d:\program files (x86)\python35\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
       2442                 return self._engine.get_loc(key)
       2443             except KeyError:
    -> 2444                 return self._engine.get_loc(self._maybe_cast_indexer(key))
       2445 
       2446         indexer = self.get_indexer([key], method=method, tolerance=tolerance)


    pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5280)()


    pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5126)()


    pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20523)()


    pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20477)()


    KeyError: 0

df["close"]

    indexname
    0       20.657
    1       20.489
    2       19.593
    3       19.977
    4       20.520
    5       20.273
    6       20.101
    7       19.739
    8       19.818
    9       19.841
    10      19.849
    11      19.960
    12      20.211
    13      19.911
    14      20.026
    15      19.938
    16      20.282
    17      20.269
    18      20.565
    19      20.927
    20      20.772
    21      21.364
    22      21.284
    23      21.099
    24      21.156
    25      21.196
    26      22.785
    27      23.319
    28      23.637
    29      23.593
             ...  
    2440    48.896
    2441    48.609
    2442    49.183
    2443    49.183
    2444    49.381
    2445    48.085
    2446    49.420
    2447    49.074
    2448    48.411
    2449    47.403
    2450    49.876
    2451    50.835
    2452    50.459
    2453    50.578
    2454    51.230
    2455    50.610
    2456    51.630
    2457    52.770
    2458    53.900
    2459    53.470
    2460    53.840
    2461    54.010
    2462    51.960
    2463    52.610
    2464    52.310
    2465    51.890
    2466    52.360
    2467    51.890
    2468    52.020
    2469    54.850
    Name: close, Length: 2470, dtype: float64

从上边可以看出，[]里边似乎要用来选择列才可以（后面知道，切片也可以）

# 花式索引

df[["close","low"]]

2470 rows × 2 columns

df["close"][0]

20.656999999999996

df["close"] 先得到一个Series，然后再用标签索引0去查找

df[["close","low"]][0]

 1     ---------------------------------------------------------------------------
 2 
 3     KeyError                                  Traceback (most recent call last)
 4 
 5     d:\program files (x86)\python35\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
 6        2441             try:
 7     -> 2442                 return self._engine.get_loc(key)
 8        2443             except KeyError:
 9 
10 
11     pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5280)()
12 
13 
14     pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5126)()
15 
16 
17     pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20523)()
18 
19 
20     pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20477)()
21 
22 
23     KeyError: 0
24 
25 
26     During handling of the above exception, another exception occurred:
27 
28 
29     KeyError                                  Traceback (most recent call last)
30 
31     <ipython-input-22-7ed9e36ec1ab> in <module>()
32     ----> 1 df[["close","low"]][0]
33 
34 
35     d:\program files (x86)\python35\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
36        1962             return self._getitem_multilevel(key)
37        1963         else:
38     -> 1964             return self._getitem_column(key)
39        1965 
40        1966     def _getitem_column(self, key):
41 
42 
43     d:\program files (x86)\python35\lib\site-packages\pandas\core\frame.py in _getitem_column(self, key)
44        1969         # get column
45        1970         if self.columns.is_unique:
46     -> 1971             return self._get_item_cache(key)
47        1972 
48        1973         # duplicate columns & possible reduce dimensionality
49 
50 
51     d:\program files (x86)\python35\lib\site-packages\pandas\core\generic.py in _get_item_cache(self, item)
52        1643         res = cache.get(item)
53        1644         if res is None:
54     -> 1645             values = self._data.get(item)
55        1646             res = self._box_item_values(item, values)
56        1647             cache[item] = res
57 
58 
59     d:\program files (x86)\python35\lib\site-packages\pandas\core\internals.py in get(self, item, fastpath)
60        3588 
61        3589             if not isnull(item):
62     -> 3590                 loc = self.items.get_loc(item)
63        3591             else:
64        3592                 indexer = np.arange(len(self.items))[isnull(self.items)]
65 
66 
67     d:\program files (x86)\python35\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
68        2442                 return self._engine.get_loc(key)
69        2443             except KeyError:
70     -> 2444                 return self._engine.get_loc(self._maybe_cast_indexer(key))
71        2445 
72        2446         indexer = self.get_indexer([key], method=method, tolerance=tolerance)
73 
74 
75     pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5280)()
76 
77 
78     pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5126)()
79 
80 
81     pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20523)()
82 
83 
84     pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20477)()
85 
86 
87     KeyError: 0

之所以报错，是因为df[["close","low"]]得到的是一个DataFrame类型，对它再使用[]时，[]里边只能是列名

# 切片，这个时候解释的就是行

df[0:10]

推荐使用loc和iloc索引

# 在loc里边，逗号左边表示行，右边表示列

# 在这里的3:10被解释为标签（不是行的下标），并且两端都包含
ddf=df.loc[3:10,["close","low"]]
ddf

#  那我如今想拿到ddf里，"low"列，第5行的数据

# ddf["low"]获得的是一个Series，其索引是整数的，因此必须使用iloc指明使用下标取值
ddf["low"].iloc[4]

19.646000000000001

布尔值索引

# 过滤某一列

df[df["close"]<20]

856 rows × 8 columns

# 过滤全部的位置

# DataFrame会保留所有小于20的位置，其余位置设置为nan（因为它不能确定该怎么舍弃数据，不可能因为一行中有一个值不满足条件就删除整行或者整列）

df[df<20]

2470 rows × 8 columns

#  将全部小于20的值改成0

# 请注意这里，条件为False的位置会先变成nan，再被fillna填充为0，因此我们要写大于20，这样小于20的位置才是False（等于20的位置同样是False）
df[df>20].fillna(0)

2470 rows × 8 columns

#  选择date 为2017/7/25 和2017/7/3 的值

# 这里的date是字符串类型，不是datetime类型

df[(df["date"]=="2017/7/25") | (df["date"]=="2017/7/3")]

#  这里还能够用isin方法去过滤一个范围

df[df["date"].isin(["2017/7/25","2017/7/3"])]

df[df["high"].isin([53.050,54.150])]

修改值的时候要注意类型的问题

# 好比要将全部小于20的位置变为0

# 作法一：
df[df>20].fillna(0)

# 作法二：等号赋值
df[df<20]=0

    ---------------------------------------------------------------------------

    TypeError                                 Traceback (most recent call last)

    <ipython-input-45-ea838d192259> in <module>()
          5 
          6 # 作大二：等号赋值
    ----> 7 df[df<20]=0


    d:\program files (x86)\python35\lib\site-packages\pandas\core\frame.py in __setitem__(self, key, value)
       2326             self._setitem_array(key, value)
       2327         elif isinstance(key, DataFrame):
    -> 2328             self._setitem_frame(key, value)
       2329         else:
       2330             # set column


    d:\program files (x86)\python35\lib\site-packages\pandas\core\frame.py in _setitem_frame(self, key, value)
       2362             raise TypeError('Must pass DataFrame with boolean values only')
       2363 
    -> 2364         self._check_inplace_setting(value)
       2365         self._check_setitem_copy()
       2366         self._where(-key, value, inplace=True)


    d:\program files (x86)\python35\lib\site-packages\pandas\core\generic.py in _check_inplace_setting(self, value)
       3197                     pass
       3198 
    -> 3199                 raise TypeError('Cannot do inplace boolean setting on '
       3200                                 'mixed-types with a non np.nan value')
       3201 


    TypeError: Cannot do inplace boolean setting on mixed-types with a non np.nan value

报错的原因是，date这一列是字符串类型，设置为0时类型转换失败

# 现在通过切片去掉date列，看能否转换成功

df2=df.loc[:10,"open":"code"]
df2

df2[df2<20]=0
df2

可以看出，如果列里边没有字符串类型，是可以转换成功的

4、数据对齐和数据缺失

df3=df + df2
df3

2470 rows × 8 columns

新的数据，列和行都要对齐，列date和id都是nan，是因为df2中没有这两列，这些其实跟Series的道理是一样的

处理缺失数据的相关方法：

dropna() 过滤掉值为NaN的行
fillna() 填充缺失数据
isnull() 返回布尔数组，缺失值对应为True
notnull() 返回布尔数组，缺失值对应为False

跟Series的方法是同样的

df3.dropna()

在这里，dropna默认的规则，只要行里有nan，就会清除掉整行，可是能够设置参数去改变

df3.dropna(how="any") ---->默认是any，只要有nan就删除；how='all'的话，就是行里全是nan才删除

那如果我想对列进行操作，就还需要另一个参数axis，要记住默认的规则是针对行的

df3.dropna(how="any",axis=0)--->axis默认等于0，表示是对行进行规则，axis=1的话，就表示对列进行规则

df3.dropna(how="any",axis=0)--->清除掉含有nan的行
df3.dropna(how="all",axis=0)--->清除掉全部都是nan的行
df3.dropna(how="any",axis=1)--->清除掉含有nan的列
df3.dropna(how="all",axis=1)--->清除掉全部都是nan的列

# 将位置是nan的地方替换为0

df3.fillna(0)

2470 rows × 8 columns

5、常用函数

mean 得出每一个列的平均值

df2.mean()

    open          11.258000
    close          9.276364
    high          15.107000
    low            5.513000
    volume    388403.913636
    code      601318.000000
    dtype: float64

#  单列的平均值（Series）

df2["close"].mean()

9.2763636363636355

sum 求出每列的和
字符串的话，就是字符串的拼接

df.sum()

    id                                                  3049215
    date      2007/3/12007/3/22007/3/52007/3/62007/3/72007/3...
    open                                                63999.2
    close                                               64054.2
    high                                                65113.7
    low                                                 63035.4
    volume                                          1.18105e+09
    code                                             1485255460
    dtype: object

sort 排序
sort_index 按照索引排序（行索引和列索引）
ascending默认为True ，表示按照升序排序；False表示降序
axis为0，代表按行索引排序；为1代表按列索引排序，例如：sort_index(ascending=False,axis=0)

# ascending默认为True ，表示按照升序排序；False表示降序

df.sort_index(ascending=False)

2470 rows × 8 columns

# ascending默认为True ，表示按照升序排序；False表示降序

df.sort_index(ascending=False)

2470 rows × 8 columns

sort_values 按照值排序

# 按照close列升序排序

df2.sort_values("close")

# 按照close列降序

df2.sort_values("close",ascending=False)

1 # 按照close列升序排序，若是有close值相同，再按照low列排序
2 
3 df2.sort_values(["close","low"])

# axis=1，按照行排序，在这里一定要注意，必须保证这一行的数据类型是一致的，比如df中有字符串类型，就会报错

# df2 行内的数据类型都是一致的，所以没有问题；第一个参数是行的索引标签，在df2中，按0和按1排序的结果就不一样
df2.sort_values(0,axis=1)

df2.sort_values(1,axis=1)

numpy的通用函数同样适用于pandas

# 请注意类型

df.abs()

 ---------------------------------------------------------------------------

    TypeError                                 Traceback (most recent call last)

    <ipython-input-98-db394c0c0cf4> in <module>()
          1 # 请主要类型
          2 
    ----> 3 df.abs()


    d:\program files (x86)\python35\lib\site-packages\pandas\core\generic.py in abs(self)
       5661         abs: type of caller
       5662         """
    -> 5663         return np.abs(self)
       5664 
       5665     def describe(self, percentiles=None, include=None, exclude=None):


    TypeError: bad operand type for abs(): 'str'

df2.abs()

6、自定义函数

applymap(函数名)，作用于DataFrame上，这个函数会针对df里的每一个位置去执行

apply（函数名），作用于DataFrame上，将操作应用于整列或者整行上（整行要设置axis=1）

map作用于Series上

import numpy as np

import  pandas as pd

df=pd.read_csv("d:/601318.csv")
df

2470 rows × 8 columns

df2=df.loc[:15,"close":"code"]
df2

#df2中每一个位置都是加10

df2.applymap(lambda x:x+10)

# map作用于Series

df4=df2["close"]
df4.map(lambda x:x+100)

    0     120.657
    1     120.489
    2     119.593
    3     119.977
    4     120.520
    5     120.273
    6     120.101
    7     119.739
    8     119.818
    9     119.841
    10    119.849
    11    119.960
    12    120.211
    13    119.911
    14    120.026
    15    119.938
    Name: close, dtype: float64

#apply 将操做应用到每一列上

df2.apply(lambda x:x.sum()+1)

    close         321.903
    high          328.752
    low           317.416
    volume    5166066.460
    code      9621089.000
    dtype: float64

#apply 将操做应用到每一行上

df2.apply(lambda x:x.sum()+1,axis=1)

pandas之dataframe（下）
自定义函数
applymap(函数名)，作用于DataFrame上，这个函数会针对df里的每一个位置去执行

apply（函数名），作用于DataFrame上，将操作应用于整列或者整行上（整行要设置axis=1）

map作用于Series上

import numpy as np

import  pandas as pd
df=pd.read_csv("d:/601318.csv")
df
id    date    open    close    high    low    volume    code
0    0    2007/3/1    22.074    20.657    22.503    20.220    1977633.51    601318
1    1    2007/3/2    20.750    20.489    20.944    20.256    425048.32    601318
2    2    2007/3/5    20.300    19.593    20.384    19.218    419196.74    601318
3    3    2007/3/6    19.426    19.977    20.308    19.315    297727.88    601318
4    4    2007/3/7    19.995    20.520    20.706    19.827    287463.78    601318
5    5    2007/3/8    20.353    20.273    20.454    20.167    130983.83    601318
6    6    2007/3/9    20.264    20.101    20.353    19.735    160887.79    601318
7    7    2007/3/12    19.999    19.739    19.999    19.646    145353.06    601318
8    8    2007/3/13    19.783    19.818    19.982    19.699    102319.68    601318
9    9    2007/3/14    19.558    19.841    19.911    19.333    173306.56    601318
10    10    2007/3/15    20.097    19.849    20.525    19.779    152521.90    601318
11    11    2007/3/16    19.863    19.960    20.286    19.602    227547.24    601318
12    12    2007/3/20    20.662    20.211    20.715    20.088    222026.87    601318
13    13    2007/3/21    20.220    19.911    20.308    19.823    136728.32    601318
14    14    2007/3/22    20.066    20.026    20.273    19.969    167509.84    601318
15    15    2007/3/23    20.017    19.938    20.101    19.739    139810.14    601318
16    16    2007/3/26    19.955    20.282    20.397    19.946    223266.79    601318
17    17    2007/3/27    20.216    20.269    20.467    20.145    139338.19    601318
18    18    2007/3/28    20.264    20.565    20.706    20.123    258263.69    601318
19    19    2007/3/29    20.666    20.927    21.540    20.520    461986.18    601318
20    20    2007/3/30    20.732    20.772    21.134    20.626    144617.20    601318
21    21    2007/4/2    20.772    21.364    21.501    20.772    231445.03    601318
22    22    2007/4/3    21.377    21.284    21.527    21.147    132712.04    601318
23    23    2007/4/4    21.289    21.099    21.412    20.993    122454.69    601318
24    24    2007/4/5    21.103    21.156    21.191    20.838    122865.38    601318
25    25    2007/4/6    21.050    21.196    21.611    20.971    195208.52    601318
26    26    2007/4/9    21.231    22.785    22.909    21.059    462770.21    601318
27    27    2007/4/10    22.516    23.319    23.699    22.516    407823.90    601318
28    28    2007/4/11    23.346    23.637    24.361    23.222    243446.50    601318
29    29    2007/4/12    23.832    23.593    25.606    23.377    159270.43    601318
...    ...    ...    ...    ...    ...    ...    ...    ...
2440    2440    2017/6/21    47.778    48.896    49.025    47.046    849757.00    601318
2441    2441    2017/6/22    48.669    48.609    49.925    48.520    1146464.00    601318
2442    2442    2017/6/23    48.708    49.183    49.361    48.263    873719.00    601318
2443    2443    2017/6/26    49.450    49.183    50.222    48.817    953192.00    601318
2444    2444    2017/6/27    49.163    49.381    49.411    48.402    780835.00    601318
2445    2445    2017/6/28    49.163    48.085    49.203    48.026    691322.00    601318
2446    2446    2017/6/29    48.273    49.420    49.510    47.858    753228.00    601318
2447    2447    2017/6/30    49.262    49.074    49.658    48.748    598630.00    601318
2448    2448    2017/7/3    49.262    48.411    49.262    48.026    563199.00    601318
2449    2449    2017/7/4    48.273    47.403    48.313    47.393    683920.00    601318
2450    2450    2017/7/5    47.482    49.876    50.152    47.482    1272537.00    601318
2451    2451    2017/7/6    49.876    50.835    51.438    49.529    1137814.00    601318
2452    2452    2017/7/7    50.598    50.459    51.063    49.984    533925.00    601318
2453    2453    2017/7/10    50.469    50.578    51.399    50.143    570776.00    601318
2454    2454    2017/7/11    50.810    51.230    52.010    50.610    699539.00    601318
2455    2455    2017/7/12    51.360    50.610    52.500    50.420    870117.00    601318
2456    2456    2017/7/13    50.980    51.630    51.860    50.830    665342.00    601318
2457    2457    2017/7/14    51.690    52.770    52.790    51.300    707791.00    601318
2458    2458    2017/7/17    53.010    53.900    55.090    52.420    1408791.00    601318
2459    2459    2017/7/18    53.600    53.470    54.260    52.510    879029.00    601318
2460    2460    2017/7/19    53.680    53.840    54.480    53.110    771180.00    601318
2461    2461    2017/7/20    53.550    54.010    54.150    52.820    659198.00    601318
2462    2462    2017/7/21    53.200    51.960    53.280    51.900    1294791.00    601318
2463    2463    2017/7/24    52.080    52.610    53.100    51.680    904595.00    601318
2464    2464    2017/7/25    52.620    52.310    53.050    52.180    506834.00    601318
2465    2465    2017/7/26    52.100    51.890    52.500    51.280    657610.00    601318
2466    2466    2017/7/27    51.850    52.360    52.740    51.090    667132.00    601318
2467    2467    2017/7/28    52.200    51.890    52.460    51.800    491294.00    601318
2468    2468    2017/7/31    51.880    52.020    52.640    51.410    616005.00    601318
2469    2469    2017/8/1    52.200    54.850    54.900    52.200    1147936.00    601318
2470 rows × 8 columns

df2=df.loc[:15,"close":"code"]
df2
close    high    low    volume    code
0    20.657    22.503    20.220    1977633.51    601318
1    20.489    20.944    20.256    425048.32    601318
2    19.593    20.384    19.218    419196.74    601318
3    19.977    20.308    19.315    297727.88    601318
4    20.520    20.706    19.827    287463.78    601318
5    20.273    20.454    20.167    130983.83    601318
6    20.101    20.353    19.735    160887.79    601318
7    19.739    19.999    19.646    145353.06    601318
8    19.818    19.982    19.699    102319.68    601318
9    19.841    19.911    19.333    173306.56    601318
10    19.849    20.525    19.779    152521.90    601318
11    19.960    20.286    19.602    227547.24    601318
12    20.211    20.715    20.088    222026.87    601318
13    19.911    20.308    19.823    136728.32    601318
14    20.026    20.273    19.969    167509.84    601318
15    19.938    20.101    19.739    139810.14    601318
#df2中每一个位置都是加10

df2.applymap(lambda x:x+10)
close    high    low    volume    code
0    30.657    32.503    30.220    1977643.51    601328
1    30.489    30.944    30.256    425058.32    601328
2    29.593    30.384    29.218    419206.74    601328
3    29.977    30.308    29.315    297737.88    601328
4    30.520    30.706    29.827    287473.78    601328
5    30.273    30.454    30.167    130993.83    601328
6    30.101    30.353    29.735    160897.79    601328
7    29.739    29.999    29.646    145363.06    601328
8    29.818    29.982    29.699    102329.68    601328
9    29.841    29.911    29.333    173316.56    601328
10    29.849    30.525    29.779    152531.90    601328
11    29.960    30.286    29.602    227557.24    601328
12    30.211    30.715    30.088    222036.87    601328
13    29.911    30.308    29.823    136738.32    601328
14    30.026    30.273    29.969    167519.84    601328
15    29.938    30.101    29.739    139820.14    601328
# map做用域Series

df4=df2["close"]
df4.map(lambda x:x+100)

    0     120.657
    1     120.489
    2     119.593
    3     119.977
    4     120.520
    5     120.273
    6     120.101
    7     119.739
    8     119.818
    9     119.841
    10    119.849
    11    119.960
    12    120.211
    13    119.911
    14    120.026
    15    119.938
    Name: close, dtype: float64
#apply 将操做应用到每一列上

df2.apply(lambda x:x.sum()+1)

    close         321.903
    high          328.752
    low           317.416
    volume    5166066.460
    code      9621089.000
    dtype: float64
#apply 将操做应用到每一行上

df2.apply(lambda x:x.sum()+1,axis=1)

    0     2579015.890
    1     1026429.009
    2     1020574.935
    3      899106.480
    4      888843.833
    5      732363.724
    6      762266.979
    7      746731.444
    8      703698.179
    9      774684.645
    10     753901.053
    11     828926.088
    12     823406.884
    13     738107.362
    14     768889.108
    15     741188.918
    dtype: float64

# 层次索引内容更新中....

# 从文件读取 - read_csv：默认分隔符是逗号 - read_table：默认分隔符是\t（tab键） 参数： - sep 指定分隔符 - header=None 指定文件无列名 - names 指定列名 - index_col 指定某列作为索引 - skiprows 指定跳过哪些行 - na_values 指定哪些字符串表示缺失值 - parse_dates 指定某些列是否被解析为日期，布尔值或列表 - nrows 指定读取几行 - chunksize 分块读取文件，指定块大小

# read_table 默认是以\t（tab）为分隔符

pd.read_table("d:/new.csv")

pd.read_table("d:/new.csv",sep=",")

sep 还能够是正则表达式，好比 sep="\s+",表示任意长度的空白字符

#  在读取数据的时候，会默认将第一行指定为列名，可以通过设置header=None，指定第一行不是列名

pd.read_table("d:/new.csv",sep=",",header=None)

当设置header=None时，会自动取一个列名0，1，2，3，4，5，6，7

# 若是想本身取一个列名，能够修改names

pd.read_table("d:/new.csv",sep=",",header=None,names=["id","date","open","close","high","low","volumw","code"])

#  还能够设置跳过哪些行

#完整的
pd.read_table("d:/new.csv",sep=",")

pd.read_table("d:/new.csv",sep=",",skiprows=[0])

从上边能够看出。它跳是从表格的第一行开始，索引为0（在这里第一行列名就是索引0的位置）

pd.read_table("d:/new.csv",sep=",",skiprows=[1])

#  在导入的时候，默认会生成行索引，如果我们想使用某一列作为行索引，可以使用index_col，也可以使用多列["id","close"]

df=pd.read_table("d:/new2.csv",sep=",",index_col=["id"])
df

df.loc[4:7,"close":"low"]

# 通常在实际场景中，我们经常用date作为行索引

df=pd.read_table("d:/new2.csv",sep=",",index_col="date")
df

type(df.index[0])

str

#  这里的date是一个字符串，咱们能够将这个date转化为一个时间类型：设置parse_dates

df=pd.read_table("d:/new2.csv",sep=",",index_col="date",parse_dates=["date"])
type(df.index[0])

pandas._libs.tslib.Timestamp

在文件里若是有nan这个字符（咱们以前讲的是内存里边nan），如何去识别？

# 设置na_values

# 凡是"nan","None","null","xxx"这样的字符串都解析为nan，否则整列都会被解析为字符串（记住，是整列，因为一列的数据类型必须一致）
df=pd.read_table("d:/new3.csv",sep=",")
df

df["id"][0]
'None'


type(df["id"].iloc[1])
str



df=pd.read_table("d:/new3.csv",sep=",",na_values=["nan","None","null","xxx"])
df

type(df["id"].iloc[1])

numpy.float64

# 写入到文件 to_csv 主要参数： - sep 指定分隔符 - na_rep 指定缺失值转换的字符串，默认为空字符串 - header=False 不输出第一行的列名 - index=False 不输出行索引这一列 - columns 输出指定列

# 默认是行名和列名都输出，缺失值转换为空字符串

df.to_csv("d:/ceshi.csv",header=False,index=False,na_rep="DD",columns=["close"])

还能够导出成其它的文件类型：json，xml，Html，数据库

# 时间序列

# to_datetime 能够将字符串转换为一种特定的时间类型

pd.to_datetime(df["date"])

    0    2007-03-01
    1    2007-03-02
    2    2007-03-05
    3    2007-03-06
    4    2007-03-07
    5    2007-03-08
    6    2007-03-12
    7    2007-03-13
    8    2007-03-14
    9    2007-03-15
    10   2007-03-16
    11   2007-03-20
    12   2007-03-21
    13   2007-03-22
    Name: date, dtype: datetime64[ns]

时间处理对象：date_range
参数： - start 开始时间 - end 结束时间 - periods 时间长度 - freq 时间频率，默认为"D"，可选H(our)，W(eek)，B(usiness)，M(onth)，S(econd)，A(year)，T(minute)

# date_range 产生一组时间

pd.date_range("2017-06-01","2017-07-01")

    DatetimeIndex(['2017-06-01', '2017-06-02', '2017-06-03', '2017-06-04',
                   '2017-06-05', '2017-06-06', '2017-06-07', '2017-06-08',
                   '2017-06-09', '2017-06-10', '2017-06-11', '2017-06-12',
                   '2017-06-13', '2017-06-14', '2017-06-15', '2017-06-16',
                   '2017-06-17', '2017-06-18', '2017-06-19', '2017-06-20',
                   '2017-06-21', '2017-06-22', '2017-06-23', '2017-06-24',
                   '2017-06-25', '2017-06-26', '2017-06-27', '2017-06-28',
                   '2017-06-29', '2017-06-30', '2017-07-01'],
                  dtype='datetime64[ns]', freq='D')

#  假如要每一周出一天（默认是每一天出一个）

# 这里是星期日为标准
pd.date_range("2017-06-01","2017-08-01",freq="W")

    DatetimeIndex(['2017-06-04', '2017-06-11', '2017-06-18', '2017-06-25',
                   '2017-07-02', '2017-07-09', '2017-07-16', '2017-07-23',
                   '2017-07-30'],
                  dtype='datetime64[ns]', freq='W-SUN')

#  假如要只出工作日

pd.date_range("2017-06-01","2017-08-01",freq="B")

    DatetimeIndex(['2017-06-01', '2017-06-02', '2017-06-05', '2017-06-06',
                   '2017-06-07', '2017-06-08', '2017-06-09', '2017-06-12',
                   '2017-06-13', '2017-06-14', '2017-06-15', '2017-06-16',
                   '2017-06-19', '2017-06-20', '2017-06-21', '2017-06-22',
                   '2017-06-23', '2017-06-26', '2017-06-27', '2017-06-28',
                   '2017-06-29', '2017-06-30', '2017-07-03', '2017-07-04',
                   '2017-07-05', '2017-07-06', '2017-07-07', '2017-07-10',
                   '2017-07-11', '2017-07-12', '2017-07-13', '2017-07-14',
                   '2017-07-17', '2017-07-18', '2017-07-19', '2017-07-20',
                   '2017-07-21', '2017-07-24', '2017-07-25', '2017-07-26',
                   '2017-07-27', '2017-07-28', '2017-07-31', '2017-08-01'],
                  dtype='datetime64[ns]', freq='B')

#  半个月
pd.date_range("2017-06-01","2017-08-01",freq="SM")


DatetimeIndex(['2017-06-15', '2017-06-30', '2017-07-15', '2017-07-31'], dtype='datetime64[ns]', freq='SM-15')


#  一个月
pd.date_range("2017-06-01","2017-08-01",freq="M")

    DatetimeIndex(['2017-06-30', '2017-07-31'], dtype='datetime64[ns]', freq='M')


#  分钟
pd.date_range("2017-06-01","2017-08-01",freq="T")

    DatetimeIndex(['2017-06-01 00:00:00', '2017-06-01 00:01:00',
                   '2017-06-01 00:02:00', '2017-06-01 00:03:00',
                   '2017-06-01 00:04:00', '2017-06-01 00:05:00',
                   '2017-06-01 00:06:00', '2017-06-01 00:07:00',
                   '2017-06-01 00:08:00', '2017-06-01 00:09:00',
                   ...
                   '2017-07-31 23:51:00', '2017-07-31 23:52:00',
                   '2017-07-31 23:53:00', '2017-07-31 23:54:00',
                   '2017-07-31 23:55:00', '2017-07-31 23:56:00',
                   '2017-07-31 23:57:00', '2017-07-31 23:58:00',
                   '2017-07-31 23:59:00', '2017-08-01 00:00:00'],
                  dtype='datetime64[ns]', length=87841, freq='T')

#  年
pd.date_range("2017-06-01","2019-08-01",freq="A")

    DatetimeIndex(['2017-12-31', '2018-12-31'], dtype='datetime64[ns]', freq='A-DEC')


#  星期一
pd.date_range("2017-06-01","2017-08-01",freq="W-MON")


    DatetimeIndex(['2017-06-05', '2017-06-12', '2017-06-19', '2017-06-26',
                   '2017-07-03', '2017-07-10', '2017-07-17', '2017-07-24',
                   '2017-07-31'],
                  dtype='datetime64[ns]', freq='W-MON')

periods 指定时间长度

#  从2017-06-01开始，产生20天

pd.date_range("2017-06-01",periods=20)

    DatetimeIndex(['2017-06-01', '2017-06-02', '2017-06-03', '2017-06-04',
                   '2017-06-05', '2017-06-06', '2017-06-07', '2017-06-08',
                   '2017-06-09', '2017-06-10', '2017-06-11', '2017-06-12',
                   '2017-06-13', '2017-06-14', '2017-06-15', '2017-06-16',
                   '2017-06-17', '2017-06-18', '2017-06-19', '2017-06-20'],
                  dtype='datetime64[ns]', freq='D')

#  从2017-06-01开始，产生20个周

pd.date_range("2017-06-01",periods=20,freq="W")

    DatetimeIndex(['2017-06-04', '2017-06-11', '2017-06-18', '2017-06-25',
                   '2017-07-02', '2017-07-09', '2017-07-16', '2017-07-23',
                   '2017-07-30', '2017-08-06', '2017-08-13', '2017-08-20',
                   '2017-08-27', '2017-09-03', '2017-09-10', '2017-09-17',
                   '2017-09-24', '2017-10-01', '2017-10-08', '2017-10-15'],
                  dtype='datetime64[ns]', freq='W-SUN')

df=pd.read_csv("d:/601318.csv",index_col="date",parse_dates=["date"])
df

2470 rows × 7 columns

type(df.index)


pandas.core.indexes.datetimes.DatetimeIndex

可以看到df.index的类型就是pd.date_range返回的类型：DatetimeIndex。DatetimeIndex这个类型在查找时非常方便

#  查找 2017年的数据

df["2017"]

141 rows × 7 columns

#  查找 2017年8月的数据

df["2017-8"]

#  查找 2017年6月到9月的数据

df["2017-06":"2017-09"]

这里是按照时间对象索引（类似于标签索引），顾前也顾尾（两端都包含）

df[:10]

7、测验

求出股票行情的前5日和前10日的平均值（这里是close列的平均值）

import numpy as np
import pandas as pd

df=pd.read_csv("d:/ceshi.csv",index_col="date",parse_dates=["date"])
df

2470 rows × 7 columns

方案1：手动计算

# 思路：拿出每一行前5行的"close"列的数据，再mean()求出平均值，赋值给列"ma5"
df2=df[:10].copy()
df2.loc["2007-03-07","ma5"]=df2["close"][:6].mean()
df2.loc["2007-03"]

# 建立两列，并初始化为nan

df["ma5"]=np.nan
df["ma10"]=np.nan

df

2470 rows × 9 columns

#  使用for循环一个一个的去赋值

for i in range(4,len(df)):
    df.loc[df.index[i],"ma5"]=df["close"][i-4:i+1].mean()

for i in range(9,len(df)):
    df.loc[df.index[i],"ma10"]=df["close"][i-9:i+1].mean()

df

2470 rows × 9 columns