-
Notifications
You must be signed in to change notification settings - Fork 0
/
0426pandas多重索引.py
157 lines (140 loc) · 5.92 KB
/
0426pandas多重索引.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# Creating a MultiIndex (hierarchical index) object
import pandas as pd
import numpy as np
# 对于多个索引,若来自数组,则使用MultiIndex.from_arrays
# 对于多个索引,若来自元组,则使用MultiIndex.from_tuples
# a crossed set of iterables (using MultiIndex.from_product)
index = pd.MultiIndex.from_product([[0,1,2],['2012-01-01','2012-01-02','2012-01-03']],names=['first', 'second'])
df = pd.DataFrame(np.random.randn(9, 4), index=index,columns=list('abcd'))
print(df)
df_1 = df.loc[(slice(None),'2012-01-01'),:] # panel数据切片
print(df_1)
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
# zip函数接受任意多个(包括0个和1个)序列作为参数,返回一个tuple列表。
# 在函数调用中使用*list/tuple的方式表示将list/tuple分开,作为位置参数传递给对应函数
tuples=list(zip(*arrays)) # 将数组转换成元组
index=pd.MultiIndex.from_tuples(tuples,names=['first','second'])
# 也可以使用
index=pd.MultiIndex.from_arrays(arrays,names=['first','second'])
s=pd.Series(np.random.randn(8),index=index)
# 还可以使用如下方法
iterables=[['bar','baz','foo','qux'],['one','two']]
# 相当于R语言中的expand.grid()
pd.MultiIndex.from_product(iterables,names=['first','second'])
# 最方便的是直接给定数组arrays
arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']),
np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'])]
s=pd.Series(np.random.randn(8),index=arrays)
df=pd.DataFrame(np.random.randn(8,4),index=arrays)
df=pd.DataFrame(np.random.randn(3,8),index=['A','B','C'],columns=index)
pd.DataFrame(np.random.randn(6,6),index=index[:6],columns=index[:6])
pd.Series(np.random.randn(8), index=tuples) # 使用元组作为index
# 设置数据的显示方式
pd.set_option('display.multi_sparse',False) # 每一个数据索引都显示
pd.set_option('display.multi_sparse',True) # 只显示第一个索引
# Reconstructing the level labels
index.get_level_values(0) # 返回每一个index的索引名称
index.get_level_values('second')
index.get_level_values(1)
# Basic indexing on axis with MultiIndex
df['bar']
df['bar','one']
df['bar']['one']
df.columns
df[['foo','qux']].columns
df[['foo','qux']].columns.values # 返回真正的索引名称
df[['foo','qux']].columns.get_level_values(0)
pd.MultiIndex.from_tuples(df[['foo','qux']].columns.values)
# Data alignment and using reindex
# 数据的运算
s+s[:-2] # 与nan运算,返回nan值
s+s[::2]
s.reindex(index[:3]) # 返回前3行数据
# 按照索引元组进行返回数据
s.reindex([('foo', 'two'), ('bar', 'one'), ('qux', 'one'), ('baz', 'one')])
# Advanced indexing with hierarchical index
df.T # 进行转置
df.loc['bar']
df.loc['bar','two'] # 等同于 df['bar','one']
df.loc[('baz','two'):('qux','one')]
df.loc[('baz','two'):'foo']
df.ix[[('bar', 'two'), ('qux', 'one')]]
def mklbl(prefix,n):
return(['%s%s' % (prefix,i) for i in range(n)])
miindex=pd.MultiIndex.from_product([mklbl('A',4),mklbl('B',2),mklbl('C',4),mklbl('D',2)])
micolumns = pd.MultiIndex.from_tuples([('a','foo'),('a','bar'),('b','foo'),('b','bah')],
names=['lvl0', 'lvl1'])
dfmi = pd.DataFrame(np.arange(len(miindex)*len(micolumns)).reshape((len(miindex),len(micolumns))),index=miindex,
columns=micolumns).sort_index().sort_index(axis=1)
dfmi.loc[(slice('A1','A3'),slice(None),['C1','C3']),:]
idx=pd.IndexSlice
dfmi.loc[idx[:,:,['C1','C3']],idx[:,'foo']]
mask=dfmi[('a','foo')]>200
dfmi.loc[idx[mask,:,['C1','C3']],idx[:,'foo']]
dfmi.loc(axis=0)[:,:,['C1','C3']]
df2.loc(axis=0)[:,:,['C1','C3']] = -10
df2.loc[idx[:,:,['C1','C3']],:]=df2*1000
# Cross-section
df.xs('one',level='second') # 选取second的one一列数据
df.xs('bar',level='first') # df.xs('bar')
df.loc[(slice(None),'one'),:]
df.loc[('bar','one'),:]
df.loc[(slice('bar'),'one'),:]
df=df.T
df.xs('one', level='second', axis=1)
df.loc[:,(slice(None),'one')]
df.xs(('one', 'bar'), level=('second', 'first'), axis=1)
df.loc[:,('bar','one')]
df.xs('one', level='second', axis=1, drop_level=False) # 保留索引的名称
df.xs('one',level='second',axis=1,drop_level=True) #删除索引的名称
# Advanced reindexing and alignment
midx=pd.MultiIndex(levels=[['zero','one'],['x','y']],
labels=[[1,1,0,0],[1,0,1,0]])
df=pd.DataFrame(np.random.randn(4,2),index=midx)
df2 = df.mean(level=0)
df2.reindex(df.index,level=0)
# 将df赋值给df_aligned 将df2赋值给df2_aligned
df_aligned, df2_aligned = df.align(df2, level=0)
# Swapping levels with swaplevel()
# 转换索引名称
df[:5].swaplevel(0,1,axis=0) # 索引名称互换
# Reordering levels with reorder_levels()
df[:5].reorder_levels([1,0], axis=0)
# Sorting a MultiIndex 对于索引进行排序
import random
random.shuffle(tuples)
s=pd.Series(np.random.randn(8),index=pd.MultiIndex.from_tuples(tuples))
s.sort_index() # 默认level=0
s.sort_index(level=1)
# 通过索引name进行排序
s.index.set_names(['L1', 'L2'], inplace=True)
s.sort_index(level='L1')
df.T.sort_index(level=1, axis=1)
dfm = pd.DataFrame({'jim': [0, 0, 1, 1],
'joe':['x','x','z','y'],
'jolie':np.random.rand(4)})
dfm=dfm.set_index(['jim','joe']) # 将jim 和joe两列设置成行索引名称
dfm.loc[(1,'z')]
dfm.index.is_lexsorted() # 判断是否排序
dfm.index.is_lexsort_depth # 返回排序的深度
dfm.loc[(0,'y'):(1, 'z')]
# Take Methods
index=pd.Index(np.random.randint(0,1000,10))
positions=[0,9,3]
index[positions]
index.take(positions)
ser=pd.Series(np.random.randn(10))
ser.iloc[positions]
ser.take(positions)
frm = pd.DataFrame(np.random.randn(5, 3))
frm.take([1, 4, 3])
frm.take([0,2],axis=1)
arr=np.random.randn(10)
arr.take([False,True,False,True])
# Index Types
df.dtypes # 查看数据类型
df.B.cat.categories
indexf=pd.Index([1.5,2,3,4.5,5])
sf=pd.Series(range(5),index=indexf)
sf.loc[3.0] # 浮点型数据作为索引