Pandas Multi-index notes
In [1]:
import pandas as pd
import numpy as np
In [2]:
# Build the two demo frames used throughout the notebook.
# Seed NumPy's global RNG first so that "Restart Kernel -> Run All" always
# generates the same rows, and therefore the same outputs in every later cell.
np.random.seed(42)
n = 200  # number of rows in each frame
df = pd.DataFrame({
    'a': np.random.choice([0, 1, 2, 3, 4], n),
    'b': np.random.choice(['x', 'y'], n, p=[.3, .7]),   # 'y' is drawn with probability 0.7
    'c': np.random.choice(['m', 'n'], n),
    'd': np.round(np.random.uniform(0, 1, size=n), 2),  # uniform draws rounded to 2 decimals
})
# Two-column frame used for the single-index examples.
d = pd.DataFrame({
    'a': np.random.choice([0, 1, 2, 3, 4], n),
    'b': np.random.choice(['x', 'y'], n, p=[.3, .7]),
})
Index ( Series & DataFrame )¶
Index (Basic)¶
- Single Index
In [3]:
# Preview the first rows; d was built without an explicit index, so the
# left-hand labels are the default ones pandas assigned.
d.head()
Out[3]:
In [4]:
# Look at the index object itself rather than the frame it labels
d.index
Out[4]:
In [5]:
# Label the rows with the values of column 'a' (the column itself stays in
# the frame), then order the rows by that new index.
d = d.set_index(d['a']).sort_index()
d.head()
Out[5]:
MultiIndex¶
- Multi-indexes are ordered from the outside-in. The outermost index is 'level 0'
In [6]:
# Count the values of 'c' inside every (b, a) group; the result is a Series
# whose row index has one level per grouping key plus one for 'c'.
myseries = (
    df
    .groupby(['b', 'a'])['c']
    .value_counts()
)
myseries
Out[6]:
MultiIndex Objects¶
- Levels can be referred to by name ( listed in names ) or by index ( 0 is the outermost level )
In [7]:
# Display the MultiIndex object that labels the rows of myseries
myseries.index
Out[7]:
In [8]:
# The "level 0" (outermost) index, described three ways
print(myseries.index.levels[0])  # the distinct values that level can take
# Per-row integer positions into levels[0]. This attribute is `.codes`;
# the old spelling `.labels` was deprecated in pandas 0.24 and removed in 1.0.
print(myseries.index.codes[0])
print(myseries.index.names[0])   # the name of the level
In [9]:
# Levels can be re-named by assigning a list with one name per level
myseries.index.names = ["cat","hat","bat"]
print(myseries.index)
# Put the original names back: later cells refer to the levels as 'b', 'a' and 'c'
myseries.index.names = ['b','a','c']
Select a subset of the Series based on the outermost index¶
In [10]:
# A single key is matched against the outermost level ('b')
myseries.loc['x']
Out[10]:
Select a subset of the Series based on more than one index level¶
The outermost index level goes first. You can keep drilling down, one level at a time.
In [11]:
# Keys are given outermost-first: 'x' selects on 'b', 1 selects on 'a'
myseries.loc['x', 1]
Out[11]:
In [12]:
# One key per level ('b', 'a', 'c') addresses a single (b, a, c) combination
myseries.loc['x', 1,'m']
Out[12]:
Reshaping¶
unstack : Move an index level up into the columns ( Wide format )¶
In [13]:
# Move the 'b' level out of the row index and into the columns
myseries.unstack(level='b')
Out[13]:
stack : Move a level from the columns into the row index ( Long format )¶
In [14]:
# Round trip: unstack pushes 'a' up into the columns, then stack brings it
# back down into the row index (as the innermost level).
myseries.unstack(level='a').stack(level='a')
Out[14]:
Reorder Levels¶
In [15]:
# Rearrange the levels so 'a' is outermost; this only changes the level order,
# the rows are not re-sorted (see the next cell for that).
myseries.reorder_levels(['a','b','c'])
Out[15]:
Sort the index¶
In [16]:
# After reordering, sort so the rows are grouped by the new outermost level
myseries.reorder_levels(['a','b','c']).sort_index()
Out[16]:
Reset Index¶
Reset Index¶
- each level of the index becomes a column
- The value of the index just becomes an entry in the row
In [17]:
# With no level argument, every index level becomes an ordinary column.
# The counts column is named explicitly: on pandas < 2.0 the Series itself is
# called 'c', which collides with the 'c' index level and makes a bare
# reset_index() raise "cannot insert c, already exists".
myseries.reset_index(name='count')
Out[17]:
Reset Index -- you can select a level by name or by number¶
In [18]:
# Only the 'b' level is turned into a column; 'a' and 'c' stay in the row index
myseries.reset_index(level ='b')
Out[18]:
Hierarchical columns¶
In [19]:
# Put 'a' outermost on the rows, then lift 'b' and 'c' into a two-level
# column header ('b' on top, 'c' underneath).
mycolumn = (
    myseries
    .reorder_levels(['a', 'b', 'c'])
    .unstack(level=['b', 'c'])
)
mycolumn
Out[19]:
Select from columns¶
In [20]:
# Attribute access on the top column level: selects everything under 'x'
mycolumn.x
Out[20]:
In [21]:
# Chained attribute access: top level 'x' first, then second level 'm'
mycolumn.x.m
Out[21]:
In [22]:
# Address a column in one step with a (top level, second level) tuple
mycolumn[('x','m')]
Out[22]:
Add a new column in the hierarchy¶
In [23]:
# A tuple key names the new column's position in the hierarchy:
# top level 'x', second level 'p'. Its values are the ('x', 'm') column times 5.
mycolumn[('x', 'p')] = mycolumn[('x', 'm')].mul(5)
mycolumn
Out[23]:
Crosstab¶
In [24]:
# Basic: rows are the values of 'a', columns are every ('b', 'c') pair,
# and each cell counts the matching rows of df.
pd.crosstab(df['a'], [df['b'], df['c']])
Out[24]:
In [47]:
# With a value: aggregate column 'd' within each cell instead of counting rows.
# The mean is requested by name: passing np.mean is silently swapped for the
# pandas implementation and emits a FutureWarning on pandas >= 2.1, whereas the
# string 'mean' gives the same result and the same 'mean' column label.
pd.crosstab(
    index=df.a,
    columns=[df.b, df.c],
    values=df.d,
    aggfunc=['mean', len],
)
Out[47]:
In [48]:
# Cross 'c' (rows) against 'b' (columns), reporting for each cell both the
# number of rows and the total of 'd'.
# 'sum' is passed by name: the builtin sum is silently swapped for the pandas
# implementation and emits a FutureWarning on pandas >= 2.1.
pd.crosstab(
    index=df.c,
    columns=df.b,
    values=df.d,
    aggfunc=[len, 'sum'],
)
Out[48]:
Pivot¶
( todo )
In [ ]: