In [1]:

import pandas as pd
import numpy as np

In [2]:

n=200
df = pd.DataFrame({'a':np.random.choice([0,1,2,3,4], n),
                   'b':np.random.choice(['x','y'], n, p=[.3,.7]),
                   'c':np.random.choice(['m','n'], n),
                   'd':np.round(np.random.uniform(0,1, size=n),2)
                  })

d = pd.DataFrame({'a':np.random.choice([0,1,2,3,4], n),
                 'b':np.random.choice(['x','y'], n, p=[.3,.7])})

Index ( Series & Dataframe )¶

Index (Basic)¶

Single Index

In [3]:

d.head()

Out[3]:

	a	b
0	1	y
1	4	y
2	0	y
3	4	y
4	0	y

In [4]:

d.index

Out[4]:

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            190, 191, 192, 193, 194, 195, 196, 197, 198, 199],
           dtype='int64', length=200)

In [5]:

# Simple assignment
d.index = d.a

# sort by the index
d.sort_index(inplace=True)
d.head()

Out[5]:

	a	b
a
0	0	x
0	0	y
0	0	y
0	0	y
0	0	y

MultiIndex¶

Multi-indexes are ordered from the outside-in. The outermost index is 'level 0'

In [6]:

myseries = df.groupby(['b','a']).c.value_counts()
myseries

Out[6]:

b  a  c
x  0  m     7
      n     3
   1  m    12
      n     8
   2  m     7
      n     6
   3  n     9
      m     7
   4  n     8
      m     6
y  0  n    15
      m     5
   1  m    13
      n    11
   2  m    12
      n    12
   3  n    14
      m     8
   4  m    21
      n    16
dtype: int64

MultiIndex Obects¶

Levels can be referred to by name ( listed in names ) or by index ( 0 is the outermost level )

In [7]:

myseries.index

Out[7]:

MultiIndex(levels=[['x', 'y'], [0, 1, 2, 3, 4], ['m', 'n']],
           labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4], [0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1]],
           names=['b', 'a', 'c'])

In [8]:

# The "level 0" index
print(myseries.index.levels[0])
print(myseries.index.labels[0])
print(myseries.index.names[0])

Index(['x', 'y'], dtype='object', name='b')
FrozenNDArray([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype='int8')
b

In [9]:

# Levels can be re-named
myseries.index.names = ["cat","hat","bat"]
print(myseries.index)
myseries.index.names = ['b','a','c']

MultiIndex(levels=[['x', 'y'], [0, 1, 2, 3, 4], ['m', 'n']],
           labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4], [0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1]],
           names=['cat', 'hat', 'bat'])

Select a subset of the Series based on the outermost index¶

In [10]:

myseries.loc['x']

Out[10]:

a  c
0  m     7
   n     3
1  m    12
   n     8
2  m     7
   n     6
3  n     9
   m     7
4  n     8
   m     6
dtype: int64

Select an subset of the Series based on more than one index level¶

The outermost index goes first . You can keep drilling down

In [11]:

myseries.loc['x', 1]

Out[11]:

c
m    12
n     8
dtype: int64

In [12]:

myseries.loc['x', 1,'m']

Out[12]:

Reshaping¶

unstack : Move an index level up into the columns ( Wide format )¶

In [13]:

myseries.unstack(level='b')

Out[13]:

	b	x	y
a	c
0	m	7	5
0	n	3	15
1	m	12	13
1	n	8	11
2	m	7	12
2	n	6	12
3	m	7	8
3	n	9	14
4	m	6	21
4	n	8	16

stack : Move a level from the columns into the row index ( Long format )¶

In [14]:

myseries.unstack(level='a').stack(level='a')

Out[14]:

b  c  a
x  m  0     7
      1    12
      2     7
      3     7
      4     6
   n  0     3
      1     8
      2     6
      3     9
      4     8
y  m  0     5
      1    13
      2    12
      3     8
      4    21
   n  0    15
      1    11
      2    12
      3    14
      4    16
dtype: int64

Reorder Levels¶

In [15]:

myseries.reorder_levels(['a','b','c'])

Out[15]:

a  b  c
0  x  m     7
      n     3
1  x  m    12
      n     8
2  x  m     7
      n     6
3  x  n     9
      m     7
4  x  n     8
      m     6
0  y  n    15
      m     5
1  y  m    13
      n    11
2  y  m    12
      n    12
3  y  n    14
      m     8
4  y  m    21
      n    16
dtype: int64

Sort the index¶

In [16]:

myseries.reorder_levels(['a','b','c']).sort_index()

Out[16]:

a  b  c
0  x  m     7
      n     3
   y  m     5
      n    15
1  x  m    12
      n     8
   y  m    13
      n    11
2  x  m     7
      n     6
   y  m    12
      n    12
3  x  m     7
      n     9
   y  m     8
      n    14
4  x  m     6
      n     8
   y  m    21
      n    16
dtype: int64

Reset Index¶

each level of the index becomes a column
The value of the index just becomes an entry in the row

In [17]:

myseries.reset_index()

Out[17]:

	b	a	c	0
0	x	0	m	7
1	x	0	n	3
2	x	1	m	12
3	x	1	n	8
4	x	2	m	7
5	x	2	n	6
6	x	3	n	9
7	x	3	m	7
8	x	4	n	8
9	x	4	m	6
10	y	0	n	15
11	y	0	m	5
12	y	1	m	13
13	y	1	n	11
14	y	2	m	12
15	y	2	n	12
16	y	3	n	14
17	y	3	m	8
18	y	4	m	21
19	y	4	n	16

Reset Index -- you can select a level by name or by number¶

In [18]:

myseries.reset_index(level ='b')

Out[18]:

		b	0
a	c
0	m	x	7
0	n	x	3
1	m	x	12
1	n	x	8
2	m	x	7
2	n	x	6
3	n	x	9
3	m	x	7
4	n	x	8
4	m	x	6
0	n	y	15
0	m	y	5
1	m	y	13
1	n	y	11
2	m	y	12
2	n	y	12
3	n	y	14
3	m	y	8
4	m	y	21
4	n	y	16

Hierarchical columns¶

In [19]:

mycolumn = myseries.reorder_levels(['a','b','c']).unstack(level=['b', 'c'])
mycolumn

Out[19]:

b	x		y
c	m	n	n	m
a
0	7	3	15	5
1	12	8	11	13
2	7	6	12	12
3	7	9	14	8
4	6	8	16	21

Select from columns¶

In [20]:

mycolumn.x

Out[20]:

c	m	n
a
0	7	3
1	12	8
2	7	6
3	7	9
4	6	8

In [21]:

mycolumn.x.m

Out[21]:

a
0     7
1    12
2     7
3     7
4     6
Name: m, dtype: int64

In [22]:

mycolumn[('x','m')]

Out[22]:

a
0     7
1    12
2     7
3     7
4     6
Name: (x, m), dtype: int64

Add a new column in the hierarchy¶

In [23]:

mycolumn[('x','p')] = mycolumn[('x','m')]*5
mycolumn

Out[23]:

b	x		y		x
c	m	n	n	m	p
a
0	7	3	15	5	35
1	12	8	11	13	60
2	7	6	12	12	35
3	7	9	14	8	35
4	6	8	16	21	30

Crosstab¶

In [24]:

#Basic
pd.crosstab(index = df.a,
            columns = [df.b, df.c])

Out[24]:

b	x		y
c	m	n	m	n
a
0	7	3	5	15
1	12	8	13	11
2	7	6	12	12
3	7	9	8	14
4	6	8	21	16

In [47]:

# With a value
pd.crosstab(index = df.a,columns = [df.b, df.c], 
            values=df.d, 
            aggfunc=[np.mean, len] )

Out[47]:

	mean				len
b	x		y		x		y
c	m	n	m	n	m	n	m	n
a
0	0.562857	0.406667	0.382000	0.652000	7	3	5	15
1	0.486667	0.338750	0.496154	0.440000	12	8	13	11
2	0.467143	0.346667	0.589167	0.433333	7	6	12	12
3	0.418571	0.515556	0.432500	0.404286	7	9	8	14
4	0.648333	0.598750	0.382381	0.528125	6	8	21	16

In [48]:

pd.crosstab(index = df.c,columns = df.b, 
            values= df.d , 
            aggfunc= [len, sum] )

Out[48]:

	len		sum
b	x	y	x	y
c
m	39	59	19.87	26.92
n	34	68	15.44	33.93

Pivot¶

( todo )

In [ ]:

Pandas Multi-index notes

Index ( Series & Dataframe )¶

Index (Basic)¶

MultiIndex¶

MultiIndex Obects¶

Select a subset of the Series based on the outermost index¶

Select an subset of the Series based on more than one index level¶

Reshaping¶

unstack : Move an index level up into the columns ( Wide format )¶

stack : Move a level from the columns into the row index ( Long format )¶

Reorder Levels¶

Sort the index¶

Reset Index¶

Reset Index¶

Reset Index -- you can select a level by name or by number¶

Hierarchical columns¶

Select from columns¶

Add a new column in the hierarchy¶

Crosstab¶

Pivot¶

Written by sideprojects in Notes on Sat 07 May 2016. Tags: data analysis, pandas,

Links

Tags

Social

	b	a	c	0
0	x	0	m	7
1	x	0	n	3
2	x	1	m	12
3	x	1	n	8
4	x	2	m	7
5	x	2	n	6
6	x	3	n	9
7	x	3	m	7
8	x	4	n	8
9	x	4	m	6
10	y	0	n	15
11	y	0	m	5
12	y	1	m	13
13	y	1	n	11
14	y	2	m	12
15	y	2	n	12
16	y	3	n	14
17	y	3	m	8
18	y	4	m	21
19	y	4	n	16

		b	0
a	c
0	m	x	7
0	n	x	3
1	m	x	12
1	n	x	8
2	m	x	7
2	n	x	6
3	n	x	9
3	m	x	7
4	n	x	8
4	m	x	6
0	n	y	15
0	m	y	5
1	m	y	13
1	n	y	11
2	m	y	12
2	n	y	12
3	n	y	14
3	m	y	8
4	m	y	21
4	n	y	16

	b	a	c	0
0	x	0	m	7
1	x	0	n	3
2	x	1	m	12
3	x	1	n	8
4	x	2	m	7
5	x	2	n	6
6	x	3	n	9
7	x	3	m	7
8	x	4	n	8
9	x	4	m	6
10	y	0	n	15
11	y	0	m	5
12	y	1	m	13
13	y	1	n	11
14	y	2	m	12
15	y	2	n	12
16	y	3	n	14
17	y	3	m	8
18	y	4	m	21
19	y	4	n	16

		b	0
a	c
0	m	x	7
0	n	x	3
1	m	x	12
1	n	x	8
2	m	x	7
2	n	x	6
3	n	x	9
3	m	x	7
4	n	x	8
4	m	x	6
0	n	y	15
0	m	y	5
1	m	y	13
1	n	y	11
2	m	y	12
2	n	y	12
3	n	y	14
3	m	y	8
4	m	y	21
4	n	y	16

	b	a	c	0
0	x	0	m	7
1	x	0	n	3
2	x	1	m	12
3	x	1	n	8
4	x	2	m	7
5	x	2	n	6
6	x	3	n	9
7	x	3	m	7
8	x	4	n	8
9	x	4	m	6
10	y	0	n	15
11	y	0	m	5
12	y	1	m	13
13	y	1	n	11
14	y	2	m	12
15	y	2	n	12
16	y	3	n	14
17	y	3	m	8
18	y	4	m	21
19	y	4	n	16

		b	0
a	c
0	m	x	7
0	n	x	3
1	m	x	12
1	n	x	8
2	m	x	7
2	n	x	6
3	n	x	9
3	m	x	7
4	n	x	8
4	m	x	6
0	n	y	15
0	m	y	5
1	m	y	13
1	n	y	11
2	m	y	12
2	n	y	12
3	n	y	14
3	m	y	8
4	m	y	21
4	n	y	16