Skip to content

Pandas

Credit to https://github.com/hangsz/pandas-tutorial

import numpy as np
import pandas as pd

IO

# load
df = pd.read_csv(path, header=None) # no column names (first line is data)

# save
df.to_csv(path, header=False, index=False) # do not write row and column names (will be default int)

Data Structure

Series (Ordered, Indexed, Homogenous, One-Dimensional)
pd.Series(data=None, index=None, dtype=None, name=None, copy=False)
# index := range(0, data.shape[0])
# dtype := float64

s = pd.Series([1,2,3], ['a','b','c'])
s = pd.Series({'a':1, 'b':2, 'c':3}) # indexed data

## attributes
s.name
s.values
s.index
s.dtype
DataFrame (Indexed, Heterogenous, Two-Dimensional)

Series of Series.

pd.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)
# index := range(0, data.shape[0])
# columns := range(0, data.shape[1])

data = [[1,2,3],
        [4,5,6]]
index = ['a','b']
columns = ['A','B','C']
df = pd.DataFrame(data=data, index = index, columns = columns)

## attributes
df.index
df.columns
df.values
df.dtypes # of every column
df.shape
df.size
df.head()
df.tail()
df.memory_usage(deep=False) # deep: whether count reference
df.descrive(include='all') # simple stats of columns

## create from file
df = pd.read_csv(filepath_or_buffer, sep=',', header='infer', index_col=None, names=None)
# If csv has no header: set header=None
# index_col=None: create a new range. 
# index_col=0: use the first col 
# index_col='name': use the 'name' col

df = pd.read_excel(io, sheetname=0, header=0)

Manipulate Data

Series
### Access
mask = [True,True,False]
## [], index
s[0]
s[0:2]
s[[0,1]]
s[mask]
## .loc[] == []
s.loc['a']
## .iloc[], position
s.iloc[0]
s.iloc[0:2]
s.iloc[[0,2]]
s.iloc[mask]

### Modify value
s[0] = 1
s.replace(to_replace, value, inplace=False) # can replace a list of values

### Modify index
s.index = new_index
s.rename(index=None, level=None, inplace=False)

### append
s.loc[100] = 100
# `s[100] = 100` lead to error, use [] only to access.
s.append(s2, ignore_index=False) # ignore_index: erase index of s2

### delete
del s[100]
s.drop([100, 'a'])

### unique
s.unique()
DataFrame
data = [[1,2,3],
        [4,5,6]]
index = ['a','b']
columns = ['A','B','C']
df = pd.DataFrame(data=data, index=index, columns = columns)

### access

## []
df['A'] # column, return Series
df[['A', 'B']] # multiple columns, return DF
# `df[0]` error, 0 is not a column name.
df[0:1] # row, return DF! use df.iloc[0] to get same data as Series
df[mask] # row, return DF

## .loc()
df.loc['b'] # == df.loc['b', :], return Series
# `df.loc['B']` error, 'B' not in index (rows)
df.loc[:, 'B'] # return Series
df.loc['b', 'B'] # return scalar
df.loc[['a', 'b'], 'B']
df.loc[mask, 'B']

## .iloc()
df.iloc[0,0]
df.iloc[0:2, 0]
df.iloc[[0,1], [0,2]] # return DF
df.iloc[mask1, mask2]

### modify value
df.loc['a', 'A'] = 0

### modify name
df.index = [...]
df.columns = [...]

### append rows
df.loc['c'] = [7,8,9]
pd.concat([df1, df2], axis=0)

### append cols
df['D'] = [10, 11]
pd.concat([df1, df2], axis=1)

### remove
df.drop(['a', 'b'], axis=0) # row
df.drop(['A', 'B'], axis=1) # col

### unique
df.drop_duplicates()


### List as column
## select non empty rows
df[df['list'].str.len() > 0]

Conditioned Query

df.loc[df.country == 'Italy']
df.loc[df.country.isin(['Italy', 'New Zealand'])]
df.loc[(df.points>80) & (df.country=='Italy')] # Never use `and`, `or`. Must use ()&(), ()|()
df.loc[df.price.notnull()] # or isnull()

Grouping and Sorting

df.groupby(by=None, axis=0, ...)
# combine rows, followed by a function.

>>> df
   Animal  Max Speed
0  Falcon      380.0
1  Falcon      370.0
2  Parrot       24.0
3  Parrot       26.0

>>> df.groupby(['Animal']).mean()
        Max Speed
Animal
Falcon      375.0
Parrot       25.0

>>> df.groupby(['Animal'])
<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x000002B607F06128>

Missing values

### find na rows

is_NaN = df.isnull()
row_has_NaN = is_NaN.any(axis=1)
rows_with_NaN = df[row_has_NaN]
print(rows_with_NaN)

### fillna
def encode_label(train, columns, test=None):
    from sklearn.preprocessing import LabelEncoder
    for column in columns:
        train[column] = train[column].fillna('na')
        enc = LabelEncoder()
        enc.fit(train[column])
        train[column] = enc.transform(train[column])
        if test is not None:
            test[column] = test[column].fillna('na')
            test[column] = enc.transform(test[column])
    return train if test is None else train, test

def impute(train, columns, test=None):
    from sklearn.impute import SimpleImputer
    for column in columns:
        imp = SimpleImputer(missing_values=np.nan, strategy='mean')
        imp.fit(train[[column]])
        train[column] = imp.transform(train[[column]])
        if test is not None:
            test[column] = imp.transform(test[[column]])      
    return train if test is None else train, test

Melting

df.melt(id_vars=None, value_vars=None, var_name=None, value_name='value', col_level=None)
# id_vars[list]: cols to use as identifier variables. If None, use nothing.
# value_vars[list]: cols to unpivot. If None, use all cols not in id_vars.
# var_name[scalar]:  name for 'variable' col.
# value_name[scalar]: name for 'value' col.
>>> df
     Name    Course  Age
0    John   Masters   27
1     Bob  Graduate   23
2  Shiela  Graduate   21

>>> df.melt(id_vars=['Name'])
     Name variable     value
0    John   Course   Masters
1     Bob   Course  Graduate
2  Shiela   Course  Graduate
3    John      Age        27
4     Bob      Age        23
5  Shiela      Age        21

>>> df.melt(id_vars=['Name', 'Course'], value_vars=['Age'])
     Name    Course variable  value
0    John   Masters      Age     27
1     Bob  Graduate      Age     23
2  Shiela  Graduate      Age     21

>>> df.melt(value_vars=['Age'])
  variable  value
0      Age     27
1      Age     23
2      Age     21

Pivoting

df.pivot(index=None, columns=None, values=None)
# index[str/obj]: index for new df. If None, use current.
# columns[str/obj]: col for new df.
# values[str/obj/list]: If None, use all remained.

>>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
                      'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
                      'baz': [1, 2, 3, 4, 5, 6],
                      'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
>>> df
    foo   bar  baz  zoo
0   one   A    1    x
1   one   B    2    y
2   one   C    3    z
3   two   A    4    q
4   two   B    5    w
5   two   C    6    t

>>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])
      baz       zoo
bar   A  B  C   A  B  C
foo
one   1  2  3   x  y  z
two   4  5  6   q  w  t

>>> df.pivot(index='foo', columns='bar', values='baz')
bar  A   B   C
foo
one  1   2   3
two  4   5   6

Merging

pd.merge(left, right, how='inner', on=None)
# how: inner, outer, left, right

Options

pd.get_option(key)
pd.set_option(key, value)
pd.reset_option(key)

"display.max_rows"
"display.max_cols"
"display.precision"
"display.max_colwidth"

Others

# arithm
df.add(other, axis='columns')
df1.dot(df2)
df.abs()
df.clip(mn, mx)
df.sum(axis='rows') 

# functional
df.apply(func, axis=0, raw=False, ...)
df.pipe(func, *args, **kwargs)