Python Pandas

Basic Pandas syntax

import pandas as pd
pd.set_option('precision',3)
pd.set_option('display.width',500)
inpMatPD.info() # gives the memory usage of a object
s = pd.Series([1,3,5,np.nan,6,8]) # Creating a Series
Series.tolist() #Converting Series to list

pandas.__version__ # this will print the version of pandas beign used

#Creating a DataFrame
dates = pd.date_range('20130101',periods=6)
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))

(df.head(), df.tail(3), df.index, df.columns, df.describe) #Viewing Data

df.columns = ['a', 'b'] # 2.    Assign column names

df.rename(columns={'$a': 'a', '$b': 'b'}, inplace=True) #Rename columns 




##Using DataFrame.drop
df.drop(df.columns[[1, 2]], axis=1, inplace=True)
# drop by Name
df1 = df1.drop(['B', 'C'], axis=1)
## Select the ones you want
df1 = df[['a','d']]


df = df.drop(['id','var'],axis=1) #axis 1 for columns

prt1Pivot = prt1.pivot('TAZ','LINENO','EST').reset_index() # Reset index to make index a column

dtPivot=dtPivot.fillna(0) # Fill all missing variable with some number

#Sorting by an axis 
df.sort_index(axis=1, ascending=False)
#Sorting by value
df.sort(columns='B')
df.sort(['A', 'B'], ascending=[1, 0])


#Prefixing certain column names using multiple list iterators
df.columns = df.columns.tolist()[:2]+[a+b for a in df.columns.tolist()[2:] for b in ['_sfx'] ]

#Pandas append - appends rows and returns a new object  
allRslt=allRslt.append(rslt1)

df.drop_duplicates(['HOUSEID']) # Drop duplicates
df[df.duplicated(['HOUSEID'])] # Identify duplicates


pd.crosstab(df['oDist'],df['oDist']) ) # crosstab
pd.pivot_table(df,index=['oDist'],columns=['oDist'],values='id',aggfunc=len,fill_value=0) # crosstab using pivot_table but it can do more
df.pivot('foo', 'bar', 'baz') # not sure what this does

pd.merge(df1,df2,how='outer',left_on=['id1','id2'],right_on=['v1','v2']) # For suffixes use ",suffixes=('_prt1', '_prt2')"

racData=racData.add_prefix('r_') # Add suffix to all column names

prt3Pivot._get_numeric_data().apply(sum,0) # select only numeric coluns in a dataframe

#Read_csv options - reading a data withcoded null values
persons  = pd.read_csv('data.csv',na_values=['NULL','N.A.'], keep_default_na=False) 

#pd.Categorical can be used for ordered variables - good for custom sorting. Similar to the R ordered levels, Fits nicely with pivot_table
tr['TRPER'] = pd.Categorical(tr['TRPER'],['EA','AM','MD','PM','EV'])

#Categorical variables can be compared to get boolean operators. Aggregating and then reseting index will destroy the categories.To convert categories into cat code use this
sub_tours_summ['TOPER'].cat.codes

#Drop all columns starting with 
dropCols = [val for is_good, val in zip([('rac_' in a or 'wac' in a) for a in df.columns.tolist()], df.columns.tolist()) if is_good]

ctppdata['hTrct']   = ctppdata.GEOID.apply(lambda x: np.int64(x[7:18])) # Pandas string manipulation

estData['totNMTrip']=np.minimum(estData['totNMTrip'],50) # Capping a dataframe column at a certain value

Filtering a DataFrame

print(df)
print(df.loc['a']>0)
print(df.loc[:,df.loc['a']>0])
print(df.loc[df['C']>1.25,:])
print(df.loc[df['textCol'].isin(['cat','mouse']),:])
print(df.loc[~(df['textCol'].isin(['cat','mouse'])),:])

          A         B         C textCol
a -1.085631  0.997345  0.282978     dog
b -1.506295 -0.578600  1.651437     cat
c -2.426679 -0.428913  1.265936   mouse
d -0.866740 -0.678886 -0.094709     cat
A          False
B           True
C           True
textCol     True
Name: a, dtype: bool
          B         C textCol
a  0.997345  0.282978     dog
b -0.578600  1.651437     cat
c -0.428913  1.265936   mouse
d -0.678886 -0.094709     cat
          A         B         C textCol
b -1.506295 -0.578600  1.651437     cat
c -2.426679 -0.428913  1.265936   mouse
          A         B         C textCol
b -1.506295 -0.578600  1.651437     cat
c -2.426679 -0.428913  1.265936   mouse
d -0.866740 -0.678886 -0.094709     cat
          A         B         C textCol
a -1.085631  0.997345  0.282978     dog

#   Create DataFrame from vectors of unequal length
d = dict( A = np.array([1,2]), B = np.array([1,2,3,4]) )
df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in d.iteritems() ]))
print(d)
print(df)

{'A': array([1, 2]), 'B': array([1, 2, 3, 4])}
     A  B
0  1.0  1
1  2.0  2
2  NaN  3
3  NaN  4

s.sample(n=3) # sample 3 random elements from the Series:
df.sample(frac=0.1, replace=True) # random 10% of the DataFrame with replacement:

Shift function and rolling function for time series kind of data

index = pd.date_range('2000-1-1', periods=10, freq='D')
df = pd.DataFrame(data=[3,1,4,2,1,3,2,1,3,2], index=index, columns=['A'])
df['B'] = df['A'].rolling(window=2,center=False).mean()
df['C'] = df['A'].shift(1).rolling(window=2,center=False).mean()
print(df)

##             A    B    C
## 2000-01-01  3  NaN  NaN
## 2000-01-02  1  2.0  NaN
## 2000-01-03  4  2.5  2.0
## 2000-01-04  2  3.0  2.5
## 2000-01-05  1  1.5  3.0
## 2000-01-06  3  2.0  1.5
## 2000-01-07  2  2.5  2.0
## 2000-01-08  1  1.5  2.5
## 2000-01-09  3  2.0  1.5
## 2000-01-10  2  2.5  2.0

Python Pandas

Raghu Sidharthan