Basic Pandas syntax
import pandas as pd
pd.set_option('precision',3)
pd.set_option('display.width',500)
inpMatPD.info() # gives the memory usage of a object
s = pd.Series([1,3,5,np.nan,6,8]) # Creating a Series
Series.tolist() #Converting Series to list
pandas.__version__ # this will print the version of pandas beign used
#Creating a DataFrame
dates = pd.date_range('20130101',periods=6)
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
(df.head(), df.tail(3), df.index, df.columns, df.describe) #Viewing Data
df.columns = ['a', 'b'] # 2. Assign column names
df.rename(columns={'$a': 'a', '$b': 'b'}, inplace=True) #Rename columns
##Using DataFrame.drop
df.drop(df.columns[[1, 2]], axis=1, inplace=True)
# drop by Name
df1 = df1.drop(['B', 'C'], axis=1)
## Select the ones you want
df1 = df[['a','d']]
df = df.drop(['id','var'],axis=1) #axis 1 for columns
prt1Pivot = prt1.pivot('TAZ','LINENO','EST').reset_index() # Reset index to make index a column
dtPivot=dtPivot.fillna(0) # Fill all missing variable with some number
#Sorting by an axis
df.sort_index(axis=1, ascending=False)
#Sorting by value
df.sort(columns='B')
df.sort(['A', 'B'], ascending=[1, 0])
#Prefixing certain column names using multiple list iterators
df.columns = df.columns.tolist()[:2]+[a+b for a in df.columns.tolist()[2:] for b in ['_sfx'] ]
#Pandas append - appends rows and returns a new object
allRslt=allRslt.append(rslt1)
df.drop_duplicates(['HOUSEID']) # Drop duplicates
df[df.duplicated(['HOUSEID'])] # Identify duplicates
pd.crosstab(df['oDist'],df['oDist']) ) # crosstab
pd.pivot_table(df,index=['oDist'],columns=['oDist'],values='id',aggfunc=len,fill_value=0) # crosstab using pivot_table but it can do more
df.pivot('foo', 'bar', 'baz') # not sure what this does
pd.merge(df1,df2,how='outer',left_on=['id1','id2'],right_on=['v1','v2']) # For suffixes use ",suffixes=('_prt1', '_prt2')"
racData=racData.add_prefix('r_') # Add suffix to all column names
prt3Pivot._get_numeric_data().apply(sum,0) # select only numeric coluns in a dataframe
#Read_csv options - reading a data withcoded null values
persons = pd.read_csv('data.csv',na_values=['NULL','N.A.'], keep_default_na=False)
#pd.Categorical can be used for ordered variables - good for custom sorting. Similar to the R ordered levels, Fits nicely with pivot_table
tr['TRPER'] = pd.Categorical(tr['TRPER'],['EA','AM','MD','PM','EV'])
#Categorical variables can be compared to get boolean operators. Aggregating and then reseting index will destroy the categories.To convert categories into cat code use this
sub_tours_summ['TOPER'].cat.codes
#Drop all columns starting with
dropCols = [val for is_good, val in zip([('rac_' in a or 'wac' in a) for a in df.columns.tolist()], df.columns.tolist()) if is_good]
ctppdata['hTrct'] = ctppdata.GEOID.apply(lambda x: np.int64(x[7:18])) # Pandas string manipulation
estData['totNMTrip']=np.minimum(estData['totNMTrip'],50) # Capping a dataframe column at a certain value
Filtering a DataFrame
print(df)
print(df.loc['a']>0)
print(df.loc[:,df.loc['a']>0])
print(df.loc[df['C']>1.25,:])
print(df.loc[df['textCol'].isin(['cat','mouse']),:])
print(df.loc[~(df['textCol'].isin(['cat','mouse'])),:])
A B C textCol
a -1.085631 0.997345 0.282978 dog
b -1.506295 -0.578600 1.651437 cat
c -2.426679 -0.428913 1.265936 mouse
d -0.866740 -0.678886 -0.094709 cat
A False
B True
C True
textCol True
Name: a, dtype: bool
B C textCol
a 0.997345 0.282978 dog
b -0.578600 1.651437 cat
c -0.428913 1.265936 mouse
d -0.678886 -0.094709 cat
A B C textCol
b -1.506295 -0.578600 1.651437 cat
c -2.426679 -0.428913 1.265936 mouse
A B C textCol
b -1.506295 -0.578600 1.651437 cat
c -2.426679 -0.428913 1.265936 mouse
d -0.866740 -0.678886 -0.094709 cat
A B C textCol
a -1.085631 0.997345 0.282978 dog
# Create DataFrame from vectors of unequal length
d = dict( A = np.array([1,2]), B = np.array([1,2,3,4]) )
df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in d.iteritems() ]))
print(d)
print(df)
{'A': array([1, 2]), 'B': array([1, 2, 3, 4])}
A B
0 1.0 1
1 2.0 2
2 NaN 3
3 NaN 4
s.sample(n=3) # sample 3 random elements from the Series:
df.sample(frac=0.1, replace=True) # random 10% of the DataFrame with replacement:
Shift function and rolling function for time series kind of data
index = pd.date_range('2000-1-1', periods=10, freq='D')
df = pd.DataFrame(data=[3,1,4,2,1,3,2,1,3,2], index=index, columns=['A'])
df['B'] = df['A'].rolling(window=2,center=False).mean()
df['C'] = df['A'].shift(1).rolling(window=2,center=False).mean()
print(df)
## A B C
## 2000-01-01 3 NaN NaN
## 2000-01-02 1 2.0 NaN
## 2000-01-03 4 2.5 2.0
## 2000-01-04 2 3.0 2.5
## 2000-01-05 1 1.5 3.0
## 2000-01-06 3 2.0 1.5
## 2000-01-07 2 2.5 2.0
## 2000-01-08 1 1.5 2.5
## 2000-01-09 3 2.0 1.5
## 2000-01-10 2 2.5 2.0