R Basic 1

This is a compilation of basic R utilities.

RBasic1, RBasic2, RBasic3, data.table, dplyr, tidyr, RGIS, Leaflet

1. System functions

#RStudio Clear Screen = Ctrl+L
sessionInfo()           # R version, platform, and versions of attached/loaded packages
rm(list=ls(all=TRUE))   # Remove every object in the current environment, including dot-prefixed (hidden) names
rm(list=ls(pattern = "acc")) # Remove only the objects whose names match the regular expression "acc"
gc()                    # Run garbage collection and report memory usage
getwd()                 # Full path of the current working directory
basename(getwd())       # Last path component, i.e. the working directory's own name
dirname(getwd())        # Path of the working directory's parent
# Setting directory: build the path from a project root, then switch to it.
# Backslashes are doubled because "\" is the escape character in R strings.
PrjDr <- "C:\\Temp"
curDir <- paste0(PrjDr,"\\Tasks\\First\\Second");setwd(curDir)
file.edit('~/.Rprofile') # Open the user-level .Rprofile in an editor
# Sets the rpubs.upload.method option to "internal"
# NOTE(review): presumably read by RStudio when publishing to RPubs — confirm
options(rpubs.upload.method = "internal")
#Function example
# Example function: returns the sum of its three arguments.
#   x, y, z  numeric values; vectors are added elementwise.
# The third formal must be `z` (the body uses it); a repeated formal
# name such as function(x, y, x) is rejected by R at parse time.
Mysummary <- function(x, y, z) {
  x + y + z
}
# objects in memory
sort( sapply(ls(),function(x){object.size(get(x))})) 
#Run time
system.time(Mysummary(2,5,3))
sapply(df,class) # get column type for each column
.libPaths('C:\\Users\\sidharthanr\\Documents\\R\\win-library\\3.2') # to add user libs
Sys.time() # to print current date/time
suppressMessages(library(bit64)) # to suppress messages while loading a library
#Use library() to see all packages installed, search() to see all packages loaded 
str(df) # Compactly Display the Structure
if(exists('inpShpFile')) # check whether an object named inpShpFile exists

2. Reading and Writing files

#Reading and writing a DBF file
library(foreign);ind <- read.dbf("Ind_CNS2011.dbf", as.is = FALSE)
write.dbf(ind,"ind_out.dbf")
#Reading and writing a SPSS file
library(foreign);test1 <- read.spss("spss.sav")
library(memisc);test2 <- as.data.set(spss.system.file("spss.sav"))

# Read a CSV that has a header row, letting R infer the column types
incsv <- read.csv("test.csv", header = TRUE)
# Same read with explicit column types: 1 numeric column, then 2 character columns
incsv <- read.csv("test.csv", header = TRUE,
                  colClasses = c(rep("numeric", 1), rep("character", 2)))
# read.table with the separator and header given explicitly
incsv <- read.table("test.csv", sep = ",", header = TRUE)
# Write comma-separated output without the row-name column
write.csv(incsv, file = "out.csv", row.names = FALSE)
# Overwrite out.csv (append = FALSE) as a TAB-separated file with quoted strings.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can be reassigned.
write.table(incsv, file = "out.csv", append = FALSE, quote = TRUE, sep = "\t",
            na = "NA", row.names = FALSE, col.names = TRUE)

## returns a vector equal to the number of lines in the file
count.fields("out.csv", sep = "\t")
## returns the maximum to set colClasses
max(count.fields("out.csv", sep = "\t"))

3. Common operations

colMeans(aq,na.rm = T)  # removes na records by variable(not by row) - see also colSums
sapply(PSN,class)       # get column data type
selNum <- sapply(PERTYPE_march,is.numeric) # select numeric cols
mgd <- merge(ds1,  df2[,c("v1","v2")], by = c("v1","v2") ) # Merge
x %% y                  #modulus (x mod y) 5%%2 is 1
x %/% y                 #integer division 5%/%2 is 2
identical( AB1, AB2 )   #Check whether identical
aq <- aq[order(aq$v1, aq$v2),]  #Sort DF
df$sl <- formatC(df$sl,width=2,format="d",flag = "0") # 1.  Formatting and padding number with leading zeros
aq2 <- subset(aq, select = -c(Ozone,Solar.R) ) # Dropping specific columns 
aq2 <- aq[aq$Month==5,] # dropping observation
aq2 <- subset(aq,Month==5) # dropping observation
paste("str",num,sep="") # concat string and numeric - see also paste0 (default sep="")
paste("create table t",i,sep="")
paste0("no", "gap") # pastes with default sep=""

assign(paste0("ds",10),read.csv(paste("dset",10,".csv",sep=""))) #dynamic LHS dset name
get("sht57") # opposite of assign/returns the object with that name
strsplit(x, "e") # String split at delimiter- to get vector
aq2 <- aggregate(list(mnOzone=aq$Ozone),aq['Month'],mean,na.rm = T) # aggregate in data frame
temp <- avgDistTbl[avgDistTbl$trippurpD %in% c('WBO','OBO','HBU','HBSC'),] # Using the 'in' to check in a list
# "%in%" <- function(x, table) match(x, table, nomatch = 0) > 0
quantile(tstData,probs = (1:5)/5) # gives 5 quantiles
LTripStr$trippurp[is.na(LTripStr$nStrStps)]=2 #Check variables NA - missing value
df[duplicated(df), ] # select duplicated records
feature$ATTR <- factor(feature$ATTR) # R doesn't automatically drop unused factor levels; re-factor to drop them

# creating ordered factors - below within data.table
tripData[,tripCat:=factor(tripCat,levels=c('HBW','HBO','NHB'),ordered=TRUE)]


# create cross tabs
mytable <- xtabs(~PERTYPE+HHVEH+distBand, data=trpGPSNS3)
xtabs(EST~origCnty+destCnty,wrkFlow) # sums up the EST variable; wrkFlow is the dataset name

#Random number related
set.seed(123) # sets seed
runif(nAlt*nvar*nObs, min=0, max=1) # random uniform distribution

# Creating formulae dynamically from passed strings or using paste
f1 <- as.name(names(allTemps)[1])
allTempsSMRY <- dcast.data.table(allTempsMelt,eval(bquote(.(f1)~variable+scenName)),value.var = 'value',fill=-1)

File Access Functions

#setwd('C:\\Users\\sidharthanr\\Dropbox\\Resource\\Git\\Resource')
print(file.info('test.csv'))

##          size isdir mode               mtime               ctime
## test.csv  441 FALSE  666 2015-08-19 07:52:11 2016-09-30 19:14:15
##                        atime exe
## test.csv 2016-09-30 19:14:15  no

print(list.files())

##  [1] "docs"          "dplyr.Rmd"     "Leaflet.Rmd"   "Numpy.Rmd"    
##  [5] "out.csv"       "Pandas.Rmd"    "PyBasic1.html" "PyBasic1.Rmd" 
##  [9] "RBasic1.Rmd"   "RBasic2.Rmd"   "RBasic3.Rmd"   "RDTable.Rmd"  
## [13] "README.md"     "RGIS.Rmd"      "RMisc.Rmd"     "Starter.Rmd"  
## [17] "test.csv"      "tidyr.Rmd"

Parallel Threading

library(parallel)
# Leave one core free, but never request fewer than one worker
# (detectCores() can return 1 or NA, which would make the count 0 or NA).
cl <- makeCluster(max(1L, detectCores() - 1L, na.rm = TRUE))
# Load rgdal on every worker
clusterEvalQ(cl, library("rgdal"))
# Copy tranFileList2 to the workers so the function below can index it
clusterExport(cl, c("tranFileList2"))
# Read each fixed-width file on the cluster and time the whole run.
# seq_along() gives an empty index for an empty file list, where
# 1:length() would give c(1, 0) and try to read non-existent entries.
system.time(
  inpData <- parLapply(cl, seq_along(tranFileList2), function(ii) {
    read.fwf(tranFileList2[ii],
             widths = c(20, 5, 5, 7, 7, 5, 7, 7, 2, 2, 7, 7, 7, 7, 7))
  })
)
stopCluster(cl) # To stop cluster and free resources

Creating a PDF (probability density plot) from a vector

plot(density(runif(50)))

R Basic 1

Raghu Sidharthan