This is a compilation of data.table behavior.
RBasic1, RBasic2, RBasic3, data.table, dplyr, tidyr, RGIS, Leaflet
1. Basic syntax
library(data.table)
DT = data.table(x=c("b","b","b","a","a"),v=rnorm(5)) # same as data.frame
tables() # prints all data tables
identical( DT[list("x","v"),],DT[.("x","v"),]) # aslias for list
str(dtnew) #to get the class type of each column and some samples
estsetM2.4 <- data.table(estsetM2.4,key="sampn,perno") #Initialization
#Setting the key of the data table
setkey(PSN_DT, SAMPN)
setkeyv(maz2010,c("TAZ","mgra"))
# Select columns from the table based on column names
vlst1 <- paste("dur",1:253,sep="");
temp1 <- estsetM4.2.3a[,vlst1, with=FALSE]
temp1 <- DT[,region] # returns the column region as a vector
temp1 <- DT[,.(region)] # returns the column region as a one-column data.table
temp1 <- DT[,.(region,zone)] # returns both columns as a data.table
temp1 <- DT[,list(region,zone)] # same as the .() form: both columns as a data.table
# Change column names
setnames(DT,"V1", "Y")
DT3 <- copy(DT) # rather than DT3 <- DT -- makes a real copy instead of a second reference
# Try .SD of data.table. It stands for Subset of Data.table (the rows of each by-group).
DT = data.table(x=rep(c("a","b","c"),each=2), y=c(1,3), v=1:6)
setkey(DT, y)
DT[, .SD[,paste(x,v, sep="", collapse="_")], by=y]
DT[,print(.SD),by=y]
#Add aggregated variables to the raw data file
set.seed(1234)
smalldat <- data.table(group1 = rep(1:2, each = 5),
group2 = rep(c('a','b'), times = 5),
x = rnorm(10))
smalldat[, aggGroup1 := mean(x), by = group1]
smalldat[, aggGroup1.2 := mean(x), by = list(group1, group2)]
DT[2,] # select row 2
DT[x=="b",] # select rows where column x == "b"
cat(try(DT["b",],silent=TRUE)) # to show that it doesn't work unless keyed
setkey(DT,x)
(try(DT["b",],silent=TRUE)) # to show that it doesn't work unless keyed, also gets sorted
DT["b",mult="first"] # The mult argument allows first/last row of group
DT["b",mult="last"]
DT["b"] # comma is optional
2. Efficiency comparison
#demonstrate the difference between a vector scan and a binary search - from datatable-intro.pdf
# Benchmark: fetch the rows with x == "R" and y == "h" from a table of about
# 10 million rows in three ways and compare the timings:
#   tt - logical subsetting of a plain data.frame
#   ss - lookup on a data.table keyed by (x, y)
#   uu - the same logical expression evaluated inside the data.table
grpsize = ceiling(1e7/26^2) # 10 million rows, 676 groups
# Build the data.frame. tt here holds the construction time only; it is
# overwritten by the vector-scan timing further down.
tt=system.time( DF <- data.frame(
x=rep(LETTERS,each=26*grpsize),
y=rep(letters,each=grpsize),
v=runif(grpsize*26^2),
stringsAsFactors=FALSE)
)
DT = as.data.table(DF)
tt=system.time(ans1 <- DF[DF$x=="R" & DF$y=="h",]) # vector scan
# Keying is timed on its own (result printed, not stored) so that its cost is
# not mixed into the lookup timing ss.
system.time(setkey(DT,x,y)) # one-off cost, usually
ss=system.time(ans2 <- DT[list("R","h")]) # binary search
uu=system.time(ans3 <- DT[x=="R" & y=="h",]) # vector scan using DT
# Print the three timings one after another for comparison.
tt;ss;uu;
3. Fill NA in a data.table
# Method for replacing NA with something -- Matt Dowle
tripData <- fread('tripData.csv',nrows = 10,colClasses = c(rep("integer", 5), rep("NULL", 35)))
# Replace every NA in DT with 0, modifying DT in place (by reference).
#
# Args:
#   DT: a data.table. It is NOT copied; the caller's table is changed.
#
# Returns:
#   DT, invisibly, so the call can be chained.
#
# Only one pass over the columns is needed. The equivalent alternatives are
# kept below as comments for reference rather than being executed as well.
f_dowle3 <- function(DT) {
  # By column number (slightly faster than by name).
  for (j in seq_len(ncol(DT))) {
    set(DT, which(is.na(DT[[j]])), j, 0)
  }
  # Alternative, by column name:
  #   for (j in names(DT)) set(DT, which(is.na(DT[[j]])), j, 0)
  # Alternative, using `:=` (wrap the name variable in parentheses; the old
  # `i := 0, with = FALSE` form is deprecated):
  #   for (i in names(DT)) DT[is.na(get(i)), (i) := 0]
  invisible(DT)
}
4. Misc
#Nice application
library(data.table)
X <- rep("a s", 1e6)
d <- data.frame(1:1e6, replicate(20, X, simplify=FALSE), stringsAsFactors=FALSE)
colnames(d) <- paste("X", seq_len(ncol(d)), sep="")
DT1 <- as.data.table(d)
DT2 <- copy(DT1)
DT3 <- copy(DT2)
# Time an in-place replacement of " " with "_" in every column of DT1 except
# the first one.
system.time({
  cnames <- colnames(DT1)[-1]
  for (cname in cnames) {
    # `(cname) := ...` assigns to the column whose name is stored in cname;
    # the older `cname := ..., with = FALSE` spelling is deprecated.
    # The whole column is handed to gsub(): indexing it with [[1]] would
    # transform only the first value and recycle that result over every row.
    DT1[, (cname) := gsub(" ", "_", DT1[[cname]])]
  }
})
#Apply same function to all the columns (except key)
dtb[, lapply(.SD, mean), by=condition]
Apply same function to multiple columns
dtb[, lapply(.SD, mean), by=condition, .SDcols=2:4]
Alternative below
ids <- paste0("V", 251:300) # get column ids
dt.out <- dt[, lapply(.SD, mean), by=grp, .SDcols = ids]
# .Internal(inspect(DT)) can be used to identify copy by reference and all
# Remove/drop a column
# Method 1 (and preferred as it takes 0.00s even on a 20GB data.table)
df3[,foo:=NULL]
# Method 2a -- A safe idiom for excluding columns matching a regular expression
df3 <- df3[, which(!grepl("^foo$", colnames(df3))), with=FALSE]
# Method 2b -- An alternative to 2a, also "safe" in the sense described below
df3 <- df3[, grep("^foo$", colnames(df3), invert=TRUE), with=FALSE]
data.table also supports the following syntax:
## Method 3 (the result could then be assigned back to df3)
df3[, !"foo", with=FALSE]
# Chained calling
check <- cwalk_maz2taz[,lapply(.SD,is.na)][,lapply(.SD,sum)]
summSED <- summSED[, which(!grepl("INC_WRK", colnames(summSED))), with=FALSE]
Cast and Melt
dcast.data.table
melt
# Use duplicated and sort to remove rows -- duplicated() retains the first occurrence
aa = data.table( a=c(2,4,1,1),b=c(5,1,5,3))
aa[!duplicated(a),]
# Function that receives a column name as a string and uses it as a column name, not as a variable looked up in scope
vnam = "HINCCAT1"
# Compare how the rows of hh2 and hh3 are distributed over the values of one
# column.
#
# Args:
#   vnam: name of the grouping column, passed as a character string.
#
# Returns:
#   A data.table with the grouping column, the per-group row counts ps2 (from
#   hh2) and ps3 (from hh3), and their shares of the column totals, ps2Fr and
#   ps3Fr.
#
# hh2 and hh3 are not arguments; they are looked up in the enclosing
# environment.
cpreVar <- function(vnam) {
  # length() of the single .SDcols column is the number of rows in the group.
  n_hh2 <- hh2[, lapply(.SD, length), by = c(vnam), .SDcols = "ps2"]
  n_hh3 <- hh3[, lapply(.SD, length), by = c(vnam), .SDcols = "ps3"]
  joined <- merge(n_hh2, n_hh3, by = c(vnam))
  joined[, ps2Fr := ps2 / sum(ps2)]
  joined[, ps3Fr := ps3 / sum(ps3)]
  joined
}
cpreVar("HINCCAT1")
Multiple assign in j
DT[,`:=`(new1=sum(colB), new2=sum(colC))] # multiple :=.
# A Fill NA function for easy use with DT
# Return a copy of DT in which every NA has been replaced by `value`.
#
# Args:
#   DT:    a data.table. It is copied first, so the caller's table is left
#          unchanged.
#   value: replacement written into the NA positions of every column.
#          Defaults to 0, which reproduces the previous hard-coded behavior.
#
# Returns:
#   The filled copy of DT.
fillna <- function(DT, value = 0) {
  DT_ret <- copy(DT)
  for (j in seq_len(ncol(DT_ret))) {
    na_rows <- which(is.na(DT_ret[[j]]))
    set(DT_ret, na_rows, j, value)
  }
  DT_ret
}