#################################
#時間:2020-07-29
################################
options(warn = -1)
# 加載data.table包
library(data.table)
# 數據導入
mt <- fread("https://raw.githubusercontent.com/selva86/datasets/master/mtcars.csv")
head(mt)
class(mt)
# 數據讀取性能對比分析
# Create a large .csv file
set.seed(100)
m <- data.frame(matrix(runif(10000000), nrow=1000000))
write.csv(m, 'm2.csv', row.names = F)
# Time taken by read.csv to import
system.time({m_df <- read.csv('m2.csv')})
# Time taken by fread to import
system.time({m_dt <- fread('m2.csv')})
# data.frame轉換為data.table
data("mtcars")
head(mtcars)
mtcars$carname <- rownames(mtcars)
mtcars_dt <- as.data.table(mtcars)
class(mtcars_dt)
mtcars_copy <- copy(mtcars)
setDT(mtcars_copy)
class(mtcars_copy)
# data.frame轉換為data.table
setDF(mtcars_copy)
class(mtcars_copy)
# 數據選擇操作
# dataframe syntax
mtcars[mtcars$cyl == 6 & mtcars$gear == 4, ]
# datatable syntax
mtcars_dt[cyl==6 & gear==4, ]
mtcars[, 1]
mtcars_dt[, 1]
mtcars_dt[, 1, with=F]
mtcars_dt[, mpg]
myvar <- "mpg"
mtcars_dt[, myvar, with=F]
columns <- c('mpg', 'cyl', 'disp')
mtcars_dt[, columns]
mtcars_dt[, columns, with=FALSE]
# syntax 1:
mtcars_dt[1:4, list(mpg, cyl, gear)]
# syntax 2: most used
mtcars_dt[, .(mpg, cyl, gear)]
# 刪除列
drop_cols <- c("mpg", "cyl", "gear")
mtcars_dt[, !drop_cols, with=FALSE]
# 列名重新命名
setnames(mtcars_dt, 'vs', 'engine_type')
colnames(mtcars_dt)
DT <- data.table(A=1:5)
DT[ , X := shift(A, 1, type="lag")]
DT[ , Y := shift(A, 1, type="lead")]
# Solution 1
aq_dt <- data.table(airquality)
aq_dt[!is.na(Ozone), .(Solar.R, Wind, Temp)]
# Solution 2
setDT(airquality)
airquality[!is.na(Ozone), .(Solar.R, Wind, Temp)]
# 增加新的列
# data.frame syntax (works on data.table)
mtcars_dt$cyl_gear <- mtcars_dt$cyl + mtcars_dt$gear
# data.table syntax
mtcars_dt[, cyl_gear2 := cyl + gear]
mtcars_dt[, `:=`(cyl_gear3 = cyl * gear,
cyl_gear4 = cyl - gear)]
mtcars_dt
mtcars_dt[, .(cyl_gear3 = cyl * gear,
cyl_gear4 = cyl - gear)]
# 分組操作
mtcars_dt[, .(mean_mileage=mean(mpg)), by=.(cyl, gear)]
mtcars_dt[, .N, by=cyl]
dt1 <- mtcars_dt[, .(mean_mpg=mean(mpg),
mean_disp=mean(disp),
mean_wt=mean(wt),
mean_qsec=mean(qsec)), by=cyl]
output <- dt1[order(cyl), ]
output
output <- mtcars_dt[, .(mean_mpg=mean(mpg),
mean_disp=mean(disp),
mean_wt=mean(wt),
mean_qsec=mean(qsec)), by=cyl][order(cyl), ]
# data.table中使用函數
output <- mtcars_dt[, lapply(.SD, mean), by=cyl, .SDcols=c("mpg", "disp", "hp", "drat", "wt", "qsec")]
output
# 數據連接
dt1 <- mtcars_dt[5:25,.(carname, mpg, cyl)]
dt2 <- mtcars_dt[1:10, .(carname, gear)]
dt3 <- mtcars_dt[2:12, .(carname, disp)]
# Inner Join
merge(dt1, dt2, by='carname')
#> <returns 6 rows>
# Left Join
merge(dt1, dt2, by='carname', all.x = T)
#> <returns 21 rows>
# Outer Join
merge(dt1, dt2, by='carname', all = T)
#> <returns 25 rows>
# 參考資料
# https://www.machinelearningplus.com/data-manipulation/datatable-in-r-complete-guide/