library(dplyr)library(sqldf)library(data.table)library(ggplot2)library(compare)library(plotrix)#SQL:sqldf(" SELECT COUNT(sid) as 'Number of rows' FROM student_dframe; ") #SQL :sqldf("SELECT * FROM student_dframe LIMIT 3;") #R:head(student_dframe, 3)SQL2=sqldf("SELECT * FROM student_dframe WHERE sex ='男';")R2 = filter(student_dframe, sex=="男")identical(SQL2, R2)3.4,GROUP BY and ORDER BY 分組與排序SQL3 = sqldf("SELECT sex, COUNT(sid) as Total FROM student_dframe WHERE sex IN ('男','女')GROUP BY sex ORDER BY Total DESC ;")R3=student_dframe%>%filter(sex %in%c('F','M','NS','UNK'))%>%group_by(sex) %>%summarise(Total = n())%>%arrange(desc(Total))sid<-c(1,2,3,4,5,6) # 學號一樣 sname<-c('張三', '李四', '王二', '狗蛋', '毛線', '賈伯斯') student_name <- data.frame(sid, sname)SQL4=sqldf("SELECT sd.*, sn.sname as sname FROM student_dframe sdINNER JOIN student_name snON sd.sid=sn.sid ORDER BY sd.sid, sn.sname")R4 = merge(student_dframe, student_name,by=intersect(names(student_dframe), names(student_name)))compare(R4,SQL4,allowAll = TRUE)student_dframe1<-student_dframe[1:3,] #重複利用數據框 前3行R5 <- rbind(student_dframe, student_dframe1)SQL5 <- sqldf("SELECT * FROM student_dframe UNION ALL SELECT * FROM student_dframe1;")compare(R5,SQL5, allowAll = TRUE)student_dframe1<-student_dframe[1:3,]R6 <- semi_join(student_dframe, student_dframe1) SQL6 <- sqldf("SELECT * FROM student_dframe INTERSECT SELECT * FROM student_dframe1;")compare(R6,SQL6, allowAll = TRUE)R7 <- anti_join(student_dframe, student_dframe1) SQL7 <- sqldf("SELECT * FROM student_dframe EXCEPT SELECT * FROM student_dframe1;") compare(R7,SQL7, allowAll = TRUE)SQL對R數據框檢索,排序,篩選後的數據結果依然是一個數據框,這麼我們也可以直接作圖,讓數據可視化。SQL = sqldf("SELECT sex, COUNT(sid) as Total FROM student_dframe WHERE sex IN ('男','女') GROUP BY sex ORDER BY Total DESC ;")SQL$Total=as.numeric(SQL$Total)pie(SQL$Total, labels =SQL$sex,explode=0.1,col=rainbow(4), main="性別分布餅圖",cex.lab=0.5, cex.axis=0.5, cex.main=1,labelcex=1, family='SimSun')ggplot(sqldf('SELECT age, sex FROM student_dframe WHERE age between 0 AND 100 ;'),aes(x=age, fill = sex), family='SimSun')+ geom_density(alpha = 0.6)ggplot(sqldf('SELECT age, sex FROM student_dframe union all SELECT age, sex FROM student_dframe1 union all SELECT age, sex FROM student_dframe1union allSELECT age, sex FROM student_dframe1'), aes(x=age, fill = sex), family='SimSun')+geom_bar(alpha=0.6)
本文經授權轉載自 趣味數據周刊,作者herain
封面圖來源:Photo by Franki Chamaki on Unsplash