# 7月20日R語言筆記——dplyr
#
# 五個基礎函數
#   1. mutate(),新增列
#   2. select(),按列篩選
# 兩個實用技能
#   1: 管道操作 %>% (cmd/ctrl + shift + M),傳遞作用
#   2: count統計某列的unique值,管道操作CTRL + SHIFT + M
#   3. merge()只可以取交集
# 連接
#   1. 內連inner_join,取交集
#   2. 左連left_join,以在前面的數據集為準
#   3. 全連full_join
#   4. 半連接:返回能夠與y表匹配的x表所有記錄semi_join,返回所有與test2匹配的test1
#   5. 反連接:返回無法與y表匹配的x表的所有記錄anti_join
#   6. 數據的簡單合併
# 練習6-1
#   1. 將iris數據框的前4列gather,然後還原
#   2. 將第三列分成兩列(以小數點為分隔符)然後合併
#   3. 加載test1.Rdata,將deg數據框按照pvalue從小到大排序
#   4. 將兩個數據框按照probe_id列連接在一起
# Setup: attach dplyr and build a six-row sample of iris (two rows per species).
# NOTE: the original began with rm(list = ls()); it is dropped because wiping
# the caller's workspace is an unwanted side effect of running a script.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)

test <- iris[c(1:2, 51:52, 101:102), ]
rownames(test) <- NULL # reset row names to 1..6 after subsetting

# Five basic dplyr verbs ----

# 1. mutate(): add a new column.
# Columns are referenced by bare name; no quotes and no `$` are needed.
mutate(test, new = Sepal.Length * Sepal.Width)

# 2. select(): choose columns ----
# (1) by column position
select(test, 1)
select(test, c(1, 5))
select(test, Sepal.Length)

# (2) by column name
select(test, Petal.Length, Petal.Width)
# Column names held in a character vector must be wrapped in a selection
# helper; all_of() is the current replacement for the superseded one_of().
vars <- c("Petal.Length", "Petal.Width")
select(test, all_of(vars))

# (3) a set of useful helper functions from tidyselect
select(test, starts_with("Petal"))
select(test, ends_with("Width"))
select(test, contains("etal"))
select(test, matches(".t.")) # regular expression: `.` matches any character
select(test, everything())
select(test, last_col())
select(test, last_col(offset = 2)) # the column two positions before the last

# (4) everything() can be used to reorder columns
test
select(test, Species, everything()) # move Species to the first position

# 3. filter(): choose rows ----
# Keep the rows of one species.
filter(test, Species == "setosa")
# Combine conditions with the elementwise `&`.
filter(test, Species == "setosa" & Sepal.Length > 5)
# Match against a set of values with %in%.
filter(test, Species %in% c("setosa", "versicolor"))

# 4. arrange(): sort the whole table by one or more columns ----
# Ascending order is the default.
test %>% arrange(Sepal.Length)
# Wrap the column in desc() to sort from largest to smallest.
test %>% arrange(desc(Sepal.Length))
# Two sort keys: Sepal.Length ascending first, then Sepal.Width descending.
test %>% arrange(Sepal.Length, desc(Sepal.Width))
# 5. summarise(): collapse the data to summary values; most useful when
# combined with group_by() ----

# Mean and standard deviation of Sepal.Length over the whole table.
summarise(test, mean(Sepal.Length), sd(Sepal.Length))

# Group by Species first, then compute the mean and standard deviation of
# Sepal.Length within each group.
group_by(test, Species)
summarise(group_by(test, Species), mean(Sepal.Length), sd(Sepal.Length))

# Two handy techniques ----

# 1. The pipe %>% (Cmd/Ctrl + Shift + M) passes the left-hand result on to the
# next call.
library(dplyr)
# Without the pipe: every step needs its own intermediate variable.
x1 <- filter(iris, Sepal.Width > 3)
x2 <- select(x1, Sepal.Length, Sepal.Width)
x3 <- arrange(x2, Sepal.Length) # ascending by Sepal.Length (the default)

# With the pipe: the same three steps as one chain, no intermediates.
# NOTE: the original called rm(list = ls()) at this point, which also deleted
# `test` and made the count() calls below fail; the call is removed, together
# with the unused copy `a = iris`.
colnames(iris)
iris %>%
  filter(Sepal.Width > 3) %>%
  select(Sepal.Length, Sepal.Width) %>%
  arrange(Sepal.Length)

# 2. count(): tally the unique values of a column.
count(test, Species)
# Refer to the column by its bare name; writing test$Sepal.Length inside
# count() bypasses data masking and yields a column named "test$Sepal.Length".
count(test, Sepal.Length)
# Relational data: connecting two tables. Note: keep the key columns as
# character vectors and do not introduce factors.

# 3. merge() (base R): by default keeps only the keys present in both tables
# (the intersection); its all / all.x / all.y arguments widen that.
# NOTE: the original's rm(list = ls()) is removed so this section does not
# wipe the workspace.
options(stringsAsFactors = FALSE)
test1 <- data.frame(
  name = c("jimmy", "nicker", "doodle"),
  blood_type = c("A", "B", "O")
)
test1
test2 <- data.frame(
  name = c("doodle", "jimmy", "nicker", "tony"),
  group = c("group1", "group1", "group2", "group2"),
  vision = c(4.2, 4.3, 4.9, 4.5)
)
test2
test3 <- data.frame(
  NAME = c("doodle", "jimmy", "lucy", "nicker"),
  weight = c(140, 145, 110, 138)
)
merge(test1, test2, by = "name")
# The key columns are named differently but hold the same kind of values.
merge(test1, test3, by.x = "name", by.y = "NAME")
# 1. inner_join(): keep only the keys present in both tables (intersection).
inner_join(test1, test2, by = "name")
# The key columns are named differently but hold the same kind of values.
inner_join(test1, test3, by = c("name" = "NAME"))

# 2. left_join(): the first (left) table decides which rows are kept.
test1
test2
left_join(test1, test2, by = "name")
left_join(test2, test1, by = "name")

# 3. full_join(): keep the rows of both tables.
full_join(test1, test2, by = "name")

# 4. semi_join(): return all rows of x that have a match in y
# (first the rows of test1 matched by test2, then the reverse).
semi_join(x = test1, y = test2, by = "name")
semi_join(x = test2, y = test1, by = "name")

# 5. anti_join(): return all rows of x that have no match in y.
test1
test2
anti_join(x = test2, y = test1, by = "name")

# 6. Simple binding of tables ----
# bind_rows() and bind_cols() correspond to base rbind() and cbind().
# The original note says bind_rows() needs tables with the same number of
# columns and bind_cols() needs data frames with the same number of rows.
# NOTE(review): bind_rows() matches columns by name and fills missing ones
# with NA, so the column-count condition looks like a convention rather than
# a hard requirement; confirm against the dplyr documentation.
# Three small tables: test2 has the same columns as test1, test3 has the same
# number of rows as test1.
test1 <- data.frame(
  x = c(1, 2, 3, 4),
  y = c(10, 20, 30, 40)
)
test1
test2 <- data.frame(
  x = c(5, 6),
  y = c(50, 60)
)
test2
test3 <- data.frame(
  z = c(100, 200, 300, 400)
)
test3
# Stack test2 underneath test1.
test1 %>% bind_rows(test2)
# Put test3's column next to test1's columns.
test1 %>% bind_cols(test3)
# Exercise 6-1 ----

# 1. gather() the first four columns of iris into long form, then restore them.
# NOTE: the original began with rm(list = ls()); it is removed so this section
# does not wipe the workspace.
options(stringsAsFactors = FALSE)
library(tidyr)
head(iris)
data <- iris[, 1:4]
colnames(data)
gather_data <- gather(data, key = var, value = exp)

# spread() needs a column that identifies each original row. Calling it
# directly on gather_data stops with a "unique combination of keys" error
# because every `var` value occurs once per iris row, so the original call is
# kept only as a comment:
# data2 <- spread(gather_data, key = var, value = exp)
# Fix: number the rows within each variable first, then spread.
data3 <- gather_data %>%
  group_by(var) %>%
  mutate(id = seq_len(n())) %>% # row index within each variable
  ungroup() %>%
  spread(var, exp)

# 2. Split the Sepal.Width column in two at the decimal point, then merge the
# pieces back together ----
x <- separate(iris, Sepal.Width, into = c("a", "b"), sep = "[.]")
x
# Sepal.Width is numeric, so a whole number such as 3.0 has no fractional part
# to split off and `b` is NA for those rows. The NA could be replaced in place:
# x$b <- replace_na(x$b, "0"); x
x_re <- unite(x, "Sepal.Width", a, b, sep = ".")
x_re
str(x_re)
# Pitfall: the code above looks fine, but row 2, column 2 of the original data
# is 3.0; the split turns its 0 into NA, so merging the pieces back gives a
# wrong value.
# This is not a common requirement and only serves as an example, but here is
# a solution. `b` is a character column, so the replacement is the string "0";
# current tidyr rejects a numeric 0 for a character column.
x <- separate(iris, Sepal.Width, into = c("a", "b"), sep = "[.]") %>%
  tidyr::replace_na(list(b = "0"))
x
x_re <- unite(x, "Sepal.Width", a, b, sep = ".")
x_re$Sepal.Width <- as.numeric(x_re$Sepal.Width) # convert back to numeric
x_re
str(x_re)

# 3. Load test1.Rdata and sort the deg data frame by p-value, smallest first.
# NOTE: the original called rm(list = ls()) here; it is removed so this
# section does not wipe the workspace.
options(stringsAsFactors = FALSE)
# load() restores the saved objects into the workspace and returns their
# names, so test1 is a character vector of object names.
# NOTE(review): the code below presumes the file provides `deg` and `ids`;
# confirm against test1.Rdata.
test1 <- load("test1.Rdata")
str(test1)
head(deg)
head(ids)
str(deg)
# The exercise asks for ascending order by P.Value. The descending variant is
# stored under its own name so it no longer overwrites the ascending result.
deg <- arrange(deg, P.Value)
deg_desc <- arrange(deg, desc(P.Value))

# 4. Join the two data frames on their shared probe_id column.
p <- inner_join(deg, ids, by = "probe_id")