在 R 中分别通过 dplyr 与 aggregate 计算中位数

旋转医生

您好：当我分别通过 dplyr/tidyr 和 aggregate() 计算各组的中位数时，两种方式得到的中位数略有不同。谁能解释其中的区别？谢谢！

# Example dataset: a 25-row data frame with a three-level factor `d3`
# (some entries NA) and seven numeric columns that also contain NAs.
# It is built with structure() so that the factor labels, column names and
# row names are set explicitly.
out2<-structure(list(d3 = structure(c(1L, 2L, NA, NA, 1L, 1L, NA,  
2L,NA,3L,1L, NA, NA, 1L, 3L, NA, 1L, 2L, 3L, 2L, 1L, 3L, 2L, 3L, 1L), .Label 
=     c("Professional journalist", "Elected politician", "Online blogger"), 
class = "factor"), Accessible = c(3, 5, 2,NA, 1, 2, NA, 3, NA, 4, 2, 5, NA, 
3, 4, NA, 2, NA, 3, 4, 4, 4,2, 2, 2), Information = c(1, 2, 1, NA, 4, 1, NA, 
2, NA, 2, 1, 1, NA, 4, 1, NA, 1, 1, 1, 3, 1, 3, 3, 4, 1), Responsive = c(5, 
4, 6, NA, 2, 3, NA, 1, NA, 5, 4, 4, NA, 6, 3, NA, 4, NA, 2, 2, 6, 2, 1, 1, 
3), Debate = c(6, 3, 4, NA, 3, 4, NA, 5, NA, 6, 5,6, NA, 1, 5, NA, 5, 2, NA,
1, 5, 6, 5, 5, 7), Officials = c(2,1, 5, NA, 5, 5, NA, 6, NA, 3, 6, 2, NA, 2,
2, NA, 6, 3, NA, 5,2, 5, 4, 6, 5), Social = c(7, 6, 7, NA, 7, 7, NA, 4, NA,
7, 7,
7, NA, 7, 7, NA, 7, NA, NA, 7, 7, 1, 6, 7, 6), `Trade-Offs` = c(4, 
7, 3, NA, 6, 6, NA, 7, NA, 1, 3, 3, NA, 5, 6, NA, 3, NA, NA,
6, 3, 7, 7, 3, 4)), .Names = c("d3", "Accessible", "Information",    
"Responsive", "Debate", "Officials", "Social", "Trade-Offs"), row.names = 
c(171L, 126L, 742L, 379L, 635L, 3L, 303L, 419L, 324L, 97L, 758L, 136L, 
770L, 405L, 101L, 674L, 386L, 631L, 168L, 590L, 731L, 387L, 673L, 208L, 
728L), class = "data.frame")

# Per-group medians via tidyr + dplyr.
# Every column except the first (d3) is reshaped to long form, rows with a
# missing d3 are discarded, and the NA-ignoring median is taken for each
# (d3, variable) pair. The last step turns the result into a plain data.frame.
test <- out2 %>%
  gather(variable, value, -1) %>%
  filter(!is.na(d3)) %>%
  group_by(d3, variable) %>%
  summarise(value = median(value, na.rm = TRUE)) %>%
  data.frame()

# Per-group medians via base aggregate(); na.rm = TRUE is forwarded to median().
# Note: this call leaves na.action at its default, so aggregate() drops every
# row that has an NA in any column before the medians are computed.
test2 <- aggregate(. ~ d3, data = out2, FUN = median, na.rm = TRUE)

# Reshape to long form (one row per d3/variable pair) for plotting
test2 <- gather(test2, variable, value, -d3)

# Faceted bar chart of the tidyr/dplyr medians: one panel per variable,
# bar heights taken directly from `value`.
ggplot(data = test, mapping = aes(x = d3, y = value, group = d3)) +
  geom_bar(stat = "identity") +
  facet_wrap(~variable) +
  labs(title = "Medians via TidyR")

# Faceted bar chart of the aggregate() medians: one panel per variable,
# bar heights taken directly from `value`.
ggplot(data = test2, mapping = aes(x = d3, y = value, group = d3)) +
  geom_bar(stat = "identity") +
  facet_wrap(~variable) +
  labs(title = "Medians via Aggregate")

#Compare Debate, Information and Responsive
山姆·菲克(Sam Firke)

aggregate 产生的结果之所以不同，是因为 aggregate 会丢弃任何含有 NA 值的整行，即使该行中的其他变量包含数据也是如此。

您可以通过为 na.action 参数指定一个值来解决这个问题（参见这个已被采纳的答案）。修改后的代码如下：

# Recompute the per-group medians with aggregate(), this time passing
# na.action = NULL so rows containing an NA are not dropped up front;
# na.rm = TRUE is still forwarded to median(). The result is then reshaped
# to long form (one row per d3/variable pair).
test2 <- aggregate(
  . ~ d3,
  data = out2,
  FUN = median,
  na.rm = TRUE,
  na.action = NULL
) %>%
  gather(variable, value, -d3)

确认结果相同:

# Sort both results the same way and compare them as plain data frames
identical(
  test %>% arrange(d3, variable, value) %>% as.data.frame(),
  test2 %>% arrange(d3, variable, value) %>% as.data.frame()
)
[1] TRUE

本文收集自互联网,转载请注明来源。

如有侵权,请联系 [email protected] 删除。

编辑于
0

我来说两句

0 条评论
登录 后参与评论

相关文章