我正在尝试生成一些汇总统计信息,其中一些需要在每个组的子集上生成。data.table很大,有1000万行,但是by
没有列子集的使用速度很快(不到一秒钟)。仅添加一个需要在每个组的子集上计算的附加列,可使运行时间增加12倍。
这是一种更快的方法吗?下面是我的完整代码。
library(data.table)
library(microbenchmark)
N = 10^7
DT = data.table(id1 = sample(1:400, size = N, replace = TRUE),
id2 = sample(1:100, size = N, replace = TRUE),
id3 = sample(1:50, size = N, replace = TRUE),
filter_var = sample(1:10, size = N, replace = TRUE),
x1 = sample(1:1000, size = N, replace = TRUE),
x2 = sample(1:1000, size = N, replace = TRUE),
x3 = sample(1:1000, size = N, replace = TRUE),
x4 = sample(1:1000, size = N, replace = TRUE),
x5 = sample(1:1000, size = N, replace = TRUE) )
setkey(DT, id1,id2,id3)
microbenchmark(
DT[, .(
sum_x1 = sum(x1),
sum_x2 = sum(x2),
sum_x3 = sum(x3),
sum_x4 = sum(x4),
sum_x5 = sum(x5),
avg_x1 = mean(x1),
avg_x2 = mean(x2),
avg_x3 = mean(x3),
avg_x4 = mean(x4),
avg_x5 = mean(x5)
) , by = c('id1','id2','id3')] , unit = 's', times = 10L)
min lq mean median uq max neval
0.942013 0.9566891 1.004134 0.9884895 1.031334 1.165144 10
microbenchmark( DT[, .(
sum_x1 = sum(x1),
sum_x2 = sum(x2),
sum_x3 = sum(x3),
sum_x4 = sum(x4),
sum_x5 = sum(x5),
avg_x1 = mean(x1),
avg_x2 = mean(x2),
avg_x3 = mean(x3),
avg_x4 = mean(x4),
avg_x5 = mean(x5),
sum_x1_F1 = sum(x1[filter_var < 5]) #this line slows everything down
) , by = c('id1','id2','id3')] , unit = 's', times = 10L)
min lq mean median uq max neval
12.24046 12.4123 12.83447 12.72026 13.49059 13.61248 10
GForce使分组操作运行得更快,并且可以在诸如list(x = funx(X), y = funy(Y)), ...)
whereX
和Y
are列名funx
并且funy
属于优化函数集的表达式上工作。
?GForce
。DT[, expr, by=, verbose=TRUE]
。在OP的情况下,即使有,我们也sum_x1_F1 = sum(x1[filter_var < 5])
没有将其包含在GForcesum(v)
中。在这种特殊情况下,我们可以使var v = x1 *条件并求和:
DT[, v := x1*(filter_var < 5)]
system.time( DT[, .(
sum_x1 = sum(x1),
sum_x2 = sum(x2),
sum_x3 = sum(x3),
sum_x4 = sum(x4),
sum_x5 = sum(x5),
avg_x1 = mean(x1),
avg_x2 = mean(x2),
avg_x3 = mean(x3),
avg_x4 = mean(x4),
avg_x5 = mean(x5),
sum_x1_F1 = sum(v)
) , by = c('id1','id2','id3')])
# user system elapsed
# 0.63 0.19 0.81
为了进行比较,请在我的计算机上计时OP的代码:
system.time( DT[, .(
sum_x1 = sum(x1),
sum_x2 = sum(x2),
sum_x3 = sum(x3),
sum_x4 = sum(x4),
sum_x5 = sum(x5),
avg_x1 = mean(x1),
avg_x2 = mean(x2),
avg_x3 = mean(x3),
avg_x4 = mean(x4),
avg_x5 = mean(x5),
sum_x1_F1 = sum(x1[filter_var < 5]) #this line slows everything down
) , by = c('id1','id2','id3')])
# user system elapsed
# 9.00 0.02 9.06
本文收集自互联网,转载请注明来源。
如有侵权,请联系 [email protected] 删除。
我来说两句