从R中的累积概率质量函数矩阵快速随机采样

克里斯·胡佛

我有一个矩阵(mat_cdf),表示在给定的某一天,居住在普查区 i 的个体移动到普查区 j 的累积概率。给定一个由决定不“待在家里”的个体(agent)组成的向量,我用下面的 GetCTMove 函数从该矩阵中随机抽样,以确定他们将在哪个普查区停留。

# Random generation
# Number of census tracts; also the largest destination index evaluated.
cts <- 500
# One randomly drawn Poisson rate per tract.
i <- rgamma(cts, 50, 1)
# Destination indices 1..cts at which every distribution is evaluated.
prop <- 1:cts

# Matrix where rows correspond to probability mass of column integer:
# mat[r, j] is dpois(j, i[r]).
mat <- do.call(rbind, lapply(i, function(rate) dpois(prop, rate)))

# Convert to cumulative probability mass.
# cumsum() builds each row's running total in a single pass instead of
# re-summing mat[r, 1:j] for every j. No loop variable named `i` is used, so
# the rate vector `i` is still intact after this step.
mat_cdf <- t(apply(mat, 1, cumsum))

# Sample one destination column per agent by inverting a row of a CDF matrix.
#
# agent_cts:  row indices into ct_mat_cdf, one per agent.
# ct_mat_cdf: matrix whose row r holds a cumulative distribution over the
#             columns (values non-decreasing along the row).
# Returns an integer vector, one column index per agent.
GetCTMove <- function(agent_cts, ct_mat_cdf) {
  # Expand such that every agent has its own row corresponding to the CDF of
  # movement from their home ct i to j. drop = FALSE keeps a matrix even for a
  # single agent; without it the row collapses to a plain vector and max.col()
  # treats it as a one-column matrix.
  mat_expand <- ct_mat_cdf[agent_cts, , drop = FALSE]

  # One uniform draw per agent; in the comparison below `s` is recycled down
  # the columns, so draw k is compared against every entry of row k.
  s <- runif(length(agent_cts))

  # For a logical row the first maximum is the first TRUE, i.e. the first
  # column whose cumulative probability exceeds the draw.
  fin_col <- max.col(s < mat_expand, ties.method = "first")

  # A row with no TRUE at all (the draw is at or above the final CDF value,
  # which happens when the CDF stops short of 1) is an all-way tie, so
  # max.col() reports column 1. Send those draws to the last column instead,
  # treating it as the catch-all for the remaining probability mass.
  n_col <- ncol(mat_expand)
  fin_col[s >= mat_expand[, n_col]] <- n_col

  fin_col
}

# Sample of 500,000 agents' residence ct (drawn with replacement, so many
# agents share the same tract)
agents <- sample(1:cts, size = 500000, replace = TRUE)

# Run function and time one full sampling pass over all agents
system.time(GetCTMove(agents, mat_cdf))
 user  system elapsed 
   3.09    1.19    4.30 

当个体数量为一百万时,每次抽样大约需要 10 秒;再乘以许多时间步,每次模拟就要花费数小时,而这个函数目前是整个模型中最主要的性能瓶颈。我想知道是否有人能就如何更快地实现这种随机抽样提供建议。我已经用 dqrng 包加快了随机数的生成,但与耗时最长的矩阵扩展(mat_expand)和 max.col 调用相比,这部分的改进微不足道。

ekoam

您可以优化的第一件事是以下代码:

max.col(s < mat_expand, "first")

由于 s < mat_expand 返回的是一个逻辑矩阵,对它应用 max.col 函数就等同于取每一行中第一个 TRUE 的位置。在这种情况下,使用 which 会更高效。另外,如下所示,您把所有 CDF 都存储在一个矩阵中。

# Probability mass matrix: one row per element of `i`, one column per `prop`
mat <- do.call(rbind, lapply(i, function(rate) dpois(prop, rate)))
# Row-wise running totals turn the masses into CDFs. Using cumsum() avoids a
# loop variable named `i`, so the rate vector `i` stays available afterwards.
mat_cdf <- t(apply(mat, 1, cumsum))

这种结构可能不是最优的。list 结构更适合应用 which 这类函数;而且由于不必经过 do.call(rbind, ...),运行速度也更快。

# Keep the CDFs in a list: one cumulative-sum vector per entry of `i`,
# evaluated at the indices in `prop`
ls_cdf <- lapply(i, function(rate) {
  cumsum(dpois(prop, rate))
})

以下是您的实现:

# Implementation 1
# Sample one destination column per agent by inverting a row of a CDF matrix.
#
# agent_cts:  row indices into ct_mat_cdf, one per agent.
# ct_mat_cdf: matrix whose row r holds a cumulative distribution over the
#             columns (values non-decreasing along the row).
# Returns an integer vector, one column index per agent.
GetCTMove <- function(agent_cts, ct_mat_cdf) {
  # drop = FALSE keeps a matrix even for a single agent; without it the row
  # collapses to a plain vector and max.col() treats it as a one-column matrix.
  mat_expand <- ct_mat_cdf[agent_cts, , drop = FALSE]
  # One uniform draw per agent; recycled down the columns in the comparison,
  # so draw k is compared against every entry of row k.
  s <- runif(length(agent_cts))
  # For a logical row the first maximum is the first TRUE, i.e. the first
  # column whose cumulative probability exceeds the draw.
  fin_col <- max.col(s < mat_expand, ties.method = "first")
  # A row with no TRUE (draw at or above the final CDF value) is an all-way
  # tie that max.col() resolves to column 1; send those draws to the last
  # column instead, treating it as the catch-all for the remaining mass.
  n_col <- ncol(mat_expand)
  fin_col[s >= mat_expand[, n_col]] <- n_col
  fin_col
}

在我的台式机上,运行大约需要 2.68 秒。

> system.time(GetCTMove(agents, mat_cdf))
   user  system elapsed 
   2.25    0.41    2.68 

使用 list 结构和 which 函数后,运行时间可以减少约 1 秒。

# Implementation 2
# Sample one destination index per agent from a list of CDF vectors.
#
# agent_cts: positions into ls_cdf, one per agent.
# ls_cdf:    list of numeric vectors, each a cumulative distribution
#            (non-decreasing values).
# Returns an integer vector, one index per agent.
GetCTMove2 <- function(agent_cts, ls_cdf) {
  n <- length(agent_cts)
  # One uniform draw per agent.
  s <- runif(n)
  out <- integer(n)
  # seq_len() yields an empty sequence for n == 0, so no guard is needed.
  for (k in seq_len(n)) {
    cdf <- ls_cdf[[agent_cts[[k]]]]
    # Positions whose cumulative probability exceeds this agent's draw.
    hits <- which(s[[k]] < cdf)
    # The first hit is the sampled index. When nothing exceeds the draw (the
    # CDF stops short of 1) fall back to the last position rather than failing
    # on an empty subscript.
    out[[k]] <- if (length(hits) > 0L) hits[[1L]] else length(cdf)
  }
  out
}

> system.time(GetCTMove2(agents, ls_cdf))
   user  system elapsed 
   1.59    0.02    1.64 

据我所知,仅使用 R 已经没有其他办法可以进一步加快这段代码了。不过,您可以通过用 C++ 重写关键函数 GetCTMove 来提高性能。借助 Rcpp 包,可以这样做:

# Implementation 3
# Compiled sampler. For every agent it scans that agent's CDF vector from the
# front and records the 1-based position of the first entry that exceeds the
# agent's draw.
#   agents: 1-based positions into `cdfs`, one per agent
#   s:      one uniform draw per agent (same length as `agents`)
#   cdfs:   list of numeric CDF vectors
# Returns a numeric vector of 1-based positions, one per agent.
# NOTE: the C++ source sits inside a single-quoted R string, so it must not
# contain single quotes.
Rcpp::cppFunction('NumericVector fast_GetCTMove(NumericVector agents, NumericVector s, List cdfs) {
  int n = agents.size();
  if (s.size() != n) {
    stop("`s` must contain exactly one draw per agent");
  }
  int n_cdf = cdfs.size();
  NumericVector out(n);
  for (int i = 0; i < n; ++i) {
    // Validate the 1-based tract index before using it as a 0-based offset;
    // the negated comparison also rejects NA/NaN.
    double home = agents[i];
    if (!(home >= 1 && home <= n_cdf)) {
      stop("agent tract index is outside the range of `cdfs`");
    }
    NumericVector cdf = as<NumericVector>(cdfs[static_cast<int>(home) - 1]);
    int m = cdf.size();
    // Start from the last position so that a draw which no CDF entry exceeds
    // still yields a valid index instead of the zero-initialised 0.
    out[i] = m;
    for (int j = 0; j < m; ++j) {
      if (s[i] < cdf[j]) {
        out[i] = j + 1;  // 1-based index for R
        break;
      }
    }
  }
  return out;
}')
# R-facing wrapper for the compiled sampler: draws one uniform number per
# agent, then passes the agents, the draws and the CDF list to
# fast_GetCTMove() and returns its result.
GetCTMove3 <- function(agent_cts, ls_cdf) {
  n_agents <- length(agent_cts)
  draws <- runif(n_agents)
  fast_GetCTMove(agent_cts, draws, ls_cdf)
}

此实现快如闪电,应该可以满足您的所有需求。

> system.time(GetCTMove3(agents, ls_cdf))
   user  system elapsed 
   0.07    0.00    0.06 

完整脚本如下所示:

# Random generation
# Number of census tracts; also the largest destination index evaluated.
cts <- 500
# One randomly drawn Poisson rate per tract.
i <- rgamma(cts, 50, 1)
# Destination indices 1..cts at which every distribution is evaluated.
prop <- 1:cts
# Tract index of each of the 500,000 agents, drawn with replacement.
agents <- sample(1:cts, size = 500000, replace = TRUE)

# using a list structure to speed up the creation of cdfs
ls_cdf <- lapply(i, function(x) cumsum(dpois(prop, x)))
# below is your code
mat <- do.call(rbind, lapply(i, function(rate) dpois(prop, rate)))
# Row-wise cumsum() gives the same CDF matrix in one pass per row and does not
# need a loop variable named `i`, so the rate vector `i` is left intact.
mat_cdf <- t(apply(mat, 1, cumsum))

# Implementation 1
# Sample one destination column per agent by inverting a row of a CDF matrix.
#
# agent_cts:  row indices into ct_mat_cdf, one per agent.
# ct_mat_cdf: matrix whose row r holds a cumulative distribution over the
#             columns (values non-decreasing along the row).
# Returns an integer vector, one column index per agent.
GetCTMove <- function(agent_cts, ct_mat_cdf) {
  # drop = FALSE keeps a matrix even for a single agent; without it the row
  # collapses to a plain vector and max.col() treats it as a one-column matrix.
  mat_expand <- ct_mat_cdf[agent_cts, , drop = FALSE]
  # One uniform draw per agent; recycled down the columns in the comparison,
  # so draw k is compared against every entry of row k.
  s <- runif(length(agent_cts))
  # For a logical row the first maximum is the first TRUE, i.e. the first
  # column whose cumulative probability exceeds the draw.
  fin_col <- max.col(s < mat_expand, ties.method = "first")
  # A row with no TRUE (draw at or above the final CDF value) is an all-way
  # tie that max.col() resolves to column 1; send those draws to the last
  # column instead, treating it as the catch-all for the remaining mass.
  n_col <- ncol(mat_expand)
  fin_col[s >= mat_expand[, n_col]] <- n_col
  fin_col
}


# Implementation 2
# Sample one destination index per agent from a list of CDF vectors.
#
# agent_cts: positions into ls_cdf, one per agent.
# ls_cdf:    list of numeric vectors, each a cumulative distribution
#            (non-decreasing values).
# Returns an integer vector, one index per agent.
GetCTMove2 <- function(agent_cts, ls_cdf) {
  n <- length(agent_cts)
  # One uniform draw per agent.
  s <- runif(n)
  out <- integer(n)
  # seq_len() yields an empty sequence for n == 0, so no guard is needed.
  for (k in seq_len(n)) {
    cdf <- ls_cdf[[agent_cts[[k]]]]
    # Positions whose cumulative probability exceeds this agent's draw.
    hits <- which(s[[k]] < cdf)
    # The first hit is the sampled index. When nothing exceeds the draw (the
    # CDF stops short of 1) fall back to the last position rather than failing
    # on an empty subscript.
    out[[k]] <- if (length(hits) > 0L) hits[[1L]] else length(cdf)
  }
  out
}


# Implementation 3
# Compiled sampler. For every agent it scans that agent's CDF vector from the
# front and records the 1-based position of the first entry that exceeds the
# agent's draw.
#   agents: 1-based positions into `cdfs`, one per agent
#   s:      one uniform draw per agent (same length as `agents`)
#   cdfs:   list of numeric CDF vectors
# Returns a numeric vector of 1-based positions, one per agent.
# NOTE: the C++ source sits inside a single-quoted R string, so it must not
# contain single quotes.
Rcpp::cppFunction('NumericVector fast_GetCTMove(NumericVector agents, NumericVector s, List cdfs) {
  int n = agents.size();
  if (s.size() != n) {
    stop("`s` must contain exactly one draw per agent");
  }
  int n_cdf = cdfs.size();
  NumericVector out(n);
  for (int i = 0; i < n; ++i) {
    // Validate the 1-based tract index before using it as a 0-based offset;
    // the negated comparison also rejects NA/NaN.
    double home = agents[i];
    if (!(home >= 1 && home <= n_cdf)) {
      stop("agent tract index is outside the range of `cdfs`");
    }
    NumericVector cdf = as<NumericVector>(cdfs[static_cast<int>(home) - 1]);
    int m = cdf.size();
    // Start from the last position so that a draw which no CDF entry exceeds
    // still yields a valid index instead of the zero-initialised 0.
    out[i] = m;
    for (int j = 0; j < m; ++j) {
      if (s[i] < cdf[j]) {
        out[i] = j + 1;  // 1-based index for R
        break;
      }
    }
  }
  return out;
}')
# R-facing wrapper for the compiled sampler: draws one uniform number per
# agent, then passes the agents, the draws and the CDF list to
# fast_GetCTMove() and returns its result.
GetCTMove3 <- function(agent_cts, ls_cdf) {
  n_agents <- length(agent_cts)
  draws <- runif(n_agents)
  fast_GetCTMove(agent_cts, draws, ls_cdf)
}


# Time one sampling pass over `agents` with each implementation in turn.
# Implementation 1: matrix of CDFs + max.col()
system.time(GetCTMove(agents, mat_cdf))
# Implementation 2: list of CDFs + which()
system.time(GetCTMove2(agents, ls_cdf))
# Implementation 3: list of CDFs + compiled fast_GetCTMove()
system.time(GetCTMove3(agents, ls_cdf))

本文收集自互联网,转载请注明来源。

如有侵权,请联系 [email protected] 删除。

编辑于
0

我来说两句

0 条评论
登录 后参与评论

相关文章