R 中是否有与 numpy.bincount 等效（或更快）的方法，可以按多组分箱对值求和？

# R first attempt: the "straightforward" version.
#
# Benchmark set-up: `bins` assigns each of `smplSize` samples to one of five
# integer categories (0-4) in each of `binTypes` columns; `wgts` holds one
# weight per sample.
smplSize <- 1000000
binTypes <- 100
nIter <- 20
set.seed(1)
bins <- matrix(
  floor(runif(smplSize * binTypes, min = 0, max = 5)),
  nrow = smplSize
)
wgts <- runif(smplSize)

tic <- Sys.time()
for (i in seq_len(nIter)) {
  # res[j + 1, k] is the total weight of the samples whose k-th column of
  # `bins` equals j.
  res <- matrix(nrow = 5, ncol = binTypes)
  for (j in 0:4) {
    # `bins == j` is a logical mask; multiplying by `wgts` recycles the
    # weights down each column, so colSums() yields the weighted count.
    res[j + 1, ] <- colSums(wgts * (bins == j))
  }
  # Some process that modifies wgts based on res
}
toc <- Sys.time()
toc - tic # 117 seconds

# Second attempt: store the category locations in separate mask matrices.
# Relies on `bins`, `wgts`, `nIter` and `binTypes` from the first attempt.
tic <- Sys.time()
# Store 5 logical matrices identifying the locations of the integers 0 - 4.
# Built in one pass with lapply() instead of growing a list inside a loop.
binMask <- lapply(0:4, function(k) bins == k)
for (i in seq_len(nIter)) {
  res <- matrix(nrow = 5, ncol = binTypes)
  for (j in 0:4) {
    # Reuse the precomputed mask for category j rather than recomparing.
    res[j + 1, ] <- colSums(wgts * binMask[[j + 1]])
  }
  # Some process that modifies wgts based on res
}
toc <- Sys.time()
toc - tic # 72 seconds
print(object.size(binMask), units = "Gb") # 1.9 Gb

import sys
import timeit

import numpy as np


def column_bincount(bins, wgts, n_levels=5):
    """Weighted count of every level in every column of ``bins``.

    Args:
        bins: 2-D array of non-negative integers, one column per binning.
        wgts: 1-D array holding one weight per row of ``bins``.
        n_levels: minimum number of rows in the result. Passed to
            ``np.bincount`` as ``minlength`` so that a column which happens
            not to contain the highest level still yields a full-length
            vector (otherwise the per-column results could not be stacked).

    Returns:
        Array whose element ``[j, k]`` is the sum of ``wgts`` over the rows
        where ``bins[:, k] == j``.
    """
    return np.apply_along_axis(
        np.bincount, 0, bins, weights=wgts, minlength=n_levels
    )


if __name__ == "__main__":
    smplSize = 1000000
    nBins = 100
    nIter = 20

    wgts = np.random.random_sample(smplSize)
    bins = np.random.randint(0, 5, (smplSize, nBins))

    # np.bincount only accepts 1-D input; calling it on the 2-D `bins`
    # raises ValueError, so the single-call timing uses one column.
    tic = timeit.default_timer()
    res = np.bincount(bins[:, 0], weights=wgts, minlength=5)
    toc = timeit.default_timer()
    print(toc - tic)

    tic = timeit.default_timer()
    for i in range(nIter):
        res = column_bincount(bins, wgts)
    toc = timeit.default_timer()
    print(toc - tic)  # 39 seconds
    print(sys.getsizeof(bins) / (1024 ** 2))  # 381 Mb

1条回答

网友

1楼 · 发布于 2024-05-02 20:39:09

也许可以用 data.table，但我们需要先改变 bins 的结构：

# Reshape `bins` into long format: one row per (sample, column) cell.
# library() stops with an error if data.table is missing, whereas require()
# would only return FALSE and let the script fail later.
library(data.table)
dt <- data.table(
  bins = as.integer(bins), # integer for reduced size
  # as.integer() flattens the matrix column by column, so the row index
  # cycles fastest and the column index changes every nrow(bins) entries.
  row = rep(seq_len(nrow(bins)), times = ncol(bins)),
  col = rep(seq_len(ncol(bins)), each = nrow(bins))
)

剩下的：

# Attach to every long-format row the weight of the sample it came from
# (adds column `wg` to dt by reference).
set(dt, j = "wg", value = wgts[dt[["row"]]])
# Total weight for every (col, bins) combination.
rez <- dt[, list(wg_sum = sum(wg)), by = c("col", "bins")]
rez # your results, only in a different (long) structure
# (I would suggest keeping this structure, if possible)

# If needed, cast to a structure similar to your original results:
# one row per bin value, one column per original column of `bins`.
rezt <- dcast(rez, bins ~ col, value.var = "wg_sum")

但这也许不符合你的需要，因为你提到在循环中还要做其他处理……

仅对 20 次求和进行计时：

# Time only the grouped sum, repeated nIter times.
tic <- Sys.time()
# seq_len() iterates zero times when nIter is 0, unlike 1:nIter.
for (i in seq_len(nIter)) {
  rez <- dt[, .(wg_sum = sum(wg)), by = .(col, bins)]
}
toc <- Sys.time()
toc - tic # 48.8 45.9 45.9 38.9

不如 Python 快，但考虑到我们是在按 100×5 个分组对 100e6 个元素求和，这个耗时是合理的。

# Maybe faster if we first split the huge dt by `bins` into a list of
# data.tables (one list element per bin value):
dtl <- split(dt, by = "bins")
tic <- Sys.time()
# seq_len() iterates zero times when nIter is 0, unlike 1:nIter.
for (i in seq_len(nIter)) {
  # For each bin value, sum the weights within every original column.
  r <- lapply(dtl, function(x) x[, sum(wg), by = col])
}
toc <- Sys.time()
toc - tic # 18.062

但在这种情况下，求和之后需要用不同的方式处理结果……

相关问题更多 >

编程相关推荐

热门问题

热门文章