Can someone explain where this difference in performance comes from? From the results below, I gather that
I probably do not understand what both packages are doing under the hood.
I checked that GForce was enabled in data.table.
code:
suppressMessages(library(collapse))
suppressMessages(library(data.table))
# data.table printing preferences: show each column's class in printed
# output and limit long tables to the first/last 3 rows.
options(
  datatable.print.class = TRUE,
  datatable.print.topn = 3L
)
# dataset: every (ric, date, value) combination, in random row order
set.seed(1L) # fixed seed so the random values and the shuffle are reproducible

# seq_len() is used instead of 1:n so a zero count yields an empty sequence
# rather than c(1, 0).
n_rics <- 500L
ric <- sprintf("A-%s", seq_len(n_rics))
n_dates <- 1000L
date <- as.Date("2015-01-01") + seq_len(n_dates)
n_values <- 4L

# Cross join the three vectors, overwrite `value` with uniform random draws,
# then permute the rows so the table starts out unsorted.
DT <- CJ(ric, date, value = seq_len(n_values))[
  , value := runif(.N)
][sample(.N)]
## indexing
# data.table side: time setindex() on (date, ric). The copy() is taken outside
# system.time(), so only the setindex() call is measured.
DT1 <- copy(DT)
system.time({
setindex(DT1, date, ric)
})
# user system elapsed
# 7.367 0.033 0.750
# NOTE(review): user time is ~10x elapsed here, which suggests the call ran on
# many threads -- check getDTthreads() before comparing against collapse.
# collapse side: time fgroup_by() on the same two columns with sort = FALSE.
# Again the copy() is outside the timed region.
DT2 <- copy(DT)
system.time({
DT2 <- fgroup_by(DT2, date, ric, sort = FALSE)
})
# user system elapsed
# 0.165 0.004 0.169
## first by group
# data.table side: the index is created before the timer starts; the timed
# expression is the grouped query itself (first row of .SD per date/ric).
DT1 <- copy(DT)
setindex(DT1, date, ric)
system.time({
# NOTE(review): the grouping by .(date, ric) is evaluated inside this timed
# call; whether it can reuse the index set above is not visible here -- confirm.
f1 <- DT1[, .SD[1L], .(date, ric)]
})
# user system elapsed
# 16.118 0.072 1.523
# collapse side: fgroup_by() runs before the timer starts, so the timed
# expression is only ffirst() applied to the already-grouped object.
DT2 <- copy(DT)
DT2 <- fgroup_by(DT2, date, ric, sort = FALSE)
system.time({
f2 <- ffirst(DT2)
})
# user system elapsed
# 0.017 0.000 0.017
# Check that both approaches produce exactly the same result.
identical(f1, f2)
# [1] TRUE
Using data.table_1.14.2, collapse_1.7.3
I get more or less the same timings for data.table and collapse when data.table uses 4 threads. Here is your example, rerun:
suppressMessages(library(collapse))
suppressMessages(library(data.table))
# data.table printing preferences: show each column's class in printed
# output and limit long tables to the first/last 3 rows.
options(
  datatable.print.class = TRUE,
  datatable.print.topn = 3L
)
# Record the environment so the timings below can be interpreted:
# package versions and the number of threads data.table will use.
# what versions?
packageVersion("data.table")
#> [1] '1.14.5'
packageVersion("collapse")
#> [1] '1.8.9'
# how many threads?
getDTthreads()
#> [1] 4
# dataset: every (ric, date, value) combination, in random row order
set.seed(1L) # fixed seed so the random values and the shuffle are reproducible

# seq_len() is used instead of 1:n so a zero count yields an empty sequence
# rather than c(1, 0).
n_rics <- 500L
ric <- sprintf("A-%s", seq_len(n_rics))
n_dates <- 1000L
date <- as.Date("2015-01-01") + seq_len(n_dates)
n_values <- 4L

# Cross join the three vectors, overwrite `value` with uniform random draws,
# then permute the rows so the table starts out unsorted.
DT <- CJ(ric, date, value = seq_len(n_values))[
  , value := runif(.N)
][sample(.N)]
## indexing
# data.table side: time setindex() on (date, ric). The copy() is taken outside
# system.time(), so only the setindex() call is measured.
DT1 <- copy(DT)
system.time({
setindex(DT1, date, ric)
})
#> user system elapsed
#> 0.18 0.00 0.05
# collapse side: time fgroup_by() on the same two columns with sort = FALSE.
# Again the copy() is outside the timed region.
DT2 <- copy(DT)
system.time({
DT2 <- fgroup_by(DT2, date, ric, sort = FALSE)
})
#> user system elapsed
#> 0.05 0.03 0.07
## first by group
# data.table side: the index is created before the timer starts; the timed
# expression is the grouped query itself (first row of .SD per date/ric).
DT1 <- copy(DT)
setindex(DT1, date, ric)
system.time({
# NOTE(review): the grouping by .(date, ric) is evaluated inside this timed
# call; whether it can reuse the index set above is not visible here -- confirm.
f1 <- DT1[, .SD[1L], .(date, ric)]
})
#> user system elapsed
#> 0.20 0.03 0.10
# collapse side: fgroup_by() runs before the timer starts, so the timed
# expression is only ffirst() applied to the already-grouped object.
DT2 <- copy(DT)
DT2 <- fgroup_by(DT2, date, ric, sort = FALSE)
system.time({
f2 <- ffirst(DT2)
})
#> user system elapsed
#> 0 0 0
# Check that both approaches produce exactly the same result.
identical(f1, f2)
#> [1] TRUE
Created on 2022-11-02 with reprex v2.0.2
If you love us, you can donate to us via PayPal or buy me a coffee so we can maintain and grow. Thank you!
Donate Us With