Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

What is causing this difference in performance in indexing between data.table and collapse

Tags:

r

data.table

Can someone explain to me where this difference in performance comes from? From the code below I understand that:

  • indexing is significantly faster in collapse than data.table
  • taking first by index is also way faster

I probably do not understand what both packages are doing under the hood.
I checked that GForce was enabled in data.table.

code:

# Benchmark script: compares data.table (setindex + first row per group via
# .SD[1L]) with collapse (fgroup_by + ffirst) on the same shuffled dataset.
# Comment lines showing "user system elapsed" are the timings recorded by the
# author for the system.time() call directly above them.
suppressMessages(library(collapse))                                        
suppressMessages(library(data.table))                                      
# data.table printing options.
options(datatable.print.class = TRUE)                                      
options(datatable.print.topn = 3L)                                         
                                                                           
# dataset                                                                  
# Fix the RNG seed so the runif() values and the sample() shuffle below are
# reproducible.
set.seed(1L)                                                               
# 500 identifiers "A-1" ... "A-500".
n_rics <- 500L; ric <- sprintf("A-%s",1:n_rics)                            
# 1000 consecutive dates starting the day after 2015-01-01.
n_dates <- 1000L; date <- as.Date("2015-01-01") + 1:n_dates                
n_values <- 4L                                                             
# Cross join of ric x date x 1:n_values (500 * 1000 * 4 rows), then overwrite
# `value` with uniform random numbers and shuffle the row order.
DT <- CJ(ric, date, value = 1:n_values)[, value := runif(.N)][sample(1:.N)]
                                                                           
## indexing                                                                
                                                                           
# data.table: time setting an index on (date, ric). Work on a copy so DT
# itself is left untouched.
DT1 <- copy(DT)                                                            
system.time({                                                              
    setindex(DT1, date, ric)                                               
})                                                                         
#    user  system elapsed                                                  
#   7.367   0.033   0.750                                                  
# NOTE(review): user time is roughly 10x elapsed here, which suggests
# multi-threaded execution; the thread count is not shown in this script.
                                                                           
# collapse: time grouping by (date, ric) with sort = FALSE.
DT2 <- copy(DT)                                                            
system.time({                                                              
    DT2 <- fgroup_by(DT2, date, ric, sort = FALSE)                         
})                                                                         
#    user  system elapsed                                                  
#   0.165   0.004   0.169                                                  
                                                                           
## first by group                                                          
                                                                           
# data.table: the index is set outside the timed region; only the
# first-row-per-group query is timed.
# NOTE(review): whether `by` grouping makes use of the index set here is not
# visible from this script; confirm against the data.table documentation.
DT1 <- copy(DT)                                                            
setindex(DT1, date, ric)                                                   
system.time({                                                              
    f1 <- DT1[, .SD[1L], .(date, ric)]                                     
})                                                                         
#    user  system elapsed                                                  
#  16.118   0.072   1.523                                                  
                                                                           
# collapse: the grouping is computed outside the timed region; only ffirst()
# on the already-grouped object is timed.
DT2 <- copy(DT)                                                            
DT2 <- fgroup_by(DT2, date, ric, sort = FALSE)                             
system.time({                                                              
    f2 <- ffirst(DT2)                                                      
})                                                                         
#   user  system elapsed                                                   
#  0.017   0.000   0.017                                                   
                                                                           
# Check that both approaches produced the same result (recorded output below).
identical(f1, f2)                                                          
# [1] TRUE

                                                             

Using data.table_1.14.2, collapse_1.7.3

like image 599
statquant Avatar asked Oct 21 '25 01:10

statquant


1 Answer

I get more or less the same timings for data.table and collapse when data.table uses 4 threads. Here is your example, rerun:

# Re-run of the question's benchmark as a reprex. Lines starting with "#>"
# are the output recorded for the statement directly above them.
suppressMessages(library(collapse))                                        
suppressMessages(library(data.table))                                      
# data.table printing options.
options(datatable.print.class = TRUE)                                      
options(datatable.print.topn = 3L)   

# what versions?
packageVersion("data.table")
#> [1] '1.14.5'
packageVersion("collapse")
#> [1] '1.8.9'

# how many threads?
# Reports the number of threads data.table is using (4 in this run).
getDTthreads()
#> [1] 4


# dataset                                                                  
# Fix the RNG seed so the runif() values and the sample() shuffle below are
# reproducible.
set.seed(1L)                                                               
# 500 identifiers "A-1" ... "A-500".
n_rics <- 500L; ric <- sprintf("A-%s",1:n_rics)                            
# 1000 consecutive dates starting the day after 2015-01-01.
n_dates <- 1000L; date <- as.Date("2015-01-01") + 1:n_dates                
n_values <- 4L                                                             
# Cross join of ric x date x 1:n_values (500 * 1000 * 4 rows), then overwrite
# `value` with uniform random numbers and shuffle the row order.
DT <- CJ(ric, date, value = 1:n_values)[, value := runif(.N)][sample(1:.N)]

## indexing                                                                

# data.table: time setting an index on (date, ric), on a copy of DT.
DT1 <- copy(DT)                                                            
system.time({                                                              
  setindex(DT1, date, ric)                                               
})                                                                         
#>    user  system elapsed 
#>    0.18    0.00    0.05

# collapse: time grouping by (date, ric) with sort = FALSE.
DT2 <- copy(DT)                                                            
system.time({                                                              
  DT2 <- fgroup_by(DT2, date, ric, sort = FALSE)                         
})                                                                         
#>    user  system elapsed 
#>    0.05    0.03    0.07


## first by group                                                          

# data.table: the index is set outside the timed region; only the
# first-row-per-group query is timed.
DT1 <- copy(DT)                                                            
setindex(DT1, date, ric)                                                   
system.time({                                                              
  f1 <- DT1[, .SD[1L], .(date, ric)]                                     
})                                                                         
#>    user  system elapsed 
#>    0.20    0.03    0.10

# collapse: the grouping is computed outside the timed region; only ffirst()
# on the already-grouped object is timed.
DT2 <- copy(DT)                                                            
DT2 <- fgroup_by(DT2, date, ric, sort = FALSE)                             
system.time({                                                              
  f2 <- ffirst(DT2)                                                      
})                                                                         
#>    user  system elapsed 
#>       0       0       0

# Check that both approaches produced the same result (recorded output below).
identical(f1, f2)
#> [1] TRUE

Created on 2022-11-02 with reprex v2.0.2

like image 97
bretauv Avatar answered Oct 22 '25 14:10

bretauv