I have a toy example of a tibble. What is the most efficient way to sum two consecutive rows of y grouped by x
library(tibble)
l = list(x = c("a", "b", "a", "b", "a", "b"), y = c(1, 4, 3, 3, 7, 0))
df <- as_tibble(l)
df
#> # A tibble: 6 x 2
#>       x     y
#>   <chr> <dbl>
#> 1     a     1
#> 2     b     4
#> 3     a     3
#> 4     b     3
#> 5     a     7
#> 6     b     0
So the output would be something like this
   group   sum  seq
     a      4     1
     a     10     2
     b      7     1
     b      3     2
I'd like to use the tidyverse and possibly roll_sum() from the RcppRoll package and have the code so that a variable length of consecutive rows could be used for real world data in which there would be many groups
TIA
I notice you asked for the most efficient way-- if you are looking at scaling this up to a much larger set, I would strongly recommend data.table.
library(data.table)
library(RcppRoll)
l[, .(sum = RcppRoll::roll_sum(y, n = 2L, fill = NA, align = "left"),
      seq = seq_len(.N)),
  keyby = .(x)][!is.na(sum)]
A rough benchmark comparison of this vs an answer using the tidyverse packages with 100,000 rows and 10,000 groups illustrates the significant difference.
(I used Psidom's answer instead of jazzurro's since jazzuro's did not allow for an arbritary number of rows to be summed.)
library(tibble)
library(dplyr)
library(RcppRoll)
library(stringi) ## Only included for ability to generate random strings
## Generate data with arbitrary number of groups and rows --------------
rowCount   <- 100000
groupCount <- 10000
sumRows    <- 2L
set.seed(1)
l <- tibble(x = sample(stri_rand_strings(groupCount,3),rowCount,rep=TRUE),
            y = sample(0:10,rowCount,rep=TRUE))
## Using dplyr and tibble -----------------------------------------------
ptm <- proc.time() ## Start the clock
dplyr_result <- l %>% 
    group_by(x) %>% 
    do(
        data.frame(
            sum = roll_sum(.$y, sumRows), 
            seq = seq_len(length(.$y) - sumRows + 1)
        )
    )
|========================================================0% ~0 s remaining     
dplyr_time <- proc.time() - ptm ## Stop the clock
## Using data.table instead ----------------------------------------------
library(data.table)
ptm <- proc.time() ## Start the clock
setDT(l) ## Convert l to a data.table
dt_result <- l[,.(sum = RcppRoll::roll_sum(y, n = sumRows, fill = NA, align = "left"),
                  seq = seq_len(.N)),
               keyby = .(x)][!is.na(sum)]
data.table_time <- proc.time() - ptm ## Stop the clock
Results:
> dplyr_time
  user  system elapsed 
  10.28    0.04   10.36 
> data.table_time
   user  system elapsed 
   0.35    0.02    0.36 
> all.equal(dplyr_result,as.tibble(dt_result))
[1] TRUE
One way to do this is use group_by %>% do where you can customize the returned data frame in do:
library(RcppRoll); library(tidyverse)
n = 2
df %>% 
    group_by(x) %>% 
    do(
        data.frame(
            sum = roll_sum(.$y, n), 
            seq = seq_len(length(.$y) - n + 1)
        )
    )
# A tibble: 4 x 3
# Groups:   x [2]
#      x   sum   seq
#  <chr> <dbl> <int>
#1     a     4     1
#2     a    10     2
#3     b     7     1
#4     b     3     2
Edit: Since this is not as efficient, probably due to the data frame construction header and binding data frames on the go, here is an improved version (still somewhat slower than data.table but not as much now):
df %>% 
    group_by(x) %>% 
    summarise(sum = list(roll_sum(y, n)), seq = list(seq_len(n() -n + 1))) %>%
    unnest()
Timing, use @Matt's data and setup:
library(tibble)
library(dplyr)
library(RcppRoll)
library(stringi) ## Only included for ability to generate random strings
## Generate data with arbitrary number of groups and rows --------------
rowCount   <- 100000
groupCount <- 10000
sumRows    <- 2L
set.seed(1)
l <- tibble(x = sample(stri_rand_strings(groupCount,3),rowCount,rep=TRUE),
            y = sample(0:10,rowCount,rep=TRUE))
## Using dplyr and tibble -----------------------------------------------
ptm <- proc.time() ## Start the clock
dplyr_result <- l %>% 
    group_by(x) %>% 
    summarise(sum = list(roll_sum(y, n)), seq = list(seq_len(n() -n + 1))) %>%
    unnest()
dplyr_time <- proc.time() - ptm ## Stop the clock
## Using data.table instead ----------------------------------------------
library(data.table)
ptm <- proc.time() ## Start the clock
setDT(l) ## Convert l to a data.table
dt_result <- l[,.(sum = RcppRoll::roll_sum(y, n = sumRows, fill = NA, align = "left"),
                  seq = seq_len(.N)),
               keyby = .(x)][!is.na(sum)]
data.table_time <- proc.time() - ptm
Result is:
dplyr_time
#   user  system elapsed 
#  0.688   0.003   0.689 
data.table_time
#   user  system elapsed 
#  0.422   0.009   0.430 
Here is one approach for you. Since you want to sum up two consecutive rows, you could use lead() and do the calculation for sum. For seq, I think you can simply take row numbers, seeing your expected outcome. Once you are done with these operations, you arrange your data by x (if necessary, x and seq). Finally, you drop rows with NAs. If necessary, you may want to drop y by writing select(-y) at the end of the code.
group_by(df, x) %>%
mutate(sum = y + lead(y),
       seq = row_number()) %>%
arrange(x) %>%
ungroup %>%
filter(complete.cases(.))
#      x     y   sum   seq
#  <chr> <dbl> <dbl> <int>
#1     a     1     4     1
#2     a     3    10     2
#3     b     4     7     1
#4     b     3     3     2
A solution using tidyverse and zoo. This is similar to Psidom's approach.
library(tidyverse)
library(zoo)
df2 <- df %>%
  group_by(x) %>%
  do(data_frame(x = unique(.$x), 
                sum = rollapplyr(.$y, width = 2, FUN = sum))) %>%
  mutate(seq = 1:n()) %>%
  ungroup()
df2
# A tibble: 4 x 3
      x   sum   seq
  <chr> <dbl> <int>
1     a     4     1
2     a    10     2
3     b     7     1
4     b     3     2
zoo + dplyr
library(zoo)
library(dplyr)
df %>% 
    group_by(x) %>% 
    mutate(sum = c(NA, rollapply(y, width = 2, sum)), 
           seq = row_number() - 1) %>% 
    drop_na()
# A tibble: 4 x 4
# Groups:   x [2]
      x     y   sum   seq
  <chr> <dbl> <dbl> <dbl>
1     a     3     4     1
2     b     3     7     1
3     a     7    10     2
4     b     0     3     2
If the moving  window only equal to 2 using lag
df %>% 
    group_by(x) %>% 
    mutate(sum = y + lag(y), 
    seq = row_number() - 1) %>% 
    drop_na()
# A tibble: 4 x 4
# Groups:   x [2]
      x     y   sum   seq
  <chr> <dbl> <dbl> <dbl>
1     a     3     4     1
2     b     3     7     1
3     a     7    10     2
4     b     0     3     2
EDIT :
n = 3    # your moving window 
df %>% 
    group_by(x) %>% 
    mutate(sum = c(rep(NA, n - 1), rollapply(y, width = n, sum)), 
           seq = row_number() - n + 1) %>% 
    drop_na()
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With