I want to paste all rows into the same cell column by column
For example, I have a table like as follows:
library(tibble)
tibble::tribble(
~Col1, ~Col2, ~Col3,
"AA", "AA", "AB",
"AB", "AB", "BB",
"BC", "BB", "AA"
)
Col1 Col2 Col3
AA AA AB
AB AB BB
BC BB AA
The output I want is a 3X1 table as follows:
Col1 AAABBC
Col2 AAABBB
Col3 ABBBAA
However, the real case is more complicated because my original table has 600,000 rows and 2000 columns. I'm wondering what's the most rapid way of achieving this. I tried the loop but it took forever to finish pasting rows by columns.
Any help is appreciated, thanks!
If you have enough memory to store multiple instance of your data this approach using doParallel package could work. Here I am using tidyverse family.
library(tidyverse)
library(doParallel)
n <- 1000
# Generate a 1000 rows df with ~3000 columns
big_table <- do.call("rbind", replicate(n, data, simplify = FALSE))
lapply(1:10, function(x) {big_table <<- bind_cols(big_table, big_table); return(x)})
# Get the list of column names
col_list <- names(big_table)
# Define number of cores you want to process
number_of_parallel_cores <- 4
col_group <- split(col_list, sort(rep_len(1:number_of_parallel_cores, length(col_list))))
# Running the code with timer
system.time({
registerDoParallel(number_of_parallel_cores)
combine_data <- bind_rows(foreach(i_col_group = col_group) %dopar% {
big_table %>%
select(one_of(i_col_group)) %>%
summarize(across(.fns = paste, collapse = "")) %>%
pivot_longer(cols = everything(), names_to = "col_names", values_to = "values")
})
})
Timing
user system elapsed
1.291 0.291 0.898
Output
col_names values
<chr> <chr>
1 Col1...1 AAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAA…
2 Col2...2 AAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAA…
3 Col3...3 ABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAAB…
4 Col1...4 AAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAA…
5 Col2...5 AAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAA…
6 Col3...6 ABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAAB…
7 Col1...7 AAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAA…
8 Col2...8 AAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAA…
9 Col3...9 ABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAABBBAAAB…
10 Col1...10 AAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAAABBCAA…
# … with 3,062 more rows
However I found that just simply bind all together is a lot faster comparing with parallel setup. Guess the overhead cost is not really feasible for this operations
system.time(
big_table %>%
select(one_of(col_list)) %>%
summarize(across(.fns = paste, collapse = "")) %>%
pivot_longer(cols = everything(), names_to = "col_names", values_to = "values")
)
user system elapsed
0.021 0.000 0.022
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With