I have a 67MM row data.table with people names and surname separated by spaces. I just need to create a new column for each word.
Here is an small subset of the data:
n <- structure(list(Subscription_Id = c("13.855.231.846.091.000",
"11.156.048.529.090.800", "24.940.584.090.830", "242.753.039.111.124",
"27.843.782.090.830", "13.773.513.145.090.800", "25.691.374.090.830",
"12.236.174.155.090.900", "252.027.904.121.210", "11.136.991.054.110.100"
), Account_Desc = c("AGUAYO CARLA", "LEIVA LILIANA", "FULLANA MARIA LAURA",
"PETREL SERGIO", "IPTICKET SRL", "LEDESMA ORLANDO", "CATTANEO LUIS RAUL",
"CABRAL CARMEN ESTELA", "ITURGOYEN HECTOR", "CASA CASILDO"),
V1 = c("AGUAYO", "LEIVA", "FULLANA", "PETREL", "IPTICKET",
"LEDESMA", "CATTANEO", "CABRAL", "ITURGOYEN", "CASA"), V2 = c("CARLA",
"LILIANA", "MARIA", "SERGIO", "SRL", "ORLANDO", "LUIS", "CARMEN",
"HECTOR", "CASILDO"), V3 = c(NA, NA, "LAURA", NA, NA, NA,
"RAUL", "ESTELA", NA, NA), `NA` = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
)), .Names = c("Subscription_Id", "Account_Desc", "V1", "V2",
"V3", NA), class = c("data.table", "data.frame"), row.names = c(NA,
-10L), .internal.selfref = <pointer: 0x0000000000200788>)
require("data.table")
n <- data.table(n)
Expected Output
# Subscription_Id Account_Desc V1 V2 V3 NA
# 1: 13.855.231.846.091.000 AGUAYO CARLA AGUAYO CARLA NA NA
# 2: 11.156.048.529.090.800 LEIVA LILIANA LEIVA LILIANA NA NA
# 3: 24.940.584.090.830 FULLANA MARIA LAURA FULLANA MARIA LAURA NA
library(stringr)
# This separates the strings, but i loose the Subscription_Id variable.
n[, str_split_fixed(Account_Desc, "[ +]", 4)]
# This doesn't work.
n[, paste0("V",1:4) := str_split_fixed(Account_Desc, "[ +]", 4)]
This works, but i seem to be doing the calculation 3 times. Not sure if its the most effiecient way
cols = paste0("V",1:3)
for(j in 1:3){
set(n,i=NULL,j=cols[j],value = sapply(strsplit(as.character(n$Account_Desc),"[ +]"), "[", j))
}
Let's use big_n to benchmarck
big_n <- data.table(Subscription_Id = rep(n[,Subscription_Id],1e7),
Account_Desc = rep(n[,Account_Desc],1e7)
)
I don't work with datasets anywhere near this scale, so I have no idea if this is going to be of use or not. One thing that comes to mind is to use a matrix
and matrix indexing.
Since I'm impatient, I've only tried it on 1e5 rows on my slow system :-)
big_n <- data.table(Subscription_Id = rep(n[,Subscription_Id],1e5),
Account_Desc = rep(n[,Account_Desc],1e5))
StringMat <- function(input) {
Temp <- strsplit(input, " ", fixed = TRUE)
Lens <- vapply(Temp, length, 1L)
A <- unlist(Temp, use.names = FALSE)
Rows <- rep(sequence(length(Temp)), Lens)
Cols <- sequence(Lens)
m <- matrix(NA, nrow = length(Temp), ncol = max(Lens),
dimnames = list(NULL, paste0("V", sequence(max(Lens)))))
m[cbind(Rows, Cols)] <- A
m
}
system.time(outB1 <- cbind(big_n, StringMat(big_n$Account_Desc)))
# user system elapsed
# 4.524 0.000 4.533
outB1
# Subscription_Id Account_Desc V1 V2 V3
# 1: 13.855.231.846.091.000 AGUAYO CARLA AGUAYO CARLA NA
# 2: 11.156.048.529.090.800 LEIVA LILIANA LEIVA LILIANA NA
# 3: 24.940.584.090.830 FULLANA MARIA LAURA FULLANA MARIA LAURA
# 4: 242.753.039.111.124 PETREL SERGIO PETREL SERGIO NA
# 5: 27.843.782.090.830 IPTICKET SRL IPTICKET SRL NA
# ---
# 999996: 13.773.513.145.090.800 LEDESMA ORLANDO LEDESMA ORLANDO NA
# 999997: 25.691.374.090.830 CATTANEO LUIS RAUL CATTANEO LUIS RAUL
# 999998: 12.236.174.155.090.900 CABRAL CARMEN ESTELA CABRAL CARMEN ESTELA
# 999999: 252.027.904.121.210 ITURGOYEN HECTOR ITURGOYEN HECTOR NA
# 1000000: 11.136.991.054.110.100 CASA CASILDO CASA CASILDO NA
set_method
function and compare timingsset_method <- function(DT){
cols = paste0("V",1:3)
for(j in 1:3){
set(DT,i=NULL,j=cols[j],
value = sapply(strsplit(as.character(DT[, Account_Desc, with = TRUE]),
"[ +]"), "[", j))
}
}
system.time(set_method(big_n))
# user system elapsed
# 25.319 0.022 25.586
str_split_fixed
(ouch!)big_n[, c("V1", "V2", "V3") := NULL]
library(stringr)
system.time(outBrodie <- cbind(big_n, as.data.table(str_split_fixed(
big_n$Account_Desc, "[ +]", 4))))
# user system elapsed
# 204.966 0.514 206.910
EDIT 3: Stealing Arun's blood and sweat:
cbind(n, as.data.table(str_split_fixed(n$Account_Desc, "[ +]", 4)))
This avoids the potentially costly by
and produces the same result (plus the original name column).
EDIT2: as per Arun's comment, maybe:
n.2[, c(paste0("V", 1:4)):=as.list(str_split_fixed(Account_Desc, "[ +]", 4)), by=Subscription_Id]
But you still have the by
. Old way:
n[, as.list(str_split_fixed(Account_Desc, "[ +]", 4)), by=Subscription_Id]
produces:
# Subscription_Id V1 V2 V3 V4
# 1: 13.855.231.846.091.000 AGUAYO CARLA
# 2: 11.156.048.529.090.800 LEIVA LILIANA
# 3: 24.940.584.090.830 FULLANA MARIA LAURA
# 4: 242.753.039.111.124 PETREL SERGIO
# 5: 27.843.782.090.830 IPTICKET SRL
# 6: 13.773.513.145.090.800 LEDESMA ORLANDO
# 7: 25.691.374.090.830 CATTANEO LUIS RAUL
# 8: 12.236.174.155.090.900 CABRAL CARMEN ESTELA
# 9: 252.027.904.121.210 ITURGOYEN HECTOR
# 10: 11.136.991.054.110.100 CASA CASILDO
EDIT: word of warning, some stringr
functions can be slow (not sure if this one is). If this is still slow for your process, you may want to write your own function using strsplit
and something to pad it to the appropriate length.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With