Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

create a dummy variable from a messy data

Tags:

r

dplyr

I have a tibble like this -

# A tibble: 11 × 1
   messycol                   
   <chr>                      
 1 ""                         
 2 ""                         
 3 ""                         
 4 ""                         
 5 ""                         
 6 ""                         
 7 "|15|16|32|33|36|39|40|41|"
 8 "|15|16|32|39|"            
 9 "|10|29|32|38|39|40|"      
10 "|32|39|"                  
11 ""                         

The dput() output is here -

structure(list(messycol = structure(c("", "", "", "", "", "", 
"|15|16|32|33|36|39|40|41|", "|15|16|32|39|", "|10|29|32|38|39|40|", 
"|32|39|", ""), label = "Noteffectiveaccountingrule reason keys", format.sas = "$")), row.names = c(NA, 
-11L), class = c("tbl_df", "tbl", "data.frame"))

I want to create a dummy variable from the messycol like this - if messycol includes either 15 or 16, then 1 otherwise 0. Please note that the blank value in the messycol should be as it is - " "

So, my expected output is like this -

# A tibble: 11 × 2
   messycol                         DUMMY                 
   <chr>                      
 1 ""                               ""                      
 2 ""                               ""                          
 3 ""                               ""                           
 4 ""                               ""                          
 5 ""                               ""                         
 6 ""                               ""                         
 7 "|15|16|32|33|36|39|40|41|"       1
 8 "|15|16|32|39|"                   1     
 9 "|10|29|32|38|39|40|"             0    
10 "|32|39|"                         0              
11 ""                               ""  
like image 395
Sharif Avatar asked Mar 18 '26 09:03

Sharif


2 Answers

df |>
  mutate(dummy = case_when(
    messycol == "" ~ "",
    stringr::str_detect(messycol, "\\|15\\||\\|16\\|") ~ "1",
    TRUE ~ "0"
  ))

   messycol                    dummy
   <chr>                       <chr>
 1 ""                          ""   
 2 ""                          ""   
 3 ""                          ""   
 4 ""                          ""   
 5 ""                          ""   
 6 ""                          ""   
 7 "|15|16|32|33|36|39|40|41|" "1"  
 8 "|15|16|32|39|"             "1"  
 9 "|10|29|32|38|39|169|"      "0"  # edited input to show rejection 
10 "|115|39|"                  "0"  # edited input to show rejection
11 ""                          ""
like image 53
Jon Spring Avatar answered Mar 20 '26 19:03

Jon Spring


One option would be to use string::str_split and a member of the purrr::map family of functions:

library(tidyverse)

dat |>
  mutate(
    dummy = map_chr(
      str_split(messycol, pattern = "\\|"),
      ~ as.character(+any(.x %in% c(15, 16)))
    ),
    dummy = if_else(messycol == "", messycol, dummy)
  )
#> # A tibble: 11 × 2
#>    messycol                    dummy
#>    <chr>                       <chr>
#>  1 ""                          ""   
#>  2 ""                          ""   
#>  3 ""                          ""   
#>  4 ""                          ""   
#>  5 ""                          ""   
#>  6 ""                          ""   
#>  7 "|15|16|32|33|36|39|40|41|" "1"  
#>  8 "|15|16|32|39|"             "1"  
#>  9 "|10|29|32|38|39|40|"       "0"  
#> 10 "|32|39|"                   "0"  
#> 11 ""                          ""

Or using base R you can achieve the same result e.g. like so:

dat |>
  transform(dummy = vapply(
    strsplit(messycol, split = "\\|"),
    FUN = \(x) as.character(+any(x %in% c(15, 16))),
    FUN.VALUE = character(1)
  )) |>
  transform(dummy = ifelse(messycol == "", messycol, dummy))
#>                     messycol dummy
#> 1                                 
#> 2                                 
#> 3                                 
#> 4                                 
#> 5                                 
#> 6                                 
#> 7  |15|16|32|33|36|39|40|41|     1
#> 8              |15|16|32|39|     1
#> 9        |10|29|32|38|39|40|     0
#> 10                   |32|39|     0
#> 11

Finally, another more verbose option using the tidyverse would be to use separate_longer_delim :

dat |>
  mutate(row = row_number()) |>
  separate_longer_delim(messycol, delim = "|") |>
  mutate(dummy = as.character(+any(messycol %in% c(15, 16))), .by = row) |>
  summarise(messycol = paste0(messycol, collapse = "|"), .by = c(row, dummy)) |>
  mutate(dummy = if_else(messycol == "", messycol, dummy))
#> # A tibble: 11 × 3
#>      row dummy messycol                   
#>    <int> <chr> <chr>                      
#>  1     1 ""    ""                         
#>  2     2 ""    ""                         
#>  3     3 ""    ""                         
#>  4     4 ""    ""                         
#>  5     5 ""    ""                         
#>  6     6 ""    ""                         
#>  7     7 "1"   "|15|16|32|33|36|39|40|41|"
#>  8     8 "1"   "|15|16|32|39|"            
#>  9     9 "0"   "|10|29|32|38|39|40|"      
#> 10    10 "0"   "|32|39|"                  
#> 11    11 ""    ""
like image 24
stefan Avatar answered Mar 20 '26 20:03

stefan



Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!