Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

How can I use the tidyverse to insert values that fall between a specified range and don't already exist in the column of interest

I have a large tibble with grouped data. Each group includes 12 rows. The third column of the tibble has to have a values between 1 and 12. Most of the rows have all the rows in column 3 populated, but some do not. For those rows with missing data (NA) I like to use dplyr to insert the correct value. My challenge is that the values in column 3 are not necessarily in numeric order, so I have to find the values that do not already exist in the and then insert accordingly.

I prepared a reprex to demonstrate what I'm trying to describe. In my example group B is missing the last two values in column 3. The only values that don't already exist in column 3 are 4 and 7. Therefore I would like to find a way of determining what values can be inserted and then use mutate to update the NA value to the unused value. This is all part of my effort to meeting the data requirements of mlogit data frames.

library(tidyverse)

df_BEFORE <- tibble( key = c("A","A","A","A","A","A","A","A","A","A","A","A","B","B","B","B","B","B","B","B","B","B","B","B"),y=c(2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2), z=c(1,2,3,4,5,6,7,8,9,10,11,12,2,11,3,6,12,8,9,10,1,5,NA,NA))

print(df_BEFORE, n=24)
#> # A tibble: 24 x 3
#>    key       y     z
#>    <chr> <dbl> <dbl>
#>  1 A         2     1
#>  2 A         2     2
#>  3 A         2     3
#>  4 A         2     4
#>  5 A         2     5
#>  6 A         2     6
#>  7 A         2     7
#>  8 A         2     8
#>  9 A         2     9
#> 10 A         2    10
#> 11 A         2    11
#> 12 A         2    12
#> 13 B         2     2
#> 14 B         2    11
#> 15 B         2     3
#> 16 B         2     6
#> 17 B         2    12
#> 18 B         2     8
#> 19 B         2     9
#> 20 B         2    10
#> 21 B         2     1
#> 22 B         2     5
#> 23 B         2    NA
#> 24 B         2    NA

df_AFTER <- df_BEFORE %>% 
  group_by(key) 


target_df  <- tibble( key = c("A","A","A","A","A","A","A","A","A","A","A","A","B","B","B","B","B","B","B","B","B","B","B","B"),y=c(2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2), z=c(1,2,3,4,5,6,7,8,9,10,11,12,2,11,3,6,12,8,9,10,1,5,4,7))

print(target_df, n=24)
#> # A tibble: 24 x 3
#>    key       y     z
#>    <chr> <dbl> <dbl>
#>  1 A         2     1
#>  2 A         2     2
#>  3 A         2     3
#>  4 A         2     4
#>  5 A         2     5
#>  6 A         2     6
#>  7 A         2     7
#>  8 A         2     8
#>  9 A         2     9
#> 10 A         2    10
#> 11 A         2    11
#> 12 A         2    12
#> 13 B         2     2
#> 14 B         2    11
#> 15 B         2     3
#> 16 B         2     6
#> 17 B         2    12
#> 18 B         2     8
#> 19 B         2     9
#> 20 B         2    10
#> 21 B         2     1
#> 22 B         2     5
#> 23 B         2     4
#> 24 B         2     7

Update: I have a working solution but its ugly purrr might help


library(tidyverse)

df_BEFORE <- tibble( key = c("A","A","A","A","A","A","A","A","A","A","A","A","B","B","B","B","B","B","B","B","B","B","B","B"),y=c(2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2), z=c(1,2,3,4,5,6,7,8,9,10,11,12,2,11,3,6,12,8,9,10,1,5,NA,NA))

print(df_BEFORE, n=24)
#> # A tibble: 24 x 3
#>    key       y     z
#>    <chr> <dbl> <dbl>
#>  1 A         2     1
#>  2 A         2     2
#>  3 A         2     3
#>  4 A         2     4
#>  5 A         2     5
#>  6 A         2     6
#>  7 A         2     7
#>  8 A         2     8
#>  9 A         2     9
#> 10 A         2    10
#> 11 A         2    11
#> 12 A         2    12
#> 13 B         2     2
#> 14 B         2    11
#> 15 B         2     3
#> 16 B         2     6
#> 17 B         2    12
#> 18 B         2     8
#> 19 B         2     9
#> 20 B         2    10
#> 21 B         2     1
#> 22 B         2     5
#> 23 B         2    NA
#> 24 B         2    NA

get_index <- function(c,j){
  case_when(
    is.na(j) ~ 0,
    j == 0 ~ 0,
    j ==1 ~ setdiff(1:12,c)[1],
    j ==2 ~ setdiff(1:12,c)[2],
    j ==3 ~ setdiff(1:12,c)[3],
    j ==4 ~ setdiff(1:12,c)[4],
    j ==5 ~ setdiff(1:12,c)[5],
    j ==6 ~ setdiff(1:12,c)[6],
    j ==7 ~ setdiff(1:12,c)[7],
    j ==8 ~ setdiff(1:12,c)[8],
    j ==9 ~ setdiff(1:12,c)[9],
    j ==10 ~ setdiff(1:12,c)[10],
    j ==11 ~ setdiff(1:12,c)[11]
  )
}


df_BEFORE %>% 
  group_by(key) %>% 
  mutate(seed = 1) %>% 
  mutate(flag = if_else(!is.na(z),0,seed)) %>% 
  mutate(j = flag + lag(flag)) %>%
  mutate(temp = z) %>%
  mutate(new_z = if_else(j==1,setdiff(1:12,temp)[1],
                 if_else(j==2,setdiff(1:12,temp)[2],
                 if_else(j==3,setdiff(1:12,temp)[3],
                 if_else(j==4,setdiff(1:12,temp)[4],
                 if_else(j==2,setdiff(1:12,temp)[5],
                 if_else(j==2,setdiff(1:12,temp)[6],
                 if_else(j==2,setdiff(1:12,temp)[7],
                 if_else(j==2,setdiff(1:12,temp)[8],
                 if_else(j==2,setdiff(1:12,temp)[9],
                 if_else(j==2,setdiff(1:12,temp)[2],0L
                                 ))))))))))) %>%
  mutate(z= if_else(is.na(z),as.numeric(new_z),z)) %>%
  select(-j, -new_z,-temp, -flag, -seed) %>% 
  print(n=24)
#> # A tibble: 24 x 3
#> # Groups:   key [2]
#>    key       y     z
#>    <chr> <dbl> <dbl>
#>  1 A         2     1
#>  2 A         2     2
#>  3 A         2     3
#>  4 A         2     4
#>  5 A         2     5
#>  6 A         2     6
#>  7 A         2     7
#>  8 A         2     8
#>  9 A         2     9
#> 10 A         2    10
#> 11 A         2    11
#> 12 A         2    12
#> 13 B         2     2
#> 14 B         2    11
#> 15 B         2     3
#> 16 B         2     6
#> 17 B         2    12
#> 18 B         2     8
#> 19 B         2     9
#> 20 B         2    10
#> 21 B         2     1
#> 22 B         2     5
#> 23 B         2     4
#> 24 B         2     7

Created on 2020-05-03 by the reprex package (v0.3.0)

like image 355
Mutuelinvestor Avatar asked Jan 18 '26 19:01

Mutuelinvestor


1 Answers

If the data (df0) is that well organised, we can simply do

df0 |>
  dplyr::mutate(z = replace(z, is.na(z), setdiff(seq(12L), z)), .by = key)

giving

# A tibble: 24 × 3
   key       y     z
   <chr> <dbl> <dbl>
 1 A         2     1
 2 A         2     2
 3 A         2     3
 4 A         2     4
 5 A         2     5
 6 A         2     6
 7 A         2     7
 8 A         2     8
 9 A         2     9
10 A         2    10
11 A         2    11
12 A         2    12
13 B         2     2
14 B         2    11
15 B         2     3
16 B         2     6
17 B         2    12
18 B         2     8
19 B         2     9
20 B         2    10
21 B         2     1
22 B         2     5
23 B         2     4
24 B         2     7

Note

df0 = tibble::tibble(key = c("A","A","A","A","A","A","A","A","A","A","A","A","B","B","B","B","B","B","B","B","B","B","B","B"),y=c(2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2), z=c(1,2,3,4,5,6,7,8,9,10,11,12,2,11,3,6,12,8,9,10,1,5,NA,NA))
like image 70
Friede Avatar answered Jan 21 '26 09:01

Friede



Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!