I'm trying to use the new separate_wider_regex() function to separate a string.
In the first example (moldavia_1), every string follows the same pattern, so it is simple to obtain all the columns:
# Example 1: every address contains the "str-la" token before the street name.
moldavia_1 <- tibble(
  adresa = c(
    "1;MD-3101,Balti str-la Botu Pavel 3",
    "3;MD-3102,Balti str-la Muresanu A. 11",
    "17;MD-3102,Balti str-la Sorocii 46",
    "398;MD-3111,Balti str-la Stefan cel Mare 20",
    "1130;MD-3128,Balti str-la Lvovului 2",
    "1252;MD-3128,Balti str-la Lvovului 1",
    "2814;MD-3102,Balti str-la Cahulului 44"
  )
)
Using the separate_wider_regex():
# Split `adresa` into six columns. Named patterns become output columns;
# unnamed patterns (";", ",", "\\s") only match separators and are dropped.
moldavia_1 %>%
  separate_wider_regex(
    cols = adresa,
    patterns = c(
      ids = "^\\d+",        # leading digits
      ";",
      cod_post = ".*",      # everything up to the comma
      ",",
      cod_4 = "\\w+",       # first word after the comma
      "\\s",
      str = "str-la",       # literal token, required in every row
      # Street name: keeps its leading and trailing white space.
      den_str = "\\s[A-Z][a-z]+.*(?<=[a-z]|[A-Z][:punct:])\\s(?=[0-9])",
      nr = "\\d+$"          # trailing digits
    )
  )
# A tibble: 7 × 6
ids cod_post cod_4 str den_str nr
<chr> <chr> <chr> <chr> <chr> <chr>
1 1 MD-3101 Balti str-la " Botu Pavel " 3
2 3 MD-3102 Balti str-la " Muresanu A. " 11
3 17 MD-3102 Balti str-la " Sorocii " 46
4 398 MD-3111 Balti str-la " Stefan cel Mare " 20
5 1130 MD-3128 Balti str-la " Lvovului " 2
6 1252 MD-3128 Balti str-la " Lvovului " 1
7 2814 MD-3102 Balti str-la " Cahulului " 44
In the second example (moldavia_2), when one pattern fails to match (in this case the one for the column "str"), all the subsequent columns fail too.
# Example 2: rows 1, 3, 4 and 7 have no "str-la" token before the street name.
moldavia_2 <- tibble(
  adresa = c(
    "1;MD-3101,Balti Botu Pavel 3",
    "3;MD-3102,Balti str-la Muresanu A. 11",
    "17;MD-3102,Balti Sorocii 46",
    "398;MD-3111,Balti Stefan cel Mare 20",
    "1130;MD-3128,Balti str-la Lvovului 2",
    "1252;MD-3128,Balti str-la Lvovului 1",
    "2814;MD-3102,Balti Cahulului 44"
  )
)
Using the separate_wider_regex():
# Same patterns as for moldavia_1. Because `str` requires the literal
# "str-la", rows without it stop matching at that point.
moldavia_2 %>%
  separate_wider_regex(
    cols = adresa,
    patterns = c(
      ids = "^\\d+",        # leading digits
      ";",
      cod_post = ".*",      # everything up to the comma
      ",",
      cod_4 = "\\w+",       # first word after the comma
      "\\s",
      str = "str-la",       # literal token, absent in some rows
      # Street name: keeps its leading and trailing white space.
      den_str = "\\s[A-Z][a-z]+.*(?<=[a-z]|[A-Z][:punct:])\\s(?=[0-9])",
      nr = "\\d+$"          # trailing digits
    ),
    # Do not error on partial matches; fill the unmatched columns with NA.
    too_few = "align_start"
  )
# A tibble: 7 × 6
ids cod_post cod_4 str den_str nr
<chr> <chr> <chr> <chr> <chr> <chr>
1 1 MD-3101 Balti NA NA NA
2 3 MD-3102 Balti str-la " Muresanu A. " 11
3 17 MD-3102 Balti NA NA NA
4 398 MD-3111 Balti NA NA NA
5 1130 MD-3128 Balti str-la " Lvovului " 2
6 1252 MD-3128 Balti str-la " Lvovului " 1
7 2814 MD-3102 Balti NA NA NA
I'm expecting:
# A tibble: 7 × 6
ids cod_post cod_4 tip_str den_str nr
<chr> <chr> <chr> <chr> <chr> <chr>
1 1 MD-3101 Balti NA Botu Pavel 3
2 3 MD-3102 Balti str-la Muresanu A. 11
3 17 MD-3102 Balti NA Sorocii 46
4 398 MD-3111 Balti NA Stefan cel Mare 20
5 1130 MD-3128 Balti str-la Lvovului 2
6 1252 MD-3128 Balti str-la Lvovului 1
7 2814 MD-3102 Balti NA Cahulului 44
Here is one way to do it: make the "str-la" part optional. We wrap it in a non-capturing group (?:...) followed by ?, so the pattern matches str-la together with its trailing white space (\\s) when it is present, and matches an empty string otherwise.
library(dplyr)
library(tidyr)

# Split `adresa` into six clean columns.
# * Separators (";", ",", white space) are matched by UNNAMED patterns, which
#   separate_wider_regex() drops, so no column keeps a trailing delimiter.
# * `str` is optional: it captures "str-la" when present and "" otherwise.
#   The empty match is converted to NA afterwards.
moldavia_2 %>%
  separate_wider_regex(
    cols = adresa,
    patterns = c(
      ids = "\\d+",
      ";",
      cod_post = "[^,]+",          # everything up to the comma
      ",",
      cod_4 = "\\w+",
      "\\s",
      str = "(?:str-la)?",         # optional street-type token
      "\\s?",                      # white space after the token, if any
      # Street name: ends in a lower-case letter or an initial such as "A."
      den_str = "[A-Z][a-z]+.*(?<=[a-z]|[A-Z][:punct:])",
      "\\s",
      nr = "\\d+$"
    ),
    too_few = "align_start"
  ) %>%
  # An optional group that matched nothing yields "", not NA.
  mutate(str = na_if(str, ""))
#> # A tibble: 7 × 6
#>   ids   cod_post cod_4 str    den_str         nr
#>   <chr> <chr>    <chr> <chr>  <chr>           <chr>
#> 1 1     MD-3101  Balti NA     Botu Pavel      3
#> 2 3     MD-3102  Balti str-la Muresanu A.     11
#> 3 17    MD-3102  Balti NA     Sorocii         46
#> 4 398   MD-3111  Balti NA     Stefan cel Mare 20
#> 5 1130  MD-3128  Balti str-la Lvovului        2
#> 6 1252  MD-3128  Balti str-la Lvovului        1
#> 7 2814  MD-3102  Balti NA     Cahulului       44
Data from OP
# Input data from the question; four rows lack the "str-la" token.
moldavia_2 <- tibble(
  adresa = c(
    "1;MD-3101,Balti Botu Pavel 3",
    "3;MD-3102,Balti str-la Muresanu A. 11",
    "17;MD-3102,Balti Sorocii 46",
    "398;MD-3111,Balti Stefan cel Mare 20",
    "1130;MD-3128,Balti str-la Lvovului 2",
    "1252;MD-3128,Balti str-la Lvovului 1",
    "2814;MD-3102,Balti Cahulului 44"
  )
)
Created on 2023-03-22 with reprex v2.0.2
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With