As explained here when the test condition in ifelse(test, yes, no) is NA, the evaluation is also NA. Hence the following returns...
df <- data.frame(a = c(1, 1, NA, NA, NA ,NA),
b = c(NA, NA, 1, 1, NA, NA),
c = c(rep(NA, 4), 1, 1))
ifelse(df$a==1, "a==1",
ifelse(df$b==1, "b==1",
ifelse(df$c==1, "c==1", NA)))
#[1] "a==1" "a==1" NA NA NA NA
... instead of the desired
#[1] "a==1" "a==1" "b==1" "b==1" "c==1" "c==1"
As suggested by Cath, I can circumvent this problem by formally specifying that the test condition should not include NA:
ifelse(df$a==1 & !is.na(df$a), "a==1",
ifelse(df$b==1 & !is.na(df$b), "b==1",
ifelse(df$c==1 & !is.na(df$c), "c==1", NA)))
However, as akrun also noted, this solution becomes rather lengthy with increasing number of columns.
A workaround would be to first replace all NAs with a value not present in the data.frame (e.g, 2 in this case):
df_noNA <- data.frame(a = c(1, 1, 2, 2, 2 ,2),
b = c(2, 2, 1, 1, 2, 2),
c = c(rep(2, 4), 1, 1))
ifelse(df_noNA$a==1, "a==1",
ifelse(df_noNA$b==1, "b==1",
ifelse(df_noNA$c==1, "c==1", NA)))
#[1] "a==1" "a==1" "b==1" "b==1" "c==1" "c==1"
However, I was wondering if there was a more direct way to tell ifelse to ignore NAs? Or is writing a function for & !is.na the most direct way?
ignorena <- function(column) {
column ==1 & !is.na(column)
}
ifelse(ignorena(df$a), "a==1",
ifelse(ignorena(df$b), "b==1",
ifelse(ignorena(df$c), "c==1", NA)))
#[1] "a==1" "a==1" "b==1" "b==1" "c==1" "c==1"
You can use %in% instead of == to sort-of ignore NAs.
ifelse(df$a %in% 1, "a==1",
ifelse(df$b %in% 1, "b==1",
ifelse(df$c %in% 1, "c==1", NA)))
Unfortunately, this does not give any performance gain compared to the original while @arkun's solution is about 3 times faster.
solution_original <- function(){
ifelse(df$a==1 & !is.na(df$a), "a==1",
ifelse(df$b==1 & !is.na(df$b), "b==1",
ifelse(df$c==1 & !is.na(df$c), "c==1", NA)))
}
solution_akrun <- function(){
v1 <- names(df)[max.col(!is.na(df)) * NA^!rowSums(!is.na(df))]
i1 <- !is.na(v1)
v1[i1] <- paste0(v1[i1], "==1")
}
solution_mine <- function(x){
ifelse(df$a %in% 1, "a==1",
ifelse(df$b %in% 1, "b==1",
ifelse(df$c %in% 1, "c==1", NA)))
}
set.seed(1)
df <- data.frame(a = sample(c(1, rep(NA, 4)), 1e6, T),
b = sample(c(1, rep(NA, 4)), 1e6, T),
c = sample(c(1, rep(NA, 4)), 1e6, T))
microbenchmark::microbenchmark(
solution_original(),
solution_akrun(),
solution_mine()
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## solution_original() 701.9413 839.3715 845.0720 853.1960 875.6151 1051.6659 100
## solution_akrun() 217.4129 242.5113 293.2987 253.2144 387.1598 564.3981 100
## solution_mine() 698.7628 845.0822 848.6717 858.7892 877.9676 1006.2872 100
Was inspired by this: R: Dealing with TRUE, FALSE, NA and NaN
Edit
Following the comment by @arkun, I redid the benchmark and revised the statement.
dplyr::case_when is a convenient alternative to cascading ifelse calls:
library(dplyr)
df <- data.frame(a = c(1, 1, NA, NA, NA ,NA),
b = c(NA, NA, 1, 1, NA, NA),
c = c(rep(NA, 4), 1, 1))
df %>% mutate(equals = case_when(a == 1 ~ 'a==1',
b == 1 ~ 'b==1',
c == 1 ~ 'c==1'))
#> a b c equals
#> 1 1 NA NA a==1
#> 2 1 NA NA a==1
#> 3 NA 1 NA b==1
#> 4 NA 1 NA b==1
#> 5 NA NA 1 c==1
#> 6 NA NA 1 c==1
It cascades like ifelse, so if the first condition is true, the first result is returned even if the second and third conditions are true too. If none are true, it returns NA:
set.seed(47)
df <- setNames(as.data.frame(matrix(sample(c(1, NA), 30, replace = TRUE), 10)), letters[1:3])
df %>% mutate(equals = case_when(a == 1 ~ 'a==1',
b == 1 ~ 'b==1',
c == 1 ~ 'c==1'))
#> a b c equals
#> 1 NA 1 1 b==1
#> 2 1 NA NA a==1
#> 3 NA 1 NA b==1
#> 4 NA NA 1 c==1
#> 5 NA NA NA <NA>
#> 6 NA NA 1 c==1
#> 7 1 1 1 a==1
#> 8 1 1 1 a==1
#> 9 NA 1 NA b==1
#> 10 NA 1 NA b==1
Plus it's fast:
set.seed(47)
df <- setNames(as.data.frame(matrix(sample(c(1, NA), 3 * 1e5, replace = TRUE), ncol = 3)), letters[1:3])
microbenchmark::microbenchmark(
original = {
ifelse(df$a == 1 & !is.na(df$a), "a==1",
ifelse(df$b == 1 & !is.na(df$b), "b==1",
ifelse(df$c == 1 & !is.na(df$c), "c==1", NA)))},
akrun = {
v1 <- names(df)[max.col(!is.na(df)) * NA^!rowSums(!is.na(df))]
i1 <- !is.na(v1)
v1[i1] <- paste0(v1[i1], "==1")
},
amatsuo_net = {
ifelse(df$a %in% 1, "a==1",
ifelse(df$b %in% 1, "b==1",
ifelse(df$c %in% 1, "c==1", NA)))
},
alistaire = {
df %>% mutate(equals = case_when(a == 1 ~ 'a==1',
b == 1 ~ 'b==1',
c == 1 ~ 'c==1'))
}
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> original 81.19896 86.11843 110.93882 123.92463 128.58037 171.11026 100
#> akrun 27.50351 30.99127 38.98353 32.67991 34.64947 77.98958 100
#> amatsuo_net 83.75744 88.54095 109.22226 110.40066 129.02168 170.92911 100
#> alistaire 16.57426 18.91951 21.73293 19.29925 24.30350 33.83180 100
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With