How to re-group rows based on correlation values

Question

I have one dataframe called snps of genetic variants:

             ID Group
 1: 1:12345:A:G     1
 2: 1:12346:T:C     1
 3: 1:23457:A:G     1
 4:  3:1234:A:G     2
 5: 3:12345:A:G     2
 6: 1:99991:A:T     3
 7: 1:99992:C:T     3
 8: 1:99993:A:G     3
 9: 1:99994:T:C     3
10: 1:99995:A:G     3
11:   4:777:A:G     4 
12:  4:7778:T:C     4 
13:  4:7774:A:T     4 
14:  4:7771:C:G     4

I then have another datafame called ld that measures the correlation of some of these variants with each other:

           SNP    lead_snp  R2
1: 1:12346:T:C 1:12345:A:G 0.6
2: 1:23457:A:G 1:12346:T:C 0.1
3: 3:12345:A:G  3:1234:A:G 0.5
4:  3:1234:A:G 3:12345:A:G 0.5
5: 1:99991:A:T 1:99992:C:T 0.2
6: 1:99991:A:T 1:99993:A:G 0.7
7: 1:99994:T:C 1:99991:A:T 0.1
8: 1:99992:C:T 1:99994:T:C 0.6
 9:   4:777:A:G  4:7778:T:C 0.7
10:  4:7774:A:T  4:7771:C:G 0.8

I am looking to re-assign the already existing groups in snps$Group based on whether any snps/variants correlate <0.4.

Essentially I am looking to perform:

Per each already existing group, look for any snps that have only <0.4 correlation with any other snp in that group
Assign that snp into a new group number
If the rest of the snps either aren't present in the ld dataframe (they have no measures of R2) or have any >0.4 R2 with any of the other snps, leave them in that group.

Right now my code for this is:

reassign_groups <- function(df, ld, threshold = 0.4) {
  df <- df %>% arrange(Group)
  new_group_id <- max(df$Group, na.rm = TRUE) + 1
  low_r2_snps <- data.table()  # Data table to store SNPs with <0.4 R2
  
  for (group_id in unique(df$Group)) {
    snps_in_group <- df[df$Group == group_id, ]
    n <- nrow(snps_in_group)
    
    for (i in 1:n) {
      for (j in (i+1):n) {
        snp1 <- snps_in_group$ID[i]
        snp2 <- snps_in_group$ID[j]
        
        ld_check <- ld[(lead_snp == snp1 & SNP == snp2) | (lead_snp == snp2 & SNP == snp1)]
        if (nrow(ld_check) > 0 && any(ld_check$R2 < threshold)) {
          df$Group[df$ID == snp2] <- new_group_id
          
          # Store SNPs with <0.4 R2 in low_r2_snps
          low_r2_snps <- rbind(low_r2_snps, data.table(snp1 = snp1, snp2 = snp2, R2 = ld_check$R2))
          
          new_group_id <- new_group_id + 1
        }
      }
    }
  }
  return(list(updated_df = df, low_r2_snps = low_r2_snps))
}

# Reassign groups based on LD criteria and get SNPs with <0.4 R2
result <- reassign_groups(snps, ld)

new_groups <- result$updated_df
low_r2_snps <- result$low_r2_snps

However, this isn't quite right. For my example it outputs:

             ID Group
 1: 1:12345:A:G     1
 2: 1:12346:T:C     1
 3: 1:23457:A:G     4
 4:  3:1234:A:G     2
 5: 3:12345:A:G     2
 6: 1:99991:A:T     3
 7: 1:99992:C:T     5
 8: 1:99993:A:G     3
 9: 1:99994:T:C     6
10: 1:99995:A:G     3

The expected ordering of this data above overall (edit: including a new edge case for group 4) is:

            ID Old_group New_group
 1: 1:12345:A:G         1         1
 2: 1:12346:T:C         1         1
 3: 1:23457:A:G         1         5
 4:  3:1234:A:G         2         2
 5: 3:12345:A:G         2         2
 6: 1:99991:A:T         3         3
 7: 1:99992:C:T         3         3
 8: 1:99993:A:G         3         3
 9: 1:99994:T:C         3         3
10: 1:99995:A:G         3         3
11:   4:777:A:G         4         4
12:  4:7778:T:C         4         4
13:  4:7774:A:T         4         4
14:  4:7771:C:G         4         4

Only 1:23457:A:G has been assigned a new group number as it has <0.4 with every other snp in its old group. Some other snps have <0.4 for some snps but >0.4 for others in their group (so they remain in the group).

How can I fix my code to reach this?

Edit: updated example data:


ld <- structure(list(SNP = c("1:12346:T:C", "1:23457:A:G", "3:12345:A:G", 
                             "3:1234:A:G", "1:99991:A:T", "1:99991:A:T", "1:99994:T:C", "1:99992:C:T", 
                             "4:777:A:G", "4:7774:A:T"), lead_snp = c("1:12345:A:G", "1:12346:T:C", 
                                                                      "3:1234:A:G", "3:12345:A:G", "1:99992:C:T", "1:99993:A:G", "1:99991:A:T", 
                                                                      "1:99994:T:C", "4:7778:T:C", "4:7771:C:G"), R2 = c(0.6, 0.1, 
                                                                                                                         0.5, 0.5, 0.2, 0.7, 0.1, 0.6, 0.7, 0.8)), row.names = c(NA, -10L
                                                                                                                         ), class = c("data.table", "data.frame"))

snps <-structure(list(ID = c("1:12345:A:G", "1:12346:T:C", "1:23457:A:G", 
                             "3:1234:A:G", "3:12345:A:G", "1:99991:A:T", "1:99992:C:T", "1:99993:A:G", 
                             "1:99994:T:C", "1:99995:A:G", "4:777:A:G", "4:7778:T:C", "4:7774:A:T", 
                             "4:7771:C:G"), Group = c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 
                                                      3L, 4L, 4L, 4L, 4L)), row.names = c(NA, -14L), class = c("data.table", 
                                                                                                               "data.frame"))

zx8754 · Accepted Answer

New answer after requirements change

In this case there is no need for igraph, we just need to get maximum R2 any SNP has within the group with any SNP, if less than 0.4, then set new group.

ldg <- merge(ld, snps[, .(SNP = ID, Group)], all.x = TRUE)
ldg <- unique(rbind(ldg[, .(ID = SNP, R2)],
                    ldg[, .(ID = lead_snp, R2)]))
res <- merge(snps, ldg, all.x = TRUE)
setorder(res, rn)
res <- res[, .(R2max = max(R2)), by = .(ID, Group, rn)]

#if R2 missing set to 1
res[ is.na(R2max), R2max := 1 ]
# if R2 < 4 set group to NA
res[ R2max < 0.4, Group := NA ]
# get max group ID
gmax <- max(res$Group, na.rm = TRUE)
# set new Group ID for NAs
res[ is.na(Group), Group := gmax + .I ]
res
#             ID Group rn R2max
# 1: 1:12345:A:G     1  1   0.6
# 2: 1:12346:T:C     1  2   0.6
# 3: 1:23457:A:G     5  3   0.1 ### <- new group assigned as 5
# 4:  3:1234:A:G     2  4   0.5
# 5: 3:12345:A:G     2  5   0.5
# 6: 1:99991:A:T     3  6   0.7
# 7: 1:99992:C:T     3  7   0.6
# 8: 1:99993:A:G     3  8   0.7
# 9: 1:99994:T:C     3  9   0.6
#10: 1:99995:A:G     3 10   1.0
#11:   4:777:A:G     4 11   0.7
#12:  4:7778:T:C     4 12   0.7
#13:  4:7774:A:T     4 13   0.8
#14:  4:7771:C:G     4 14   0.8

Old answer

Using igraph membership based on high ld:

#Example data:
ld <- structure(list(SNP = c("1:12346:T:C", "1:23457:A:G", "3:12345:A:G", 
                             "3:1234:A:G", "1:99991:A:T", "1:99991:A:T", "1:99994:T:C", "1:99992:C:T"
), lead_snp = c("1:12345:A:G", "1:12346:T:C", "3:1234:A:G", "3:12345:A:G", 
                "1:99992:C:T", "1:99993:A:G", "1:99991:A:T", "1:99994:T:C"), 
R2 = c(0.6, 0.1, 0.5, 0.5, 0.2, 0.7, 0.1, 0.6)), row.names = c(NA, 
                                                               -8L), class = c("data.table", "data.frame"))

snps <-structure(list(ID = c("1:12345:A:G", "1:12346:T:C", "1:23457:A:G", 
                             "3:1234:A:G", "3:12345:A:G", "1:99991:A:T", "1:99992:C:T", "1:99993:A:G", 
                             "1:99994:T:C", "1:99995:A:G"), Group = c(1L, 1L, 1L, 2L, 2L, 
                                                                      3L, 3L, 3L, 3L, 3L)), row.names = c(NA, -10L), class = c("data.table", 
                                                                                                                               "data.frame"))


library(igraph)
# get clusters where R2 is more than 0.4
g <- graph_from_data_frame(ld[ R2 > 0.4, ])
plot(g)

enter image description here

gGroup <- setNames(stack(components(g)$membership), c("membership", "ID"))

# add rownumber to reorder after merge
snps[, rn := .I ]
res <- merge(snps, gGroup, all.x = TRUE)
setorder(res, rn)

# if there is no ld, then keep same group
res[ !ID %in% unlist(ld[, 1:2 ]), newGroup := paste0("a_", Group) ]
# no membership means low ld, make new group
res[ is.na(newGroup) & is.na(membership), newGroup := paste0("b_", .I) ]
# update group based on igraph membership
res[ is.na(newGroup) & !is.na(membership),
     newGroup := ifelse(Group == membership, paste0("a_", membership),
                        paste0("c_", membership)) ]

#convert to integer groups
res[, newGroup := as.integer(as.factor(newGroup)) ]
res
#              ID Group rn membership newGroup
#  1: 1:12345:A:G     1  1          1        1
#  2: 1:12346:T:C     1  2          1        1
#  3: 1:23457:A:G     1  3         NA        4
#  4:  3:1234:A:G     2  4          2        2
#  5: 3:12345:A:G     2  5          2        2
#  6: 1:99991:A:T     3  6          3        3
#  7: 1:99992:C:T     3  7          4        5
#  8: 1:99993:A:G     3  8          3        3
#  9: 1:99994:T:C     3  9          4        5
# 10: 1:99995:A:G     3 10         NA        3

How to re-group rows based on correlation values

Tags:

r

data.table

tidyverse

DN1

1 Answers

New answer after requirements change

Old answer

zx8754

Recent Activity

Donate For Us

How to re-group rows based on correlation values

Tags:

r

data.table

tidyverse

DN1

1 Answers

New answer after requirements change

Old answer

zx8754

Related questions

Recent Activity

Donate For Us