I have one dataframe called snps of genetic variants:
ID Group
1: 1:12345:A:G 1
2: 1:12346:T:C 1
3: 1:23457:A:G 1
4: 3:1234:A:G 2
5: 3:12345:A:G 2
6: 1:99991:A:T 3
7: 1:99992:C:T 3
8: 1:99993:A:G 3
9: 1:99994:T:C 3
10: 1:99995:A:G 3
11: 4:777:A:G 4
12: 4:7778:T:C 4
13: 4:7774:A:T 4
14: 4:7771:C:G 4
I then have another datafame called ld that measures the correlation of some of these variants with each other:
SNP lead_snp R2
1: 1:12346:T:C 1:12345:A:G 0.6
2: 1:23457:A:G 1:12346:T:C 0.1
3: 3:12345:A:G 3:1234:A:G 0.5
4: 3:1234:A:G 3:12345:A:G 0.5
5: 1:99991:A:T 1:99992:C:T 0.2
6: 1:99991:A:T 1:99993:A:G 0.7
7: 1:99994:T:C 1:99991:A:T 0.1
8: 1:99992:C:T 1:99994:T:C 0.6
9: 4:777:A:G 4:7778:T:C 0.7
10: 4:7774:A:T 4:7771:C:G 0.8
I am looking to re-assign the already existing groups in snps$Group based on whether any snps/variants correlate <0.4.
Essentially I am looking to perform:
ld dataframe (they have no measures of R2) or have any >0.4 R2 with any of the other snps, leave them in that group.Right now my code for this is:
reassign_groups <- function(df, ld, threshold = 0.4) {
df <- df %>% arrange(Group)
new_group_id <- max(df$Group, na.rm = TRUE) + 1
low_r2_snps <- data.table() # Data table to store SNPs with <0.4 R2
for (group_id in unique(df$Group)) {
snps_in_group <- df[df$Group == group_id, ]
n <- nrow(snps_in_group)
for (i in 1:n) {
for (j in (i+1):n) {
snp1 <- snps_in_group$ID[i]
snp2 <- snps_in_group$ID[j]
ld_check <- ld[(lead_snp == snp1 & SNP == snp2) | (lead_snp == snp2 & SNP == snp1)]
if (nrow(ld_check) > 0 && any(ld_check$R2 < threshold)) {
df$Group[df$ID == snp2] <- new_group_id
# Store SNPs with <0.4 R2 in low_r2_snps
low_r2_snps <- rbind(low_r2_snps, data.table(snp1 = snp1, snp2 = snp2, R2 = ld_check$R2))
new_group_id <- new_group_id + 1
}
}
}
}
return(list(updated_df = df, low_r2_snps = low_r2_snps))
}
# Reassign groups based on LD criteria and get SNPs with <0.4 R2
result <- reassign_groups(snps, ld)
new_groups <- result$updated_df
low_r2_snps <- result$low_r2_snps
However, this isn't quite right. For my example it outputs:
ID Group
1: 1:12345:A:G 1
2: 1:12346:T:C 1
3: 1:23457:A:G 4
4: 3:1234:A:G 2
5: 3:12345:A:G 2
6: 1:99991:A:T 3
7: 1:99992:C:T 5
8: 1:99993:A:G 3
9: 1:99994:T:C 6
10: 1:99995:A:G 3
The expected ordering of this data above overall (edit: including a new edge case for group 4) is:
ID Old_group New_group
1: 1:12345:A:G 1 1
2: 1:12346:T:C 1 1
3: 1:23457:A:G 1 5
4: 3:1234:A:G 2 2
5: 3:12345:A:G 2 2
6: 1:99991:A:T 3 3
7: 1:99992:C:T 3 3
8: 1:99993:A:G 3 3
9: 1:99994:T:C 3 3
10: 1:99995:A:G 3 3
11: 4:777:A:G 4 4
12: 4:7778:T:C 4 4
13: 4:7774:A:T 4 4
14: 4:7771:C:G 4 4
Only 1:23457:A:G has been assigned a new group number as it has <0.4 with every other snp in its old group. Some other snps have <0.4 for some snps but >0.4 for others in their group (so they remain in the group).
How can I fix my code to reach this?
Edit: updated example data:
ld <- structure(list(SNP = c("1:12346:T:C", "1:23457:A:G", "3:12345:A:G",
"3:1234:A:G", "1:99991:A:T", "1:99991:A:T", "1:99994:T:C", "1:99992:C:T",
"4:777:A:G", "4:7774:A:T"), lead_snp = c("1:12345:A:G", "1:12346:T:C",
"3:1234:A:G", "3:12345:A:G", "1:99992:C:T", "1:99993:A:G", "1:99991:A:T",
"1:99994:T:C", "4:7778:T:C", "4:7771:C:G"), R2 = c(0.6, 0.1,
0.5, 0.5, 0.2, 0.7, 0.1, 0.6, 0.7, 0.8)), row.names = c(NA, -10L
), class = c("data.table", "data.frame"))
snps <-structure(list(ID = c("1:12345:A:G", "1:12346:T:C", "1:23457:A:G",
"3:1234:A:G", "3:12345:A:G", "1:99991:A:T", "1:99992:C:T", "1:99993:A:G",
"1:99994:T:C", "1:99995:A:G", "4:777:A:G", "4:7778:T:C", "4:7774:A:T",
"4:7771:C:G"), Group = c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L,
3L, 4L, 4L, 4L, 4L)), row.names = c(NA, -14L), class = c("data.table",
"data.frame"))
In this case there is no need for igraph, we just need to get maximum R2 any SNP has within the group with any SNP, if less than 0.4, then set new group.
ldg <- merge(ld, snps[, .(SNP = ID, Group)], all.x = TRUE)
ldg <- unique(rbind(ldg[, .(ID = SNP, R2)],
ldg[, .(ID = lead_snp, R2)]))
res <- merge(snps, ldg, all.x = TRUE)
setorder(res, rn)
res <- res[, .(R2max = max(R2)), by = .(ID, Group, rn)]
#if R2 missing set to 1
res[ is.na(R2max), R2max := 1 ]
# if R2 < 4 set group to NA
res[ R2max < 0.4, Group := NA ]
# get max group ID
gmax <- max(res$Group, na.rm = TRUE)
# set new Group ID for NAs
res[ is.na(Group), Group := gmax + .I ]
res
# ID Group rn R2max
# 1: 1:12345:A:G 1 1 0.6
# 2: 1:12346:T:C 1 2 0.6
# 3: 1:23457:A:G 5 3 0.1 ### <- new group assigned as 5
# 4: 3:1234:A:G 2 4 0.5
# 5: 3:12345:A:G 2 5 0.5
# 6: 1:99991:A:T 3 6 0.7
# 7: 1:99992:C:T 3 7 0.6
# 8: 1:99993:A:G 3 8 0.7
# 9: 1:99994:T:C 3 9 0.6
#10: 1:99995:A:G 3 10 1.0
#11: 4:777:A:G 4 11 0.7
#12: 4:7778:T:C 4 12 0.7
#13: 4:7774:A:T 4 13 0.8
#14: 4:7771:C:G 4 14 0.8
Using igraph membership based on high ld:
#Example data:
ld <- structure(list(SNP = c("1:12346:T:C", "1:23457:A:G", "3:12345:A:G",
"3:1234:A:G", "1:99991:A:T", "1:99991:A:T", "1:99994:T:C", "1:99992:C:T"
), lead_snp = c("1:12345:A:G", "1:12346:T:C", "3:1234:A:G", "3:12345:A:G",
"1:99992:C:T", "1:99993:A:G", "1:99991:A:T", "1:99994:T:C"),
R2 = c(0.6, 0.1, 0.5, 0.5, 0.2, 0.7, 0.1, 0.6)), row.names = c(NA,
-8L), class = c("data.table", "data.frame"))
snps <-structure(list(ID = c("1:12345:A:G", "1:12346:T:C", "1:23457:A:G",
"3:1234:A:G", "3:12345:A:G", "1:99991:A:T", "1:99992:C:T", "1:99993:A:G",
"1:99994:T:C", "1:99995:A:G"), Group = c(1L, 1L, 1L, 2L, 2L,
3L, 3L, 3L, 3L, 3L)), row.names = c(NA, -10L), class = c("data.table",
"data.frame"))
library(igraph)
# get clusters where R2 is more than 0.4
g <- graph_from_data_frame(ld[ R2 > 0.4, ])
plot(g)

gGroup <- setNames(stack(components(g)$membership), c("membership", "ID"))
# add rownumber to reorder after merge
snps[, rn := .I ]
res <- merge(snps, gGroup, all.x = TRUE)
setorder(res, rn)
# if there is no ld, then keep same group
res[ !ID %in% unlist(ld[, 1:2 ]), newGroup := paste0("a_", Group) ]
# no membership means low ld, make new group
res[ is.na(newGroup) & is.na(membership), newGroup := paste0("b_", .I) ]
# update group based on igraph membership
res[ is.na(newGroup) & !is.na(membership),
newGroup := ifelse(Group == membership, paste0("a_", membership),
paste0("c_", membership)) ]
#convert to integer groups
res[, newGroup := as.integer(as.factor(newGroup)) ]
res
# ID Group rn membership newGroup
# 1: 1:12345:A:G 1 1 1 1
# 2: 1:12346:T:C 1 2 1 1
# 3: 1:23457:A:G 1 3 NA 4
# 4: 3:1234:A:G 2 4 2 2
# 5: 3:12345:A:G 2 5 2 2
# 6: 1:99991:A:T 3 6 3 3
# 7: 1:99992:C:T 3 7 4 5
# 8: 1:99993:A:G 3 8 3 3
# 9: 1:99994:T:C 3 9 4 5
# 10: 1:99995:A:G 3 10 NA 3
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With