Assuming we have a set v of even cardinality, e.g., v <- 1:6, and a data.frame df consisting of entries of v, which is defined by an fixed occurrence of each element from v in each column, namely, k, for example
k <- 2
x <- rep(v, each = k)
df <- data.frame(A = x, B = c(tail(x, -(k + 1)), head(x, k + 1)))
shown as
> df
A B
1 1 2
2 1 3
3 2 3
4 2 4
5 3 4
6 3 5
7 4 5
8 4 6
9 5 6
10 5 1
11 6 1
12 6 2
where the occurrences of 1:6 on both columns are 2
> table(df$A)
1 2 3 4 5 6
2 2 2 2 2 2
> table(df$B)
1 2 3 4 5 6
2 2 2 2 2 2
In df, each row represents a "pair", and there are no duplicated "pairs". I am wondering, how to divide the pairs into clusters, such that each cluster is minimal and complete, i.e., each cluster contains all values from v, without any duplicated entries.
Since the cardinality of v is length(v), and the occurrence of each entry in df is actually 2*k, the number of clusters via a "ideal" split of df should be 2*k*length(v)/length(v) == 2*k. In other words, the number of clusters is defined by k only, say, 2*k.
For example, df can be divided into 4 clusters like below, where the "completeness" property can be achieved
[[1]]
A B
1 1 2
5 3 4
9 5 6
[[2]]
A B
2 1 3
7 4 5
12 6 2
[[3]]
A B
3 2 3
8 4 6
10 5 1
[[4]]
A B
4 2 4
6 3 5
11 6 1
Note that, output above is just one of the valid instances, and there should be other candidates for clustering.
One possible solution is using Monte Carlo simulation and keep the valid clustering outcomes, iteratively, if the randomized cluster satisfy all constraints. The code may look like below
out <- c()
repeat {
if (nrow(df) == 0) {
break
}
repeat {
k <- sample.int(nrow(df), length(v) / 2)
if (!length(setdiff(v, unlist(df[k, ])))) {
out <- c(out, list(df[k, ]))
df <- df[-k, ]
break
}
}
}
which sometimes can give the desired output like
> out
[[1]]
A B
6 3 5
11 6 1
4 2 4
[[2]]
A B
2 1 3
7 4 5
12 6 2
[[3]]
A B
8 4 6
3 2 3
10 5 1
[[4]]
A B
1 1 2
9 5 6
5 3 4
However, this approach has a major issue, say, Inefficiency: If the set v has a large cardinality, the space for Monte Carlo simulation exponentially grows, which dramatically slows down the procedure of finding a valid solution.
I am wondering, if there is a stable and more efficient to solve such kind of problem. I think backtracking should work, but I believe there must be other approaches that can make it in a more elegant manner.
Looking forward to diverse and interesting solutions. Appreciated in advance!
I'm not confident that I completely follow what the desired behavior is, so I recommend further testing of this solution. The idea is to:
df, and two vertices are connected if they have no elements in common.library(Rfast) # for `rowTabulate` and `rowMaxs`
library(adagio) # for `setcover`
library(igraph) # for `max_cliques`
f <- function(df) {
v <- unique(unlist(df))
pairs <- combn(nrow(df), 2)
n <- choose(nrow(df), 2)
y <- matrix(match(unlist(df[combn(nrow(df), 2),]), v), 2*n, 2, 1)
y <- rowTabulate(cbind(y[1:n,], y[(n + 1):(2*n),]), length(v))
mode(y) <- "numeric"
g <- graph_from_data_frame(as.data.frame(t(pairs[,rowMaxs(y, TRUE) == 1])),
FALSE)
cl <- lapply(max_cliques(g, length(v)/2), \(x) as.integer(names(x)))
m <- matrix(0L, length(cl), nrow(df))
m[cbind(rep(1:length(cl), each = length(v)/2), unlist(cl))] <- 1L
lapply(cl[setcover(m)$sets], \(x) df[x,])
}
Testing on df from the question:
f(df)
#> [[1]]
#> A B
#> 11 6 1
#> 6 3 5
#> 4 2 4
#>
#> [[2]]
#> A B
#> 2 1 3
#> 12 6 2
#> 7 4 5
#>
#> [[3]]
#> A B
#> 3 2 3
#> 8 4 6
#> 10 5 1
#>
#> [[4]]
#> A B
#> 5 3 4
#> 1 1 2
#> 9 5 6
Here is my data.table/igraph approach
updated answer the partitioning part is heavily inspired by this answer
v <- 1:6
k <- 2
x <- rep(v, each = k)
df <- data.frame(A = x, B = c(tail(x, -(k + 1)), head(x, k + 1)))
library(data.table)
library(igraph)
# set of tiles to work with
n = 3
setDT(df)
# set a rownumber
df[, from := .I]
# set rownumber as key
setkey(df, from)
# get all possible combinations with different A and B values
df[df, to := paste0(
df[!A %in% c(i.A, i.B) & !B %in% c(i.A, i.B), ]$from,
collapse = ";"), by = .EACHI]
# split to rows
nodes <- df[, .(to = unlist(tstrsplit(to, ";"))), by = .(from)]
# build graph
g <- graph_from_data_frame(d = nodes, directed = FALSE, vertices = df$from)
# get closed triangles (here n = 3) / subsets of n vertices, present in a matrix
allCliques <- cliques(g, min = n,max = n)
# create matrix of subgraphs
cliqueMatrix <- matrix(c(rep(paste0("cluster", seq_along(allCliques)),
sapply(allCliques, length)), names(unlist(allCliques))), ncol=2)
# make graph of subgraphs
g2 <- graph_from_edgelist(cliqueMatrix, directed = FALSE)
V(g2)$type <- grepl("cluster", V(g2)$name)
plot(g2)
# get icidence matrix to get what cluster contains which vertices
g2.ind <- t(as_incidence_matrix(g2))
# create an adjacency matrix by multiplcate the incidennce matrix by its transpose
g2.adj <- g2.ind %*% t(g2.ind)
# create a graph from the adjacency matrix
g2.adj.g <- graph_from_adjacency_matrix(g2.adj, mode = "undirected")
#plot(g2.adj.g)
# get the independant vertex sets
g2.adj.mis <- ivs(g2.adj.g)
sets <- lapply(g2.adj.mis, function(x) cliqueMatrix[cliqueMatrix[,1] %in% as_ids(x), 2])
# we want the sets with maximum sets size (12 in this case, so all tiles are used)
lapply(sets[which(sapply(sets, length) == max(sapply(sets, length)))],
matrix, nrow = n)
# [[1]]
# [,1] [,2] [,3] [,4]
# [1,] "5" "3" "2" "1"
# [2,] "10" "7" "4" "6"
# [3,] "12" "11" "9" "8"
#
# [[2]]
# [,1] [,2] [,3] [,4]
# [1,] "4" "3" "2" "1"
# [2,] "6" "8" "7" "5"
# [3,] "11" "10" "12" "9"
lapply(sets[which(sapply(sets, length) == max(sapply(sets, length)))],
function(m) {
lapply(as.data.table(matrix(m, nrow = n)), function(x) df[from %in% as.numeric(x), .(rn = from, A, B)])
})
# [[1]]
# [[1]]$V1
# rn A B
# 1: 5 3 4
# 2: 10 5 1
# 3: 12 6 2
#
# [[1]]$V2
# rn A B
# 1: 3 2 3
# 2: 7 4 5
# 3: 11 6 1
#
# [[1]]$V3
# rn A B
# 1: 2 1 3
# 2: 4 2 4
# 3: 9 5 6
#
# [[1]]$V4
# rn A B
# 1: 1 1 2
# 2: 6 3 5
# 3: 8 4 6
#
#
# [[2]]
# [[2]]$V1
# rn A B
# 1: 4 2 4
# 2: 6 3 5
# 3: 11 6 1
#
# [[2]]$V2
# rn A B
# 1: 3 2 3
# 2: 8 4 6
# 3: 10 5 1
#
# [[2]]$V3
# rn A B
# 1: 2 1 3
# 2: 7 4 5
# 3: 12 6 2
#
# [[2]]$V4
# rn A B
# 1: 1 1 2
# 2: 5 3 4
# 3: 9 5 6
old answer
note: I lack some mathematic skills, so it could be that the solution only works in this specific usecase. so I would appreciate of someone could take a look at that
I believe this is a way to select n rows (here 3) with all different values in A and B (so you end up with 6 diffetent values).
library(data.table)
library(igraph)
# set of tiles to work with
n = 3
setDT(df)
# set a rownumber
df[, from := .I]
# set rownumber as key
setkey(df, from)
# get all possible combinations with different A and B values
df[df, to := paste0(
df[!A %in% c(i.A, i.B) & !B %in% c(i.A, i.B), ]$from,
collapse = ";"), by = .EACHI]
# split to rows
nodes <- df[, .(to = unlist(tstrsplit(to, ";"))), by = .(from)]
# build graph
g <- graph_from_data_frame(d = nodes, directed = FALSE, vertices = df$from)
# get triangles/subsets of 3 vertices, present in a matrix
t(sapply(cliques(g, min = n,max = n), names))
# [,1] [,2] [,3]
# [1,] "5" "10" "12"
# [2,] "4" "6" "11"
# [3,] "3" "7" "11"
# [4,] "3" "8" "10"
# [5,] "2" "4" "9"
# [6,] "2" "7" "12"
# [7,] "1" "5" "9"
# [8,] "1" "6" "8"
or, as a last line
lapply(cliques(g, min = 3,max = 3), function(x) df[as.numeric(x), .(rn = from, A,B)])
which results in
# [[1]]
# rn A B
# 1: 5 3 4
# 2: 10 5 1
# 3: 12 6 2
#
# [[2]]
# rn A B
# 1: 4 2 4
# 2: 6 3 5
# 3: 11 6 1
#
# [[3]]
# rn A B
# 1: 3 2 3
# 2: 7 4 5
# 3: 11 6 1
#
# [[4]]
# rn A B
# 1: 3 2 3
# 2: 8 4 6
# 3: 10 5 1
#
# [[5]]
# rn A B
# 1: 2 1 3
# 2: 4 2 4
# 3: 9 5 6
#
# [[6]]
# rn A B
# 1: 2 1 3
# 2: 7 4 5
# 3: 12 6 2
#
# [[7]]
# rn A B
# 1: 1 1 2
# 2: 5 3 4
# 3: 9 5 6
#
# [[8]]
# rn A B
# 1: 1 1 2
# 2: 6 3 5
# 3: 8 4 6
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With