I am try to split Granges
to specific n
of bins, usually, GenomicRanges::tile
could work for this. However, my Granges
has some gaps, for example:
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("GenomicRanges")
# BiocManager::install("GenomicFeatures")
library(GenomicRanges)
library(GenomicFeatures)
gr <- GRanges(
seqnames = "chr1",
ranges = IRanges(start = c(100, 200, 300), end = c(148, 249, 349)),
strand = "+"
)
GRanges object with 3 ranges and 0 metadata columns:
seqnames ranges strand
<Rle> <IRanges> <Rle>
[1] chr1 100-148 +
[2] chr1 200-249 +
[3] chr1 300-349 +
-------
seqinfo: 1 sequence from an unspecified genome; no seqlengths
expected output:
seqnames ranges strand | bin
<Rle> <IRanges> <Rle> | <integer>
[1] chr1 100-109 + | 1
[2] chr1 110-119 + | 2
[3] chr1 120-129 + | 3
[4] chr1 130-139 + | 4
[5] chr1 140-148 + | 5 ## cross gap
[6] chr1 200-200 + | 5
[7] chr1 201-210 + | 6
...
A diagram for this is:
(The middle
l3
is l2
, Its a typo)
I have a customize function to do this, but It runs very slowly. I'm looking forward to seeing if there are any efficient methods.
tile_granges_with_intron <- function(exons, n) {
exon_lengths <- width(exons)
total_length <- sum(exon_lengths)
bin_size <- total_length / n
bin_starts <- floor(seq(1, total_length, by = bin_size))
bin_ends <- c(bin_starts[-1] - 1, total_length)
cum_lengths <- cumsum(exon_lengths)
transcript_starts <- c(1, cum_lengths[-length(cum_lengths)] + 1)
transcript_ends <- cum_lengths
bins <- GRangesList()
for (i in 1:n) {
bin_start <- bin_starts[i]
bin_end <- bin_ends[i]
overlaps <- which(
bin_start <= transcript_ends & bin_end >= transcript_starts
)
bin_ranges <- GRanges()
for (j in overlaps) {
overlap_start <- max(bin_start, transcript_starts[j])
overlap_end <- min(bin_end, transcript_ends[j])
genomic_start <- start(exons[j]) + (overlap_start - transcript_starts[j])
genomic_end <- start(exons[j]) + (overlap_end - transcript_starts[j])
bin_ranges <- c(bin_ranges, GRanges(
seqnames = seqnames(exons[j]),
ranges = IRanges(start = genomic_start, end = genomic_end),
strand = strand(exons[j])
))
}
mcols(bin_ranges)$bin <- i
bins[[i]] <- bin_ranges
}
names(bins) <- paste0("bin", 1:n)
bins <- unlist(bins, use.names = FALSE)
return(bins)
}
If this cannot be accomplished in Grangs
, can it be accomplished in data.table
?
The column strands
here can be ignored because they are always consistent
# we can convert it to data.table
df <- as.data.frame(gr) %>%
data.table::as.data.table()
df
seqnames start end width strand
<fctr> <int> <int> <int> <fctr>
1: chr1 100 148 50 +
2: chr1 200 249 50 +
3: chr1 300 349 50 +
library(GenomicRanges)
library(GenomicFeatures)
gr <- GRanges(
seqnames = "chr1",
ranges = IRanges(start = c(100, 200, 300), end = c(148, 249, 349)),
strand = "+"
)
tile_granges_vectorized <- function(gr, n_bins) {
starts <- start(gr)
widths <- width(gr)
total_length <- sum(widths)
bin_size <- total_length / n_bins
cum_ends <- cumsum(widths)
cum_starts <- c(1, cum_ends[-length(cum_ends)] + 1)
max_results <- n_bins * length(gr)
result_seqnames <- character(max_results)
result_starts <- integer(max_results)
result_ends <- integer(max_results)
result_strands <- character(max_results)
result_bins <- integer(max_results)
result_count <- 0
for(i in 1:n_bins) {
bin_start_cum <- (i - 1) * bin_size + 1
bin_end_cum <- min(i * bin_size, total_length)
overlaps <- which(bin_start_cum <= cum_ends & bin_end_cum >= cum_starts)
if(length(overlaps) > 0) {
overlap_starts_cum <- pmax(bin_start_cum, cum_starts[overlaps])
overlap_ends_cum <- pmin(bin_end_cum, cum_ends[overlaps])
genomic_starts <- starts[overlaps] + (overlap_starts_cum - cum_starts[overlaps])
genomic_ends <- starts[overlaps] + (overlap_ends_cum - cum_starts[overlaps])
n_overlaps <- length(overlaps)
idx_range <- (result_count + 1):(result_count + n_overlaps)
result_seqnames[idx_range] <- as.character(seqnames(gr)[overlaps])
result_starts[idx_range] <- genomic_starts
result_ends[idx_range] <- genomic_ends
result_strands[idx_range] <- as.character(strand(gr)[overlaps])
result_bins[idx_range] <- i
result_count <- result_count + n_overlaps
}
}
if(result_count > 0) {
idx <- 1:result_count
GRanges(
seqnames = result_seqnames[idx],
ranges = IRanges(start = result_starts[idx], end = result_ends[idx]),
strand = result_strands[idx],
bin = result_bins[idx]
)
} else {
GRanges()
}
}
microbenchmark::microbenchmark(
intron = tile_granges_with_intron(gr, 10),
vectorized = tile_granges_vectorized(gr, 10),
times = 10,
check='identical'
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval cld
#> intron 322.7702 328.6691 341.78269 341.35505 345.6044 377.0622 10 a
#> vectorized 10.0457 10.4513 12.07787 10.97265 13.5882 18.1502 10 b
Created on 2025-06-30 with reprex v2.1.1
Although there is an accepted answer, I post the solution below since it uses a different logic, achieves comparable results in terms of speed, and is more stable (in terms of speed) when the number of bins grows.
library(GenomicRanges)
library(GenomicFeatures)
gr <- GRanges(
seqnames = "chr1",
ranges = IRanges(start = c(100, 200, 300), end = c(148, 249, 349)),
strand = "+"
)
tile_granges_rle <- function(exons, n) {
exon_lengths <- width(exons)
total_length <- sum(exon_lengths)
# get cutting points
bin_size <- total_length / n
bin_cuts <- seq(from = bin_size, to = total_length, by = bin_size)
# assign each gene to a bin
bin_idx <- findInterval(
x = 1:total_length,
vec = bin_cuts,
left.open = T,
checkSorted = F
) + 1
# identify new ranges & get summary using running length encoding
range_idx <- rep(start(exons), times = width(exons))
new_ranges <- paste(range_idx, bin_idx)
rle_ranges <- rle(new_ranges)
id_split <- strsplit(rle_ranges$values, split = " ")
rle_ranges$bins <- as.integer(sapply(id_split, "[", 2))
rle_ranges$values <- as.integer(sapply(id_split, "[", 1))
# find start and end points of each new range (+ strand & seqnames)
for (val in unique(rle_ranges$values)) {
idx_val <- which(rle_ranges$values == val)
range_width <- rle_ranges$lengths[idx_val]
rle_ranges$starts <- c(rle_ranges$starts, cumsum(range_width) - range_width + val)
rle_ranges$ends <- c(rle_ranges$ends, cumsum(range_width) - 1 + val)
strand_val <- as.character(strand(exons[start(exons) == val]))
rle_ranges$strands <- c(rle_ranges$strands,
rep(strand_val, times = length(idx_val)))
seqnames_val <- as.character(seqnames(exons[start(exons) == val]))
rle_ranges$seqnames <- c(rle_ranges$seqnames,
rep(seqnames_val, times = length(idx_val)))
}
res <- GRanges(
seqnames = rle_ranges$seqnames,
ranges = IRanges(start = rle_ranges$starts, end = rle_ranges$ends),
strand = rle_ranges$strands,
bin = rle_ranges$bins
)
return(res)
}
microbenchmark::microbenchmark(
intron = tile_granges_with_intron(gr, 10),
vectorized = tile_granges_vectorized(gr, 10),
rle = tile_granges_rle(gr, 10),
times = 10,
check='identical'
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# intron 337.5580 350.1868 416.33718 362.63930 382.9547 902.6799 10 a
# vectorized 10.2361 10.5405 11.08353 11.01365 11.7656 11.8167 10 b
# rle 8.9927 9.1254 10.29057 10.18255 10.8431 12.2104 10 b
microbenchmark::microbenchmark(
intron = tile_granges_with_intron(gr, 60),
vectorized = tile_granges_vectorized(gr, 60),
rle = tile_granges_rle(gr, 60),
times = 10,
check='identical'
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# intron 1946.3718 1974.2415 1995.67697 1981.59585 1999.4995 2091.2665 10 a
# vectorized 42.6344 43.0850 47.24929 44.35430 54.7038 56.2842 10 b
# rle 9.0754 9.3832 9.78063 9.52365 10.2477 11.2114 10 c
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With