Here, we visualize the inversions that were identified across all size, depth, and platform treatments for long reads.
library(ggplot2)
library(dplyr)
library(ggpubr)
library(ggpattern)
options(warn = -1, repr.plot.width = 19, repr.plot.height = 12)
found_inversions <- read.table("longread.inversions.summary", header = T)
nrow(found_inversions)
head(found_inversions)
There are approximate duplicates in here, so we need to filter out rows with breakpoints that are within a few bp of each other
Source
remove_nearby_duplicates <- function(df, threshold = 20) {
n <- nrow(df)
if (n <= 1) return(df)
# Sort by start position within each group for efficiency
df <- df[order(df$start), ]
keep <- rep(TRUE, n)
for (i in 1:(n-1)) {
if (!keep[i]) next
# Only check rows that could potentially be within threshold
remaining_idx <- (i+1):n
start_diffs <- abs(df$start[i] - df$start[remaining_idx])
# Early exit if start positions are already too far apart (since sorted)
if (min(start_diffs) > threshold) break
end_diffs <- abs(df$end[i] - df$end[remaining_idx])
# Mark rows that are too close
too_close <- which(start_diffs <= threshold & end_diffs <= threshold)
if (length(too_close) > 0) {
keep[remaining_idx[too_close]] <- FALSE
}
}
return(df[keep, ])
}
cleaned_inversions <- found_inversions %>%
group_by(sample, chrom, size, depth, platform) %>%
group_modify(~ remove_nearby_duplicates(.x, threshold = 20)) %>%
ungroup()
nrow(cleaned_inversions)
head(cleaned_inversions, 10)
Small sanity check to make sure the deduplication was accurate
filter(found_inversions, platform == "pacbio", chrom == "2L", size == "small", sample == "sample_01", depth == 5)
filter(cleaned_inversions, platform == "pacbio", chrom == "2L", size == "small", sample == "sample_01", depth == 5)
Simulated Inversions¶
Read in the groundtruth file. To avoid merging the breakpoints with the samples-inversions manifest each time, this process was done once with the code below and written to the file to be imported.
Source
if(!file.exists("manifest.breakpoints")){
# Get all the inversion breakpoints
groundtruth <- read.table("../inversion_simulations.inventory", header = T)
truthfile <- paste0("../simulated_variants/setup_inversions/small/inv.small.vcf")
inversion_breakpoints <- read.table(truthfile, header = F)[,c(1,2,8)]
inversion_breakpoints$V8 <- as.numeric(unlist(lapply(inversion_breakpoints$V8, function(X){gsub(".+END=", "", X)})))
names(inversion_breakpoints) <- c("contig", "position_start", "position_end")
inversion_breakpoints$size <- "small"
inversion_breakpoints <- group_by(inversion_breakpoints, contig) %>% mutate(inversion = 1:n())
for( i in c("medium", "large","xl") ){
truthfile <- paste0("../simulated_variants/setup_inversions/", i, "/inv.", i,".vcf")
.inversion_breakpoints <- read.table(truthfile, header = F)[,c(1,2,8)]
.inversion_breakpoints$V8 <- as.numeric(unlist(lapply(.inversion_breakpoints$V8, function(X){gsub(".+END=", "", X)})))
names(.inversion_breakpoints) <- c("contig", "position_start", "position_end")
.inversion_breakpoints$size <- i
.inversion_breakpoints <- group_by(.inversion_breakpoints, contig) %>% mutate(inversion = 1:n())
inversion_breakpoints <- rbind(inversion_breakpoints, .inversion_breakpoints)
}
# Use a join to add breakpoints to the groundtruth
master_manifest <- merge(groundtruth, inversion_breakpoints, by.x = c("size", "contig","inversion"), by.y = c("size", "contig","inversion"))
# write it to a table so we don't have to do this over and over
write.table(master_manifest[c(4,2,7,8,1,5,6)], file = "manifest.breakpoints", row.names = F, quote = F)
}
groundtruth <- read.table("manifest.breakpoints", header = T)
head(groundtruth)
Extend the rows to include depth, platform, and state, defaulting the state to false negatives and true negatives. This table will be filled up according to whether an inversion was found, and if found, if it’s one of the known inversions.
assessment_df <- groundtruth[0,]
assessment_df$assessment <- character()
assessment_df$depth <- double()
assessment_df$platform <- character()
assessment_df$state <- character()
j <- 0
for(i in 1:nrow(groundtruth)){
.row <- groundtruth[i,]
if(.row$present){
.row$assessment <- "false negative"
} else {
.row$assessment <- "true negative"
}
for(.depth in c(0.5, 5.0, 20.0)){
for(.platform in c("pacbio", "ontlong", "ontshort")){
j <- j + 1
.row$depth <- .depth
.row$platform <- .platform
assessment_df[j,] <- .row
}
}
}
head(assessment_df, 8)
Assess Identified Inversions¶
The function to do a fuzzy-match to establish if the identified inversions are true/false positives/negatives
cleaned_inversions$assessment <- "false positive"
false_positives <- cleaned_inversions[0,]
for(i in 1:nrow(cleaned_inversions)){
.row <- cleaned_inversions[i,]
query <- which(
assessment_df$sample == .row$sample &
assessment_df$platform == .row$platform &
assessment_df$depth == .row$depth &
assessment_df$contig == .row$chrom &
assessment_df$size == .row$size &
assessment_df$position_start - 500 <= .row$start &
assessment_df$position_end + 500 >= .row$end
)
if(length(query) > 0 && assessment_df$present[query]){
cleaned_inversions$assessment[i] <- "true positive"
assessment_df$assessment[query] <- "true positive"
} else {
print(paste("false positive", query))
assessment_df$assessment[query] <- "false positive"
false_positives <- rbind(false_positives, .row)
}
}
This will have marked the inversions in both cleaned_inversions
and assessment_df
as either true positive
or false positive
. Let’s get a simple overview of performance (and a bit of a sanity check because this logic, although clean, took hours to get right).
table(assessment_df$assessment)
false_positives
Like the linked-read dataset, these data have no false positives, which is great!
Plotting the Inversions¶
Source
plot_inversions <- function(data, size_treatment){
filter(data, size == size_treatment, assessment == "true positive") %>%
ggplot(
aes(
x = sample,
ymin = position_start,
ymax = position_end,
color = platform,
)
) +
scale_y_continuous(labels = scales::comma) +
coord_flip() +
labs(title = "Called Inversions Across the Genome", subtitle = paste0("Size: ", size_treatment), y = "genomic position (bp)") +
geom_linerange(position = position_dodge(.7), linewidth = 2) +
facet_grid(cols = vars(contig), rows = vars(depth)) +
theme_light() +
theme(panel.grid.major.y = element_blank(), legend.position = "top")
}
plot_samples_matrix <- function(data, size_treatment){
filter(data, size == size_treatment) %>%
group_by(size, contig, position_start) %>% mutate(id = cur_group_id()) %>% ungroup() %>%
ggplot(aes(y = sample, x = id, fill = assessment, pattern = state)) +
geom_tile_pattern(
pattern_color = NA,
color = "white",
pattern_fill = "black",
pattern_angle = 45,
pattern_density = 0.5,
pattern_spacing = 0.025,
pattern_key_scale_factor = 1
) +
theme_light() +
labs(title = "By-Sample Inversion Detection", subtitle = "Inversions detected in individual samples, as a function of zygotic state.", caption = paste0("Size: ", size_treatment)) +
scale_fill_manual(values = c("false negative" = "#eaf398ff", "true negative" = "grey70", "true positive" = "#90aed8")) +
scale_pattern_manual(values = c("het" = "circle", "hom" = "none")) +
scale_x_discrete(name = "Inversion") +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
) +
facet_grid(rows = vars(depth), cols = vars(platform))
}
Small Inversions¶
plot_inversions(assessment_df, "small")
plot_samples_matrix(assessment_df, "small")


Medium Inversions¶
plot_inversions(assessment_df, "medium")
plot_samples_matrix(assessment_df, "medium")


Large Inversions¶
plot_inversions(assessment_df, "large")
plot_samples_matrix(assessment_df, "large")


XL Inversions¶
plot_inversions(assessment_df, "xl")
plot_samples_matrix(assessment_df, "xl")

