Skip to article frontmatterSkip to article content

Long Read Inversion Assessment

Here, we visualize the inversions that were identified across all size, depth, and platform treatments for long reads.

library(ggplot2)
library(dplyr)
library(ggpubr)
library(ggpattern)
options(warn = -1, repr.plot.width = 19, repr.plot.height = 12)

Import Data

Identified Inversions

Read in the aggregate inversion discovery manifest

found_inversions <- read.table("longread.inversions.summary", header = T)
nrow(found_inversions)
head(found_inversions)
Loading...

There are approximate duplicates in here, so we need to filter out rows with breakpoints that are within a few bp of each other

Source
remove_nearby_duplicates <- function(df, threshold = 20) {
  n <- nrow(df)
  if (n <= 1) return(df)
  # Sort by start position within each group for efficiency
  df <- df[order(df$start), ]
  keep <- rep(TRUE, n)
  for (i in 1:(n-1)) {
    if (!keep[i]) next
    # Only check rows that could potentially be within threshold
    remaining_idx <- (i+1):n
    start_diffs <- abs(df$start[i] - df$start[remaining_idx])

    # Early exit if start positions are already too far apart (since sorted)
    if (min(start_diffs) > threshold) break

    end_diffs <- abs(df$end[i] - df$end[remaining_idx])

    # Mark rows that are too close
    too_close <- which(start_diffs <= threshold & end_diffs <= threshold)
    if (length(too_close) > 0) {
      keep[remaining_idx[too_close]] <- FALSE
    }
  }

  return(df[keep, ])
}
cleaned_inversions <- found_inversions %>%
  group_by(sample, chrom, size, depth, platform) %>%
  group_modify(~ remove_nearby_duplicates(.x, threshold = 20)) %>%
  ungroup()
nrow(cleaned_inversions)
head(cleaned_inversions, 10)
Loading...

Small sanity check to make sure the deduplication was accurate

filter(found_inversions,   platform == "pacbio", chrom == "2L", size == "small", sample == "sample_01", depth == 5)
filter(cleaned_inversions, platform == "pacbio", chrom == "2L", size == "small",  sample == "sample_01", depth == 5)
Loading...

Simulated Inversions

Read in the groundtruth file. To avoid merging the breakpoints with the samples-inversions manifest each time, this process was done once with the code below and written to the file to be imported.

Source
if(!file.exists("manifest.breakpoints")){
    # Get all the inversion breakpoints
    groundtruth <- read.table("../inversion_simulations.inventory", header = T)
    truthfile <- paste0("../simulated_variants/setup_inversions/small/inv.small.vcf")
    inversion_breakpoints <- read.table(truthfile, header = F)[,c(1,2,8)]
    inversion_breakpoints$V8 <- as.numeric(unlist(lapply(inversion_breakpoints$V8, function(X){gsub(".+END=", "", X)})))
    names(inversion_breakpoints) <- c("contig", "position_start", "position_end")
    inversion_breakpoints$size <- "small"
    inversion_breakpoints <- group_by(inversion_breakpoints, contig) %>% mutate(inversion = 1:n())

    for( i in c("medium", "large","xl") ){
        truthfile <- paste0("../simulated_variants/setup_inversions/", i, "/inv.", i,".vcf")
    
        .inversion_breakpoints <- read.table(truthfile, header = F)[,c(1,2,8)]
        .inversion_breakpoints$V8 <- as.numeric(unlist(lapply(.inversion_breakpoints$V8, function(X){gsub(".+END=", "", X)})))
        names(.inversion_breakpoints) <- c("contig", "position_start", "position_end")
        .inversion_breakpoints$size <- i
        .inversion_breakpoints <- group_by(.inversion_breakpoints, contig) %>% mutate(inversion = 1:n())
        inversion_breakpoints <- rbind(inversion_breakpoints, .inversion_breakpoints)
    }
    # Use a join to add breakpoints to the groundtruth
    master_manifest <- merge(groundtruth, inversion_breakpoints, by.x = c("size", "contig","inversion"), by.y = c("size", "contig","inversion"))

    # write it to a table so we don't have to do this over and over
    write.table(master_manifest[c(4,2,7,8,1,5,6)], file = "manifest.breakpoints", row.names = F, quote = F)
}
groundtruth <- read.table("manifest.breakpoints", header = T)
head(groundtruth)
Loading...

Extend the rows to include depth, platform, and state, defaulting the state to false negatives and true negatives. This table will be filled up according to whether an inversion was found, and if found, if it’s one of the known inversions.

assessment_df <- groundtruth[0,]
assessment_df$assessment <- character()
assessment_df$depth <- double()
assessment_df$platform <- character()
assessment_df$state <- character()
j <- 0
for(i in 1:nrow(groundtruth)){
    .row <- groundtruth[i,]
    if(.row$present){
        .row$assessment <- "false negative"
    } else {
        .row$assessment <- "true negative"
    }
    for(.depth in c(0.5, 5.0, 20.0)){
        for(.platform in c("pacbio", "ontlong", "ontshort")){
            j <- j + 1
            .row$depth <- .depth
            .row$platform <- .platform
            assessment_df[j,] <- .row
        }
    }
}
head(assessment_df, 8)
Loading...

Assess Identified Inversions

The function to do a fuzzy-match to establish if the identified inversions are true/false positives/negatives

cleaned_inversions$assessment <- "false positive"
false_positives <- cleaned_inversions[0,]
for(i in 1:nrow(cleaned_inversions)){
    .row <- cleaned_inversions[i,]
    query <- which(
        assessment_df$sample == .row$sample &
        assessment_df$platform == .row$platform &
        assessment_df$depth == .row$depth &
        assessment_df$contig == .row$chrom &
        assessment_df$size == .row$size &
        assessment_df$position_start - 500 <= .row$start &
        assessment_df$position_end + 500 >= .row$end
    )
    if(length(query) > 0 && assessment_df$present[query]){
        cleaned_inversions$assessment[i] <- "true positive"
        assessment_df$assessment[query] <- "true positive"
    } else {
        print(paste("false positive", query))
        assessment_df$assessment[query] <- "false positive"
        false_positives <- rbind(false_positives, .row)
    }
}

This will have marked the inversions in both cleaned_inversions and assessment_df as either true positive or false positive. Let’s get a simple overview of performance (and a bit of a sanity check because this logic, although clean, took hours to get right).

table(assessment_df$assessment)
false_positives
Loading...

Like the linked-read dataset, these data have no false positives, which is great!

Plotting the Inversions

Source
plot_inversions <- function(data, size_treatment){
    filter(data, size == size_treatment, assessment == "true positive") %>%
    ggplot(
        aes(
            x = sample,
            ymin = position_start,
            ymax = position_end,
            color = platform,
        )
    ) +
        scale_y_continuous(labels = scales::comma) +
        coord_flip() +
        labs(title = "Called Inversions Across the Genome", subtitle = paste0("Size: ", size_treatment), y = "genomic position (bp)") + 
        geom_linerange(position = position_dodge(.7), linewidth = 2) +
        facet_grid(cols = vars(contig), rows = vars(depth)) +
        theme_light() +
        theme(panel.grid.major.y = element_blank(), legend.position = "top")
}

plot_samples_matrix <- function(data, size_treatment){
filter(data, size == size_treatment) %>%
    group_by(size, contig, position_start) %>% mutate(id = cur_group_id()) %>% ungroup() %>%
    ggplot(aes(y = sample, x = id, fill = assessment, pattern = state)) +
        geom_tile_pattern(
            pattern_color = NA,
            color = "white",
            pattern_fill = "black",
            pattern_angle = 45,
            pattern_density = 0.5,
            pattern_spacing = 0.025,
            pattern_key_scale_factor = 1
        ) +
        theme_light() +
        labs(title = "By-Sample Inversion Detection", subtitle = "Inversions detected in individual samples, as a function of zygotic state.", caption = paste0("Size: ", size_treatment)) +
        scale_fill_manual(values = c("false negative" = "#eaf398ff", "true negative" = "grey70", "true positive" = "#90aed8")) +
        scale_pattern_manual(values = c("het" = "circle", "hom" = "none")) +
        scale_x_discrete(name = "Inversion") +
        theme(
            panel.grid.major = element_blank(),
            panel.grid.minor = element_blank()
        ) +
        facet_grid(rows = vars(depth), cols = vars(platform))
}

Small Inversions

plot_inversions(assessment_df, "small")
plot_samples_matrix(assessment_df, "small")
plot without titleplot without title

Medium Inversions

plot_inversions(assessment_df, "medium")
plot_samples_matrix(assessment_df, "medium")
plot without titleplot without title

Large Inversions

plot_inversions(assessment_df, "large")
plot_samples_matrix(assessment_df, "large")
plot without titleplot without title

XL Inversions

plot_inversions(assessment_df, "xl")
plot_samples_matrix(assessment_df, "xl")
plot without titleplot without title