Skip to article frontmatterSkip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

Long-Read (Sniffles)

Cornell University

Here, we visualize the inversions that were identified across all size, depth, and platform treatments for long reads.

library(ggplot2)
library(dplyr)
library(tidyr)
library(ggpubr)
options(warn = -1, repr.plot.width = 19, repr.plot.height = 12)

Import Data

Identified Inversions

Read in the aggregate inversion discovery manifest

found_inversions <- read.table("longread.inversions.summary", header = T)
nrow(found_inversions)
head(found_inversions)
Loading...
Loading...

Simulated Inversions

Read in the groundtruth file. To avoid merging the breakpoints with the samples-inversions manifest each time, this process was done once with the code below and written to the file to be imported.

Source
if(!file.exists("manifest.breakpoints")){
    # Get all the inversion breakpoints
    groundtruth <- read.table("../inversion_simulations.inventory", header = T)
    truthfile <- paste0("../simulated_variants/setup_inversions/small/inv.small.vcf")
    inversion_breakpoints <- read.table(truthfile, header = F)[,c(1,2,8)]
    inversion_breakpoints$V8 <- as.numeric(unlist(lapply(inversion_breakpoints$V8, function(X){gsub(".+END=", "", X)})))
    names(inversion_breakpoints) <- c("contig", "position_start", "position_end")
    inversion_breakpoints$size <- "small"
    inversion_breakpoints <- group_by(inversion_breakpoints, contig) %>% mutate(inversion = 1:n())

    for( i in c("medium", "large","xl") ){
        truthfile <- paste0("../simulated_variants/setup_inversions/", i, "/inv.", i,".vcf")
    
        .inversion_breakpoints <- read.table(truthfile, header = F)[,c(1,2,8)]
        .inversion_breakpoints$V8 <- as.numeric(unlist(lapply(.inversion_breakpoints$V8, function(X){gsub(".+END=", "", X)})))
        names(.inversion_breakpoints) <- c("contig", "position_start", "position_end")
        .inversion_breakpoints$size <- i
        .inversion_breakpoints <- group_by(.inversion_breakpoints, contig) %>% mutate(inversion = 1:n())
        inversion_breakpoints <- rbind(inversion_breakpoints, .inversion_breakpoints)
    }
    # Use a join to add breakpoints to the groundtruth
    master_manifest <- merge(groundtruth, inversion_breakpoints, by.x = c("size", "contig","inversion"), by.y = c("size", "contig","inversion"))
    master_manifest$state <- ifelse(master_manifest$state == "het", "heterozygous", "homozygous")
    # write it to a table so we don't have to do this over and over
    write.table(master_manifest[c(4,2,7,8,1,5,6)], file = "manifest.breakpoints", row.names = F, quote = F)
}

groundtruth <- read.table("manifest.breakpoints", header = T)
head(groundtruth)
Loading...

Extend the rows to include depth, platform, and state, defaulting the state to false negatives and true negatives. This table will be filled up according to whether an inversion was found, and if found, if it’s one of the known inversions.

assessment_df <- groundtruth[0,]
assessment_df$assessment <- character()
assessment_df$depth <- double()
assessment_df$platform <- character()
assessment_df$state <- character()
j <- 0
for(i in 1:nrow(groundtruth)){
    .row <- groundtruth[i,]
    if(.row$present){
        .row$assessment <- "false negative"
    } else {
        .row$assessment <- "true negative"
    }
    for(.depth in c(0.5, 2.0, 5.0, 10.0, 20.0)){
        for(.platform in c("pacbio", "ontlong", "ontshort")){
            j <- j + 1
            .row$depth <- .depth
            .row$platform <- .platform
            assessment_df[j,] <- .row
        }
    }
}
assessment_df <- rename(assessment_df, zygosity = state)
head(assessment_df, 8)
Loading...

Assess Identified Inversions

The function to do a fuzzy-match to establish if the identified inversions are true/false positives/negatives

near_breakpoint <- function(x, y, tolerance = 75){
    # x is the simulated/known breakpoints
    # y is the breakpoint we want to compare
    # check if y is within tolerance of x
    return(
        (y >= x - tolerance) & (y <= x + tolerance)
    )
}
found_inversions$assessment <- "false positive"
false_positives <- found_inversions[0,]
for(i in 1:nrow(found_inversions)){
    .row <- found_inversions[i,]
    query <- which(
        assessment_df$sample == .row$sample &
        assessment_df$platform == .row$platform &
        assessment_df$depth == .row$depth &
        assessment_df$contig == .row$chrom &
        assessment_df$size == .row$size &
        near_breakpoint(assessment_df$position_start, .row$start, 100) &
        near_breakpoint(assessment_df$position_end, .row$end, 100)
    )
    if(length(query) > 0 && assessment_df$present[query]){
        found_inversions$assessment[i] <- "true positive"
        assessment_df$assessment[query] <- "true positive"
    } else {
        assessment_df$assessment[query] <- "false positive"
        false_positives <- rbind(false_positives, .row)
    }
}
nrow(false_positives)
if (nrow(false_positives) > 0) {
    false_positives$sample <- as.integer(gsub("sample_", "", false_positives$sample))
    false_positives <- select(false_positives, -quality, -assessment) %>% rename("contig" = chrom, "position_start" = start, "position_end" = end, "method" = platform) %>%
    select("contig", "position_start", "position_end", "sample", "depth", "size", "method")
    write.table(false_positives, file = "false_positives.longreads", row.names = F, quote = F)
    head(false_positives)
}
Loading...
Loading...

Let’s write this massive table to a file.

write.csv(assessment_df, file = "longread.sv.assessment", row.names = F, quote = F)
table(assessment_df$assessment)
false negative true negative true positive 1194 1950 4056

Well, it’s now missing false positives b/c those were put into a separate table. Those need to be combined now.

fp <- false_positives
fp$zygosity <- "homozygous" #actual value doesnt matter
fp$assessment <- "false positive"
fp$present <- FALSE
fp <- rename(fp, "platform" = method)
assessment_df <- rbind(assessment_df, fp)
head(assessment_df)
Loading...

Plotting the Inversions

We are plotting the inversions across the genome as a visual assessment, along with a presence/absence matrix across platforms and zygotic state. Small inversions were made bigger by 100kb to see them easier on the plot. All other inversions had their lengths increased by 50kb for the same reason.

Source
plot_inversions <- function(data, size_treatment){
    if(size_treatment == "small"){
        scale_factor = 100000
    } else {
        scale_factor = 50000
    }

    filter(data, size == size_treatment, assessment == "true positive") %>%
    ggplot(
        aes(
            x = sample,
            ymin = position_start,
            ymax = position_end + scale_factor,
            color = platform,
        )
    ) +
        scale_y_continuous(labels = scales::comma) +
        coord_flip() +
        geom_linerange(position = position_dodge(.7), linewidth = 2) +
        labs(title = "Called Inversions Across the Genome", subtitle = paste0("Size: ", size_treatment), y = "genomic position (bp)") + 
        facet_grid(cols = vars(contig), rows = vars(depth)) +
        theme_light() +
        theme(panel.grid.major.y = element_blank(), legend.position = "top")
}

plot_samples_matrix <- function(data, size_treatment){
filter(data, size == size_treatment) %>%
    group_by(size, contig, position_start) %>% mutate(id = cur_group_id()) %>% ungroup() %>%
    ggplot(aes(y = sample, x = id, fill = assessment, color = assessment, shape = zygosity)) +
        geom_point(size=6) +
        theme_light() +
        labs(title = "By-Sample Inversion Detection", subtitle = "Inversions detected in individual samples, as a function of zygotic state.", caption = paste0("Size: ", size_treatment)) +
        scale_color_manual(name = "Assessment", values = c("false negative" = "grey75", "true negative" = "white", "true positive" = "#90aed8")) +
        scale_fill_manual(name = "Assessment", values = c("false negative" = "white", "true negative" = "white", "true positive" = "#90aed8")) +
        scale_shape_manual(name = "Zygosity", values = c("homozygous" = 21, "heterozygous" = 22)) +
        scale_x_discrete(name = "Inversion") +
        theme(
            panel.grid.major = element_blank(),
            panel.grid.minor = element_blank()
        ) +
        facet_grid(rows = vars(depth), cols = vars(platform))
}

Small Inversions

plot_inversions(assessment_df, "small")
plot_samples_matrix(assessment_df, "small")
plot without title
plot without title

Medium Inversions

plot_inversions(assessment_df, "medium")
plot_samples_matrix(assessment_df, "medium")
plot without title
plot without title

Large Inversions

plot_inversions(assessment_df, "large")
plot_samples_matrix(assessment_df, "large")
plot without title
plot without title

XL Inversions

plot_inversions(assessment_df, "xl")
plot_samples_matrix(assessment_df, "xl")
plot without title
plot without title

Precision vs Recall

metrics <- group_by(assessment_df, size, depth, platform) %>%
    summarize(
        TP = sum(assessment == "true positive"),
        FP = sum(assessment == "false positive"),
        TN = sum(assessment == "true negative"),
        FN = sum(assessment == "false negative")
    ) %>% ungroup()
metrics$precision <- metrics$TP / (metrics$TP + metrics$FP)
metrics$recall <- metrics$TP / (metrics$TP + metrics$FN)
metrics$F1 <- 2 * ((metrics$precision * metrics$recall) / (metrics$precision + metrics$recall))
metrics$depth <- metrics$depth
metrics$size <- factor(metrics$size, ordered = T, levels = c("small", "medium", "large", "xl"))
head(metrics)
`summarise()` has grouped output by 'size', 'depth'. You can override using the
`.groups` argument.
Loading...
greys <- c(
    "#c0c6cf",
    "#a2aab5",
    "#6e7684",
    "#54546c",
    "black"
)

colorful <- c(
    "#9b6981",
    "#682c37",
    "#f6955e" ,
    "#a8cdec"
)
options(warn = -1, repr.plot.width = 10, repr.plot.height = 10)
ggplot(metrics, aes(x = recall, y = precision, color = as.factor(depth), shape = size)) +
    geom_abline(intercept = 0, slope = 1, color = "#ab99b1ff") +
    geom_point(size = 6) +
    theme_light() +
    scale_color_manual(name = "Sample Depth", values = greys) +
    scale_shape_manual(name = "Inversion Size", values = c(19,15,18,17)) +
    coord_cartesian(xlim = c(0,1), ylim = c(0,1)) +
    labs(title = "Precision-Recall Curve (long reads)") +
    facet_grid(cols = vars(platform), rows = vars(size))
plot without title
options(warn = -1, repr.plot.width = 15, repr.plot.height = 5)
ggplot(metrics, aes(x = depth, y = F1, color = platform, shape = platform, group = paste(size, platform))) +
    geom_line() +
    geom_point(size = 6) +
    theme_light() +
    scale_shape_manual(name = "Platform", values = c(19, 15, 17)) +
    scale_color_manual(name = "Platform", values = colorful) +
    labs(title = "F-scores Across Inversion Sizes (long reads)") +
    facet_grid(cols = vars(size))
plot without title