Skip to article frontmatterSkip to article content

XL Inversions

Here, we visualize the inversions that were identified for the xl treatment across the range of depths. The LEVIATHAN small, medium, and large call rates are configured to -s 95 -m 95 -l 90.

library(ggplot2)
library(dplyr)
library(ggpubr)
options(warn = -1, repr.plot.width = 19, repr.plot.height = 12)
.size <- "xl"
Output

Attaching package: 'dplyr'


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


These are the setup functions that process that data. Note this fuzzy-match tolerates +- 1kb instead of the 500bp of small and medium.

Source
read_data <- function(size_treatment, depth){
    .depth <- paste("depth",depth, sep = "_")
    truthfile <- paste0("simulated_variants/setup_inversions/", size_treatment, "/inv.", size_treatment,".vcf")
    samplesfile <- paste("simulated_data/called_sv/leviathan", size_treatment, .depth, "by_sample90/inversions.bedpe", sep = "/")
    poolfile <- paste("simulated_data/called_sv/leviathan", size_treatment, .depth, "by_pop90/inversions.bedpe", sep = "/")

    true_inversions <- read.table(truthfile, header = F)[,c(1,2,8)]
    true_inversions$V8 <- as.numeric(unlist(lapply(true_inversions$V8, function(X){gsub(".+END=", "", X)})))
    names(true_inversions) <- c("contig", "position_start", "position_end")
    true_inversions$sample <- "simulated_inversions"

    sample_inversions <- read.table(samplesfile, header = T)[,1:4]

    pooled_inversions <- read.table(poolfile, header = T)
    pooled_inversions <- pooled_inversions[,c("population","contig", "position_start", "position_end")]
    names(pooled_inversions)[1] <- "sample"
    if( nrow(pooled_inversions) > 0){
        pooled_inversions$sample <- "pooled" #paste("pooled", depth, size_treatment, sep = "_")
    }
    return(rbind(true_inversions, sample_inversions, pooled_inversions))
}

assess_performace <- function(data, out_df){
    truth <- filter(data, sample == "simulated_inversions")
    truth <- group_by(truth, contig) %>% mutate(id = 1:n())
    #truth <- group_by(truth, het) %>% mutate(id = 1:n())
    
    samples <- filter(data, sample != "simulated_inversions")
    for(i in 1:nrow(samples)){
        .row <- samples[i,]
        contig <- .row$contig
        .sample <- .row$sample
        startpos <- .row$position_start
        endpos <- .row$position_end
        tb <- filter(truth, contig == contig, (position_start - 500 <= startpos) & (position_end + 500 >= endpos))
        if(nrow(tb) > 0){
            out_df[out_df$sample == .sample & out_df$contig == contig & out_df$inversion == tb$id, "state"] <- "true positive"
        }
    }
    out_df$contig <- gsub("2L", "2L (all homozygous)", out_df$contig)
    out_df$contig <- gsub("2R", "2R (all heterozygous)", out_df$contig)
    out_df$contig <- gsub("3L", "3L (inversions common)", out_df$contig)
    out_df$contig <- gsub("3R", "3R (inversions rare)", out_df$contig)
    return(out_df)
}
Source
plot_inversions <- function(data, size_treatment, depth){
    axis_ticks <- factor(c("pooled", paste0("sample_", sprintf("%02d", 1:10)), "simulated_inversions"))
    ggplot(
        data,
        aes(
            x = position_start,
            xend = position_end,
            y = sample,
            yend = sample,
            color = sample
        )
    ) +
        scale_x_continuous(labels = scales::comma) +
        scale_y_discrete(limits = axis_ticks) +
        labs(title = "Simulated and Called Inversions Across the Genome", subtitle = paste0("Size: ", size_treatment, " | Depth: ", depth,"X"), x = "genomic position (bp)", caption = "simulated_inversions are groundtruth") + 
        geom_segment(linewidth = 2) +
        facet_wrap(~contig, ncol = 1, scales = "free_x") +
        theme_light() +
        theme(panel.grid.major.y = element_blank(), legend.position = "None")
}

plot_samples_matrix <- function(data, size_treatment, depth){
    .data <- data[data$sample != "pooled" & !is.na(data$zygosity),]
    axis_ticks <- factor(paste0("sample_", sprintf("%02d", 1:10)))
    ggplot(.data, aes(y = sample, x = factor(id, levels = unique(.data$id), ordered = T), fill = state)) +
        geom_tile(color = "white") +
        theme_light() +
        labs(title = "By-Sample Inversion Detection", subtitle = "Inversions detected in individual samples, as a function of zygotic state.", caption = paste0("Size: ", size_treatment, " | Depth: ", depth,"X")) +
        scale_fill_manual(values = c("false negative" = "grey70", "true positive" = "#90aed8")) +
        scale_x_discrete(name = "Inversion") +
        scale_y_discrete(limits = axis_ticks, breaks = axis_ticks) +
        theme(
            panel.grid.major = element_blank(),
            panel.grid.minor = element_blank()
        ) +
        facet_grid(cols = vars(zygosity), scales = "free_x")
}

plot_pools_matrix <- function(data, size_treatment, depth){
    .data <- data[data$sample == "pooled",]
    ggplot(.data, aes(y = 1, x = as.character(id), fill = state)) +
        geom_tile(color = "white") +
        theme_light() +
        labs(title = "Pooled-Sample Detection", subtitle = "Inversions detected in sample-pooled data, as a function of inversion frequency in the population.", caption = paste0("Size: ", size_treatment, " | Depth: ", depth,"X")) +
        scale_fill_manual(values = c("false negative" = "grey70", "true positive" = "#90aed8")) +
        scale_y_continuous(breaks = 1, name = "State") +
        scale_x_discrete(name = "Inversion") +
        theme(
            panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(),
            axis.text.y = element_blank(),
            axis.ticks.y = element_blank()
        ) +
        facet_grid(cols = vars(contig), scales = "free_x")
}

A function to wrap all these other functions for simplicity.

Source
summarize_performance <- function(depth, inventory, .size){
    .data <- read_data(.size, depth)
    .performance <- assess_performace(.data, inventory)
    .sampleplot <- plot_samples_matrix(.performance, .size, depth)
    .poolplot <- plot_pools_matrix(.performance, .size, depth)
    .inversionplot <- plot_inversions(.data, .size, depth)
    show(
        ggarrange(
            ggarrange(.sampleplot, .poolplot, ncol = 2, labels = c("A", "B"), common.legend = T),
            .inversionplot, nrow = 2, labels = c("","C"), heights = c(1.5,4)
        )
    )
    return(.performance)
}

Read in the sample and pool simulation inventory

Source
inventory <- rbind(
    filter(read.table("inversion_simulations.inventory", header = T), size == .size),
    filter(read.table("inversion_simulations.pool.inventory", header = T), size == .size)
)
performance <- rename(inventory, "simulated" = "present", "zygosity" = "state")
# set a default to false negatives, to be updated to true positives when conditions are met
performance$state <- "false negative"
# there are no false positives, so we can default inversions that weren't simulated to true negatives
performance[!performance$simulated, 7] <- "true negative"
# fix the name of pool to pooled
performance$sample <- gsub("pool", "pooled", performance$sample)
# read in truth data to transfer inversion breakpoints
x <- read_data(.size, 5)
truth <- group_by(x[x$sample == "simulated_inversions",], contig) %>% mutate(inversion = 1:n())
truth <- truth[, c(1,5,2,3)]
performance <- merge(performance,truth, by = c("contig", "inversion"))
performance$id <- as.integer(as.factor(performance$position_start))
performance <- performance[, c(1,2,10,8,9,3:7)]
tail(performance)
Loading...

0.5X depth

assessment_05X <- summarize_performance(0.5, performance, .size)
plot without title

2X Depth

assessment_2X <- summarize_performance(2, performance, .size)
plot without title

5X Depth

assessment_5X <- summarize_performance(5, performance, .size)
plot without title

10X Depth

assessment_10X <- summarize_performance(10, performance, .size)
plot without title

20X Depth

assessment_20X <- summarize_performance(20, performance, .size)
plot without title

Write the results to a file

.x <- list(assessment_05X, assessment_2X, assessment_5X, assessment_10X, assessment_20X)
depths  <- c(0.5, 2, 5, 10, 20)
aggregate_df <- assessment_05X[0,]
aggregate_df$depth <- character()
for(i in 1:5){
    .tbl <- .x[i][[1]]
    .tbl$depth <- depths[i]
    aggregate_df <- rbind(aggregate_df, .tbl)
}
# sort it to make it visually sensible
aggregate_df <- arrange(aggregate_df, contig,inversion,depth, sample)
# restore contig names
aggregate_df$contig <- gsub("\\s*\\([^\\)]+\\)", "", aggregate_df$contig)
head(aggregate_df)
ifelse(!dir.exists("assess_called_sv"), dir.create("assess_called_sv"), TRUE)
write.csv(aggregate_df, file = paste0("assess_called_sv/",.size, ".sv.assessment"), quote = F, row.names = F)
Loading...