Skip to article frontmatterSkip to article content

XL-sized Inversions

Here, we visualize the inversions that were identified for the xl treatment across the range of depths for the 90% threshold called variants. An extra consideration here is that with the deconvolution-during-concatenation step, Harpy was unable to call variants on the 10X and 20X data because there were too many molecules (more than the 96^4 possible with haplotagging).

library(ggplot2)
options(warn = -1)

read_data <- function(size_treatment, depth){
    truthfile <- paste0("simulated_data/inversions/", size_treatment, "/inv.", size_treatment,".vcf")
    samplesfile <- paste("simulated_data/called_sv/leviathan90", size_treatment, depth , "by_sample/inversions.bedpe", sep = "/")
    poolfile <- paste("simulated_data/called_sv/leviathan90", size_treatment, depth, "by_pop/inversions.bedpe", sep = "/")

    true_inversions <- read.table(truthfile, header = F)[,c(1,2,8)]
    true_inversions$V8 <- as.numeric(unlist(lapply(true_inversions$V8, function(X){gsub(".+END=", "", X)})))
    names(true_inversions) <- c("contig", "position_start", "position_end")
    true_inversions$sample <- factor(c("simulated_inversions"))

    sample_inversions <- read.table(
        paste("simulated_data/called_sv/leviathan90", size_treatment, depth, "by_sample/inversions.bedpe", sep = "/"),
        header = T
    )[,1:4]
    sample_inversions$sample <- as.factor(sample_inversions$sample)
    if(file.exists(poolfile)){
        pooled_inversions <- read.table(poolfile, header = T)
        pooled_inversions <- pooled_inversions[,c("population","contig", "position_start", "position_end")]
        names(pooled_inversions)[1] <- "sample"
        if(nrow(pooled_inversions > 0)){
            pooled_inversions$sample <- factor(paste("pooled", depth, size_treatment, sep = "_"))
        }
    } else {
        # get an empty copy
        pooled_inversions <- sample_inversions[0, ]
    }
    return(
        rbind(true_inversions, sample_inversions, pooled_inversions)
    )
}
plot_data <- function(data, size_treatment, depth){
    axis_ticks <- c(paste0("sample_", sprintf("%02d", 1:10), ".", depth), "simulated_inversions", paste("pooled", depth, size_treatment, sep = "_"))
    ggplot(
        data,
        aes(
            x = position_start,
            xend = position_end,
            y = sample,
            yend = sample,
            color = sample
        )
    ) +
        scale_x_continuous(labels = scales::comma) +
        scale_y_discrete(limits = axis_ticks) +
        labs(title = "Called Inversions", subtitle = paste(size_treatment, depth), x = "genomic position (bp)", caption = "Simulated inversions (groundtruth) shown in top row") + 
        geom_segment(linewidth = 2) +
        facet_wrap(~contig, ncol = 1, scales = "free_x") +
        theme_light() +
        theme(panel.grid.major.y = element_blank(), legend.position = "None")
}

Set a global variable .size to "xl" to avoid writing it each time

.size <- "xl"

0.5X depth

depth_05X <- read_data(.size, "05X")
tail(depth_05X)
Loading...
plot_data(depth_05X, .size, "05X")
plot without title

2X Depth

depth_2X <- read_data(.size, "2X")
tail(depth_2X)
Loading...
plot_data(depth_2X, .size, "2X")
plot without title

5X Depth

depth_5X <- read_data(.size, "5X")
tail(depth_5X)
Loading...
plot_data(depth_5X, .size, "5X")
plot without title

10X Depth

depth_10X <- read_data(.size, "10X")
tail(depth_10X)
Loading...
plot_data(depth_10X, .size, "10X")
plot without title

20X Depth

depth_20X <- read_data(.size, "20X")
tail(depth_20X)
Loading...
plot_data(depth_20X, .size, "20X")
plot without title