Compare Inversion Calls Across Treatments

What would be interesting to see is the inversion-calling performance in linked reads across inversion sizes and depths. Since we already created the assessment tables from the individual size-class data exploration, we can read those in without having to reprocess those data.

list.files("assess_called_sv")

assess <- Reduce(rbind, Map(function(x) {read.csv(x, header = T)}, list.files("assess_called_sv", full.names = T)))

# remove unnecessary columns
assess <- assess[, c(1, 6:11)]
assess$depth <- as.factor(assess$depth)
assess$size <- factor(assess$size, ordered = T, levels = c("small","medium","large","xl"))

head(assess, 10)

Sample-level Calling¶

Prepare the Data¶

It would probably make sense to collapse these data somewhat to show a representation of true positives vs false negatives (as a ratio) across depths (x axis) and faceted for hom/het in the non-pooled data. We will also remove inversions that weren’t simulated for a sample b/c we aren’t interested in true negatives (since there were no false positives).

samples <- assess[assess$sample != "pooled" & assess$simulated,]

samples <- group_by(samples, size, depth, zygosity, state) %>%
    summarize(count = n())

head(samples)

Some of the treatments don’t have true positives or false negatives, so we’ll need to add in rows with count 0 for those treatments.

# add missing rows with value 0 in the event there were no true positives or false negatives for a treatment
# instantiate empty def
missings <- samples[0,]

for(size in c("small","medium","large","xl")){
    for(depth in levels(samples$depth)){
        for(zyg in c("hom","het")){
            .df <- samples[samples$size == size & samples$depth == depth & samples$zygosity == zyg,]
            if(!("false negative" %in% .df$state)){
                missings[nrow(missings)+1,] <- list(size, depth, zyg,"false negative", 0)
            }
            if(!("true positive" %in% .df$state)){
                missings[nrow(missings)+1,] <- list(size, depth, zyg,"true positive", 0)
            }
        }
    }
}

samples <- rbind(samples,missings) %>% arrange(size, depth, zygosity)
head(samples)

One last thing we’ll need to do is convert counts to proportions

samples <- group_by(samples, size, depth, zygosity) %>% mutate(proportion = round(count/sum(count),2))
head(samples)

Plotting¶

True Positives vs False Negatives¶

Now, let’s finally plot these data:

X-axis: depth
Y-axis: proportion
facets
- columns: zygosity
- rows: size

ggplot(samples, aes(x = depth, y = proportion, color = state, group = state)) +
    geom_line() +
    geom_point(size = 8) +
    theme_light() +
    labs(title = "Inversion Detection by Zygotic State and Variant Size", color = "Detection") +
    ylab("Proportion") +
    xlab("Sequencing Depth") +
    theme(panel.grid.minor.y = element_blank(), panel.grid.major.x = element_blank()) +
    facet_grid(cols = vars(zygosity), rows = vars(size))

True Positives Across Depths¶

The proportions will always sum to 1, so let’s try a consolidated view where it’s just the true positives across size treatments.

ggplot(samples[samples$state == "true positive",], aes(x = depth, y = proportion, color = size, group = size)) +
    geom_line() +
    geom_point(size = 8) +
    theme_light() +
    labs(title = "Inversion Detection by Zygotic State and Variant Size", color = "Inversion Size") +
    ylab("Proportion of Inversions Detected") +
    xlab("Sequencing Depth") +
    theme(panel.grid.minor.y = element_blank(), panel.grid.major.x = element_blank()) +
    facet_grid(cols = vars(zygosity))

True Positives heterozygotic vs homozygotic¶

What about the change in performance between homozygotic and heterozygotic states?

ggplot(samples[samples$state == "true positive",], aes(x = depth, y = proportion, color = zygosity, group = zygosity)) +
    geom_line() +
    geom_point(aes(shape = zygosity), size = 7) +
    theme_light() +
    scale_shape_manual(values = c("hom"= 19,"het" = 18)) +
    labs(title = "Inversion Detection by Zygotic State and Variant Size", color = "Zygotic State", shape = "Zygotic State") +
    ylab("Proportion of Inversions Detected") +
    xlab("Sequencing Depth") +
    theme(panel.grid.minor.y = element_blank(), panel.grid.major.x = element_blank()) +
    facet_grid(cols = vars(size))

Pooled Data¶

Now let’s look at the analogous characteristics of the pooled data.

Data Preparation¶

We need to filter for pooled samples only. The contig names don’t matter so much here as much as “common” or “rare” does, so we need to rename the contigs and sum their values. For plotting purporses, we’ll also want to get a count of the number of inversions for each category so there’s an indication of sample size for each treatment.

pooled <- assess[assess$sample == "pooled" & assess$simulated,]
pooled$treatment <- case_when(
    pooled$contig == "2L" ~ "Common",
    pooled$contig == "2R" ~ "Common",
    pooled$contig == "3L" ~ "Common",
    pooled$contig == "3R" ~ "Rare"
)

pooled <- group_by(pooled, treatment, size, depth, state) %>%
    summarize(count = n()) %>%
    group_by(treatment, size, depth) %>%
    mutate(total = sum(count))

head(pooled)

Again, some of the treatments are missing false negative or true positive rows, so we need to add them with a count of 0.

# add missing rows with value 0 in the event there were no true positives or false negatives for a treatment
# instantiate empty def
missings <- pooled[0,]

for(size in c("small","medium","large","xl")){
    for(depth in levels(pooled$depth)){
        for(.treat in unique(pooled$treatment)){
            .df <- pooled[pooled$size == size & pooled$depth == depth & pooled$treatment == .treat,]
            .tot <- sum(.df$total)
            if(!("false negative" %in% .df$state)){
                missings[nrow(missings)+1,] <- list(.treat, size, depth, "false negative", 0, .tot)
            }
            if(!("true positive" %in% .df$state)){
                missings[nrow(missings)+1,] <- list(.treat, size, depth, "true positive", 0, .tot)
            }
        }
    }
}

pooled <- rbind(pooled,missings) %>% arrange(treatment, size, depth)
head(pooled)

Lastly, convert the counts to proportions. Let’s also rename the depths to properly reflect that with aggregates, the depth is actually 10X higher. Seems silly to have to do as.numeric(as.character()), but if you don’t convert it to a Character first, 0.5 gets misinterpereted as 1.

pooled <- group_by(pooled, treatment, size, depth) %>% mutate(proportion = round(count/sum(count),2))
agg_depths <- paste0(as.numeric(as.character(pooled$depth)) * 10, " (", pooled$depth, ")")
pooled$agg_depth <- factor(agg_depths, levels = unique(agg_depths))
head(pooled)

Plotting¶

True Positives vs False Negatives¶

ggplot(pooled, aes(x = agg_depth, y = proportion, color = state, group = state)) +
    geom_line() +
    geom_point(aes(size = count)) +
    theme_light() +
    labs(title = "Inversion Detection (Pooled Data) by Variant Size", color = "Detection") +
    ylab("Proportion") +
    xlab("Effective Sequencing Depth (Per-Sample Depth)") +
    theme(panel.grid.minor.y = element_blank(), panel.grid.major.x = element_blank()) +
    facet_grid(cols = vars(treatment), rows = vars(size))

True Positives Across Inversion Size Classes¶

Consolidate views of just true positives across treatments

ggplot(pooled[pooled$state == "true positive",], aes(x = agg_depth, y = proportion, color = size, group = size)) +
    geom_line() +
    geom_point(aes(size = total)) +
    theme_light() +
    labs(title = "Inversion Detection (Pooled Data) by Variant Size", color = "Inversion Size", size = "Total Inversions") +
    ylab("Proportion of Inversions Detected") +
    xlab("Effective Sequencing Depth (Per-Sample Depth)") +
    theme(panel.grid.minor.y = element_blank(), panel.grid.major.x = element_blank()) +
    facet_grid(cols = vars(treatment))

True Positives Across Inversion Frequency in a Population¶

What about the change in performance between contigs?

ggplot(pooled[pooled$state == "true positive",], aes(x = agg_depth, y = proportion, color = treatment, group = treatment)) +
    geom_line() +
    geom_point(aes(shape = treatment, size = total)) +
    theme_light() +
    scale_color_manual(values = c("Common"= "#aa7099", "Rare" = "#e0aa30")) +
    labs(title = "Inversion Detection (Pooled Data) by Variant Size", color = "Inversion Frequency", shape = "Inversion Frequency", size = "Total Inversions") +
    ylab("Proportion of Inversions Detected") +
    xlab("Effective Sequencing Depth (Per-Sample Depth)") +
    theme(panel.grid.minor.y = element_blank(), panel.grid.major.x = element_blank()) +
    facet_grid(cols = vars(size))

Inversion Simulations

03 Call Variants

04 Assess Linked-Read Performance

Small Inversions