Here, we visualize the inversions that were identified for the xl
treatment across the range of depths. The LEVIATHAN
small, medium, and large call rates are configured to -s 95 -m 95 -l 90
.
library(ggplot2)
library(dplyr)
library(ggpubr)
options(warn = -1, repr.plot.width = 19, repr.plot.height = 12)
.size <- "xl"
Output
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
These are the setup functions that process that data. Note this fuzzy-match tolerates +- 1kb instead of the 500bp of small
and medium
.
Source
read_data <- function(size_treatment, depth){
.depth <- paste("depth",depth, sep = "_")
truthfile <- paste0("simulated_variants/setup_inversions/", size_treatment, "/inv.", size_treatment,".vcf")
samplesfile <- paste("simulated_data/called_sv/leviathan", size_treatment, .depth, "by_sample90/inversions.bedpe", sep = "/")
poolfile <- paste("simulated_data/called_sv/leviathan", size_treatment, .depth, "by_pop90/inversions.bedpe", sep = "/")
true_inversions <- read.table(truthfile, header = F)[,c(1,2,8)]
true_inversions$V8 <- as.numeric(unlist(lapply(true_inversions$V8, function(X){gsub(".+END=", "", X)})))
names(true_inversions) <- c("contig", "position_start", "position_end")
true_inversions$sample <- "simulated_inversions"
sample_inversions <- read.table(samplesfile, header = T)[,1:4]
pooled_inversions <- read.table(poolfile, header = T)
pooled_inversions <- pooled_inversions[,c("population","contig", "position_start", "position_end")]
names(pooled_inversions)[1] <- "sample"
if( nrow(pooled_inversions) > 0){
pooled_inversions$sample <- "pooled" #paste("pooled", depth, size_treatment, sep = "_")
}
return(rbind(true_inversions, sample_inversions, pooled_inversions))
}
assess_performace <- function(data, out_df){
truth <- filter(data, sample == "simulated_inversions")
truth <- group_by(truth, contig) %>% mutate(id = 1:n())
#truth <- group_by(truth, het) %>% mutate(id = 1:n())
samples <- filter(data, sample != "simulated_inversions")
for(i in 1:nrow(samples)){
.row <- samples[i,]
contig <- .row$contig
.sample <- .row$sample
startpos <- .row$position_start
endpos <- .row$position_end
tb <- filter(truth, contig == contig, (position_start - 500 <= startpos) & (position_end + 500 >= endpos))
if(nrow(tb) > 0){
out_df[out_df$sample == .sample & out_df$contig == contig & out_df$inversion == tb$id, "state"] <- "true positive"
}
}
out_df$contig <- gsub("2L", "2L (all homozygous)", out_df$contig)
out_df$contig <- gsub("2R", "2R (all heterozygous)", out_df$contig)
out_df$contig <- gsub("3L", "3L (inversions common)", out_df$contig)
out_df$contig <- gsub("3R", "3R (inversions rare)", out_df$contig)
return(out_df)
}
Source
plot_inversions <- function(data, size_treatment, depth){
axis_ticks <- factor(c("pooled", paste0("sample_", sprintf("%02d", 1:10)), "simulated_inversions"))
ggplot(
data,
aes(
x = position_start,
xend = position_end,
y = sample,
yend = sample,
color = sample
)
) +
scale_x_continuous(labels = scales::comma) +
scale_y_discrete(limits = axis_ticks) +
labs(title = "Simulated and Called Inversions Across the Genome", subtitle = paste0("Size: ", size_treatment, " | Depth: ", depth,"X"), x = "genomic position (bp)", caption = "simulated_inversions are groundtruth") +
geom_segment(linewidth = 2) +
facet_wrap(~contig, ncol = 1, scales = "free_x") +
theme_light() +
theme(panel.grid.major.y = element_blank(), legend.position = "None")
}
plot_samples_matrix <- function(data, size_treatment, depth){
.data <- data[data$sample != "pooled" & !is.na(data$zygosity),]
axis_ticks <- factor(paste0("sample_", sprintf("%02d", 1:10)))
ggplot(.data, aes(y = sample, x = factor(id, levels = unique(.data$id), ordered = T), fill = state)) +
geom_tile(color = "white") +
theme_light() +
labs(title = "By-Sample Inversion Detection", subtitle = "Inversions detected in individual samples, as a function of zygotic state.", caption = paste0("Size: ", size_treatment, " | Depth: ", depth,"X")) +
scale_fill_manual(values = c("false negative" = "grey70", "true positive" = "#90aed8")) +
scale_x_discrete(name = "Inversion") +
scale_y_discrete(limits = axis_ticks, breaks = axis_ticks) +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
) +
facet_grid(cols = vars(zygosity), scales = "free_x")
}
plot_pools_matrix <- function(data, size_treatment, depth){
.data <- data[data$sample == "pooled",]
ggplot(.data, aes(y = 1, x = as.character(id), fill = state)) +
geom_tile(color = "white") +
theme_light() +
labs(title = "Pooled-Sample Detection", subtitle = "Inversions detected in sample-pooled data, as a function of inversion frequency in the population.", caption = paste0("Size: ", size_treatment, " | Depth: ", depth,"X")) +
scale_fill_manual(values = c("false negative" = "grey70", "true positive" = "#90aed8")) +
scale_y_continuous(breaks = 1, name = "State") +
scale_x_discrete(name = "Inversion") +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()
) +
facet_grid(cols = vars(contig), scales = "free_x")
}
A function to wrap all these other functions for simplicity.
Source
summarize_performance <- function(depth, inventory, .size){
.data <- read_data(.size, depth)
.performance <- assess_performace(.data, inventory)
.sampleplot <- plot_samples_matrix(.performance, .size, depth)
.poolplot <- plot_pools_matrix(.performance, .size, depth)
.inversionplot <- plot_inversions(.data, .size, depth)
show(
ggarrange(
ggarrange(.sampleplot, .poolplot, ncol = 2, labels = c("A", "B"), common.legend = T),
.inversionplot, nrow = 2, labels = c("","C"), heights = c(1.5,4)
)
)
return(.performance)
}
Read in the sample and pool simulation inventory
Source
inventory <- rbind(
filter(read.table("inversion_simulations.inventory", header = T), size == .size),
filter(read.table("inversion_simulations.pool.inventory", header = T), size == .size)
)
performance <- rename(inventory, "simulated" = "present", "zygosity" = "state")
# set a default to false negatives, to be updated to true positives when conditions are met
performance$state <- "false negative"
# there are no false positives, so we can default inversions that weren't simulated to true negatives
performance[!performance$simulated, 7] <- "true negative"
# fix the name of pool to pooled
performance$sample <- gsub("pool", "pooled", performance$sample)
# read in truth data to transfer inversion breakpoints
x <- read_data(.size, 5)
truth <- group_by(x[x$sample == "simulated_inversions",], contig) %>% mutate(inversion = 1:n())
truth <- truth[, c(1,5,2,3)]
performance <- merge(performance,truth, by = c("contig", "inversion"))
performance$id <- as.integer(as.factor(performance$position_start))
performance <- performance[, c(1,2,10,8,9,3:7)]
tail(performance)
Loading...
0.5X depth¶
assessment_05X <- summarize_performance(0.5, performance, .size)

2X Depth¶
assessment_2X <- summarize_performance(2, performance, .size)

5X Depth¶
assessment_5X <- summarize_performance(5, performance, .size)

10X Depth¶
assessment_10X <- summarize_performance(10, performance, .size)

20X Depth¶
assessment_20X <- summarize_performance(20, performance, .size)

Write the results to a file
.x <- list(assessment_05X, assessment_2X, assessment_5X, assessment_10X, assessment_20X)
depths <- c(0.5, 2, 5, 10, 20)
aggregate_df <- assessment_05X[0,]
aggregate_df$depth <- character()
for(i in 1:5){
.tbl <- .x[i][[1]]
.tbl$depth <- depths[i]
aggregate_df <- rbind(aggregate_df, .tbl)
}
# sort it to make it visually sensible
aggregate_df <- arrange(aggregate_df, contig,inversion,depth, sample)
# restore contig names
aggregate_df$contig <- gsub("\\s*\\([^\\)]+\\)", "", aggregate_df$contig)
head(aggregate_df)
ifelse(!dir.exists("assess_called_sv"), dir.create("assess_called_sv"), TRUE)
write.csv(aggregate_df, file = paste0("assess_called_sv/",.size, ".sv.assessment"), quote = F, row.names = F)
Loading...