We are interested in comparing linked-read inversion-calling performance to long-read inversion-calling performance. Since long-read pricing varies between vendors and will change dramatically over the course of this project and thereafter, we will restrict the long-read simulations to representatives of:
all samples
all inversion size classes
depths of 0.5, 2, 5, 10, and 20
The Workflow
During the pilot runs of this project, we developed a Snakemake workflow for this process, modifying it as the needs of the work became clearer. The Snakefile is therefore provided below in its entirety; however, most of the code is workflow-specific nuance, so the most relevant parts are annotated afterwards.
The workflow was invoked using:
snakemake -s longread_workflow/workflow/Snakefile --cores 12 --directory longread_workflow --sdm conda
Source
reference_genome = "../dmel_genome/dmel_2_3.fa.gz"
samples = ["sample_%02d" % i for i in range(1,11)]
sizes = ["small", "medium", "large", "xl"]
depths = ["05", "2", "5", "10", "20"]
platforms = ["pacbio", "ontshort", "ontlong"]

modes = {
    "pacbio" : "--difference-ratio 22:45:33 --accuracy-mean 0.99 --length-mean 8000 --length-sd 1000 --length-min 2000 --length-max 20000",
    "ontshort" : "--difference-ratio 39:24:36 --accuracy-mean 0.99 --length-mean 3000 --length-sd 1000",
    "ontlong" : "--difference-ratio 39:24:36 --accuracy-mean 0.99 --length-mean 50000 --length-sd 20000"
}

wildcard_constraints:
    runtype = "[a-zA-Z0-9]+",
    size = "[a-zA-Z0-9]+",
    sample_id = "[0-9]+",
    depth = "[0-9]+"

rule all:
    default_target: True
    input:
        expand("sv/{sample}.{runtype}.{size}.{depth}.bcf", sample = samples, runtype = platforms, size = sizes, depth = depths),
        expand("logs/{sample}.{runtype}.{size}.{depth}.lengthdist", sample = samples, runtype = platforms, size = sizes, depth = depths)
    message:
        "Complete!"

rule decompress_fasta:
    input: "../simulated_variants/inversions_snps_genomes/{size}/sample_{sample_id}.snp_inv.hap{hap}.fasta.gz"
    output: temp("fasta/{size}.sample_{sample_id}.snp_inv.hap{hap}.fa")
    message: "Temporarily decompressing {input}"
    shell: "gzip -dc {input} > {output}"

rule simulate_longreads:
    input:
        geno = "fasta/{size}.sample_{sample_id}.snp_inv.hap{hap}.fa",
        model = "models/QSHMM-ONT-HQ.model"
    output:
        temp(expand("reads/sample_{{sample_id}}.{{runtype}}.{{size}}.{{depth}}.{{hap}}_{chunk}.fq.gz", chunk = ["%04d" % i for i in range(1,5)]))
    log:
        "logs/sample_{sample_id}.{runtype}.{size}.{depth}.{hap}.log"
    params:
        config = lambda wc: modes[wc.get("runtype")],
        depth = lambda wc: "0.5" if wc.get("depth") == "05" else wc.get("depth"),
        prefix = lambda wc: "reads/sample_" + ".".join([wc.get(i) for i in ["sample_id", "runtype", "size", "depth", "hap"]])
    message:
        "Simulating reads: sample_{wildcards.sample_id} {wildcards.runtype}.{wildcards.size}.{wildcards.depth}.haplotype_{wildcards.hap}"
    conda:
        "envs/longreads.yaml"
    shell:
        """
        pbsim --strategy wgs --method qshmm --seed 6969 --qshmm {input.model} \\
            --genome {input.geno} --depth {params.depth} {params.config} \\
            --prefix {params.prefix} --id-prefix {params.prefix} 2> {log} &&
        rm {params.prefix}*.maf.gz {params.prefix}*.ref
        """

rule merge_fastq:
    input:
        expand("reads/sample_{{sample_id}}.{{runtype}}.{{size}}.{{depth}}.{hap}_{chunk}.fq.gz", hap = [1,2], chunk = ["%04d" % i for i in range(1,5)])
    output:
        "reads/sample_{sample_id}.{runtype}.{size}.{depth}.fq.gz"
    message:
        "Concatenating fastq: sample_{wildcards.sample_id} {wildcards.runtype}.{wildcards.size}.{wildcards.depth}"
    shell:
        "cat {input} > {output}"

rule length_distribution:
    input:
        "reads/sample_{sample_id}.{runtype}.{size}.{depth}.fq.gz"
    output:
        "logs/sample_{sample_id}.{runtype}.{size}.{depth}.lengthdist"
    message:
        "Creating read length histogram: sample_{wildcards.sample_id} {wildcards.runtype}.{wildcards.size}.{wildcards.depth}"
    shell:
        """
        awk 'NR%4 == 2 {{lengths[length($0)]++}} END {{for (l in lengths) {{print l, lengths[l]}}}}' <(zcat {input}) > {output}
        """

rule align:
    input:
        geno = reference_genome,
        reads = "reads/sample_{sample_id}.{runtype}.{size}.{depth}.fq.gz"
    output:
        temp("align/sample_{sample_id}.{runtype}.{size}.{depth}.sam")
    log:
        "logs/sample_{sample_id}.{runtype}.{size}.{depth}.align.log"
    threads:
        6
    params:
        lambda wc: "map-ont" if "ont" in wc.get("runtype") else "map-pb"
    message:
        "Aligning: sample_{wildcards.sample_id} {wildcards.runtype}.{wildcards.size}.{wildcards.depth} in {params} mode"
    conda:
        "envs/longreads.yaml"
    shell:
        "minimap2 -t {threads} -ax {params} {input.geno} {input.reads} > {output} 2> {log}"

rule filter:
    input:
        "align/sample_{sample_id}.{runtype}.{size}.{depth}.sam"
    output:
        bam = temp("align/sample_{sample_id}.{runtype}.{size}.{depth}.filt.bam"),
        bai = temp("align/sample_{sample_id}.{runtype}.{size}.{depth}.filt.bam.bai")
    log:
        "logs/sample_{sample_id}.{runtype}.{size}.{depth}.filter.log"
    threads:
        2
    message:
        "Filtering and sorting: sample_{wildcards.sample_id} {wildcards.runtype}.{wildcards.size}.{wildcards.depth}"
    conda:
        "envs/longreads.yaml"
    shell:
        """
        samtools view -h -F 4 -q 20 {input} |
        samtools sort -O bam -l 0 -m 4G --write-index -o {output.bam}##idx##{output.bai} 2> {log}
        """

rule mark_duplicates:
    input:
        bam = "align/sample_{sample_id}.{runtype}.{size}.{depth}.filt.bam",
        bai = "align/sample_{sample_id}.{runtype}.{size}.{depth}.filt.bam.bai"
    output:
        bam = "align/sample_{sample_id}.{runtype}.{size}.{depth}.bam",
        bai = "align/sample_{sample_id}.{runtype}.{size}.{depth}.bam.bai"
    log:
        "logs/sample_{sample_id}.{runtype}.{size}.{depth}.markduplicates.log"
    threads:
        4
    message:
        "Marking duplicates: sample_{wildcards.sample_id} {wildcards.runtype}.{wildcards.size}.{wildcards.depth}"
    conda:
        "envs/longreads.yaml"
    shell:
        "sambamba markdup -t {threads} -l 0 {input.bam} {output.bam} 2> {log}"

rule call_variants:
    input:
        gen = reference_genome,
        reads = "align/sample_{sample_id}.{runtype}.{size}.{depth}.bam"
    output:
        "sv/sample_{sample_id}.{runtype}.{size}.{depth}.bcf"
    params:
        lambda wc: "ont" if "ont" in wc.get("runtype") else "pb"
    message:
        "Calling variants: sample_{wildcards.sample_id} {wildcards.runtype}.{wildcards.size}.{wildcards.depth} with mode {params}"
    conda:
        "envs/longreads.yaml"
    shell:
        "delly lr -y {params} -o {output} -g {input.gen} {input.reads}"
Housekeeping
The modes
There is a dict at the top that sets the configurations for the PacBio and Oxford Nanopore simulations with pbsim3. It specifies that we want the PacBio data to have an average read length of 8kbp (sd = 1kbp) with a min/max of 2kbp/20kbp. There are likewise two Nanopore datasets (ontshort and ontlong), where the short has a mean length of 3kbp (sd = 1kbp) and the long has a mean of 50kbp (sd = 20kbp). All three use an accuracy mean of 0.99, which is greater than the default of 0.85, to be generous.
modes = {
    "pacbio" : "--difference-ratio 22:45:33 --accuracy-mean 0.99 --length-mean 8000 --length-sd 1000 --length-min 2000 --length-max 20000",
    "ontshort" : "--difference-ratio 39:24:36 --accuracy-mean 0.99 --length-mean 3000 --length-sd 1000",
    "ontlong" : "--difference-ratio 39:24:36 --accuracy-mean 0.99 --length-mean 50000 --length-sd 20000"
}
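As a rough sanity check on what these settings imply, the expected read count scales as depth × genome size / mean read length. A back-of-the-envelope sketch, assuming ~110 Mbp for the chromosome 2 + 3 reference (an illustrative figure, not measured from the FASTA):
# Rough expected read counts per platform at each depth.
# genome_size is an assumption for illustration (~110 Mbp for dmel chr2 + chr3).
genome_size = 110_000_000
mean_lengths = {"pacbio": 8_000, "ontshort": 3_000, "ontlong": 50_000}

for platform, mean_len in mean_lengths.items():
    for depth in [0.5, 2, 5, 10, 20]:
        n_reads = round(depth * genome_size / mean_len)
        print(f"{platform}\tdepth={depth}\t~{n_reads:,} reads")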
Decompression
pbsim3 doesn't play nice with gzipped fasta files, so they need to be decompressed first:
gzip -dc {input} > {output}
Simulate Long Reads
Using pbsim3, we simulate long reads from the variant-modified genomes for all samples. This rule iterates over samples, inversion size classes, depths, and the pacbio, ontshort, and ontlong platforms. The .maf.gz alignment and .ref files that pbsim writes alongside the reads are removed immediately, since only the FASTQ output is needed downstream.
pbsim --strategy wgs --method qshmm --seed 6969 --qshmm {input.model} \
    --genome {input.geno} --depth {params.depth} {params.config} \
    --prefix {params.prefix} --id-prefix {params.prefix} 2> {log} &&
rm {params.prefix}*.maf.gz {params.prefix}*.ref
Length Distribution
This is something of a sanity check: we tally the distribution of read lengths in each merged FASTQ.
awk 'NR%4 == 2 {lengths[length($0)]++} END {for (l in lengths) {print l, lengths[l]}}' <(zcat {input}) > {output}
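Each resulting .lengthdist file is just length/count pairs, so summary statistics fall out directly. A minimal sketch (the file path is illustrative) computing the weighted mean length and N50 from one histogram:
# Summarize a read-length histogram produced by the awk one-liner above.
# Each line of the input is "<length> <count>"; the path is illustrative.
def summarize_lengthdist(path):
    hist = {}
    with open(path) as f:
        for line in f:
            length, count = line.split()
            hist[int(length)] = int(count)
    total_bases = sum(l * c for l, c in hist.items())
    total_reads = sum(hist.values())
    # N50: shortest length at which cumulative bases reach half the total
    running = 0
    for length in sorted(hist, reverse=True):
        running += length * hist[length]
        if running >= total_bases / 2:
            return total_bases / total_reads, length

mean_len, n50 = summarize_lengthdist("logs/sample_01.pacbio.small.20.lengthdist")
print(f"mean length: {mean_len:.0f} bp, N50: {n50} bp")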
Aligning Reads
Like most variant-calling approaches, we first need to align the reads to the reference genome. We use the long-read mapper minimap2 for that, with the map-pb or map-ont preset depending on the platform, and do some basic alignment filtering: unmapped reads (-F 4) and alignments with mapping quality below 20 (-q 20) are discarded.
minimap2 -t {threads} -ax {params} {input.geno} {input.reads} > {output} 2> {log}
samtools view -h -F 4 -q 20 {input} |
samtools sort -O bam -l 0 -m 4G --write-index -o {output.bam}##idx##{output.bai} 2> {log}
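The same filter criteria can be spot-checked from Python; a minimal sketch, assuming pysam is available (the SAM path is illustrative), counting how many alignments would survive:
import pysam

# Count alignments passing the same criteria as the samtools filter:
# mapped (-F 4) and mapping quality >= 20 (-q 20). Path is illustrative.
kept = total = 0
with pysam.AlignmentFile("align/sample_01.pacbio.small.20.sam") as sam:
    for aln in sam:
        total += 1
        if not aln.is_unmapped and aln.mapping_quality >= 20:
            kept += 1
print(f"{kept}/{total} alignments pass the filter")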
Marking Duplicates
As a precaution, we mark duplicates in the alignments, if there are any.
sambamba markdup -t {threads} -l 0 {input.bam} {output.bam} 2> {log}
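Since the reads are simulated independently, we expect essentially no duplicates; a quick check of that expectation (illustrative path, again assuming pysam) counts alignments carrying the duplicate flag after sambamba:
import pysam

# Count alignments flagged as duplicates by sambamba markdup.
# The BAM path is illustrative.
dups = total = 0
with pysam.AlignmentFile("align/sample_01.pacbio.small.20.bam") as bam:
    for aln in bam:
        total += 1
        dups += aln.is_duplicate
print(f"{dups}/{total} alignments flagged as duplicates")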
Call Structural Variants
Finally, we call structural variants using delly in long-read (lr) mode, with the technology flag (-y) set to ont or pb to match the platform.
delly lr -y {params} -o {output} -g {input.gen} {input.reads}
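Before aggregating everything, a single callset can be inspected directly; a minimal sketch (illustrative BCF path, assuming pysam) tallying the inversion calls by their FILTER status:
import pysam

# Count inversion (SVTYPE=INV) calls in one delly BCF, split by FILTER.
# The path is illustrative.
calls = {}
with pysam.VariantFile("sv/sample_01.pacbio.small.20.bcf") as bcf:
    for rec in bcf:
        if rec.info.get("SVTYPE") == "INV":
            status = ";".join(rec.filter.keys()) or "PASS"
            calls[status] = calls.get(status, 0) + 1
print(calls)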
Aggregate Results
There's a lot of data now, and we need to aggregate it into something we can explore. The simplest way to do that is with bcftools query, appending the information to one big summary file. We are interested in these columns specifically:
sample name
inversion size
depth treatment
platform name
start/end breakpoints
inversion quality
The “proper” approach using subprocess was taking too long to figure out with all of the quotation marks in bcftools query, so we're going to cheat with os.system() and a BASH appending redirect (>>) to write everything to longread_workflow/longread.inversions.summary.
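For reference, the quoting headache largely disappears if subprocess is given an argument list instead of a shell string, since no shell parsing is involved; this is a sketch of what that could look like (illustrative BCF path), not what was actually run:
import subprocess

# Passing arguments as a list sidesteps shell quoting entirely;
# this sketches the "proper" approach, not the code used below.
bcf = "sv/sample_01.pacbio.small.20.bcf"
result = subprocess.run(
    ["bcftools", "query",
     "-i", 'INFO/SVTYPE="INV"',
     "-f", "%CHROM\\t%POS\\t%INFO/END\\t[%FT]\\n",
     bcf],
    capture_output=True, text=True, check=True
)
print(result.stdout, end="")
The os.system() version that was actually used follows.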
import os

# basic bash command
# bcftools query -i 'INFO/SVTYPE = "INV"' -f "%CHROM\t%POS\t%INFO/END\t[%FT]\n" FILE.bcf
samples = ["sample_%02d" % i for i in range(1,11)]
sizes = ["small", "medium", "large", "xl"]
depths = ["05", "2", "5", "10", "20"]
platforms = ["pacbio", "ontshort", "ontlong"]

# write the header first, then let bcftools append below it
with open("longread.inversions.summary", "w") as summary:
    summary.write("\t".join(["sample", "size", "depth", "platform", "chrom", "start", "end", "quality"]) + "\n")

for _sample in samples:
    for _size in sizes:
        for _depth in depths:
            for _platform in platforms:
                bcf = f"sv/{_sample}.{_platform}.{_size}.{_depth}.bcf"
                __depth = "0.5" if _depth == "05" else _depth
                query = f'/programs/bcftools-1.20/bin/bcftools query -i \'INFO/SVTYPE="INV"\' -f "{_sample}\\t{_size}\\t{__depth}\\t{_platform}\\t%CHROM\\t%POS\\t%INFO/END\\t[%FT]\\n" {bcf}'
                os.system(query + " >> longread.inversions.summary")

# sanity check: print the first 10 lines of the summary
with open("longread.inversions.summary", "r") as summary:
    i = 0
    for line in summary:
        i += 1
        print(line.strip())
        if i == 10:
            break
sample size depth platform chrom start end quality
sample_01 small 0.5 pacbio 2L 3193195 3214220 LowQual
sample_01 small 0.5 ontshort 2L 20107900 20113694 LowQual
sample_01 small 2 pacbio 2L 3193195 3214220 LowQual
sample_01 small 2 pacbio 2L 3940210 3944862 PASS
sample_01 small 2 pacbio 2L 3940211 3944863 PASS
sample_01 small 2 pacbio 2L 13024406 13026575 PASS
sample_01 small 2 pacbio 2L 14845710 14861199 LowQual
sample_01 small 2 pacbio 2L 20107900 20113694 PASS
sample_01 small 2 pacbio 2L 20107901 20113695 PASS
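From here, the summary file loads straight into a dataframe for exploration; a minimal sketch, assuming pandas is available, tabulating the calls per platform and depth by their FILTER status:
import pandas as pd

# Load the aggregated inversion calls written above.
df = pd.read_csv("longread.inversions.summary", sep="\t")

# Calls per platform x depth, split by FILTER status (PASS vs LowQual)
counts = (df.groupby(["platform", "depth", "quality"])
            .size()
            .unstack(fill_value=0))
print(counts)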