import os
Simulating the inversions¶
Prior to simulating the samples, we need to create the inversions themselves, which will be manually modified to meet the treatment goals listed below. Since each chromosome will be a technical replicate of different “conditions”, we need to simulate different inversions on each one, which is more tedious but ultimately worth it.
%%bash
CONTIGS=("2L" "2R" "3R" "3L")
SIZES=("small" "medium" "large" "xl")
for size in "${SIZES[@]}"; do
for contig in "${CONTIGS[@]}"; do
harpy simulate inversion -n 5 --max-size 25000 --min-size 1000 -o simulated_data/inversions/${size}/${contig} ${contig}.fasta
done
done
Creating inversion treatments for individuals¶
We need to create the inversion treatments, which is the way the inversions are sized and distributed among the contigs. The design is such that:
- 2R: all heterozygote
- 2L: all homozygote
- 3R: inversions are common
- 3L: inversions are rare The dictionaries below govern which variants go into which haplotype for each sample, following the format
sample_id : [hap1, hap2]
Small inversions¶
SMALL = {
"2R" : {
"sample_01" : [[1,3,5], [2,4]],
"sample_02" : [[1,2], [3,4,5]],
"sample_03" : [[1,5], [2,3,4]],
"sample_04" : [[1,4,5], [2,3]],
"sample_05" : [[1,2,3,4,5], []],
"sample_06" : [[1,2,3,4], [5]],
"sample_07" : [[1,2,5], [3,4]],
"sample_08" : [[1,2,4,5], [3]],
"sample_09" : [[1,3,4,5], [2]],
"sample_10" : [[1,2,3,5], [4]]
},
"2L" : {
"sample_01" : [[1,2,3,4,5], [1,2,3,4,5]],
"sample_02" : [[1,2,3,4,5], [1,2,3,4,5]],
"sample_03" : [[1,2,3,4,5], [1,2,3,4,5]],
"sample_04" : [[1,2,3,4,5], [1,2,3,4,5]],
"sample_05" : [[1,2,3,4,5], [1,2,3,4,5]],
"sample_06" : [[1,2,3,4,5], [1,2,3,4,5]],
"sample_07" : [[1,2,3,4,5], [1,2,3,4,5]],
"sample_08" : [[1,2,3,4,5], [1,2,3,4,5]],
"sample_09" : [[1,2,3,4,5], [1,2,3,4,5]],
"sample_10" : [[1,2,3,4,5], [1,2,3,4,5]]
},
"3R" : {
"sample_01" : [[1,3,4,5], [1,3,5]],
"sample_02" : [[1,3,4,5], [1,3]],
"sample_03" : [[1,3,4], [1,4]],
"sample_04" : [[1,2,3], [3]],
"sample_05" : [[1,2,3], [2,5]],
"sample_06" : [[1,2,3], [2,3]],
"sample_07" : [[1,2,3], [2]],
"sample_08" : [[2,3,5], [2]],
"sample_09" : [[2,3,4], []],
"sample_10" : [[2,4,5], [4,5]]
},
"3L" : {
"sample_01" : [[1], []],
"sample_02" : [[2], [2]],
"sample_03" : [[], [3]],
"sample_04" : [[4], [4]],
"sample_05" : [[4], []],
"sample_06" : [[], [1]],
"sample_07" : [[4], []],
"sample_08" : [[5], [5]],
"sample_09" : [[5], [5]],
"sample_10" : [[5], []]
}
}
Medium Inversions¶
Deep into the project (final data analysis) we discovered that sample_06
had a typo of 44
for the second genotype on 2L
, which meant it was in the heterozygous state for 4
for the analyses.
MEDIUM = {
"2R" : {
"sample_01" : [[1,3], [2,4]],
"sample_02" : [[1,2], [3,4]],
"sample_03" : [[1], [2,3,4]],
"sample_04" : [[1,4], [2,3]],
"sample_05" : [[1,2,3,4], []],
"sample_06" : [[1,2,3,4], []],
"sample_07" : [[1,2], [3,4]],
"sample_08" : [[1,2,4], [3]],
"sample_09" : [[1,3,4], [2]],
"sample_10" : [[1,2,3], [4]]
},
"2L" : {
"sample_01" : [[1,2,3,4], [1,2,3,4]],
"sample_02" : [[1,2,3,4], [1,2,3,4]],
"sample_03" : [[1,2,3,4], [1,2,3,4]],
"sample_04" : [[1,2,3,4], [1,2,3,4]],
"sample_05" : [[1,2,3,4], [1,2,3,4]],
"sample_06" : [[1,2,3,4], [1,2,3,4]], # there was originally a typo here and inversion 4 was accidentally put in the heterozygous state
"sample_09" : [[1,2,3,4], [1,2,3,4]],
"sample_10" : [[1,2,3,4], [1,2,3,4]]
},
"3R" : {
"sample_01" : [[1,3,4], [1,3]],
"sample_02" : [[1,3,4], [1,3]],
"sample_03" : [[1,3,4], [1,4]],
"sample_04" : [[1,2,3], [3]],
"sample_05" : [[1,2,3], [2]],
"sample_06" : [[1,2,3], [2,3]],
"sample_07" : [[1,2,3], [2]],
"sample_08" : [[2,3], [2]],
"sample_09" : [[2,3,4], []],
"sample_10" : [[2,4], [4]]
},
"3L" : {
"sample_01" : [[1], []],
"sample_02" : [[2], [2]],
"sample_03" : [[], [3]],
"sample_04" : [[4], [4]],
"sample_05" : [[4], []],
"sample_06" : [[], [1]],
"sample_07" : [[4], []],
"sample_08" : [[], []],
"sample_09" : [[1], [1]],
"sample_10" : [[], [3]]
}
}
Large Inversions¶
LARGE = {
"2R" : {
"sample_01" : [[1], [2]],
"sample_02" : [[], [1,2]],
"sample_03" : [[1], [2]],
"sample_04" : [[1], [2]],
"sample_05" : [[1,2], []],
"sample_06" : [[], [1,2]],
"sample_07" : [[2], [1]],
"sample_08" : [[1,2], []],
"sample_09" : [[1], [2]],
"sample_10" : [[1,2], []]
},
"2L" : {
"sample_01" : [[1,2], [1,2]],
"sample_02" : [[1,2], [1,2]],
"sample_03" : [[1,2], [1,2]],
"sample_04" : [[1,2], [1,2]],
"sample_05" : [[1,2], [1,2]],
"sample_06" : [[1,2], [1,2]],
"sample_07" : [[1,2], [1,2]],
"sample_08" : [[1,2], [1,2]],
"sample_09" : [[1,2], [1,2]],
"sample_10" : [[1,2], [1,2]]
},
"3R" : {
"sample_01" : [[1], [1]],
"sample_02" : [[1], [1]],
"sample_03" : [[1], [2]],
"sample_04" : [[1,2], []],
"sample_05" : [[1,2], [1,2]],
"sample_06" : [[1,2], [2]],
"sample_07" : [[1], [1,2]],
"sample_08" : [[2], [2]],
"sample_09" : [[2], []],
"sample_10" : [[2], []]
},
"3L" : {
"sample_01" : [[1], []],
"sample_02" : [[2], [2]],
"sample_03" : [[], []],
"sample_04" : [[], []],
"sample_05" : [[], []],
"sample_06" : [[], [1]],
"sample_07" : [[], []],
"sample_08" : [[], []],
"sample_09" : [[1], [1]],
"sample_10" : [[], []]
}
}
XL Inversions¶
XL = {
"2R" : {
"sample_01" : [[1], []],
"sample_02" : [[], [1]],
"sample_03" : [[1], []],
"sample_04" : [[1], []],
"sample_05" : [[1], []],
"sample_06" : [[], [1]],
"sample_07" : [[], [1]],
"sample_08" : [[1], []],
"sample_09" : [[1], []],
"sample_10" : [[1], []]
},
"2L" : {
"sample_01" : [[1], [1]],
"sample_02" : [[1], [1]],
"sample_03" : [[1], [1]],
"sample_04" : [[1], [1]],
"sample_05" : [[1], [1]],
"sample_06" : [[1], [1]],
"sample_07" : [[1], [1]],
"sample_08" : [[1], [1]],
"sample_09" : [[1], [1]],
"sample_10" : [[1], [1]]
},
"3R" : {
"sample_01" : [[1], [1]],
"sample_02" : [[1], [1]],
"sample_03" : [[1], []],
"sample_04" : [[], []],
"sample_05" : [[1], [1]],
"sample_06" : [[1], []],
"sample_07" : [[1], [1]],
"sample_08" : [[], []],
"sample_09" : [[], [1]],
"sample_10" : [[], [1]]
},
"3L" : {
"sample_01" : [[1], []],
"sample_02" : [[], []],
"sample_03" : [[], []],
"sample_04" : [[], []],
"sample_05" : [[], []],
"sample_06" : [[], [1]],
"sample_07" : [[], []],
"sample_08" : [[], []],
"sample_09" : [[1], [1]],
"sample_10" : [[], []]
}
}
Write the inversions¶
Using the schema from the dictionaries above, the inversions need to be written to VCF files so that we can simulate them with LRSIM (via harpy)
inv_inventory = {
"small" : SMALL,
"medium": MEDIUM,
"large" : LARGE,
"xl": XL
}
for inv,invdict in inv_inventory.items():
with open(f"simulated_data/inverions/{inv}/inv.{inv}.vcf", "r") as vcf:
inversions = {}
header = ""
for line in vcf:
if line.startswith("#"):
header += line
else:
inv_split = line.split()
contig = inv_split[0]
if contig not in inversions:
inversions[contig] = [line]
else:
inversions[contig].append(line)
os.makedirs(f"simulated_data/inversions/{inv}/samples", exist_ok = True)
samples = {}
for contig in invdict:
for sample,haps in invdict[contig].items():
hap1,hap2 = haps
inv_hap1 = [inversions[contig][i-1] for i in hap1]
inv_hap2 = [inversions[contig][i-1] for i in hap2]
if sample not in samples:
samples[sample] = {"hap1" : inv_hap1, "hap2" : inv_hap2}
else:
samples[sample]["hap1"] += inv_hap1
samples[sample]["hap2"] += inv_hap2
for sample in samples:
with (
open(f"simulated_data/inversions/{inv}/samples/{sample}.{inv}.hap1.vcf", "w") as hap1_vcf,
open(f"simulated_data/inversions/{inv}/samples/{sample}.{inv}.hap2.vcf", "w") as hap2_vcf
):
_ = hap1_vcf.write(header)
_ = hap1_vcf.write("".join(samples[sample]["hap1"]))
_ = hap2_vcf.write(header)
_ = hap2_vcf.write("".join(samples[sample]["hap2"]))
We’re going to need an inventory of the inversions later for assessing the called inversions so let’s generate the file now. It might make sense to print it in long-format, so let’s do that. It should look like:
sample | size | contig | inversion | present | state |
---|---|---|---|---|---|
sample_01 | small | 2L | 1 | TRUE | hom |
sample_01 | small | 2L | 2 | FALSE | het |
First, make a function that will assess the presence/absence of an inversion in a sample and whether it’s in a homozygous or heterozygous state
def zygosity(invs, genos):
geno1, geno2 = genos
geno1_present = invs in geno1
geno2_present = invs in geno2
TF = geno1_present + geno2_present
present = "FALSE" if TF == 0 else "TRUE"
state = "NA" if TF == 0 else "het"
if TF == 2:
state = "hom"
return present, state
The next part is going deep into iteration and using zygosity()
to return the presence and state of an inversion at a given contig for a given sample of a given size treatment. It’s kind of a lot, but the code comments help keep track of what’s getting iterated when. The itertools.chain
function helps collapse the nested lists of the genotypes.
import itertools
with open("inversion_simulations.inventory", "w") as inventory:
inventory.write("\t".join(["sample", "size","contig","inversion","present","state"]) + "\n")
for size_treatment, contig in inv_inventory.items():
# size_treatment = "small", "medium", etc.
# contig = dict{contig : {sample: [[hap1], [hap2]]}}
for cntg,smpl in contig.items():
# cntg = "2R", "3L" etc
# smpl = {sample: [[hap1], [hap2]]}
# collapse the nested lists and get the max number of inversions
max_inversions = max(list(itertools.chain.from_iterable(itertools.chain.from_iterable(smpl.values()))))
inversions = range(1, max_inversions + 1)
for samplename,genotype in smpl.items():
for i in inversions:
present, state = zygosity(i, genotype)
printline = f"{samplename}\t{size_treatment}\t{cntg}\t{i}\t{present}\t{state}\n"
inventory.write(printline)
The output send up looking like this (truncated for sanity)
sample size contig inversion present state
sample_01 small 2R 1 TRUE het
sample_01 small 2R 2 TRUE het
sample_01 small 2R 3 TRUE het
sample_01 small 2R 4 TRUE het
sample_01 small 2R 5 TRUE het
...
sample_06 xl 3L 1 TRUE het
sample_07 xl 3L 1 FALSE NA
sample_08 xl 3L 1 FALSE NA
sample_09 xl 3L 1 TRUE hom
sample_10 xl 3L 1 FALSE NA
We also need to output the sample “pools” for when we attempt pooled-sample calling.
with open("inversion_simulations.pool.inventory", "w") as inventory:
inventory.write("\t".join(["sample", "size","contig","inversion","present","state"]) + "\n")
for size_treatment, contig in inv_inventory.items():
# size_treatment = "small", "medium", etc.
# contig = dict{contig : {sample: [[hap1], [hap2]]}}
for cntg,smpl in contig.items():
# cntg = "2R", "3L" etc
# smpl = {sample: [[hap1], [hap2]]}
# collapse the nested lists and get the max number of inversions
max_inversions = max(list(itertools.chain.from_iterable(itertools.chain.from_iterable(smpl.values()))))
inversions = range(1, max_inversions + 1)
# collapse the haplotypes across samples into a set for each
haplo1 = set(itertools.chain.from_iterable([i[0] for i in smpl.values()]))
haplo2 = set(itertools.chain.from_iterable([i[1] for i in smpl.values()]))
for i in inversions:
present, state = zygosity(i, [haplo1, haplo2])
printline = f"pool\t{size_treatment}\t{cntg}\t{i}\t{present}\t{state}\n"
inventory.write(printline)
Simulating individual genomes with known inversions¶
Now that the inversions have been partitioned for each individual for each size treatment class, they can be simulated for every individual. Since the inversions across chromosomes have been combined into a single VCF file, we no longer need to do this per chromosome (thankfully).
%%bash
sizes=("small" "medium" "large" "xl")
for i in "${sizes[@]}"; do
for j in $(seq -w 1 1 10); do
for hap in $(seq 1 2); do
harpy simulate inversion --quiet \
-o simulated_data/inversions/${i}/samples/sample_${j}/hap${hap} \
-v simulated_data/inversions/${i}/samples/sample_${j}.${i}.hap${hap}.vcf dmel_genome/dmel_2_3.fa
mv simulated_data/inversions/${i}/samples/sample_${j}/hap${hap}/sim.fasta simulated_data/${i}/samples/sample_${j}/hap${hap}/sample_${j}.hap${hap}.fasta
rm -r simulated_data/inversions/${i}/samples/sample_${j}/hap${hap}/logs
done
done
done
Simulating SNPs¶
So there is variation beyond just inversions, we need to simulate SNPs as well. First, we’ll simulate 120,000 random SNPs, then using some basic Python wizardry, randomly move those SNPs into haplotype 1 or haplotype 2 for each individual. There will be a 70% chance that a SNP will appear in either haplotype, meaning some might be homozygote, heterozygote, or missing entirely (which is fine). The initial simulation can be done for the entire assembly at once, rather than per-chromosome and/or per size-treatment.
%%bash
harpy simulate snpindel --prefix all --snp-count 120000 -o simulated_data/snps dmel_genome/dmel_2_3.fa
Now that we have a general set of SNPs, we can randomly shuffle them into haplotypes for each sample.
import random
import os
samples = [f"sample_{i:02}" for i in range(1,11)]
os.makedirs("simulated_data/snps/per_sample", exist_ok = True)
rng = random.Random(69)
for sample in samples:
with(
open("simulated_data/snps/all.snp.vcf", "r") as vcf,
open(f"simulated_data/snps/per_sample/{sample}.hap1.snp.vcf", "w") as hap1,
open(f"simulated_data/snps/per_sample/{sample}.hap2.snp.vcf", "w") as hap2
):
for line in vcf:
if line.startswith("#"):
hap1.write(line)
hap2.write(line)
continue
for hap in [hap1,hap2]:
if rng.random() < 0.7:
hap.write(line)
Now, using the same approach as with the inversions, we can simulate the known SNPs onto the sample genomes. simuG takes a while here, so the commands are echo
d to a text file that will be consumed by parallel
for concurrency.
%%bash
sizes=("small" "medium" "large" "xl")
mkdir -p simulated_data/inversions_snps
for size in "${sizes[@]}"; do
for samp in $(seq -w 1 1 10); do
SAMPLE="sample_${samp}"
for hap in $(seq 1 2); do
FASTA="simulated_data/inversions/${size}/samples/${SAMPLE}/${SAMPLE}.hap${hap}.fasta"
SNPS="simulated_data/snps/per_sample/${SAMPLE}.hap${hap}.snp.vcf"
echo "harpy simulate snpindel --quiet --prefix ${SAMPLE}_snpinv.hap${hap} --snp-vcf $SNPS -o simulated_data/inversions_snps/${size}/${SAMPLE}/hap${hap} $FASTA" >> simulated_data/inversions_snps/sim_snps.sh
done
done
done
#parallel :::: sim.inversions/inversions_snps/sim_snps.sh
For some reason, simuG is acting up for manual [known] SNP simulation, so using the VCFs of known SNPs that were created, let’s just manually change the bases. Below is a simple simulate_snps()
function to accomplish this.
import os
for size in ["small", "medium", "large", "xl"]:
os.makedirs(f"simulated_data/inversions_snps2/{size}", exist_ok = True)
for sample in [f"sample_{i:02}" for i in range(1,11)]:
for hap in [1,2]:
SNPS=f"simulated_data/snps/per_sample/{sample}.hap{hap}.snp.vcf"
FASTA=f"simulated_data/inversions/{size}/samples/{sample}/{sample}.hap{hap}.fasta"
FASTA_OUT=f"simulated_data/inversions_snps2/{size}/{sample}.snp_inv.hap{hap}.fasta.gz"
simulate_snps(SNPS, FASTA, FASTA_OUT)
As a basic housekeeping thing, let’s gzip compress the fasta files and delete the logs
and workflow
folders to reduce disk footprint.
%%bash
sizes=("small" "medium" "large" "xl")
for size in "${sizes[@]}"; do
DIR="simulated_data/inversions_snps/${size}"
for samp in $(seq -w 1 1 10); do
SAMPLE="sample_${samp}"
for hap in $(seq 1 2); do
bgzip -c ${DIR}/${SAMPLE}/hap${hap}/${SAMPLE}_snpinv.hap${hap}.fasta > ${DIR}/${SAMPLE}.snp_inv.hap${hap}.fasta.gz
done
rm -r simulated_data/inversions_snps/${size}/${SAMPLE}/
done
done