01 Simulate Variants - Inversion Simulations

import os

Simulating the inversions¶

Prior to simulating the samples, we need to create the inversions themselves, which will be manually modified to meet the treatment goals listed below. Since each chromosome will be a technical replicate of different “conditions”, we need to simulate different inversions on each one, which is more tedious but ultimately worth it. The code below creates 5 random inversions, but this is really just to get the correct VCF format from the software-- the inversions will be manually modified.

%%bash
CONTIGS=("2L" "2R" "3R" "3L")
SIZES=("small" "medium" "large" "xl")

for size in "${SIZES[@]}"; do
    for contig in "${CONTIGS[@]}"; do
        harpy simulate inversion -n 5 --max-size 25000 --min-size 1000 -o simulated_variants/setup_inversions/${size}/${contig} ${contig}.fasta
    done
done

Creating inversion treatments for individuals¶

We need to create the inversion treatments, which is the way the inversions are sized and distributed among the contigs. The design is such that:

2R: all heterozygote
2L: all homozygote
3R: inversions are common
3L: inversions are rare The dictionaries below govern which variants go into which haplotype for each sample, following the format

sample_id : [hap1, hap2]

Small inversions¶

SMALL = {
    "2R" : {
        "sample_01" : [[1,3,5], [2,4]],
        "sample_02" : [[1,2], [3,4,5]],
        "sample_03" : [[1,5], [2,3,4]],
        "sample_04" : [[1,4,5], [2,3]],
        "sample_05" : [[1,2,3,4,5], []],
        "sample_06" : [[1,2,3,4], [5]],
        "sample_07" : [[1,2,5], [3,4]],
        "sample_08" : [[1,2,4,5], [3]],
        "sample_09" : [[1,3,4,5], [2]],
        "sample_10" : [[1,2,3,5], [4]]
    },
    "2L" : {
        "sample_01" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_02" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_03" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_04" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_05" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_06" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_07" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_08" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_09" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_10" : [[1,2,3,4,5], [1,2,3,4,5]]
    },
    "3R" : {
        "sample_01" : [[1,3,4,5], [1,3,5]],
        "sample_02" : [[1,3,4,5], [1,3]],
        "sample_03" : [[1,3,4], [1,4]],
        "sample_04" : [[1,2,3], [3]],
        "sample_05" : [[1,2,3], [2,5]],
        "sample_06" : [[1,2,3], [2,3]],
        "sample_07" : [[1,2,3], [2]],
        "sample_08" : [[2,3,5], [2]],
        "sample_09" : [[2,3,4], []],
        "sample_10" : [[2,4,5], [4,5]]
    },
    "3L" : {
        "sample_01" : [[1], []],
        "sample_02" : [[2], [2]],
        "sample_03" : [[], [3]],
        "sample_04" : [[4], [4]],
        "sample_05" : [[4], []],
        "sample_06" : [[], [1]],
        "sample_07" : [[4], []],
        "sample_08" : [[5], [5]],
        "sample_09" : [[5], [5]],
        "sample_10" : [[5], []]
    }
}

Medium Inversions¶

MEDIUM = {
    "2R" : {
        "sample_01" : [[1,3], [2,4]],
        "sample_02" : [[1,2], [3,4]],
        "sample_03" : [[1], [2,3,4]],
        "sample_04" : [[1,4], [2,3]],
        "sample_05" : [[1,2,3,4], []],
        "sample_06" : [[1,2,3,4], []],
        "sample_07" : [[1,2], [3,4]],
        "sample_08" : [[1,2,4], [3]],
        "sample_09" : [[1,3,4], [2]],
        "sample_10" : [[1,2,3], [4]]
    },
    "2L" : {
        "sample_01" : [[1,2,3,4], [1,2,3,4]],
        "sample_02" : [[1,2,3,4], [1,2,3,4]],
        "sample_03" : [[1,2,3,4], [1,2,3,4]],
        "sample_04" : [[1,2,3,4], [1,2,3,4]],
        "sample_05" : [[1,2,3,4], [1,2,3,4]],
        "sample_06" : [[1,2,3,4], [1,2,3,4]],
        "sample_07" : [[1,2,3,4], [1,2,3,4]],
        "sample_08" : [[1,2,3,4], [1,2,3,4]],
        "sample_09" : [[1,2,3,4], [1,2,3,4]],
        "sample_10" : [[1,2,3,4], [1,2,3,4]]
    },
    "3R" : {
        "sample_01" : [[1,3,4], [1,3]],
        "sample_02" : [[1,3,4], [1,3]],
        "sample_03" : [[1,3,4], [1,4]],
        "sample_04" : [[1,2,3], [3]],
        "sample_05" : [[1,2,3], [2]],
        "sample_06" : [[1,2,3], [2,3]],
        "sample_07" : [[1,2,3], [2]],
        "sample_08" : [[2,3], [2]],
        "sample_09" : [[2,3,4], []],
        "sample_10" : [[2,4], [4]]
    },
    "3L" : {
        "sample_01" : [[1], []],
        "sample_02" : [[2], [2]],
        "sample_03" : [[], [3]],
        "sample_04" : [[4], [4]],
        "sample_05" : [[4], []],
        "sample_06" : [[], [1]],
        "sample_07" : [[4], []],
        "sample_08" : [[], []],
        "sample_09" : [[1], [1]],
        "sample_10" : [[], [3]]
    }
}

Large Inversions¶

LARGE = {
    "2R" : {
        "sample_01" : [[1], [2]],
        "sample_02" : [[], [1,2]],
        "sample_03" : [[1], [2]],
        "sample_04" : [[1], [2]],
        "sample_05" : [[1,2], []],
        "sample_06" : [[], [1,2]],
        "sample_07" : [[2], [1]],
        "sample_08" : [[1,2], []],
        "sample_09" : [[1], [2]],
        "sample_10" : [[1,2], []]
    },
    "2L" : {
        "sample_01" : [[1,2], [1,2]],
        "sample_02" : [[1,2], [1,2]],
        "sample_03" : [[1,2], [1,2]],
        "sample_04" : [[1,2], [1,2]],
        "sample_05" : [[1,2], [1,2]],
        "sample_06" : [[1,2], [1,2]],
        "sample_07" : [[1,2], [1,2]],
        "sample_08" : [[1,2], [1,2]],
        "sample_09" : [[1,2], [1,2]],
        "sample_10" : [[1,2], [1,2]]
    },
    "3R" : {
        "sample_01" : [[1], [1]],
        "sample_02" : [[1], [1]],
        "sample_03" : [[1], [2]],
        "sample_04" : [[1,2], []],
        "sample_05" : [[1,2], [1,2]],
        "sample_06" : [[1,2], [2]],
        "sample_07" : [[1], [1,2]],
        "sample_08" : [[2], [2]],
        "sample_09" : [[2], []],
        "sample_10" : [[2], []]
    },
    "3L" : {
        "sample_01" : [[1], []],
        "sample_02" : [[2], [2]],
        "sample_03" : [[], []],
        "sample_04" : [[], []],
        "sample_05" : [[], []],
        "sample_06" : [[], [1]],
        "sample_07" : [[], []],
        "sample_08" : [[], []],
        "sample_09" : [[1], [1]],
        "sample_10" : [[], []]
    }
}

XL Inversions¶

XL = {
    "2R" : {
        "sample_01" : [[1], []],
        "sample_02" : [[], [1]],
        "sample_03" : [[1], []],
        "sample_04" : [[1], []],
        "sample_05" : [[1], []],
        "sample_06" : [[], [1]],
        "sample_07" : [[], [1]],
        "sample_08" : [[1], []],
        "sample_09" : [[1], []],
        "sample_10" : [[1], []]
    },
    "2L" : {
        "sample_01" : [[1], [1]],
        "sample_02" : [[1], [1]],
        "sample_03" : [[1], [1]],
        "sample_04" : [[1], [1]],
        "sample_05" : [[1], [1]],
        "sample_06" : [[1], [1]],
        "sample_07" : [[1], [1]],
        "sample_08" : [[1], [1]],
        "sample_09" : [[1], [1]],
        "sample_10" : [[1], [1]]
    },
    "3R" : {
        "sample_01" : [[1], [1]],
        "sample_02" : [[1], [1]],
        "sample_03" : [[1], []],
        "sample_04" : [[], []],
        "sample_05" : [[1], [1]],
        "sample_06" : [[1], []],
        "sample_07" : [[1], [1]],
        "sample_08" : [[], []],
        "sample_09" : [[], [1]],
        "sample_10" : [[], [1]]
    },
    "3L" : {
        "sample_01" : [[1], []],
        "sample_02" : [[], []],
        "sample_03" : [[], []],
        "sample_04" : [[], []],
        "sample_05" : [[], []],
        "sample_06" : [[], [1]],
        "sample_07" : [[], []],
        "sample_08" : [[], []],
        "sample_09" : [[1], [1]],
        "sample_10" : [[], []]
    }
}

Write the inversions¶

Using the schema from the dictionaries above, the inversions need to be written to VCF files so that we can simulate them with LRSIM (via harpy)

inv_inventory = {
    "small" : SMALL,
    "medium": MEDIUM,
    "large" : LARGE,
    "xl": XL
}

for inv,invdict in inv_inventory.items():
    with open(f"simulated_variants/inversions_per_sample/inv.{inv}.vcf", "r") as vcf:
        inversions = {}
        header = ""
        for line in vcf:
            if line.startswith("#"):
                header += line
            else:
                inv_split = line.split()
                contig = inv_split[0]
                if contig not in inversions:
                    inversions[contig] = [line]
                else:
                    inversions[contig].append(line)

    os.makedirs(f"simulated_variants/inversions_per_sample/{inv}/", exist_ok = True)
    samples = {}
    for contig in invdict:
        for sample,haps in invdict[contig].items():
            hap1,hap2 = haps
            inv_hap1 = [inversions[contig][i-1] for i in hap1]
            inv_hap2 = [inversions[contig][i-1] for i in hap2]

            if sample not in samples:
                samples[sample] = {"hap1" : inv_hap1, "hap2" : inv_hap2}
            else:
                samples[sample]["hap1"] += inv_hap1
                samples[sample]["hap2"] += inv_hap2

    for sample in samples:
        with (
            open(f"simulated_variants/inversions_per_sample/{inv}/{sample}.{inv}.hap1.vcf", "w") as hap1_vcf,
            open(f"simulated_variants/inversions_per_sample/{inv}/{sample}.{inv}.hap2.vcf", "w") as hap2_vcf
        ):
            _ = hap1_vcf.write(header)
            _ = hap1_vcf.write("".join(samples[sample]["hap1"]))
            _ = hap2_vcf.write(header)
            _ = hap2_vcf.write("".join(samples[sample]["hap2"]))

Creating an inventory¶

We’re going to need an inventory of the inversions later for assessing the called inversions so let’s generate the file now. It might make sense to print it in long-format, so let’s do that. It should look like:

sample	size	contig	inversion	present	state
sample_01	small	2L	1	TRUE	hom
sample_01	small	2L	2	FALSE	het

First, make a function that will assess the presence/absence of an inversion in a sample and whether it’s in a homozygous or heterozygous state

def zygosity(invs, genos):
    geno1, geno2 = genos
    geno1_present = invs in geno1
    geno2_present = invs in geno2
    TF = geno1_present + geno2_present
    present = "FALSE" if TF == 0 else "TRUE"
    state = "NA" if TF == 0 else "het"
    if TF == 2:
        state = "hom"
    return present, state

The next part is going deep into iteration and using zygosity() to return the presence and state of an inversion at a given contig for a given sample of a given size treatment. It’s kind of a lot, but the code comments help keep track of what’s getting iterated when. The itertools.chain function helps collapse the nested lists of the genotypes.

import itertools

with open("simulated_variants/inversions_per_sample/inversion_simulations.inventory", "w") as inventory:
    inventory.write("\t".join(["sample", "size","contig","inversion","present","state"]) + "\n")
    for size_treatment, contig in inv_inventory.items():
        # size_treatment = "small", "medium", etc.
        # contig = dict{contig : {sample: [[hap1], [hap2]]}}
        for cntg,smpl in contig.items():
            # cntg = "2R", "3L" etc
            # smpl = {sample: [[hap1], [hap2]]}
            # collapse the nested lists and get the max number of inversions
            max_inversions = max(list(itertools.chain.from_iterable(itertools.chain.from_iterable(smpl.values()))))
            inversions = range(1, max_inversions + 1)
            for samplename,genotype in smpl.items():
                for i in inversions:
                    present, state = zygosity(i, genotype)
                    printline = f"{samplename}\t{size_treatment}\t{cntg}\t{i}\t{present}\t{state}\n"
                    inventory.write(printline)

The output send up looking like this (truncated for sanity)

sample	size	contig	inversion	present	state
sample_01	small	2R	1	TRUE	het
sample_01	small	2R	2	TRUE	het
sample_01	small	2R	3	TRUE	het
sample_01	small	2R	4	TRUE	het
sample_01	small	2R	5	TRUE	het
...
sample_06	xl	3L	1	TRUE	het
sample_07	xl	3L	1	FALSE	NA
sample_08	xl	3L	1	FALSE	NA
sample_09	xl	3L	1	TRUE	hom
sample_10	xl	3L	1	FALSE	NA

We also need to output the sample “pools” for when we attempt pooled-sample calling.

with open("simulated_variants/inversions_per_sample/inversion_simulations.pool.inventory", "w") as inventory:
    inventory.write("\t".join(["sample", "size","contig","inversion","present","state"]) + "\n")
    for size_treatment, contig in inv_inventory.items():
        # size_treatment = "small", "medium", etc.
        # contig = dict{contig : {sample: [[hap1], [hap2]]}}
        for cntg,smpl in contig.items():
            # cntg = "2R", "3L" etc
            # smpl = {sample: [[hap1], [hap2]]}
            # collapse the nested lists and get the max number of inversions
            max_inversions = max(list(itertools.chain.from_iterable(itertools.chain.from_iterable(smpl.values()))))
            inversions = range(1, max_inversions + 1)
            # collapse the haplotypes across samples into a set for each
            haplo1 = set(itertools.chain.from_iterable([i[0] for i in smpl.values()]))
            haplo2 = set(itertools.chain.from_iterable([i[1] for i in smpl.values()]))
            for i in inversions:
                present, state = zygosity(i, [haplo1, haplo2])
                printline = f"pool\t{size_treatment}\t{cntg}\t{i}\t{present}\t{state}\n"
                inventory.write(printline)

Simulating individual genomes with known inversions¶

Now that the inversions have been partitioned for each individual for each size treatment class, they can be simulated for every individual. Since the inversions across chromosomes have been combined into a single VCF file, we no longer need to do this per chromosome (thankfully).

%%bash
sizes=("small" "medium" "large" "xl")


for i in "${sizes[@]}"; do
    mkdir -p simulated_variants/inversions_per_sample/${i}/samples
    for j in $(seq -w 1 1 10); do
        for hap in $(seq 1 2); do
            OUTDIR="simulated_variants/inversions_per_sample/${i}/samples/sample_${j}/hap${hap}"
            harpy simulate inversion --quiet \
                -o ${OUTDIR} \
                -v simulated_variants/inversions_per_sample/${i}/sample_${j}.${i}.hap${hap}.vcf dmel_genome/dmel_2_3.fa
            mv ${OUTDIR}/sim.fasta simulated_variants/inversions_per_sample/${i}/sample_${j}.hap${hap}.fasta
            rm -r ${OUTDIR}
        done
    done
done

Simulating SNPs¶

So there is variation beyond just inversions, we need to simulate SNPs as well. First, we’ll simulate 120,000 random SNPs, then using some basic Python wizardry, randomly move those SNPs into haplotype 1 or haplotype 2 for each individual. There will be a 70% chance that a SNP will appear in either haplotype, meaning some might be homozygote, heterozygote, or missing entirely (which is fine). The initial simulation can be done for the entire assembly at once, rather than per-chromosome and/or per size-treatment.

%%bash
harpy simulate snpindel --prefix all --snp-count 120000 -o simulated_variants/setup_snps dmel_genome/dmel_2_3.fa

Now that we have a general set of SNPs, we can randomly shuffle them into haplotypes for each sample.

import random
import os

samples = [f"sample_{i:02}" for i in range(1,11)]
os.makedirs("simulated_variants/setup_snps/per_sample", exist_ok = True)
rng  = random.Random(69)
for sample in samples:
    with(
        open("simulated_variants/setup_snps/all.snp.vcf", "r") as vcf,
        open(f"simulated_variants/setup_snps/per_sample/{sample}.hap1.snp.vcf", "w") as hap1,
        open(f"simulated_variants/setup_snps/per_sample/{sample}.hap2.snp.vcf", "w") as hap2
    ):
        for line in vcf:
            if line.startswith("#"):
                hap1.write(line)
                hap2.write(line)
                continue
            for hap in [hap1,hap2]:
                if rng.random() < 0.7:
                    hap.write(line)

Now, using the same approach as with the inversions, we can simulate the known SNPs onto the sample genomes. simuG takes a while here, so the commands are echod to a text file that will be consumed by parallel for concurrency.

%%bash
sizes=("small" "medium" "large" "xl")
mkdir -p simulated_variants/inversions_snps
for size in "${sizes[@]}"; do
    for samp in $(seq -w 1 1 10); do
        SAMPLE="sample_${samp}"
        for hap in $(seq 1 2); do
            FASTA="simulated_variants/inversions_per_sample/${size}/${SAMPLE}.hap${hap}.fasta"
            SNPS="simulated_variants/setup_snps/per_sample/${SAMPLE}.hap${hap}.snp.vcf"
            echo "harpy simulate snpindel --quiet 2 --prefix ${SAMPLE}_snpinv.hap${hap} --snp-vcf $SNPS -o simulated_variants/inversions_snps_genomes/${size}/${SAMPLE}/hap${hap} $FASTA" >> simulated_variants/inversions_snps_genomes/sim_snps.sh
        done
    done
done
#parallel :::: sim.inversions/inversions_snps/sim_snps.sh

BGzip compression¶

As a basic housekeeping thing, let’s gzip compress the fasta files and delete the logs and workflow folders to reduce disk footprint.

%%bash
sizes=("small" "medium" "large" "xl")
for size in "${sizes[@]}"; do
    DIR="simulated_variants/inversions_snps_genomes/${size}"
    for samp in $(seq -w 1 1 10); do
        SAMPLE="sample_${samp}"
        for hap in $(seq 1 2); do
            bgzip -c ${DIR}/${SAMPLE}/hap${hap}/${SAMPLE}_snpinv.hap${hap}.fasta > ${DIR}/${SAMPLE}.snp_inv.hap${hap}.fasta.gz
        done
        rm -r simulated_variants/inversions_snps/${size}/${SAMPLE}/
    done
done

Inversion Simulations

02 Simulate Reads