Skip to article frontmatterSkip to article content

01 Simulate Variants

import os

Simulating the inversions

Prior to simulating the samples, we need to create the inversions themselves, which will be manually modified to meet the treatment goals listed below. Since each chromosome will be a technical replicate of different “conditions”, we need to simulate different inversions on each one, which is more tedious but ultimately worth it.

%%bash
CONTIGS=("2L" "2R" "3R" "3L")
SIZES=("small" "medium" "large" "xl")

for size in "${SIZES[@]}"; do
    for contig in "${CONTIGS[@]}"; do
        harpy simulate inversion -n 5 --max-size 25000 --min-size 1000 -o simulated_data/inversions/${size}/${contig} ${contig}.fasta
    done
done

Creating inversion treatments for individuals

We need to create the inversion treatments, which is the way the inversions are sized and distributed among the contigs. The design is such that:

  • 2R: all heterozygote
  • 2L: all homozygote
  • 3R: inversions are common
  • 3L: inversions are rare The dictionaries below govern which variants go into which haplotype for each sample, following the format
sample_id : [hap1, hap2]

Small inversions

SMALL = {
    "2R" : {
        "sample_01" : [[1,3,5], [2,4]],
        "sample_02" : [[1,2], [3,4,5]],
        "sample_03" : [[1,5], [2,3,4]],
        "sample_04" : [[1,4,5], [2,3]],
        "sample_05" : [[1,2,3,4,5], []],
        "sample_06" : [[1,2,3,4], [5]],
        "sample_07" : [[1,2,5], [3,4]],
        "sample_08" : [[1,2,4,5], [3]],
        "sample_09" : [[1,3,4,5], [2]],
        "sample_10" : [[1,2,3,5], [4]]
    },
    "2L" : {
        "sample_01" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_02" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_03" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_04" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_05" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_06" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_07" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_08" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_09" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_10" : [[1,2,3,4,5], [1,2,3,4,5]]
    },
    "3R" : {
        "sample_01" : [[1,3,4,5], [1,3,5]],
        "sample_02" : [[1,3,4,5], [1,3]],
        "sample_03" : [[1,3,4], [1,4]],
        "sample_04" : [[1,2,3], [3]],
        "sample_05" : [[1,2,3], [2,5]],
        "sample_06" : [[1,2,3], [2,3]],
        "sample_07" : [[1,2,3], [2]],
        "sample_08" : [[2,3,5], [2]],
        "sample_09" : [[2,3,4], []],
        "sample_10" : [[2,4,5], [4,5]]
    },
    "3L" : {
        "sample_01" : [[1], []],
        "sample_02" : [[2], [2]],
        "sample_03" : [[], [3]],
        "sample_04" : [[4], [4]],
        "sample_05" : [[4], []],
        "sample_06" : [[], [1]],
        "sample_07" : [[4], []],
        "sample_08" : [[5], [5]],
        "sample_09" : [[5], [5]],
        "sample_10" : [[5], []]
    }
}

Medium Inversions

Deep into the project (final data analysis) we discovered that sample_06 had a typo of 44 for the second genotype on 2L, which meant it was in the heterozygous state for 4 for the analyses.

MEDIUM = {
    "2R" : {
        "sample_01" : [[1,3], [2,4]],
        "sample_02" : [[1,2], [3,4]],
        "sample_03" : [[1], [2,3,4]],
        "sample_04" : [[1,4], [2,3]],
        "sample_05" : [[1,2,3,4], []],
        "sample_06" : [[1,2,3,4], []],
        "sample_07" : [[1,2], [3,4]],
        "sample_08" : [[1,2,4], [3]],
        "sample_09" : [[1,3,4], [2]],
        "sample_10" : [[1,2,3], [4]]
    },
    "2L" : {
        "sample_01" : [[1,2,3,4], [1,2,3,4]],
        "sample_02" : [[1,2,3,4], [1,2,3,4]],
        "sample_03" : [[1,2,3,4], [1,2,3,4]],
        "sample_04" : [[1,2,3,4], [1,2,3,4]],
        "sample_05" : [[1,2,3,4], [1,2,3,4]],
        "sample_06" : [[1,2,3,4], [1,2,3,4]], # there was originally a typo here and inversion 4 was accidentally put in the heterozygous state
        "sample_09" : [[1,2,3,4], [1,2,3,4]],
        "sample_10" : [[1,2,3,4], [1,2,3,4]]
    },
    "3R" : {
        "sample_01" : [[1,3,4], [1,3]],
        "sample_02" : [[1,3,4], [1,3]],
        "sample_03" : [[1,3,4], [1,4]],
        "sample_04" : [[1,2,3], [3]],
        "sample_05" : [[1,2,3], [2]],
        "sample_06" : [[1,2,3], [2,3]],
        "sample_07" : [[1,2,3], [2]],
        "sample_08" : [[2,3], [2]],
        "sample_09" : [[2,3,4], []],
        "sample_10" : [[2,4], [4]]
    },
    "3L" : {
        "sample_01" : [[1], []],
        "sample_02" : [[2], [2]],
        "sample_03" : [[], [3]],
        "sample_04" : [[4], [4]],
        "sample_05" : [[4], []],
        "sample_06" : [[], [1]],
        "sample_07" : [[4], []],
        "sample_08" : [[], []],
        "sample_09" : [[1], [1]],
        "sample_10" : [[], [3]]
    }
}

Large Inversions

LARGE = {
    "2R" : {
        "sample_01" : [[1], [2]],
        "sample_02" : [[], [1,2]],
        "sample_03" : [[1], [2]],
        "sample_04" : [[1], [2]],
        "sample_05" : [[1,2], []],
        "sample_06" : [[], [1,2]],
        "sample_07" : [[2], [1]],
        "sample_08" : [[1,2], []],
        "sample_09" : [[1], [2]],
        "sample_10" : [[1,2], []]
    },
    "2L" : {
        "sample_01" : [[1,2], [1,2]],
        "sample_02" : [[1,2], [1,2]],
        "sample_03" : [[1,2], [1,2]],
        "sample_04" : [[1,2], [1,2]],
        "sample_05" : [[1,2], [1,2]],
        "sample_06" : [[1,2], [1,2]],
        "sample_07" : [[1,2], [1,2]],
        "sample_08" : [[1,2], [1,2]],
        "sample_09" : [[1,2], [1,2]],
        "sample_10" : [[1,2], [1,2]]
    },
    "3R" : {
        "sample_01" : [[1], [1]],
        "sample_02" : [[1], [1]],
        "sample_03" : [[1], [2]],
        "sample_04" : [[1,2], []],
        "sample_05" : [[1,2], [1,2]],
        "sample_06" : [[1,2], [2]],
        "sample_07" : [[1], [1,2]],
        "sample_08" : [[2], [2]],
        "sample_09" : [[2], []],
        "sample_10" : [[2], []]
    },
    "3L" : {
        "sample_01" : [[1], []],
        "sample_02" : [[2], [2]],
        "sample_03" : [[], []],
        "sample_04" : [[], []],
        "sample_05" : [[], []],
        "sample_06" : [[], [1]],
        "sample_07" : [[], []],
        "sample_08" : [[], []],
        "sample_09" : [[1], [1]],
        "sample_10" : [[], []]
    }
}       

XL Inversions

XL = {
    "2R" : {
        "sample_01" : [[1], []],
        "sample_02" : [[], [1]],
        "sample_03" : [[1], []],
        "sample_04" : [[1], []],
        "sample_05" : [[1], []],
        "sample_06" : [[], [1]],
        "sample_07" : [[], [1]],
        "sample_08" : [[1], []],
        "sample_09" : [[1], []],
        "sample_10" : [[1], []]
    },
    "2L" : {
        "sample_01" : [[1], [1]],
        "sample_02" : [[1], [1]],
        "sample_03" : [[1], [1]],
        "sample_04" : [[1], [1]],
        "sample_05" : [[1], [1]],
        "sample_06" : [[1], [1]],
        "sample_07" : [[1], [1]],
        "sample_08" : [[1], [1]],
        "sample_09" : [[1], [1]],
        "sample_10" : [[1], [1]]
    },
    "3R" : {
        "sample_01" : [[1], [1]],
        "sample_02" : [[1], [1]],
        "sample_03" : [[1], []],
        "sample_04" : [[], []],
        "sample_05" : [[1], [1]],
        "sample_06" : [[1], []],
        "sample_07" : [[1], [1]],
        "sample_08" : [[], []],
        "sample_09" : [[], [1]],
        "sample_10" : [[], [1]]
    },
    "3L" : {
        "sample_01" : [[1], []],
        "sample_02" : [[], []],
        "sample_03" : [[], []],
        "sample_04" : [[], []],
        "sample_05" : [[], []],
        "sample_06" : [[], [1]],
        "sample_07" : [[], []],
        "sample_08" : [[], []],
        "sample_09" : [[1], [1]],
        "sample_10" : [[], []]
    }
}

Write the inversions

Using the schema from the dictionaries above, the inversions need to be written to VCF files so that we can simulate them with LRSIM (via harpy)

inv_inventory = {
    "small" : SMALL,
    "medium": MEDIUM,
    "large" : LARGE,
    "xl": XL
}
for inv,invdict in inv_inventory.items():
    with open(f"simulated_data/inverions/{inv}/inv.{inv}.vcf", "r") as vcf:
        inversions = {}
        header = ""
        for line in vcf:
            if line.startswith("#"):
                header += line
            else:
                inv_split = line.split()
                contig = inv_split[0]
                if contig not in inversions:
                    inversions[contig] = [line]
                else:
                    inversions[contig].append(line)

    os.makedirs(f"simulated_data/inversions/{inv}/samples", exist_ok = True)
    samples = {}
    for contig in invdict:
        for sample,haps in invdict[contig].items():
            hap1,hap2 = haps
            inv_hap1 = [inversions[contig][i-1] for i in hap1]
            inv_hap2 = [inversions[contig][i-1] for i in hap2]

            if sample not in samples:
                samples[sample] = {"hap1" : inv_hap1, "hap2" : inv_hap2}
            else:
                samples[sample]["hap1"] += inv_hap1
                samples[sample]["hap2"] += inv_hap2

    for sample in samples:
        with (
            open(f"simulated_data/inversions/{inv}/samples/{sample}.{inv}.hap1.vcf", "w") as hap1_vcf,
            open(f"simulated_data/inversions/{inv}/samples/{sample}.{inv}.hap2.vcf", "w") as hap2_vcf
        ):
            _ = hap1_vcf.write(header)
            _ = hap1_vcf.write("".join(samples[sample]["hap1"]))
            _ = hap2_vcf.write(header)
            _ = hap2_vcf.write("".join(samples[sample]["hap2"]))

We’re going to need an inventory of the inversions later for assessing the called inversions so let’s generate the file now. It might make sense to print it in long-format, so let’s do that. It should look like:

samplesizecontiginversionpresentstate
sample_01small2L1TRUEhom
sample_01small2L2FALSEhet

First, make a function that will assess the presence/absence of an inversion in a sample and whether it’s in a homozygous or heterozygous state

def zygosity(invs, genos):
    geno1, geno2 = genos
    geno1_present = invs in geno1
    geno2_present = invs in geno2
    TF = geno1_present + geno2_present
    present = "FALSE" if TF == 0 else "TRUE"
    state = "NA" if TF == 0 else "het"
    if TF == 2:
        state = "hom"
    return present, state

The next part is going deep into iteration and using zygosity() to return the presence and state of an inversion at a given contig for a given sample of a given size treatment. It’s kind of a lot, but the code comments help keep track of what’s getting iterated when. The itertools.chain function helps collapse the nested lists of the genotypes.

import itertools

with open("inversion_simulations.inventory", "w") as inventory:
    inventory.write("\t".join(["sample", "size","contig","inversion","present","state"]) + "\n")
    for size_treatment, contig in inv_inventory.items():
        # size_treatment = "small", "medium", etc.
        # contig = dict{contig : {sample: [[hap1], [hap2]]}}
        for cntg,smpl in contig.items():
            # cntg = "2R", "3L" etc
            # smpl = {sample: [[hap1], [hap2]]}
            # collapse the nested lists and get the max number of inversions
            max_inversions = max(list(itertools.chain.from_iterable(itertools.chain.from_iterable(smpl.values()))))
            inversions = range(1, max_inversions + 1)
            for samplename,genotype in smpl.items():
                for i in inversions:
                    present, state = zygosity(i, genotype)
                    printline = f"{samplename}\t{size_treatment}\t{cntg}\t{i}\t{present}\t{state}\n"
                    inventory.write(printline)

The output send up looking like this (truncated for sanity)

sample	size	contig	inversion	present	state
sample_01	small	2R	1	TRUE	het
sample_01	small	2R	2	TRUE	het
sample_01	small	2R	3	TRUE	het
sample_01	small	2R	4	TRUE	het
sample_01	small	2R	5	TRUE	het
...
sample_06	xl	3L	1	TRUE	het
sample_07	xl	3L	1	FALSE	NA
sample_08	xl	3L	1	FALSE	NA
sample_09	xl	3L	1	TRUE	hom
sample_10	xl	3L	1	FALSE	NA

We also need to output the sample “pools” for when we attempt pooled-sample calling.

with open("inversion_simulations.pool.inventory", "w") as inventory:
    inventory.write("\t".join(["sample", "size","contig","inversion","present","state"]) + "\n")
    for size_treatment, contig in inv_inventory.items():
        # size_treatment = "small", "medium", etc.
        # contig = dict{contig : {sample: [[hap1], [hap2]]}}
        for cntg,smpl in contig.items():
            # cntg = "2R", "3L" etc
            # smpl = {sample: [[hap1], [hap2]]}
            # collapse the nested lists and get the max number of inversions
            max_inversions = max(list(itertools.chain.from_iterable(itertools.chain.from_iterable(smpl.values()))))
            inversions = range(1, max_inversions + 1)
            # collapse the haplotypes across samples into a set for each
            haplo1 = set(itertools.chain.from_iterable([i[0] for i in smpl.values()]))
            haplo2 = set(itertools.chain.from_iterable([i[1] for i in smpl.values()]))
            for i in inversions:
                present, state = zygosity(i, [haplo1, haplo2])
                printline = f"pool\t{size_treatment}\t{cntg}\t{i}\t{present}\t{state}\n"
                inventory.write(printline)

Simulating individual genomes with known inversions

Now that the inversions have been partitioned for each individual for each size treatment class, they can be simulated for every individual. Since the inversions across chromosomes have been combined into a single VCF file, we no longer need to do this per chromosome (thankfully).

%%bash
sizes=("small" "medium" "large" "xl")

for i in "${sizes[@]}"; do
    for j in $(seq -w 1 1 10); do
        for hap in $(seq 1 2); do
            harpy simulate inversion --quiet \
                -o simulated_data/inversions/${i}/samples/sample_${j}/hap${hap} \
                -v simulated_data/inversions/${i}/samples/sample_${j}.${i}.hap${hap}.vcf dmel_genome/dmel_2_3.fa
            mv simulated_data/inversions/${i}/samples/sample_${j}/hap${hap}/sim.fasta simulated_data/${i}/samples/sample_${j}/hap${hap}/sample_${j}.hap${hap}.fasta
            rm -r simulated_data/inversions/${i}/samples/sample_${j}/hap${hap}/logs
        done
    done
done

Simulating SNPs

So there is variation beyond just inversions, we need to simulate SNPs as well. First, we’ll simulate 120,000 random SNPs, then using some basic Python wizardry, randomly move those SNPs into haplotype 1 or haplotype 2 for each individual. There will be a 70% chance that a SNP will appear in either haplotype, meaning some might be homozygote, heterozygote, or missing entirely (which is fine). The initial simulation can be done for the entire assembly at once, rather than per-chromosome and/or per size-treatment.

%%bash
harpy simulate snpindel --prefix all --snp-count 120000 -o simulated_data/snps dmel_genome/dmel_2_3.fa

Now that we have a general set of SNPs, we can randomly shuffle them into haplotypes for each sample.

import random
import os

samples = [f"sample_{i:02}" for i in range(1,11)]
os.makedirs("simulated_data/snps/per_sample", exist_ok = True)
rng  = random.Random(69)
for sample in samples:
    with(
        open("simulated_data/snps/all.snp.vcf", "r") as vcf,
        open(f"simulated_data/snps/per_sample/{sample}.hap1.snp.vcf", "w") as hap1,
        open(f"simulated_data/snps/per_sample/{sample}.hap2.snp.vcf", "w") as hap2
    ):
        for line in vcf:
            if line.startswith("#"):
                hap1.write(line)
                hap2.write(line)
                continue
            for hap in [hap1,hap2]:
                if rng.random() < 0.7:
                    hap.write(line)

Now, using the same approach as with the inversions, we can simulate the known SNPs onto the sample genomes. simuG takes a while here, so the commands are echod to a text file that will be consumed by parallel for concurrency.

%%bash
sizes=("small" "medium" "large" "xl")
mkdir -p simulated_data/inversions_snps
for size in "${sizes[@]}"; do
    for samp in $(seq -w 1 1 10); do
        SAMPLE="sample_${samp}"
        for hap in $(seq 1 2); do
            FASTA="simulated_data/inversions/${size}/samples/${SAMPLE}/${SAMPLE}.hap${hap}.fasta"
            SNPS="simulated_data/snps/per_sample/${SAMPLE}.hap${hap}.snp.vcf"
            echo "harpy simulate snpindel --quiet --prefix ${SAMPLE}_snpinv.hap${hap} --snp-vcf $SNPS -o simulated_data/inversions_snps/${size}/${SAMPLE}/hap${hap} $FASTA" >> simulated_data/inversions_snps/sim_snps.sh
        done
    done
done
#parallel :::: sim.inversions/inversions_snps/sim_snps.sh

For some reason, simuG is acting up for manual [known] SNP simulation, so using the VCFs of known SNPs that were created, let’s just manually change the bases. Below is a simple simulate_snps() function to accomplish this.

import os
for size in ["small", "medium", "large", "xl"]:
    os.makedirs(f"simulated_data/inversions_snps2/{size}", exist_ok = True)
    for sample in [f"sample_{i:02}" for i in range(1,11)]:
        for hap in [1,2]:
            SNPS=f"simulated_data/snps/per_sample/{sample}.hap{hap}.snp.vcf"
            FASTA=f"simulated_data/inversions/{size}/samples/{sample}/{sample}.hap{hap}.fasta"
            FASTA_OUT=f"simulated_data/inversions_snps2/{size}/{sample}.snp_inv.hap{hap}.fasta.gz"
            simulate_snps(SNPS, FASTA, FASTA_OUT)

As a basic housekeeping thing, let’s gzip compress the fasta files and delete the logs and workflow folders to reduce disk footprint.

%%bash
sizes=("small" "medium" "large" "xl")
for size in "${sizes[@]}"; do
    DIR="simulated_data/inversions_snps/${size}"
    for samp in $(seq -w 1 1 10); do
        SAMPLE="sample_${samp}"
        for hap in $(seq 1 2); do
            bgzip -c ${DIR}/${SAMPLE}/hap${hap}/${SAMPLE}_snpinv.hap${hap}.fasta > ${DIR}/${SAMPLE}.snp_inv.hap${hap}.fasta.gz
        done
        rm -r simulated_data/inversions_snps/${size}/${SAMPLE}/
    done
done