Skip to article frontmatterSkip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

01 Simulate Variants

Cornell University
import os

Simulating the inversions

Prior to simulating the samples, we need to create the inversions themselves, which will be manually modified to meet the treatment goals listed below. Since each chromosome will be a technical replicate of different “conditions”, we need to simulate different inversions on each one, which is more tedious but ultimately worth it. The code below creates 5 random inversions, but this is really just to get the correct VCF format from the software-- the inversions will be manually modified.

%%bash
CONTIGS=("2L" "2R" "3R" "3L")
SIZES=("small" "medium" "large" "xl")

for size in "${SIZES[@]}"; do
    for contig in "${CONTIGS[@]}"; do
        harpy simulate inversion -n 5 --max-size 25000 --min-size 1000 -o simulated_variants/setup_inversions/${size}/${contig} ${contig}.fasta
    done
done

Creating inversion treatments for individuals

We need to create the inversion treatments, which is the way the inversions are sized and distributed among the contigs. The design is such that:

  • 2R: all heterozygote

  • 2L: all homozygote

  • 3R: inversions are common

  • 3L: inversions are rare

The dictionaries below govern which variants go into which haplotype for each sample, following the format

sample_id : [hap1, hap2]

Small inversions

SMALL = {
    "2R" : {
        "sample_01" : [[1,3,5], [2,4]],
        "sample_02" : [[1,2], [3,4,5]],
        "sample_03" : [[1,5], [2,3,4]],
        "sample_04" : [[1,4,5], [2,3]],
        "sample_05" : [[1,2,3,4,5], []],
        "sample_06" : [[1,2,3,4], [5]],
        "sample_07" : [[1,2,5], [3,4]],
        "sample_08" : [[1,2,4,5], [3]],
        "sample_09" : [[1,3,4,5], [2]],
        "sample_10" : [[1,2,3,5], [4]]
    },
    "2L" : {
        "sample_01" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_02" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_03" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_04" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_05" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_06" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_07" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_08" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_09" : [[1,2,3,4,5], [1,2,3,4,5]],
        "sample_10" : [[1,2,3,4,5], [1,2,3,4,5]]
    },
    "3R" : {
        "sample_01" : [[1,3,4,5], [1,3,5]],
        "sample_02" : [[1,3,4,5], [1,3]],
        "sample_03" : [[1,3,4], [1,4]],
        "sample_04" : [[1,2,3], [3]],
        "sample_05" : [[1,2,3], [2,5]],
        "sample_06" : [[1,2,3], [2,3]],
        "sample_07" : [[1,2,3], [2]],
        "sample_08" : [[2,3,5], [2]],
        "sample_09" : [[2,3,4], []],
        "sample_10" : [[2,4,5], [4,5]]
    },
    "3L" : {
        "sample_01" : [[1], []],
        "sample_02" : [[2], [2]],
        "sample_03" : [[], [3]],
        "sample_04" : [[4], [4]],
        "sample_05" : [[4], []],
        "sample_06" : [[], [1]],
        "sample_07" : [[4], []],
        "sample_08" : [[5], [5]],
        "sample_09" : [[5], [5]],
        "sample_10" : [[5], []]
    }
}

Medium Inversions

MEDIUM = {
    "2R" : {
        "sample_01" : [[1,3], [2,4]],
        "sample_02" : [[1,2], [3,4]],
        "sample_03" : [[1], [2,3,4]],
        "sample_04" : [[1,4], [2,3]],
        "sample_05" : [[1,2,3,4], []],
        "sample_06" : [[1,2,3,4], []],
        "sample_07" : [[1,2], [3,4]],
        "sample_08" : [[1,2,4], [3]],
        "sample_09" : [[1,3,4], [2]],
        "sample_10" : [[1,2,3], [4]]
    },
    "2L" : {
        "sample_01" : [[1,2,3,4], [1,2,3,4]],
        "sample_02" : [[1,2,3,4], [1,2,3,4]],
        "sample_03" : [[1,2,3,4], [1,2,3,4]],
        "sample_04" : [[1,2,3,4], [1,2,3,4]],
        "sample_05" : [[1,2,3,4], [1,2,3,4]],
        "sample_06" : [[1,2,3,4], [1,2,3,4]],
        "sample_07" : [[1,2,3,4], [1,2,3,4]],
        "sample_08" : [[1,2,3,4], [1,2,3,4]],
        "sample_09" : [[1,2,3,4], [1,2,3,4]],
        "sample_10" : [[1,2,3,4], [1,2,3,4]]
    },
    "3R" : {
        "sample_01" : [[1,3,4], [1,3]],
        "sample_02" : [[1,3,4], [1,3]],
        "sample_03" : [[1,3,4], [1,4]],
        "sample_04" : [[1,2,3], [3]],
        "sample_05" : [[1,2,3], [2]],
        "sample_06" : [[1,2,3], [2,3]],
        "sample_07" : [[1,2,3], [2]],
        "sample_08" : [[2,3], [2]],
        "sample_09" : [[2,3,4], []],
        "sample_10" : [[2,4], [4]]
    },
    "3L" : {
        "sample_01" : [[1], []],
        "sample_02" : [[2], [2]],
        "sample_03" : [[], [3]],
        "sample_04" : [[4], [4]],
        "sample_05" : [[4], []],
        "sample_06" : [[], [1]],
        "sample_07" : [[4], []],
        "sample_08" : [[], []],
        "sample_09" : [[1], [1]],
        "sample_10" : [[], [3]]
    }
}

Large Inversions

LARGE = {
    "2R" : {
        "sample_01" : [[1], [2]],
        "sample_02" : [[], [1,2]],
        "sample_03" : [[1], [2]],
        "sample_04" : [[1], [2]],
        "sample_05" : [[1,2], []],
        "sample_06" : [[], [1,2]],
        "sample_07" : [[2], [1]],
        "sample_08" : [[1,2], []],
        "sample_09" : [[1], [2]],
        "sample_10" : [[1,2], []]
    },
    "2L" : {
        "sample_01" : [[1,2], [1,2]],
        "sample_02" : [[1,2], [1,2]],
        "sample_03" : [[1,2], [1,2]],
        "sample_04" : [[1,2], [1,2]],
        "sample_05" : [[1,2], [1,2]],
        "sample_06" : [[1,2], [1,2]],
        "sample_07" : [[1,2], [1,2]],
        "sample_08" : [[1,2], [1,2]],
        "sample_09" : [[1,2], [1,2]],
        "sample_10" : [[1,2], [1,2]]
    },
    "3R" : {
        "sample_01" : [[1], [1]],
        "sample_02" : [[1], [1]],
        "sample_03" : [[1], [2]],
        "sample_04" : [[1,2], []],
        "sample_05" : [[1,2], [1,2]],
        "sample_06" : [[1,2], [2]],
        "sample_07" : [[1], [1,2]],
        "sample_08" : [[2], [2]],
        "sample_09" : [[2], []],
        "sample_10" : [[2], []]
    },
    "3L" : {
        "sample_01" : [[1], []],
        "sample_02" : [[2], [2]],
        "sample_03" : [[], []],
        "sample_04" : [[], []],
        "sample_05" : [[], []],
        "sample_06" : [[], [1]],
        "sample_07" : [[], []],
        "sample_08" : [[], []],
        "sample_09" : [[1], [1]],
        "sample_10" : [[], []]
    }
}       

XL Inversions

XL = {
    "2R" : {
        "sample_01" : [[1], []],
        "sample_02" : [[], [1]],
        "sample_03" : [[1], []],
        "sample_04" : [[1], []],
        "sample_05" : [[1], []],
        "sample_06" : [[], [1]],
        "sample_07" : [[], [1]],
        "sample_08" : [[1], []],
        "sample_09" : [[1], []],
        "sample_10" : [[1], []]
    },
    "2L" : {
        "sample_01" : [[1], [1]],
        "sample_02" : [[1], [1]],
        "sample_03" : [[1], [1]],
        "sample_04" : [[1], [1]],
        "sample_05" : [[1], [1]],
        "sample_06" : [[1], [1]],
        "sample_07" : [[1], [1]],
        "sample_08" : [[1], [1]],
        "sample_09" : [[1], [1]],
        "sample_10" : [[1], [1]]
    },
    "3R" : {
        "sample_01" : [[1], [1]],
        "sample_02" : [[1], [1]],
        "sample_03" : [[1], []],
        "sample_04" : [[], []],
        "sample_05" : [[1], [1]],
        "sample_06" : [[1], []],
        "sample_07" : [[1], [1]],
        "sample_08" : [[], []],
        "sample_09" : [[], [1]],
        "sample_10" : [[], [1]]
    },
    "3L" : {
        "sample_01" : [[1], []],
        "sample_02" : [[], []],
        "sample_03" : [[], []],
        "sample_04" : [[], []],
        "sample_05" : [[], []],
        "sample_06" : [[], [1]],
        "sample_07" : [[], []],
        "sample_08" : [[], []],
        "sample_09" : [[1], [1]],
        "sample_10" : [[], []]
    }
}

Write the inversions

Using the schema from the dictionaries above, the inversions need to be written to VCF files so that we can simulate them with LRSIM (via harpy)

inv_inventory = {
    "small" : SMALL,
    "medium": MEDIUM,
    "large" : LARGE,
    "xl": XL
}
for inv,invdict in inv_inventory.items():
    with open(f"simulated_variants/setup_inversions/{inv}/inv.{inv}.vcf", "r") as vcf:
        inversions = {}
        for line in vcf:
            if line.startswith("#"):
                continue
            else:
                inv_split = line.split()
                contig = inv_split[0]
                pos_start = inv_split[1]
                pos_end = inv_split[-1].strip().split(";")[-1].removeprefix("END=")
                if contig not in inversions:
                    inversions[contig] = []                 
                inversions[contig].append("\t".join([contig, pos_start, pos_end, "inversion", "None", "0"]))

    os.makedirs(f"simulated_variants/bedfiles/inversions/{inv}/", exist_ok = True)
    samples = {}
    for contig in invdict:
        for sample,haps in invdict[contig].items():
            hap1,hap2 = haps
            inv_hap1 = [inversions[contig][i-1] for i in hap1]
            inv_hap2 = [inversions[contig][i-1] for i in hap2]

            if sample not in samples:
                samples[sample] = {"hap1" : inv_hap1, "hap2" : inv_hap2}
            else:
                samples[sample]["hap1"] += inv_hap1
                samples[sample]["hap2"] += inv_hap2

    for sample in samples:
        with (
            open(f"simulated_variants/bedfiles/inversions/{inv}/{sample}.inversions.hap1.bed", "w") as _hap1,
            open(f"simulated_variants/bedfiles/inversions/{inv}/{sample}.inversions.hap2.bed", "w") as _hap2
        ):
            _ = _hap1.write("\n".join(samples[sample]["hap1"]) + "\n")
            _ = _hap2.write("\n".join(samples[sample]["hap2"]) + "\n")

Creating an inventory

We’re going to need an inventory of the inversions later for assessing the called inversions so let’s generate the file now. It might make sense to print it in long-format, so let’s do that. It should look like:

samplesizecontiginversionpresentstate
sample_01small2L1TRUEhom
sample_01small2L2FALSEhet

First, make a function that will assess the presence/absence of an inversion in a sample and whether it’s in a homozygous or heterozygous state

def zygosity(invs, genos):
    geno1, geno2 = genos
    geno1_present = invs in geno1
    geno2_present = invs in geno2
    TF = geno1_present + geno2_present
    present = "FALSE" if TF == 0 else "TRUE"
    state = "NA" if TF == 0 else "het"
    if TF == 2:
        state = "hom"
    return present, state

The next part is going deep into iteration and using zygosity() to return the presence and state of an inversion at a given contig for a given sample of a given size treatment. It’s kind of a lot, but the code comments help keep track of what’s getting iterated when. The itertools.chain function helps collapse the nested lists of the genotypes.

import itertools

with open("simulated_variants/inversions_per_sample/inversion_simulations.inventory", "w") as inventory:
    inventory.write("\t".join(["sample", "size","contig","inversion","present","state"]) + "\n")
    for size_treatment, contig in inv_inventory.items():
        # size_treatment = "small", "medium", etc.
        # contig = dict{contig : {sample: [[hap1], [hap2]]}}
        for cntg,smpl in contig.items():
            # cntg = "2R", "3L" etc
            # smpl = {sample: [[hap1], [hap2]]}
            # collapse the nested lists and get the max number of inversions
            max_inversions = max(list(itertools.chain.from_iterable(itertools.chain.from_iterable(smpl.values()))))
            inversions = range(1, max_inversions + 1)
            for samplename,genotype in smpl.items():
                for i in inversions:
                    present, state = zygosity(i, genotype)
                    printline = f"{samplename}\t{size_treatment}\t{cntg}\t{i}\t{present}\t{state}\n"
                    inventory.write(printline)

The output send up looking like this (truncated for sanity)

sample	size	contig	inversion	present	state
sample_01	small	2R	1	TRUE	het
sample_01	small	2R	2	TRUE	het
sample_01	small	2R	3	TRUE	het
sample_01	small	2R	4	TRUE	het
sample_01	small	2R	5	TRUE	het
...
sample_06	xl	3L	1	TRUE	het
sample_07	xl	3L	1	FALSE	NA
sample_08	xl	3L	1	FALSE	NA
sample_09	xl	3L	1	TRUE	hom
sample_10	xl	3L	1	FALSE	NA

We also need to output the sample “pools” for when we attempt pooled-sample calling.

with open("simulated_variants/inversions_per_sample/inversion_simulations.pool.inventory", "w") as inventory:
    inventory.write("\t".join(["sample", "size","contig","inversion","present","state"]) + "\n")
    for size_treatment, contig in inv_inventory.items():
        # size_treatment = "small", "medium", etc.
        # contig = dict{contig : {sample: [[hap1], [hap2]]}}
        for cntg,smpl in contig.items():
            # cntg = "2R", "3L" etc
            # smpl = {sample: [[hap1], [hap2]]}
            # collapse the nested lists and get the max number of inversions
            max_inversions = max(list(itertools.chain.from_iterable(itertools.chain.from_iterable(smpl.values()))))
            inversions = range(1, max_inversions + 1)
            # collapse the haplotypes across samples into a set for each
            haplo1 = set(itertools.chain.from_iterable([i[0] for i in smpl.values()]))
            haplo2 = set(itertools.chain.from_iterable([i[1] for i in smpl.values()]))
            for i in inversions:
                present, state = zygosity(i, [haplo1, haplo2])
                printline = f"pool\t{size_treatment}\t{cntg}\t{i}\t{present}\t{state}\n"
                inventory.write(printline)

Simulating SNPs

So there is variation beyond just inversions, we need to simulate SNPs as well. First, we’ll simulate 120,000 random SNPs, then using some basic Python wizardry, randomly move those SNPs into haplotype 1 or haplotype 2 for each individual. There will be a 70% chance that a SNP will appear in either haplotype, meaning some might be homozygote, heterozygote, or missing entirely (which is fine). The initial simulation can be done for the entire assembly at once, rather than per-chromosome and/or per size-treatment.

%%bash
harpy simulate snpindel --prefix all --snp-count 120000 -o simulated_variants/setup_snps dmel_genome/dmel_2_3.fa
import random
import os

samples = [f"sample_{i:02}" for i in range(1,11)]
for i in inv:
    os.makedirs(f"simulated_variants/bedfiles/snp/", exist_ok = True)
rng  = random.Random(69)
record_cols = [0,1,4]
for sample in samples:
    with(
        open("simulated_variants/setup_snps/all.snp.vcf", "r") as vcf,
        open(f"simulated_variants/bedfiles/snp/{sample}.snps.hap1.bed", "w") as hap1,
        open(f"simulated_variants/bedfiles/snp/{sample}.snps.hap2.bed", "w") as hap2
    ):
        for line in vcf:
            if line.startswith("#"):
                continue
            _line = line.strip().split()
            # subtract 1 from the start position
            pos_start = f"{int(_line[1]) - 1}"
            snp = "\t".join([_line[0], pos_start, _line[1], "SNP", _line[4], "0"]) + "\n"
            if line.startswith("#"):
                continue
            for hap in [hap1,hap2]:
                if rng.random() < 0.7:
                    hap.write(snp)

Simulating individual genomes with known variants

Now we can use VISOR to first create genomes with simulated SNPs, then a second pass to add inversions. Since only 1 BED file was given per inversion haplotype, the resulting file will be named h1.fa regardless of whether it’s actually h1 or h2, which is why the mv command tries to move h1.fa twice.

%%bash
sizes=("small" "medium" "large" "xl")
GENO="/workdir/pd348/haplotagging_simulations/dmel_genome/dmel_2_3.fa"
OUTDIR="simulated_genomes/"

mkdir -p simulated_variants/visor_logs
mkdir -p ${OUTDIR}/snp
mkdir -p ${OUTDIR}/${sizes[@]}

for samp in $(seq -w 1 1 10); do
    SAMPLE="sample_${samp}"
    SNPDIR="${OUTDIR}/snp/${SAMPLE}"
    echo $SAMPLE
    VISOR HACk -g ${GENO} -o ${SNPDIR} -b simulated_variants/bedfiles/snp/${SAMPLE}.snps.hap{1..2}.bed &> visor_logs/${SAMPLE}.snp.log
    for size in "${sizes[@]}"; do
        echo $size
        {
            VISOR HACk -g ${SNPDIR}/h1.fa -o ${OUTDIR}/inv_snp1 -b simulated_variants/bedfiles/inversions/${size}/${SAMPLE}.inversions.hap1.bed
            VISOR HACk -g ${SNPDIR}/h2.fa -o ${OUTDIR}/inv_snp2 -b simulated_variants/bedfiles/inversions/${size}/${SAMPLE}.inversions.hap2.bed

            mv ${OUTDIR}/inv_snp1/h1.fa ${OUTDIR}/${size}/${SAMPLE}.${size}.hap1.fasta
            mv ${OUTDIR}/inv_snp2/h1.fa ${OUTDIR}/${size}/${SAMPLE}.${size}.hap2.fasta

            bgzip ${OUTDIR}/${size}/${SAMPLE}.${size}.hap1.fasta
            bgzip ${OUTDIR}/${size}/${SAMPLE}.${size}.hap2.fasta

            rm -r ${OUTDIR}/inv_snp{1..2}/
            
        } &> visor_logs/${SAMPLE}.${size}.log
    done
done