https://github.com/tskit-dev/msprime
Raw File
Tip revision: a3da1b02836537cf35564c914938c75d8ed1acec authored by Jerome Kelleher on 20 July 2016, 15:58:14 UTC
Changelog for 0.3.2.
Tip revision: a3da1b0
dev.py
"""
Simple client code for development purposes.
"""

from __future__ import print_function
from __future__ import division

import time
import math
import glob
import subprocess
import sys
import itertools

import numpy as np
import numpy.ma as ma
import matplotlib
import scipy.stats
import pandas as pd
# Force matplotlib to not use any Xwindows backend.
matplotlib.use('Agg')
import matplotlib.pyplot as pyplot

import msprime

def mutations():
    n = 10
    # num_reps = 1000
    num_reps = 1
    num_loci = 10001
    # recomb_rates = [(1000, 0.005), (2000, 0.01), (3000, 0), (10001, 0.05)]
    recomb_rates = [(10001, 0.05)]
    last_pos = 0
    mean_rate = 0
    for pos, rate in recomb_rates:
        d = (pos - last_pos - 1) / (num_loci - 1)
        mean_rate += d * rate
        # print("mean_rate + ", d, rate)
        # print("rate = ", rate, rate / (4 * 10**4))
        last_pos = pos
    assert last_pos == num_loci
    print("mean_rate = ", mean_rate)
    num_trees = 0
    for j in range(num_reps):
        simulator = msprime.TreeSimulator(n)
        simulator.set_num_loci(num_loci)
        simulator.set_scaled_recombination_rate(mean_rate)
        # simulator.set_random_seed(j)
        simulator.run()
        num_trees += simulator.get_num_breakpoints()
        ts = simulator.get_tree_sequence()
        for t in ts.trees():
            print(t.get_interval()[0])

    # Construct the scrm command line. Use the first value as the background
    # rate
    simulator.set_scaled_recombination_rate(recomb_rates[0][-1])

    cmd = simulator.get_ms_command_line(
        "/home/jk/work/wt/papers/msprime/simulators/scrm",
        num_replicates=num_reps)
    for j in range(len(recomb_rates) - 1):
        pos = recomb_rates[j][0]
        # We still scale the recombination rate by the full locus length,
        # not the subset that we are working over.
        length = num_loci - 1
        rate = recomb_rates[j + 1][1]
        cmd += ["-sr", str(pos), str(rate * length)]
    # print(cmd)
    print(" ".join(cmd))
    result = subprocess.check_output(cmd)
    scrm_num_trees = 0
    for line in result.splitlines():
        # print(line)
        if line.startswith(b"["):
            scrm_num_trees += 1
    print(num_trees / num_reps, scrm_num_trees / num_reps)

    # tree_sequence = msprime.simulate(10, 100, mean_rate, random_seed=1)

    # for record in tree_sequence.records():
    #     print(record)
    # for tree in tree_sequence.trees():
    #     print(tree.get_interval())

def plot_distance_maps(recomb_rates):
    # Plot the piecewise map of physical distance to recombination rate
    x = np.zeros(2 * len(recomb_rates))
    y = np.copy(x)
    last_phys_x = 0
    j = 0
    for phys_x, recomb_rate in recomb_rates:
        x[j] = last_phys_x
        y[j] = recomb_rate
        j += 1
        x[j] = phys_x
        y[j] = recomb_rate
        last_phys_x = phys_x
        j += 1
    pyplot.plot(x, y)
    pyplot.ylim(-0.01, 1.01)
    pyplot.savefig("phys_recomb_rate.png")

    pyplot.clf()

    x = np.zeros(1 + len(recomb_rates))
    y = np.copy(x)
    j = 1
    s = 0
    last_phys_x = 0
    for phys_x, recomb_rate in recomb_rates:
        s += (phys_x - last_phys_x) * recomb_rate
        y[j] = s
        x[j] = phys_x
        j += 1
        last_phys_x = phys_x
    pyplot.plot(x, y)
    # physical_dist = 21.6
    # genetic_dist = physical_to_genetic(physical_dist, recomb_rates)
    genetic_dist = 4
    physical_dist = genetic_to_physical(genetic_dist, recomb_rates)
    pyplot.axvline(x=physical_dist, color="green")
    pyplot.axhline(y=genetic_dist, color="green")
    pyplot.savefig("phys_genetic_distance.png")


def plot_1kg_map():
    infile = "tmp__NOBACKUP__/genetic_map_b36/genetic_map_chr1_b36.txt.gz"

    import pandas as pd
    df = pd.read_csv(infile, delim_whitespace=True, compression="gzip",
            names=["pos", "rate", "distance"], header=0)
    # print(df.pos)
    physical_length = df.pos.iloc[-1]
    num_crossovers = df.distance.iloc[-1] / 100
    Ne = 10**4
    rate = 4 * Ne * num_crossovers / physical_length
    print("Overall rate = {:.2E}".format(rate))

    scaled_rate = np.array(4 * Ne * (df.rate / 100) / 10**6)[:-1]
    print(scaled_rate)

    lengths = np.diff(df.pos)
    print(lengths)

    print(lengths * scaled_rate)


    # print("overall rate = ",
    # print(df["pos"])
    pyplot.plot(df.pos, df.rate)
    pyplot.savefig("1kg.png")


def simulations():
    n = 10
    m = 1000
    recomb_map = msprime.RecombinationMap(
        m, [0, 0.5, 0.6, 0.7, 1], [0.1, 10, 0, 0.1, 0])
    sim = msprime.TreeSimulator(n)
    sim.set_random_seed(1)
    sim.set_num_loci(m)
    sim.set_recombination_map(recomb_map)
    # sim.set_scaled_recombination_rate(
    #     recomb_map.get_total_recombination_rate())
    sim.run()
    ts = sim.get_tree_sequence()
    size = 0
    for l, records_in, records_out in ts.diffs():
        # print(l, records_in, records_out)
        size += l
    print("size", size, ts.get_sequence_length())
    for t in ts.trees():
        l, r = t.get_interval()
        # print(l, r)
    for l, ns in ts.newick_trees():
        print(l, ns)
    # ts.generate_mutations(2, 1)
    # for t in ts.trees():
    #     l, r = t.get_interval()
    #     print("tree:", recomb_map.genetic_to_physical(l / m),
    #             recomb_map.genetic_to_physical(l / m))
    #     for pos, node in t.mutations():
    #         print("\t", node, pos, recomb_map.genetic_to_physical(pos / m),
    #                 sep="\t")

def convert_hdf5():
    in_filename = "tmp__NOBACKUP__/mutations.hdf5"
    out_filename = "tmp__NOBACKUP__/mutations_double_coords.hdf5"
    import h5py
    infile = h5py.File(in_filename, "r")
    outfile = h5py.File(out_filename, "w")
    # print(root)
    # g = root["trees"]
    # fields = [
    #     ("left", uint32, 1), ("right", uint32, 1),
    #     ("node", uint32, 1), ("children", uint32, 2),
    #     ("time", float64, 1)]
    #         self.assertEqual(g[name].shape[0], ts.get_num_records())

def read_1kg_map():
    infile = "tmp__NOBACKUP__/genetic_map_b36/genetic_map_chr1_b36.txt.gz"
    # infile = "genetic_map_chr22_b36.txt"
    infile = "tmp__NOBACKUP__/genetic_map_GRCh37_chr2.txt"
    pattern = "tmp__NOBACKUP__/genetic_map_GRCh37_chr*.txt"
    # pattern = "tmp__NOBACKUP__/genetic_map_GRCh37_chrX_par1.txt"
    for infile in glob.glob(pattern):
        name = infile.split("_")[-1].split(".")[0]
        print(infile, name)
        recomb_map = msprime.RecombinationMap.read_hapmap(infile)
        positions = np.array(recomb_map.get_positions())
        rates = np.array(recomb_map.get_rates())

        # tree_seq = msprime.simulate(10, recombination_map=recomb_map)
        n = 10
        before = time.clock()
        ts = msprime.simulate(
            n, Ne=10**4, recombination_map=recomb_map)
        print("Simulation ran in ", time.clock() - before)
        # for t in ts.trees():
        #     breakpoints.append(t.get_interval()[0])
        # b = np.array(breakpoints)

        # N = 500
        # fig, ax1 = pyplot.subplots(figsize=(16, 6))
        # v, bin_edges, bin_number = scipy.stats.binned_statistic(
        #     positions, rates, bins=N)
        # x = bin_edges[:-1][np.logical_not(np.isnan(v))]
        # y = v[np.logical_not(np.isnan(v))]
        # ax1.plot(x, y, "-")

        # ax2 = ax1.twinx()
        # v, bin_edges = np.histogram(b, N)
        # ax2.plot(bin_edges[:-1], v, color="green")

        # fig.savefig("tmp__NOBACKUP__/hapmap_{}.png".format(name))


    #     print(t.get_interval())
    # print(ts.get_num_records())


def genetic_to_phys(genetic_x, num_loci, positions, rates):
    total_recomb_rate = 0
    size = len(positions)
    for j in range(1, size):
        phys_length = positions[j] - positions[j - 1]
        total_recomb_rate += phys_length * rates[j - 1]
    if total_recomb_rate == 0:
        ret = (genetic_x / num_loci) * phys_length
    else:
        x = (genetic_x / num_loci) * total_recomb_rate
        ret = 0
        if x > 0:
            s = 0
            k = 0
            while s < x:
                s += (positions[k + 1] - positions[k]) * rates[k]
                k += 1
            excess = (s - x) / rates[k - 1]
            ret = positions[k] - excess
    return ret

def genetic_to_phys_bulk(values, num_loci, positions, rates):
    total_recomb_rate = 0
    size = len(positions)
    n = len(values)
    for j in range(1, size):
        phys_length = positions[j] - positions[j - 1]
        total_recomb_rate += phys_length * rates[j - 1]
    ret = list(values)
    if total_recomb_rate == 0:
        for j in range(n):
            ret[j] = genetic_to_phys(
                values[j], num_loci, positions, rates)
    else:
        # Get rid of zero values
        j = 0
        while values[j] == 0:
            j += 1
        s = 0
        k = 0
        while j < n:
            if j > 0 and values[j - 1] > values[j]:
                raise Exception("Input list not sorted")
            x = (values[j] / num_loci) * total_recomb_rate
            while s < x:
                s += (positions[k + 1] - positions[k]) * rates[k]
                k += 1
            excess = (s - x) / rates[k - 1]
            ret[j] = positions[k] - excess
            j += 1
    return ret


def map_stuff():
    num_loci = 1000
    positions = [0, 50, 80, 100]
    rates =     [0.2, 0.1, 0.0, 0]

    values = [0, 10, 50, 100, 900, 1000]
    bulk = genetic_to_phys_bulk(values, num_loci, positions, rates)

    for x, y in zip(values, bulk):
        phys = genetic_to_phys(x, num_loci, positions, rates)
        print(x, "\t", phys, "\t", y)


def new_api():
    # ts = msprime.simulate(10)

    infile = "hapmap/genetic_map_GRCh37_chr22.txt"
    recomb_map = msprime.RecombinationMap.read_hapmap(infile)
    ts = msprime.simulate(
        100, Ne=10**4,
        recombination_map=recomb_map,
        mutation_rate=1e-8)
    ts.dump("tmp__NOBACKUP__/chr22.hdf5")


def replicate_example():
    theta = 5
    R = 1000
    replicates = msprime.simulate(
        sample_size=100, recombination_rate=2, mutation_rate=theta/4,
        num_replicates=R, random_seed=None)
    S = np.zeros(R)
    T = np.zeros(R)
    for j, tree_sequence in enumerate(replicates):
        S[j] = tree_sequence.get_num_mutations()
        T[j] = tree_sequence.get_num_trees()
    print("theta =", theta, "mean(S) = ", np.mean(S))
    print(np.mean(T))

def migration_example():
    # M is the overall symmetric migration rate, and d is the number
    # of demes.
    M = 0.2
    d = 3
    # We rescale m into per-generation values for msprime.
    m = M / (4 * (d - 1))
    # Allocate the initial sample. Because we are interested in the
    # between deme coalescence times, we choose one sample each
    # from the first two demes.
    population_configurations = [
        msprime.PopulationConfiguration(sample_size=1),
        msprime.PopulationConfiguration(sample_size=1),
        msprime.PopulationConfiguration(sample_size=0)]
    # Now we set up the migration matrix. Since this is a symmetric
    # island model, we have the same rate of migration between all
    # pairs of demes. Diagonal elements must be zero.
    migration_matrix = [
        [0, m, m],
        [m, 0, m],
        [m, m, 0]]
    # We pass these values to the simulate function, and ask it
    # to run the required number of replicates.
    num_replicates = 10000
    replicates = msprime.simulate(
        population_configurations=population_configurations,
        migration_matrix=migration_matrix,
        num_replicates=num_replicates)
    # And then iterate over these replicates
    T = np.zeros(num_replicates)
    for i, tree_sequence in enumerate(replicates):
        tree = next(tree_sequence.trees())
        T[i] = tree.get_time(tree.get_root())
    # Finally, calculate the analytical expectation and print
    # out the results
    analytical = d / 2 + (d - 1) / (2 * M)
    print("Observed  =", np.mean(T))
    print("Predicted =", analytical)


def segregating_sites_example(n, theta, num_replicates):
    S = np.zeros(num_replicates)
    replicates = msprime.simulate(
        sample_size=n,
        mutation_rate=theta / 4,
        num_replicates=num_replicates)
    for j, tree_sequence in enumerate(replicates):
        S[j] = tree_sequence.get_num_mutations()
    # Now, calculate the analytical predictions
    S_mean_a = np.sum(1 / np.arange(1, n)) * theta
    S_var_a = (
        theta * np.sum(1 / np.arange(1, n)) +
        theta**2 * np.sum(1 / np.arange(1, n)**2))
    print("              mean              variance")
    print("Observed      {}\t\t{}".format(np.mean(S), np.var(S)))
    print("Analytical    {:.5f}\t\t:.5f}".format(S_mean_a, S_var_a))
        # columns=["left", "right", "node", "children", "time", "population"])


def variable_recomb_example():
    infile = "hapmap/genetic_map_GRCh37_chr22.txt"
    # Read in the recombination map using the read_hapmap method,
    recomb_map = msprime.RecombinationMap.read_hapmap(infile)

    # Now we get the positions and rates from the recombination
    # map and plot these using 500 bins.
    positions = np.array(recomb_map.get_positions()[1:])
    rates = np.array(recomb_map.get_rates()[1:])
    num_bins = 500
    v, bin_edges, _ = scipy.stats.binned_statistic(
        positions, rates, bins=num_bins)
    x = bin_edges[:-1][np.logical_not(np.isnan(v))]
    y = v[np.logical_not(np.isnan(v))]
    fig, ax1 = pyplot.subplots(figsize=(16, 6))
    ax1.plot(x, y, color="blue")
    ax1.set_ylabel("Recombination rate")
    ax1.set_xlabel("Chromosome position")

    # Now we run the simulation for this map. We assume Ne=10^4
    # and have a sample of 100 individuals
    tree_sequence = msprime.simulate(
        sample_size=100,
        Ne=10**4,
        recombination_map=recomb_map)
    # Now plot the density of breakpoints along the chromosome
    breakpoints = np.array(list(tree_sequence.breakpoints()))
    ax2 = ax1.twinx()
    v, bin_edges = np.histogram(breakpoints, num_bins, density=True)
    ax2.plot(bin_edges[:-1], v, color="green")
    ax2.set_ylabel("Breakpoint density")
    ax2.set_xlim(1.5e7, 5.3e7)
    fig.savefig("hapmap_chr22.svg")


def pop_example():
    if False:
        t = 100
        ts = msprime.simulate(
            Ne=10**4,
            population_configurations=[
                msprime.PopulationConfiguration(sample_size=1000),
                msprime.PopulationConfiguration(sample_size=1000),
                msprime.PopulationConfiguration(sample_size=1000),
                msprime.PopulationConfiguration(sample_size=1000),
                msprime.PopulationConfiguration(sample_size=1000)],
            demographic_events=[
                msprime.MassMigration(time=t, source=1, destination=0),
                msprime.MassMigration(time=t, source=2, destination=0),
                msprime.MassMigration(time=t, source=3, destination=0),
                msprime.MassMigration(time=t, source=4, destination=0)],
            length=100 * 1e6,
            recombination_rate=2e-8,
            mutation_rate=2e-8,
            random_seed=1)
        ts.dump("populations.hdf5")
        print(
            ts.get_sample_size(), ts.get_num_trees(),
            ts.get_num_mutations())
    else:
        ts = msprime.load("populations.hdf5")
        before = time.clock()
        R = 1
        for i in range(R):
            for j in range(5):
                samples = ts.get_samples(population_id=j)
                pi = ts.get_pairwise_diversity(samples)
                # pi2 = ts.get_pairwise_diversity2(samples)
                # print(j, pi, pi2, pi == pi2)
                # print(j, pi2)
        duration = time.clock() - before
        print("duration = ", duration, " per call = ", duration / (5 * R))


def vcf_example():

    # n = 6 # 3 diploid samples from each pop
    # t = 100
    # ts = msprime.simulate(
    #     Ne=10**4,
    #     population_configurations=[
    #         msprime.PopulationConfiguration(sample_size=n),
    #         msprime.PopulationConfiguration(sample_size=n),
    #         msprime.PopulationConfiguration(sample_size=n),
    #         msprime.PopulationConfiguration(sample_size=n),
    #         msprime.PopulationConfiguration(sample_size=n)],
    #     demographic_events=[
    #         msprime.MassMigration(time=t, source=1, destination=0),
    #         msprime.MassMigration(time=t, source=2, destination=0),
    #         msprime.MassMigration(time=t, source=3, destination=0),
    #         msprime.MassMigration(time=t, source=4, destination=0)],
    #     length=1 * 1e6,
    #     recombination_rate=2e-8,
    #     mutation_rate=2e-8,
    #     random_seed=1)
    # with open("test.vcf", "w") as f:

    #     ts.write_vcf(f, ploidy=2)

    ts = msprime.load("tmp__NOBACKUP__/populations.hdf5")
    before = time.clock()
    num_genotypes = 0
    for variant in ts.variants():
        num_genotypes += len(variant.genotypes)
    print(num_genotypes, ts.get_sample_size() * ts.get_num_mutations())
    duration = time.clock() - before
    print("Done in ", duration, " gives ",
            num_genotypes * 1e-6 / duration, " MGenotypes decoded per second")
    print(num_genotypes)


    before = time.clock()
    with open("tmp__NOBACKUP__/tmp_1.vcf", "w") as f:
        ts.write_vcf(f, ploidy=1)
        size = f.tell()
    duration = time.clock() - before
    print("wrote vcf in ", duration, "seconds", (size / 2**20) / duration, "MB/s")
    before = time.clock()
    with open("tmp__NOBACKUP__/tmp_2.vcf", "w") as f:
        ts.write_vcf(f, ploidy=2)
    duration = time.clock() - before
    print("wrote vcf in ", duration, "seconds", (size / 2**20) / duration, "MB/s")

def records_example():
    # filename = "records.txt"

    # ts = msprime.load("out.hdf5")
    # with open(filename, "w") as f:
    #     ts.write_records(f)

    # with open(filename, "r") as f:
    #     ts2 = msprime.TreeSequence.load_records(f)
    # for r1, r2 in zip(ts.records(), ts2.records()):
    #     print(r1.left, r2.left)

    ts = msprime.load_txt("example.txt")
    for t in ts.trees():
        print(t)


if __name__ == "__main__":
    # mutations()

    # plot_distance_maps(
    #     [(10, 0.1), (11, 1), (20, 0.1), (21, 1), (30, 0.1)]
    # )
    # plot_1kg_map()

    # read_1kg_map()

    # simulations()
    # convert_hdf5()
    # map_stuff()
    # new_api()
    # replicate_example()
    # migration_example()
    # segregating_sites_example(2, 5, 10000)
    # variable_recomb_example()
    # pop_example()
    # vcf_example()
    records_example()
back to top