Simulating data with msprime¶
The Python scripts for simulating input data for training can be found in the run_sims.py
and models.py
files. Below is the code for each model specified using msprime.
No hybridization¶
import msprime as msp
import numpy as np
def no_hybridization(coal_units=1.0):
"""
Out P3 P2 P1
| | | |
| | | |
| | |-----|
| | |
| | |
| |-----------|
| |
| |
|------------|
|
"""
# Coalescent units are the number of 2N generations
time_units = 2.0 * 1000.0 * coal_units
# Draw divergence times
tau1 = np.random.gamma(10.0, 1.0 / 10.0)
tau2 = np.random.gamma(10.0, 1.0 / 10.0) + tau1
tau3 = np.random.gamma(20.0, 1.0 / 10.0) + tau2
# Draw mutation and recombination rates
mu = np.random.uniform(1e-8, 1e-9)
r = np.random.uniform(1e-8, 1e-9)
# Set up population configurations
population_configurations = [
msp.PopulationConfiguration(
sample_size=5,
initial_size=1000
),
msp.PopulationConfiguration(
sample_size=5,
initial_size=1000
),
msp.PopulationConfiguration(
sample_size=5,
initial_size=1000
),
msp.PopulationConfiguration(
sample_size=5,
initial_size=1000
)
]
# Set up demographic events
demographic_events = [
msp.MassMigration(
time = tau1 * time_units,
source = 0, destination=1,
proportion=1.0
),
msp.MassMigration(
time = tau2 * time_units,
source = 1, destination=2,
proportion=1.0
),
msp.MassMigration(
time = tau3 * time_units,
source = 2, destination=3,
proportion=1.0
)
]
return ([time_units, tau1, tau2, tau3, mu, r],
msp.simulate(length=1e7, mutation_rate=mu, recombination_rate=r,
population_configurations=population_configurations,
demographic_events=demographic_events))
Hybrid speciation¶
import msprime as msp
import numpy as np
def hybrid_speciation(coal_units=1.0):
"""
Out P3 P2 P1
| | | |
| | | |
| | | |
| |----/ \----|
| | |
| |-----------|
| |
| |
|------------|
|
"""
# Coalescent units are the number of 2N generations
time_units = 2.0 * 1000.0 * coal_units
# Draw divergence times
tau1 = np.random.gamma(10.0, 1.0 / 10.0)
tau2 = np.random.gamma(10.0, 1.0 / 10.0) + tau1
tau3 = np.random.gamma(20.0, 1.0 / 10.0) + tau2
# Draw mutation and recombination rates
mu = np.random.uniform(1e-8, 1e-9)
r = np.random.uniform(1e-8, 1e-9)
# Draw admixture proportion
gamma = np.random.uniform(0.25, 0.75)
# Set up population configurations
population_configurations = [
msp.PopulationConfiguration(
sample_size=5,
initial_size=1000
),
msp.PopulationConfiguration(
sample_size=5,
initial_size=1000
),
msp.PopulationConfiguration(
sample_size=5,
initial_size=1000
),
msp.PopulationConfiguration(
sample_size=5,
initial_size=1000
)
]
# Set up demographic events
demographic_events = [
msp.MassMigration(
time = tau1 * time_units,
source = 1, destination = 2,
proportion = gamma
),
msp.MassMigration(
time = tau1 * time_units + 1e-6,
source = 1, destination = 0,
proportion = 1.0
),
msp.MassMigration(
time = tau2 * time_units,
source = 0, destination = 2,
proportion = 1.0
),
msp.MassMigration(
time = tau3 * time_units,
source = 2, destination = 3,
proportion = 1.0
)
]
return ([time_units, tau1, tau2, tau3, mu, r, gamma],
msp.simulate(length=1e7, mutation_rate=mu, recombination_rate=r,
population_configurations=population_configurations,
demographic_events=demographic_events))
Admixture¶
import msprime as msp
import numpy as np
def admixture(coal_units=1.0):
"""
Out P3 P2 P1
| | | |
| |---->| |
| | | |
| | \----|
| | |
| |-----------|
| |
| |
|------------|
|
"""
# Coalescent units are the number of 2N generations
time_units = 2.0 * 1000.0 * coal_units
# Draw divergence times
tau1 = np.random.gamma(10.0, 1.0 / 10.0)
tau2 = np.random.gamma(10.0, 1.0 / 10.0) + tau1
tau3 = np.random.gamma(20.0, 1.0 / 10.0) + tau2
# Draw mutation and recombination rates
mu = np.random.uniform(1e-8, 1e-9)
r = np.random.uniform(1e-8, 1e-9)
# Draw admixture proportion
gamma = np.random.uniform(0.01, 0.25)
# Draw admixture time, restrict to occurring between
# and 10% and 90% of tau1
admix_time = np.random.uniform(0.1 * tau1, 0.9 * tau1)
# Set up population configurations
population_configurations = [
msp.PopulationConfiguration(
sample_size=5,
initial_size=1000
),
msp.PopulationConfiguration(
sample_size=5,
initial_size=1000
),
msp.PopulationConfiguration(
sample_size=5,
initial_size=1000
),
msp.PopulationConfiguration(
sample_size=5,
initial_size=1000
)
]
# Set up demographic events
demographic_events = [
msp.MassMigration(
time = admix_time * time_units,
source = 1, destination = 2,
proportion = gamma
),
msp.MassMigration(
time = tau1 * time_units,
source = 1, destination = 0,
proportion = 1.0
),
msp.MassMigration(
time = tau2 * time_units,
source = 0, destination = 2,
proportion = 1.0
),
msp.MassMigration(
time = tau3 * time_units,
source = 2, destination = 3,
proportion = 1.0
)
]
return ([time_units, tau1, tau2, tau3, mu, r, gamma, admix_time],
msp.simulate(length=1e7, mutation_rate=mu, recombination_rate=r,
population_configurations=population_configurations,
demographic_events=demographic_events))
Admixture with gene flow¶
import msprime as msp
import numpy as np
def admixture_w_gflow(coal_units=1.0):
"""
Out P3 P2 P1
| | |<--->|
| |---->|<--->|
| | |<--->|
| | \----|
| | |
| |-----------|
| |
| |
|------------|
|
"""
# Coalescent units are the number of 2N generations
time_units = 2.0 * 1000.0 * coal_units
# Draw divergence times
tau1 = np.random.gamma(10.0, 1.0 / 10.0)
tau2 = np.random.gamma(10.0, 1.0 / 10.0) + tau1
tau3 = np.random.gamma(20.0, 1.0 / 10.0) + tau2
# Draw mutation and recombination rates
mu = np.random.uniform(1e-8, 1e-9)
r = np.random.uniform(1e-8, 1e-9)
# Draw admixture proportion
gamma = np.random.uniform(0.01, 0.25)
# Draw admixture time, restrict to occurring between
# and 10% and 90% of tau1
admix_time = np.random.uniform(0.1 * tau1, 0.9 * tau1)
m = np.random.uniform(5.0e-4, 1.0e-3)
migration_matrix = [
[0, m, 0, 0],
[m, 0, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 0]
]
# Set up population configurations
population_configurations = [
msp.PopulationConfiguration(
sample_size=5,
initial_size=1000
),
msp.PopulationConfiguration(
sample_size=5,
initial_size=1000
),
msp.PopulationConfiguration(
sample_size=5,
initial_size=1000
),
msp.PopulationConfiguration(
sample_size=5,
initial_size=1000
)
]
# Set up demographic events
demographic_events = [
msp.MassMigration(
time = admix_time * time_units,
source = 1, destination = 2,
proportion = gamma
),
msp.MassMigration(
time = tau1 * time_units,
source = 1, destination = 0,
proportion = 1.0
),
msp.MigrationRateChange(
time=tau1 * time_units,
rate=0
),
msp.MassMigration(
time = tau2 * time_units,
source = 0, destination = 2,
proportion = 1.0
),
msp.MassMigration(
time = tau3 * time_units,
source = 2, destination = 3,
proportion = 1.0
)
]
return ([time_units, tau1, tau2, tau3, mu, r, gamma, admix_time, m],
msp.simulate(length=1e7, mutation_rate=mu, recombination_rate=r,
population_configurations=population_configurations,
migration_matrix=migration_matrix,
demographic_events=demographic_events))
#!/usr/bin/env python3
"""
<< run_sims.py >>
Simulate data sets from admixture/hybridization models.
Arguments
---------
For more specific details, type: run_sims.py -h
- model <string> : name of model to run ['no_hybridization','hybrid_speciation','admixture']
- cu <float> : branch scaling in coalescent units
Output
------
Writes a compressed numpy array with simulated values of nucleotide diversity (pi)
in 1000 windows across all pairwise comparisons of taxa and individuals.
"""
from models import (
no_hybridization,
hybrid_speciation,
admixture,
admixture_w_gflow
)
import numpy as np
import tskit as tsk
from random import shuffle
from sys import argv, exit
import argparse
def open_file(filename, mode):
try:
handle = open(filename, mode)
except:
print("Error: Could not open {}...".format(filename))
exit(-1)
return handle
def print_params(params, f_out):
for i in range(len(params)-1):
print("{},".format(params[i]), end='', file=f_out)
print(params[-1], file=f_out)
if (sim+1) % 50 == 0: f_out.flush()
if __name__ == "__main__":
"""
Run the script from the command line.
"""
# Print docstring if only the name of the script is given
if len(argv) < 2:
print(__doc__)
exit(0)
# Parse command line arguments
parser = argparse.ArgumentParser(description="Options for run_sims.py", add_help=True)
required = parser.add_argument_group("required arguments")
required.add_argument('-m', '--model', action="store", type=str, required=True,
metavar='\b',
help="name of model to simulate ['no_hybridization','hybrid_speciation','admixture','admixture_w_gflow']")
additional = parser.add_argument_group("additional arguments")
additional.add_argument('-cu','--coal_units', action="store", type=float, default=1.0,
metavar='\b', help="branch scaling in coalescent units")
args = parser.parse_args()
model_name = args.model
cu = args.coal_units
if model_name == "no_hybridization":
print("Running simulations for no_hybridization model...", flush=True)
model = no_hybridization
elif model_name == "hybrid_speciation":
print("Running simulations for hybrid_speciation model...", flush=True)
model = hybrid_speciation
elif model_name == "admixture":
print("Running simulations for admixture model...", flush=True)
model = admixture
elif model_name == "admixture_w_gflow":
print("Running simulations for admixture_w_gflow model...", flush=True)
model = admixture_w_gflow
else:
print("Invalid model name supplied: {}".format(model_name), flush=True)
print("The following models are currently implemented:", flush=True)
print(" no_hybridization", flush=True)
print(" hybrid_speciation", flush=True)
print(" admixture", flush=True)
print(" admixture_w_gflow", flush=True)
exit(-1)
# Specify the number of simulations
nsims = 20000
# Define sample sets
samples_P1_P2 = [[i, j+5] for i in range(5) for j in range(5)]
samples_P1_P3 = [[i, j+10] for i in range(5) for j in range(5)]
samples_P2_P3 = [[i+5, j+10] for i in range(5) for j in range(5)]
samples_P1_Out = [[i, j+15] for i in range(5) for j in range(5)]
samples_P2_Out = [[i+5, j+15] for i in range(5) for j in range(5)]
samples_P3_Out = [[i+10, j+15] for i in range(5) for j in range(5)]
# Specify the number of windows to calculate pi
nwindows = 1000
windows = [i for i in range(0,int(1e7)+1,10000)]
# Specify the print frequency for tracking completed simulations
# Set to None if you don't want to print
print_freq = 1000
# Open CSV files for recording parameters used to simulate data
model_out = open_file("{}_{}.csv".format(model_name, cu), 'w')
# Define the numpy array that will hold the simulated
# windows of pi. The size of the last dimensions (6) comes
# from the fact that we are working with 4 species 2 at a time:
# 4 choose 2 = 6.
model_mean = np.zeros((nsims, nwindows, 6))
model_min = np.zeros((nsims, nwindows, 6))
model_std = np.zeros((nsims, nwindows, 6))
for sim in range(nsims):
"""
This is the main loop for doing the simulaitons.
"""
model_params, model_ts = model(cu)
print_params(model_params, model_out)
model_mean[sim,:,0] = np.mean(model_ts.diversity(samples_P1_P2, windows=windows).T, axis=0)
model_min[sim,:,0] = np.min(model_ts.diversity(samples_P1_P2, windows=windows).T, axis=0)
model_std[sim,:,0] = np.std(model_ts.diversity(samples_P1_P2, windows=windows).T, axis=0)
model_mean[sim,:,1] = np.mean(model_ts.diversity(samples_P1_P3, windows=windows).T, axis=0)
model_min[sim,:,1] = np.min(model_ts.diversity(samples_P1_P3, windows=windows).T, axis=0)
model_std[sim,:,1] = np.std(model_ts.diversity(samples_P1_P3, windows=windows).T, axis=0)
model_mean[sim,:,2] = np.mean(model_ts.diversity(samples_P2_P3, windows=windows).T, axis=0)
model_min[sim,:,2] = np.min(model_ts.diversity(samples_P2_P3, windows=windows).T, axis=0)
model_std[sim,:,2] = np.std(model_ts.diversity(samples_P2_P3, windows=windows).T, axis=0)
model_mean[sim,:,3] = np.mean(model_ts.diversity(samples_P1_Out, windows=windows).T, axis=0)
model_min[sim,:,3] = np.min(model_ts.diversity(samples_P1_Out, windows=windows).T, axis=0)
model_std[sim,:,3] = np.std(model_ts.diversity(samples_P1_Out, windows=windows).T, axis=0)
model_mean[sim,:,4] = np.mean(model_ts.diversity(samples_P2_Out, windows=windows).T, axis=0)
model_min[sim,:,4] = np.min(model_ts.diversity(samples_P2_Out, windows=windows).T, axis=0)
model_std[sim,:,4] = np.std(model_ts.diversity(samples_P2_Out, windows=windows).T, axis=0)
model_mean[sim,:,5] = np.mean(model_ts.diversity(samples_P3_Out, windows=windows).T, axis=0)
model_min[sim,:,5] = np.min(model_ts.diversity(samples_P3_Out, windows=windows).T, axis=0)
model_std[sim,:,5] = np.std(model_ts.diversity(samples_P3_Out, windows=windows).T, axis=0)
if print_freq is not None and (sim+1) % print_freq == 0:
print("Completed {} simulation iterations...".format(sim+1), flush=True)
np.savez_compressed('{}_{}.npz'.format(model_name, cu), mean=model_mean, min=model_min, std=model_std)
print("Done.", flush=True)
References
J Kelleher, AM Etheridge, and G McVean. 2016. Efficient Coalescent Simulation and Genealogical Analysis for Large Sample Sizes. PLoS Computational Biology 12:e1004842.