.. _Simulation:

Simulating data with *msprime*
==============================

The Python scripts for simulating input data for training can be found in the ``run_sims.py``
and ``models.py`` files. Below is the code for each model specified using *msprime*.

No hybridization
----------------

.. code-block:: python

  import msprime as msp
  import numpy as np

  def no_hybridization(coal_units=1.0):
      """
      Out    P3    P2    P1
      |      |     |     |
      |      |     |     |
      |      |     |-----|
      |      |           |
      |      |           |
      |      |-----------|
      |            |
      |            |
      |------------|
            |


      """
      # Coalescent units are the number of 2N generations
      time_units = 2.0 * 1000.0 * coal_units

      # Draw divergence times
      tau1 = np.random.gamma(10.0, 1.0 / 10.0)
      tau2 = np.random.gamma(10.0, 1.0 / 10.0) + tau1
      tau3 = np.random.gamma(20.0, 1.0 / 10.0) + tau2

      # Draw mutation and recombination rates
      mu = np.random.uniform(1e-8, 1e-9)
      r  = np.random.uniform(1e-8, 1e-9)

      # Set up population configurations
      population_configurations = [
          msp.PopulationConfiguration(
              sample_size=5,
              initial_size=1000
          ),
          msp.PopulationConfiguration(
              sample_size=5,
              initial_size=1000
          ),
          msp.PopulationConfiguration(
              sample_size=5,
              initial_size=1000
          ),
          msp.PopulationConfiguration(
              sample_size=5,
              initial_size=1000
          )
      ]

      # Set up demographic events
      demographic_events = [
          msp.MassMigration(
              time = tau1 * time_units,
              source = 0, destination=1,
              proportion=1.0
          ),
          msp.MassMigration(
              time = tau2 * time_units,
              source = 1, destination=2,
              proportion=1.0
          ),
          msp.MassMigration(
              time = tau3 * time_units,
              source = 2, destination=3,
              proportion=1.0
          )
      ]

      return ([time_units, tau1, tau2, tau3, mu, r],
              msp.simulate(length=1e7, mutation_rate=mu, recombination_rate=r,
                           population_configurations=population_configurations,
                           demographic_events=demographic_events))

Hybrid speciation
-----------------

.. code-block:: python

  import msprime as msp
  import numpy as np

  def hybrid_speciation(coal_units=1.0):
      """
      Out    P3    P2    P1
       |      |     |     |
       |      |     |     |
       |      |     |     |
       |      |----/ \----|
       |      |           |
       |      |-----------|
       |            |
       |            |
       |------------|
              |


      """
      # Coalescent units are the number of 2N generations
      time_units = 2.0 * 1000.0 * coal_units

      # Draw divergence times
      tau1 = np.random.gamma(10.0, 1.0 / 10.0)
      tau2 = np.random.gamma(10.0, 1.0 / 10.0) + tau1
      tau3 = np.random.gamma(20.0, 1.0 / 10.0) + tau2

      # Draw mutation and recombination rates
      mu = np.random.uniform(1e-8, 1e-9)
      r  = np.random.uniform(1e-8, 1e-9)

      # Draw admixture proportion
      gamma = np.random.uniform(0.25, 0.75)

      # Set up population configurations
      population_configurations = [
          msp.PopulationConfiguration(
              sample_size=5,
              initial_size=1000
          ),
          msp.PopulationConfiguration(
              sample_size=5,
              initial_size=1000
          ),
          msp.PopulationConfiguration(
              sample_size=5,
              initial_size=1000
          ),
          msp.PopulationConfiguration(
              sample_size=5,
              initial_size=1000
          )
      ]

      # Set up demographic events
      demographic_events = [
          msp.MassMigration(
              time = tau1 * time_units,
              source = 1, destination = 2,
              proportion = gamma
          ),
          msp.MassMigration(
              time = tau1 * time_units + 1e-6,
              source = 1, destination = 0,
              proportion = 1.0
          ),
          msp.MassMigration(
              time = tau2 * time_units,
              source = 0, destination = 2,
              proportion = 1.0
          ),
          msp.MassMigration(
              time = tau3 * time_units,
              source = 2, destination = 3,
              proportion = 1.0
          )
      ]

      return ([time_units, tau1, tau2, tau3, mu, r, gamma],
              msp.simulate(length=1e7, mutation_rate=mu, recombination_rate=r,
                           population_configurations=population_configurations,
                           demographic_events=demographic_events))

Admixture
---------

.. code-block:: python

  import msprime as msp
  import numpy as np
  def admixture(coal_units=1.0):
      """
      Out    P3    P2    P1
       |      |     |     |
       |      |---->|     |
       |      |     |     |
       |      |      \----|
       |      |           |
       |      |-----------|
       |            |
       |            |
       |------------|
              |


      """
      # Coalescent units are the number of 2N generations
      time_units = 2.0 * 1000.0 * coal_units

      # Draw divergence times
      tau1 = np.random.gamma(10.0, 1.0 / 10.0)
      tau2 = np.random.gamma(10.0, 1.0 / 10.0) + tau1
      tau3 = np.random.gamma(20.0, 1.0 / 10.0) + tau2

      # Draw mutation and recombination rates
      mu = np.random.uniform(1e-8, 1e-9)
      r  = np.random.uniform(1e-8, 1e-9)

      # Draw admixture proportion
      gamma = np.random.uniform(0.01, 0.25)

      # Draw admixture time, restrict to occurring between
      # and 10% and 90% of tau1
      admix_time = np.random.uniform(0.1 * tau1, 0.9 * tau1)

      # Set up population configurations
      population_configurations = [
          msp.PopulationConfiguration(
              sample_size=5,
              initial_size=1000
          ),
          msp.PopulationConfiguration(
              sample_size=5,
              initial_size=1000
          ),
          msp.PopulationConfiguration(
              sample_size=5,
              initial_size=1000
          ),
          msp.PopulationConfiguration(
              sample_size=5,
              initial_size=1000
          )
      ]

      # Set up demographic events
      demographic_events = [
          msp.MassMigration(
              time = admix_time * time_units,
              source = 1, destination = 2,
              proportion = gamma
          ),
          msp.MassMigration(
              time = tau1 * time_units,
              source = 1, destination = 0,
              proportion = 1.0
          ),
          msp.MassMigration(
              time = tau2 * time_units,
              source = 0, destination = 2,
              proportion = 1.0
          ),
          msp.MassMigration(
              time = tau3 * time_units,
              source = 2, destination = 3,
              proportion = 1.0
          )
      ]

      return ([time_units, tau1, tau2, tau3, mu, r, gamma, admix_time],
              msp.simulate(length=1e7, mutation_rate=mu, recombination_rate=r,
                           population_configurations=population_configurations,
                           demographic_events=demographic_events))

Admixture with gene flow
------------------------

.. code-block:: python

  import msprime as msp
  import numpy as np

  def admixture_w_gflow(coal_units=1.0):
      """
      Out    P3    P2    P1
       |      |     |<--->|
       |      |---->|<--->|
       |      |     |<--->|
       |      |      \----|
       |      |           |
       |      |-----------|
       |            |
       |            |
       |------------|
              |

      """
      # Coalescent units are the number of 2N generations
      time_units = 2.0 * 1000.0 * coal_units

      # Draw divergence times
      tau1 = np.random.gamma(10.0, 1.0 / 10.0)
      tau2 = np.random.gamma(10.0, 1.0 / 10.0) + tau1
      tau3 = np.random.gamma(20.0, 1.0 / 10.0) + tau2

      # Draw mutation and recombination rates
      mu = np.random.uniform(1e-8, 1e-9)
      r  = np.random.uniform(1e-8, 1e-9)

      # Draw admixture proportion
      gamma = np.random.uniform(0.01, 0.25)

      # Draw admixture time, restrict to occurring between
      # and 10% and 90% of tau1
      admix_time = np.random.uniform(0.1 * tau1, 0.9 * tau1)

      m = np.random.uniform(5.0e-4, 1.0e-3)

      migration_matrix = [
          [0, m, 0, 0],
          [m, 0, 0, 0],
          [0, 0, 0, 0],
          [0, 0, 0, 0]
      ]

      # Set up population configurations
      population_configurations = [
          msp.PopulationConfiguration(
              sample_size=5,
              initial_size=1000
          ),
          msp.PopulationConfiguration(
              sample_size=5,
              initial_size=1000
          ),
          msp.PopulationConfiguration(
              sample_size=5,
              initial_size=1000
          ),
          msp.PopulationConfiguration(
              sample_size=5,
              initial_size=1000
          )
      ]

      # Set up demographic events
      demographic_events = [
          msp.MassMigration(
              time = admix_time * time_units,
              source = 1, destination = 2,
              proportion = gamma
          ),
          msp.MassMigration(
              time = tau1 * time_units,
              source = 1, destination = 0,
              proportion = 1.0
          ),
          msp.MigrationRateChange(
              time=tau1 * time_units,
              rate=0
          ),
          msp.MassMigration(
              time = tau2 * time_units,
              source = 0, destination = 2,
              proportion = 1.0
          ),
          msp.MassMigration(
              time = tau3 * time_units,
              source = 2, destination = 3,
              proportion = 1.0
          )
      ]

      return ([time_units, tau1, tau2, tau3, mu, r, gamma, admix_time, m],
              msp.simulate(length=1e7, mutation_rate=mu, recombination_rate=r,
                           population_configurations=population_configurations,
                           migration_matrix=migration_matrix,
                           demographic_events=demographic_events))

.. code-block:: python

  #!/usr/bin/env python3

  """
  << run_sims.py >>

  Simulate data sets from admixture/hybridization models.

  Arguments
  ---------

  For more specific details, type: run_sims.py -h

      - model <string> : name of model to run ['no_hybridization','hybrid_speciation','admixture']
      - cu    <float>  : branch scaling in coalescent units

  Output
  ------

  Writes a compressed numpy array with simulated values of nucleotide diversity (pi)
  in 1000 windows across all pairwise comparisons of taxa and individuals.
  """

  from models import (
      no_hybridization,
      hybrid_speciation,
      admixture,
      admixture_w_gflow
  )
  import numpy as np
  import tskit as tsk
  from random import shuffle
  from sys import argv, exit
  import argparse

  def open_file(filename, mode):
      try:
          handle = open(filename, mode)
      except:
          print("Error: Could not open {}...".format(filename))
          exit(-1)
      return handle

  def print_params(params, f_out):
      for i in range(len(params)-1):
          print("{},".format(params[i]), end='', file=f_out)
      print(params[-1], file=f_out)
      if (sim+1) % 50 == 0: f_out.flush()

  if __name__ == "__main__":
      """
      Run the script from the command line.
      """
      # Print docstring if only the name of the script is given
      if len(argv) < 2:
          print(__doc__)
          exit(0)

      # Parse command line arguments
      parser = argparse.ArgumentParser(description="Options for run_sims.py", add_help=True)

      required = parser.add_argument_group("required arguments")
      required.add_argument('-m', '--model', action="store", type=str, required=True,
                            metavar='\b',
                            help="name of model to simulate ['no_hybridization','hybrid_speciation','admixture','admixture_w_gflow']")
      additional = parser.add_argument_group("additional arguments")
      additional.add_argument('-cu','--coal_units', action="store", type=float, default=1.0,
                              metavar='\b', help="branch scaling in coalescent units")

      args       = parser.parse_args()
      model_name = args.model
      cu         = args.coal_units

      if model_name == "no_hybridization":
          print("Running simulations for no_hybridization model...", flush=True)
          model = no_hybridization
      elif model_name == "hybrid_speciation":
          print("Running simulations for hybrid_speciation model...", flush=True)
          model = hybrid_speciation
      elif model_name == "admixture":
          print("Running simulations for admixture model...", flush=True)
          model = admixture
      elif model_name == "admixture_w_gflow":
          print("Running simulations for admixture_w_gflow model...", flush=True)
          model = admixture_w_gflow
      else:
          print("Invalid model name supplied: {}".format(model_name), flush=True)
          print("The following models are currently implemented:", flush=True)
          print("  no_hybridization", flush=True)
          print("  hybrid_speciation", flush=True)
          print("  admixture", flush=True)
          print("  admixture_w_gflow", flush=True)
          exit(-1)

      # Specify the number of simulations
      nsims = 20000

      # Define sample sets
      samples_P1_P2  = [[i, j+5] for i in range(5) for j in range(5)]
      samples_P1_P3  = [[i, j+10] for i in range(5) for j in range(5)]
      samples_P2_P3  = [[i+5, j+10] for i in range(5) for j in range(5)]
      samples_P1_Out = [[i, j+15] for i in range(5) for j in range(5)]
      samples_P2_Out = [[i+5, j+15] for i in range(5) for j in range(5)]
      samples_P3_Out = [[i+10, j+15] for i in range(5) for j in range(5)]

      # Specify the number of windows to calculate pi
      nwindows = 1000
      windows = [i for i in range(0,int(1e7)+1,10000)]

      # Specify the print frequency for tracking completed simulations
      # Set to None if you don't want to print
      print_freq = 1000

      # Open CSV files for recording parameters used to simulate data
      model_out  = open_file("{}_{}.csv".format(model_name, cu), 'w')

      # Define the numpy array that will hold the simulated
      # windows of pi. The size of the last dimensions (6) comes
      # from the fact that we are working with 4 species 2 at a time:
      # 4 choose 2 = 6.
      model_mean = np.zeros((nsims, nwindows, 6))
      model_min  = np.zeros((nsims, nwindows, 6))
      model_std  = np.zeros((nsims, nwindows, 6))

      for sim in range(nsims):
          """
          This is the main loop for doing the simulaitons.
          """
          model_params, model_ts = model(cu)
          print_params(model_params, model_out)

          model_mean[sim,:,0] = np.mean(model_ts.diversity(samples_P1_P2, windows=windows).T, axis=0)
          model_min[sim,:,0]  = np.min(model_ts.diversity(samples_P1_P2, windows=windows).T, axis=0)
          model_std[sim,:,0]  = np.std(model_ts.diversity(samples_P1_P2, windows=windows).T, axis=0)

          model_mean[sim,:,1] = np.mean(model_ts.diversity(samples_P1_P3, windows=windows).T, axis=0)
          model_min[sim,:,1]  = np.min(model_ts.diversity(samples_P1_P3, windows=windows).T, axis=0)
          model_std[sim,:,1]  = np.std(model_ts.diversity(samples_P1_P3, windows=windows).T, axis=0)

          model_mean[sim,:,2] = np.mean(model_ts.diversity(samples_P2_P3, windows=windows).T, axis=0)
          model_min[sim,:,2]  = np.min(model_ts.diversity(samples_P2_P3, windows=windows).T, axis=0)
          model_std[sim,:,2]  = np.std(model_ts.diversity(samples_P2_P3, windows=windows).T, axis=0)

          model_mean[sim,:,3] = np.mean(model_ts.diversity(samples_P1_Out, windows=windows).T, axis=0)
          model_min[sim,:,3]  = np.min(model_ts.diversity(samples_P1_Out, windows=windows).T, axis=0)
          model_std[sim,:,3]  = np.std(model_ts.diversity(samples_P1_Out, windows=windows).T, axis=0)

          model_mean[sim,:,4] = np.mean(model_ts.diversity(samples_P2_Out, windows=windows).T, axis=0)
          model_min[sim,:,4]  = np.min(model_ts.diversity(samples_P2_Out, windows=windows).T, axis=0)
          model_std[sim,:,4]  = np.std(model_ts.diversity(samples_P2_Out, windows=windows).T, axis=0)

          model_mean[sim,:,5] = np.mean(model_ts.diversity(samples_P3_Out, windows=windows).T, axis=0)
          model_min[sim,:,5]  = np.min(model_ts.diversity(samples_P3_Out, windows=windows).T, axis=0)
          model_std[sim,:,5]  = np.std(model_ts.diversity(samples_P3_Out, windows=windows).T, axis=0)

          if print_freq is not None and (sim+1) % print_freq == 0:
              print("Completed {} simulation iterations...".format(sim+1), flush=True)

      np.savez_compressed('{}_{}.npz'.format(model_name, cu), mean=model_mean, min=model_min, std=model_std)
      print("Done.", flush=True)

----

**References**

  - J Kelleher, AM Etheridge, and G McVean. 2016. Efficient Coalescent Simulation
    and Genealogical Analysis for Large Sample Sizes. *PLoS Computational Biology*
    12:e1004842.