.. _DataPrep:

Preparing input data
====================

The Python scripts ``prepare_*_cnn.py`` and ``prepare_*_flagel.py`` contain the code needed
to process the data simulated by *msprime* and convert it into compressed and labeled numpy
arrays. Below is an example for minimum :math:`d_{XY}`.

.. code-block:: python

  import numpy as np
  from random import shuffle

  if __name__ == '__main__':
      print("Preparing data for mean pairwise coalescent times:")
      for cu in ['0.5', '1.0', '2.0']:
          print("  Coalescent Units {}".format(cu))
          no_hybridization       = np.load("../raw_data/no_hybridization_{}.npz".format(cu))['min']
          no_hybridization_norm  = np.zeros((no_hybridization.shape[0],no_hybridization.shape[1],no_hybridization.shape[2],1))
          hybrid_speciation      = np.load("../raw_data/hybrid_speciation_{}.npz".format(cu))['min']
          hybrid_speciation_norm = np.zeros((hybrid_speciation.shape[0],hybrid_speciation.shape[1],hybrid_speciation.shape[2],1))
          admixture              = np.load("../raw_data/admixture_{}.npz".format(cu))['min']
          admixture_norm         = np.zeros((admixture.shape[0],admixture.shape[1],admixture.shape[2],1))
          admixture_w_gflow      = np.load("../raw_data/admixture_w_gflow_{}.npz".format(cu))['min']
          admixture_w_gflow_norm = np.zeros((admixture_w_gflow.shape[0],admixture_w_gflow.shape[1],admixture_w_gflow.shape[2],1))

          for i in range(no_hybridization.shape[0]):
              no_hybridization_norm[i,:,:,0] = no_hybridization[i,:,:] / np.max(no_hybridization[i,:,:])
              hybrid_speciation_norm[i,:,:,0] = hybrid_speciation[i,:,:] / np.max(hybrid_speciation[i,:,:])
              admixture_norm[i,:,:,0] = admixture[i,:,:] / np.max(admixture[i,:,:])
              admixture_w_gflow_norm[i,:,:,0] = admixture_w_gflow[i,:,:] / np.max(admixture_w_gflow[i,:,:])

          no_hybridization_shf = list(range(20000))
          shuffle(no_hybridization_shf)
          hybrid_speciation_shf = list(range(20000))
          shuffle(hybrid_speciation_shf)
          admixture_shf = list(range(20000))
          shuffle(admixture_shf)
          admixture_w_gflow_shf = list(range(20000))
          shuffle(admixture_w_gflow_shf)

          X_train_tmp = np.concatenate(
              (no_hybridization_norm[no_hybridization_shf[:15000],:,:,:],
               hybrid_speciation_norm[hybrid_speciation_shf[:15000],:,:,:],
               admixture_norm[admixture_shf[:15000],:,:,:],
               admixture_w_gflow_norm[admixture_w_gflow_shf[:15000],:,:,:]),
               axis=0
          )
          y_train_tmp = np.stack(
              (np.repeat((1,0,0,0), 15000),
               np.repeat((0,1,0,0), 15000),
               np.repeat((0,0,1,0), 15000),
               np.repeat((0,0,0,1), 15000)),
               axis=1
          )
          train_shf = list(range(60000))
          shuffle(train_shf)

          X_val_tmp = np.concatenate(
              (no_hybridization_norm[no_hybridization_shf[15000:17500],:,:,:],
               hybrid_speciation_norm[hybrid_speciation_shf[15000:17500],:,:,:],
               admixture_norm[admixture_shf[15000:17500],:,:,:],
               admixture_w_gflow_norm[admixture_shf[15000:17500],:,:,:]),
               axis=0
          )
          y_val_tmp = np.stack(
              (np.repeat((1,0,0,0), 2500),
               np.repeat((0,1,0,0), 2500),
               np.repeat((0,0,1,0), 2500),
               np.repeat((0,0,0,1), 2500)),
               axis=1
          )
          val_shf = list(range(10000))
          shuffle(val_shf)

          X_test_tmp = np.concatenate(
              (no_hybridization_norm[no_hybridization_shf[17500:],:,:,:],
               hybrid_speciation_norm[hybrid_speciation_shf[17500:],:,:,:],
               admixture_norm[admixture_shf[17500:],:,:,:],
               admixture_w_gflow_norm[admixture_shf[17500:],:,:,:]),
               axis=0
          )
          y_test_tmp = np.stack(
              (np.repeat((1,0,0,0), 2500),
               np.repeat((0,1,0,0), 2500),
               np.repeat((0,0,1,0), 2500),
               np.repeat((0,0,0,1), 2500)),
               axis=1
          )
          test_shf = list(range(10000))
          shuffle(test_shf)

          np.savez_compressed(
              '../processed_data/hyde_cnn_min_data_{}.npz'.format(cu),
              xtrain=X_train_tmp[train_shf,:,:,:],
              xval=X_val_tmp[val_shf,:,:,:],
              xtest=X_test_tmp[test_shf,:,:,:],
              ytrain=y_train_tmp[train_shf,:],
              yval=y_val_tmp[val_shf,:],
              ytest=y_test_tmp[test_shf,:]
          )