Empirical example: Heliconius butterfliesΒΆ
The scripts for simulating, training, and testing HyDe-CNN for Heliconius can
be found in the heliconius/
folder on GitHub and are run in the same way as
the scripts for the simulation-based evaluations. Below is the code for analyzing
100 random samples of individuals
# Import packages
import numpy as np
import pysam
from tensorflow.keras.models import load_model
# Load trained models
min_cnn = load_model('heliconius_cnn_min.mdl')
mean_cnn = load_model('heliconius_cnn_mean.mdl')
min_mean_cnn = load_model('heliconius_cnn_min-mean.mdl')
min_flagel = load_model('heliconius_flagel_min.mdl')
mean_flagel = load_model('heliconius_flagel_mean.mdl')
# Define container class for calculating pi
class HeliconiusPi():
"""
Simple container class for storing relevant information for
the Heliconius samples. Doesn't do anything fancy, just allows
access to data.
"""
def __init__(self):
self.HeliconiusIdx = {
'ama' : [i for i in range(10)],
'chi' : [i for i in range(10,20)],
'flo' : [i for i in range(20,30)],
'mal' : [i for i in range(30,40)],
'melG' : [i for i in range(40,50)],
'num' : [50,51],
'ros' : [i for i in range(52,62)],
'txn' : [i for i in range(62,72)],
'vul' : [i for i in range(72,82)],
'zel' : [i for i in range(82,92)],
'melW' : [i for i in range(52,62)]+[i for i in range(72,82)],
'melE' : [i for i in range(10)]+[i for i in range(30,40)],
'cyd' : [i for i in range(10,20)]+[i for i in range(82,92)],
'tim' : [i for i in range(20,30)]+[i for i in range(62,72)]
}
self.vcf_file = "heliconius_chr5.vcf.gz"
self.vcf = pysam.VariantFile(self.vcf_file)
print("Preparing to process vcf file {}".format(self.vcf_file))
def __call__(self,p1,p2,p3):
"""
Makes any instance of the class callable as a function, which
does all of the sampling and returns an imput image.
"""
pi_mat = np.zeros((950,6,2))
windows = np.linspace(200000,9700000,950+1)
p1_idx, p2_idx, p3_idx = self.get_random_idx(p1,p2,p3)
print("Calculating pi for the following taxa:")
print(" {}: individuals {}".format(p1, p1_idx))
print(" {}: individuals {}".format(p2, p2_idx))
print(" {}: individuals {}".format(p3, p3_idx))
for w in range(950):
gt = self.get_gts(windows[w],windows[w+1])
_,pi_mat[w,0,0],pi_mat[w,0,1] = self.calc_pi(gt,p1_idx,p2_idx)
_,pi_mat[w,1,0],pi_mat[w,1,1] = self.calc_pi(gt,p1_idx,p3_idx)
_,pi_mat[w,2,0],pi_mat[w,2,1] = self.calc_pi(gt,p2_idx,p3_idx)
_,pi_mat[w,3,0],pi_mat[w,3,1] = self.calc_pi(gt,p1_idx,self.HeliconiusIdx['num'])
_,pi_mat[w,4,0],pi_mat[w,4,1] = self.calc_pi(gt,p2_idx,self.HeliconiusIdx['num'])
_,pi_mat[w,5,0],pi_mat[w,5,1] = self.calc_pi(gt,p3_idx,self.HeliconiusIdx['num'])
return pi_mat
def get_random_idx(self,p1,p2,p3):
"""
Randomly sample 5 individuals from melW, melE, and cyd to calculate
pairwise nucleotide diversity.
"""
p1_idx = np.random.choice(self.HeliconiusIdx[p1], 5, replace=False)
p2_idx = np.random.choice(self.HeliconiusIdx[p2], 5, replace=False)
p3_idx = np.random.choice(self.HeliconiusIdx[p3], 5, replace=False)
return (p1_idx,p2_idx,p3_idx)
def get_gts(self,start,stop):
"""
Retrieves the genotypes for all individuals within the window
[start,stop). 0-indexed.
"""
gts = []
for rec in self.vcf.fetch('chr5',start-1,stop-1):
gts.append([s['GT'] for s in rec.samples.values()])
return gts
def calc_pi(self,gt,idx1,idx2):
"""
"""
def check_if_same(gt1,gt2):
if gt1 is None or gt2 is None:
return -9
elif gt1 != gt2:
return 1
else:
return 0
pi_array = []
nsnps = len(gt)
for i in range(len(idx1)):
for j in range(len(idx2)):
pi_site = 0.0
for l in range(nsnps):
if gt[l][idx1[i]][0] is None or gt[l][idx2[j]][0] is None:
continue
ct = Counter([gt[l][idx1[i]][0],gt[l][idx2[j]][0],gt[l][idx1[i]][1],gt[l][idx2[j]][1]])
x = list(ct.values())[0]
pi_site += x*(4.0-x)/12.0
pi_array += [pi_site]
return (pi_array,np.min(pi_array),np.mean(pi_array))
# Instantiate class
hpi = HeliconiusPi()
# Run 100 replicate tests and store
min_cnn_res = []
mean_cnn_res = []
min_mean_cnn_res = []
min_flagel_res = []
for i in range(100):
mat = hpi('melE','melW','cyd')
min_dat = np.zeros((1,950,6,1))
min_dat[0,:,:,0] = mat[:,:,0]/np.max(mat[:,:,0])
mean_dat = np.zeros((1,950,6,1))
mean_dat[0,:,:,0] = mat[:,:,1]/np.max(mat[:,:,1])
min_mean_dat = np.zeros((1,950,6,2))
min_mean_dat[0,:,:,0] = mat[:,:,0]/np.max(mat[:,:,0])
min_mean_dat[0,:,:,1] = mat[:,:,1]/np.max(mat[:,:,1])
min_dat_flagel = np.zeros((1,950,6))
min_dat_flagel[0,:,:] = mat[:,:,0]/np.max(mat[:,:,0])
min_cnn_res.append(min_cnn.predict(min_dat)[0])
mean_cnn_res.append(mean_cnn.predict(mean_dat)[0])
min_mean_cnn_res.append(min_mean_cnn.predict(min_mean_dat)[0])
min_flagel_res.append(min_flagel.predict(min_dat_flagel)[0])
print('Finished iteration {}...'.format(i+1), flush=True)
# Get average prediction accuracy
print("Prediction for min CNN: {}".format(np.mean(np.array(min_cnn_res),axis=0)))
print("Prediction for mean CNN: {}".format(np.mean(np.array(mean_cnn_res),axis=0)))
print("Prediction for min-mean CNN: {}".format(np.mean(np.array(min_mean_cnn_res),axis=0)))
print("Prediction for flagel CNN: {}".format(np.mean(np.array(min_flagel_res),axis=0)))
# Also, save results to file
np.savetxt("min_cnn_res.csv",np.array(min_cnn_res),delimiter=",")
np.savetxt("meann_cnn_res.csv",np.array(mean_cnn_res),delimiter=",")
np.savetxt("min_mean_cnn_res.csv",np.array(min_mean_cnn_res),delimiter=",")
np.savetxt("min_flagel_res.csv",np.array(min_flagel_res),delimiter=",")
References
SH Martin, JW Davey, C Salazar, and CD Jiggins. 2019. Recombination rate variation shapes barriers to introgression across butterfly genomes. PLoS Biology 17:e2006288.
Dryad link for Heliconius VCFs: https://doi.org/10.5061/dryad.sk2pd88.