https://github.com/Xevkin/direkli_caprid_extended_ds
Tip revision: f803deabaa929dad5cebeec67bb0ee3b83e3c4a9 authored by Kevin on 21 September 2022, 16:14:19 UTC
Update README.md
Update README.md
Tip revision: f803dea
extended_D.py
#!/usr/bin/python
"""
I apologize for the current state of this script, it was not envisioned as being shared but in hindsight that was very unwise. It is very much not-optimized, and can be slow to run given the
size of input files
The script is currently hardcoded for my own study organism and current reference genome i.e. goat, ARS1. I will be working on making it
compliant with user-defined files and improving comments in the coming weeks - please contact me @ the email below if I have not down this or if you have questions.
This script takes a haplo.gz file as its main input, as output by ANGSD -doHaploCall.
The next five inputs define the H1, H2, H3, H4, and "true" outgroup groups/genomes.
Within each group, sample names (as in the haplo.gz header) are comma-seperated.
Within H4 I allow multiple groups to be defined; sites must be covered at least once by each H4 group and also be fixed for ancestral before a H3 derived allele is examined
H4 groups are delimited by the underscore _ character. Within each H4 group, genomes are comma seperated.
A final variable is required to turn on "single" mode. All analyses reported in the Direkli paper did not use this option, so should be set to "no"
Two example inputs are provided below:
python extended_D_Direkli4_boot_group.py H1-Genome1,H1-Genome2 H2-Genome H3-Genome H4-Group1-Genome1,H4-Group1-Genome2_H4-Group2-Genome1_H4-Group3-Genome1 Outgroup-Genome1,Outgroup-Genome2 no
python extended_D_Direkli4_boot_group.py Ganjdareh3,Ganjdareh22 Blagotin3 Direkli4 Direkli1-2,Direkli5_Ibex1,Falconeri1,Sibirica1,Tur1,Tur2_IranBezoar1,ZagrosBezoar1 Sheep1,Sheep2,Sheep3 no
Typically I will run a parallelized loop across H2-Genomes.
The regions used for bootstrap analyses are currently hardcoded into the script. The bootstrap region file is space seperated: chr block_start block_end
Note that the 11 "cutoff" thresholds refer to the frequency of the H3 derived allele among all H4 groups: <= 0%, <= 10%, <= 20%.
The calculated statistics for each of these cutoffs are outputted from the script. If only interested in those at <=0%, examine the first row of the output.
The output columns are the extended D value, standard error, Z score, number of ABBAA sites, number of nBABAA sites, and the same statistics but transversions only.
Kevin G. Daly, dalyk1@tcd.ie
"""
from __future__ import division
import sys
import gzip
import random
import numpy
#input is gzipped haplo file, list of H1, H2 group, H3 ind, list of H4s groups, grouped with _, the H5 outgroup groups, and if single mode
#create the bootstrap regions
BOOTS=[]
with open("/home/kdaly/boots.list") as FILE:
for LINE in FILE:
SPLINE = LINE.rstrip("\n").split(" ")
BOOTS.append([int(X) for X in SPLINE] + [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ])
HAPLO_FILE = sys.argv[1]
H1_GROUP = sys.argv[2].split(",")
H1_INDICES = []
H2_GROUP = sys.argv[3].split(",")
H2_INDICES = []
H3 = sys.argv[4]
H4_GROUPS = [X.split(",") for X in sys.argv[5].split("_")]
H4 = sys.argv[5].replace("_",",").split(",")
H5 = sys.argv[6].rstrip("\n").split(",")
H4_INDICES = []
H5_INDICES = []
H4_GROUP_INDICES = []
ALL_INDS = []
ALL_INDS.extend(H1_GROUP)
ALL_INDS.extend(H2_GROUP)
ALL_INDS.extend([H3])
ALL_INDS.extend(H5)
ALL_INDS.extend(H4)
SINGLE = sys.argv[7]
SINGLE_TOTAL_COUNT = 0
SINGLE_B_COUNT = 0
SINGLE_B_COUNT_TRANSV = 0
with gzip.open(HAPLO_FILE) as FILE:
for LINE in FILE:
LINE = LINE.rstrip("\n")
SPLINE = LINE.split("\t")
if LINE.startswith("chr"):
for IND in ALL_INDS:
if IND not in SPLINE:
print IND + " is not in the header. Exiting..."
raise SystemExit
for H1_INDIV in H1_GROUP:
H1_INDICES.append(SPLINE.index(H1_INDIV))
for H4_INDIV in H4:
H4_INDICES.append(SPLINE.index(H4_INDIV))
for H2_INDIV in H2_GROUP:
H2_INDICES.append(SPLINE.index(H2_INDIV))
for GROUP in H4_GROUPS:
H4_GROUP_INDICES.append([SPLINE.index(X) for X in GROUP])
H3_INDEX = SPLINE.index(H3)
for H5_INDIV in H5:
H5_INDICES.append(SPLINE.index(H5_INDIV))
#now start the actual ABAAAA BAAAA etc calculations
else:
if ([SPLINE[INDEX] for INDEX in [H3_INDEX]].count("N") == 0) and ([SPLINE[INDEX] for INDEX in H5_INDICES ].count("N") != len(H5_INDICES)) and ([SPLINE[INDEX] for INDEX in H2_INDICES ].count("N") != len(H2_INDICES)):
if SINGLE not in ["yes", "Yes", "Single"]:
if not ([SPLINE[INDEX] for INDEX in H1_INDICES ].count("N") != len(H1_INDICES)):
continue
POSITIONS = [int(X) for X in SPLINE[0:2]]
#filter on removing derived alleles present in our H4s
H4_BASES = [SPLINE[INDEX] for INDEX in H4_INDICES]
H5_BASES = [SPLINE[INDEX] for INDEX in H5_INDICES]
H2_BASES = [SPLINE[INDEX] for INDEX in H2_INDICES]
if (len(''.join(set(H5_BASES)).replace("N","")) == 1):
ANC_ALLELE = ''.join(set(H5_BASES)).replace("N","")
else:
continue
DER_ALLELE = SPLINE[H3_INDEX]
if ANC_ALLELE == DER_ALLELE:
continue
H1_BASES = [SPLINE[INDEX] for INDEX in H1_INDICES]
#randomly sample a H1 base
H1_BASE = "N"
if SINGLE not in ["yes", "Yes", "Single", "single"]:
while H1_BASE == "N":
H1_BASE = H1_BASES[random.randint(0,len(H1_BASES)-1)]
H2_BASE = "N"
while H2_BASE == "N":
H2_BASE = H2_BASES[random.randint(0,len(H2_BASES)-1)]
#get the derived allele frequency of present bases
DER_FREQ = H4_BASES.count(DER_ALLELE) / len(list(filter(lambda a: a != "N", H4) ) )
if SINGLE in ["single", "Yes", "yes", "Single"]:
MISSING = "NO"
for GROUP in H4_GROUP_INDICES:
GROUP_BASES = [SPLINE[INDEX] for INDEX in GROUP]
if GROUP_BASES.count("N") == len(GROUP):
MISSING = "YES"
if ( H4_BASES.count("N") != len(H4) ) and (MISSING == "NO"):
SINGLE_TOTAL_COUNT += 1
if (H2_BASE == DER_ALLELE) and (DER_FREQ == 0):
#print [H2_BASE, DER_ALLELE, H4_BASES, ANC_ALLELE]
SINGLE_B_COUNT += 1
if ["GA", "AG", "CT", "TC"].count("".join([ANC_ALLELE,DER_ALLELE])) == 0:
SINGLE_B_COUNT_TRANSV += 1
continue
BOOT_COUNTER = -1
#for each bootstrap region
for BOOT in BOOTS:
BOOT_COUNTER += 1
BOOT_SPLIT = BOOT
if not ( (BOOT[0] == POSITIONS[0]) and ( POSITIONS[1] >= BOOT[1]) and (POSITIONS[1] <= BOOT[2]) ):
continue
else:
#if derived allele frequency is lower than cutoff
COUNTER = -1
for DER_CUTOFF in [0.00, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
COUNTER += 1
#this is so we can filter on no missing sites in our defined groups
MISSING = "NO"
for GROUP in H4_GROUP_INDICES:
GROUP_BASES = [SPLINE[INDEX] for INDEX in GROUP]
#if any of the groups are missing data, skip
if GROUP_BASES.count("N") == len(GROUP):
MISSING = "YES"
if ( H4_BASES.count("N") != len(H4) ) and ( DER_FREQ <= DER_CUTOFF ) and (MISSING == "NO"):
# AB sites
if ( H1_BASE == ANC_ALLELE) and (H1_BASE != H2_BASE) and (H2_BASE == DER_ALLELE) :
BOOTS[BOOT_COUNTER][3][COUNTER] += 1
if ["GA", "AG", "CT", "TC"].count("".join([ANC_ALLELE,DER_ALLELE])) == 0:
BOOTS[BOOT_COUNTER][4][COUNTER] += 1
if ( H2_BASE == ANC_ALLELE) and (H1_BASE != H2_BASE) and (H1_BASE == DER_ALLELE):
BOOTS[BOOT_COUNTER][5][COUNTER] += 1
if ["GA", "AG", "CT", "TC"].count("".join([ANC_ALLELE,DER_ALLELE])) == 0:
BOOTS[BOOT_COUNTER][6][COUNTER] += 1
if SINGLE in ["yes","single", "Single"]:
print " _".join(H2_GROUP) + " " + " ".join([str(X) for X in [SINGLE_B_COUNT, SINGLE_B_COUNT_TRANSV, SINGLE_TOTAL_COUNT]])
exit()
#list of the 11 different cutoffs and extended D estimates
OUTPUT = [[] , [], [], [], [] , [], [], [], [] , [], []]
TRANSV_OUTPUT = OUTPUT
#calculate the final extended D too
DE_OUT = [ [0, 0, 0, 0] , [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0] , [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0] , [0, 0, 0, 0],[0, 0, 0, 0]]
DE_FINAL = []
#for each bootstrap region
for BLOCK in BOOTS:
#for each cutoff level
for CUTOFF in range(0,11):
DE_OUT[CUTOFF][0] += BLOCK[3][CUTOFF]
DE_OUT[CUTOFF][1] += BLOCK[4][CUTOFF]
DE_OUT[CUTOFF][2] += BLOCK[5][CUTOFF]
DE_OUT[CUTOFF][3] += BLOCK[6][CUTOFF]
for CUTOFF in range(0, 11):
#skip blocks with no data
if DE_OUT[CUTOFF][0] == 0 and DE_OUT[CUTOFF][2] == 0:
continue
else:
EXTENDED_D = (DE_OUT[CUTOFF][0] - DE_OUT[CUTOFF][2]) / (DE_OUT[CUTOFF][0] + DE_OUT[CUTOFF][2])
EXTENDED_D_TRANSV = (DE_OUT[CUTOFF][1] - DE_OUT[CUTOFF][3]) / (DE_OUT[CUTOFF][1] + DE_OUT[CUTOFF][3])
DE_FINAL.append([EXTENDED_D, EXTENDED_D_TRANSV])
#1000 replicates
for REPLICATE in range(0,1000):
#keep track of the subsampled blocks
REPLICATE_BLOCK=[]
AB_TOTAL=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
BA_TOTAL=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
AB_TRANSV_TOTAL=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
BA_TRANSV_TOTAL=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
#build a subsample of the same number of total bootstrap regions
for SUBSAMPLE in range(0, len(BOOTS)):
#randomly sample with replacement
SUBSAMPLE_INDEX = random.randrange(0, len(BOOTS))
REPLICATE_BLOCK = BOOTS[SUBSAMPLE_INDEX]
#continue to resample if the selection block has no data
while (REPLICATE_BLOCK[3][CUTOFF] == 0) and (REPLICATE_BLOCK[5][CUTOFF] == 0):
REPLICATE_BLOCK = BOOTS[random.randrange(0, len(BOOTS))]
#now generate total of AB BA sites for each of the 11 cutoffs
for CUTOFF in range(0,11):
AB_TOTAL[CUTOFF] += REPLICATE_BLOCK[3][CUTOFF]
AB_TRANSV_TOTAL[CUTOFF] += REPLICATE_BLOCK[4][CUTOFF]
BA_TOTAL[CUTOFF] += REPLICATE_BLOCK[5][CUTOFF]
BA_TRANSV_TOTAL[CUTOFF] += REPLICATE_BLOCK[6][CUTOFF]
for CUTOFF in range(0,11):
if (AB_TOTAL[CUTOFF] + BA_TOTAL[CUTOFF]) == 0:
EXTENDED_D = "NA"
EXTENDED_D_TRANSV = "NA"
OUTPUT[CUTOFF].append(EXTENDED_D)
OUTPUT[CUTOFF].append(EXTENDED_D_TRANSV)
else:
EXTENDED_D = (AB_TOTAL[CUTOFF] - BA_TOTAL[CUTOFF]) / (AB_TOTAL[CUTOFF] + BA_TOTAL[CUTOFF])
EXTENDED_D_TRANSV = (AB_TRANSV_TOTAL[CUTOFF] - BA_TRANSV_TOTAL[CUTOFF]) / (AB_TRANSV_TOTAL[CUTOFF] + BA_TRANSV_TOTAL[CUTOFF])
OUTPUT[CUTOFF].append(EXTENDED_D)
OUTPUT[CUTOFF].append(EXTENDED_D_TRANSV)
TO_PRINT = [sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5]]
for CUTOFF in range(0, 11):
if "NA" in OUTPUT[CUTOFF]:
TO_PRINT.extend(["NA_" + str(CUTOFF)])
continue
DE_FINAL_VALUE = DE_FINAL[CUTOFF][0]
DE_TRANSV_FINAL_VALUE = DE_FINAL[CUTOFF][1]
STD = numpy.std(numpy.array(OUTPUT[CUTOFF]))
STD_TRANSV = numpy.std(numpy.array(TRANSV_OUTPUT[CUTOFF]))
if STD == 0:
Z = "NA"
else:
Z = DE_FINAL_VALUE / STD
if STD_TRANSV == 0:
Z_TRANSV = "NA"
else:
Z_TRANSV = DE_TRANSV_FINAL_VALUE / STD_TRANSV
TO_PRINT.extend([str(X) for X in [DE_FINAL_VALUE, STD, Z, DE_OUT[CUTOFF][0], DE_OUT[CUTOFF][2], DE_TRANSV_FINAL_VALUE, STD_TRANSV, Z_TRANSV, DE_OUT[CUTOFF][1], DE_OUT[CUTOFF][3]]])
print " ".join(TO_PRINT)