# pickProbesEPICArray.py
# Picks a given number of probes from the EPIC array, ranked by the number of species each probe
# design can cover.
from __future__ import print_function
import sys
import gzip
from collections import defaultdict
import operator
def _parse_epic_design(manifest):
    """Return {(chromosome, coordinate): (Infinium type, strand)} from the EPIC manifest.

    The first line is skipped as a header. Every other line is split on commas;
    fields 1, 3 and 9 have their first and last character removed (presumably
    surrounding quotes -- confirm against the manifest) and field 2 is parsed
    as the integer coordinate.
    """
    design = {}
    next(manifest, None)  # skip the header line
    for line in manifest:
        fields = line.strip().split(",")
        site = (fields[1][1:-1], int(fields[2]))
        design[site] = (fields[9][1:-1], fields[3][1:-1])
    return design


def _parse_picked_sites(handle):
    """Return the set of (chromosome, coordinate) sites listed in the already-picked file.

    The first line is skipped as a header. Column 3 is prefixed with "chr" so
    the keys match the chromosome names taken from the EPIC manifest.
    """
    picked = set()
    next(handle, None)  # skip the header line
    for line in handle:
        fields = line.strip().split("\t")
        picked.add(("chr" + fields[3], int(fields[4])))
    return picked


def _collect_candidates(scores, epic_design, picked_sites, only_inf2, allow3):
    """Scan the all-probes file and keep probes that can be placed on the EPIC array.

    A row is kept when column 17 is "C" (converted strand), its site is not
    already picked, the site is in the EPIC manifest, and the probe strand
    (column 15: "F" means "+", anything else "-") equals the manifest strand.
    Unless ``allow3`` is truthy, rows whose column 19 exceeds 3 are dropped
    (presumably the CpG count, per the usage text -- confirm).

    Returns a pair of dicts keyed by (chromosome, coordinate): the number of
    species covered, and the output line to write for that site. A site seen
    more than once keeps its last row.
    """
    species_count = {}
    probe_line = {}
    for line in scores:
        fields = line.strip().split("\t")
        if not allow3 and int(fields[19]) > 3:
            continue
        site = ("chr" + fields[3], int(fields[4]))
        probe_strand = "+" if fields[15] == "F" else "-"
        if fields[17] != "C" or site in picked_sites or site not in epic_design:
            continue
        inf_type, epic_strand = epic_design[site]
        if epic_strand != probe_strand:
            continue

        if inf_type == "I":
            # Infinium I data lives in columns 23..29; skipped entirely when only type II is wanted.
            if only_inf2 or len(fields) <= 23:
                continue
            num_species = len(fields[23].split(","))
            species_count[site] = num_species
            probe_line[site] = ("\t".join(fields[:23]) + "\t" + str(num_species) + "\t"
                                + "\t".join(fields[23:30]) + "\tInf1\t1\t1\n")
        elif inf_type == "II":
            # Infinium II data starts at column 30 and is only present for some rows.
            if len(fields) <= 30:
                continue
            num_species = len(fields[30].split(","))
            species_count[site] = num_species
            probe_line[site] = ("\t".join(fields[:23]) + "\t" + str(num_species) + "\t"
                                + "\t".join(fields[30:]) + "\tInf2\t1\t1\n")
        else:
            print("Rogue non I or II Infinium design found.")
    return species_count, probe_line


def _write_top_sites(out, ranked_sites, epic_design, probe_line, budget):
    """Write probe lines for ``ranked_sites`` in order until ``budget`` is used up.

    An Infinium II site costs 1 and an Infinium I site costs 2. A site whose
    cost pushes the running total past ``budget`` is not written, and
    selection stops there. Selection also stops when the candidates run out,
    instead of indexing past the end of the list. Returns the running total.
    """
    used = 0
    for site in ranked_sites:
        if used >= budget:
            break
        inf_type = epic_design[site][0]
        if inf_type == "II":
            used += 1
        elif inf_type == "I":
            used += 2
        else:
            print("Rogue non I or II Infinium design found")
        if used <= budget:
            out.write(probe_line[site])
    if used < budget:
        print("Warning: ran out of candidate EPIC sites before reaching the requested number.")
    return used


def main():
    """Pick EPIC array probes ranked by species coverage and write them to a gzip file.

    Command-line arguments (all input/output files are gzip-compressed text):
      1. file of sites already picked (tab-separated, with header)
      2. all probes file (tab-separated)
      3. EPIC manifest file (comma-separated, with header)
      4. number of EPIC sites to pick
      5. 1 to consider only Infinium II designs, 0 otherwise
      6. output file
      7. 1 to allow probes with more than 3 CpGs, 0 otherwise

    Exits with status 1 after printing a usage message when the argument
    count is wrong.
    """
    if len(sys.argv) != 8:
        print("Usage: python pickProbesEPICArray.py <file of sites already picked> <all probes file> <EPIC manifest file> <num EPIC sites> <only Infinium II 0/1> <output file> <allow >3 CpGs 0/1>")
        sys.exit(1)

    # Parse the numeric options first so a bad value fails before any file is opened.
    numEPICsites = int(sys.argv[4])
    onlyInf2 = int(sys.argv[5])
    allow3 = int(sys.argv[7])

    # The with-statement guarantees every handle is closed, even if parsing raises.
    with gzip.open(sys.argv[1], 'rt') as existingSites, \
            gzip.open(sys.argv[2], 'rt') as allScoresSites, \
            gzip.open(sys.argv[3], 'rt') as EPICfile, \
            gzip.open(sys.argv[6], 'wt') as oFile:
        print("Parsing file for EPIC probe design . . .")
        EPICdesign = _parse_epic_design(EPICfile)
        print("Done.")

        print("Parsing file for sites already picked . . .")
        pickedSites = _parse_picked_sites(existingSites)
        print("Done.")

        print("Parsing file for probes that aren't already included and are on epic array. . .")
        numSpeciesEPIC, probePicked = _collect_candidates(
            allScoresSites, EPICdesign, pickedSites, onlyInf2, allow3)
        print("Done.")

        print("Sorting EPIC sites that weren't already picked by how many species they cover. . .")
        # Stable ascending sort followed by reverse(): ties come out in reverse
        # insertion order. sorted(..., reverse=True) would order ties differently
        # and therefore change which probes are written.
        sorted_CGsites_EPIC = sorted(numSpeciesEPIC.items(), key=operator.itemgetter(1))
        sorted_CGsites_EPIC.reverse()
        print("Done.")

        print("Picking EPIC sites. . .")
        _write_top_sites(oFile, [site for site, _ in sorted_CGsites_EPIC],
                         EPICdesign, probePicked, numEPICsites)
        print("Done.")
# Run the pipeline only when this file is executed as a script, so that
# importing the module does not parse sys.argv or touch any files.
if __name__ == "__main__":
    main()