import gzip
import sys
from collections import defaultdict
from Bio.Seq import Seq
def main():
if len(sys.argv) < 7:
print("Usage: python createMAFFeatures.py <alignment file> <species names file> <output file> <chromosome> <reference species> <reference species chromosome length file>")
exit(1)
mafFile = sys.argv[1]
speciesFile = open(sys.argv[2], 'r')
featFile = sys.argv[3]
chr = sys.argv[4]
refSpecies = sys.argv[5] + "." + chr
chromLengths = sys.argv[6]
f = gzip.open(mafFile, 'rb')
o = gzip.open(featFile, 'w')
chromSizes = open(chromLengths, 'r')
species = []
for line in speciesFile:
species.append(line.strip())
species = sorted(species)
print species
chrLength = {}
for line in chromSizes:
splitLine = line.strip().split()
chrLength[splitLine[0]] = int(splitLine[1])
idx = 0
# write header to output file
o.write("pos")
for sp in species:
o.write("," + sp)
o.write("\n")
f.readline() # read header line
scoreLine = f.readline()
while scoreLine[0] == '#':
scoreLine = f.readline()
numBases = 0
covered = {} # dictionary of bases covered to account for blocks convering duplicate pieces
while scoreLine != "":
sequenceLine = f.readline()
startHuman = -1
seqHuman = ""
alignedToHuman = {}
lastAlignment = []
reverseComplement = False
# parse an alignment block
while sequenceLine != "\n":
lastAlignment.append(sequenceLine)
splitSeq = sequenceLine.split()
type = splitSeq[0]
if type == 's':
curSpecies = splitSeq[1]
if curSpecies == refSpecies: # store info for human
trueLenHuman = int(splitSeq[3])
if splitSeq[4] == '-': # reverse complement
startHuman = chrLength[chr] - int(splitSeq[2]) - trueLenHuman
seqHuman = Seq(splitSeq[6].upper()).reverse_complement()
reverseComplement=True
else:
startHuman = int(splitSeq[2])
seqHuman = splitSeq[6].upper()
lenHumanDash = len(seqHuman)
else:
curSeq = ""
if reverseComplement:
curSeq = Seq(splitSeq[6].upper()).reverse_complement()
else:
curSeq = splitSeq[6].upper()
alignedToHuman[curSpecies[:curSpecies.find(".")]] = curSeq # store the aligned sequence in other species
sequenceLine = f.readline()
# when done with an alignment block go through and output whether each species is the same as the reference species
pos = startHuman
for i in range(lenHumanDash):
if seqHuman[i] != '-':
numBases += 1
if not (pos in covered):
o.write(str(pos) + "," + seqHuman[i])
for sp in species:
if (sp not in alignedToHuman) or (alignedToHuman[sp][i] == '-'):
o.write(",X")
else:
o.write("," + str(alignedToHuman[sp][i]))
o.write("\n")
covered[pos] = 1
pos += 1
scoreLine = f.readline()
o.close()
main()