import gzip import sys from collections import defaultdict from Bio.Seq import Seq def main(): if len(sys.argv) < 7: print("Usage: python createMAFFeatures.py ") exit(1) mafFile = sys.argv[1] speciesFile = open(sys.argv[2], 'r') featFile = sys.argv[3] chr = sys.argv[4] refSpecies = sys.argv[5] + "." + chr chromLengths = sys.argv[6] f = gzip.open(mafFile, 'rb') o = gzip.open(featFile, 'w') chromSizes = open(chromLengths, 'r') species = [] for line in speciesFile: species.append(line.strip()) species = sorted(species) print species chrLength = {} for line in chromSizes: splitLine = line.strip().split() chrLength[splitLine[0]] = int(splitLine[1]) idx = 0 # write header to output file o.write("pos") for sp in species: o.write("," + sp) o.write("\n") f.readline() # read header line scoreLine = f.readline() while scoreLine[0] == '#': scoreLine = f.readline() numBases = 0 covered = {} # dictionary of bases covered to account for blocks convering duplicate pieces while scoreLine != "": sequenceLine = f.readline() startHuman = -1 seqHuman = "" alignedToHuman = {} lastAlignment = [] reverseComplement = False # parse an alignment block while sequenceLine != "\n": lastAlignment.append(sequenceLine) splitSeq = sequenceLine.split() type = splitSeq[0] if type == 's': curSpecies = splitSeq[1] if curSpecies == refSpecies: # store info for human trueLenHuman = int(splitSeq[3]) if splitSeq[4] == '-': # reverse complement startHuman = chrLength[chr] - int(splitSeq[2]) - trueLenHuman seqHuman = Seq(splitSeq[6].upper()).reverse_complement() reverseComplement=True else: startHuman = int(splitSeq[2]) seqHuman = splitSeq[6].upper() lenHumanDash = len(seqHuman) else: curSeq = "" if reverseComplement: curSeq = Seq(splitSeq[6].upper()).reverse_complement() else: curSeq = splitSeq[6].upper() alignedToHuman[curSpecies[:curSpecies.find(".")]] = curSeq # store the aligned sequence in other species sequenceLine = f.readline() # when done with an alignment block go through and output whether each species is the same as the reference species pos = startHuman for i in range(lenHumanDash): if seqHuman[i] != '-': numBases += 1 if not (pos in covered): o.write(str(pos) + "," + seqHuman[i]) for sp in species: if (sp not in alignedToHuman) or (alignedToHuman[sp][i] == '-'): o.write(",X") else: o.write("," + str(alignedToHuman[sp][i])) o.write("\n") covered[pos] = 1 pos += 1 scoreLine = f.readline() o.close() main()