import gzip import argparse from collections import deque def add_species_coords(main_args): probes_file = gzip.open(main_args.probes_file, 'rt') maf_seq_file = gzip.open(main_args.maf_seq_file, 'rt') output_file = gzip.open(main_args.output_file, 'wt') isMammal = {"hg19": 1,"panTro4": 1,"gorGor3": 1,"ponAbe2": 1,"nomLeu3": 1,"rheMac3": 1,"macFas5": 1,"papHam1": 1,"chlSab1": 1,"calJac3": 1,"saiBol1": 1,"otoGar3": 1,"tupChi1": 1,"speTri2": 1,"jacJac1": 1,"micOch1": 1,"criGri1": 1,"mesAur1": 1,"mm10": 1,"rn5": 1,"hetGla2": 1,"cavPor3": 1,"chiLan1": 1,"octDeg1": 1,"oryCun2": 1,"ochPri3": 1,"susScr3": 1,"vicPac2": 1,"camFer1": 1,"turTru2": 1,"orcOrc1": 1,"panHod1": 1,"bosTau7": 1,"oviAri3": 1,"capHir1": 1,"equCab2": 1,"cerSim1": 1,"felCat5": 1,"canFam3": 1,"musFur1": 1,"ailMel1": 1,"odoRosDiv1": 1,"lepWed1": 1,"pteAle1": 1,"pteVam1": 1,"myoDav1": 1,"myoLuc2": 1,"eptFus1": 1,"eriEur2": 1,"sorAra2": 1,"conCri1": 1,"loxAfr3": 1,"eleEdw1": 1,"triMan1": 1,"chrAsi1": 1,"echTel2": 1,"oryAfe1": 1,"dasNov3": 1,"monDom5": 1,"sarHar1": 1,"macEug2": 1,"ornAna1": 1, "falChe1": 0,"falPer1": 0,"ficAlb2": 0,"zonAlb1": 0,"geoFor1": 0,"taeGut2": 0,"pseHum1": 0,"melUnd1": 0,"amaVit1": 0,"araMac1": 0,"colLiv1": 0,"anaPla1": 0,"galGal4": 0,"melGal1": 0,"allMis1": 0,"cheMyd1": 0,"chrPic1": 0,"pelSin1": 0,"apaSpi1": 0,"anoCar2": 0,"xenTro7": 0,"latCha1": 0,"tetNig2": 0,"fr3": 0,"takFla1": 0,"oreNil2": 0,"neoBri1": 0,"hapBur1": 0,"mayZeb1": 0,"punNye1": 0,"oryLat2": 0,"xipMac1": 0,"gasAcu1": 0,"gadMor1": 0,"danRer7": 0,"astMex1": 0,"lepOcu1": 0,"petMar2": 0} end_ID = {} end_probes = {} species_dict = {} first_line = True for line in probes_file: split_line = line.strip().split() probe_ID = split_line[0] end_probes[probe_ID] = int(split_line[26]) end_ID[int(split_line[26])] = probe_ID species_dict[probe_ID] = split_line[24].split(',') header_line = maf_seq_file.readline().strip().split(',') output_file.write('probeID\thg19_start\thg19end\t') mammal_list = [] for i in header_line: species = i.strip().split('_')[1] if species not in mammal_list: if isMammal[species] and species != 'hg19': output_file.write(species + '_start\t' + species + '_end\t' + species + '_has_insertion\t' + species + \ '_is_split\t') mammal_list.append(species) output_file.write('\n') count_insertion_issues = 0 count_probes_done = 0 pos_reference = header_line.index('pos_hg19') last_50_lines = deque() for line in maf_seq_file: split_line = line.strip().split(',') last_50_lines.append(split_line) pos_hg19 = split_line[pos_reference].split(':')[1] #print(pos_hg19) if pos_hg19 == 'd': # if deletion at current position in reference continue continue end_human = int(last_50_lines[-1][pos_reference].split(':')[1]) start_human = int(last_50_lines[0][pos_reference].split(':')[1]) while (end_human - start_human >= 50): last_50_lines.popleft() while last_50_lines[0][pos_reference].split(':')[1] == 'd': last_50_lines.popleft() end_human = int(last_50_lines[-1][pos_reference].split(':')[1]) start_human = int(last_50_lines[0][pos_reference].split(':')[1]) pos_hg19 = int(pos_hg19) if pos_hg19 in end_ID: #for prev_line in last_50_lines: # have to make sure going over last 50 lines uninterrupted by deletion # in human # print(prev_line[pos_reference + 2], sep='', end = '') #print('\n') cur_probe = end_ID[pos_hg19] output_file.write(cur_probe + '\t' + last_50_lines[0][pos_reference] + '\t' + last_50_lines[-1][ pos_reference] + '\t') found_ins = False for sp in mammal_list: if sp in species_dict[cur_probe]: idx_species = header_line.index('pos_' + sp) strand_species = header_line.index('strand_' + sp) #print(sp) #print(idx_species) has_ins = False for prev_line in last_50_lines: # have to make sure going over last 50 lines uninterrupted by deletion in human # print(prev_line[idx_species + 2], sep='', end = '') if prev_line[pos_reference] == 'd' and prev_line[idx_species] != 'd': has_ins = True found_ins = True #print('\n') is_split_probe = False # have to check if coordinates in other species are continuous for i in range(1, len(last_50_lines) - 1): last_pos = last_50_lines[i - 1][idx_species].split(':')[1] cur_pos = last_50_lines[i][idx_species].split(':')[1] if last_pos != 'd' and cur_pos != 'd': if last_pos.isdigit() and cur_pos.isdigit(): last_pos = int(last_pos) cur_pos = int(cur_pos) if cur_pos != last_pos + 1 and cur_pos != last_pos - 1: is_split_probe = True else: is_split_probe=True start_sp = last_50_lines[0][idx_species] end_sp = last_50_lines[-1][idx_species] end_pos = end_sp.split(':')[1] start_pos = start_sp.split(':')[1] if end_pos < start_pos: start_sp, end_sp = end_sp, start_sp output_file.write(str(start_sp + 1) + "\t" + str(end_sp + 1) + "\t" + str(has_ins) + '\t' + str( is_split_probe) + '\t') else: output_file.write('-\t-\t-\t-\t') if found_ins: count_insertion_issues += 1 output_file.write('\n') count_probes_done += 1 print(count_probes_done, '/', len(end_probes.keys())) if count_probes_done == len(end_probes.keys()): break output_file.close() print(count_insertion_issues, " probes have insertions in other species not accounted for.") if __name__ == '__main__': parser = argparse.ArgumentParser(description='''Adds in coordinates of methylation probes in other species, because initial file submitted to Bret did not contain this.''') parser.add_argument('-p', '--probesFile', required=True, dest='probes_file', help='File with probes that was submitted to Bret') parser.add_argument('-m', '--MAF_seq_file', required=True, dest='maf_seq_file', help='File containing sequence info extracted from MAF files, containing all positions in ' 'each species.') parser.add_argument('-o', '--outputFile', required=True, dest='output_file', help='Output file to write the probe information to.') args = parser.parse_args() add_species_coords(args)