https://github.com/shorvath/MammalianMethylationConsortium
Raw File
Tip revision: 355098ec161ae668546a1a343e755c40a475b941 authored by Soo Bin Kwon on 11 January 2022, 19:27:21 UTC
Add scripts for generating Ernst lab annot csv
Tip revision: 355098e
addSpeciesCoordsToProbes.py
import gzip
import argparse
from collections import deque

def add_species_coords(main_args):
    probes_file = gzip.open(main_args.probes_file, 'rt')
    maf_seq_file = gzip.open(main_args.maf_seq_file, 'rt')
    output_file = gzip.open(main_args.output_file, 'wt')

    isMammal = {"hg19": 1,"panTro4": 1,"gorGor3": 1,"ponAbe2": 1,"nomLeu3": 1,"rheMac3": 1,"macFas5": 1,"papHam1": 1,"chlSab1": 1,"calJac3": 1,"saiBol1": 1,"otoGar3": 1,"tupChi1": 1,"speTri2": 1,"jacJac1": 1,"micOch1": 1,"criGri1": 1,"mesAur1": 1,"mm10": 1,"rn5": 1,"hetGla2": 1,"cavPor3": 1,"chiLan1": 1,"octDeg1": 1,"oryCun2": 1,"ochPri3": 1,"susScr3": 1,"vicPac2": 1,"camFer1": 1,"turTru2": 1,"orcOrc1": 1,"panHod1": 1,"bosTau7": 1,"oviAri3": 1,"capHir1": 1,"equCab2": 1,"cerSim1": 1,"felCat5": 1,"canFam3": 1,"musFur1": 1,"ailMel1": 1,"odoRosDiv1": 1,"lepWed1": 1,"pteAle1": 1,"pteVam1": 1,"myoDav1": 1,"myoLuc2": 1,"eptFus1": 1,"eriEur2": 1,"sorAra2": 1,"conCri1": 1,"loxAfr3": 1,"eleEdw1": 1,"triMan1": 1,"chrAsi1": 1,"echTel2": 1,"oryAfe1": 1,"dasNov3": 1,"monDom5": 1,"sarHar1": 1,"macEug2": 1,"ornAna1": 1, "falChe1": 0,"falPer1": 0,"ficAlb2": 0,"zonAlb1": 0,"geoFor1": 0,"taeGut2": 0,"pseHum1": 0,"melUnd1": 0,"amaVit1": 0,"araMac1": 0,"colLiv1": 0,"anaPla1": 0,"galGal4": 0,"melGal1": 0,"allMis1": 0,"cheMyd1": 0,"chrPic1": 0,"pelSin1": 0,"apaSpi1": 0,"anoCar2": 0,"xenTro7": 0,"latCha1": 0,"tetNig2": 0,"fr3": 0,"takFla1": 0,"oreNil2": 0,"neoBri1": 0,"hapBur1": 0,"mayZeb1": 0,"punNye1": 0,"oryLat2": 0,"xipMac1": 0,"gasAcu1": 0,"gadMor1": 0,"danRer7": 0,"astMex1": 0,"lepOcu1": 0,"petMar2": 0}

    end_ID = {}
    end_probes = {}
    species_dict = {}
    first_line = True
    for line in probes_file:
        split_line = line.strip().split()
        probe_ID = split_line[0]
        end_probes[probe_ID] = int(split_line[26])
        end_ID[int(split_line[26])] = probe_ID
        species_dict[probe_ID] = split_line[24].split(',')

    header_line = maf_seq_file.readline().strip().split(',')
    output_file.write('probeID\thg19_start\thg19end\t')
    mammal_list = []
    for i in header_line:
        species = i.strip().split('_')[1]
        if species not in mammal_list:
            if isMammal[species] and species != 'hg19':
                output_file.write(species + '_start\t' + species + '_end\t' + species + '_has_insertion\t' + species + \
                                  '_is_split\t')
                mammal_list.append(species)
    output_file.write('\n')

    count_insertion_issues = 0
    count_probes_done = 0

    pos_reference = header_line.index('pos_hg19')
    last_50_lines = deque()
    for line in maf_seq_file:
        split_line = line.strip().split(',')
        last_50_lines.append(split_line)
        pos_hg19 = split_line[pos_reference].split(':')[1]
        #print(pos_hg19)

        if pos_hg19 == 'd': # if deletion at current position in reference continue
            continue
        end_human = int(last_50_lines[-1][pos_reference].split(':')[1])
        start_human = int(last_50_lines[0][pos_reference].split(':')[1])

        while (end_human - start_human >= 50):
            last_50_lines.popleft()
            while last_50_lines[0][pos_reference].split(':')[1] == 'd':
                last_50_lines.popleft()
            end_human = int(last_50_lines[-1][pos_reference].split(':')[1])
            start_human = int(last_50_lines[0][pos_reference].split(':')[1])
        pos_hg19 = int(pos_hg19)

        if pos_hg19 in end_ID:
            #for prev_line in last_50_lines: # have to make sure going over last 50 lines uninterrupted by deletion
            # in human
            #    print(prev_line[pos_reference + 2], sep='', end = '')
            #print('\n')
            cur_probe = end_ID[pos_hg19]
            output_file.write(cur_probe + '\t' + last_50_lines[0][pos_reference] + '\t' + last_50_lines[-1][
                pos_reference] + '\t')

            found_ins = False
            for sp in mammal_list:
                if sp in species_dict[cur_probe]:
                    idx_species = header_line.index('pos_' + sp)
                    strand_species = header_line.index('strand_' + sp)
                    #print(sp)
                    #print(idx_species)
                    has_ins = False
                    for prev_line in last_50_lines: # have to make sure going over last 50 lines uninterrupted by deletion in human
                    #    print(prev_line[idx_species + 2], sep='', end = '')
                        if prev_line[pos_reference] == 'd' and prev_line[idx_species] != 'd':
                            has_ins = True
                            found_ins = True
                    #print('\n')
                    is_split_probe = False # have to check if coordinates in other species are continuous
                    for i in range(1, len(last_50_lines) - 1):
                        last_pos = last_50_lines[i - 1][idx_species].split(':')[1]
                        cur_pos = last_50_lines[i][idx_species].split(':')[1]
                        if last_pos != 'd' and cur_pos != 'd':
                            if last_pos.isdigit() and cur_pos.isdigit():
                                last_pos = int(last_pos)
                                cur_pos = int(cur_pos)
                                if cur_pos != last_pos + 1 and cur_pos != last_pos - 1:
                                    is_split_probe = True
                        else:
                            is_split_probe=True
                    start_sp = last_50_lines[0][idx_species]
                    end_sp = last_50_lines[-1][idx_species]
                    end_pos = end_sp.split(':')[1]
                    start_pos = start_sp.split(':')[1]
                    if end_pos < start_pos:
                        start_sp, end_sp = end_sp, start_sp
                    output_file.write(str(start_sp + 1) + "\t" + str(end_sp + 1) + "\t" + str(has_ins) + '\t' + str(
                        is_split_probe) + '\t')
                else:
                    output_file.write('-\t-\t-\t-\t')
            if found_ins:
                count_insertion_issues += 1
            output_file.write('\n')
            count_probes_done += 1
            print(count_probes_done, '/', len(end_probes.keys()))
            if count_probes_done == len(end_probes.keys()):
                break

    output_file.close()
    print(count_insertion_issues, " probes have insertions in other species not accounted for.")

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='''Adds in coordinates of methylation probes in other species, 
    because initial file submitted to Bret did not contain this.''')

    parser.add_argument('-p', '--probesFile', required=True, dest='probes_file',
                        help='File with probes that was submitted to Bret')
    parser.add_argument('-m', '--MAF_seq_file', required=True, dest='maf_seq_file',
                        help='File containing sequence info extracted from MAF files, containing all positions in '
                             'each species.')
    parser.add_argument('-o', '--outputFile', required=True, dest='output_file',
                        help='Output file to write the probe information to.')

    args = parser.parse_args()

    add_species_coords(args)
back to top