https://github.com/shorvath/MammalianMethylationConsortium
Raw File
Tip revision: 355098ec161ae668546a1a343e755c40a475b941 authored by Soo Bin Kwon on 11 January 2022, 19:27:21 UTC
Add scripts for generating Ernst lab annot csv
Tip revision: 355098e
pickInfProbes.py
# For each CG site this script picks the Infinium 2 probe with the most species covered and outputs how many
# species that probe covers. The output from this can be used to sort by number of species covered and look at the
# coverage of each species.
#
# This script assumes that each CG site has 4 probes in the input file (with Inf 1 and 2 results on the same line)
import sys
import gzip
from collections import defaultdict

def main():
    if len(sys.argv) != 4:
        print "Usage: python pickInfProbes.py <probes file> <output file> <infinium type 1/2>"
        exit(1)
    probesFile = gzip.open(sys.argv[1], 'r')
    oFile = gzip.open(sys.argv[2], 'w')
    infiniumType = int(sys.argv[3])

    CGSpeciesCount = defaultdict(int)
    probesPicked = defaultdict(str)
    for line in probesFile:
        splitLine = line.strip().split("\t")
        if (infiniumType == 2):
            if len(splitLine) > 14:
                species = splitLine[12].split(",")
                if len(species) > 1:
                    chr = "chr" + splitLine[2]
                    coord = int(splitLine[3])
                    SNVlocation = splitLine[13]
                    SNVchoice = splitLine[14]

                    if len(species) - 1 > CGSpeciesCount[(chr, coord)]:
                        CGSpeciesCount[(chr, coord)] = len(species) - 1
                        probesPicked[(chr, coord)] = chr + "\t" + str(coord) + "\t" + str(coord + 2) + "\t" + str(len(species) - 1) + "\t" + ",".join(species) + "\t" + SNVlocation + "\t" + SNVchoice + "\t" + splitLine[4] + "\t" + splitLine[5] + "\t" + splitLine[6] + "\t" + splitLine[7] + "\n"
        else:
            if len(splitLine) > 11:
                species = splitLine[9].split(",")
                if len(species) > 1:
                    chr = "chr" + splitLine[2]
                    coord = int(splitLine[3])
                    SNVlocation = splitLine[10]
                    SNVchoice = splitLine[11]

                    if len(species) - 1 > CGSpeciesCount[(chr, coord)]:
                        CGSpeciesCount[(chr, coord)] = len(species) - 1
                        probesPicked[(chr, coord)] = chr + "\t" + str(coord) + "\t" + str(coord + 2) + "\t" + str(len(species) - 1) + "\t" + ",".join(species) + "\t" + SNVlocation + "\t" + SNVchoice + "\t" + splitLine[4] + "\t" + splitLine[5] + "\t" + splitLine[6] + "\t" + splitLine[7] + "\n"

    for site in probesPicked:
        oFile.write(probesPicked[site])
    oFile.close()
    probesFile.close()

main()
back to top