https://github.com/ukirik/giggle
Raw File
Tip revision: 49e0a5da1dd7e9618a94b6e197046e6281b2b0f1 authored by Ufuk Kirik on 14 December 2016, 14:57:17 UTC
Tip revision: 49e0a5d
haplod.py
from itertools import islice, accumulate
from operator import itemgetter
from collections import defaultdict
from stranslator import STranslator
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib
import os, argparse, gzip, csv, pprint

""" Command line parsers """
parser = argparse.ArgumentParser()
parser.add_argument("folders", nargs='+', help="output of IgDiscover runs")
parser.add_argument("-o", "--out", help="directory where the files should be output")
parser.add_argument("-g", "--germline-threshold", type=float, default=0.875, help="Threshold value for considering an allele")
parser.add_argument("-d", "--dcoverage", type=float, default=35, help="Threshold value for D-coverage")
parser.add_argument("-s", "--stranslator", help="reference file to translate s-codes")
args = parser.parse_args()

""" Constants: update indices as necessary """
DELIM = "\t"
CIND  = 0
VIND  = 1
DIND  = 2
JIND  = 3
VERR  = 20
DCOV  = 8
PLTDIR = "haplod"
VFASTA = os.path.join("final","database","human_V.fasta")   
FILTAB = os.path.join("final","filtered.tab.gz")

def check_ncol(files):

    print("Checking %d files for consistency in number of columns..." % len(files))
    ncols = []
    for infile in files:
        with gzip.open(infile,'rt') as f:
            reader = csv.reader(f, delimiter=DELIM)
            row = next(reader)
            ncols.append(len(row))
            if len(ncols) > 1 and ncols[-1] != ncols[-2]:
                sys.exit("Number of columns don't match!: " + str(ncols))

    print("Consistent number of columns found {0}".format(ncols))


def col_headers(infile):
    with gzip.open(infile,'rt') as f:
        reader = csv.reader(f, delimiter=DELIM)
        headers = next(reader)
        print("Column headers for the first file:")
        for i in range(0,len(headers)):
            print("[{0}]: {1}".format(i, headers[i])) 

def getVcandidates(fasta):
    all_vs = []
    het_vs = []
    print("reading fasta file: {0}".format(fasta))
    with open(fasta, 'r') as f:
        counter = 0
        for line in f:
            if line.startswith('>'):
                counter += 1
                genecall, *rest = line.split()
                genecall = genecall.replace('>','')
                gene, allele = genecall.split('*')
                if gene in all_vs:
                    het_vs.append(gene)
                all_vs.append(gene)
                
    print("{0} entries read".format(counter))
    return all_vs, het_vs

def sortkey(genestr):
    noallele = genestr.split('*')[0]
    gfamily, gorder, *crap = noallele.split('-')
    LAST = 99
    if gorder.endswith('D'):
        return (LAST, gfamily)
    else:
        return (int(gorder), gfamily)

def hetVD(tabfile, hetv):
    data = {}
    haplo_vd = defaultdict(lambda: defaultdict(int))

    if args.stranslator is not None:
        s = STranslator(args.stranslator)

    with gzip.open(tabfile, 'rt') as fh:
        reader = csv.reader(fh, delimiter =DELIM)
        header = next(reader)
        for row in reader:
            if not row[DIND] or 'OR' in row[DIND]:  # filter out missing or non-functional D-assignments
                continue
            if float(row[DCOV]) < args.dcoverage:   # filter out D-assignments with little coverage
                continue
            if float(row[VERR]) > 0:                # filter out ambigious V-assignments
                continue
            
            vass = row[VIND] if args.stranslator is None else s.get(row[VIND])
            vgene = vass.split('*')[0]
            dgene = row[DIND].split('*')[0]

            # all V-candidate D pairings
            if vgene in hetv:
                haplo_vd[vass][dgene] += 1

        # filter out uncertain V-alleles 
        for candidate in hetv:
            filterAlleles(haplo_vd, candidate)

        for key,val in haplo_vd.items():
            gene = key.split('*')[0]
            if gene not in data:
                data[gene] = {key: val}
            else:
                data[gene][key] = val

        return haplo_vd, data           

def filterAlleles(haplo_vd, gene):
    temp = {k: sum(v.values()) for k, v in haplo_vd.items() if gene in k}
    sorted_temp = sorted(temp.items(), key = itemgetter(1), reverse=True)

    values = [v for k,v in sorted_temp]
    cumsum = list(accumulate(values))
    scaled = [v / sum(values) for v in cumsum]
    alleles = []
    
    for i in range(0,len(scaled)):
        if scaled[i] > args.germline_threshold:
            j = 0 if i == 0 else i+1    # if gene is not heterozygous remove remaining single allele as well 
            miscalls = sorted_temp[j:]
            for call,count in miscalls:
                del haplo_vd[call]
            return

def plotGene(gene, haplodata, pdf):
    df = pd.DataFrame.from_dict(haplodata, orient="index")
    df_scaled = df.div(df.sum(axis=1), axis=0)
    df_sorted = pd.DataFrame(df_scaled, columns = sorted(df_scaled.columns, reverse=True, key = lambda s : sortkey(s)))
    
    ax = df_sorted.T.plot.barh(figsize=(8.27, 11.69))
    ax.set(xlabel="Normalized frequency")
    plt.title("IGHD combination counts per {0} allele".format(gene))
    plt.savefig(pdf, bbox_inches='tight', format='pdf')


def main():

    print(args.folders)

    # Plot style
    matplotlib.style.use("ggplot")
    
    for i in range(len(args.folders)):
        folder = args.folders[i]

        # Step1: get candidates for heterozygous V-genes
        fastaf = os.path.join(folder, VFASTA)
        allv, hetv = getVcandidates(fastaf)
        print("potentially heterozygous V genes: {0}".format(hetv))
        input("Paused... Press <Enter> to continue... ")

        tabfile = os.path.join(folder, FILTAB)

        # Step3: get VD combination counts
        haplo_vd, alldata = hetVD(tabfile, hetv)
        df = pd.DataFrame.from_dict(haplo_vd, orient="index")
        df_scaled = df.div(df.sum(axis=1), axis=0)
        df_sorted = pd.DataFrame(df_scaled, columns = sorted(df_scaled.columns, reverse=True, key = lambda s : sortkey(s)))
        #print(df_sorted.T)
        #input("Paused... Press <Enter> to conitnue...")
        
        # Check and create folders if necessary
        outdir = PLTDIR
        if args.out is not None:
            if not os.path.exists(args.out):
                os.mkdir(args.out)
            outdir = os.path.join(args.out, PLTDIR)
        
        if not os.path.exists(outdir):
            os.mkdir(outdir)

        filename = input("Please enter a name for results of this analysis: ")
        while not filename.strip():
            filename = input("Please enter a VALID name for the resultant file: ")


        outfile = os.path.join(outdir, filename+".pdf")
        with PdfPages(outfile) as pdf:
            for gene in sorted(alldata.keys()):
                plotGene(gene, alldata[gene] , pdf)


if __name__ == "__main__":
    main()
back to top