https://github.com/ctlab/quant3p
Raw File
Tip revision: be9977925e9e842cc755f14ced72bbee5c5d6d77 authored by Alexey Sergushichev on 02 December 2019, 17:22 UTC
fixes for htseq & macs2 upgrades
Tip revision: be99779
fixmm.py
#!/usr/bin/env python

import os
import sys
import pybedtools
import argparse
import pysam
import logging
from logging import debug, info, warn, error
from copy import copy


def update_if(d, f, key, value):
    if not key in d:
        d[key] = value
        return None
    old_value = d[key]
    should_update = f(old_value, value)
    if not should_update:
        return value
    d[key] = value
    return old_value

def is_upstream(e1, e2):
    iv1 = e1
    iv2 = e2
    assert iv1.chrom == iv2.chrom
    assert iv1.strand == iv2.strand


    if iv1.strand == "+":
        return iv1.end < iv2.end
    else:
        return iv1.start > iv2.start

def extended_exons(features, extension_5p, extension_3p, extend_all_exons):
    def extend_iv(iv):
        res = copy(iv)
        if res.strand == "+":
            res.end += extension_3p
            res.start = max(res.start - extension_5p, 0)
        elif res.strand == "-":
            res.end += extension_5p
            res.start = max(res.start - extension_3p, 0)
        return res

    last_exons = {}

    def update_last_exon(exon):
        key = (exon.attrs["transcript_id"], exon.chrom, exon.strand)
        return update_if(last_exons, is_upstream, key, exon)

    for feature in features:
        if feature.fields[2] == "exon":
            iv = feature

            if extend_all_exons:
                yield extend_iv(iv)
            else:
                feature_to_push = update_last_exon(feature)
                if feature_to_push:
                    yield feature_to_push

    for exon in last_exons.values():
        yield extend_iv(exon)

def main():
    logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)-15s - %(levelname)s: %(message)s"
            )
    argparser = make_argparser()
    args = argparser.parse_args()

    extension_3p = args.extension_3p
    extension_5p = args.extension_5p

    if not args.stats_only and not args.output_file:
        error("Please either provide output file or --stats-only flag")
        sys.exit(1)

    info("Finding exonic multimappers...")

    xgtf = pybedtools.BedTool(extended_exons(
            pybedtools.BedTool(args.gtf_file),
            extension_5p=extension_5p,
            extension_3p=extension_3p,
            extend_all_exons=args.extend_all_exons))


    bamTool = pybedtools.BedTool(args.input_file)

    tocheck = {}

    exonicBam = bamTool.intersect(xgtf, split=True, s=True, nonamecheck=True)

    try:
        # using HTSeq here because of genomic intervals
        for alignment in exonicBam:
            is_multimapper = not "NH:i:1" in alignment.fields[9:]
            if not is_multimapper:
                continue

            qname = alignment.fields[0]
            if not qname in tocheck:
                tocheck[qname] = 0
            tocheck[qname] += 1
    finally:
        os.unlink(exonicBam.fn)

    info("Determining which multimappers are not real...")
    tofix = set()
    for (qname, count) in tocheck.items():
        if count == 1:
            tofix.add(qname)

    if args.counts_out:
        with open(args.counts_out, "w") as counts_out:
            for (qname, count) in tocheck.items():
                counts_out.write("%s\t%s\n" % (qname, count))

    info("Number of exonic mutlimappers: %d", len(tocheck))
    info("Number of fixable mutlimappers: %d", len(tofix))

    if args.stats_only:
        return

    in_samfile = pysam.Samfile(args.input_file, "rb")
    out_samfile = pysam.Samfile(args.output_file, "wb", template=in_samfile)

    for read in in_samfile:
        if read.qname in tofix:
            read.tags = [("NH", 1), ("HI", 1)] + \
                    [tag for tag in read.tags if not tag[0] in ["NH", "HI"]]
            read.mapq = args.new_aqual
        out_samfile.write(read)

    in_samfile.close()
    out_samfile.close()


def make_argparser():
    parser = argparse.ArgumentParser(description="Fix NH/HI flags for potentially exonic multimappers")

    parser.add_argument(
            '--ext-3p',
            dest="extension_3p",
            metavar="EXTENSION",
            type=int,
            default=10000,
            help="how far extend 3' exon ends (default = 10000)")

    parser.add_argument(
            '--ext-5p',
            dest="extension_5p",
            metavar="EXTENSION",
            default=0,
            type=int,
            help="how far extend 5' exon ends (defaults = 0)")

    parser.add_argument(
            '--aqual',
            dest="new_aqual",
            metavar="QUALITY",
            default=30,
            type=int,
            help="new alignment quality to set for fixed reads (default = 30)")

    parser.add_argument(
            '--all-exons',
            dest="extend_all_exons",
            default=False,
            action='store_true',
            help="Extend all exons of a transcript, not just the last one")

    parser.add_argument(
            '--only-last',
            dest="extend_all_exons",
            default=False,
            action='store_false',
            help="Extend only the last exon of a transcript (default)")

    parser.add_argument(
            '-g', '--annotation',
            dest="gtf_file",
            metavar="FILE",
            required=True,
            help="gtf/gff/... file with genome annotation")

    parser.add_argument(
            '--stats-only',
            dest="stats_only",
            default=False,
            action='store_true',
            help="Just show multimappers statistics, do not write the output bam")

    parser.add_argument(
            '--counts-out',
            dest="counts_out",
            metavar="FILE",
            help="file to print multimappers exonic coutns")

    parser.add_argument(
            '-o', '--out',
            dest="output_file",
            metavar="BAM",
            help="file to write fixed alignments")

    parser.add_argument(
            'input_file',
            metavar="INPUT_BAM",
            help="file with alignments to fix")
    return parser


if __name__ == '__main__':
    main()
back to top