#!/usr/bin/env python import os import sys import pybedtools import argparse import pysam import logging from logging import debug, info, warn, error from copy import copy def update_if(d, f, key, value): if not key in d: d[key] = value return None old_value = d[key] should_update = f(old_value, value) if not should_update: return value d[key] = value return old_value def is_upstream(e1, e2): iv1 = e1 iv2 = e2 assert iv1.chrom == iv2.chrom assert iv1.strand == iv2.strand if iv1.strand == "+": return iv1.end < iv2.end else: return iv1.start > iv2.start def extended_exons(features, extension_5p, extension_3p, extend_all_exons): def extend_iv(iv): res = copy(iv) if res.strand == "+": res.end += extension_3p res.start = max(res.start - extension_5p, 0) elif res.strand == "-": res.end += extension_5p res.start = max(res.start - extension_3p, 0) return res last_exons = {} def update_last_exon(exon): key = (exon.attrs["transcript_id"], exon.chrom, exon.strand) return update_if(last_exons, is_upstream, key, exon) for feature in features: if feature.fields[2] == "exon": iv = feature if extend_all_exons: yield extend_iv(iv) else: feature_to_push = update_last_exon(feature) if feature_to_push: yield feature_to_push for exon in last_exons.values(): yield extend_iv(exon) def main(): logging.basicConfig( level=logging.INFO, format="%(asctime)-15s - %(levelname)s: %(message)s" ) argparser = make_argparser() args = argparser.parse_args() extension_3p = args.extension_3p extension_5p = args.extension_5p if not args.stats_only and not args.output_file: error("Please either provide output file or --stats-only flag") sys.exit(1) info("Finding exonic multimappers...") xgtf = pybedtools.BedTool(extended_exons( pybedtools.BedTool(args.gtf_file), extension_5p=extension_5p, extension_3p=extension_3p, extend_all_exons=args.extend_all_exons)) bamTool = pybedtools.BedTool(args.input_file) tocheck = {} exonicBam = bamTool.intersect(xgtf, split=True, s=True, nonamecheck=True) try: # using HTSeq here because of genomic intervals for alignment in exonicBam: is_multimapper = not "NH:i:1" in alignment.fields[9:] if not is_multimapper: continue qname = alignment.fields[0] if not qname in tocheck: tocheck[qname] = 0 tocheck[qname] += 1 finally: os.unlink(exonicBam.fn) info("Determining which multimappers are not real...") tofix = set() for (qname, count) in tocheck.items(): if count == 1: tofix.add(qname) if args.counts_out: with open(args.counts_out, "w") as counts_out: for (qname, count) in tocheck.items(): counts_out.write("%s\t%s\n" % (qname, count)) info("Number of exonic mutlimappers: %d", len(tocheck)) info("Number of fixable mutlimappers: %d", len(tofix)) if args.stats_only: return in_samfile = pysam.Samfile(args.input_file, "rb") out_samfile = pysam.Samfile(args.output_file, "wb", template=in_samfile) for read in in_samfile: if read.qname in tofix: read.tags = [("NH", 1), ("HI", 1)] + \ [tag for tag in read.tags if not tag[0] in ["NH", "HI"]] read.mapq = args.new_aqual out_samfile.write(read) in_samfile.close() out_samfile.close() def make_argparser(): parser = argparse.ArgumentParser(description="Fix NH/HI flags for potentially exonic multimappers") parser.add_argument( '--ext-3p', dest="extension_3p", metavar="EXTENSION", type=int, default=10000, help="how far extend 3' exon ends (default = 10000)") parser.add_argument( '--ext-5p', dest="extension_5p", metavar="EXTENSION", default=0, type=int, help="how far extend 5' exon ends (defaults = 0)") parser.add_argument( '--aqual', dest="new_aqual", metavar="QUALITY", default=30, type=int, help="new alignment quality to set for fixed reads (default = 30)") parser.add_argument( '--all-exons', dest="extend_all_exons", default=False, action='store_true', help="Extend all exons of a transcript, not just the last one") parser.add_argument( '--only-last', dest="extend_all_exons", default=False, action='store_false', help="Extend only the last exon of a transcript (default)") parser.add_argument( '-g', '--annotation', dest="gtf_file", metavar="FILE", required=True, help="gtf/gff/... file with genome annotation") parser.add_argument( '--stats-only', dest="stats_only", default=False, action='store_true', help="Just show multimappers statistics, do not write the output bam") parser.add_argument( '--counts-out', dest="counts_out", metavar="FILE", help="file to print multimappers exonic coutns") parser.add_argument( '-o', '--out', dest="output_file", metavar="BAM", help="file to write fixed alignments") parser.add_argument( 'input_file', metavar="INPUT_BAM", help="file with alignments to fix") return parser if __name__ == '__main__': main()