Skip to main content
  • Home
  • Development
  • Documentation
  • Donate
  • Operational login
  • Browse the archive

swh logo
SoftwareHeritage
Software
Heritage
Archive
Features
  • Search

  • Downloads

  • Save code now

  • Add forge now

  • Help

https://github.com/ctlab/quant3p
25 December 2019, 16:37:49 UTC
  • Code
  • Branches (2)
  • Releases (0)
  • Visits
    • Branches
    • Releases
    • HEAD
    • refs/heads/bedtools
    • refs/heads/master
    No releases to show
  • 34f2683
  • /
  • quant3p
  • /
  • fixmm.py
Raw File Download
Take a new snapshot of a software origin

If the archived software origin currently browsed is not synchronized with its upstream version (for instance when new commits have been issued), you can explicitly request Software Heritage to take a new snapshot of it.

Use the form below to proceed. Once a request has been submitted and accepted, it will be processed as soon as possible. You can then check its processing state by visiting this dedicated page.
swh spinner

Processing "take a new snapshot" request ...

To reference or cite the objects present in the Software Heritage archive, permalinks based on SoftWare Hash IDentifiers (SWHIDs) must be used.
Select below a type of object currently browsed in order to display its associated SWHID and permalink.

  • content
  • directory
  • revision
  • snapshot
origin badgecontent badge Iframe embedding
swh:1:cnt:ec0d321a6a2b20357c2c4b5ba69adf8d83069a44
origin badgedirectory badge Iframe embedding
swh:1:dir:32059bd57148a9c29e4ffcab0df82bcb22f5c059
origin badgerevision badge
swh:1:rev:be9977925e9e842cc755f14ced72bbee5c5d6d77
origin badgesnapshot badge
swh:1:snp:c3876493dcdf8da1d2957e6b67a7a1bacbc80fe5

This interface enables to generate software citations, provided that the root directory of browsed objects contains a citation.cff or codemeta.json file.
Select below a type of object currently browsed in order to generate citations for them.

  • content
  • directory
  • revision
  • snapshot
Generate software citation in BibTex format (requires biblatex-software package)
Generating citation ...
Generate software citation in BibTex format (requires biblatex-software package)
Generating citation ...
Generate software citation in BibTex format (requires biblatex-software package)
Generating citation ...
Generate software citation in BibTex format (requires biblatex-software package)
Generating citation ...
Tip revision: be9977925e9e842cc755f14ced72bbee5c5d6d77 authored by Alexey Sergushichev on 02 December 2019, 17:22:45 UTC
fixes for htseq & macs2 upgrades
Tip revision: be99779
fixmm.py
#!/usr/bin/env python

import os
import sys
import pybedtools
import argparse
import pysam
import logging
from logging import debug, info, warn, error
from copy import copy


def update_if(d, f, key, value):
    if not key in d:
        d[key] = value
        return None
    old_value = d[key]
    should_update = f(old_value, value)
    if not should_update:
        return value
    d[key] = value
    return old_value

def is_upstream(e1, e2):
    iv1 = e1
    iv2 = e2
    assert iv1.chrom == iv2.chrom
    assert iv1.strand == iv2.strand


    if iv1.strand == "+":
        return iv1.end < iv2.end
    else:
        return iv1.start > iv2.start

def extended_exons(features, extension_5p, extension_3p, extend_all_exons):
    def extend_iv(iv):
        res = copy(iv)
        if res.strand == "+":
            res.end += extension_3p
            res.start = max(res.start - extension_5p, 0)
        elif res.strand == "-":
            res.end += extension_5p
            res.start = max(res.start - extension_3p, 0)
        return res

    last_exons = {}

    def update_last_exon(exon):
        key = (exon.attrs["transcript_id"], exon.chrom, exon.strand)
        return update_if(last_exons, is_upstream, key, exon)

    for feature in features:
        if feature.fields[2] == "exon":
            iv = feature

            if extend_all_exons:
                yield extend_iv(iv)
            else:
                feature_to_push = update_last_exon(feature)
                if feature_to_push:
                    yield feature_to_push

    for exon in last_exons.values():
        yield extend_iv(exon)

def main():
    logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)-15s - %(levelname)s: %(message)s"
            )
    argparser = make_argparser()
    args = argparser.parse_args()

    extension_3p = args.extension_3p
    extension_5p = args.extension_5p

    if not args.stats_only and not args.output_file:
        error("Please either provide output file or --stats-only flag")
        sys.exit(1)

    info("Finding exonic multimappers...")

    xgtf = pybedtools.BedTool(extended_exons(
            pybedtools.BedTool(args.gtf_file),
            extension_5p=extension_5p,
            extension_3p=extension_3p,
            extend_all_exons=args.extend_all_exons))


    bamTool = pybedtools.BedTool(args.input_file)

    tocheck = {}

    exonicBam = bamTool.intersect(xgtf, split=True, s=True, nonamecheck=True)

    try:
        # using HTSeq here because of genomic intervals
        for alignment in exonicBam:
            is_multimapper = not "NH:i:1" in alignment.fields[9:]
            if not is_multimapper:
                continue

            qname = alignment.fields[0]
            if not qname in tocheck:
                tocheck[qname] = 0
            tocheck[qname] += 1
    finally:
        os.unlink(exonicBam.fn)

    info("Determining which multimappers are not real...")
    tofix = set()
    for (qname, count) in tocheck.items():
        if count == 1:
            tofix.add(qname)

    if args.counts_out:
        with open(args.counts_out, "w") as counts_out:
            for (qname, count) in tocheck.items():
                counts_out.write("%s\t%s\n" % (qname, count))

    info("Number of exonic mutlimappers: %d", len(tocheck))
    info("Number of fixable mutlimappers: %d", len(tofix))

    if args.stats_only:
        return

    in_samfile = pysam.Samfile(args.input_file, "rb")
    out_samfile = pysam.Samfile(args.output_file, "wb", template=in_samfile)

    for read in in_samfile:
        if read.qname in tofix:
            read.tags = [("NH", 1), ("HI", 1)] + \
                    [tag for tag in read.tags if not tag[0] in ["NH", "HI"]]
            read.mapq = args.new_aqual
        out_samfile.write(read)

    in_samfile.close()
    out_samfile.close()


def make_argparser():
    parser = argparse.ArgumentParser(description="Fix NH/HI flags for potentially exonic multimappers")

    parser.add_argument(
            '--ext-3p',
            dest="extension_3p",
            metavar="EXTENSION",
            type=int,
            default=10000,
            help="how far extend 3' exon ends (default = 10000)")

    parser.add_argument(
            '--ext-5p',
            dest="extension_5p",
            metavar="EXTENSION",
            default=0,
            type=int,
            help="how far extend 5' exon ends (defaults = 0)")

    parser.add_argument(
            '--aqual',
            dest="new_aqual",
            metavar="QUALITY",
            default=30,
            type=int,
            help="new alignment quality to set for fixed reads (default = 30)")

    parser.add_argument(
            '--all-exons',
            dest="extend_all_exons",
            default=False,
            action='store_true',
            help="Extend all exons of a transcript, not just the last one")

    parser.add_argument(
            '--only-last',
            dest="extend_all_exons",
            default=False,
            action='store_false',
            help="Extend only the last exon of a transcript (default)")

    parser.add_argument(
            '-g', '--annotation',
            dest="gtf_file",
            metavar="FILE",
            required=True,
            help="gtf/gff/... file with genome annotation")

    parser.add_argument(
            '--stats-only',
            dest="stats_only",
            default=False,
            action='store_true',
            help="Just show multimappers statistics, do not write the output bam")

    parser.add_argument(
            '--counts-out',
            dest="counts_out",
            metavar="FILE",
            help="file to print multimappers exonic coutns")

    parser.add_argument(
            '-o', '--out',
            dest="output_file",
            metavar="BAM",
            help="file to write fixed alignments")

    parser.add_argument(
            'input_file',
            metavar="INPUT_BAM",
            help="file with alignments to fix")
    return parser


if __name__ == '__main__':
    main()

back to top

Software Heritage — Copyright (C) 2015–2025, The Software Heritage developers. License: GNU AGPLv3+.
The source code of Software Heritage itself is available on our development forge.
The source code files archived by Software Heritage are available under their own copyright and licenses.
Terms of use: Archive access, API— Content policy— Contact— JavaScript license information— Web API