https://github.com/fenderglass/Ragout
Raw File
Tip revision: a68b9dba9aa7570c0a8e3f579b662524ee919e2b authored by Mikhail Kolmogorov on 27 July 2018, 01:48:58 UTC
bioconda badge
Tip revision: a68b9db
chimera_detector.py
#(c) 2013-2014 by Authors
#This file is a part of Ragout program.
#Released under the BSD license (see LICENSE file)

"""
This module tries to detect missassembled adjacencies
in input sequences and breaks them if neccesary
"""

from __future__ import print_function
import logging
from collections import defaultdict, namedtuple
from copy import copy, deepcopy

from ragout.breakpoint_graph.breakpoint_graph import BreakpointGraph

logger = logging.getLogger()
ContigBreak = namedtuple("ContigBreak", ["seq_name", "begin", "end", "good"])

class ChimeraDetector(object):
    def __init__(self, breakpoint_graphs, run_stages, target_seqs):
        logger.info("Detecting chimeric adjacencies")
        self.bp_graphs = breakpoint_graphs
        self.run_stages = run_stages
        self.target_seqs = target_seqs
        self._make_hierarchical_breaks()

    def _make_hierarchical_breaks(self):
        """
        Determines where and at what synteny blocks scale to break each contig
        """
        seq_cuts = defaultdict(lambda : defaultdict(list))

        #extracting and grouping by sequence
        for stage in self.run_stages:
            logger.debug(">>With block size: {0}".format(stage.block_size))
            breaks = self._get_contig_breaks(self.bp_graphs[stage])
            for br in breaks:
                seq_cuts[br.seq_name][stage].append(br)

        hierarchical_cuts = defaultdict(lambda : defaultdict(list))
        for seq_name in seq_cuts:
            for i in xrange(len(self.run_stages)):
                top_stage = self.run_stages[i]

                for top_break in seq_cuts[seq_name][top_stage]:
                    if top_break.good:
                        continue

                    adjusted_break = (top_break.begin, top_break.end)
                    for j in xrange(i + 1, len(self.run_stages)):
                        lower_stage = self.run_stages[j]
                        #check if there is overlapping cut
                        for lower_break in seq_cuts[seq_name][lower_stage]:
                            ovlp_left = max(adjusted_break[0], lower_break.begin)
                            ovlp_right = min(adjusted_break[1], lower_break.end)
                            #if so, update current and go down
                            if ovlp_right >= ovlp_left:
                                adjusted_break = (ovlp_left, ovlp_right)
                                break

                    break_pos = self._optimal_break(seq_name, *adjusted_break)
                    hierarchical_cuts[seq_name][top_stage].append(break_pos)
        self.hierarchical_cuts = hierarchical_cuts

    def _optimal_break(self, seq_name, break_start, break_end):
        """
        Finds the longet run of Ns within given range
        """
        seq = self.target_seqs[seq_name]
        cur_pos = break_start
        cur_len = 0
        max_pos = break_start
        max_len = 0
        for i in xrange(break_start, break_end):
            if seq[i].upper() == "N":
                cur_len += 1
            if seq[i].upper() != "N" or i == break_end - 1:
                if max_len < cur_len:
                    max_len = cur_len
                    max_pos = cur_pos
                cur_pos = i + 1
                cur_len = 0
        return max_pos + max_len / 2

    def _get_contig_breaks(self, bp_graph):
        """
        Detects chimeric adjacencies
        """
        seq_cuts = []

        subgraphs = bp_graph.connected_components()
        num_target_adj = 0
        num_unique_adj = 0
        num_removed_adj = 0
        for subgr in subgraphs:
            if len(subgr.bp_graph) > 100:
                logger.debug("Processing component of size {0}"
                             .format(len(subgr.bp_graph)))

            for (u, v, data) in subgr.bp_graph.edges_iter(data=True):
                if data["genome_id"] != subgr.target:
                    continue

                num_target_adj += 1
                genomes = set(subgr.genomes_support(u, v))
                if (genomes != set([bp_graph.target])
                    and not subgr.is_infinity(u, v)):
                    continue

                num_unique_adj += 1
                seq_name, start, end = data["chr_name"], data["start"], data["end"]
                if subgr.alternating_cycle(u, v) is not None:
                    seq_cuts.append(ContigBreak(seq_name, start, end, True))
                    continue

                seq_cuts.append(ContigBreak(seq_name, start, end, False))
                num_removed_adj += 1

        #bp_graph.debug_output()
        logger.debug("Checking {0} target adjacencies".format(num_target_adj))
        logger.debug("Found {0} target-specific adjacencies, "
                     "{1} broken as chimeric".format(num_unique_adj,
                                                     num_removed_adj))
        return seq_cuts

    def _valid_2break(self, bp_graph, red_edge):
        """
        Checks if there is a valid 2-break through the given red edge
        """
        assert len(bp_graph) == 4
        red_1, red_2 = red_edge
        cand_1, cand_2 = tuple(set(bp_graph.nodes()) - set(red_edge))
        if abs(cand_1) == abs(cand_2):
            return False

        if bp_graph.has_edge(red_1, cand_1):
            if not bp_graph.has_edge(red_2, cand_2):
                return False
            known_1 = red_1, cand_1
            known_2 = red_2, cand_2
        elif bp_graph.has_edge(red_1, cand_2):
            if not bp_graph.has_edge(red_2, cand_1):
                return False
            known_1 = red_1, cand_2
            known_2 = red_2, cand_1
        else:
            return False

        chr_1 = {}
        for data in bp_graph[known_1[0]][known_1[1]].values():
            chr_1[data["genome_id"]] = data["chr_name"]
        chr_2 = {}
        for data in bp_graph[known_2[0]][known_2[1]].values():
            chr_2[data["genome_id"]] = data["chr_name"]
        common_genomes = set(chr_1.keys()).intersection(chr_2.keys())
        for genome in common_genomes:
            if chr_1[genome] != chr_2[genome]:
                return False

        return True

    def break_contigs(self, perm_container, block_sizes):
        """
        Breaks contigs in inferred cut positions
        """
        logger.info("Removing chimeric adjacencies")

        new_container = deepcopy(perm_container)
        new_target_perms = []
        num_breaks = 0
        num_chimeras = 0

        for perm in new_container.target_perms:
            break_points = []
            for size in block_sizes:
                break_points.extend(self.hierarchical_cuts[perm.chr_name][size])
            break_points = list(set(break_points))
            num_breaks += len(break_points)
            if not break_points:
                new_target_perms.append(perm)
            else:
                num_chimeras += 1
                new_target_perms.extend(_break_permutation(perm, break_points))

        logger.debug("Chimera Detector: {0} cuts made in {1} sequences"
                        .format(num_breaks, num_chimeras))
        new_container.target_perms = new_target_perms
        return new_container


def _break_permutation(permutation, break_points):
    broken_perms = []

    cuts_stack = copy(sorted(break_points))
    cuts_stack.append(permutation.seq_len)
    current_perm = deepcopy(permutation)
    current_perm.blocks = []
    shift = 0

    for block in permutation.blocks:
        if block.end <= cuts_stack[0]:
            block.start -= shift
            block.end -= shift
            current_perm.blocks.append(block)
            continue

        if block.start < cuts_stack[0]:
            block.start = cuts_stack[0]

        #we have passed the current cut
        current_perm.seq_start = shift
        current_perm.seq_end = cuts_stack[0]
        if current_perm.blocks:
            broken_perms.append(current_perm)

        shift = cuts_stack[0]
        cuts_stack.pop(0)

        current_perm = deepcopy(permutation)
        block.start -= shift
        block.end -= shift
        current_perm.blocks = [block]

    current_perm.seq_start = shift
    current_perm.seq_end = cuts_stack[0]
    if current_perm.blocks:
        broken_perms.append(current_perm)

    return broken_perms
back to top