https://github.com/fenderglass/Ragout
Raw File
Tip revision: 4a0ab26826fa4f123c7a8ecabb71b030efa44cf0 authored by Mikhail Kolmogorov on 22 April 2023, 16:31:58 UTC
Update README.md
Tip revision: 4a0ab26
scaffolder.py
#(c) 2013-2014 by Authors
#This file is a part of Ragout program.
#Released under the BSD license (see LICENSE file)

"""
This module assembles contigs into scaffolds with respect
to given adjacencies. Also, it outputs scaffolds in different
formats
"""

from __future__ import absolute_import
from __future__ import division
from collections import defaultdict
import os
import logging

from ragout.shared.debug import DebugConfig
from ragout.shared.datatypes import (Contig, Scaffold, Link,
                                     output_scaffolds_premutations,
                                     output_permutations)
from ragout.scaffolder.output_generator import output_links
from ragout.breakpoint_graph.inferer import Adjacency
from ragout.six.moves import range, zip


logger = logging.getLogger()
debugger = DebugConfig.get_instance()


def build_scaffolds(adjacencies, perm_container, debug_output,
                    correct_distances):
    """
    Assembles scaffolds wrt to inferred adjacencies
    """
    if debug_output:
        logger.debug("Building scaffolds")
    contigs, contig_index = _make_contigs(perm_container)
    scaffolds = _extend_scaffolds(adjacencies, contigs, contig_index,
                                  correct_distances)
    num_contigs = sum([len(s.contigs) for s in scaffolds])
    logger.debug("%d contigs were joined into %d scaffolds",
                 num_contigs, len(scaffolds))

    if debugger.debugging and debug_output:
        links_out = os.path.join(debugger.debug_dir, "scaffolder.links")
        output_links(scaffolds, links_out)
        contigs_out = os.path.join(debugger.debug_dir, "scaffolder_contigs.txt")
        output_permutations(perm_container.target_perms, contigs_out)
        perms_out = os.path.join(debugger.debug_dir, "scaffolder_scaffolds.txt")
        output_scaffolds_premutations(scaffolds, perms_out)

    return scaffolds


def update_gaps(scaffolds):
    """
    Do it in the very end
    """
    for scf in scaffolds:
        for c1, c2 in zip(scf.contigs[:-1], scf.contigs[1:]):
            c1.link.gap -= c1.right_gap() + c2.left_gap()


def assign_scaffold_names(scaffolds, perm_container, ref_genome):
    """
    Names scaffolds according to homology to a chosen reference genome.
    Also ensures that scaffolds and corresponding reference chromosomes
    have the same strand.
    """
    MIN_RATE = 0.1
    PREFIX = "chr"
    chr_index = {}
    for perm in perm_container.ref_perms:
        if perm.genome_name == ref_genome:
            for block in perm.blocks:
                chr_index[block.block_id] = perm.chr_name, block.sign

    assigned_names = {}
    need_rev_compl = {}
    for scf in scaffolds:
        scf_index = defaultdict(int)
        sign_agreement = 0
        total = 0
        for contig in scf.contigs:
            for block in contig.perm.blocks:
                if block.block_id in chr_index:
                    chrom, sign = chr_index[block.block_id]
                    scf_index[chrom] += 1
                    total += 1
                    sign_agreement += int(sign == block.sign * contig.sign)

        name_str = PREFIX
        for chrom in sorted(scf_index, key=scf_index.get, reverse=True):
            if scf_index[chrom] > MIN_RATE * total:
                name_str += "_" + chrom
            else:
                break
        assigned_names[scf] = name_str
        need_rev_compl[scf] = sign_agreement < total // 2

    #in case of same names
    same_names = defaultdict(list)
    for scf, name in assigned_names.items():
        same_names[name].append(scf)
    for name, scf_list in same_names.items():
        scf_list.sort(key=lambda s: len(s.contigs), reverse=True)
        unlocalized = scf_list[1:]
        for scf in unlocalized:
            assigned_names[scf] += "_unlocalized"
        if len(unlocalized) > 1:
            for num, scf in enumerate(unlocalized):
                assigned_names[scf] += "." + str(num + 1)

    for scf in scaffolds:
        scf.name = assigned_names[scf]
        if need_rev_compl[scf]:
            new_contigs = [c.reverse_copy() for c in scf.contigs][::-1]
            for i in range(len(new_contigs) - 1):
                new_contigs[i].link = new_contigs[i + 1].link
            new_contigs[-1].link = Link(0, [])
            scf.contigs = new_contigs


def _extend_scaffolds(adjacencies, contigs, contig_index, correct_distances):
    """
    Assembles contigs into scaffolds
    """
    scaffolds = []
    visited = set()
    counter = [0]

    def extend_scaffold(contig):
        visited.add(contig)
        scf_name = "ragout-scaffold-{0}".format(counter[0])
        counter[0] += 1
        scf = Scaffold.with_contigs(scf_name, contig.left_end(),
                                    contig.right_end(), [contig])

        already_complete = (scf.right in adjacencies and
                            adjacencies[scf.right].block == scf.left and
                            adjacencies[scf.right].infinity)
        if already_complete:
            scaffolds.append(scf)
            return

        #go right
        while scf.right in adjacencies and not adjacencies[scf.right].infinity:
            adj_block = adjacencies[scf.right].block
            adj_distance = adjacencies[scf.right].distance
            adj_supporting_genomes = adjacencies[scf.right].supporting_genomes

            contig = contig_index[abs(adj_block)]
            if contig in visited:
                break

            if adj_block in [contig.left_end(), contig.right_end()]:
                if contig.left_end() == adj_block:
                    scf.contigs.append(contig)
                else:
                    scf.contigs.append(contig.reverse_copy())

                flank = scf.contigs[-2].right_gap() + scf.contigs[-1].left_gap()
                gap = adj_distance - flank if correct_distances else adj_distance
                scf.contigs[-2].link = Link(gap, adj_supporting_genomes)

                scf.right = scf.contigs[-1].right_end()
                visited.add(contig)
                continue

            break

        #go left
        while scf.left in adjacencies and not adjacencies[scf.left].infinity:
            adj_block = adjacencies[scf.left].block
            adj_distance = adjacencies[scf.left].distance
            adj_supporting_genomes = adjacencies[scf.left].supporting_genomes

            contig = contig_index[abs(adj_block)]
            if contig in visited:
                break

            if adj_block in [contig.right_end(), contig.left_end()]:
                if contig.right_end() == adj_block:
                    scf.contigs.insert(0, contig)
                else:
                    scf.contigs.insert(0, contig.reverse_copy())

                flank = scf.contigs[0].right_gap() + scf.contigs[1].left_gap()
                gap = adj_distance - flank if correct_distances else adj_distance
                scf.contigs[0].link = Link(gap, adj_supporting_genomes)

                scf.left = scf.contigs[0].left_end()
                visited.add(contig)
                continue

            break

        if len(scf.contigs) > 1:
            scaffolds.append(scf)

    #for contig in contigs:
    for contig in sorted(contigs, key=lambda c: c.signed_name()):
        if contig not in visited:
            extend_scaffold(contig)

    return scaffolds


def _make_contigs(perm_container):
    """
    A helper function to make Contig structures
    """
    contigs = []
    index = {}
    for perm in perm_container.target_perms:
        assert len(perm.blocks)
        contigs.append(Contig.with_perm(perm))
        for block in perm.blocks:
            assert block.block_id not in index
            index[block.block_id] = contigs[-1]

    return contigs, index
back to top