https://github.com/fenderglass/Ragout
Revision 54ec8318ee615f6aa28b3188cfd95ee5679cc317 authored by fenderglass on 15 January 2014, 22:20:24 UTC, committed by fenderglass on 15 January 2014, 22:20:24 UTC
1 parent b0a0b5a
Raw File
Tip revision: 54ec8318ee615f6aa28b3188cfd95ee5679cc317 authored by fenderglass on 15 January 2014, 22:20:24 UTC
sibelia path fix
Tip revision: 54ec831
assembly_refine.py
import networkx as nx
import re
import logging
from collections import namedtuple
from itertools import izip
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from datatypes import Contig, Scaffold

Edge = namedtuple("Edge", ["start", "end"])
logger = logging.getLogger()

#ignore heavy python-graphviz
def load_dot(filename):
    graph = nx.MultiDiGraph()
    edges = {}
    pattern = re.compile("([0-9]+)\s*\->\s*([0-9]+)\s*\[.*=.*\"(.+)\".*\];")
    for line in open(filename, "r").read().splitlines():
        m = pattern.match(line)
        if not m:
            continue

        v1, v2 = int(m.group(1)), int(m.group(2))
        graph.add_node(v1)
        graph.add_node(v2)
        graph.add_edge(v1, v2, label=m.group(3))
        edges[m.group(3)] = Edge(v1, v2)
    return graph, edges


def check_unique(graph, path):
    for v1, v2 in izip(path[:-1], path[1:]):
        assert graph.has_edge(v1, v2)
        if len(graph[v1][v2]) > 1:
            return False
    return True


def get_unique_path(graph, edges, prev_cont, new_cont, max_path_len):
    try:
        src = edges[str(prev_cont)].end
        dst = edges[str(new_cont)].start
    except KeyError:
        logger.debug("contigs are not in the graph")
        return None

    if src == dst:
        logger.debug("adjacent contigs {0} -- {1}".format(prev_cont, new_cont))
        return None

    if not nx.has_path(graph, src, dst):
        logger.debug("no path {0} -- {1}".format(prev_cont, new_cont))
        return None

    paths = [p for p in nx.all_shortest_paths(graph, src, dst)]
    if len(paths) > 1 or not check_unique(graph, paths[0]):
        logger.debug("multiple paths {0} -- {1}".format(prev_cont, new_cont))
        return None

    path = paths[0]
    if len(path) > max_path_len:
        logger.debug("too long path {0} -- {1} of length {2}"
                       .format(prev_cont, new_cont, len(path)))
        return None

    path_edges = []
    for p_start, p_end in izip(path[:-1], path[1:]):
        found_edge = None
        for edge_id, edge in edges.iteritems():
            if edge == Edge(p_start, p_end):
                found_edge = edge_id
                break
        assert found_edge
        path_edges.append(found_edge)

    logger.debug("unique path {0} -- {1} of length {2}"
                 .format(prev_cont, new_cont, len(path)))

    return path_edges


def insert_from_graph(graph_file, scaffolds_in, max_path_len):
    new_scaffolds = []
    graph, edges = load_dot(graph_file)

    ordered_contigs = set()
    for scf in scaffolds_in:
        ordered_contigs |= set(map(lambda s: s.name, scf.contigs))

    for scf in scaffolds_in:
        new_scaffolds.append(Scaffold(scf.name))

        for prev_cont, new_cont in zip(scf.contigs[:-1], scf.contigs[1:]):
            new_scaffolds[-1].contigs.append(prev_cont)

            #find unique path
            path_edges = get_unique_path(graph, edges, prev_cont, new_cont, max_path_len)
            if path_edges is None:
                continue

            #check path consistency
            consistent = True
            for edge in path_edges:
                if edge[1:] in ordered_contigs:
                    logger.debug("Path inconsistency {0} -- {1}: {2}"
                                 .format(prev_cont, new_cont, edge))
                    consistent = False
                    break
            if not consistent:
                continue

            #insert contigs along the path
            for edge in path_edges:
                new_scaffolds[-1].contigs[-1].gap = 0
                new_scaffolds[-1].contigs.append(Contig.from_sting(edge))
                new_scaffolds[-1].contigs[-1].gap = 0

        new_scaffolds[-1].contigs.append(new_cont)
    return new_scaffolds


def refine_contigs(graph_file, scaffolds, max_path_len):
    logger.info("Refining with assembly graph")
    logger.debug("Max path len = {0}".format(max_path_len))
    new_scaffolds = insert_from_graph(graph_file, scaffolds, max_path_len)
    return new_scaffolds
back to top