https://github.com/fenderglass/Ragout
Tip revision: a68b9dba9aa7570c0a8e3f579b662524ee919e2b authored by Mikhail Kolmogorov on 27 July 2018, 01:48:58 UTC
bioconda badge
bioconda badge
Tip revision: a68b9db
merge_iters.py
#(c) 2013-2014 by Authors
#This file is a part of Ragout program.
#Released under the BSD license (see LICENSE file)
"""
This module provides some functions for
moving between two consecutive iterations
"""
from collections import namedtuple, defaultdict
from itertools import product, chain, combinations
import os
import logging
from copy import deepcopy
import networkx as nx
from ragout.shared.debug import DebugConfig
from ragout.shared.datatypes import (Contig, Scaffold, Permutation, Link,
output_scaffolds_premutations, output_permutations)
from ragout.scaffolder.output_generator import output_links
from ragout.scaffolder.scaffolder import build_scaffolds
from ragout.breakpoint_graph.inferer import Adjacency
logger = logging.getLogger()
debugger = DebugConfig.get_instance()
def merge_scaffolds(big_scaffolds, small_scaffolds, perm_container, rearrange):
"""
Merges scaffold sets from different iterations. If rearrangements are allowed,
tries to keep some small-scale rearrangements from the weaker scaffold set.
"""
logger.info("Merging two iterations")
#synchronizing scaffolds to the same permutations
big_updated = _update_scaffolds(big_scaffolds, perm_container)
small_updated = _update_scaffolds(small_scaffolds, perm_container)
if rearrange:
projector = RearrangementProjector(big_updated, small_updated, True)
new_adj = projector.project()
big_rearranged = build_scaffolds(new_adj, perm_container, False, False)
else:
big_rearranged = big_updated
merged_scf = _merge_scaffolds(big_rearranged, small_updated)
merged_scf = _merge_consecutive_contigs(merged_scf)
if debugger.debugging:
links_out = os.path.join(debugger.debug_dir, "merged.links")
output_links(merged_scf, links_out)
perms_out = os.path.join(debugger.debug_dir, "merged_scaffolds.txt")
output_scaffolds_premutations(merged_scf, perms_out)
return merged_scf
def get_breakpoints(scaffolds, bp_graph, perm_container):
"""
Counts target-specific adjacencies in scaffolds
"""
updated_scaffolds = _update_scaffolds(scaffolds, perm_container)
specific = 0
for scf in scaffolds:
for cnt in scf.contigs:
for block_1, block_2 in cnt.perm.iter_pairs():
genomes = bp_graph.genomes_support(-block_1.signed_id(),
block_2.signed_id())
if set(genomes) == set([bp_graph.target]):
specific += 1
logger.debug("Target-specific adjacencies in scaffolds: {0}"
.format(specific))
return specific
def _merge_consecutive_contigs(scaffolds):
"""
Merges consecutive contig fragments originating from a same contig
"""
new_scaffolds = []
num_contigs = 0
for scf in scaffolds:
new_contigs = []
cur_sign, cur_perm, cur_link = None, None, None
for cnt in scf.contigs:
consistent = False
if cur_sign == cnt.sign and cnt.perm.chr_name == cur_perm.chr_name:
if cur_sign > 0 and cur_perm.seq_end == cnt.perm.seq_start:
cur_perm.seq_end = cnt.perm.seq_end
cur_perm.blocks.extend(cnt.perm.blocks)
consistent = True
if cur_sign < 0 and cur_perm.seq_start == cnt.perm.seq_end:
cur_perm.seq_start = cnt.perm.seq_start
cur_perm.blocks = cnt.perm.blocks + cur_perm.blocks
consistent = True
if not consistent:
if cur_perm:
new_contigs.append(Contig.with_perm(cur_perm, cur_sign, cur_link))
cur_perm = deepcopy(cnt.perm)
cur_sign = cnt.sign
cur_link = cnt.link
if cur_perm:
new_contigs.append(Contig.with_perm(cur_perm, cur_sign, cur_link))
num_contigs += len(new_contigs)
new_scaffolds.append(Scaffold.with_contigs(scf.name, None,
None, new_contigs))
logger.debug("Merging consequtive contigs: {0} left".format(num_contigs))
return new_scaffolds
def _update_scaffolds(scaffolds, perm_container):
"""
Updates scaffolds wrt to given permutations
"""
perm_index = defaultdict(list)
for perm in perm_container.target_perms:
perm_index[(perm.chr_name, perm.repeat_id)].append(perm)
new_scaffolds = []
for scf in scaffolds:
new_contigs = []
for contig in scf.contigs:
inner_perms = []
for new_perm in perm_index[(contig.perm.chr_name,
contig.perm.repeat_id)]:
if (contig.perm.seq_start <= new_perm.seq_start
< contig.perm.seq_end):
inner_perms.append(new_perm)
assert (contig.perm.seq_start < new_perm.seq_end
<= contig.perm.seq_end)
if not inner_perms:
logger.debug("Lost: {0}".format(contig.perm))
continue
inner_perms.sort(key=lambda p: p.seq_start, reverse=contig.sign < 0)
gap_length = contig.link.gap
for new_perm in inner_perms:
gap_length -= new_perm.length()
new_link = Link(gap_length, contig.link.supporting_genomes)
new_contigs.append(Contig.with_perm(new_perm, contig.sign,
new_link))
new_contigs[-1].link = contig.link
if len(new_contigs):
new_scaffolds.append(Scaffold.with_contigs(scf.name, None,
None, new_contigs))
return new_scaffolds
class RearrangementProjector:
"""
This class handles the projection of rearrangements from weaker set
of scaffolds and ensures that these rearrangements are small-scale
"""
def __init__(self, old_scaffolds, new_scaffolds, conservative):
self.old_scaffolds = old_scaffolds
self.new_scaffolds = new_scaffolds
self._build_bp_graph()
self._build_adj_graph()
self.conservative = conservative
def project(self):
#look for valid k-breaks
num_kbreaks = 0
subgraphs = list(nx.connected_component_subgraphs(self.bp_graph))
for subgr in subgraphs:
#this is a cycle
if any(len(subgr.neighbors(node)) != 2 for node in subgr.nodes()):
continue
red_edges = []
black_edges = []
for (u, v, data) in subgr.edges_iter(data=True):
if data["scf_set"] == "old":
red_edges.append((u, v))
else:
black_edges.append((u, v))
if not self._good_k_break(red_edges, black_edges):
continue
num_kbreaks += 1
for u, v in red_edges:
self.bp_graph.remove_edge(u, v)
self.adj_graph.remove_edge(u, v)
for u, v in black_edges:
link = self.bp_graph[u][v][0]["link"]
infinity = self.bp_graph[u][v][0]["infinity"]
self.bp_graph.add_edge(u, v, scf_set="old",
link=link, infinity=infinity)
self.adj_graph.add_edge(u, v)
logger.debug("Made {0} k-breaks".format(num_kbreaks))
adjacencies = {}
for (u, v, data) in self.bp_graph.edges_iter(data=True):
if data["scf_set"] == "old":
gap, support = 0, []
if not data["infinity"]:
gap = data["link"].gap
support = data["link"].supporting_genomes
adjacencies[u] = Adjacency(v, gap, support, data["infinity"])
adjacencies[v] = Adjacency(u, gap, support, data["infinity"])
return adjacencies
def _good_k_break(self, old_edges, new_edges):
"""
Checks that the break does not change chromomsome structure significantly
"""
MIN_OVLP_SCORE = 0.9
MAX_K_BREAK = 4
if len(old_edges) > MAX_K_BREAK:
return False
new_adj_graph = self.adj_graph.copy()
for u, v in old_edges:
new_adj_graph.remove_edge(u, v)
for u, v in new_edges:
new_adj_graph.add_edge(u, v)
all_nodes = new_adj_graph.nodes()
old_sets = list(map(lambda g: set(g.nodes()),
nx.connected_component_subgraphs(self.adj_graph)))
new_sets = list(map(lambda g: set(g.nodes()),
nx.connected_component_subgraphs(new_adj_graph)))
if len(old_sets) != len(new_sets):
return False
for old_set in old_sets:
max_overlap = 0
best_score = 0
for new_set in new_sets:
overlap = len(old_set & new_set)
if overlap > max_overlap:
max_overlap = overlap
best_score = float(overlap) / len(old_set | new_set)
if best_score < MIN_OVLP_SCORE:
return False
return True
def _build_bp_graph(self):
"""
No repeats assumed!
"""
old_contigs = set()
for scf in self.old_scaffolds:
for cnt in scf.contigs:
old_contigs.add(cnt.name())
###creating 2-colored breakpoint graph
bp_graph = nx.MultiGraph()
for scf in self.old_scaffolds:
for cnt_1, cnt_2 in zip(scf.contigs[:-1], scf.contigs[1:]):
bp_graph.add_edge(cnt_1.right_end(), cnt_2.left_end(),
scf_set="old", link=cnt_1.link,
scf_name=scf.name, infinity=False)
#chromosome ends
bp_graph.add_edge(scf.contigs[0].left_end(),
scf.contigs[-1].right_end(), scf_set="old",
infinity=True)
for scf in self.new_scaffolds:
prev_cont = None
for pos, contig in enumerate(scf.contigs):
if contig.name() in old_contigs:
prev_cont = contig
break
if prev_cont is None:
continue
for next_cont in scf.contigs[pos + 1:]:
if next_cont.name() not in old_contigs:
prev_cont.link.gap += next_cont.length() + next_cont.link.gap
common_genomes = (set(prev_cont.link.supporting_genomes) &
set(next_cont.link.supporting_genomes))
prev_cont.link.supporting_genomes = list(common_genomes)
continue
bp_graph.add_edge(prev_cont.right_end(), next_cont.left_end(),
scf_set="new", link=prev_cont.link,
scf_name=scf.name, infinity=False)
prev_cont = next_cont
self.bp_graph = bp_graph
def _build_adj_graph(self):
adj_graph = nx.Graph()
for scf in self.old_scaffolds:
for cnt_1, cnt_2 in zip(scf.contigs[:-1], scf.contigs[1:]):
adj_graph.add_edge(cnt_1.right_end(), cnt_2.left_end())
for cnt in scf.contigs:
adj_graph.add_edge(cnt.left_end(), cnt.right_end())
#chromosome ends
adj_graph.add_edge(scf.contigs[0].left_end(),
scf.contigs[-1].right_end())
self.adj_graph = adj_graph
def _merge_scaffolds(big_scaffolds, small_scaffolds):
"""
Performs the final merging step
"""
count_diff_scaf = 0
count_diff_orient = 0
count_inconsistent = 0
total_success = 0
total_fail = 0
total_inserted = 0
not_found = 0
big_count = defaultdict(int)
for scf in big_scaffolds:
for c in scf.contigs:
big_count[c.perm] += 1
small_count = defaultdict(int)
for scf in small_scaffolds:
for c in scf.contigs:
small_count[c.perm] += 1
repeats = set(seq for (seq, count) in
chain(big_count.items(), small_count.items()) if count > 1)
big_unique = set(seq for (seq, count) in big_count.items() if count == 1)
small_index = {}
for scf in small_scaffolds:
for pos, contig in enumerate(scf.contigs):
if contig.perm not in repeats:
assert contig.perm not in small_index
small_index[contig.perm] = (scf, pos)
new_scafflods = []
for big_scf in big_scaffolds:
new_contigs = []
non_repeats = list(filter(lambda i: big_scf.contigs[i].perm
not in repeats,
xrange(len(big_scf.contigs))))
for left_idx, right_idx in zip(non_repeats[:-1], non_repeats[1:]):
left_cnt = big_scf.contigs[left_idx]
right_cnt = big_scf.contigs[right_idx]
consistent = False
if (left_cnt.perm in small_index and
right_cnt.perm in small_index):
consistent = True
left_scf, left_pos = small_index[left_cnt.perm]
right_scf, right_pos = small_index[right_cnt.perm]
big_sign = left_cnt.sign == right_cnt.sign
small_sign = (left_scf.contigs[left_pos].sign ==
right_scf.contigs[right_pos].sign)
if left_scf != right_scf:
count_diff_scaf += 1
consistent = False
elif big_sign != small_sign:
count_diff_orient += 1
consistent = False
else:
same_dir = left_pos < right_pos
if not same_dir:
left_pos, right_pos = right_pos, left_pos
weak_contigs = left_scf.contigs[left_pos + 1 : right_pos]
if any(c.perm in big_unique for c in weak_contigs):
count_inconsistent += 1
consistent = False
if not same_dir:
weak_contigs = list(map(lambda c: c.reverse_copy(),
weak_contigs[::-1]))
link_to_change = left_scf.contigs[left_pos].link
else:
not_found += 1
new_contigs.append(left_cnt)
if consistent:
new_contigs[-1].link = link_to_change
new_contigs.extend(weak_contigs)
total_success += 1
total_inserted += len(weak_contigs)
#logger.debug("Inserting '{0}' between {1} and {2}"
# .format(map(lambda c: c.perm, weak_contigs),
# left_cnt, right_cnt))
else:
new_contigs.extend(big_scf.contigs[left_idx+1:right_idx])
total_fail += 1
if len(new_contigs) > 1:
new_contigs.append(right_cnt)
s = Scaffold(big_scf.name)
s.contigs = new_contigs
new_scafflods.append(s)
else: #because of repeats
new_scafflods.append(big_scf)
logger.debug("Fail: not found: {0}".format(not_found))
logger.debug("Fail: different scaffolds: {0}".format(count_diff_scaf))
logger.debug("Fail: different orientatilns: {0}".format(count_diff_orient))
logger.debug("Fail: inconsistent: {0}".format(count_inconsistent))
logger.debug("Total success: {0}".format(total_success))
logger.debug("Total fail: {0}".format(total_fail))
logger.debug("Total inserted: {0}".format(total_inserted))
num_contigs = 0
for scf in new_scafflods:
num_contigs += len(scf.contigs)
logger.debug("Result: {0} contigs in {1} scaffolds"
.format(num_contigs, len(new_scafflods)))
return new_scafflods