#(c) 2013-2014 by Authors #This file is a part of Ragout program. #Released under the BSD license (see LICENSE file) """ This module provides some functions for moving between two consecutive iterations """ from __future__ import absolute_import from __future__ import division from __future__ import print_function from collections import defaultdict from itertools import chain import os import logging from copy import deepcopy, copy import networkx as nx from ragout.shared.debug import DebugConfig from ragout.shared.datatypes import (Contig, Scaffold, Link, output_scaffolds_premutations) from ragout.scaffolder.output_generator import output_links from ragout.scaffolder.scaffolder import build_scaffolds from ragout.breakpoint_graph.inferer import Adjacency from ragout.breakpoint_graph.breakpoint_graph import GenChrPair from ragout.six.moves import range from ragout.six.moves import zip logger = logging.getLogger() debugger = DebugConfig.get_instance() def merge_scaffolds(big_scaffolds, small_scaffolds, perm_container, rearrange): """ Merges scaffold sets from different iterations. If rearrangements are allowed, tries to keep some small-scale rearrangements from the weaker scaffold set. """ logger.info("Merging two iterations") #synchronizing scaffolds to the same permutations big_updated = _update_scaffolds(big_scaffolds, perm_container) small_updated = _update_scaffolds(small_scaffolds, perm_container) if rearrange: projector = RearrangementProjector(big_updated, small_updated, True) new_adj = projector.project() big_rearranged = build_scaffolds(new_adj, perm_container, False, False) else: big_rearranged = big_updated merged_scf = _merge_scaffolds(big_rearranged, small_updated) merged_scf = _merge_consecutive_contigs(merged_scf) if debugger.debugging: links_out = os.path.join(debugger.debug_dir, "merged.links") output_links(merged_scf, links_out) perms_out = os.path.join(debugger.debug_dir, "merged_scaffolds.txt") output_scaffolds_premutations(merged_scf, perms_out) return merged_scf def get_breakpoints(scaffolds, bp_graph, perm_container): """ Counts target-specific adjacencies in scaffolds """ _updated_scaffolds = _update_scaffolds(scaffolds, perm_container) specific = 0 for scf in scaffolds: for cnt in scf.contigs: for block_1, block_2 in cnt.perm.iter_pairs(): genomes = bp_graph.genomes_support(-block_1.signed_id(), block_2.signed_id()) if set(genomes) == set([bp_graph.target]): specific += 1 logger.debug("Target-specific adjacencies in scaffolds: %d", specific) return specific def _merge_consecutive_contigs(scaffolds): """ Merges consecutive contig fragments originating from a same contig """ new_scaffolds = [] num_contigs = 0 for scf in scaffolds: new_contigs = [] cur_sign, cur_perm, cur_link = None, None, None for cnt in scf.contigs: consistent = False if cur_sign == cnt.sign and cnt.perm.chr_name == cur_perm.chr_name: if cur_sign > 0 and cur_perm.seq_end == cnt.perm.seq_start: cur_perm.seq_end = cnt.perm.seq_end cur_perm.blocks.extend(cnt.perm.blocks) consistent = True if cur_sign < 0 and cur_perm.seq_start == cnt.perm.seq_end: cur_perm.seq_start = cnt.perm.seq_start cur_perm.blocks = cnt.perm.blocks + cur_perm.blocks consistent = True if not consistent: if cur_perm: new_contigs.append(Contig.with_perm(cur_perm, cur_sign, cur_link)) cur_perm = deepcopy(cnt.perm) cur_sign = cnt.sign cur_link = cnt.link if cur_perm: new_contigs.append(Contig.with_perm(cur_perm, cur_sign, cur_link)) num_contigs += len(new_contigs) new_scaffolds.append(Scaffold.with_contigs(scf.name, None, None, new_contigs)) logger.debug("Merging consequtive contigs: %d left", num_contigs) return new_scaffolds def _update_scaffolds(scaffolds, perm_container): """ Updates scaffolds wrt to given permutations """ perm_index = defaultdict(list) for perm in perm_container.target_perms: perm_index[(perm.chr_name, perm.repeat_id)].append(perm) new_scaffolds = [] for scf in scaffolds: new_contigs = [] for contig in scf.contigs: inner_perms = [] for new_perm in perm_index[(contig.perm.chr_name, contig.perm.repeat_id)]: if (contig.perm.seq_start <= new_perm.seq_start < contig.perm.seq_end): inner_perms.append(new_perm) assert (contig.perm.seq_start < new_perm.seq_end <= contig.perm.seq_end) if not inner_perms: logger.debug("Lost: %s", str(contig.perm)) continue inner_perms.sort(key=lambda p: p.seq_start, reverse=contig.sign < 0) for prev_perm, next_perm in zip(inner_perms[:-1], inner_perms[1:]): if contig.sign > 0: gap_length = next_perm.seq_start - prev_perm.seq_end else: gap_length = prev_perm.seq_start - next_perm.seq_end support = [GenChrPair(prev_perm.genome_name, prev_perm.chr_name)] new_contigs.append(Contig.with_perm(prev_perm, contig.sign, Link(gap_length, support))) new_contigs.append(Contig.with_perm(inner_perms[-1], contig.sign, copy(contig.link))) if len(new_contigs): new_scaffolds.append(Scaffold.with_contigs(scf.name, None, None, new_contigs)) return new_scaffolds class RearrangementProjector: """ This class handles the projection of rearrangements from weaker set of scaffolds and ensures that these rearrangements are small-scale """ def __init__(self, old_scaffolds, new_scaffolds, conservative): self.old_scaffolds = old_scaffolds self.new_scaffolds = new_scaffolds self._build_bp_graph() self._build_adj_graph() self.conservative = conservative def connected_component_subgraphs(self,G): for c in nx.connected_components(G): yield G.subgraph(c) def project(self): #look for valid k-breaks num_kbreaks = 0 #subgraphs = list(nx.connected_component_subgraphs(self.bp_graph)) subgraphs = list(self.connected_component_subgraphs(self.bp_graph)) for subgr in subgraphs: #this is a cycle if any(len(subgr[node]) != 2 for node in subgr.nodes): continue red_edges = [] black_edges = [] for (u, v, data) in subgr.edges(data=True): if data["scf_set"] == "old": red_edges.append((u, v)) else: black_edges.append((u, v)) if not self._good_k_break(red_edges, black_edges): continue num_kbreaks += 1 for u, v in red_edges: self.bp_graph.remove_edge(u, v) self.adj_graph.remove_edge(u, v) for u, v in black_edges: #print(self.bp_graph[u][v]) link = self.bp_graph[u][v][0]["link"] infinity = self.bp_graph[u][v][0]["infinity"] self.bp_graph.add_edge(u, v, scf_set="old", link=link, infinity=infinity) self.adj_graph.add_edge(u, v) logger.debug("Made %d k-breaks", num_kbreaks) adjacencies = {} for (u, v, data) in self.bp_graph.edges(data=True): if data["scf_set"] == "old": gap, support = 0, [] if not data["infinity"]: gap = data["link"].gap support = data["link"].supporting_genomes adjacencies[u] = Adjacency(v, gap, support, data["infinity"]) adjacencies[v] = Adjacency(u, gap, support, data["infinity"]) return adjacencies def _good_k_break(self, old_edges, new_edges): """ Checks that the break does not change chromomsome structure significantly """ MIN_OVLP_SCORE = 0.9 MAX_K_BREAK = 4 if len(old_edges) > MAX_K_BREAK: return False new_adj_graph = self.adj_graph.copy() for u, v in old_edges: new_adj_graph.remove_edge(u, v) for u, v in new_edges: new_adj_graph.add_edge(u, v) old_sets = [set(g.nodes) for g in #nx.connected_component_subgraphs(self.adj_graph)] self.connected_component_subgraphs(self.adj_graph)] new_sets = [set(g.nodes) for g in #nx.connected_component_subgraphs(new_adj_graph)] self.connected_component_subgraphs(new_adj_graph)] if len(old_sets) != len(new_sets): return False for old_set in old_sets: max_overlap = 0 best_score = 0 for new_set in new_sets: overlap = len(old_set & new_set) if overlap > max_overlap: max_overlap = overlap best_score = float(overlap) / len(old_set | new_set) if best_score < MIN_OVLP_SCORE: return False return True def _build_bp_graph(self): """ No repeats assumed! """ old_contigs = set() for scf in self.old_scaffolds: for cnt in scf.contigs: old_contigs.add(cnt.name()) ###creating 2-colored breakpoint graph bp_graph = nx.MultiGraph() for scf in self.old_scaffolds: for cnt_1, cnt_2 in zip(scf.contigs[:-1], scf.contigs[1:]): bp_graph.add_edge(cnt_1.right_end(), cnt_2.left_end(), scf_set="old", link=copy(cnt_1.link), scf_name=scf.name, infinity=False) #chromosome ends bp_graph.add_edge(scf.contigs[-1].right_end(), scf.contigs[0].left_end(), scf_set="old", infinity=True) for scf in self.new_scaffolds: prev_cont = None first_ctg = None pos = 0 for pos, contig in enumerate(scf.contigs): if contig.name() in old_contigs: prev_cont = deepcopy(contig) first_ctg = prev_cont break if prev_cont is None: continue for next_cont in scf.contigs[pos + 1:]: if next_cont.name() not in old_contigs: prev_cont.link.gap += next_cont.length() + next_cont.link.gap common_genomes = (set(prev_cont.link.supporting_genomes) & set(next_cont.link.supporting_genomes)) prev_cont.link.supporting_genomes = list(common_genomes) else: bp_graph.add_edge(prev_cont.right_end(), next_cont.left_end(), scf_set="new", link=copy(prev_cont.link), scf_name=scf.name, infinity=False) prev_cont = deepcopy(next_cont) bp_graph.add_edge(prev_cont.right_end(), first_ctg.left_end(), scf_set="new", infinity=True, link=None) self.bp_graph = bp_graph def _build_adj_graph(self): adj_graph = nx.Graph() for scf in self.old_scaffolds: for cnt_1, cnt_2 in zip(scf.contigs[:-1], scf.contigs[1:]): adj_graph.add_edge(cnt_1.right_end(), cnt_2.left_end()) for cnt in scf.contigs: adj_graph.add_edge(cnt.left_end(), cnt.right_end()) #chromosome ends adj_graph.add_edge(scf.contigs[-1].right_end(), scf.contigs[0].left_end()) self.adj_graph = adj_graph def _merge_scaffolds(big_scaffolds, small_scaffolds): """ Performs the final merging step """ count_diff_scaf = 0 count_diff_orient = 0 count_inconsistent = 0 total_success = 0 total_fail = 0 total_inserted = 0 not_found = 0 big_count = defaultdict(int) for scf in big_scaffolds: for c in scf.contigs: big_count[c.perm] += 1 small_count = defaultdict(int) for scf in small_scaffolds: for c in scf.contigs: small_count[c.perm] += 1 repeats = set(seq for (seq, count) in chain(list(big_count.items()), list(small_count.items())) if count > 1) big_unique = set(seq for (seq, count) in big_count.items() if count == 1) small_index = {} for scf in small_scaffolds: for pos, contig in enumerate(scf.contigs): if contig.perm not in repeats: assert contig.perm not in small_index small_index[contig.perm] = (scf, pos) new_scafflods = [] for big_scf in big_scaffolds: new_contigs = [] #non_repeats = list(filter(lambda i: big_scf.contigs[i].perm # not in repeats, # xrange(len(big_scf.contigs)))) non_repeats = [i for i in range(len(big_scf.contigs)) if big_scf.contigs[i].perm not in repeats] for left_idx, right_idx in zip(non_repeats[:-1], non_repeats[1:]): left_cnt = big_scf.contigs[left_idx] right_cnt = big_scf.contigs[right_idx] consistent = False weak_contigs = None link_to_change = None if (left_cnt.perm in small_index and right_cnt.perm in small_index): consistent = True left_scf, left_pos = small_index[left_cnt.perm] right_scf, right_pos = small_index[right_cnt.perm] big_sign = left_cnt.sign == right_cnt.sign small_sign = (left_scf.contigs[left_pos].sign == right_scf.contigs[right_pos].sign) if left_scf != right_scf: count_diff_scaf += 1 consistent = False elif big_sign != small_sign: count_diff_orient += 1 consistent = False else: same_dir = left_pos < right_pos if not same_dir: left_pos, right_pos = right_pos, left_pos weak_contigs = left_scf.contigs[left_pos + 1 : right_pos] if any(c.perm in big_unique for c in weak_contigs): count_inconsistent += 1 consistent = False link_to_change = copy(left_scf.contigs[left_pos].link) #reverse complement if weak_contigs and not same_dir: link_to_change = copy(left_scf.contigs[right_pos - 1].link) weak_contigs = [c.reverse_copy() for c in weak_contigs[::-1]] for pw, nw in zip(weak_contigs[:-1], weak_contigs[1:]): pw.link = copy(nw.link) weak_contigs[-1].link = copy(left_scf.contigs[left_pos].link) else: not_found += 1 new_contigs.append(left_cnt) if consistent and weak_contigs: new_contigs[-1].link = link_to_change new_contigs.extend(weak_contigs) total_success += 1 total_inserted += len(weak_contigs) #logger.debug("Inserting '{0}' between {1} and {2}" # .format(map(lambda c: c.perm, weak_contigs), # left_cnt, right_cnt)) else: new_contigs.extend(big_scf.contigs[left_idx+1:right_idx]) total_fail += 1 if len(new_contigs) > 1: new_contigs.append(right_cnt) s = Scaffold(big_scf.name) s.contigs = new_contigs new_scafflods.append(s) else: #because of repeats new_scafflods.append(big_scf) logger.debug("Fail: not found: %d", not_found) logger.debug("Fail: different scaffolds: %d", count_diff_scaf) logger.debug("Fail: different orientatilns: %d", count_diff_orient) logger.debug("Fail: inconsistent: %d", count_inconsistent) logger.debug("Total success: %d", total_success) logger.debug("Total fail: %d", total_fail) logger.debug("Total inserted: %d", total_inserted) num_contigs = 0 for scf in new_scafflods: num_contigs += len(scf.contigs) logger.debug("Result: %d contigs in %d scaffolds", num_contigs, len(new_scafflods)) return new_scafflods