#(c) 2013-2014 by Authors
#This file is a part of Ragout program.
#Released under the BSD license (see LICENSE file)

"""
The main Ragout module. It defines top-level logic of the program
"""

from __future__ import absolute_import
from __future__ import division
import os
import shutil
import logging
import argparse
from collections import namedtuple

import ragout.assembly_graph.assembly_refine as asref
import ragout.scaffolder.scaffolder as scfldr
import ragout.scaffolder.merge_iters as merge
import ragout.maf2synteny.maf2synteny as m2s
import ragout.overlap.overlap as overlap
import ragout.shared.config as config
from ragout.scaffolder.output_generator import OutputGenerator
from ragout.overlap.overlap import OverlapException
from ragout.phylogeny.phylogeny import Phylogeny
from ragout.parsers.phylogeny_parser import PhyloException
from ragout.breakpoint_graph.permutation import (PermutationContainer,
                                                 PermException)
from ragout.synteny_backend.synteny_backend import (SyntenyBackend,
                                                    BackendException)
from ragout.parsers.recipe_parser import parse_ragout_recipe, RecipeException
from ragout.parsers.fasta_parser import read_fasta_dict, FastaError
from ragout.shared.debug import DebugConfig
from ragout.breakpoint_graph.breakpoint_graph import BreakpointGraph
from ragout.breakpoint_graph.inferer import AdjacencyInferer
from ragout.breakpoint_graph.chimera_detector import ChimeraDetector
from ragout.__version__ import __version__

#register backends
import ragout.synteny_backend.sibelia
#import synteny_backend.cactus
import ragout.synteny_backend.maf
import ragout.synteny_backend.hal

import ragout.six as six

logger = logging.getLogger()
debugger = DebugConfig.get_instance()

RunStage = namedtuple("RunStage", ["name", "block_size", "ref_indels",
                                   "repeats", "rearrange"])
ID_SMALLEST = -1


def _enable_logging(log_file, debug):
    """
    Turns on logging, sets debug levels and assigns a log file
    """
    log_formatter = logging.Formatter("[%(asctime)s] %(name)s: %(levelname)s: "
                                      "%(message)s", "%H:%M:%S")
    console_formatter = logging.Formatter("[%(asctime)s] %(levelname)s: "
                                          "%(message)s", "%H:%M:%S")
    console_log = logging.StreamHandler()
    console_log.setFormatter(console_formatter)
    if not debug:
        console_log.setLevel(logging.INFO)

    file_handler = logging.FileHandler(log_file, mode="w")
    file_handler.setFormatter(log_formatter)

    logger.setLevel(logging.DEBUG)
    logger.addHandler(console_log)
    logger.addHandler(file_handler)


def _check_extern_modules(backend):
    """
    Checks if all necessary native modules are available
    """
    backends = SyntenyBackend.get_available_backends()
    if backend not in backends:
        raise BackendException("\"{0}\" is not installed.".format(backend))

    if not m2s.check_binary():
        raise BackendException("maf2synteny binary is missing, "
                               "did you run 'make'?")

    if not overlap.check_binary():
        raise BackendException("overlap binary is missing, "
                               "did you run 'make'?")


def _make_run_stages(block_sizes, resolve_repeats):
    """
    Setting parameters of run stages (iterations)
    """
    stages = []

    #general stages for structural assembly
    for block in block_sizes:
        stages.append(RunStage(name=str(block), block_size=block,
                               ref_indels=False, repeats=False,
                               rearrange=True))

    #refining stage to close assembly gaps
    stages.append(RunStage(name="refine", block_size=block_sizes[ID_SMALLEST],
                           ref_indels=True, repeats=resolve_repeats,
                           rearrange=False))

    return stages


def _get_phylogeny_and_naming_ref(recipe, permutation_file):
    """
    Retrieves phylogeny (infers if necessary) as well as
    naming reference genome
    """
    if "tree" in recipe:
        logger.info("Phylogeny is taken from the recipe")
        phylogeny = Phylogeny.from_newick(recipe["tree"])
    else:
        logger.info("Inferring phylogeny from synteny blocks data")
        perm_cont = PermutationContainer(permutation_file, recipe,
                                         resolve_repeats=False,
                                         allow_ref_indels=True,
                                         phylogeny=None)
        phylogeny = Phylogeny.from_permutations(perm_cont)
        logger.info("Inferred tree: %s", phylogeny.tree_string)

    if "naming_ref" in recipe:
        naming_ref = recipe["naming_ref"]
    else:
        leaves_sorted = phylogeny.leaves_by_distance(recipe["target"])
        naming_ref = leaves_sorted[0]
        logger.info("'%s' is chosen as a naming reference", naming_ref)

    return phylogeny, naming_ref


def _get_synteny_scale(recipe, synteny_backend):
    """
    Determines the synteny block sizes to run with
    """
    if "blocks" in recipe:
        if isinstance(recipe["blocks"], six.string_types):
            scale = config.vals["blocks"][recipe["blocks"]]
        else:
            scale = recipe["blocks"]
    else:
        scale = config.vals["blocks"][synteny_backend.infer_block_scale(recipe)]

    logger.info("Running with synteny block sizes '%s'", str(scale))
    return scale


def _run_ragout(args):
    """
    Top-level logic of the program
    """
    if not os.path.isdir(args.out_dir):
        os.mkdir(args.out_dir)

    debug_root = os.path.join(args.out_dir, "debug")
    debugger.set_debugging(args.debug)
    debugger.set_debug_dir(debug_root)
    debugger.clear_debug_dir()

    out_log = os.path.join(args.out_dir, "ragout.log")
    _enable_logging(out_log, args.debug)
    logger.info("Starting Ragout v%s", str(__version__))

    #parsing recipe, preparing synteny backend
    _check_extern_modules(args.synteny_backend)
    synteny_backend = \
        SyntenyBackend.get_available_backends()[args.synteny_backend]
    recipe = parse_ragout_recipe(args.recipe)
    synteny_sizes = _get_synteny_scale(recipe, synteny_backend)

    #running synteny backend to get synteny blocks
    perm_files = synteny_backend.make_permutations(recipe, synteny_sizes,
                                                   args.out_dir,
                                                   args.overwrite,
                                                   args.threads)

    #setting up phylogenetic tree
    phylo_perm_file = perm_files[synteny_sizes[ID_SMALLEST]]
    phylogeny, naming_ref = _get_phylogeny_and_naming_ref(recipe,
                                                          phylo_perm_file)

    #parsing permutation files, applying filters and building breakpoint graphs
    logger.info("Processing permutation files")
    raw_bp_graphs = {}
    stage_perms = {}
    run_stages = _make_run_stages(synteny_sizes, args.resolve_repeats)
    for stage in run_stages:
        debugger.set_debug_dir(os.path.join(debug_root, stage.name))
        stage_perms[stage] = \
            PermutationContainer(perm_files[stage.block_size], recipe,
                                 stage.repeats, stage.ref_indels, phylogeny)
        raw_bp_graphs[stage] = BreakpointGraph(stage_perms[stage])

    #initializing chimera detector
    target_sequences = read_fasta_dict(synteny_backend.get_target_fasta())
    chim_detect = None
    if not args.solid_scaffolds:
        chim_detect = ChimeraDetector(raw_bp_graphs, run_stages,
                                      target_sequences)

    #inferring adjacencies: loop over stages (iterations)
    scaffolds = None
    prev_stages = []
    for stage in run_stages:
        logger.info("Stage \"%s\"", stage.name)
        debugger.set_debug_dir(os.path.join(debug_root, stage.name))
        prev_stages.append(stage)

        broken_perms = stage_perms[stage]
        if not args.solid_scaffolds:
            broken_perms = chim_detect.break_contigs(stage_perms[stage],
                                                     [stage])
        breakpoint_graph = BreakpointGraph(broken_perms)
        adj_inferer = AdjacencyInferer(breakpoint_graph, phylogeny)
        adjacencies = adj_inferer.infer_adjacencies()
        cur_scaffolds = scfldr.build_scaffolds(adjacencies, broken_perms,
                                               debug_output=True,
                                               correct_distances=False)

        if scaffolds is not None:
            if not args.solid_scaffolds:
                broken_perms = chim_detect.break_contigs(stage_perms[stage],
                                                         prev_stages)
            cur_scaffolds = merge.merge_scaffolds(scaffolds, cur_scaffolds,
                                                  broken_perms,
                                                  stage.rearrange)
        merge.get_breakpoints(cur_scaffolds, breakpoint_graph, broken_perms)
        scaffolds = cur_scaffolds

    debugger.set_debug_dir(debug_root)

    #name scaffolds according to one of the references
    last_stage = run_stages[ID_SMALLEST]
    scfldr.assign_scaffold_names(scaffolds, stage_perms[last_stage],
                                 naming_ref)
    scfldr.update_gaps(scaffolds)

    #refine with the assembly graph
    if args.refine:
        out_overlap = os.path.join(args.out_dir, "contigs_overlap.dot")
        overlap.make_overlap_graph(synteny_backend.get_target_fasta(),
                                   out_overlap)
        scaffolds = asref.refine_scaffolds(out_overlap, scaffolds,
                                           target_sequences)
        if args.debug:
            shutil.copy(out_overlap, debugger.debug_dir)
        os.remove(out_overlap)

    out_gen = OutputGenerator(target_sequences, scaffolds)
    out_gen.make_output(args.out_dir, recipe["target"])
    logger.info("Done!")


def main():
    parser = argparse.ArgumentParser(
        description="Chromosome assembly with multiple references",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("recipe", metavar="recipe_file",
                        help="path to recipe file")
    parser.add_argument("-o", "--outdir", dest="out_dir",
                        metavar="output_dir",
                        help="output directory", default="ragout-out")
    parser.add_argument("-s", "--synteny", dest="synteny_backend",
                        default="sibelia", choices=["sibelia", "maf", "hal"],
                        help="backend for synteny block decomposition")
    parser.add_argument("--refine", action="store_true",
                        dest="refine", default=False,
                        help="enable refinement with assembly graph")
    parser.add_argument("--solid-scaffolds", action="store_true",
                        dest="solid_scaffolds", default=False,
                        help="do not break input sequences - disables "
                             "chimera detection module")
    parser.add_argument("--overwrite", action="store_true", default=False,
                        dest="overwrite",
                        help="overwrite results from the previous run")
    parser.add_argument("--repeats", action="store_true", default=False,
                        dest="resolve_repeats",
                        help="enable repeat resolution algorithm")
    parser.add_argument("--debug", action="store_true",
                        dest="debug", default=False,
                        help="enable debug output")
    parser.add_argument("-t", "--threads", dest="threads",
                        type=int, default=1,
                        help="number of threads for synteny backend")
    parser.add_argument("--version", action="version", version=__version__)
    args = parser.parse_args()

    try:
        _run_ragout(args)
    except (RecipeException, PhyloException, PermException,
            BackendException, OverlapException, FastaError) as e:
        logger.error("An error occurred while running Ragout:")
        logger.error(e)
        return 1

    return 0
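

#A minimal entry-point guard: this is an assumption for convenience, not part
#of the original module. Ragout is normally launched through its installed
#console script, which imports and calls main(); the guard below additionally
#allows running the module directly, e.g. `python -m ragout.main recipe.rcp`
#(the exact module path is hypothetical and depends on the package layout).
if __name__ == "__main__":
    import sys
    sys.exit(main())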