https://github.com/fenderglass/Ragout
Raw File
Tip revision: 9b706fa0825b6a8f25a626ceffa9b4c71bdaf9e4 authored by Mikhail Kolmogorov on 30 July 2018, 20:00:30 UTC
version update
Tip revision: 9b706fa
output_generator.py
#(c) 2013-2014 by Authors
#This file is a part of Ragout program.
#Released under the BSD license (see LICENSE file)

from itertools import repeat
from collections import defaultdict
import logging
import os

from ragout.parsers.fasta_parser import write_fasta_dict, reverse_complement
from ragout.__version__ import __version__
import ragout.shared.config as config

logger = logging.getLogger()


class OutputGenerator:
    def __init__(self, fragments_fasta, scaffolds):
        self.fragments_fasta = fragments_fasta
        self.scaffolds = scaffolds

        self.unplaced_fasta = None
        self.scaffolds_fasta = None
        self.used_fragments_len = None
        self.introduced_gap_len = None

    def make_output(self, out_dir, out_prefix):
        """
        Makes full output to the given directory
        """
        out_links = os.path.join(out_dir, out_prefix + "_scaffolds.links")
        out_agp = os.path.join(out_dir, out_prefix + "_scaffolds.agp")
        out_chr = os.path.join(out_dir, out_prefix + "_scaffolds.fasta")
        out_unplaced = os.path.join(out_dir, out_prefix + "_unplaced.fasta")

        self._make_unplaced_fasta()
        write_fasta_dict(self.unplaced_fasta, out_unplaced)

        self._fix_gaps()
        self._make_scaffolds_fasta()
        write_fasta_dict(self.scaffolds_fasta, out_chr)

        self._output_agp(out_agp, out_prefix)
        self._print_statistics()
        output_links(self.scaffolds, out_links)

    def _fix_gaps(self):
        """
        Handles negative gaps, ensures that gap values are
        within deined range
        """
        def get_seq(contig):
            seq_name, seg_start, seg_end = contig.name_with_coords()
            cont_seq = self.fragments_fasta[seq_name][seg_start:seg_end]
            if contig.sign < 0:
                cont_seq = reverse_complement(cont_seq)
            return cont_seq

        def count_ns(cnt_1, cnt_2):
            seq_1, seq_2 = get_seq(cnt_1), get_seq(cnt_2)

            left_ns, right_ns = 0, 0
            for i in xrange(len(seq_1) - 1, 0, -1):
                if seq_1[i].upper() != "N":
                    break
                left_ns += 1
            for i in xrange(len(seq_2) - 1):
                if seq_2[i].upper() != "N":
                    break
                right_ns += 1
            return left_ns, right_ns

        for scf in self.scaffolds:
            for cnt_1, cnt_2 in zip(scf.contigs[:-1], scf.contigs[1:]):
                if cnt_1.link.supporting_assembly:
                    cnt_1.trim_right(max(0, -cnt_1.link.gap))
                    cnt_1.link.gap = max(0, cnt_1.link.gap)
                    continue

                left_ns, right_ns = count_ns(cnt_1, cnt_2)

                cnt_1.trim_right(left_ns)
                cnt_2.trim_left(right_ns)
                cnt_1.link.gap += left_ns + right_ns
                cnt_1.link.gap = max(cnt_1.link.gap,
                                     config.vals["min_scaffold_gap"])
                #cnt_1.link.gap = min(cnt_1.link.gap,
                #                     config.vals["max_scaffold_gap"])

    def _output_agp(self, out_agp, assembly_name):
        """
        Output file in NCBI AGP format
        """
        SHIFT = 1
        with open(out_agp, "w") as f:
            f.write("##agp-version  2.0\n")
            f.write("#ASSEMBLY NAME: {0}\n".format(assembly_name))
            f.write("#DESCRIPTION: Pseudochromosome assembly\n")
            f.write("#PROGRAM: Ragout v{0}\n".format(__version__))
            for scf in sorted(self.scaffolds, key=lambda s: s.name):
                chr_pos = 0
                for contig_id, contig in enumerate(scf.contigs):
                    chr_start = chr_pos
                    chr_end = chr_pos + contig.length()
                    chr_pos = chr_end + contig.link.gap
                    cont_name, cont_start, cont_end = contig.name_with_coords()
                    strand = "+" if contig.sign > 0 else "-"
                    support = _support_to_string(contig.link)

                    contig_num = 2 * contig_id + 1
                    gap_num = 2 * contig_id + 2
                    cont_fields = [scf.name, chr_start + SHIFT, chr_end,
                                   contig_num, "W", cont_name, cont_start + SHIFT,
                                   cont_end, strand]
                    f.write("\t".join(map(str, cont_fields)) + "\n")
                    if contig.link.gap > 0:
                        gap_fields = [scf.name, chr_end + SHIFT, chr_pos, gap_num,
                                      "N", contig.link.gap,
                                      "scaffold", "yes", support]
                        f.write("\t".join(map(str, gap_fields)) + "\n")

    def _make_unplaced_fasta(self):
        """
        Creates unplaced (not used in scaffolds) sequences in FASTA format
        """
        used_ranges_by_seq = defaultdict(list)
        for scf in self.scaffolds:
            for ctg in scf.contigs:
                seq_name, seq_start, seq_end = ctg.name_with_coords()
                used_ranges_by_seq[seq_name].append((seq_start, seq_end))
        for seq_name in self.fragments_fasta:
            seq_len = len(self.fragments_fasta[seq_name])
            used_ranges_by_seq[seq_name].append((0, 0))
            used_ranges_by_seq[seq_name].append((seq_len, seq_len))
            used_ranges_by_seq[seq_name].sort()

        unused_ranges_by_seq = defaultdict(list)
        for seq_name in self.fragments_fasta:
            for range_1, range_2 in zip(used_ranges_by_seq[seq_name][:-1],
                                        used_ranges_by_seq[seq_name][1:]):
                if range_1[1] < range_2[0]:
                    unused_ranges_by_seq[seq_name].append((range_1[1],
                                                           range_2[0]))

        unplaced_fasta = {}
        for seq_name, unused_ranges in unused_ranges_by_seq.items():
            for ur in unused_ranges:
                if ur[0] == 0 and ur[1] == len(self.fragments_fasta[seq_name]):
                    fragment_name = seq_name
                else:
                    fragment_name = seq_name + "[{0}:{1}]".format(ur[0], ur[1])
                unplaced_fasta[fragment_name] = \
                                    self.fragments_fasta[seq_name][ur[0]:ur[1]]

        self.unplaced_fasta = unplaced_fasta

    def _make_scaffolds_fasta(self):
        """
        Creates FASTA sequences of scaffolds
        """
        logger.info("Generating FASTA output")
        scaffolds_fasta = {}

        self.used_fragments_len = 0
        self.introduced_gap_len = 0
        for scf in self.scaffolds:
            scf_seqs = []
            for contig in scf.contigs:
                seq_name, seq_start, seq_end = contig.name_with_coords()
                cont_seq = self.fragments_fasta[seq_name][seq_start:seq_end]
                if contig.sign < 0:
                    cont_seq = reverse_complement(cont_seq)

                scf_seqs.append(cont_seq)
                if contig.link.gap > 0:
                    scf_seqs.append("N" * contig.link.gap)
                    self.introduced_gap_len += contig.link.gap

                self.used_fragments_len += len(cont_seq)

            scf_seq = "".join(scf_seqs)
            scaffolds_fasta[scf.name] = scf_seq

        self.scaffolds_fasta = scaffolds_fasta

    def _print_statistics(self):
        """
        Computes and prints some useful statistics
        """
        unplaced_len = sum(map(len, self.unplaced_fasta.values()))
        fragments_len = sum(map(len, self.fragments_fasta.values()))
        output_len = self.used_fragments_len + self.introduced_gap_len

        used_perc = 100 * float(self.used_fragments_len) / fragments_len
        unplaced_perc = 100 * float(unplaced_len) / fragments_len
        gap_perc = 100 * float(self.introduced_gap_len) / output_len

        unplaced_count = len(self.unplaced_fasta)
        used_fragments_num = 0
        for scf in self.scaffolds:
            used_fragments_num += len(scf.contigs)

        contigs_len = [len(c) for c in self.fragments_fasta.values()]
        scaffolds_len = [len(c) for c in self.scaffolds_fasta.values()]
        contigs_n50 = _calc_n50(contigs_len, fragments_len)
        scaffolds_n50 = _calc_n50(scaffolds_len, output_len)

        logger.info("Assembly statistics:\n\n"
                    "\tScaffolds:\t\t{0}\n"
                    "\tUsed fragments:\t\t{1}\n"
                    "\tScaffolds length:\t{2}\n\n"
                    "\tUnplaced fragments:\t{3}\n"
                    "\tUnplaced length:\t{4} ({5:2.2f}%)\n"
                    "\tIntroduced Ns length:\t{6} ({7:2.2f}%)\n\n"
                    "\tFragments N50:\t\t{8}\n"
                    "\tAssembly N50:\t\t{9}\n"
                    .format(len(self.scaffolds), used_fragments_num,
                            output_len, unplaced_count, unplaced_len,
                            unplaced_perc, self.introduced_gap_len, gap_perc,
                            contigs_n50, scaffolds_n50))


def output_links(scaffolds, out_links):
    """
    Outputs pretty table with information about adjacencies
    """
    HEADER = ["sequence", "start", "length", "gap", "support"]
    COL_GAP = 4

    with open(out_links, "w") as f:
        for scf in sorted(scaffolds, key=lambda s: s.name):
            rows = []
            cur_pos = 0

            for contig in scf.contigs:
                start = cur_pos
                cur_pos = start + contig.length() + contig.link.gap
                support = _support_to_string(contig.link)

                rows.append([contig.signed_name(), str(start),
                            str(contig.length()), str(contig.link.gap),
                            support])

            col_widths = repeat(0)
            for row in [HEADER] + rows:
                col_widths = [max(len(v), w) for v, w in zip(row, col_widths)]
            line_len = sum(col_widths) + COL_GAP * len(col_widths)

            #header
            f.write("-" * line_len + "\n")
            f.write(scf.name + "\n")
            f.write("-" * line_len + "\n")
            for hdr, width in zip(HEADER, col_widths):
                f.write(hdr + (" " * (width - len(hdr) + COL_GAP)))
            f.write("\n" + "-" * line_len + "\n")

            #values
            for row in rows:
                for val, width in zip(row, col_widths):
                    f.write(val + (" " * (width - len(val) + COL_GAP)))
                f.write("\n")

            f.write("-" * line_len + "\n\n")


def _support_to_string(link):
    """
    Converts information about supporting adjacencies to string.
    Could be used separately form OutputGenerator for debugging purposes
    """
    supp_genomes = sorted(link.supporting_genomes)
    support_to_str = lambda gc: "{0}:{1}".format(gc.genome, gc.chr)
    support = ",".join(map(support_to_str, supp_genomes))
    if link.supporting_assembly:
        support += ",~>"
    return support


def _calc_n50(scaffolds_lengths, assembly_len):
    n50 = 0
    sum_len = 0
    for l in sorted(scaffolds_lengths, reverse=True):
        sum_len += l
        if sum_len > assembly_len / 2:
            n50 = l
            break
    return n50
back to top