https://github.com/fenderglass/Ragout
Revision fedd17002dc18f3e22f3d2c35b89979e6cfc1535 authored by fenderglass on 08 August 2013, 11:30:28 UTC, committed by fenderglass on 08 August 2013, 11:30:28 UTC
1 parent 0124ed0
Raw File
Tip revision: fedd17002dc18f3e22f3d2c35b89979e6cfc1535 authored by fenderglass on 08 August 2013, 11:30:28 UTC
refactoring, repo reorganisation
Tip revision: fedd170
refass.py
#!/usr/bin/env python

import sys
import os
import argparse
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


import scripts.infer_connections as ic
import scripts.breakpoint_graph as bg
import scripts.sibelia_parser as sp


class DumbContig(sp.Contig):
    def __init__(self, length):
        sp.Contig.__init__(self, "")
        self.length = length

class Scaffold:
    def __init__(self, left, right, contigs):
        self.left = left
        self.right = right
        self.contigs = contigs

#def sign_of(val):
#    return math.copysign(1, val)


def extend_scaffolds(contigs, contig_index, connections):
    scaffolds = []
    visited = set()

    def extend_scaffold(contig):
        visited.add(contig)
        scf = Scaffold(contig.blocks[0], contig.blocks[-1], [contig])
        scaffolds.append(scf)

        #go right
        while scf.right in connections:
            adjacent = connections[scf.right].end
            distance = connections[scf.right].distance
            assert len(contig_index[abs(adjacent)]) == 1

            contig = contigs[contig_index[abs(adjacent)][0]]
            if contig in visited:
                break

            if contig.blocks[0] == adjacent:
                scf.contigs.append(DumbContig(distance))
                scf.contigs.append(contig)
                scf.right = contig.blocks[-1]
                visited.add(contig)
                continue

            if -contig.blocks[-1] == adjacent:
                scf.contigs.append(DumbContig(distance))
                scf.contigs.append(contig)
                scf.contigs[-1].sign = -1
                scf.right = -contig.blocks[0]
                visited.add(contig)
                continue

            break

        #go left
        while -scf.left in connections:
            adjacent = -connections[-scf.left].end
            distance = connections[-scf.left].distance
            assert len(contig_index[abs(adjacent)]) == 1

            contig = contigs[contig_index[abs(adjacent)][0]]
            if contig in visited:
                break

            if contig.blocks[-1] == adjacent:
                scf.contigs.insert(0, DumbContig(distance))
                scf.contigs.insert(0, contig)
                scf.left = contig.blocks[0]
                visited.add(contig)
                continue

            if -contig.blocks[0] == adjacent:
                scf.contigs.insert(0, DumbContig(distance))
                scf.contigs.insert(0, contig)
                scf.contigs[0].sign = -1
                scf.left = -contig.blocks[-1]
                visited.add(contig)
                continue

            break

    for contig in contigs:
        if contig not in visited:
            extend_scaffold(contig)
    return scaffolds


def get_scaffolds(contigs, contig_index, connections):
    scaffolds = extend_scaffolds(contigs, contig_index, connections)
    scaffolds = filter(lambda s: len(s.contigs) > 1, scaffolds)
    for scf in scaffolds:
        contigs = filter(lambda c : not isinstance(c, DumbContig), scf.contigs)
        for contig in contigs:
            if contig.sign > 0:
                print contig.blocks,
            else:
                print map(lambda b: -b, contig.blocks)[::-1],
        print ""
    return scaffolds


def output_scaffolds(input_contigs, scaffolds, out_file, write_contigs=False):
    MIN_CONTIG_LEN = 500
    contigs = SeqIO.parse(input_contigs, "fasta")
    out_stream = open(out_file, "w")
    queue = {}
    for rec in contigs:
        queue[rec.id] = rec.seq

    counter = 0
    for scf in scaffolds:
        scf_seq = Seq("")
        buffer = ""

        for i, contig in enumerate(scf.contigs):
            if isinstance(contig, DumbContig):
                buffer = "N" * contig.length
                continue

            cont_seq = queue[contig.name]
            del queue[contig.name]

            if contig.sign < 0:
                cont_seq = cont_seq.reverse_complement()

            if i > 0:
                #check for overlapping
                overlap = False
                for window in xrange(5, 100):
                    if str(scf_seq)[-window:] == str(cont_seq)[0:window]:
                        assert overlap == False
                        cont_seq = cont_seq[window:]
                        overlap = True
                if not overlap:
                    scf_seq += Seq(buffer)
                #scf_seq += Seq("N" * 11)    #easy mode
            buffer = ""
            scf_seq += cont_seq

        name = "scaffold{0}".format(counter)
        counter += 1
        SeqIO.write(SeqRecord(scf_seq, id=name, description=""), out_stream, "fasta")

    count = 0
    for h, seq in queue.iteritems():
        if len(seq) > MIN_CONTIG_LEN:
            count += 1
    print "Done,", count, "contigs left"
    if write_contigs:
        for h, seq in queue.iteritems():
            SeqIO.write(SeqRecord(seq, id=h, description=""), out_stream, "fasta")


def do_job(sibelia_dir, contigs_file, out_scaffolds, out_graph):
    coords_file = os.path.join(sibelia_dir, "blocks_coords.txt")
    permutations_file = os.path.join(sibelia_dir, "genomes_permutations.txt")

    blocks_coords, seqid = sp.parse_coords_file(coords_file)
    permutations, contigs = sp.parse_permutations_file(permutations_file)
    contig_index = sp.build_contig_index(contigs)
    num_references = len(permutations)

    graph = bg.build_graph(permutations, blocks_coords)
    conected_comps = bg.get_connected_components(graph)
    connections = ic.simple_connections(graph, conected_comps, contigs, contig_index, num_references)
    scaffolds = get_scaffolds(contigs, contig_index, connections)

    output_scaffolds(contigs_file, scaffolds, out_scaffolds)
    if out_graph:
        bg.output_graph(graph, open(out_graph, "w"))


def main():
    parser = argparse.ArgumentParser(description="Tool for reference-assisted assembly")
    parser.add_argument("-s", action="store", metavar="sibelia_dir", dest="sibelia_dir",
                        required=True, help="Directory with Sibelia output")
    parser.add_argument("-c", action="store", metavar="contigs_file", dest="contigs_file",
                        required=True, help="File with contigs in fasta format")
    parser.add_argument("-o", action="store", metavar="output_file", dest="output_file",
                        default="scaffolds.fasta", help="Output scaffolds file (default: scaffolds.fasta)")
    parser.add_argument("-g", action="store", metavar="graph_file", dest="graph_file",
                        default=None, help="Output file for breakpoint graph (default: Not set)")
    args = parser.parse_args()
    do_job(args.sibelia_dir, args.contigs_file, args.output_file, args.graph_file)

if __name__ == "__main__":
    main()
back to top