https://github.com/fenderglass/Ragout
Revision abb3b63b2678ec255d86120221b11370a3d55e68 authored by fenderglass on 31 October 2013, 16:02:29 UTC, committed by fenderglass on 31 October 2013, 16:02:29 UTC
1 parent 23789cd
Raw File
Tip revision: abb3b63b2678ec255d86120221b11370a3d55e68 authored by fenderglass on 31 October 2013, 16:02:29 UTC
new branch
Tip revision: abb3b63
merge_iters.py
#!/usr/bin/env python

from collections import namedtuple
import sys
import scripts.debrujin_refine as debrujin
from scripts.datatypes import *


def parse_contigs_order(filename):
    scaffolds = []
    for line in open(filename, "r"):
        if line.startswith(">"):
            scaffolds.append(Scaffold(line.strip()[1:]))
        else:
            name = line.strip("\n").replace("=", "_") #fix for quast
            without_sign = name[1:].strip()
            sign = 1 if name[0] == "+" else -1
            scaffolds[-1].contigs.append(Contig(without_sign, sign))
    return scaffolds


def merge(big_file, small_file):
    big_scaffolds = parse_contigs_order(big_file)
    small_scaffolds = parse_contigs_order(small_file)

    big_index = set()
    for scf in big_scaffolds:
        for c in scf.contigs:
            big_index.add(c.name)

    count = 0
    new_scafflods = []
    for scf in big_scaffolds:
        result = []
        for prev_cont, new_cont in zip(scf.contigs[:-1], scf.contigs[1:]):
            found_pair = False
            for small_scf in small_scaffolds:
                names = map(lambda c: c.name, small_scf.contigs)
                try:
                    begin = names.index(prev_cont.name)
                    end = names.index(new_cont.name)
                    found_pair = True
                    break
                except ValueError:
                    continue

            result.append(prev_cont)
            if not found_pair:
                continue

            assert end != begin
            same_dir = True
            if end < begin:
                same_dir = False
                end, begin = begin, end

            consistent = True
            for c in small_scf.contigs[begin + 1 : end]:
                if c.name in big_index:
                    consistent = False
                    break

            if not consistent or end - begin == 1:
                continue

            if ((prev_cont.sign == new_cont.sign) !=
                (small_scf.contigs[begin].sign == small_scf.contigs[end].sign)):
                continue

            count += end - begin - 1
            contigs = small_scf.contigs[begin + 1 : end]
            if not same_dir:
                contigs = contigs[::-1]
                contigs = map(lambda c: Contig(c.name, -c.sign, 0), contigs)
            result.extend(contigs)

        result.append(new_cont)
        s = Scaffold(scf.name)
        s.contigs = result
        new_scafflods.append(s)

    return new_scafflods

def main():
    if len(sys.argv) < 4:
        print "Usage: merge_iters.py big small overlap"

    scaffolds = merge(sys.argv[1], sys.argv[2])
    refined_scaffolds = debrujin.refine_contigs(sys.argv[3], scaffolds)

    scaf_out = open("merged.txt", "w")
    ref_out = open("merged_refined.txt", "w")

    for scf in scaffolds:
        scaf_out.write(">" + scf.name + "\n")
        for c in scf.contigs:
            sign = "+" if c.sign > 0 else "-"
            scaf_out.write(sign + c.name + "\n")

    for scf in refined_scaffolds:
        ref_out.write(">" + scf.name + "\n")
        for c in scf.contigs:
            sign = "+" if c.sign > 0 else "-"
            ref_out.write(sign + c.name + "\n")


if __name__ == "__main__":
    main()
back to top