https://github.com/fenderglass/Ragout
Revision 23789cdad4e3f2c5c3f00857aef23e2036e87afa authored by fenderglass on 31 October 2013, 16:00:38 UTC, committed by fenderglass on 31 October 2013, 16:00:38 UTC
1 parent 25d3aee
Tip revision: 23789cdad4e3f2c5c3f00857aef23e2036e87afa authored by fenderglass on 31 October 2013, 16:00:38 UTC
slight refactoring
slight refactoring
Tip revision: 23789cd
merge_iters.py
#!/usr/bin/env python
from collections import namedtuple
import sys
import scripts.debrujin_refine as debrujin
from scripts.datatypes import *
def parse_contigs_order(filename):
scaffolds = []
for line in open(filename, "r"):
if line.startswith(">"):
scaffolds.append(Scaffold(line.strip()[1:]))
else:
name = line.strip("\n").replace("=", "_") #fix for quast
without_sign = name[1:].strip()
sign = 1 if name[0] == "+" else -1
scaffolds[-1].contigs.append(Contig(without_sign, sign))
return scaffolds
def merge(big_file, small_file):
big_scaffolds = parse_contigs_order(big_file)
small_scaffolds = parse_contigs_order(small_file)
big_index = set()
for scf in big_scaffolds:
for c in scf.contigs:
big_index.add(c.name)
count = 0
new_scafflods = []
for scf in big_scaffolds:
result = []
for prev_cont, new_cont in zip(scf.contigs[:-1], scf.contigs[1:]):
found_pair = False
for small_scf in small_scaffolds:
names = map(lambda c: c.name, small_scf.contigs)
try:
begin = names.index(prev_cont.name)
end = names.index(new_cont.name)
found_pair = True
break
except ValueError:
continue
result.append(prev_cont)
if not found_pair:
continue
assert end != begin
same_dir = True
if end < begin:
same_dir = False
end, begin = begin, end
consistent = True
for c in small_scf.contigs[begin + 1 : end]:
if c.name in big_index:
consistent = False
break
if not consistent or end - begin == 1:
continue
if ((prev_cont.sign == new_cont.sign) !=
(small_scf.contigs[begin].sign == small_scf.contigs[end].sign)):
continue
count += end - begin - 1
contigs = small_scf.contigs[begin + 1 : end]
if not same_dir:
contigs = contigs[::-1]
contigs = map(lambda c: Contig(c.name, -c.sign, 0), contigs)
result.extend(contigs)
result.append(new_cont)
s = Scaffold(scf.name)
s.contigs = result
new_scafflods.append(s)
return new_scafflods
def main():
if len(sys.argv) < 4:
print "Usage: merge_iters.py big small overlap"
scaffolds = merge(sys.argv[1], sys.argv[2])
refined_scaffolds = debrujin.refine_contigs(sys.argv[3], scaffolds)
scaf_out = open("merged.txt", "w")
ref_out = open("merged_refined.txt", "w")
for scf in scaffolds:
scaf_out.write(">" + scf.name + "\n")
for c in scf.contigs:
sign = "+" if c.sign > 0 else "-"
scaf_out.write(sign + c.name + "\n")
for scf in refined_scaffolds:
ref_out.write(">" + scf.name + "\n")
for c in scf.contigs:
sign = "+" if c.sign > 0 else "-"
ref_out.write(sign + c.name + "\n")
if __name__ == "__main__":
main()
Computing file changes ...