https://github.com/machinegun/hi-c-scaffold
Tip revision: 1b76bf63efb973583647a1eb95863d33ee6e09ad authored by Sergey Koren on 19 May 2021, 17:21:41 UTC
Merge pull request #130 from EdHarry/patch-1
Merge pull request #130 from EdHarry/patch-1
Tip revision: 1b76bf6
refactor_breaks.py
import os,sys
import argparse
import pickle
parser = argparse.ArgumentParser()
parser.add_argument('-i','--iteration',help='Number of current iteration in the pipeline')
parser.add_argument('-d','--directory',help='Project Directory')
args = parser.parse_args()
iteration = int(args.iteration)
breakpoints = {}
scaffolds_current = pickle.load(open(args.directory+'/scaffolds_iteration_'+str(iteration-1)+'.p','r'))
with open(args.directory+'/misasm_iteration_'+args.iteration+'.report','r') as f:
for line in f:
attrs = line.strip().split()
# print attrs
if line[0] != 's':
continue
try:
if len(attrs) < 2:
continue
breakpoints[attrs[0]] = []
for i in xrange(1,len(attrs)):
breakpoints[attrs[0]].append(int(attrs[i]))
except:
continue
#print breakpoints
scaff_id = 1
scaffolds_new = {}
unitig_length = {}
old2new = {}
def update_bed(expanded_scaffold):
contig2scaffold = {}
contig2info = {}
scaffold_length = {}
re_counts = {}
with open(args.directory+'/re_counts_iteration_1','r') as f:
for line in f:
attrs = line.split()
re_counts[attrs[0]] = (int(attrs[1]),int(attrs[2]))
#print re_counts
for key in expanded_scaffold:
path = expanded_scaffold[key]
scaffold_length[key] = 0
offset = 0
for i in xrange(0,len(path)-1,2):
contig = path[i].split(':')[0]
contig2scaffold[contig] = key
ori = path[i].split(':')[1] + path[i+1].split(':')[1]
if ori == 'BE':
contig2info[contig] = (offset,offset+unitig_length[contig],'FOW')
else:
contig2info[contig] = (offset,offset+unitig_length[contig],'REV')
offset += unitig_length[contig]
scaffold_length[key] += unitig_length[contig]
scaffold_re = {}
for key in expanded_scaffold:
path = expanded_scaffold[key]
length = scaffold_length[key]
offset = 0
s_left = 0
s_right = 0
if len(path) == 2:
contig = path[0].split(':')[0]
ori = path[0].split(':')[1]
left,right = re_counts[contig]
if ori == 'B':
scaffold_re[key] = (left,right)
else:
scaffold_re[key] = (right,left)
else:
midpoint = length/2
for i in xrange(0,len(path),2):
contig = path[i].split(':')[0]
left,right = re_counts[contig]
curr_contig_start = contig2info[contig][0]
curr_contig_end = contig2info[contig][1]
# curr_contig_ori = contig2info[contig][2]
if curr_contig_end < midpoint:
s_left += (left+right)
if curr_contig_start > midpoint:
s_right += (left+right)
if curr_contig_start <= midpoint and curr_contig_end >= midpoint:
left_part = midpoint - curr_contig_start
right_part = curr_contig_end - midpoint
s_left += float(left+right)/unitig_length[contig]*left_part
s_right += float(left+right)/unitig_length[contig]*right_part
## do NOT consider forward or reverse orientation here
## left is always left, right is always right
'''
if curr_contig_ori == "FOW":
s_left += (right+left)*left_part/unitig_length[contig]
s_right += (right+left)*right_part/unitig_length[contig]
else:
s_left += (right+left)*right_part/unitig_length[contig]
s_right += (right+left)*left_part/unitig_length[contig]
'''
'''
if offset <= length/2 and i+2 < len(path):
if contig2info[path[i+2].split(':')[0]][0] <= length/2:
s_left += (left + right)
else:
contig_next = path[i+2].split(':')[0]
if contig2info[contig_next][0] >= length/2:
left_part = length/2 - offset
right_part = contig2info[path[i+2].split(':')[0]][0] - length/2
s_left += left*left_part/unitig_length[contig]
s_right += right*right_part/unitig_length[contig]
if offset <= length/2 and i + 2 >= len(path):
left_part = length/2 - offset
right_part = length/2
s_left += left*left_part/unitig_length[contig]
s_right += right*right_part/unitig_length[contig]
if offset >= length/2:
s_right += (left+right)
offset += unitig_length[contig]
#scaffold_length[key] += contig_length[contig]
'''
scaffold_re[key] = (int(s_left),int(s_right))
o_lines = ""
count = 0
prev_line = ""
#print contig2scaffold
#sys.exit(1)
if True:
output = open(args.directory+'/alignment_iteration_'+str(iteration)+'.bed','w')
with open(args.directory+'/alignment_iteration_1.bed','r') as f:
for line in f:
if prev_line == "":
prev_line = line
continue
else:
prev_attrs = prev_line.split()
curr_attrs = line.split()
prev_contig = prev_attrs[0]
curr_contig = curr_attrs[0]
if prev_contig == curr_contig:
continue
prev_read = prev_attrs[3].split('/')[0]
curr_read = curr_attrs[3].split('/')[0]
first = prev_attrs[3].split('/')[1]
second = curr_attrs[3].split('/')[1]
if prev_contig in contig2scaffold and curr_contig in contig2scaffold:
prev_scaffold = contig2scaffold[prev_contig]
curr_scaffold = contig2scaffold[curr_contig]
if prev_read == curr_read and prev_scaffold != curr_scaffold and first == '1' and second == '2':
# if prev_read == curr_read and first == '1' and second == '2':
prev_info = contig2info[prev_contig]
prev_start = int(prev_attrs[1])
prev_end = int(prev_attrs[2])
if prev_info[2] == 'FOW':
new_prev_start = prev_info[0] + prev_start
new_prev_end = prev_info[0] + prev_end
else:
new_prev_start = prev_info[0] + unitig_length[prev_contig] - prev_end
new_prev_end = prev_info[0] + unitig_length[prev_contig] - prev_start
o_lines += prev_scaffold+'\t'+str(new_prev_start)+'\t'+str(new_prev_end)+'\t'+prev_attrs[3]+'\n'
count += 1
curr_info = contig2info[curr_contig]
curr_start = int(curr_attrs[1])
curr_end = int(curr_attrs[2])
if curr_info[2] == 'FOW':
new_curr_start = curr_info[0] + curr_start
new_curr_end = curr_info[0] + curr_end
else:
new_curr_start = curr_info[0] + unitig_length[curr_contig] - curr_end
new_curr_end = curr_info[0] + unitig_length[curr_contig] - curr_start
o_lines += curr_scaffold+'\t'+str(new_curr_start)+'\t'+str(new_curr_end)+'\t'+curr_attrs[3]+'\n'
count += 1
if count == 1000000:
output.write(o_lines)
count = 0
o_lines = ""
prev_line = line
#write remaining lines
output.write(o_lines)
output.close()
len_output = open(args.directory+'/scaffold_length_iteration_'+str(iteration),'w')
for key in scaffold_length:
len_output.write(key+'\t'+str(scaffold_length[key])+'\n')
len_output.close()
re_out = open(args.directory+'/re_counts_iteration_'+str(iteration),'w')
for key in scaffold_re:
re_out.write(key+'\t'+str(scaffold_re[key][0])+'\t'+str(scaffold_re[key][1])+'\n')
re_out.close()
avoid_file = open(args.directory+'/links_avoid_iteration_'+args.iteration,'w',1)
with open(args.directory+'/scaffold_length_iteration_1','r') as f:
for line in f:
attrs = line.split()
unitig_length[attrs[0]] = int(attrs[1])
for key in scaffolds_current:
if key not in breakpoints:
scaffolds_new['scaffold_'+str(scaff_id)] = scaffolds_current[key]
#old2new[key] = ['scaffold_'+str(scaff_id)]
scaff_id += 1
else:
#print key
cum_len = 0
breaks_list = []
for i in xrange(1,len(scaffolds_current[key]),2):
utg = scaffolds_current[key][i].split(':')[0]
cum_len += unitig_length[utg]
positions = breakpoints[key]
#print key, cum_len, positions
for position in positions:
if cum_len > position - 10000 and cum_len < position + 10000:
#print "found"
breaks_list.append(i)
break
print breaks_list
if len(breaks_list) == 0:
scaffolds_new['scaffold_'+str(scaff_id)] = scaffolds_current[key]
scaff_id += 1
continue
if len(breaks_list) == 1:
#print 'here'
first_part = scaffolds_current[key][:breaks_list[0]+1]
second_part = scaffolds_current[key][breaks_list[0]+1:]
print "first part : " + str(first_part)
print "second part : " + str(second_part)
first_id = 'scaffold_'+str(scaff_id)
scaff_id += 1
second_id = 'scaffold_'+str(scaff_id)
#old2new[key] = [first_id,second_id,break_list[0]]
scaff_id += 1
avoid_file.write(first_id+'\t'+second_id+'\n')
scaffolds_new[first_id] = first_part
scaffolds_new[second_id] = second_part
else:
prev = 0
prev_scaffold = ''
for i in xrange(len(breaks_list)):
if i == len(breaks_list) - 1:
print "start : " + str(prev)+"\tend : " + str(breaks_list[i]+1)
scaff = scaffolds_current[key][prev:breaks_list[i]+1]
print "part : " + str(scaff)
scaffolds_new['scaffold_'+str(scaff_id)] = scaff
scaff_id += 1
print "start : "+str(breaks_list[i]+1)
scaff = scaffolds_current[key][breaks_list[i]+1:]
print "part : " + str(scaff)
scaffolds_new['scaffold_'+str(scaff_id)] = scaff
avoid_file.write(prev_scaffold+'\t'+'scaffold_'+str(scaff_id)+'\n')
scaff_id += 1
continue
print 'start = '+str(prev)+'\tend = '+ str(breaks_list[i]+1)
scaff = scaffolds_current[key][prev:breaks_list[i]+1]
print "part : " + str(scaff)
prev = breaks_list[i]+1
scaffolds_new['scaffold_'+str(scaff_id)] = scaff
if prev_scaffold == '':
prev_scaffold = 'scaffold_'+str(scaff_id)
else:
avoid_file.write(prev_scaffold+'\t'+'scaffold_'+str(scaff_id)+'\n')
prev_scaffold = 'scaffold_'+str(scaff_id)
scaff_id += 1
print "================="
#print scaffolds_new.keys()
pickle.dump(scaffolds_new,open(args.directory+'/scaffolds_iteration_'+str(int(args.iteration) -1)+'.p','w'))
update_bed(scaffolds_new)
ofile = open(args.directory+'/misasm_'+args.iteration+'.DONE','w')
ofile.close()