https://github.com/mossmatters/HybPiper
Raw File
Tip revision: 890f615eae4ce0f31e13f4e4b3df6123ffb46dce authored by The Gitter Badger on 24 May 2016, 22:29:32 UTC
Add Gitter badge
Tip revision: 890f615
paralog_investigator.py
#!/usr/bin/env python

helptext='''This script will extract exonerate results for all competing full-length paralogs
 and deposit them in a new paralog directory within the results directory.
 Paralogs can then be collected using paralog_retriever.py in order to align
 and build gene family trees.'''

import os,sys,argparse
from Bio import SeqIO


def extract_paralogs(gene,prefix):
	
	putative_paralog_ids = [x.split()[1].rstrip() for x in open(os.path.join(gene,prefix,"paralog_warning.txt"))]
	try:
		chosen_paralog = open(os.path.join(gene,prefix,"exonerate_stats.csv")).readline().rstrip()
	except IOError:
		return 0
	
	exonerate_dict = SeqIO.to_dict(SeqIO.parse(os.path.join(gene,prefix,"exonerate_results.fasta"),'fasta'))
	
	if not os.path.isdir(os.path.join(gene,prefix,'paralogs')):
		os.mkdir(os.path.join(gene,prefix,"paralogs"))
	seqs_to_write = [exonerate_dict[x] for x in putative_paralog_ids]
	
	for seq in range(len(seqs_to_write)):
		if seqs_to_write[seq].id == chosen_paralog:
			seqs_to_write[seq].id = "{}.{}".format(prefix,"main")
			
		else:
			seqs_to_write[seq].id = "{}.{}".format(prefix,seq)
	
	SeqIO.write(seqs_to_write,os.path.join(gene,prefix,'paralogs','{}_paralogs.fasta'.format(gene)),'fasta')
	
	return len(seqs_to_write)


def main():
	parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter)
	parser.add_argument('prefix',help='Name of directory containing HybPiper output.')
	parser.add_argument('--genelist',help="Text file containing the name of each gene extract. The default is to use all genes in 'prefix/genes_with_paralog_warnings.txt'.",default=None)
	
	args = parser.parse_args()
	
	if args.genelist:
		if os.path.isfile(genelist):
			genelist = [x.rstrip() for x in open(args.genelist)]
		else:
			sys.stderr.write("ERROR: cannot find genelist {}\n".format(genelist))
			sys.exit(1)
	else:
		if os.path.isfile("{}/genes_with_paralog_warnings.txt".format(args.prefix)):
			genelist = [x.rstrip() for x in open("{}/genes_with_paralog_warnings.txt".format(args.prefix))]
		else:
			sys.stderr.write("ERROR: cannot find genes_with_paralog_warnings.txt in {}!\n".format(args.prefix))
			sys.exit(1)
				
	os.chdir(args.prefix)
	
	for gene in genelist:
		num_paralogs = extract_paralogs(gene,args.prefix)
		sys.stderr.write("{} paralogs written for {}\n".format(num_paralogs,gene))
	
	
		


if __name__ == "__main__":main()
back to top