https://github.com/mossmatters/HybPiper
Tip revision: 890f615eae4ce0f31e13f4e4b3df6123ffb46dce authored by The Gitter Badger on 24 May 2016, 22:29:32 UTC
Add Gitter badge
Add Gitter badge
Tip revision: 890f615
paralog_investigator.py
#!/usr/bin/env python
helptext='''This script will extract exonerate results for all competing full-length paralogs
and deposit them in a new paralog directory within the results directory.
Paralogs can then be collected using paralog_retriever.py in order to align
and build gene family trees.'''
import os,sys,argparse
from Bio import SeqIO
def extract_paralogs(gene,prefix):
putative_paralog_ids = [x.split()[1].rstrip() for x in open(os.path.join(gene,prefix,"paralog_warning.txt"))]
try:
chosen_paralog = open(os.path.join(gene,prefix,"exonerate_stats.csv")).readline().rstrip()
except IOError:
return 0
exonerate_dict = SeqIO.to_dict(SeqIO.parse(os.path.join(gene,prefix,"exonerate_results.fasta"),'fasta'))
if not os.path.isdir(os.path.join(gene,prefix,'paralogs')):
os.mkdir(os.path.join(gene,prefix,"paralogs"))
seqs_to_write = [exonerate_dict[x] for x in putative_paralog_ids]
for seq in range(len(seqs_to_write)):
if seqs_to_write[seq].id == chosen_paralog:
seqs_to_write[seq].id = "{}.{}".format(prefix,"main")
else:
seqs_to_write[seq].id = "{}.{}".format(prefix,seq)
SeqIO.write(seqs_to_write,os.path.join(gene,prefix,'paralogs','{}_paralogs.fasta'.format(gene)),'fasta')
return len(seqs_to_write)
def main():
parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('prefix',help='Name of directory containing HybPiper output.')
parser.add_argument('--genelist',help="Text file containing the name of each gene extract. The default is to use all genes in 'prefix/genes_with_paralog_warnings.txt'.",default=None)
args = parser.parse_args()
if args.genelist:
if os.path.isfile(genelist):
genelist = [x.rstrip() for x in open(args.genelist)]
else:
sys.stderr.write("ERROR: cannot find genelist {}\n".format(genelist))
sys.exit(1)
else:
if os.path.isfile("{}/genes_with_paralog_warnings.txt".format(args.prefix)):
genelist = [x.rstrip() for x in open("{}/genes_with_paralog_warnings.txt".format(args.prefix))]
else:
sys.stderr.write("ERROR: cannot find genes_with_paralog_warnings.txt in {}!\n".format(args.prefix))
sys.exit(1)
os.chdir(args.prefix)
for gene in genelist:
num_paralogs = extract_paralogs(gene,args.prefix)
sys.stderr.write("{} paralogs written for {}\n".format(num_paralogs,gene))
if __name__ == "__main__":main()