https://github.com/mossmatters/HybPiper
Tip revision: 5500bc99112f2f0e97dcb4ade3f7034e14166edf authored by rb-laura on 30 January 2017, 12:03:46 UTC
Delete test_HybPiper_Performance.py
Delete test_HybPiper_Performance.py
Tip revision: 5500bc9
retrieve_sequences.py
#!/usr/bin/env python
import sys,os
from Bio import SeqIO
helptext = '''Usage:
python retrieve_sequences.py baitfile.fasta sequence_dir aa/dna/intron/supercontig
This script will get the sequences generated from multiple runs of the HybSeqPipeline (reads_first.py).
Have all of the runs in the same directory (sequence_dir).
It retreives all the gene names from the bait file used in the run of the pipeline.
You must specify whether you want the protein (aa) or nucleotide (dna) sequences.
You can also specify 'intron' to retreive the intron sequences,
or 'supercontig' to get intron and exon sequences.
Will output unaligned fasta files, one per gene, to current directory.
'''
if len(sys.argv) < 4:
print helptext
sys.exit(1)
if sys.argv[3] == 'dna':
seq_dir = "FNA"
elif sys.argv[3] == 'aa':
seq_dir = "FAA"
elif sys.argv[3] == 'intron':
seq_dir = 'intron'
filename = 'introns'
elif sys.argv[3] == 'supercontig':
seq_dir = 'intron'
filename = 'supercontig'
else:
print helptext
sys.exit(1)
#Use gene names parsed from a bait file.
baitfile = sys.argv[1]
target_genes_dict = SeqIO.to_dict(SeqIO.parse(baitfile,'fasta'))
target_genes = list(set([x.id.split('-')[-1] for x in SeqIO.parse(baitfile,'fasta')]))
sampledir = sys.argv[2]
sample_names = [x for x in os.listdir(sampledir) if os.path.isdir(os.path.join(sampledir,x)) and not x.startswith('.')]
print "Retreiving {} genes from {} samples".format(len(target_genes),len(sample_names))
for gene in target_genes:
gene_seqs = []
for rec in gene_seqs:
rec.id = rec.id.split("-")[0]
rec.description = ''
for sample in sample_names:
if seq_dir == 'intron':
sample_path = os.path.join(sampledir,sample,gene,sample,'sequences',seq_dir,"{}_{}.fasta".format(gene,filename))
else:
sample_path = os.path.join(sampledir,sample,gene,sample,'sequences',seq_dir,gene+'.'+seq_dir)
if os.path.isfile(sample_path):
gene_seqs.append(SeqIO.read(sample_path,'fasta'))
print "Found {} sequences for {}.".format(len(gene_seqs),gene)
if seq_dir == 'intron':
outfilename = "{}_{}.fasta".format(gene,filename)
else:
outfilename = gene + '.' + seq_dir
SeqIO.write(gene_seqs,open(outfilename,'w'),'fasta')