Content - df15cf9cd72c3979c0a7174d27856cd0622726b1 - fdf8cca/hybpiper_stats.py

visit type:
https://github.com/mossmatters/HybPiper

07 February 2024, 09:41:08 UTC
Tip revision: 5500bc99112f2f0e97dcb4ade3f7034e14166edf authored by rb-laura on 30 January 2017, 12:03:46 UTC
Delete test_HybPiper_Performance.py
Tip revision: 5500bc9
hybpiper_stats.py
#!/usr/bin/env python2

#########################
# HybPiper Stats Script #
#########################

helptext = '''Gather statistics about HybPiper run.

Supply the output of get_seq_lengths.py and a list of HybPiper directories

For an explanation of columns, see github.com/mossmatters/HybPiper/wiki

'''

import argparse, os, sys, subprocess

def file_len(fname):
    p = subprocess.Popen(['wc', '-l', fname], stdout=subprocess.PIPE, 
                                              stderr=subprocess.PIPE)
    result, err = p.communicate()
    if p.returncode != 0:
        raise IOError(err)
    return int(result.strip().split()[0])


def enrich_efficiency_blastx(blastxfilename):
	'''Parse BLASTX results to calculate enrichment effiiency'''
	reads_with_hits = [x.split()[0] for x in open(blastxfilename)]
	if os.path.isfile(blastxfilename.replace(".blastx","_unpaired.blastx")):
		reads_with_hits += [x.split()[0] for x in open(blastxfilename.replace(".blastx","_unpaired.blastx"))]
	numReads = len(set(reads_with_hits))
	
	return("NA",str(numReads),"NA")


def enrich_efficiency_bwa(bamfilename):
	'''Run and parse samtools flagstat output, return number of reads and number on target'''
	samtools_cmd = "samtools flagstat {}".format(bamfilename)
	child = subprocess.Popen(samtools_cmd,shell=True,stdout=subprocess.PIPE)
	flagstat_results = [line for line in child.stdout.readlines()]
	numReads = float(flagstat_results[0].split()[0])
	mappedReads = float(flagstat_results[4].split()[0])
	
	if os.path.isfile(bamfilename.replace(".bam","_unpaired.bam")):
		unpaired_samtools_cmd = "samtools flagstat {}".format(bamfilename.replace(".bam","_unpaired.bam"))
		unpaired_child = subprocess.Popen(unpaired_samtools_cmd,shell=True,stdout=subprocess.PIPE)
		flagstat_results = [line for line in unpaired_child.stdout.readlines()]
		numReads += float(flagstat_results[0].split()[0])
		mappedReads += float(flagstat_results[4].split()[0])
	
	return str(int(numReads)),str(int(mappedReads)),"{0:.3f}".format(mappedReads/numReads)
	
def recovery_efficiency(name):
	'''Report the number of genes with mapping hits, contigs, and exon sequences'''
	a= file_len("{}/spades_genelist.txt".format(name))
	b= file_len("{}/exonerate_genelist.txt".format(name))
	c= file_len("{}/genes_with_seqs.txt".format(name))
	
	return str(a),str(b),str(c)
	
def seq_length_calc(seq_lengths_fn):
	'''From the output of get_seq_lengths.py, calculate the number of genes with seqs, and at least a pct of the reference length'''
	seq_length_dict = {}
	with open(seq_lengths_fn) as seq_len:
		gene_names = seq_len.readline()
		target_lengths = seq_len.readline().split()[1:]
		for line in seq_len:
			line = line.split()
			name = line.pop(0)
			is_25pct = 0
			is_50pct = 0
			is_75pct = 0
			is_150pct = 0
			for gene in xrange(len(line)):
				gene_length = float(line[gene])
				target_length = float(target_lengths[gene])
				if gene_length > target_length * 0.25:
					is_25pct += 1
				if gene_length > target_length * 0.50:
					is_50pct += 1
				if gene_length > target_length * 0.75:
					is_75pct += 1
				if gene_length > target_length * 1.5:
					is_150pct += 1	
			seq_length_dict[name] = [str(is_25pct),str(is_50pct),str(is_75pct),str(is_150pct)]
	return seq_length_dict
	
def main():
	parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter)
	parser.add_argument("seq_lengths",help="output of get_seq_lengths.py")
	parser.add_argument("namelist",help="text file with names of HybPiper output directories, one per line")
	args = parser.parse_args()
	
	categories = ["Name",
		"NumReads",
		"ReadsMapped",
		"PctOnTarget",
		"GenesMapped",
		"GenesWithContigs",
		"GenesWithSeqs",
		"GenesAt25pct",
		"GenesAt50pct",
		"GenesAt75pct",
		"Genesat150pct",
		"ParalogWarnings"
			]
	sys.stdout.write("{}\n".format("\t".join(categories)))
	
	seq_length_dict = seq_length_calc(args.seq_lengths) 
	stats_dict = {}
	
	
	for line in open(args.namelist):
		name = line.rstrip()
		stats_dict[name] = []
		#Enrichment Efficiency
		bamfile = "{}/{}.bam".format(name,name)
		blastxfile ="{}/{}.blastx".format(name,name)
		if os.path.isfile(bamfile):
			stats_dict[name] += enrich_efficiency_bwa(bamfile)
		elif os.path.isfile(blastxfile):
			stats_dict[name] += enrich_efficiency_blastx(blastxfile)
		else:
			sys.stderr.write("No .bam or .blastx file found for {}\n".format(name))
			
		#Recovery Efficiency
		stats_dict[name] += recovery_efficiency(name)
		stats_dict[name] += seq_length_dict[name]
		
		#Paralogs
		paralog_warns = file_len("{}/genes_with_paralog_warnings.txt".format(name))
		stats_dict[name].append(str(paralog_warns))
	#SeqLengths
	
	
	for name in stats_dict:	
		sys.stdout.write("{}\t{}\n".format(name,"\t".join(stats_dict[name])))


if __name__ == "__main__":main()