Content - 1e8b8bf132afbc5ab2bdfc6c2837f3ef62d487d3 - a722e8a/make_histogram.py

visit type:
Tip revision: 16a7adca009034fca2a6f68a8b83f8003a68962d authored by stevendflygare on 16 March 2015, 17:06:20 UTC
user state change
Tip revision: 16a7adc
make_histogram.py
#!/opt/local/bin/python2.7

import sys
import re
import copy
from PIL import Image
import scipy.ndimage as si
import scipy.misc
import bisect
import numpy as np
import numpy.random as nr
import random
import scipy.stats as ss
import argparse
import matplotlib
#matplotlib.use('Agg')
import matplotlib.pyplot as plt
import itertools

#command line parsing 
parser = argparse.ArgumentParser()
parser.add_argument("fiber_file", help="fiber size input file")
parser.add_argument("image_metrics", help="image metrics input file")
parser.add_argument("-c","--conversion",help="pixel to micro-meter conversion (multiplicative factor)",type=float,default=1.0)
parser.add_argument("-rf","--relative",help="use relative frequency instead of raw counts in histogram",action="store_true")
parser.add_argument("-ntr","--no_replicates",help="flag if there are no technical replicates (only one image per mouse) to get error bars",action="store_true")
parser.add_argument("-ks",help="compute test statistic across mutant / wild type permutations",action="store_true")
parser.add_argument("-bs","--bin_sizes",help="comma separated list of pixel break values for bins",type=str)
parser.add_argument("-hb",help="number of histogram bins",type=int,default=20)
parser.add_argument("-xl",help="x-axis label",type=str,default="")
parser.add_argument("-yl",help="y-axis label",type=str,default="")
parser.add_argument("-ylim",help="y limit as start,end",type=str,default=None)
parser.add_argument("-rp","--random_permute",help="flag to create background distribution",action="store_true")
args = parser.parse_args()

def load_fiber_sizes(fiber_file,mice_hash): 
	f = open(fiber_file,'r')
	for line in f:
		data = re.split(",",line.strip())
		m = re.search("([0-9a-zA-Z]+)_([0-9a-zA-Z]+)_(\d+)\.",data[0])
		if m:
			if m.group(1) in mice_hash:
				if len(data) == 0:
					mice_hash[m.group(1)][0].append([10])
				else:
					mice_hash[m.group(1)][0].append(map(int,data[1:]))
			else:
				print "problem, mouse not found going to exit: " + m.group(1)
				sys.exit(0)
		else:
			print "problem, image name is not recognized, please check naming scheme of images: " + data[0]
			print "exiting..."
			sys.exit(0)
	f.close()

def compute_ks_test(mice_hash):
	image_names = []
	for k in mice_hash:
		image_names.append([])
		for im in mice_hash[k][1]:
			image_names[-1].append((k,im))
	total_tests = 0
	significant_tests = 0
	for p in itertools.product(*image_names):	
		mutant = []
		wild_type = []
		#print p
		for i in p:
			if mice_hash[i[0]][2] == 0:
				wild_type.extend(mice_hash[i[0]][0][mice_hash[i[0]][1].index(i[1])])
			else:
				mutant.extend(mice_hash[i[0]][0][mice_hash[i[0]][1].index(i[1])])
		ks_stat, p_value =  ss.ks_2samp(wild_type, mutant)
		if p_value < .05:
			significant_tests+=1
		total_tests+=1		
	
	print "results of ks test:"
	print "\t total permutations: " + str(total_tests)
	print "\t number of significant tests (at alpha = .05): " + str(significant_tests)
	print "\t proportion significant: " + str(float(significant_tests)/total_tests)		

def random_permutation(mice):
	num_mutant = 0
	mutant_wildtype = [mice[x][2] for x in mice]  #{mouse}->[[fibers],[images],mutant/wildtype]
	num_combinations = scipy.misc.comb(len(mice),sum(mutant_wildtype))
	num_permutations = min(num_combinations,500)
	wt_hists = []
	mt_hists = []
	wt_mt_diffs = []
	bins = []
	for i in range(num_permutations):
		m_copy = copy.deepcopy(mice)
		nr.shuffle(mutant_wildtype)
		for j,m in enumerate(m_copy):
			m_copy[m][2] = mutant_wildtype[j]
		#print i
		wt, mt, bins = permutation_calc(m_copy)
		diff = []
		for i,wv in enumerate(wt):
			diff.append(wv - mt[i])
		wt_hists.append(wt)
		mt_hists.append(mt)
		wt_mt_diffs.append(diff)
		
	wt,mt,bins = permutation_calc(mice)
	percentiles = []
	for i in range(len(wt)):
		diffs = [x[i] for x in wt_mt_diffs]
		diffs.sort()
		percentiles.append(float(bisect.bisect_right(diffs,wt[i]-mt[i]))/len(diffs))
	
	show_histogram([wt], [mt], bins, diff_percentiles=percentiles)
	
def permutation_calc(mice_hash):
	#bootstrap mutant / wild type pooled fibers to get error bars
	mt_hist = []
	wt_hist = []
	mutant = []
	wild_type = []
	bins = []
	if args.bin_sizes:
		bins = [int(x) for x in args.bin_sizes.split(",")]
		hb = len(bins)-1
	else:
		print "must specify histogram bins if going to randomly permute mutant / wild type status (use -bs <bin list>)"
		print "exiting..."
		sys.exit(0)
	
	#randomly select technical replicates, if any, to use
	for m in mice_hash:
		if mice_hash[m][2] == 0:
			wild_type.extend(random.sample(mice_hash[m][0],1)[0])
		else:
			mutant.extend(random.sample(mice_hash[m][0],1)[0])
	sample_size = int(min(len(wild_type),len(mutant)))
	wt_sample = random.sample(wild_type,sample_size)
	mt_sample = random.sample(mutant,sample_size)
	if args.relative:
		wt_hist, tmpbins = np.histogram(wt_sample,bins)
		wt_relfreq = wt_hist/float(sum(wt_hist))
		mt_hist, tmpbins = np.histogram(mt_sample,bins)
		mt_relfreq = mt_hist/float(sum(mt_hist))
	else:
		wt_hist, tmpbins = np.histogram(wt_sample,bins)
		mt_hist, tmpbins = np.histogram(mt_sample,bins)
	
	if args.relative:
		return wt_relfreq, mt_relfreq, bins
		
	return wt_hist, mt_hist, bins
	

def show_histogram(wt_histograms, mt_histograms, bins, diff_percentiles=None):	
	wt_hist = np.mean(wt_histograms,axis=0)
	wt_std = np.std(wt_histograms,axis=0)
	mt_hist = np.mean(mt_histograms,axis=0)
	mt_std = np.std(mt_histograms,axis=0)
			
	#print "making histogram"		
	#use relative frequencies instead of the density
	ind = np.arange(len(wt_hist))  # the x locations for the groups
	width = 0.35       # the width of the bars
	rects1 = plt.bar(ind, wt_hist, width, color='b', yerr=wt_std, error_kw=dict(elinewidth=4, ecolor='black'))
	rects2 = plt.bar(ind+width, mt_hist, width, color='r', yerr=mt_std, error_kw=dict(elinewidth=4, ecolor='black'))
	a = list(wt_hist)
	a.extend(mt_hist)
	if not args.relative:
		plt.ylim([0, int(max(a)*1.10)])
	else:
		plt.ylim([0, max(a)+.1])
	
	if args.ylim != None: #control over y-axis
		plt.ylim(args.ylim)
		
	tick_vals = []
	print "\nbins start and end pixel values: "
	for i in range(len(bins)):
		tick_vals.append(i+1)
		if i > 0:
			if diff_percentiles == None:
				print "\tbin %d: %f-%f"%(i,bins[i-1]*args.conversion,bins[i]*args.conversion)
			else:
				print "\tbin %d: %f-%f\t%f"%(i,bins[i-1]*args.conversion,bins[i]*args.conversion,diff_percentiles[i-1])
							
	#for i in range(len(bins)):
	#	if i < len(bins)-1:
	#		if not conversion:
				#tick_vals.append(int(np.mean(bins[i:i+2])/100))
	#			tick_vals.append(int(np.mean(bins[i:i+2])))
	#		else:
	#			tick_vals.append(round(np.mean(bins[i:i+2])*conversion,2))
	#plt.xticks(ind,tick_vals)
	plt.xticks(ind+width,tick_vals)
	plt.xlabel(args.xl)
	
	plt.ylabel(args.yl)
	plt.legend( (rects1[0], rects2[0]), ('Wild Type', 'Mutant') )
	plt.show()
	#plt.savefig(hist_name)
	plt.clf()	

def no_replicates_histogram(mice_hash,hist_name,hb,user_bins):
	#bootstrap mutant / wild type pooled fibers to get error bars
	mt_histograms = []
	wt_histograms = []
	mutant = []
	wild_type = []
	bins = []
	if user_bins:
		bins = map(int,re.split(",",user_bins))
		hb = len(bins)-1
		
	for m in mice_hash:
		if mice_hash[m][2] == 0:
			wild_type.extend(mice_hash[m][0][0])
		else:
			mutant.extend(mice_hash[m][0][0])
	#select .5 of the population about 1000 times and compute histograms to get the error bars
	sample_size = int(0.5*min(len(wild_type),len(mutant)))
	for i in range(1000):
		wt_sample = random.sample(wild_type,sample_size)
		mt_sample = random.sample(mutant,sample_size)
		if i == 0:
			wt_hist, tmpbins = np.histogram(wt_sample,hb)
			if not bins:
				bins = tmpbins[:]
			if args.relative:
				wt_relfreq = wt_hist/float(sum(wt_hist))
				wt_histograms.append(wt_relfreq)
				mt_hist, tmpbins = np.histogram(mt_sample,bins)
				mt_relfreq = mt_hist/float(sum(mt_hist))
				mt_histograms.append(mt_relfreq)
			else:
				wt_histograms.append(wt_hist)
				mt_hist, tmpbins = np.histogram(mt_sample,bins)
				mt_histograms.append(mt_hist)
		else:
			if args.relative:
				wt_hist, tmpbins = np.histogram(wt_sample,bins)
				wt_relfreq = wt_hist/float(sum(wt_hist))
				wt_histograms.append(wt_relfreq)
				mt_hist, tmpbins = np.histogram(mt_sample,bins)
				mt_relfreq = mt_hist/float(sum(mt_hist))
				mt_histograms.append(mt_relfreq)
			else:
				wt_hist, tmpbins = np.histogram(wt_sample,bins)
				wt_histograms.append(wt_hist)
				mt_hist, tmpbins = np.histogram(mt_sample,bins)
				mt_histograms.append(mt_hist)	
						
	show_histogram(wt_histograms, mt_histograms, bins)
		


def compile_rf_histogram(mice_hash,hist_name,hb,user_bins):
	#do permutations to get all possibilities and then let the final histogram be the average
	image_names = []
	mt_histograms = []
	wt_histograms = []
	p_index = 0
	bins = []
	if user_bins:
		bins = map(int,re.split(",",user_bins))
		hb = len(bins)-1
	
	for k in mice_hash:
		image_names.append([])
		for im in mice_hash[k][1]:
			image_names[-1].append((k,im))
	
	iterations = 0			
	for p in itertools.product(*image_names):	
		iterations+=1
		if iterations >= 500:
			break
		mutant = []
		wild_type = []
		#print p
		for i in p:
			if mice_hash[i[0]][2] == 0:
				wild_type.extend(mice_hash[i[0]][0][mice_hash[i[0]][1].index(i[1])])
			else:
				mutant.extend(mice_hash[i[0]][0][mice_hash[i[0]][1].index(i[1])])
		if p_index == 0:
			p_index+=1
			if not bins:
				wt_hist, tmpbins = np.histogram(wild_type,hb)
				bins = tmpbins[:]
			wt_hist, tmpbins = np.histogram(wild_type,bins)			
			wt_relfreq = wt_hist/float(sum(wt_hist))
			wt_histograms.append(wt_relfreq)
			mt_hist, tmpbins = np.histogram(mutant,bins)
			mt_relfreq = mt_hist/float(sum(mt_hist))
			mt_histograms.append(mt_relfreq)
		else:
			wt_hist, tmpbins = np.histogram(wild_type,bins)
			wt_relfreq = wt_hist/float(sum(wt_hist))
			wt_histograms.append(wt_relfreq)
			mt_hist, tmpbins = np.histogram(mutant,bins)
			mt_relfreq = mt_hist/float(sum(mt_hist))
			mt_histograms.append(mt_relfreq)
	 			
	show_histogram(wt_histograms, mt_histograms, bins)
	#print np.matrix(wt_histograms)

def compile_count_histogram(mice_hash,hist_name,hb,user_bins):
	#do permutations to get all possibilities and then let the final histogram be the average
	image_names = []
	mt_histograms = []
	wt_histograms = []
	p_index = 0
	bins = []
	if user_bins:
		bins = map(int,re.split(",",user_bins))
		hb = len(bins)-1
	for k in mice_hash:
		image_names.append([])
		for im in mice_hash[k][1]:
			image_names[-1].append((k,im))
	#need to equalize number of fibers between groups, so find smallest number among all the groups
	#and randomly select this number of fibers
	counts = []
	iterations = 0
	for p in itertools.product(*image_names):	
		iterations+=1
		if iterations >= 500:
			break
		mutant = []
		wild_type = []
		#print p
		for i in p:
			if mice_hash[i[0]][2] == 0:
				wild_type.extend(mice_hash[i[0]][0][mice_hash[i[0]][1].index(i[1])])
			else:
				mutant.extend(mice_hash[i[0]][0][mice_hash[i[0]][1].index(i[1])])
		counts.append(len(mutant))
		counts.append(len(wild_type))
	
	sample_size = min(counts)
	print "number of fibers to sample from each collection of mutant/wild type: " + str(sample_size)
	iterations = 0
	for p in itertools.product(*image_names):	
		iterations+=1
		if iterations >= 500:
			break
		mutant = []
		wild_type = []
		#print p
		for i in p:
			if mice_hash[i[0]][2] == 0:
				wild_type.extend(mice_hash[i[0]][0][mice_hash[i[0]][1].index(i[1])])
			else:
				mutant.extend(mice_hash[i[0]][0][mice_hash[i[0]][1].index(i[1])])	
		if sample_size > len(mutant) or sample_size > len(wild_type):
			if mutant > wild_type:
				mutant = random.sample(mutant,len(wild_type))
			else:
				wild_type = random.sample(wild_type,len(mutant))
		else:
			mutant = random.sample(mutant,sample_size)
			wild_type = random.sample(wild_type,sample_size)
		if p_index == 0:
			p_index+=1
			if not bins:
				wt_hist, tmpbins = np.histogram(wild_type,hb)
				bins = tmpbins[:]
			wt_hist, tmpbins = np.histogram(wild_type,bins)
			wt_histograms.append(wt_hist)
			mt_hist, tmpbins = np.histogram(mutant,bins)
			mt_histograms.append(mt_hist)
		else:
			wt_hist, tmpbins = np.histogram(wild_type,bins)
			wt_histograms.append(wt_hist)
			mt_hist, tmpbins = np.histogram(mutant,bins)
			mt_histograms.append(mt_hist)	

	show_histogram(wt_histograms, mt_histograms, bins)		


def create_decile_plot(mice_hash,plot_name):
	xvals = range(0,105,5)
	for m in mice_hash:
		if mice_hash[m][2] == 0:
			y = np.percentile(mice_hash[m][0],xvals)
			plt.plot(xvals,y,"bo-")
	for m in mice_hash:
		if mice_hash[m][2] == 1:
			y = np.percentile(mice_hash[m][0],xvals)
			plt.plot(xvals,y,"ro-")
	plt.axis([0,110,0,2000])				
	plt.savefig(plot_name)
	plt.clf()	

f = open(args.image_metrics,'r')
index = 0
mice = {} #key is mouse id, then the value of each key is [[fibers],[images],mutant/wildtype]
m_slow = {}
m_fast = {} 
for line in f:
	if index > 0:
		data = re.split(",",line.strip())
		m = re.search("([0-9a-zA-Z]+)_([0-9a-zA-Z]+)_(\d+)\.",data[0])
		if m:
			if m.group(1) in mice:
				mice[m.group(1)][1].append(m.group(2))
				mice[m.group(1)][2] = int(m.group(3))
				m_slow[m.group(1)][1].append(m.group(2))
				m_slow[m.group(1)][2] = int(m.group(3))
				m_fast[m.group(1)][1].append(m.group(2))
				m_fast[m.group(1)][2] = int(m.group(3))
			else:
				mice[m.group(1)] = [[],[],0]
				mice[m.group(1)][1].append(m.group(2))
				mice[m.group(1)][2] = int(m.group(3))
				m_slow[m.group(1)] = [[],[],0]
				m_slow[m.group(1)][1].append(m.group(2))
				m_slow[m.group(1)][2] = int(m.group(3))
				m_fast[m.group(1)] = [[],[],0]
				m_fast[m.group(1)][1].append(m.group(2))
				m_fast[m.group(1)][2] = int(m.group(3))
		else:
			print "problem, image name is not recognized, please check naming scheme of images: " + data[0]
			print "exiting..."
			sys.exit(0)
	index+=1	
f.close()

load_fiber_sizes(args.fiber_file,mice)
if args.ylim != None:
	m = re.match("([0-9]*\.*[0-9]+),([0-9]*\.*[0-9]+)",args.ylim)
	if m:
		args.ylim = (float(m.group(1)),float(m.group(2)))
	else:
		print "problem, y limits need to be specified as <start>,<stop>"
		print "exiting..."
		sys.exit(0)

if args.ks:
	compute_ks_test(mice)

if args.random_permute:
	if not args.bin_sizes:
		print "must specify histogram bins if going to randomly permute mutant / wild type status (use -bs <bin list>)"
		print "exiting..."
		sys.exit(0)
	random_permutation(mice)
elif args.no_replicates:
	no_replicates_histogram(mice,"fiber_histogram.svg",args.hb,args.bin_sizes)
elif args.relative:
	compile_rf_histogram(mice,"fiber_histogram.svg",args.hb,args.bin_sizes)
else:
	compile_count_histogram(mice,"fiber_histogram.svg",args.hb,args.bin_sizes)
	
#chi2, p, dof, ex = ss.chi2_contingency([wt_hist,mt_hist])
#print "chi2 test p-value: " + str(p)

#plotarray = []
#plotarray.append(mutant)
#plotarray.append(wild_type)
#plt.boxplot(plotarray)
#plt.xticks(np.arange(3),("","mutant","wild type"))
#plt.savefig("box_plots.png")

#plt.hist(plotarray,20,histtype='bar',color=['red','blue'],label=['Mutant','Wild Type'])
#plt.legend()
#plt.savefig("fiber_histogram.svg")
Browse the archive

https://github.com/stevendflygare/muscleQNT