https://github.com/stephenfloor/tripseq-analysis
Tip revision: 3e823abcca5b8c1e5e89dd9bd4c49e8673b3e957 authored by Stephen Floor on 24 June 2017, 00:52:49 UTC
email update
email update
Tip revision: 3e823ab
isoform_fraction.py
#!/usr/bin/env python
# isoform_fraction.py
# calculate the percent of genes with a "minor" isoform above a specified fraction or range of fractions
# SNF 2017
import sys, os, argparse, csv, random, re
from SNFUtils import * # prompt, safe_open_file, stdout_from_command
from collections import defaultdict, namedtuple
from scipy.stats import mannwhitneyu, ks_2samp
import numpy as np
import matplotlib.pyplot as plt
from itertools import izip_longest
parser = argparse.ArgumentParser(description="Calculate the percent of genes with a minor isoform above a specified fraction")
parser.add_argument("-i", "--input", help="File containing transcript distributions", required=True)
parser.add_argument("-o", "--output", help="Output filename (default is stdout)", default="stdout")
parser.add_argument("-n", "--nrep", help="Number of replicates of each point", required=True, type=int)
parser.add_argument("--tx-to-gene", help="File containing transcript ID to gene name mapping", required=True)
parser.add_argument("--text", help="Output text data in addition to plots.", action="store_true")
parser.add_argument("--format", help="Image format to export (png or pdf).", default="pdf")
parser.add_argument("--isoform-fraction", help="What fraction of minor isoform to include (default 0.1)", type=float, default=0.1)
args = parser.parse_args()
if (args.output == "stdout"):
outfile = sys.stdout
else:
outfile = safe_open_file(args.output)
tx_to_gene = defaultdict(str)
tx_to_name = defaultdict(str)
tx_to_txname = defaultdict(str)
print "Assuming five samples: cyto, nuc, monosome, low polysome, high polysome."
Isoform_samples = namedtuple('Isoform_samples', ['id', 'cyto', 'nuc', 'mono', 'low', 'high'])
# each of these is a dictionary with keys of gene names with values a list namedtuple of Samples for each isoform
hesc = defaultdict(list)
npc = defaultdict(list)
neu14 = defaultdict(list) #14 day neurons
neu50 = defaultdict(list) #50 day neurons
processed = 0
print "\nCreating tx_to_gene dictionary from %s..." % args.tx_to_gene
with open(args.tx_to_gene, "r") as txfile:
for line in txfile:
line = line.split()
if (len(line) < 4):
sys.exit("ERROR: line in tx_to_gene file has less than four entries: %s" % line)
tx_to_gene[line[0]] = line[1]
tx_to_name[line[0]] = line[2]
tx_to_txname[line[0]] = line[3]
processed += 1
print "Dictionary created (%d entries)." % processed
# sample input format:
#txid s1r1 s1r2 s2r1 s2r2 s3r1 s3r2 etc
np.seterr(all='raise')
processed = 0
rejected_pe = 0
rejected_zero = 0
fname = args.input
with open(fname, "r") as infile:
#inspect the first line
random.seed(1)
if fname.endswith("csv"):
line = [i.strip("\"") for i in infile.readline().split(",")]
else:
line = [i.strip("\"") for i in infile.readline().split()]
if (is_number(line[2])): # no header
found_header = False
infile.seek(0) #rewind
print "\tNo header found."
else:
found_header = True
# cufflinks output has the replicate number as the last two (i.e. _0 or _1). this trims.
header = [i[:-2] for i in line]
print "\tHeader found on line one."
print "\tOrder parsed: "
header = [ header[i*args.nrep+1] for i in range(len(header)/2) ]
print "\t\t" + ", ".join(header)
print "Header has %d columns" % len(header)
if ( (len(line) - 1) % args.nrep):
print "\tWARNING: %d observations found; not divisible by %d replicates." % (len(line) - 1, args.nrep)
npts = (len(line) - 1)/args.nrep
print "\tProcessing %d points (%d replicates)" % ( npts, args.nrep)
for line in infile:
if (not line.strip()):
continue
if fname.endswith("csv"):
line = [i.strip("\"") for i in line.split(",")]
else:
line = [i.strip("\"") for i in line.split()]
# convert all to floats before continuing...
line = [line[0]] + [float(i) for i in line[1:]]
txid = line[0]
obs = line[1:] # the observations
nobs = len(obs) / args.nrep
# 1 read on a 1kb transcript out of 10 million -> FPKM = 1e-10. rounding to this precision.
obs = np.around(obs, 10)
if (len(obs) % args.nrep):
print "\tWarning: transcript %s has %d observations which is not divisible by %d replicates" % (txid, len(obs), args.nrep)
# collapse replicates into means, preserve stdev
means = [np.mean(i) for i in chunkwise(obs, args.nrep)]
#stdevs = [np.std(i) for i in chunkwise(obs, args.nrep)]
if txid in tx_to_gene:
geneid = tx_to_gene[txid]
hesc[geneid].append(Isoform_samples(txid, *means[0:5]))
npc[geneid].append(Isoform_samples(txid, *means[5:10]))
neu14[geneid].append(Isoform_samples(txid, *means[10:15]))
neu50[geneid].append(Isoform_samples(txid, *means[15:20]))
else:
print "Warning: gene name for transcript %s not found" % txid
processed += 1
if not (processed % 5000):
print "Processed %d lines." % processed
# now, have read in all the transcripts into these gene dicts. compute the percent that have above whatever fraction
# calculate the percent of genes that have a minor transcript above the specified fraction for dicts constructed as described above
# returns this percent as a fraction
def calc_isoform_fraction(dict, isoform_fraction, sample):
genes_over_fraction = 0
genes_processed = 0
geneids_over_fraction = []
samples = []
for key, val in dict.iteritems():
if sample == "cyto":
samples = [i.cyto for i in val]
elif sample == "nuc":
samples = [i.nuc for i in val]
elif sample == "mono":
samples = [i.mono for i in val]
elif sample == "low":
samples = [i.low for i in val]
elif sample == "high":
samples = [i.high for i in val]
else:
print "Unrecognized sample %s" % sample
total = sum(samples)
if total > 1 : # only continue if expressed (TPM > 1 for the gen)
# transform to fractional isoform abundance
samples = [i / total for i in samples]
n_over_fraction = sum([i > isoform_fraction for i in samples])
if n_over_fraction > 1: # implies at least one minor isoform above the fraction cutoff
genes_over_fraction += 1
geneids_over_fraction.append(key)
genes_processed += 1
#print "Found %d genes over fraction %.2f out of %d genes (%3.2f%%) in %s" % (genes_over_fraction, isoform_fraction, genes_processed, genes_over_fraction/float(genes_processed)*100, sample)
#print geneids_over_fraction[0:10]
return genes_over_fraction/float(genes_processed)
#for sample in ["cyto", "nuc", "mono", "low", "high"]:
# calc_isoform_fraction(hesc, args.isoform_fraction, sample)
# calc_isoform_fraction(npc, args.isoform_fraction, sample)
# calc_isoform_fraction(neu14, args.isoform_fraction, sample)
# calc_isoform_fraction(neu50, args.isoform_fraction, sample)
def plot_it(cell_dict, name, npoints):
xvals = np.linspace(0, 0.5, npoints)
cyto_y = [calc_isoform_fraction(cell_dict, xvals[i], "cyto") for i in range(len(xvals))]
nuc_y = [calc_isoform_fraction(cell_dict, xvals[i], "nuc") for i in range(len(xvals))]
mono_y = [calc_isoform_fraction(cell_dict, xvals[i], "mono") for i in range(len(xvals))]
low_y = [calc_isoform_fraction(cell_dict, xvals[i], "low") for i in range(len(xvals))]
high_y = [calc_isoform_fraction(cell_dict, xvals[i], "high") for i in range(len(xvals))]
cdffile = safe_open_file("%s_ecdf.csv" % name)
cdffile.write("xvals,%s_cyto,%s_nuc,%s_mono,%s_low,%s_high\n" % (name, name, name, name, name))
cdffile.write("\n".join(["%f,%f,%f,%f,%f,%f" % (xvals[i], cyto_y[i], nuc_y[i], mono_y[i], low_y[i], high_y[i]) for i in range(len(xvals))]))
mw_cn_test = mannwhitneyu(cyto_y, nuc_y)
mw_ch_test = mannwhitneyu(cyto_y, high_y)
# plot eCDFs
fig, ax = plt.subplots()
plt.plot( xvals, cyto_y )
plt.plot( xvals, nuc_y )
plt.plot( xvals, mono_y )
plt.plot( xvals, low_y )
plt.plot( xvals, high_y )
plt.legend(["cyto", "nuc", "mono", "low", "high"], fontsize=8)
#plt.text(.7, 0.15, "K-S, p = %4.2f, %.3g" % (ks_test[0], ks_test[1]), fontsize=8, transform=ax.transAxes)
plt.text(.7, 0.15, "M-W CN U, p = %4.2f, %.3g" % (mw_cn_test[0], mw_cn_test[1]), fontsize=8, transform=ax.transAxes)
plt.text(.7, 0.1, "M-W CH U, p = %4.2f, %.3g" % (mw_ch_test[0], mw_ch_test[1]), fontsize=8, transform=ax.transAxes)
#plt.text(.7, 0.05, "set1: %s; set2: %s" % (", ".join(args.set1[1::2]), ", ".join(args.set2[1::2])), fontsize=8, transform=ax.transAxes)
#plt.text(.7, 0.02, "n_1: %d; n_2: %d; n_3: %d" % (len(ivals), len(jvals)), fontsize=8, transform=ax.transAxes)
plt.savefig("%s_ecdf.png" % name)
plt.close(fig)
print "plotting hesc"
plot_it(hesc, "hesc", 100)
print "plotting npc"
plot_it(npc, "npc", 100)
print "plotting neu14"
plot_it(neu14, "neu14", 100)
print "plotting neu50"
plot_it(neu50, "neu50", 100)
# now count how many isoforms there are per gene that are above a certain fraction
# returns a dict of [number_of_isoforms][count_of_genes]
def isoforms_per_gene(dict, fraction, sample):
isoforms_over_fraction = defaultdict(int)
samples = []
for key, val in dict.iteritems():
if sample == "cyto":
samples = [i.cyto for i in val]
elif sample == "nuc":
samples = [i.nuc for i in val]
elif sample == "mono":
samples = [i.mono for i in val]
elif sample == "low":
samples = [i.low for i in val]
elif sample == "high":
samples = [i.high for i in val]
else:
print "Unrecognized sample %s" % sample
total = sum(samples)
if total > 1: # only continue if expressed (TPM > 1 cutoff)
# transform to fractional isoform abundance
samples = [i / total for i in samples]
n_over_fraction = sum([i > fraction for i in samples])
isoforms_over_fraction[n_over_fraction] += 1
return isoforms_over_fraction
# run isoforms_per_gene and print the output for a cell type (name) for a specified percent (not fraction)
def count_it(dict, name, percent):
cyto = isoforms_per_gene(dict, percent/100., "cyto")
nuc = isoforms_per_gene(dict, percent/100., "nuc")
mono = isoforms_per_gene(dict, percent/100., "mono")
low = isoforms_per_gene(dict, percent/100., "low")
high = isoforms_per_gene(dict, percent/100., "high")
histfile = safe_open_file("%s_isoforms_%d.csv" % (name, percent))
nrow = max(max(cyto.keys()), max(nuc.keys()), max(mono.keys()), max(low.keys()), max(high.keys()))
xvals = range(nrow+1)
cyto_list = [cyto[i] for i in xvals]
nuc_list = [nuc[i] for i in xvals]
mono_list = [mono[i] for i in xvals]
low_list = [low[i] for i in xvals]
high_list = [high[i] for i in xvals]
histfile.write("n_isoforms,%s_cyto,%s_nuc,%s_mono,%s_low,%s_high\n" % (name, name, name, name, name))
histfile.write("\n".join(["%f,%f,%f,%f,%f,%f" % (xvals[i], cyto[i], nuc[i], mono[i], low[i], high[i]) for i in range(nrow+1)]))
# plot "histograms"
fig, ax = plt.subplots()
plt.plot( xvals, cyto_list )
plt.plot( xvals, nuc_list )
plt.plot( xvals, mono_list )
plt.plot( xvals, low_list )
plt.plot( xvals, high_list )
plt.legend(["cyto", "nuc", "mono", "low", "high"], fontsize=8)
plt.savefig("%s_hist_%d.png" % (name, percent))
plt.close(fig)
for i in [5, 10, 25]:
print "counting hesc %d" % i
count_it(hesc, "hesc", i)
print "counting npc %d" % i
count_it(npc, "npc", i)
print "counting neu14 %d" % i
count_it(neu14, "neu14", i)
print "counting neu50 %d" % i
count_it(neu50, "neu50", i)