https://github.com/stephenfloor/tripseq-analysis
Tip revision: 3e823abcca5b8c1e5e89dd9bd4c49e8673b3e957 authored by Stephen Floor on 24 June 2017, 00:52:49 UTC
email update
email update
Tip revision: 3e823ab
plot_tripseq_transcript.py
#!/usr/bin/env python
# plot_tripseq_transcript.py
# plot the indicated transcript or gene based input polysome sequencing (TrIPseq) data
# Intended to be applied to fractionated polysome sequencing data (a la tripseq) but theoretically can work for any data involving transcript-specific distributions
# Input format can be either averaged or replicates listed side-by-side; units are arbitrary (counts, fpkm, vsd, rlog, etc)
# SNF Winter 2014/2015
import sys, os, argparse, csv, random, re
from SNFUtils import * # prompt, safe_open_file, stdout_from_command
from collections import defaultdict, namedtuple
from scipy.stats.stats import pearsonr, spearmanr, kurtosis
from scipy.optimize import curve_fit
from scipy.stats import mannwhitneyu
import numpy as np
import matplotlib.pyplot as plt
from itertools import izip_longest
parser = argparse.ArgumentParser(description="Plot input transcript ID from input distribution file")
#parser.add_argument("-i", "--input", help="File containing transcript expression values", required=True)
parser.add_argument("-i", "--input", help="File containing transcript distributions", required=True)
parser.add_argument("-o", "--output", help="Output filename (default is stdout)", default="stdout")
parser.add_argument("-n", "--nrep", help="Number of replicates of each point", required=True, type=int)
parser.add_argument("--id", help="Transcript ID(s) to print (can be partial; can be comma-separated list)", required=True)
parser.add_argument("--tx-to-gene", help="File containing transcript ID to gene name mapping", required=True)
parser.add_argument("--text", help="Output text data in addition to plots.", action="store_true")
parser.add_argument("--format", help="Image format to export (png or pdf).", default="pdf")
args = parser.parse_args()
print "----------------------------------------"
print "| plot_tripseq_transcript.py |"
print "| plot transcript distribution |"
print "| run with -h for help |"
print "| snf 2/2015 |"
print "------------------------------------\n\n"
print "Arguments:"
print args
if (args.output == "stdout"):
outfile = sys.stdout
else:
outfile = safe_open_file(args.output)
tx_to_gene = defaultdict(str)
tx_to_name = defaultdict(str)
tx_to_txname = defaultdict(str)
processed = 0
print "\nCreating tx_to_gene dictionary from %s..." % args.tx_to_gene
with open(args.tx_to_gene, "r") as txfile:
for line in txfile:
line = line.split()
if (len(line) < 4):
sys.exit("ERROR: line in tx_to_gene file has less than four entries: %s" % line)
tx_to_gene[line[0]] = line[1]
tx_to_name[line[0]] = line[2]
tx_to_txname[line[0]] = line[3]
processed += 1
print "Dictionary created (%d entries)." % processed
# this is used to define the order of the input columns in the output plots, to rearrange input to output.
# janky but it works for these input files.
ORDER = [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 3]
ids = args.id.split(",")
ids = [ ids[i].strip() for i in range(len(ids))]
print "Looking for ids %s" % ids
# sample input format:
# -- first column must be transcript ID; quotes are optional
# -- header line below is optional and detected by a non-numeric second column
# "ensid" "X80S" "poly2" "poly3" "poly4" "poly5" "poly6" "poly7" "poly8"
#"ENST00000000233" -0.562625014929129 -2.23696624248826 -0.765273156804401 0.702668471793972 1.36603995862049 1.02457718119074 0.751424784189265 -0.279845981572665
#"ENST00000000412" -1.10133854138933 -1.77019107516088 -1.7623840822903 0.404249332913166 1.42491481379874 1.42016301836686 1.35747792898968 0.0271086047720868
#"ENST00000000442" 1.1367571728999 -2.58192112517174 0.0240875688746112 0.482589702447365 0.969885993274519 0.643070451106668 -0.177572605307724 -0.496897158123603
np.seterr(all='raise')
processed = 0
rejected_pe = 0
rejected_zero = 0
fname = args.input
with open(fname, "r") as infile:
#inspect the first line
random.seed(1)
if fname.endswith("csv"):
line = [i.strip("\"") for i in infile.readline().split(",")]
else:
line = [i.strip("\"") for i in infile.readline().split()]
if (is_number(line[2])): # no header
found_header = False
infile.seek(0) #rewind
print "\tNo header found."
else:
found_header = True
header = line
print "\tHeader found on line one."
print "\tOutput order (CHECK THIS; should be 40, 60, p1-8, cyto):"
header = [ header[i*args.nrep+1] for i in ORDER ]
print "\t\t" + ", ".join(header)
if ( (len(line) - 1) % args.nrep):
print "\tWARNING: %d observations found; not divisible by %d replicates." % (len(line) - 1, args.nrep)
npts = (len(line) - 1)/args.nrep
print "\tProcessing %d points (%d replicates)" % ( npts, args.nrep)
for line in infile:
if (not line.strip()):
continue
if fname.endswith("csv"):
line = [i.strip("\"") for i in line.split(",")]
else:
line = [i.strip("\"") for i in line.split()]
# only process the one transcript
if True in [ re.search(ids[i], line[0]) != None for i in range(len(ids))] or True in [tx_to_name[line[0]] == ids[i] for i in range(len(ids))]:
print "\tPlotting id %s" % line[0]
# convert all to floats before continuing...
line = [line[0]] + [float(i) for i in line[1:]]
txid = line[0]
obs = line[1:] # the observations
nobs = len(obs) / args.nrep
# 1 read on a 1kb transcript out of 10 million -> FPKM = 1e-10. rounding to this precision.
obs = np.around(obs, 10)
if (len(obs) % args.nrep):
print "\tWarning: transcript %s has %d observations which is not divisible by %d replicates" % (txid, len(obs), args.nrep)
# collapse replicates into means, preserve stdev
means = [np.mean(i) for i in chunkwise(obs, args.nrep)]
stdevs = [np.std(i) for i in chunkwise(obs, args.nrep)]
# reorder the lists here to go 40S, 60S, 80S, p2-p8, cyto
means = [ means[i] for i in ORDER ]
stdevs = [ stdevs[i] for i in ORDER ]
if txid in tx_to_name:
#plotname = txid + " (" + tx_to_name[txid] + ")"
plotname = tx_to_txname[txid] + " (" + txid + ")"
outfilename = tx_to_txname[txid] + "_" + txid + "." + args.format
if (args.text):
outdataname = tx_to_txname[txid] + "_" + txid + ".txt"
else:
print "Warning: gene name for transcript %s not found" % txid
plotname = txid
outfilename = plotname + "." + args.format
if (args.text):
outdataname = plotname + ".txt"
plot_dist_fancy(range(1,npts+1), means, outfilename, stdevs, plotname)
if (args.text):
outfile = safe_open_file(outdataname)
outfile.write("\n".join( [ "%s\t%d\t%.6f\t%.6f" % (header[i], i, means[i], stdevs[i]) for i in range(len(header))]))
outfile.write("\n")