Content - aaedeac0773f6bea80f6c2e240c31e46d9001706 - d7c631f/transcriptome_properties.py

visit type:
Tip revision: 3e823abcca5b8c1e5e89dd9bd4c49e8673b3e957 authored by Stephen Floor on 24 June 2017, 00:52:49 UTC
email update
Tip revision: 3e823ab
transcriptome_properties.py
#!/usr/bin/env python


# transcriptome_properties.py

# Calculate relevant properties of an input transcriptome (or subset thereof) specified in BED format

# Properties to calculate:

# GC content (done)
# length (done)
# exonct (done) 

# optional (region-specific):

# - 5' UTR specific
#  - structure at 5' end (for 5' UTR, specifically affects 43S loading)
#  - min deltaG over sliding window of N nts (default 75)
#  - 5' TOP or etc?  pyrimidine content of 5' proximal?  sabatini defined this as a 5nt pyrimidine stretch within 15nt of TSS. (not implemented) 

# start codon specific 
#  - kozak context (done) 
#  - number of uORFs (done)
#  - uorf overlap with start codon (done)
#  - code all four possibilities for each ATG-proximal base (not implemented) 
#  - start codon itself - is it always ATG?  (done)

# - CDS specific
#  - % rare codons 
#  - signal peptide?  (not implemented; just download list from http://www.signalpeptide.de/index.php  ? ) 
#  - min deltaG over sliding window of N nts


# - 3' UTR specific
#  - miRNA binding sites (targetscan)
#  - AU-rich elements
#  - min deltaG over sliding window of N nts 

import sys,os,argparse,subprocess,re,multiprocessing,threading,time
import HumanCodonTable
import AnnotationConverter
import TargetscanScores
from SNFUtils import *
from collections import defaultdict
    

# gets the folding energy of a sequence. returns a tuple containing energies (MFE, centroid, MEA) 
# RNAfold courtesy viennaRNA 
def get_energy(sequence): 
    if len(sequence) < 1: 
        return (0, 0, 0)

    lines = stdout_from_command("echo %s | RNAfold --noPS --MEA -p" % sequence)

    #first line is just the seq - skip 
    lines.next() 

    #second line has the MFE 
    MFE = float(re.sub('[()]','', lines.next().split()[-1]))
        
    #third line has the ensemble energy - discarding for now
    lines.next() 
    
    #fourth line has the centroid energy 
    centroid = float(re.sub('[{}]','', lines.next().split()[-2]))
    
    #fifth line has the MEA 
    MEA = float(re.sub('[{}]','', lines.next().split()[-2]))

    return (MFE, centroid, MEA)

# gets the min folding energy for a sliding window of windowsize across the sequence
# uses multiprocessing via the procpool to accelerate. - removed in favor of global threading
#def get_sliding_energy(sequence, windowsize, procpool):
def get_sliding_energy(sequence, windowsize):
    if len(sequence) < 1: 
        return (0, 0, 0)
    all75s = [sequence[i:i+windowsize] for i in range(len(sequence) - windowsize)]
    #nrg = procpool.map(get_energy, all75s)
    nrg = map(get_energy, all75s)
    minMFE = min([tup[0] for tup in nrg])
    mincentroid = min([tup[1] for tup in nrg])
    minMEA = min([tup[2] for tup in nrg])
    
    return (minMFE, mincentroid, minMEA)


# gets the folding energy (MFE) of a sequence using RNALfold
def get_rnalfold_energy(sequence, windowsize): 
    if len(sequence) < 1: 
        return 0

    lines = stdout_from_command("echo %s | RNALfold -L %d" % (sequence, windowsize))

    # example output: .(((.(((..((((.....((((....((.......))....)))).)))).............))).))). ( -9.00)   44
    minMFE = min([float(line.split(' (')[1][:6]) if line[0] in ".(" else "" for line in lines])

    if not minMFE:
        minMFE = 0

    return float(minMFE)
                                

parser = argparse.ArgumentParser(description="Calculate transcriptome-wide properties")

globalargs = parser.add_argument_group(title="Global arguments")
globalargs.add_argument("-i", "--input", help="The transcriptome region (BED format)", required=True)
globalargs.add_argument("-g", "--genome", help="The genome for the input transcriptome", required=True)
globalargs.add_argument("--gc", help="Calculate GC content", action="store_true")
globalargs.add_argument("--length", help="Calculate length", action="store_true")
globalargs.add_argument("--exonct", help="Count # of exons", action="store_true") 
globalargs.add_argument("--nt", help="Number of threads (default is 8 or 4 for lfold)", default=8, type=int)
globalargs.add_argument("-o", "--output", help="Output basename (e.g. CDS)", default="tx_props")
globalargs.add_argument("--window", help="Window size for sliding window calculations (default 75)", default=75, type=int)
globalargs.add_argument("--convtorefseq", help="Filename to convert input annotations to refseq (for targetscan; e.g. knownToRefSeq.txt)", default=False)
globalargs.add_argument("--targetscanfile", help="Filename of targetscan scores (e.g. Summary_Counts.txt)", default=False)
globalargs.add_argument("--deltag", help="Calculate min deltaG in sliding window of size --window over region", action="store_true")
globalargs.add_argument("--lfold", help="Use RNALfold to calculate MFE rather than RNAfold (faster but does not compute centroid,MEA)", action="store_true")

utr5args = parser.add_argument_group(title="5' UTR specific arguments")
utr5args.add_argument("--cap-structure", help="Calculate structure at the 5' end", action="store_true")
#utr5args.add_argument("--deltag-5utr", help="Calculate min deltaG in 5' UTR in sliding window of size --window", action="store_true")

startargs = parser.add_argument_group(title="Start-codon-specific arguments")
startargs.add_argument("--kozak", help="Calculate Kozak context score", action="store_true")
startargs.add_argument("--uorf-count", help="Calculate number of 5' UTR uORFs (starting with [ACT]TG)", action="store_true")
startargs.add_argument("--uorf-overlap", help="Overlap of uORF with start codon (implies --uorf-count)", action="store_true")
startargs.add_argument("--start-codon", help="Record the start codon used (ATG or other)", action="store_true") 

cdsargs = parser.add_argument_group(title="CDS-specific arguments") 
cdsargs.add_argument("--rare-codons", help="Calculate codon usage properties", action="store_true") 
#cdsargs.add_argument("--deltag-cds", help="Calculate min deltaG in CDS in sliding window of size --window", action="store_true") 

utr3args = parser.add_argument_group(title="3' UTR specific arguments") 
utr3args.add_argument("--mirna-sites", help="Compile miRNA binding site info from targetscan", action="store_true")
utr3args.add_argument("--au-elements", help="Count number of AU-rich elements in the 3' UTR", action="store_true")
#utr3args.add_argument("--deltag-3utr", help="Calculate min deltaG in 3' UTR in sliding window of size --window", action="store_true") 

args = parser.parse_args()

print "-----------------------------------"
print "|   transcriptome_properties.py   |"
print "|    compute props of a txome     |"
print "|      run with -h for help       |"
print "|          snf 11/2014            |"
print "-----------------------------------\n\n"


print "Arguments:"

print args

# get a streaming input of all the regions from bedtools

print "\n\nReading from regions using bedtools getfasta: "

# bedtools getfasta flags: 
#  -name  preserve region name
#  -tab   tab-delimited output 
#  -s     reverse complement - strand regions 
#  -split concat all blocks in the input bed (i.e. splice together exons) 

cmd = "bedtools getfasta -fi %s -bed %s -name -tab -s -fo - -split 2> %s_stderr.log" % (args.genome, args.input, args.output)
print cmd + "\n"

# sanity check inputs here

prompted = False

if (args.lfold):
    print "Using RNALfold to calculate just MFE\n"
    if (args.nt != 4):
        print "Warning: highest performance during benchmarks with --lfold achieved with 4 threads\n"
    

if (args.cap_structure):
    prompt("Using 5' UTR specific options, so input should be 5' UTR regions!")
    prompted = True

if (args.kozak or args.uorf_overlap or args.uorf_count or args.start_codon):
    if (prompted):
        sys.exit("FATAL: multiple region-specific options for disparate regions selected.")
    prompt("Using start codon specific options, so input should be start codon regions (5utr_start)!") 
    prompted = True 

if (args.rare_codons):
    if (prompted):
        sys.exit("FATAL: multiple region-specific options for disparate regions selected.")
    prompt("Using CDS specific options, so input should be CDS regions!") 
    prompted = True 

if (args.mirna_sites or args.au_elements):
    if (prompted):
        sys.exit("FATAL: multiple region-specific options for disparate regions selected.")
    prompt("Using 3' UTR specific options, so input should be 3' UTR regions!") 
    prompted = True 

if (args.mirna_sites): 
    if (not args.convtorefseq):
        sys.exit("FATAL: argument --convtorefseq <file> required for --mirna-sites")
    if (not args.targetscanfile):
        sys.exit("FATAL: argument --targetscanfile <file> required for --mirna-sites") 

# open relevant output files 

if (args.gc):
    gc_outfile = safe_open_file("%s_gc_content.csv" % args.output)
    gc_outfile.write("transcriptID,gc_content\n")

if (args.length):
    len_outfile = safe_open_file("%s_length.csv" % args.output)
    len_outfile.write("transcriptID,length\n")

txid_to_exonct = defaultdict(int)

if (args.exonct):
    exonct_outfile = safe_open_file("%s_exonct.csv" % args.output)
    exonct_outfile.write("transcriptID,exonct\n")
    print "Building exon count dictionary..."
    # count the # of exons real quick and throw it into a dictionary
    # col 9 is exonct, col 3 is txid 
    with open(args.input, "r") as inp:
        for line in inp:
            line = line.split()
            txid_to_exonct[line[3]] = line[9]
    print "Built.\n"

if (args.cap_structure):
    cap_structure_outfile = safe_open_file("%s_cap_structure.csv" % args.output) 
    cap_structure_outfile.write("transcriptID,MFE,centroid,MEA\n")

if (args.deltag):
    if (args.lfold): 
        deltag_outfile = safe_open_file("%s_deltag_lfold.csv" % args.output)
        deltag_outfile.write("transcriptID,MFE_min\n")
    else:
        deltag_outfile = safe_open_file("%s_deltag.csv" % args.output)
        deltag_outfile.write("transcriptID,MFE_min,centroid_min,MEA_min\n")

if (args.uorf_count or args.uorf_overlap): # output the count even if just overlap is supplied; have to calculate this anyway
    uorf_count_outfile = safe_open_file("%s_uorf_count.csv" % args.output) 
    uorf_count_outfile.write("transcriptID,uORF_count\n") 

if (args.uorf_overlap):
    uorf_overlap_outfile = safe_open_file("%s_uorf_overlap.csv" % args.output)
    uorf_overlap_outfile.write("transcriptID,uORF_overlap\n")

if (args.kozak):
    kozak_outfile = safe_open_file("%s_kozak.csv" % args.output) 
    kozak_outfile.write("transcriptID,kozak_score\n")

if (args.start_codon): 
    start_codon_outfile = safe_open_file("%s_start_codon.csv" % args.output)
    start_codon_outfile.write("transcriptID,start_codon,start_codon_numeric\n")

if (args.rare_codons):
    rare_codons_outfile = safe_open_file("%s_rare_codons.csv" % args.output)
    rare_codons_outfile.write("transcriptID,avg_codon_freq,min_codon_freq\n") 

if (args.mirna_sites):
    mirna_sites_outfile = safe_open_file("%s_mirna_count.csv" % args.output)
    mirna_sites_outfile.write("transcriptID,num_miRNA_sites,miRNA_score_sum,miRNA_score_min,num_cnsv_sites,cnsv_score_sum,cnsv_score_min,num_noncnsv_sites,noncnsv_score_sum,noncnsv_score_min\n")
    mirna_density_outfile = safe_open_file("%s_mirna_density.csv" % args.output)
    mirna_density_outfile.write("transcriptID,miRNA_density,miRNA_score_density,cnsv_site_density,cnsv_score_density,noncnsv_site_density,noncnsv_score_density\n")

    try: 
        convFile = open(args.convtorefseq, 'r')
        conv2rs = AnnotationConverter.AnnotationConverter(convFile)
    except: 
        sys.exit("Cannot open file %s for annotation conversion to refseq" % args.convtorefseq)
    try:
        targetscanScoreFile = open(args.targetscanfile, 'r')
    except: 
        sys.exit("Cannot open file %s for reading targetscan scores" % args.targetscanfile)
    tsscores = TargetscanScores.TargetscanScores(targetscanScoreFile)


if (args.au_elements): 
    au_elements_outfile = safe_open_file("%s_au_elements.csv" % args.output)
    au_elements_outfile.write("transcriptID,pentamer_count,au_element_count,au_element_frac,max_au_length\n")

#pool = multiprocessing.Pool(processes=args.nt)


def process_line(line):

    line = line.split()

    transcriptID = line[0]
    #print "Processing tx %s" % transcriptID

    sequence = line[1].upper()

    seqlen = len(sequence)

    if (args.gc):

        numN = sequence.count("N")

        if ("N" in sequence): 
            #print "Warning: stripping N nucleotides from transcript %s" % (transcriptID) 
            sequence = sequence.strip("N") 
            #seqlen = len(sequence) 

        numG = sequence.count("G")
        numC = sequence.count("C")
        numA = sequence.count("A")
        numT = sequence.count("T")

        #if (numG + numC + numA + numT != seqlen):  # there must be an internal N; try to continue somewhat gracefully by adjusting the sequence length
            #seqlen -= numN
            #print "Warning: %d internal N nucleotides ignored when calculating GC content for transcript %s" % (numN, transcriptID) 
            #sys.exit("FATAL: sum of ACTG (%d) != length of sequence (%d) for sequence %s" % (numG+numC+numA+numT, seqlen, sequence))
            
        percent_gc = (numG + numC) / float(seqlen - numN) if seqlen - numN > 0 else 0

        gc_outfile.write("%s,%3.2f\n" % (transcriptID, percent_gc))

    if (args.length):
        len_outfile.write("%s,%d\n" % (transcriptID, seqlen))

    if (args.exonct):
        exonct_outfile.write("%s,%s\n" % (transcriptID, txid_to_exonct[transcriptID]))
        
    if (args.cap_structure): 
        # calculate the deltaG of the 50nt after the 5' end here, or if the 5' UTR is less than 50nt just calculate the deltaG of the whole thing.
        # using viennaRNA 
        if (seqlen < 50):
            nrg = get_energy(sequence)
        else:
            nrg = get_energy(sequence[0:50])

        cap_structure_outfile.write("%s,%.1f,%.1f,%.1f\n" % (transcriptID, nrg[0],nrg[1],nrg[2]))
        

    if (args.deltag):
        # calculate the min deltag of a sliding window of size args.window across the region 
        if (args.lfold):
            deltag_outfile.write("%s,%.1f\n" % (transcriptID, get_rnalfold_energy(sequence, args.window)))
        else: 
            if (seqlen <= args.window):
                nrg = get_energy(sequence)
                deltag_outfile.write("%s,%.1f,%.1f,%.1f\n" % (transcriptID, nrg[0],nrg[1],nrg[2]))
            else: 
                #nrg = get_sliding_energy(sequence, args.window, pool)
                nrg = get_sliding_energy(sequence, args.window)
                deltag_outfile.write("%s,%.1f,%.1f,%.1f\n" % (transcriptID, nrg[0],nrg[1],nrg[2]))
            
    if (args.kozak): 
        # score the Kozak context.  This is G c c A/G c c atg G.  The most important nts are +4, -3 and -6.  Scoring these as +3 and the others as +1. Max score = 13
        # need to implement this.  this is actually a "start codon" specific parm because it involves overlap between the 5' UTR and CDS. likewise for uorf-overlap 
        
        if (len(sequence) > 32): 
            # start codon is at -28 through -26 in the 5utr_start files, so kozak starts at -34 and ends at -24 ignoring -28 through -26  
            kozakScore = sum ((sequence[-32] == "C", sequence[-31] == "C", sequence[-29] == "C", sequence[-28] == "C"))
            kozakScore += 3 * sum((sequence[-33] == "G", sequence[-30] == "A" or sequence[-30] == "G", sequence[-24] == "G"))
            
        else: 
            # something with a UTR that is less than 6nt is not likely to follow kozak behavior, so just setting the score to -1 to indicate oddball status 
            kozakScore = -1
    
        kozak_outfile.write("%s,%d\n" % (transcriptID, kozakScore))

    if (args.uorf_count or args.uorf_overlap): 
        # count the number of uORFs (min length 10 codons) in the sequence. 
        # !!! IT IS ASSUMED that the sequence is the 5' UTR ending with ATG([N]{27}) as in the 5utr_start bed files, otherwise this makes no sense!!!!!!!!!!
        
        # uORF is defined as orf >= 10 codons that starts with ATG/CTG/GTG as per Ingolia 2011 and either ends with a stop codon or the end of the UTR which implies it continues into the proper CDS 
        # I am not sure how good these criteria really are, and they predict massive numbers of uORFs. an alternative approach would be to actually measure the translated uORFs with harringtonine in these cells... 
        

        # re to find all uORFs including near-cognate matches CTG/TTG 
        orf_pattern = re.compile(r'(?=([ACG]TG(?:...)*?)(?=TAG|TGA|TAA|.{0,2}$))')  # EOL counts because that means the uORF has read at least 27nt into the coding sequence
        uorfs = [orf for orf in orf_pattern.findall(sequence) if len(orf) >= 30]

        # trim uorfs - method one; use string comparisons 

        uorfs_trimmed=[]
        for orf in uorfs:
            found = False 
            if (orf not in uorfs_trimmed):
                for orf_cmp in uorfs_trimmed:
                    if (orf in orf_cmp):
                        found = True 
                if (not found):
                    uorfs_trimmed.append(orf)
            
        uorfs = uorfs_trimmed

        # nice up the output
        uorf_count_outfile.write("%s,%d\n" % (transcriptID, len(uorfs)))

        if (args.uorf_overlap): 
            
            starts = [sequence.find(orf) for orf in uorfs]
            ends = [starts[i] + len(uorfs[i]) for i in range(len(uorfs))]
            overlaps = sum([seqlen - ends[i] < 28 for i in range(len(ends))]) # 28 here comes from the construction of 5utr_start bed files, which have 27nt after the start.

            #print sequence 
            #print starts, ends, overlaps
            
            uorf_overlap_outfile.write("%s,%d\n" % (transcriptID, overlaps))


    if (args.start_codon): 
        # record what the start codon is, along with a numeric representation (0 is ATG, 1 is 1 mismatch, 2 is 2 mismatches, 3 is 3 mismatches)
        # INPUT MUST BE 5utr_start for these calculations, in which the start codon is from 
        

        startCodon = sequence[-27:-24]

        mismatches = sum((startCodon[0] != "A", startCodon[1] != "T", startCodon[2] != "G"))

        #print sequence, startCodon, mismatches

        start_codon_outfile.write("%s,%s,%d\n" % (transcriptID, startCodon, mismatches))


    if (args.rare_codons):
        codonTable = HumanCodonTable.HumanCodonTable()
        
        rare_codons_outfile.write("%s,%3.2f,%3.2f\n" % (transcriptID, codonTable.averageCodonFreq(sequence), codonTable.minCodonFreq(sequence, 5)))  # 5 here is the window size in codons, so 15 nucleotides 



    if (args.mirna_sites):

        refseqIDs = conv2rs.convert(transcriptID[:-5])

        if (refseqIDs): 

            nSites = 0 
            scoreSum = 0
            minScore = 999999999
            nCnsvSites = 0 
            cnsvScoreSum = 0
            cnsvMinScore = 9999999999
            nNoncnsvSites = 0 
            noncnsvScoreSum = 0
            noncnsvMinScore = 9999999999

            
            for id in refseqIDs: 
                nSites += tsscores.getNumSites(id)
                scoreSum += tsscores.getScoreSum(id) 
                newMin = tsscores.getMinScore(id)
                if (newMin < minScore):
                    minScore = newMin

                nCnsvSites += tsscores.getCnsvNumSites(id)
                cnsvScoreSum += tsscores.getCnsvScoreSum(id) 
                newMin = tsscores.getCnsvMinScore(id)
                if (newMin < cnsvMinScore):
                    cnsvMinScore = newMin

                nNoncnsvSites += tsscores.getNoncnsvNumSites(id)
                noncnsvScoreSum += tsscores.getNoncnsvScoreSum(id) 
                newMin = tsscores.getNoncnsvMinScore(id)
                if (newMin < noncnsvMinScore):
                    noncnsvMinScore = newMin

                    
            if (nSites > 0):
                    
                mirna_sites_outfile.write("%s,%d,%3.2f,%3.2f,%d,%3.2f,%3.2f,%d,%3.2f,%3.2f\n" % \
                                              (transcriptID, nSites, scoreSum, minScore, nCnsvSites, cnsvScoreSum, cnsvMinScore,\
                                                   nNoncnsvSites, noncnsvScoreSum, noncnsvMinScore))
                
                
                nSites /= float(seqlen)
                scoreSum /= seqlen
                nCnsvSites /= float(seqlen)
                cnsvScoreSum /= seqlen
                nNoncnsvSites /= float(seqlen)
                noncnsvScoreSum /= seqlen

                mirna_density_outfile.write("%s,%.6f,%.6f,%.6f,%.6f,%.6f,%.6f\n" % \
                                              (transcriptID, nSites, scoreSum, nCnsvSites, cnsvScoreSum, nNoncnsvSites, noncnsvScoreSum))



    if (args.au_elements):
        # calculate: 
        # 1) count AUUUA
        # 2) number of AU-stretches
        # 3) percentage of UTR covered by AREs 
        # 4) longest A/U stretch 
        auPentamerCount = sequence.count("AUUUA") 

        aurich_re = re.compile(r'[AU]{5,}')  # RE to find more than 5 a/u in a row 
        au_elements = aurich_re.findall(sequence)

        if (au_elements):
            numAUElements = len(au_elements)
            aurichFraction = len("".join(au_elements))/float(seqlen)
            longestAUElement = max(map(len,au_elements))

        else: 
            numAUElements = aurichFraction = longestAUElement = 0

        au_elements_outfile.write("%s,%s,%s,%3.2f,%s\n" % (transcriptID, auPentamerCount, numAUElements, aurichFraction, longestAUElement))
        



processed = 0 

delay = 0.1
if (args.lfold):  # different sleep delays for lfold or regular because threads have different average durations in the two cases
    delay = 0.01

for line in stdout_from_command(cmd):

    if (threading.activeCount() < args.nt + 1):
        t = threading.Thread(target=process_line, args=(line,))
        t.start()
        processed += 1 
        
    else:
        while(threading.activeCount() >= args.nt + 1):
            time.sleep(delay)
        t = threading.Thread(target=process_line, args=(line,))
        t.start()
        processed += 1 

    if (not(processed % 2500)):
        print "Processed %d entries..." % processed 

print "Processed %d entries." % processed 

has_stderr = False

with open ("%s_stderr.log" % args.output, "r") as logfile:
    for i,l in enumerate(logfile):
        has_stderr = True
if (has_stderr):
    print "WARNING: stderr was generated during this run. See %s_stderr.log for details" % args.output
Browse the archive

https://github.com/stephenfloor/tripseq-analysis