https://github.com/fjruizruano/ngs-protocols
Raw File
Tip revision: 39a091d1fa569a7fc717ac73c4b3de07f0a1204d authored by fjruizruano on 03 August 2023, 11:48:27 UTC
adding gfa2fas.py and extract_gfa.py
Tip revision: 39a091d
trinity_extract_longest2.py
#!/usr/bin/python

from Bio import SeqIO
import sys
import operator

try:
    secu = sys.argv[1]
except:
    secu = raw_input("Introduce Trinity's output: ")

di = {}

dat = SeqIO.parse(open(secu), "fasta")

for s in dat:
    #extract gene names	
    gene = s.id
    gene = gene.split("_")
    gene = "_".join(gene[:-1])
    #for each gene add isoforms and length
    if gene not in di:
        di[gene] = {s.id:len(s.seq)}
    else:
        di[gene][s.id] = len(s.seq)

#selecting longest isoforms
li = []

for x in di:
    dictio = sorted(di[x].iteritems(),key=operator.itemgetter(1))
    dictio.reverse()
    li.append(dictio[0][0])

#write output
w = open("Trinity.longest.fasta", "w")

dat = SeqIO.parse(open(secu), "fasta")

for s in dat:                                   
    if s.id in li:
        w.write(">%s\n%s\n" % (s.description,s.seq))

w.close()

back to top