Content - 6383247f01c8e60109dc0f7c82c4f0f6d0a2cfe9 - f504533/getBLASTN.py

visit type:
Tip revision: 3f0684db9e9bef11a9318dc3831ca479ccc6dac9 authored by agardb on 08 February 2018, 18:25:31 UTC
Add files via upload
Tip revision: 3f0684d
getBLASTN.py
print ("Start")

import re, time, gzip, os.path, sys, argparse
from Bio.Blast import NCBIWWW

class Entry:
	def __init__(self,LOCID,coord,GI,length,numExons):
		self.LOCID=LOCID
		self.coord=coord
		self.GI=GI
		self.length=length
		self.numExons=numExons

def getBLAST(GI):	#
	print ("Getting BLAST result")
	startTime = time.time()
	#This is the line to change to tweak NCBI query parameters, or use your own blast server, check the biopython docs.
	result_handle = NCBIWWW.qblast("blastn","refseq_rna",GI,hitlist_size=500,megablast=True)	#All RNA now
#	result_handle = NCBIWWW.qblast("blastn","refseq_rna",GItouse,db_genetic_code=39947,hitlist_size=5,megablast=True)
	print ("Got BLAST result in %d seconds" % (time.time() - startTime))
	with gzip.open("data/" + str(GI) + ".xml.gz", "wb") as saveIt:
		saveIt.write(result_handle.read())
	result_handle.close()
	return


parser = argparse.ArgumentParser(description="")
parser.add_argument('--withGIs',action='store_true')
parser.add_argument('--withCoords',action='store_true')
parser.add_argument('--justCount',action='store_true')
parser.add_argument('--onlyLongest',action='store_true')
parser.add_argument('--onlyShortest',action='store_true')
parser.add_argument('--gbff',nargs=1, required=True)

args = parser.parse_args()

withGIs = args.withGIs
withCoords = args.withCoords
justCount = args.justCount
onlyLongest = args.onlyLongest
onlyShortest = args.onlyShortest
gbffFilePath = args.gbff[0]


if onlyLongest or onlyShortest:
	withCoords = True
count = 0


#TODO: This mode thing is not great, probably take it out and just do everything, outputting only as necessary


with open(gbffFilePath, "r") as gbffIn:
        
	output, numExons, thisGI, thisLOCID, foundOne = [], 0, "", "", False
	entries = []
	mode = 0	#Mode: 0 to findNC, 2 then 1 to find gene and GI.  Will behave badly if either are ever absent, but will work okay if they're in either order
	for line in gbffIn:
		if mode == 0:
			if "ncRNA" in line and not("Features" in line or "/ncRNA" in line):
				matchThis = line.strip()
				while len(re.findall('\(',matchThis)) != len(re.findall('\)',matchThis)):
					matchThis += next(gbffIn).strip()
				exonCoords = re.finditer('(\d+)\.\.(\d+)',matchThis)	#This discards e.g. < signs, so "<150..300" is treated as "150..300"
				length = 0
				numExons = 0
				for eCoord in exonCoords:
					numExons += 1
					length += int(eCoord.group(2)) - int(eCoord.group(1))
				mode = 2
				foundOne = True
				if justCount:
					count += 1
					mode = 0

		if mode != 0:
			if "/gene" in line:
				result = re.search('LOC[0-9]+',line)
				if result:
					thisLOCID = result.group(0)
				else:
					print "Error, bad LOC format"
				if(not withGIs):
					mode = 0
				else:
					mode -= 1
			if ("/db_xref=\"GI:" in line) and withGIs:
				result = re.search('GI:[0-9]+',line)
				if result:
					thisGI=result.group(0).lstrip("GI:")
				else:
					print "Error, bad GI format"
				mode -= 1
		if mode == 0 and foundOne:	#About to continue

			#	def __init__(LOCID,coord,GI,length,numExons):
			entries.append(Entry(thisLOCID,matchThis.lstrip("ncRNA").strip(),thisGI,length,numExons))
			result = "%s with length %d and %d exons" % (thisLOCID,length,numExons)
			if(withGIs):
				result = "%s GI: %s" % (result,thisGI)
			if(withCoords):
				result = "%s : %s" % (result,matchThis.lstrip("ncRNA").strip())
			output.append(result)
			foundOne = False


if(justCount):
	print count
output.sort()

with open('ncRNAout', 'w') as outFile :
	outFile.truncate()
	for a in output:
		print>>outFile,a



print len(output)





entries.sort(key=lambda entry: entry.LOCID)
#def __init__(self,LOCID,coord,GI,length,numExons):
with open('processThese','w') as processThese:
	if onlyLongest:
		if(len(entries) > 0):
		        longestFound = entries[0]
		for i in xrange(0,len(entries)):
		        #print str(entries[i].LOCID) + " " + str(entries[i].GI) + " " + str(entries[i].length) + " Best: " + str(longestFound.GI)
		        if(entries[i].LOCID == longestFound.LOCID):
		                if(entries[i].length > longestFound.length):
		                        longestFound = entries[i]
		        else:		
		                if(not os.path.isfile(os.path.dirname(os.path.realpath(__file__)) + "/data/" + longestFound.GI + ".xml.gz")):
		                        getBLAST(longestFound.GI)
				processThese.write(longestFound.GI + '\n')
		                longestFound = entries[i]
		if(longestFound.GI != None):
		        if(not os.path.isfile(os.path.dirname(os.path.realpath(__file__)) + "/data/" + longestFound.GI + ".xml.gz")):
		                getBLAST(longestFound.GI)
			processThese.write(longestFound.GI + '\n')

	if onlyShortest:
		if(len(entries) > 0):
		        shortestFound = entries[0]
		for i in xrange(0,len(entries)):
		        #print str(entries[i].LOCID) + " " + str(entries[i].GI) + " " + str(entries[i].length) + " Best: " + str(shortestFound.GI)
		        if(entries[i].LOCID == shortestFound.LOCID):
		                if(entries[i].length < shortestFound.length):
		                        shortestFound = entries[i]
		        else:		
		                if(not os.path.isfile(os.path.dirname(os.path.realpath(__file__)) + "/data/" + shortestFound.GI + ".xml.gz")):
		                        getBLAST(shortestFound.GI)
				processThese.write(shortestFound.GI + '\n')
		                shortestFound = entries[i]
		if(shortestFound.GI != None):
		        if(not os.path.isfile(os.path.dirname(os.path.realpath(__file__)) + "/data/" + shortestFound.GI + ".xml.gz")):
		                getBLAST(shortestFound.GI)
			processThese.write(longestFound.GI + '\n')
Browse the archive

https://github.com/agardb/NCBI-RGA