https://github.com/jhbadger/APIS
Revision cedb1bbca42dd06094db03ac0b517a889d4b8b4c authored by Jonathan Badger on 17 August 2011, 19:32:14 UTC, committed by Jonathan Badger on 17 August 2011, 19:32:14 UTC
1 parent 67d3efe
Tip revision: cedb1bbca42dd06094db03ac0b517a889d4b8b4c authored by Jonathan Badger on 17 August 2011, 19:32:14 UTC
remove apisRun, _input from dataset name on grid
remove apisRun, _input from dataset name on grid
Tip revision: cedb1bb
DBwrapper.rb
#db wrapper
require 'ZFile'
# Exception raised when dataset not found
class DatasetNotFoundError < RuntimeError
end
class DBwrapper
def close
@connection.close if @connection
@connected = false
end
def query(sql)
errors = 0
begin
connect if (!@connected)
@query = @connection.query(sql)
rescue
10.times {|errors|
if (errors < 10)
STDERR.printf("%s\n", sql)
STDERR.printf("Error %s: Reconnecting...\n", $!)
errors += 1
connect
else
STDERR.printf("Giving up after %s\n", $!)
end
}
end
return self
end
def each
@query.each {|row|
yield row
}
end
# return hash of apis db objects (mysql or sqlite) depending on source string
def self.dbsFromSources(sources, user, password)
dbs = Hash.new
sources.each {|source|
if (source =~/.db/) # sqlite
dbs[File.basename(source, ".db")] = SQLite.new(db)
elsif (source =~/::/)
host, dbname = source.split("::")
dbs[dbname] = MyDB.new(host, dbname, user, password)
dbs[dbname].close
else
host = source
db = MyDB.new(host, "", user, password)
db.query("SHOW DATABASES").each {|row|
dbname = row[0]
if (dbname =~/_apis/)
dbs[dbname] = MyDB.new(host, dbname, user, password)
dbs[dbname].close
end
}
db.close
end
}
return dbs
end
# return hash of human readable dataset names from metadata table where it exists
def self.populateMetaName(dbs)
metaName = Hash.new
dbs.keys.each do |dbname|
metaName[dbname] = Hash.new
table = dbs[dbname].get("SHOW TABLES WHERE tables_in_#{dbname} = 'metadata'").to_s
if (table != "")
dbs[dbname].query("SELECT dataset, value FROM metadata WHERE prop='name'").each do |row|
metaName[dbname][row.first] = row.last
end
dbs[dbname].query("SELECT dataset, value FROM metadata WHERE prop='location'").each do |row|
metaName[dbname][row.first] += " " + row.last
end
end
end
return metaName
end
def count(condition)
connect if (!@connected)
query = "SELECT count(*) FROM #{condition}"
return query(query).each {|row|
return row[0].to_i
}
end
# returns true if exists one or more records matching condition
def exists?(condition)
num = count(condition)
if (num > 0)
return true
else
return false
end
end
# Insert one or more records into table
def insert(table, records)
return if records.empty? || records.first.empty?
connect if (!@connected)
val = ""
records.each {|record|
val << "("
record.each {|value|
val << "'" << value.to_s.tr("'","") << "'"
val << ","
}
val.chop!
val << "),"
}
val.chop!
query("INSERT INTO #{table} VALUES #{val}")
end
def selectFirst(query)
data = []
query(query + " limit 1").each {|row|
data = row
}
return data
end
def blastdb
if (!@blastdb)
query("select * from #{@combodb}.apisdbs").each {|row|
@blastdb = row.first
}
if (@blastdb.nil?)
STDERR.printf("Error: looks like there isn't a current blastdb for %s\n", db)
exit(1)
end
end
return @blastdb
end
def tax
if (!@tax)
@tax = Hash.new
query("select name, species, strain, taxonomy, supergroup, form from #{@combodb}.contigs").each {|row|
name, species, strain, taxonomy, supergroup, form = row
@tax[name] = Hash.new
@tax[name]["species"] = species
@tax[name]["strain"] = strain
if (form == "Mitochondria")
taxonomy = "Bacteria; Proteobacteria; Alphaproteobacteria; Rickettsiales; Rickettsiaceae; Rickettsieae; Mitochondrion;"
elsif (form == "Plastid")
taxonomy = "Bacteria; Cyanobacteria; Prochlorophytes; Prochlorococcaceae; Chloroplast; Chloroplast; Chloroplast;"
end
@tax[name]["taxonomy"] = taxonomy
@tax[name]["format"] = form
if (!supergroup.nil? && supergroup.length > 3)
@tax[name]["supergroup"] = supergroup
end
}
end
return @tax
end
def fetchProtID(id)
fetched = false
while(!fetched)
begin
query("select proteins.name, annotation, species, proteins.seq from #{@combodb}.contigs, #{@combodb}.proteins where contig_name = contigs.name and proteins.name = '#{id}'").each {|row|
name, annotation, species, seq = row
fetched = true
seq.gsub!("*","")
header = "#{name} #{annotation} {#{species}}"
return ">#{header}\n#{seq.gsub(Regexp.new(".{1,60}"), "\\0\n")}", seq.length
}
return nil
rescue
connect
end
end
end
def fetchTranID(id)
fetched = false
while(!fetched)
begin
query("select transcripts.name, proteins.annotation, species, transcripts.seq from #{@combodb}.contigs, #{@combodb}.transcripts, #{@combodb}.proteins where transcripts.contig_name = contigs.name and proteins.name = transcripts.name and transcripts.name = '#{id}'").each {|row|
name, annotation, species, seq = row
fetched = true
seq.gsub!("*","")
header = "#{name} #{annotation} {#{species}}"
return ">#{header}\n#{seq.gsub(Regexp.new(".{1,60}"), "\\0\n")}", seq.length
}
return nil
rescue
connect
end
end
end
def fetchContigSeq(id)
fetched = false
while(!fetched)
begin
query("select name, species, form, seq from #{@combodb}.contigs where name = '#{id}'").each {|row|
name, species, form, seq = row
fetched = true
seq.gsub!("*","")
header = "#{name} [#{form}] {#{species}}"
return ">#{header}\n#{seq.gsub(Regexp.new(".{1,60}"), "\\0\n")}"
}
return nil
rescue
connect
end
end
end
def fetchrRNA(id)
fetched = false
while(!fetched)
begin
q = "SELECT rrnas.name, annotation, species, rrnas.seq FROM "
q += "#{@combodb}.contigs, #{@combodb}.rrnas WHERE "
q += "contig_name = contigs.name AND rrnas.name = '#{id}'"
p q
query(q).each {|row|
name, species, form, seq = row
fetched = true
seq.gsub!("*","")
header = "#{name} #{annotation} {#{species}}"
return ">#{header}\n#{seq.gsub(Regexp.new(".{1,60}"), "\\0\n")}"
}
return nil
rescue
connect
end
end
end
def fetchFunction(id)
get("select annotation from #{@combodb}.proteins where proteins.name = '#{id}'")[0]
end
# returns best BLAST hit for a given protein in a dataset
def fetchTopBlastHit(name, dataset)
query = "SELECT subject_name, score FROM blast "
query += "WHERE dataset = \"#{dataset}\" AND seq_name = \"#{name}\""
query += " ORDER BY score DESC LIMIT 1"
sname, score = get(query)
return sname
end
# returns best BLAST hit for a given protein in a dataset
def fetchTree(name, dataset)
query = "SELECT tree FROM tree "
query += "WHERE dataset = \"#{dataset}\" AND seq_name = \"#{name}\""
get(query)
end
# returns APIS annotation for a given protein in a dataset
def fetchAPISAnnotation(name, dataset)
query = "SELECT annotation FROM annotation "
query += "WHERE dataset = \"#{dataset}\" AND seq_name = \"#{name}\""
query += " AND source=\"APIS\""
get(query)
end
# returns taxon id, species, taxonomy
def fetchSpeciesInfo(name)
get("SELECT t2.taxon_id, t2.species, t2.taxonomy FROM proteins t1 INNER JOIN " +
"contigs t2 ON t1.contig_name = t2.name WHERE t1.name = '#{name}'")
end
# fetches swiss prot name and definition
def fetchSwiss(name)
get("SELECT t1.hit, t2.definition FROM blast t1 INNER JOIN swiss t2 ON t1.hit = t2.swiss " +
"WHERE t1.source = 'phylodb' AND t1.target = 'swiss' AND t1.blast = 'blastp' AND " +
"t1.query = '#{name}'")
end
# fetches Kegg ortholog name and definition
def fetchKegg(name)
pathway = nil
ko, rest = get("SELECT t2.ko FROM blast t1 INNER JOIN kegg t2 ON t1.hit = t2.kegg_seq_id WHERE t1.source = 'phylodb'
AND t1.target = 'kegg' AND t1.blast = 'blastp' AND t2.ko IS NOT NULL AND t1.query = '#{name}'")
if (ko)
pathway, rest = get("SELECT pathway FROM pathways WHERE ko = '#{ko}'")
end
return [ko, pathway]
end
# fetches Cluster info
def fetchClusters(name)
get("SELECT gos_cluster_id, phytax_cluster_id, plant, animal, pfam, tigrfams FROM map WHERE " +
"phylodb_seq_id = '#{name}'")
end
# fetches go from pfam
def fetchPfam2Go(pfam)
return "" if pfam.nil?
pfam = $1 if (pfam =~/(.*)\.\d+/) # no decimals
go = ""
query("SELECT go FROM pfam2go WHERE pfam = '#{pfam}'").each do |row|
go += ";" if (go != "")
go += row[0]
end
return go
end
# fetches ann for pfam
def fetchPfamAnn(pfam)
get("SELECT definition FROM pfam WHERE pfam='#{pfam}'")
end
# fetches ann for tigrfam
def fetchTigrfamAnn(tigrfam)
get("SELECT definition FROM tigrfams WHERE tigrfams='#{tigrfam}'")
end
# fetches go from tigrfam
def fetchTigrFam2Go(tigrfam)
return "" if tigrfam.nil?
go = ""
query("SELECT go FROM tigrfams2go WHERE tigrfams = '#{tigrfam}'").each do |row|
go += ";" if (go != "")
go += row[0]
end
return go
end
def fetchAnnotation(name)
get("SELECT annotation FROM proteins WHERE name = \"#{name}\"")
end
# returns taxonomy for phylodb (combodb, etc.) protein
def fetchTaxonomy(name)
id, contig = name.split("-")
get("SELECT taxonomy FROM contigs WHERE name = \"#{contig}\"")
end
# return the annotation of the closest sequence to id on tree
def findClosestFunction(tree, id)
begin
tree.relatives(id).each {|match|
acc, contig = match.first.split("-")
contig, rest = contig.split("__")
match_id = acc + "-" + contig
function = fetchFunction(match_id)
if (!function.nil? && function.split(" ").size > 1 &&
function !~/unnamed/ && function !~/unknown/ &&
function !~/numExons/ && function !~/^\{/)
return function
end
}
return false
rescue
return false
end
end
# delete contig(s) matching condition and all associated records
def deleteContig(condition)
query("select name, species from #{@combodb}.contigs where #{condition}").each {|row|
name, species = row
STDERR.printf("Deleting %s (%s)...\n", name, species)
query("delete from proteins where contig_name = '#{name}'")
query("delete from transcripts where contig_name = '#{name}'")
query("delete from geneorders where contig_name = '#{name}'")
query("delete from rrnas where contig_name = '#{name}'")
query("delete from contigs where name = '#{name}'")
}
end
# build up taxonomy line from taxonomy table given taxid
def buildTaxFromTaxId(taxid, string = false, verbose = false)
levels = ["kingdom", "phylum", "class", "order", "family",
"genus", "species"]
name = ""
tax = [""]*7
while (name != "root")
query = "select parent_id, name, rank from taxonomy WHERE tax_id = #{taxid}"
pid, name, rank = selectFirst(query)
STDERR.printf("%d\t%d\t%s\t%s\n", taxid, pid, name, rank) #if verbose
return nil if pid.nil?
pos = levels.index(rank)
if (pos.nil?)
pos = 0 if name == "Viruses" || name == "Viroids"
pos = 1 if name =~ /viruses/
end
tax[pos] = name.tr(",()[]'\"/","") if (pos)
taxid = pid
end
6.step(0, -1) {|i|
if (tax[i] == "")
tax[i] = tax[i + 1].split(" (").first + " (" + levels[i] + ")"
end
}
if (string)
tline = ""
tax.each {|lev|
tline += lev
tline += "; " if lev != tax.last
}
return tline
else
return tax
end
end
# create dataset if it doesn't exist
def createDataset(dataset, owner, date, database, comments = "", group = "",
username = "", password = "")
if (!exists?("dataset where dataset='#{dataset}'"))
insert("dataset", [[dataset, owner, date, database, comments, group,
username, password]])
end
end
# delete dataset(s) matching condition and all associated records
def deleteDataset(condition, keepBlast = false)
query("select dataset from dataset where #{condition}").each {|row|
dataset, rest = row
["alignment", "annotation", "tree", "classification", "blast",
"sequence", "dataset"].each {|tbl|
next if (tbl == "blast" && keepBlast)
STDERR.printf("Deleting data in %s for %s...\n", tbl, dataset)
query("delete from #{tbl} where dataset = '#{dataset}'")
}
}
end
# load peptides from file
def loadPeptides(prot, dataset, include = false)
if (!exists?("sequence where dataset='#{dataset}'") || include)
STDERR.printf("Loading Peptides from %s...\n", prot)
else
return
end
seqs = []
inputs = []
count = 0
Bio::FlatFile.new(Bio::FastaFormat, ZFile.new(prot)).each {|seq|
id = seq.entry_id.gsub("(","").gsub(")","")
seqs.push([id, dataset, seq.seq, 0])
count += 1
name, rest = seq.definition.split(" ", 2)
if (!rest.nil? && rest.length > 3)
inputs.push([id, dataset, rest.strip, 'input'])
end
if (count % 10000 == 0)
insert("sequence", seqs)
insert("annotation", inputs)
seqs = []
inputs = []
STDERR.printf("Loaded sequence %d...\n", count)
end
}
if (seqs.size > 0)
insert("sequence", seqs)
insert("annotation", inputs)
end
end
# return processed state of peptide
def processed?(seq_name, dataset)
query("select processed from sequence where seq_name = '#{seq_name}' AND dataset='#{dataset}'").each {|row|
return true if row[0] == "1"
}
return false
end
# set processed state for peptide
def setProcessed(seq_name, dataset)
query("update sequence set processed=1 where seq_name = '#{seq_name}' AND dataset='#{dataset}'")
end
# unset processed state for peptide
def setUnProcessed(seq_name, dataset)
query("update sequence set processed=0 where seq_name = '#{seq_name}' AND dataset='#{dataset}'")
end
# return blast info for given seq_name + dataset and evalue
def fetchBlast(seq_name, dataset, evalue, maxTree, tax)
homologs = []
query("select subject_name, subject_length, evalue from blast where seq_name='#{seq_name}' and dataset='#{dataset}' and evalue <= #{evalue} order by evalue").each {|row|
homologs.push(row[0]) if (homologs.size < maxTree && !homologs.include?(row[0]) && row[1].to_i < 2000)
}
return homologs
end
# delete blast records for seq with evalue > threshold
def deleteBlast(seq_name, dataset, evalue)
query("delete from blast where seq_name='#{seq_name}' and dataset='#{dataset}' and evalue > #{evalue}")
end
# stores alignment, deleting previous if there
def createAlignment(seq_name, dataset, afa)
query("delete from alignment where seq_name = '#{seq_name}' and dataset = '#{dataset}'")
lines = []
Bio::FlatFile.new(Bio::FastaFormat, File.new(afa)).each {|aseq|
lines.push([seq_name, dataset, aseq.entry_id,
aseq.definition.split(" ", 2).last, aseq.seq])
}
insert("alignment", lines)
end
# returns dataset for peptide, with optional list of possible datasets
def getDataset(seq_name, datasets = nil)
if (datasets.nil?)
query("select dataset from dataset").each {|dataset|
datasets.push(dataset[0])
}
end
datasets.each {|dataset|
if (exists?("sequence where dataset='#{dataset}' and seq_name='#{seq_name}'"))
return dataset
end
}
raise DatasetNotFoundError, "Cannot find dataset for #{seq_name}"
end
# stores tree
def createTree(seq_name, dataset, tree)
if (exists?("tree where seq_name='#{seq_name}' AND dataset = '#{dataset}'"))
query("UPDATE tree SET tree = '#{tree}' WHERE seq_name='#{seq_name}' AND dataset = '#{dataset}'")
else
insert("tree", [[seq_name, dataset, tree]])
end
end
# interprets tree, creating classification
def createClassification(tree, name, dataset, exclude, ruleMaj)
cons = consensusTax(tree, name, ruleMaj)
lines = []
cons.each {|line|
lines.push(line) if (line.grep(/#{exclude}/).empty? || exclude.nil?)
}
first = lines[0]
first=[nil,nil,nil,nil,nil,nil,nil] if first.nil?
if (lines[1].nil?)
second = nil
else
second = lines[1]
end
mixed = false
classification = [name, dataset]
7.times {|level|
mixed = true if first[level] == "Mixed"
first[level] = "Mixed" if mixed
if (first[level] == "Mixed" || second.nil? || first[level] == second[level])
outgroup = 0
else
outgroup = 1
end
first[level] = "Undefined" if first[level].nil?
classification.push(first[level][0..45])
classification.push(outgroup)
}
return classification
end
# creates phylogenomic annotation
def createAnnotation(tree, seq_name, dataset)
function = findClosestFunction(tree, seq_name)
if (function)
return [seq_name, dataset, function.strip, "APIS"]
else
return nil
end
end
# returns array of consensus taxonomy at each relative level of tree
def consensusTax(tree, taxon, ruleMaj)
consensus = []
return [] if (tree.relatives(taxon).nil?)
tree.relatives(taxon).each {|list|
counts = []
list.each {|relative|
acc, contig = relative.split("-")
contig, rest = contig.split("__")
next if (tax[contig].nil? || tax[contig]["species"].nil?)
groups = tax[contig]["taxonomy"].split("; ")
groups.size.times {|i|
counts[i] = Hash.new if counts[i].nil?
counts[i][groups[i]] = 0 if counts[i][groups[i]].nil?
counts[i][groups[i]] += 1
}
}
if (ruleMaj)
consensus.push(counts.majority)
else
consensus.push(counts.absolute)
end
}
return consensus
end
end
class MyDB < DBwrapper
def initialize(host, dbname, user, password, combodb = nil)
require "mysql"
@host = host
@db = dbname
@user = user
@password = password
@connected = false
if (combodb.nil?)
@combodb = dbname
else
@combodb = combodb
connect
end
end
def connect
@connected = false
while(!@connected)
begin
@connection = Mysql::new(@host, @user, @password, @db)
@connected = true
rescue
#STDERR.printf("%s: waiting...\n", $!)
sleep 0.2
end
end
end
def get(query)
connect if !@connected
return @connection.query(query).fetch_row
end
# creates and inserts classification row
def createClassification(tree, name, dataset, exclude, ruleMaj)
classification = super(tree, name, dataset, exclude, ruleMaj)
where = "WHERE seq_name='#{name}' AND dataset = '#{dataset}'"
old = []
query("SELECT * FROM classification #{where}").each {|row|
old = row
}
if (!old.empty?)
cl = classification.dup
name, dataset = cl.shift, cl.shift
name, dataset = old.shift, old.shift
ranks = ["kingdom", "phylum", "class", "ord", "family", "genus",
"species"]
ranks.each {|rank|
val = cl.shift
out = cl.shift
oval = old.shift
oout = old.shift.to_i
if (val != oval)
query("UPDATE classification SET #{rank} = '#{val}' #{where}")
end
if (out != oout)
query("UPDATE classification SET #{rank}_outgroup = #{out} #{where}")
end
}
else
insert("classification", [classification])
end
end
# creates and inserts annotation string
def createAnnotation(tree, seq_name, dataset)
function = super(tree, seq_name, dataset)
if (function)
where = "WHERE seq_name='#{seq_name}' AND dataset = '#{dataset}' AND source = 'APIS'"
if (exists?("annotation #{where}"))
query("UPDATE annotation SET annotation = '#{function[-2]}' #{where}")
else
insert("annotation", [function])
end
end
end
end
class SQLite < DBwrapper
class NoSQLiteDB < RuntimeError
end
def initialize(file, mode = "r")
require 'sqlite3'
if (File.exist?(file))
@db = file
elsif (mode == "r")
raise(NoSQLiteDB, "No such database as #{file}!")
end
connect
end
def connect
@connection = SQLite3::Database.new(@db)
@connected = true
end
# Insert one or more records into table, overridden because SQLite doesn't
# allow multiple inserts
def insert(table, records)
records.each {|record|
super(table, [record])
}
end
def count(condition)
return @connection.get_first_value("select count(*) from " + condition).to_i
end
# return first row of result immediately
def get(query)
return @connection.get_first_row(query)
end
# returns array of consensus taxonomy at each relative level of tree
def consensusTax(tree, taxon, ruleMaj)
consensus = []
return [] if (tree.relatives(taxon).nil?)
tree.relatives(taxon).each {|list|
counts = []
list.each {|relative|
query = "SELECT taxonomy FROM apis_proteins "
query += "WHERE protein_id=#{relative}"
groups = @connection.get_first_value(query).split("; ")
groups.size.times {|i|
counts[i] = Hash.new if counts[i].nil?
counts[i][groups[i]] = 0 if counts[i][groups[i]].nil?
counts[i][groups[i]] += 1
}
}
if (ruleMaj)
consensus.push(counts.majority)
else
consensus.push(counts.absolute)
end
}
return consensus
end
# returns function of protein based on either protein id or name
def fetchFunction(id)
if (id =~/[^0-9]/)
where = "protein_name = '#{id}'"
else
where = "protein_id = #{id}"
end
query = "SELECT annotation FROM apis_proteins WHERE #{where}"
return @connection.get_first_value(query)
end
def findClosestFunction(tree, id)
begin
tree.relatives(id).each {|match|
function = fetchFunction(match.first)
if (!function.nil? && function.split(" ").size > 1 &&
function !~/unnamed/ && function !~/unknown/ &&
function !~/numExons/ && function !~/^\{/)
return function
end
}
return false
rescue
return false
end
end
# creates a version of the tree with protein names & species
def SpeciesTree(tree)
ali = Hash.new
tree.taxa.each {|taxon|
if (taxon !~/[^0-9]/)
query = "SELECT protein_name, taxonomy FROM apis_proteins WHERE "
query += "protein_id = #{taxon}"
name, taxonomy = @connection.get_first_row(query)
if (name)
species = taxonomy.split("; ").last.gsub(" ","_")
ali[taxon] = name + "__" + species
end
end
}
return tree.unAlias(ali)
end
end
class Array
# return majority consensus for counts array
def majority
consensus = []
size.times do |i|
total = 0.0
self[i].values.each{|val|total+=val}
name = self[i].keys.sort {|x,y| self[i][y] <=> self[i][x]}.first
if (self[i][name]/total > 0.5)
consensus[i] = name
else
consensus[i] = "Mixed"
end
end
return consensus
end
# return absolute consensus for counts array
def absolute
consensus = []
size.times do |i|
if (self[i].size == 1)
consensus.push(self[i].keys.first)
else
consensus.push("Mixed")
end
end
return consensus
end
def mostCommon
count = Hash.new
self.each do |el|
if (count[el].nil?)
count[el] = 1
else
count[el] += 1
end
end
sorted = count.keys.sort {|a, b| count[b] <=> count[a]}
return sorted[0]
end
end
Computing file changes ...