https://github.com/jhbadger/APIS
Revision cedb1bbca42dd06094db03ac0b517a889d4b8b4c authored by Jonathan Badger on 17 August 2011, 19:32:14 UTC, committed by Jonathan Badger on 17 August 2011, 19:32:14 UTC
1 parent 67d3efe
Tip revision: cedb1bbca42dd06094db03ac0b517a889d4b8b4c authored by Jonathan Badger on 17 August 2011, 19:32:14 UTC
remove apisRun, _input from dataset name on grid
remove apisRun, _input from dataset name on grid
Tip revision: cedb1bb
parseGenBank2Combo
#!/usr/bin/env ruby
Encoding.default_external="ASCII-8BIT"
require 'optparse'
require 'bio'
require 'ostruct'
require 'ZFile'
opt = OpenStruct.new
opt.sep = false
opt.tax = false
ARGV.options do |opts|
opts.banner << " gbk-file [..gbk-file...]"
opts.on("-s", "--separate",
"output in separate files for each accession (#{opt.sep})") {|t| opt.sep = t}
opts.on("-t", "--taxonomy",
"use taxonomy from combodb database, not file (#{opt.tax})") {|t| opt.tax = t}
begin
opts.parse!
rescue
STDERR.puts $!.message
STDERR.puts opts
exit(1)
end
if (ARGV.size < 1)
STDERR.puts opts
exit(1)
end
end
# classify type of molecule given definition string
def classifyMolecule(definition)
if (definition =~ /plasmid/i)
return "Plasmid"
elsif (definition =~ /mitochon/i)
return "Mitochondria"
elsif (definition =~ /plastid|chloroplast/i)
return "Plastid"
elsif (definition =~ /chromosome\s*([^\s,]+)/i )
return "Chromosome " + $1
else
return "Chromosome"
end
end
# return sequence from genbank range
def getSequence(seq, locations)
subseq = ""
locations.each do |loc|
if (loc.strand == 1)
subseq += seq[(loc.from - 1)..(loc.to - 1)]
elsif (loc.strand == -1)
subseq += Bio::Sequence::NA.new(seq[(loc.from - 1)..(loc.to - 1)]).complement.seq
else
STDERR.printf("Error: I don't know how to handle #{locations}\n")
exit(1)
end
end
return subseq
end
pep, cds, rna, trna, ent, con = nil
if (!opt.sep)
pep = File.new("all.pep", "w")
cds = File.new("all.cds", "w")
rna = File.new("all.rRNA", "w")
trna = File.new("all.tRNA", "w")
ent = File.new("all.ent", "w")
att = File.new("all.att", "w")
con = File.new("all.con", "w")
end
org = ""
ARGV.each do |gb|
Bio::FlatFile.new(Bio::GenBank, ZFile.new(gb)).each do |genome|
organism = genome.organism
if (organism == "")
organism = genome.definition.split(" ")[0] + " " + genome.definition.split(" ")[1]
end
acc = genome.accession || ""
acc = genome.locus.entry_id || "" if acc == ""
if (acc.index("scaffold:")) # EnsEmbl format
acc = acc.gsub("scaffold:","").split(":")[0..1].join("_")
end
orfname = "ORF0000"
rnaname = "RNA0000"
proteins = 0
trnas = 0
rrnas = 0
strain = ""
taxonid = ""
gene_location = nil
gene_strand = nil
product = ""
dna = genome.seq
next if (dna.length == 0) # skip empty records
if (opt.sep)
pep = File.new(acc + ".pep", "w")
cds = File.new(acc + ".cds", "w")
rna = File.new(acc + ".rRNA", "w")
trna = File.new(acc + ".tRNA", "w")
ent = File.new(acc + ".ent", "w")
att = File.new(acc + ".att", "w")
con = File.new(acc + ".con", "w")
end
if (org != organism)
STDERR.printf("Processing %s...\n", organism) if (organism != "")
org = organism
end
# process each CDS, rRNA. tRNA
genome.features.each do |feature|
product = feature["product"].first if (product == "" && feature["product"])
if feature.feature == "source" # contains taxon_id info
feature.qualifiers.each do |qual|
if (qual.value =~/taxon:([0-9]+)/)
taxon_id = $1
end
end
elsif feature.feature == "gene"
feature.qualifiers.each do |qual|
product = qual.value if (qual.qualifier == "note")
end
gene_location = feature.position
if (gene_location =~/([0-9]+)\.\.([0-9]+)/)
gene_location = [$1,$2]
if (feature.position =~/complement/)
gene_strand = "-1"
else
gene_strand = "1"
end
end
elsif feature.feature == "CDS" # proteins and their associated cds
cds_name = nil
translation = nil
feature.qualifiers.each do |qual|
if (qual.value =~/GI:/)
cds_name = qual.value.downcase.gsub(":","")
elsif (!cds_name && qual.qualifier == "protein_id")
cds_name = qual.value
elsif (qual.qualifier == "translation")
translation = qual.value
end
end
seq = getSequence(dna, feature.locations)
if (cds_name.nil?) # give it an orf name if none exist
cds_name = orfname
orfname.succ!
end
if (translation.nil?) # make one if none given
translation = Bio::Sequence::NA.new(seq).translate.seq
end
key = cds_name + "-" + acc
header = key + " " + product.to_s + " {" + organism + "}"
pep.print Bio::Sequence::AA.new(translation).to_fasta(header, 60)
cds.print Bio::Sequence::AA.new(seq).to_fasta(header, 60)
if (gene_location)
location = gene_location
strand = gene_strand
gene_location = nil
else
location = [feature.locations.first.from, feature.locations.first.to]
strand = feature.locations.first.strand
end
ent.print key + "\t" + cds_name + "\t" + location[0].to_s +
"\t" + location[1].to_s + "\t" + strand.to_s + "\t" +
product.to_s + "\n"
proteins += 1
product = ""
elsif feature.feature == "rRNA" || feature.feature == "tRNA" || feature.feature == "misc_RNA"
locus = ""
feature.qualifiers.each do |qual|
locus = qual.value if (qual.qualifier == "locus_tag")
product = qual.value if (qual.qualifier == "note" && product == "")
end
if (locus == "")
locus = rnaname
rnaname.succ!
end
key = locus.to_s + "-" + acc.to_s
header = key + " " + product.to_s + " {" + organism + "}"
seq = getSequence(dna, feature.locations)
if (product =~/rRNA|ribosomal/i || locus =~/rRNA/i)
out = rna
rrnas += 1
else
out = trna
trnas += 1
end
out.print Bio::Sequence::AA.new(seq).to_fasta(header, 60)
ent.print key + "\t" + locus + "\t" + feature.locations.first.from.to_s +
"\t" + feature.locations.first.to.to_s + "\t" +
feature.locations.first.strand.to_s + "\t" + product.to_s + "\n"
product = ""
end
end
if (opt.tax)
require 'dm-core'
require 'ComboDB'
DataMapper.setup(:combodb, "mysql://access:access@mysql-lan-pro/phylodb_annotation")
taxonomy = buildTaxFromTaxId(taxonid, true)
if (taxonomy.nil?)
STDERR.printf("No info for %d\n", taxonid)
end
else
taxonomy = genome.taxonomy.chop
end
att.print acc + "\t" + genome.seq.length.to_s + "\t"
att.print organism + "\t" + strain + "\t" + taxonid + "\t"
att.print taxonomy.gsub(",", "") + "\t" + classifyMolecule(genome.definition) + "\t"
att.print proteins.to_s + "\t" + rrnas.to_s + "\t" + trnas.to_s + "\t"
gc = (dna.count("g") + dna.count("c")) / (1.0 * dna.length)
att.print sprintf("%5.2f%%\n", gc * 100)
header = acc + " " + genome.definition
con.print Bio::Sequence::AA.new(dna).to_fasta(header, 60)
if(opt.sep)
att.close
pep.close
cds.close
rna.close
trna.close
ent.close
con.close
end
end
end
if(!opt.sep)
att.close
pep.close
cds.close
rna.close
trna.close
ent.close
con.close
end
Computing file changes ...