Revision 702db34e6b9253c68b9af52e0c694d83773c7e5c authored by Eddie Belter on 02 October 2015, 21:39:32 UTC, committed by Eddie Belter on 08 October 2015, 21:45:35 UTC
1 parent 7f14f1e
KEGGScan.pm
package PAP::Command::KEGGScan;
use strict;
use warnings;
use PAP;
use Carp 'confess';
use Bio::Annotation::DBLink;
use Bio::Seq;
use Bio::SeqIO;
use Bio::SeqFeature::Generic;
use Compress::Bzip2;
use English;
use File::Basename;
use File::chdir;
use File::Temp;
use IO::File;
use IPC::Run;
use Cwd;
use File::Path qw(remove_tree);
use DateTime;
class PAP::Command::KEGGScan {
is => 'PAP::Command',
has => [
fasta_file => {
is => 'Path',
doc => 'fasta file name',
is_input => 1,
},
],
has_optional => [
report_save_dir => {
is => 'Path',
doc => 'directory to save a copy of the raw output to',
is_input => 1,
},
bio_seq_feature => {
is => 'ARRAY',
doc => 'array of Bio::Seq::Feature',
is_output => 1,
},
keggscan_top_output => {
is => 'SCALAR',
doc => 'instance of IO::File pointing to raw KEGGscan output',
},
keggscan_full_output => {
is => 'SCALAR',
doc => 'instance of IO::File pointing to raw KEGGscan output',
},
# These parameters are used when scheduling blast jobs, do NOT affect this lsf requirements
# for this module are any modules it calls!
blast_lsf_queue => {
is => 'Text',
default_value => 'long',
},
blast_lsf_resource => {
is => 'Text',
default_value => 'select[mem>4096] rusage[mem=4096]',
},
blast_lsf_max_memory => {
is => 'Number',
default => '4096000',
},
blast_lsf_job_limit => {
is => 'Number',
default => 50,
doc => 'Maximum number of BLAST LSF jobs allowed to run at the same time',
},
fasta_chunk_size => {
is => 'Number',
default => 50,
doc => 'Maximum number of sequences allowed in a fasta chunk',
},
_working_directory => {
is => 'Path',
doc => 'analysis program working directory',
},
version => {
is => 'String',
valid_values => ['50', '52', '56','61'],
default => '56',
is_input => 1,
is_optional => 1,
},
tar => {
is => 'Boolean',
is_input => 1,
default => 1,
doc => 'If set, tar/bz2 PAP_keggscan_* directory',
},
],
# These parameters tell workflow the requirements needed for this module
has_param => [
lsf_resource => {
default => "-R 'select[mem=8192,type==LINUX64] rusage[mem=8192,tmp=1024]' -M 8192000",
},
lsf_queue => {
default => 'long',
},
],
};
sub help_brief {
"Run KEGGscan";
}
sub help_synopsis {
return <<"EOS"
EOS
}
sub help_detail {
return <<"EOS"
Need documenation here.
EOS
}
sub execute {
my $self = shift;
unless($self->version) {
$self->version($self->__meta__->property(property_name => 'version')->default_value);
}
my $keggscan_faa_path = "/gscmnt/temp212/info/annotation/KEGG/Version_". $self->version . "/genes.v" . $self->version .".faa";
# Status messages are not displayed by default...
$ENV{UR_COMMAND_DUMP_STATUS_MESSAGES} = 1;
# FIXME This directory needs to be a param with a default value
my ($kegg_stdout, $kegg_stderr);
$self->_working_directory(
File::Temp::tempdir(
'PAP_keggscan_XXXXXXXX',
DIR => '/gscmnt/temp212/info/annotation/PAP_tmp',
CLEANUP => 0,
)
);
chmod(0777, $self->_working_directory);
my $fasta_file = $self->fasta_file();
## We're about to screw with the current working directory.
## Thusly, we must fixup the fasta_file property if it
## contains a relative path.
#TODO bdericks: Is this necessary?
unless ($fasta_file =~ /^\//) {
$fasta_file = join('/', $CWD, $fasta_file);
$self->fasta_file($fasta_file);
}
# FIXME The subject fasta path needs to be a param on this command object with a default value
my ($fasta_name) = basename($self->fasta_file);
my $kegg_command = PAP::Command::KEGGScan::RunKeggScan->create(
species_name => "default",
query_fasta_path => $self->fasta_file,
subject_fasta_path => $keggscan_faa_path,
output_directory => $self->_working_directory . "/KS-OUTPUT." . $fasta_name,
blast_lsf_queue => $self->blast_lsf_queue,
blast_lsf_resource => $self->blast_lsf_resource,
blast_lsf_max_memory => $self->blast_lsf_max_memory,
fasta_chunk_size => $self->fasta_chunk_size,
blast_lsf_job_limit => $self->blast_lsf_job_limit,
);
my $rv = $kegg_command->execute;
unless ($rv) {
$self->error_message("Problem running keggscan!");
confess;
}
$self->parse_result();
my $top_output_fh = $self->keggscan_top_output();
my $full_output_fh = $self->keggscan_full_output();
## Be Kind, Rewind. Somebody will surely assume we've done this,
## so let's not surprise them.
# TODO bdericks: Is this necessary?
$top_output_fh->seek(0, SEEK_SET);
$full_output_fh->seek(0, SEEK_SET);
$self->archive_result();
## Second verse, same as the first.
$top_output_fh->seek(0, SEEK_SET);
$full_output_fh->seek(0, SEEK_SET);
# Set the status messages back to default behavior...
$ENV{UR_COMMAND_DUMP_STATUS_MESSAGES} = 0;
return 1;
}
sub parse_result {
my $self = shift;
my $top_output_fh = IO::File->new();
my $full_output_fh = IO::File->new();
my $top_output_fn = join('.', 'KS-OUTPUT', File::Basename::basename($self->fasta_file()));
$top_output_fn = join('/', $self->_working_directory(), $top_output_fn, 'REPORT-top.ks');
$top_output_fh->open("$top_output_fn") or die "Can't open '$top_output_fn': $OS_ERROR";
$self->keggscan_top_output($top_output_fh);
my $full_output_fn = join('.', 'KS-OUTPUT', File::Basename::basename($self->fasta_file()));
$full_output_fn = join('/', $self->_working_directory(), $full_output_fn, 'REPORT-full.ks');
$full_output_fh->open("$full_output_fn") or die "Can't open '$full_output_fn': $OS_ERROR";
$self->keggscan_full_output($full_output_fh);
my @features = ( );
LINE: while (my $line = <$top_output_fh>) {
chomp $line;
my @fields = split /\t/, $line;
my (
$ec_number,
$gene_name,
$hit_name,
$e_value,
$description,
$orthology
) = @fields[1,2,3,4,6,8];
## Prat filtered out lines with e-value >= 0.01 when
## generating AceDB .ace files.
if ($e_value >= 0.01) { next LINE; }
## The values in the third column should be in this form:
## gene_name (N letters; record M).
($gene_name) = split /\s+/, $gene_name;
unless ($main::locus_name) {
$self->status_message("gene_name: ". $gene_name);
($main::locus_name) = split(/_/, $gene_name);
$self->status_message("locus_name: ". $main::locus_name);
}
## Some descriptions have EC numbers embedded in them.
## Prat's removed them.
$description =~ s/\(EC .+\)//;
my $feature = new Bio::SeqFeature::Generic(-display_name => $gene_name);
$feature->add_tag_value('kegg_evalue', $e_value);
$feature->add_tag_value('kegg_description', $description);
my $gene_dblink = Bio::Annotation::DBLink->new(
-database => 'KEGG',
-primary_id => $hit_name,
);
$feature->annotation->add_Annotation('dblink', $gene_dblink);
## Sometimes there is no orthology identifier (value is literally 'none').
## It is not unforseeable that it might also be blank/undefined.
if (defined($orthology) && ($orthology ne 'none')) {
my $orthology_dblink = Bio::Annotation::DBLink->new(
-database => 'KEGG',
-primary_id => $orthology,
);
$feature->annotation->add_Annotation('dblink', $orthology_dblink);
}
push @features, $feature;
}
if ($self->report_save_dir) {
require Data::Dumper;
my $dump_file = join('/', $self->report_save_dir, 'dump_file.out');
my $fh = IO::File->new($dump_file, 'w');
die "Could not get file handle for $dump_file: $!" unless $fh;
$fh->print(Data::Dumper::Dumper(\@features) . "\n");
}
$self->bio_seq_feature( \@features );
return;
}
sub archive_result {
my $self = shift;
my $report_save_dir = $self->report_save_dir();
my $tar_bz2 = $self->tar;
if (defined($report_save_dir)) {
unless (-d $report_save_dir) {
die "does not exist or is not a directory: '$report_save_dir'";
}
my $top_output_handle = $self->keggscan_top_output();
my $full_output_handle = $self->keggscan_full_output();
my $top_target_file = File::Spec->catfile($report_save_dir, 'REPORT-top.ks.bz2');
my $full_target_file = File::Spec->catfile($report_save_dir, 'REPORT-full.ks.bz2');
my $top_bz_file = bzopen($top_target_file, 'w') or
die "Cannot open '$top_target_file': $bzerrno";
while (my $line = <$top_output_handle>) {
$top_bz_file->bzwrite($line) or die "error writing: $bzerrno";
}
$top_bz_file->bzclose();
my $full_bz_file = bzopen($full_target_file, 'w') or
die "Cannot open '$full_target_file': $bzerrno";
while (my $line = <$full_output_handle>) {
$full_bz_file->bzwrite($line) or die "error writing: $bzerrno";
}
$full_bz_file->bzclose();
}
## We will come here to tar ball PAP_keggscan_* directory
if (defined($tar_bz2)) {
my $parent_dir = $self->_working_directory."/..";
chdir($parent_dir) || confess "Unable to change directory to $parent_dir: $?";
$self->status_message("Cwd: ". getcwd);
my $user = $ENV{USER};
my $now = DateTime->now(time_zone => 'America/Chicago');
my $date_code = $now->ymd('_');
# my $tar_file_name = File::Basename::basename($self->fasta_file()). "-". $user. "-". $date_code. ".tar.bz2";
my $tar_file_name = $main::locus_name. "-". $user. "-". $date_code. ".tar.bz2";
my $tar_dir = $self->_working_directory;
my $tar_bz_cmd = "tar -C $tar_dir -jcvf $tar_file_name .";
$self->status_message("Cmd: ". $tar_bz_cmd);
system($tar_bz_cmd) == 0
|| confess "system $tar_bz_cmd failed: $?";
$self->status_message("Removing PAP_tmp dir: ". $tar_dir);
remove_tree($tar_dir) || $self->status_message("Error removing $tar_dir: ". $?);
}
return 1;
}
1;
Computing file changes ...