https://github.com/tseemann/prokka
Revision 62ff26f424ed7262502d9feedb6ed6943eb774a7 authored by Torsten Seemann on 05 March 2017, 06:44:36 UTC, committed by Torsten Seemann on 05 March 2017, 06:44:36 UTC
1 parent ef1047c
Raw File
Tip revision: 62ff26f424ed7262502d9feedb6ed6943eb774a7 authored by Torsten Seemann on 05 March 2017, 06:44:36 UTC
Major fix for cleanup_product wrt homolog and N.term (Issue #140)
Tip revision: 62ff26f
prokka-hamap_to_hmm
#!/usr/bin/env perl
use strict;
use warnings;
use Bio::SeqIO;
use Data::Dumper;
use File::Temp qw(tempdir);

my(@Options, $verbose, $datadir, $hypo, $sep, $blank, $pseudo, $minlen);
setOptions();

$datadir or die "please specify HAMAP --datadir";
-d $datadir or die "--datadir $datadir is not a valid folder";

my $dir = tempdir(CLEANUP=>1);
print STDERR "Using $dir as temp folder.\n";

my %DESC;
my $ali_fh = get_file("alignment_id.dat", "alignment /gene /product info");
while (<$ali_fh>) {
  chomp;
  my @x = split m/\t/;
  next if $x[5] =~ m/UPF\d+|homolog|[@<\[]/;
  $x[5] =~ s/^probable\s+//gi;
  $DESC{ $x[0] } = "$sep$x[4]$sep$x[5]";
}
printf STDERR "Accepted descriptions for %d families\n", scalar keys %DESC;

my $fam_fh = get_file("hamap_families.dat", "alignment /EC_number");
my($AC,$EC,$Bac);
while (<$fam_fh>) {
  chomp;
  if (m{^//}) {
    #print STDERR "AC=$AC EC=$EC Bac=$Bac\n";
    if ($AC and $EC and exists $DESC{$AC}) {
      $DESC{$AC} = $EC.$DESC{$AC}; 
    }
    if ($AC and ! $Bac) {
      print STDERR "Removing $AC as non-Bacterial\n";
      delete $DESC{$AC};
    }
    $AC = $EC = $Bac = '';
  }
  elsif (m/^AC\s+(\w+)/) {
    $AC = $1;
  }
  elsif (m/EC=([\d.-]+);/) {
    $EC = $1;
  }
  elsif (m/^ Bacteria/) {
    $Bac = 1;
  }
}
printf STDERR "%d models made it through filtering\n", scalar keys %DESC;

my $aln_file = "$datadir/hamap_seed_alignments.tar.gz";
if (-r $aln_file) {
  print STDERR "Extracting $aln_file.\n";
  system("tar -C $dir --strip-components 1 -zxf $aln_file")==0 or die "ERROR: $?";
  my @msa = <$dir/*.msa>;
  printf STDERR "Found %d MSA files.\n", scalar(@msa);
  for my $msa (@msa) {
    $msa =~ m/(\w+).msa$/ or die "can't extract ID from filename '$msa'";
    my $id = $1;
    my $desc = $DESC{$id};
    if ($desc) {
      print STDERR "Building HMM for $id: $desc\n";
      $desc =~ s{'}{\\'}g;
      system( "hmmbuild -o /dev/null -n $id /dev/stdout $msa | sed \"3i DESC  $desc\" > $dir/$id.hmm" )==0 or die "ERROR: $?";
#      if ($id eq 'MF_00023') {
#        system("head -20 $dir/$id.hmm");
#        last;
#      }
    }
    else {
      print STDERR "Skipping $id - undesirable /product name\n";
    }
  }
}
else {
  print STDERR "Could not read file: $aln_file\n";
  exit 1;
}

my $dest = 'HAMAP.hmm';
print STDERR "Combining into $dest\n";
system("cat $dir/*.hmm > $dest");


print STDERR "Done.\n";

#----------------------------------------------------------------------

sub get_file {
  my($pattern, $desc) = @_;
  my($first) = glob("$datadir/$pattern");
  if ($first and -r $first) {
    print STDERR "Found $desc: $first\n";
    open my $fh, '<', $first;
    return $fh;
  }
  die "Problem finding $pattern in $datadir ($desc)";
}

#----------------------------------------------------------------------
# Option setting routines

sub setOptions {
  use Getopt::Long;

  @Options = (
    {OPT=>"help",      VAR=>\&usage,                   DESC=>"This help"},
    {OPT=>"verbose!",  VAR=>\$verbose, DEFAULT=>0,     DESC=>"Verbose progress"},
    {OPT=>"datadir=s", VAR=>\$datadir, DEFAULT=>'',    DESC=>"Path to downloaded HAMAP folder (ftp://ftp.expasy.org/databases/hamap/)" },
    {OPT=>"sep=s",     VAR=>\$sep,     DEFAULT=>'~~~', DESC=>"Separator between EC/gene/product" },
    {OPT=>"blank=s",   VAR=>\$blank,   DEFAULT=>'',    DESC=>"Replace empty EC/gene/product with this"},
#    {OPT=>"pseudo!",   VAR=>\$pseudo,  DEFAULT=>0,     DESC=>"Include /pseudo genes"},
#    {OPT=>"hypo!",     VAR=>\$hypo,    DEFAULT=>0,     DESC=>"Include 'hypothetical protein' genes"},
#    {OPT=>"minlen=i",  VAR=>\$minlen,  DEFAULT=>0,     DESC=>"Minimum peptide length"},
  );

  #(!@ARGV) && (usage());

  &GetOptions(map {$_->{OPT}, $_->{VAR}} @Options) || usage();

  # Now setup default values.
  foreach (@Options) {
    if (defined($_->{DEFAULT}) && !defined(${$_->{VAR}})) {
      ${$_->{VAR}} = $_->{DEFAULT};
    }
  }
}

sub usage {
  print "Usage: $0 [options] [--datadir hamap_data_subdir] > hamap.hmm\n";
  foreach (@Options) {
    printf "  --%-13s %s%s.\n",$_->{OPT},$_->{DESC},
           defined($_->{DEFAULT}) ? " (default '$_->{DEFAULT}')" : "";
  }
  exit(1);
}
 
#----------------------------------------------------------------------
back to top