Content - 5361aacdbb9fa6dbf6132aeb0e9033e2af7b0895 - d40da97/ripper.pl

visit type:
Tip revision: 44ecc7291a78ea739afe2774eb65345b190d31e6 authored by streptomyces on 01 November 2023, 12:24:35 UTC
Additions to .gitignore.
Tip revision: 44ecc72
ripper.pl
#!/usr/bin/perl
use 5.14.0;
use utf8;
use Carp;
# use lib qw(/Users/nouser/perllib);
use File::Basename;
use File::Temp qw(tempfile tempdir);
use File::Spec;
use File::Copy;
# use Sco::Common qw(tablist tablistE linelistE);
use Bio::SeqIO;
use Bio::Seq;
use Bio::SeqFeature::Generic;
use DBI;
use Digest::SHA qw(sha1_hex);

use XML::Simple;
use LWP::Simple;
use Net::FTP;


# {{{ Getopt::Long
use Getopt::Long;
my $maxDistFromTE = 8000; # From precursor peptide to the tailoring enzyme.
my $outdir = qq(ripout);
my $indir;
my $fofn;
my $outex; # extension for the output filename when it is derived on infilename.
my $conffile = qq(local.conf);
my $colorThreshScore = 20;
my $ppFeatAddLimit = 20;
my $errfile;
my $prodigalScoreThresh = 7.5;
my $prodigalshortbin = qq(prodigal-short);
my $flankLen = 17500;
my $allowedInGene = 20;
my $minPPlen = 20; # Minimum precursor peptide length.
my $maxPPlen = 120; # Maximum precursor peptide length.
my $sameStrandReward = 5;
my $fastaOutputLimit = 3;
my $runfile;
my $outfile;
my $testCnt = 0;
our $verbose;
my $skip = 0;
my $help;
GetOptions (
"outfile:s" => \$outfile,
"outdir:s" => \$outdir,
"indir:s" => \$indir,
"fofn:s" => \$fofn,
"extension:s" => \$outex,
"conffile:s" => \$conffile,
"errfile:s" => \$errfile,
"runfile:s" => \$runfile,
"testcnt:i" => \$testCnt,
"skip:i" => \$skip,
"allowedingene:i" => \$allowedInGene,
"verbose" => \$verbose,
"help" => \$help
);
# }}}

# {{{ POD Example

=head1 Name

ripper.pl

=head2 Example

 perl code/ripper.pl -outdir ripout

 export rwb=rodeowork
 export listfn=TfuA_Actino_Accessions_080217.txt
 export outdir=output_21_07_2017
 fp-rip () {
   listbn=$(basenameNoex.pl $listfn);
   ofn=${listbn}.csv
     for iline in $(tail -n 1 $listfn); do
       line=$(basenameNoex.pl $iline);
       incsv=$rwb"/"${line}"/outarch.csv"
       echo perl code/ripper.pl -outdir $outdir -- $incsv
     done
 }

 fp-rip | parallel

Below is a standalone test command.

 perl code/ripper.pl -outdir ripout -- rodout/main_co_occur.csv

=cut

# }}}

# {{{ POD Options and blurb

=head2 Blurb

RODEO outputs a file named *arch.csv.

 499497121,50841496,488474605,,rev,927480,927977,PF02481,1.3e-14,,,,,
 499497121,50841496,488487448,,fwd,928266,929873,PF13175,1e-23,PF13304,1.4e-20,,,
 499497121,50841496,488487449,,fwd,929874,931562,PF00580,6.3e-26,PF13245,7.7e-17,,,
 <more line here>
 499497121,50841496,695294193,,fwd,950727,951236,PF00155,0.00022,PF01803,0.21,,,
 499497121,50841496,488474066,,fwd,951486,951725,PF01610,9e-12,,,,,
 499497121,50841496,488487891,,rev,952257,954161,PF13304,6.3e-15,PF00005,2.6e-11,,,

The index 1 column is a genbank gid which we use to fetch the genbank file.

1. From the fetched genbank file we extract the region spanned by the output in
*arch.csv (column index 5 and 6 of the first and last lines).

2. Prodigal is run on the nucleotide sequence obtained in step 1.

3. Peptides less than 30 amino acids or more than 120 amino acids are discarded.

4. Results of Prodigal are sorted by the total score reported by Prodigal

5. The top 20 peptides (or as many as have a total prodigal score of more then 0)
are kept.

6. Peptides kept in step 5 are added to the annotation in the genbank file as
CDSs.

7. The top 3 peptides are coloured reddish brown, the next 9 are coloured green
and the remainder are coloured yellowish brown. For these features, the note
tag contains the output from Prodigal for each particular peptide.

=head2 Input files

These are F<outarch.csv> files resulting from running RODEO. This is a messy
operation and it is best done using F<rodeo.bash>. F<outarch.csv> files are
usually in a path similar to F<rodeowork/E<lt>accessionE<gt>/outarch.csv>.

=head2 Output

Output is gbk and faa files named as <accession>.gbk and <accession>.faa
written to $outdir. Some details about top scoring PPs are also written to the
table $conf{prepeptab} in the database I<andy>. This can be used to get a tsv
or csv file of results for viewing in a spreadsheet.

=head2 Subs, data structures and comments.

=cut


# }}}

if($help) {
exec("perldoc $0");
exit;
}

# {{{ Some file globals
my $template='rippXXXXX';
my $tempdir = qw(/tmp);
my $esearchURL='http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?';
my $efetchURL='http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?';
my $esummaryURL='http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?';
my $ftp=Net::FTP->new(Host => "ftp.ncbi.nih.gov", Passive => 1);
# }}}


# {{{ open the errfile
if($errfile) {
open(ERRH, ">", $errfile);
print(ERRH "$0", "\n");
close(STDERR);
open(STDERR, ">&ERRH"); 
}
# }}}


# {{{ Populate %conf if a configuration file 
my %conf;
if(-s $conffile ) {
  open(my $cnfh, "<", $conffile);
  my $keyCnt = 0;
  while(my $line = readline($cnfh)) {
    chomp($line);
    if($line=~m/^\s*\#/ or $line=~m/^\s*$/) {next;}
    my @ll=split(/\s+/, $line, 2);
    $conf{$ll[0]} = $ll[1];
    $keyCnt += 1;
  }
  close($cnfh);
#  linelistE("$keyCnt keys placed in conf.");
}
elsif($conffile ne "local.conf") {
linelistE("Specified configuration file $conffile not found.");
}
if(exists($conf{minPPlen})) { $minPPlen = $conf{minPPlen}; }
if(exists($conf{maxPPlen})) { $maxPPlen = $conf{maxPPlen}; }
if(exists($conf{maxDistFromTE})) { $maxDistFromTE = $conf{maxDistFromTE}; }
if(exists($conf{prodigalScoreThresh})) {
  $prodigalScoreThresh = $conf{prodigalScoreThresh};
}
if(exists($conf{fastaOutputLimit})) {
  $fastaOutputLimit = $conf{fastaOutputLimit};
}
if(exists($conf{sameStrandReward})) {
  $sameStrandReward = $conf{sameStrandReward};
}
if(exists($conf{flankLen})) {
  $flankLen = $conf{flankLen};
}
if(exists($conf{"prodigalshortbin"})) {
  $prodigalshortbin = $conf{prodigalshortbin};
}


# }}}


# {{{ gbkcache and sqlite initialisation
my $gbkcache = qq(gbkcache);
if(exists($conf{gbkcache})) { $gbkcache = $conf{gbkcache}; }


my $dbfile=$conf{sqlite3fn};
my $handle=DBI->connect("DBI:SQLite:dbname=$dbfile", '', '');

my $create_table_str = <<"CTS";
create table if not exists $conf{prepeptab} (
acc       text,
species   text,
ppser     integer,
fastaid   text,
aaseq     text,
ppstrand  integer,
testrand  integer,
pptedist  integer,
score     float,
unique(acc, ppser)
);
CTS
# $handle->do("drop table if exists $conf{prepeptab}");
$handle->do($create_table_str);
# }}}

# {{{ populate @infiles
my @infiles;
if(-e $fofn and -s $fofn) {
open(FH, "<", $fofn);
while(my $line = readline(FH)) {
chomp($line);
if($line=~m/^\s*\#/ or $line=~m/^\s*$/) {next;}
my $fn;
if($indir) {
$fn = File::Spec->catfile($indir, $line);
}
else {
$fn = $line;
}

push(@infiles, $fn);
}
close(FH);
}
else {
@infiles = @ARGV;
}

# }}}

# {{{ out file handle
my $ofh;
if($outfile) {
  unless(-d $outdir) {
    mkdir($outdir) or croak("Failed to make $outdir\n");
  }
  my $outpath = File::Spec->catfile($outdir, $outfile);
  open($ofh, ">", $outfile);
}
else {
  open($ofh, ">&STDOUT");
}
#}}}

# Cycle through all the infiles.
for my $infile (@infiles) {
my ($noex, $dir, $ext)= fileparse($infile, qr/\.[^.]*/);
my $bn = $noex . $ext;

=head3 Genbank and fasta file names.

These are derived from $taildir which is the immediate
directory containing the input file (outarch.csv). The
way this has been designed, $taildir is a protein
accession number.

=cut

my @temp = split(/\//, $dir);
my $taildir = pop(@temp);
if($outdir) {
  unless(-d $outdir) {
    if(mkdir($outdir)) {
    }
    else {
      croak("Failed to make $outdir");
    }
  }
}


=pod

Make a genbank filename and a fasta filename. The are the
filenames output is written to.

=cut

my ($ofn, $fastafn);
if($outdir) {
$ofn = File::Spec->catfile($outdir, $taildir . ".gbk"); 
$fastafn = File::Spec->catfile($outdir, $taildir . ".faa"); 
}
else {
$ofn = $taildir . ".gbk"; 
$fastafn = $taildir . ".faa"; 
}

linelistE("Genbank output is $ofn. Fasta output is $fastafn");


open(my $ifh, "<$infile") or croak("Could not open $infile"); # main_co_occur.csv
$skip = 1; # There is a header.
if($skip) {
for (1..$skip) { my $discard = readline($ifh); }
}
my @coords;
my $gbkgi; # Column index 2 of outarch.csv. Value is the same in all rows.
my $teProtAcc; # We remove ".1", ".2" from the end of this one.
my $teProtId; # This is the $teProtAcc without the removal mentioned above.
my $teNdx; # Index for the tailoring enzyme.
my $lineCnt = 0;

# read lines in the input file to assign to the variables above
while(my $line = readline($ifh)) {
chomp($line);
if($line=~m/^\s*\#/ or $line=~m/^\s*$/
  or $line=~m/^query/ ) {next;}
#Query,Genus/Species,Nucleotide_acc,Protein_acc,start,end,dir,PfamID1,Name1,Description1,E-value1,PfamID2,Name2,D
#POF95626.1,Pseudomonas putida,MINE01000015.1,POF95618.1,459205,459898,+,PF12146,Hydrolase_4,"Serine aminopeptida

my @ll=split(",", $line);
$teProtAcc = $ll[0];
$teProtId = $teProtAcc; # In case we need to look in the genbank file.
$teProtAcc =~ s/\.[^.]*$//;
$gbkgi = $ll[2];
my $strand = $ll[6] =~ m/^-/ ? -1 : 1;

my $start = $ll[4];
my $end = $ll[5];

# For strand -1 the order in the file is end, start
# Hence the if block below.
if($strand == -1) {
$start = $ll[5];
$end = $ll[4];
}
push(@coords, [$start, $end, $strand]);

=head3 Getting the index for the tailoring enzyme used in the RODEO search.

This relies on the fact that in outarch.csv, column 0 and
column 3 will be the same for this protein.

=cut

if($ll[0] eq $ll[3]) {
$teNdx = $lineCnt;
# tablistE(@ll);
linelistE("TE Line: $line");
tablistE("TE Line:", $start, $end, $strand);
}
$lineCnt += 1;
}
close($ifh);
unless($lineCnt) {
  croak("Only a header line in $infile.\nStopping"); # main_co_occur.csv
}

# TEcoordsByProtId (gbkfile => filename, protid => proteinid);
unless(defined($teNdx)) {
  linelistE("\$teNdx is undefined. Looking into the downloaded genbank file.\n");
  my($gbkfh, $gbkfn)=tempfile($template, DIR => $tempdir, SUFFIX => '.gbk');
  my $gbkstored = File::Spec->catfile($gbkcache, ($gbkgi . ".gbk"));
  if(-e $gbkstored) {
    copy($gbkstored, $gbkfn);
    linelistE("$gbkstored copied to $gbkfn");
  }
  else {
    fetchGbk(uid => $gbkgi, outfile => $gbkfn);
    if(-d $gbkcache) {
      copy($gbkfn, $gbkstored);
    }
  }
  my $cr = TEcoordsByProtId(gbkfile => $gbkfn, protid => $teProtId);
  if(ref($cr)) {
    push(@coords, $cr);
    $teNdx = $#coords;
    tablistE("Found TE ($teProtId) in $gbkstored: ", @{$cr}, $teNdx);
  }
  else {
    linelistE("No $teProtId found in $gbkstored. Stopping.");
    unlink($gbkfn);
    exit();
  }
}

=head3 Coordinates to be extracted from the Genbank file.

These are $flankLen left and right of the middle of the TE.

=cut

tablistE("TE index is $teNdx");
my $cr = $coords[$teNdx]; # coords ref.
my $start = $cr->[0];
my $end = $cr->[1];
my $teStart = $cr->[0]; # The tailoring enzyme start.
my $teEnd = $cr->[1]; # The tailoring enzyme end.
my $teStrand = $cr->[2]; # The tailoring enzyme strand as 1 or -1.

my $midpos = int(($start + $end)/2);
my $minpos = $midpos - ($flankLen - 1);
my $maxpos = $minpos + ($flankLen * 2 - 1);

my($gbkfh, $gbkfn)=tempfile($template, DIR => $tempdir, SUFFIX => '.gbk');
close($gbkfh);

my $gbkstored = File::Spec->catfile($gbkcache, ($gbkgi . ".gbk"));

if(-e $gbkstored) {
  copy($gbkstored, $gbkfn);
  linelistE("$gbkstored copied to $gbkfn");
}
else {
  fetchGbk(uid => $gbkgi, outfile => $gbkfn);
  if(-d $gbkcache) {
    copy($gbkfn, $gbkstored);
  }
}

tablistE($minpos, $maxpos);

my($subfh, $subfn)=tempfile($template, DIR => $tempdir, SUFFIX => '.fna');
my($tefh, $tefn)=tempfile($template, DIR => $tempdir, SUFFIX => '.fna');
my $teout=Bio::SeqIO->new(-fh => $tefh, -format => 'fasta');

my $seqio=Bio::SeqIO->new(-file => $gbkfn);
my $seqout=Bio::SeqIO->new(-fh => $subfh, -format => 'fasta');
my $seqobj=$seqio->next_seq();
my $species = $seqobj->species();
my @classification = $species->classification();
my $seqlen = $seqobj->length();

# Below, adjustments to keep $minpos and $maxpos
# sensible in terms of the downloaded genbank file.
if($seqlen <= $flankLen * 2) {
  $minpos = 1; $maxpos = $seqlen;
}
elsif($minpos < 1) {
  $minpos = 1;
  $maxpos = $minpos + ($flankLen * 2 - 1);
}
elsif($maxpos > $seqlen) {
  $maxpos = $seqlen;
  $minpos = $maxpos - ($flankLen * 2 - 1);
}

# The output sequence object.
my $subseq = $seqobj->subseq($minpos, $maxpos);
my $teNtseq = $seqobj->subseq($teStart, $teEnd);

=head2 Locate TE on the sub genbank file

We need the nt sequence of the TE to locate it on the
sub gbk. Function I<locateTE()> is called for this.
It, in turn, uses I<blastn>.

=cut

my $teobj = Bio::Seq->new(-seq => $teNtseq);
$teobj->display_id("teNt");
$teobj->description("Tailoring enzyme");
$teout->write_seq($teobj);
close($tefh);

my $subobj = Bio::Seq->new(-seq => $subseq);
# $subobj->display_id("subseq");
$subobj->description("$gbkgi $minpos $maxpos");
$seqout->write_seq($subobj);
close($subfh);

# The sub called below uses blastn to locate the TE on the sub-genbank.
my ($teSubStart, $teSubEnd) = locateTE(tefn => $tefn, subfn => $subfn);

  tablistE($gbkfn, $minpos, $maxpos);

my $subgbk = subgenbank(infile => $gbkfn, start => $minpos, end => $maxpos);
my $subgbkFT = subgenbank(infile => $gbkfn, start => $minpos, end => $maxpos);
my @subft = $subgbk->remove_SeqFeatures();

=head3 @recoord

List of listrefs. [start, end, strand] of each CDS feature in @subft.

Coordinates stored in @recoord are used for checking the overlap
of prodigal findings with existing annotated features. Features which
have sizes in the range of the size of precursor peptides are not pushed
into @recoord. This is to allow for the situation where the precursor
peptide might be already annotated in the genbank file.

=cut

my @recoord; # start, end and strand of sequence features.
for my $subft (@subft) {
  if($subft->primary_tag() eq 'CDS') {
    my $st = $subft->start();
    my $en = $subft->end();
    my $str = $subft->strand();
    my $protlen = ((($en - $st) + 1) - 3) / 3;
    unless ($protlen >= $minPPlen and $protlen <= $maxPPlen) {
      push(@recoord, [$st, $en, $str]);
    }
    # if($st == $teSubStart and $en == $teSubEnd)
    if(abs($st - $teSubStart) <= 2 and
      abs($en - $teSubEnd) <= 2) {
      $subft->add_tag_value("color", "0 255 0");
    }
  }
  if($subft->primary_tag() ne 'gene') {
    $subgbk->add_SeqFeature($subft);
  }
}

=head3 Prodigal

Run Prodigal and parse the output file to populate @prdl
with listrefs for each line in the prodigal output.

Columns in the output of Prodigal are

 0. Start position
 1. End position
 2. Strand as + or -
 3. Prodigal score

There are more columns but these are the ones we are
concerned with.

=cut

# Below, run prodigal on the subsequence fasta file.
my($prdfh, $prdfn)=tempfile($template, DIR => $tempdir, SUFFIX => '.prodigal');
close($prdfh);
my $xstr = qq($prodigalshortbin -p meta -f gff -i $subfn -s $prdfn);
my $discard = qx($xstr); # Only interested in the output in file $prdfn.

# Read prodigal output and populate @prdl.
open(PRD, "<", $prdfn);
my @prdl;
while(my $line = readline(PRD)) {
if($line=~m/^\s*\#/ or $line=~m/^\s*$/ or $line !~ m/^\d/) {next;}
chomp($line);
my @ll=split(/\t/, $line);
push(@prdl, [@ll]);
}
close(PRD);

# Same strand reward
# For loop below applies the same strand reward to all
# prodigal output records.

for my $lr (@prdl) {
  my $prdStrand = $lr->[2] eq '+' ? 1 : -1;
  if($prdStrand == $teStrand) {
    $lr->[3] += $sameStrandReward;
  }
  $lr->[2] = $prdStrand;
  my $shadig = sha1_hex(join("_", $lr->[0], $lr->[1], $lr->[2]));
  push(@{$lr}, $shadig);
}


# Then we sort the prodigal output records by score.
# This gives us @sprdl.

my @sprdl = sort {$b->[3] <=> $a->[3]} @prdl;


# Loop through the sorted (by score) prodigal output
# Columns in the output of Prodigal are
# 
#  0. Start position
#  1. End position
#  2. Strand as + or -
#  3. Prodigal score

my $prdlCnt = 0;
my $ppFastaOutputCnt = 1; # Init to one because of later sql insertion.
my $allFastaOutputCnt = 9001; # Init to one because of later sql insertion.
my @terpos; # To hold the positions of the last nucleotide of genes.
my $seqout1;

SPRDL: for my $lr (@sprdl) { # @sprdl: sorted (by score) prodigal results.
my @ll = @{$lr}; 
my ($start, $end, $strand) = @ll[0,1,2];
my $score = $ll[3];
my $peplen = (($end - $start) + 1) / 3; $peplen -= 1;
# my $strand = $temp eq '+' ? 1 : -1;
splice(@ll, 2, 1, $strand);
my $terpos;
if($strand == 1) { $terpos = $end; }
elsif($strand == -1) { $terpos = $start; }

# Proceed only for a range of peptide lengths. Otherwise look at
# the next prodigal record.
# 
# $ll[3] is the total prodigal score.
unless ($peplen >= $minPPlen and $peplen <= $maxPPlen) { next SPRDL; }

# If an end position ($terpos) has been seen before, skip it.
if(grep {$_ == $terpos} @terpos) { next SPRDL; }

else {
# Test for overlap with an annotated gene.
# Basically we check for overlap with coordinates in @recoord. 
my $inGene = 0; #flag
OUTER: for my $pos ($start, $end) {
  for my $cr (@recoord) {
    if($pos >= ($cr->[0] + $allowedInGene)
      and $pos <= ($cr->[1] - $allowedInGene)) {
      $inGene = 1; last OUTER;
    }
  }
}

=pod

We also test if a prodigal record lies totally within
a feature in the sequence object.

=cut

unless($inGene) {
  for my $cr (@recoord) {
    if($start <= $cr->[0] and $end >= $cr->[1]) {
      $inGene = 1;
      last;
    }
  }
}

if($inGene) { next SPRDL; }

my @prodHead = qw(
Beg End Std Total CodPot StrtSc Codon RBSMot
Spacer RBSScr UpsScr TypeScr GCCont ShaHex
);

my $dx = 0;
my @prodStr; # String containing information coming from prodigal.
for my $phead (@prodHead) {
  push(@prodStr, "$phead: $ll[$dx]"); 
  $dx += 1;
}
my $prodStr = join("; ", @prodStr);
my $distFromTE = distTE([$start, $end], [$teSubStart, $teSubEnd]);

=pod

$prft. Bio::SeqFeature for prodigal record.
A feature is created and added regardless of the distance
from TE or the prodigal score. Color is however decided
by prodigal rank, distance from TE and prodigal score.

=cut


# $prft. Bio::SeqFeature for prodigal record.
my $color = prdcol($prdlCnt, $distFromTE, $score);
my $prft = Bio::SeqFeature::Generic->new(
-primary => 'CDS',
-start => $start,
-end => $end,
-strand => $strand,
-tag => {
  'color' => $color,
  'note' => $prodStr
}
);

# $teSubStart: Start of the TE in the subsequence.
# $teSubEnd: End of the TE in the subsequence.
# $teStrand: Strand of the TE.
# filename, organism, PPsequence, PPstrand, TEstrand.
$subgbkFT->add_SeqFeature($prft);
my $aaseq = aaseq($prft);
$aaseq =~ s/\*$//;
$aaseq =~ s/^[VL]/M/;
$aaseq =~ s/^[vl]/m/;

if($prdlCnt < $ppFeatAddLimit) {
$subgbk->add_SeqFeature($prft);
$prft->add_tag_value("translation", $aaseq);
}

$prdlCnt += 1;


# $ppFeatAddLimit
# Prodigal features are added upto 20 features. More than that
# are added only if their prodigal score > 0.
# if($score <= 0 and $prdlCnt >= $ppFeatAddLimit) { last; }


=pod

If within specified distance of the TE and score > 0 and count < 20
insert a record in SQL table $conf{prepeptab}.
Also write the peptide sequences to the fasta filename $fastafn
derived from $taildir. See B<Genbank and fasta file names> above.

=cut

# if within specified range of the TE, insert a record in SQL
# table $conf{prepeptab}.
if($distFromTE <= $maxDistFromTE and $prdlCnt <= $ppFeatAddLimit) {
  unless(ref($seqout1)) {
    $seqout1=Bio::SeqIO->new(-file => ">$fastafn");
  }
  my $strandReward = "0";
  if($strand == $teStrand) {
    $strandReward = 1;
  }
  if($ppFastaOutputCnt <= $fastaOutputLimit or ($score >= $prodigalScoreThresh)) {
    my $aaobj = Bio::Seq->new(-seq => $aaseq);
    my $fastaid = $teProtAcc . "_" . $ppFastaOutputCnt;
    $aaobj->display_id($fastaid);
    $aaobj->description("SameStrand: $strandReward; " . $prodStr);
    $seqout1->write_seq($aaobj);
    my $spbinom = $species->binomial("FULL");
    $spbinom =~ s/'//g;
    insertSQL($teProtAcc, $spbinom, $ppFastaOutputCnt, $fastaid, $aaseq,
        $strand, $teStrand, $distFromTE, $score);
    $ppFastaOutputCnt += 1;
  }
}
# The else below is only to use a different series of fastaid postfix,
# $allFastaOutputCnt. And in this case we don't care about how many have
# already been reported.
else {
  unless(ref($seqout1)) {
    $seqout1=Bio::SeqIO->new(-file => ">$fastafn");
  }
  my $strandReward = "0";
  if($strand == $teStrand) {
    $strandReward = 1;
  }
  my $aaobj = Bio::Seq->new(-seq => $aaseq);
  my $fastaid = $teProtAcc . "_" . $allFastaOutputCnt;
  $aaobj->display_id($fastaid);
  $aaobj->description("SameStrand: $strandReward; " . $prodStr);
  $seqout1->write_seq($aaobj);
  my $spbinom = $species->binomial("FULL");
  $spbinom =~ s/'//g;
  insertSQL($teProtAcc, $spbinom, $allFastaOutputCnt, $fastaid, $aaseq,
      $strand, $teStrand, $distFromTE, $score);
  $allFastaOutputCnt += 1;
}
  push(@terpos, $terpos);
} # End of else for the if which skips done terpos.

} # end of SPRDL: for my $lr (@sprdl)

=pod

At this point we have gone through all the output from Prodigal.

=cut


=head3 Write the genbank output file.

Below, the $subgbk object is written out to $ofn which is derived
from $taildir. See B<Genbank and fasta file names> above.

=cut

my $subout = Bio::SeqIO->new(-file => ">$ofn", -format => "genbank");
$subgbk->display_id($teProtAcc . "_cluster");
$subgbk->accession_number($teProtAcc . "_cluster");
$subgbk->species($species);
$subout->write_seq($subgbk);

#copy($gbkfn, $gbkgi . ".gbk");
#copy($subfn, "sub.fna");
#copy($prdfn, $gbkgi . ".prodigal");

unlink($gbkfn, $subfn, $prdfn, $tefn);
}

close($ofh);
exit;

# Multiple END blocks run in reverse order of definition.
END {
close(STDERR);
close(ERRH);
$handle->disconnect();
}


# {{{ sub insertSQL {
sub insertSQL {
  my ($teProtAcc, $spbinom, $ppFastaOutputCnt, $fastaid, $aaseq,
      $strand, $teStrand, $distFromTE, $score) = @_;
  my $instr = qq/insert or replace into $conf{prepeptab} values(/;
      $instr .= $handle->quote($teProtAcc) . ", ";
      $instr .= $handle->quote($spbinom) . ", ";
      $instr .= $ppFastaOutputCnt . ", ";
      $instr .= $handle->quote($fastaid) . ", ";
      $instr .= $handle->quote($aaseq) . ", ";
      $instr .= $strand . ", ";
      $instr .= $teStrand . ", ";
      $instr .= $distFromTE . ", ";
      $instr .= $score . ")";
      unless($handle->do($instr)) {
      linelistE($instr);
      }
}
# }}}

# {{{ sub distTE.

=head3 sub distTE

Gets two pairs of numbers

prodigal peptide start and end

TE start and end

Returns the minimum distance between the two.

=cut


sub distTE {
  my $prse = shift(@_);
  my $tese = shift(@_);
  my $mindist = 99999999;
  for my $prpos (@{$prse}) {
    for my $tepos (@{$tese}) {
      my $dist = abs($prpos - $tepos);
      $mindist = $mindist < $dist ? $mindist : $dist; 
    }
  }
  return($mindist);
}
# }}}

# {{{ sub prodigal2aa listref, fastafilename. Not used.

sub prodigal2aa {
  my $lr = shift(@_);
  my $subfn = shift(@_);

  my $seqio = Bio::SeqIO->new(-file => $subfn, -format => "fasta");
  my $seqobj = $seqio->next_seq();

  my ($start, $end, $strand) = @{$lr}[0, 1, 2];

  my $cds;
  if($strand == 1) {
    $cds = $seqobj->trunc($start, $end);
  }
  else {
    $cds = $seqobj->trunc($start, $end)->revcom();
  }
  my $aaobj = $cds->translate();
  $aaobj->display_id($start);
  $aaobj->description("$end, $strand");
  return($aaobj);
}
# }}}

# {{{ sub seqobj2print $aaobj. Not used.
sub seqobj2print {
  my $aaobj = shift(@_);
  my $memvar;
 open(my $memh, ">", \$memvar)
     or die "Can't open memory file: $!";
 my $bout=Bio::SeqIO->new(-fh => $memh, -format => 'fasta');
  $bout->write_seq($aaobj);
  close($memh);
  return($memvar);
}
# }}}

# {{{ sub aaseq
sub aaseq {
my $ft = shift(@_);
my $fseq = $ft->spliced_seq(-nosort => 1);
my $aaobj = $fseq->translate();
return($aaobj->seq());
}
# }}}

# {{{ sub prdcol

=head3 sub prdcol

The colour in which a precursor peptide feature will get rendered
in Artemis gets decided here.

=cut


sub prdcol {
my $prdlCnt = shift(@_);
my $dte = shift(@_);
my $score = shift(@_);
if($dte <= $maxDistFromTE and $score >= $colorThreshScore) {
return("255 0 160");
}
elsif($prdlCnt < 3) {
return("255 0 0");
}
elsif($prdlCnt < 12) {
return("255 110 110");
}
else {
return("255 188 188");
}
}
# }}}

# {{{ sub locateTE

=head3 sub locateTE

The TE nucleotide sequence fasta file

The sub sequence fasta file.

Uses blastn to determine and return the start and end of the TE in
the subsequence.

=cut

sub locateTE {
my %args = @_;
my $tefn = $args{tefn};
my $subfn = $args{subfn};
my($blnh, $blnf)=tempfile($template, DIR => $tempdir, SUFFIX => '.fna');
close($blnh);
my $xstr = qq(blastn -query $tefn -subject $subfn -task megablast);
$xstr .= qq( -out $blnf -outfmt 6 -dust no);
qx($xstr);
open(my $bloh, "<", $blnf);
my $line = readline($bloh); chomp($line); 
linelistE($line);
close($bloh);
unlink($blnf);
my @ll = split(/\t/, $line);
my($ts, $te) = @ll[8,9];
my $start = $ts < $te ? $ts : $te;
my $end = $ts < $te ? $te : $ts;
return($start, $end);
}
# }}}

# {{{ sub fetchGbk hash(uid, outfile) returns(Bio::Seq object) ### for nucleotide genbanks only
# if outfile is specified we get a file, otherwise we get a Bio::Seq object.
sub fetchGbk {
my %args = @_;
my $uid = $args{uid};
my $outfile = $args{outfile};
my $efetch= $efetchURL;
$efetch .= 'db=nucleotide&';
$efetch .= "id=$uid\&";
$efetch .= "retmode=text\&";
$efetch .= "rettype=gbwithparts";
my ($fh, $filename)=tempfile($template, DIR => $tempdir, SUFFIX => '.gbk');
select($fh);
getprint($efetch);
select(STDOUT);
close($fh);
if($outfile) {
if(copy($filename, $outfile)) {
  unlink($filename);
  return(1);
}
else {
  carp("Copy of $filename to $outfile failed");
  unlink($filename);
  return(0);
}
}
else {
my $seqio = Bio::SeqIO->new(-file  => $filename);
my $seqobj = $seqio->next_seq();
unlink($filename);
return($seqobj);
}

}
# }}}

# {{{ sub subgenbank. hash(infile, start, end). Returns a Bio::Seq.
# optional keys: outformat
sub subgenbank {
  my %args = @_;
  my $ingbk = $args{infile};
  my $start = $args{start};
  my $end = $args{end};

  my $seqio = Bio::SeqIO->new(-file => $ingbk);
  my $seqobj=$seqio->next_seq();
  my $inlen = $seqobj->length();
  if($end > $inlen) { $end = $inlen; }
  my $subseqstr = $seqobj->subseq($start, $end);

  my $outobj = Bio::Seq->new(-seq => $subseqstr);

  my @ft = $seqobj->get_all_SeqFeatures();
  my @oft;
  for my $ft (@ft) {
    my @temp = $ft->location()->each_Location();
    my $nloc = scalar(@temp);
    if($nloc == 1 and $ft->start() >= $start and $ft->end() <= $end) {
      push(@oft, $ft);
    }
  }

  for my $ft (@oft) {
    my $fst = $ft->start();
    my $fend = $ft->end();
    my $adj = $start - 1;
    $ft->start($fst - $adj);
    $ft->end($fend - $adj);
    $outobj->add_SeqFeature($ft);
  }

  return($outobj);

}
# }}}

# {{{ sub TEcoordsByProtId (gbkfile => filename, protid => proteinid);
sub TEcoordsByProtId {
  my %args = @_;
  my $ingbk = $args{gbkfile};
  my $protid = $args{protid};
  my $seqio = Bio::SeqIO->new(-file => $ingbk);
  my $seqobj=$seqio->next_seq();
  for my $ft ($seqobj->get_all_SeqFeatures()) {
    if($ft->has_tag("protein_id")) {
      my (@temp) = $ft->get_tag_values("protein_id");
      if(grep {$_ eq $protid} @temp) {
        my $start = $ft->start();
        my $end = $ft->end();
        my $strand = $ft->strand();
        return([$start, $end, $strand]);
      }
    }
  }
return();
}
# }}}



# {{{ subroutines tablist, linelist, tabhash and their *E versions.
# The E versions are for printing to STDERR.

sub tablist {
  my @in = @_;
  print(join("\t", @in), "\n");
}

sub tablistE {
  my @in = @_;
  print(STDERR join("\t", @in), "\n");
}

sub linelist {
  my @in = @_;
  print(join("\n", @in), "\n");
}

sub linelistE {
  my @in = @_;
  print(STDERR join("\n", @in), "\n");
}

# }}}

__END__
Browse the archive

https://github.com/streptomyces/ripper