Raw File
map_genes.pl
#!/usr/bin/perl -w

###This script takes the .ptt and the filter_cpm file for mapping insertions to the gene
###The format of the .ptt file is the standard annotation file format from NCBI, or you can refer to the .ptt file in the demo section

if (!@ARGV){
  print "$0 <.ptt> <processed file> <gene_disrupt_percent (max=1) \n";
  exit;
}

my $ptt=shift @ARGV;
my $file=shift @ARGV;
my $percent=shift @ARGV;
my $out=$file."_mapped";    


my $logfile;
if ($file=~/(\w+\.scarf)/){
   $logfile=$1.".log";
}

open LOG,">>$logfile";

my $gene;
my $start;
my $end;
my @temp;
my @gene;
my $name;
my $strand;
my $annotation;

open PTT,$ptt;
while (<PTT>){
    chomp;
    if (m/^(\d+)\.\.(\d+)/){
      @temp=split /\t/,$_;
      $start->{$temp[5]}=$1;
      $end->{$temp[5]}=$2;
      $strand->{$temp[5]}=$temp[1];
      $length->{$temp[5]}=abs($end->{$temp[5]}-$start->{$temp[5]});
      $annotation->{$temp[5]}=$temp[-1];
    }
}

foreach my $gene (sort keys %$start){
     for (my $i=$start->{$gene};$i<=$end->{$gene};$i++){
          $target->{$i}=$gene;
     }
}

my $count;
my $hit;
my $inside_ORF=0;
my $total_in=0;
my $insertion;

open CPM,$file;
while (<CPM>){
     @temp=split /\s+/,$_;
#     if ($temp[4]>3){
       $total_in++;
       if ($target->{$temp[1]}){
         if ($strand->{$target->{$temp[1]}} eq "+"){
             $position=abs(($temp[1]-$start->{$target->{$temp[1]}})/$length->{$target->{$temp[1]}});
         }elsif ($strand->{$target->{$temp[1]}} eq "-"){
             $position=abs(($temp[1]-$end->{$target->{$temp[1]}})/$length->{$target->{$temp[1]}});
         }
         if ($position<=$percent){
           my $po=sprintf ("%.3f",$position);
           $inside_ORF++;
           $hit->{$target->{$temp[1]}}++;
           $count->{$target->{$temp[1]}}+=$temp[2];
           my $insert_info=$po.":".$temp[1].":".$temp[2];
           push @{$insertion->{$target->{$temp[1]}}},$insert_info;
         }
       }
 #    }
}

open OUT,">$out";
print OUT"Gene ID\tUnique insertions\tCount\tAnnotation\n";

foreach $gene (sort keys %$start){
     if ($hit->{$gene}){
        my @out;
        foreach my $item(sort @{$insertion->{$gene}}){
         push @out,$item;
        }
        my $details=join"\t",@out;
        print OUT "$gene\t$hit->{$gene}\t$count->{$gene}\t$annotation->{$gene}\t$details\n";
     }else{
        $hit->{$gene}=0;
        $count->{$gene}=0;
        print OUT "$gene\t$hit->{$gene}\t$count->{$gene}\t$annotation->{$gene}\n";
    }
}


$intergenic=$total_in-$inside_ORF;
print LOG "Sample\tTotal insertions\t Insertions inside of ORF\tInsertions located intergenic region\n";
print LOG "$file\t$total_in\t$inside_ORF\t$intergenic\n";
close LOG;
back to top