map_genes_operon.pl
#!usr/bin/perl -w -s
#--------------------------------------------------------------------------#
# Author: Meng Wu, the Gordon Lab, Washington University in St. Louis #
# #
# File: map_genes.pl #
# Date: 2014-09-05 #
# Version: 1.30 #
# #
# Usage: #
# see Usage in the code #
# #
# Contact: mengwu@wustl.edu #
#--------------------------------------------------------------------------#
use POSIX qw(log10);
if (@ARGV !=5){
print "\nUsage: perl map_genes.pl <ptt> <operons_file> <length_disrupt_percent (max=1)> <operon_probability_cutoff (max=1)> <infile.txt>\n";
exit;
}
open PTT, $ARGV[0];
open OPS, $ARGV[1];
my $percent = $ARGV[2]; #percent of gene considered "hit"
my $operon_percent=$ARGV[3]; #operon probability score cutoff
##########
# Step 1: process_ptt
##########
# this module delimits genes based on ptt file and % of gene considered "hit"
# input is (.ptt file from NCBI, gene_disrupt_percent)
# output is a series of arrays:
#print "processing ptt file\n";
my @left_end;
my @right_end;
my @gene_number;
my @annotation;
my @gene_length;
my @strand;
my @left_end_percent;
my @right_end_percent;
my @gene_length_percent;
my @unique_left;
my @unique_right;
my @shared_left;
my @shared_right;
my @unique_length;
my @shared_length;
my $distal=1-$percent;
#skip past header info in .ptt file
my $ptt_line=<PTT>;
$ptt_line=<PTT>;
$ptt_line=<PTT>;
#go through the ptt file and read info into arrays
my $line_number=0;
while ($ptt_line=<PTT>){
chomp $ptt_line;
my @temp_array = split (/\t/, $ptt_line); #split ptt line by tabs
my $gene = $temp_array[5];
# $gene =~ s/_//g;
$gene_number[$line_number]=$gene;
$strand[$line_number]=$temp_array[1];
$annotation[$line_number]=$temp_array[8];
my @coordinates_array = split (/\../, $temp_array[0]);
$left_end[$line_number]=$coordinates_array[0];
$right_end[$line_number]=$coordinates_array[1];
$gene_length[$line_number]=abs($right_end[$line_number]-$left_end[$line_number]);
$line_number++;
}
my $number_of_genes=scalar @gene_number;
#trim genes to exclude distal x%
for (my $i=0;$i<$number_of_genes; $i++) {
$gene_length_percent[$i]=int (($gene_length[$i]*$percent)+.5);
if ($strand[$i] eq "+") { #if on positive strand
$left_end_percent[$i]=$left_end[$i];
$right_end_percent[$i]=$left_end[$i]+$gene_length_percent[$i];
}
if ($strand[$i] eq "-") {
$right_end_percent[$i]=$right_end[$i];
$left_end_percent[$i]=$right_end[$i]-$gene_length_percent[$i];
}
}
# delimit unique and shared regions of first X% of genes
for (my $j=0;$j<$number_of_genes;$j++) {#go through all genes
my $set_start=0;
my $set_end=0;
if ($j==0){ # if first gene, set no overlap from last gene
$unique_left[$j]=$left_end_percent[$j];
$set_start=1;
}
if ($j==($number_of_genes-1)){ #if last gene
$unique_right[$j]=$right_end_percent[$j]; # set no overlap from first gene
$set_end=1;
$shared_left[$j]="none";
$shared_right[$j]="none";
$shared_length[$j]="none";
}
#assign unique left end of gene
if ($set_start==0){ # if not first or last gene
if ($left_end_percent[$j]>$right_end_percent[$j-1]) { #if left_edge is after previous gene right_edge
$unique_left[$j]=$left_end_percent[$j]; #unique left_edge is left_end_percent
}
if ($left_end_percent[$j]<=$right_end_percent[$j-1]) { #if left_edge is before previous gene right_edge
$unique_left[$j]=$right_end_percent[$j-1]+1; #unique left_edge is (previous gene right_end_percent)+1
}
}
#assign unique right end of gene
if ($set_end==0){
if ($right_end_percent[$j]<$left_end_percent[$j+1]) { #if right_edge is before next gene left_edge
$unique_right[$j]=$right_end_percent[$j]; #unique right_edge is right_end_percent
$shared_left[$j]="none"; #no shared region for gene j
$shared_right[$j]="none";
$shared_length[$j]="none";
}
if ($right_end_percent[$j]>=$left_end_percent[$j+1]) { #if right_edge is after next gene left_edge
$unique_right[$j]=$left_end_percent[$j+1]-1; #unique right_edge is (next gene left_end_percent)-1
$shared_left[$j]=$left_end_percent[$j+1];
$shared_right[$j]=$right_end_percent[$j];
$shared_length[$j]=$shared_right[$j]-$shared_left[$j];
}
}
}
##########
# Step 2: establish operon boundaries
##########
#print "establishing operon boundaries\n";
#OPS file should be tab-delim two-column
#column 1 : gene number
#column 2 : probability (max 1) that this gene is in an operon w/ next gene
#example: Gene1 .5
#50% probability that Gene1 is in an operon w/ Gene2
my @operon_prob;
my @operon_name;
my @operon_left;
my @operon_right;
my @operon_strand;
my @operon_left_gene;
my @operon_right_gene;
my @operon_left_gene_number;
my @operon_right_gene_number;
my $name;
my $in_operon=0;
my $count=0;
while (my $line = <OPS>){
chomp $line;
my @temp_array = split (/\t/, $line);
if ($temp_array[0] ne $gene_number[$count]){
print "ptt and operon files not lining up: $gene_number[$count]\t$temp_array[0]\n";
exit;
}
$operon_prob[$count]=$temp_array[1];
$count++;
}
for (my $i=0; $i<$number_of_genes-1; $i++){ #go through each gene
my $assigned=0;
if (($in_operon==0) && ($operon_prob[$i]>=$operon_percent)) { #if starting an operon
$in_operon=1;
push (@operon_left, $left_end[$i]);
push (@operon_left_gene_number, $i);
push (@operon_strand, $strand[$i]);
$name = $gene_number[$i];
$assigned=1;
}
if (($assigned==0) && ($in_operon==1) && ($operon_prob[$i]<$operon_percent)){ #if ending an operon
$in_operon=0;
push (@operon_right, $right_end[$i]);
push (@operon_right_gene_number, $i);
$name = $name."-".$gene_number[$i];
push (@operon_name, $name);
$assigned=1;
}
if (($assigned==0) && ($in_operon==0) && ($operon_prob[$i]<$operon_percent)){ #if one-gene operon
push (@operon_left, $left_end[$i]);
push (@operon_right, $right_end[$i]);
push (@operon_left_gene_number, $i);
push (@operon_right_gene_number, $i);
push (@operon_name, $gene_number[$i]."-".$gene_number[$i]);
push (@operon_strand, $strand[$i]);
$assigned=1;
}
}
my $number_of_operons=scalar @operon_name;
########
# Step 3: Map insertion counts to genes and operons
########
my %sampleIDs; #$sampleIDs{sampleID}=1 if present in file
my %genes_hit_direct; #$genes_hit_direct{coordinate} = array of genes hit directly by coordinate
my %genes_hit_polar; #$genes_hit_polar{coordinate} = array of genes in downstream operon from coordinate
my %gene_hitcount_direct; #sample-specific value for how many reads mapping directly to that gene
my %gene_hitcount_polar; #sample-specific value for how many reads mapping directly or upstream of that gene
my %gene_unique_sites; #sample-specific value for number of unique insertion locations in that gene
my %seen_coordinate; #$seen_coordinate{coordinate} = 0 if new, 1 if previously mapped
my %hitcount;
open IN, "$ARGV[4]";
while (my $line=<IN>) {
chomp $line;
if ($line =~m/^>/){
my @temp = split (/\t/, $line);
my $sampleID = $temp[0];
my $coordinate = $temp[1];
my $norm_count = $temp[2];
#print "sample ID $sampleID coordinate $coordinate count $norm_count\n";
#if sample is new, add it to list of samples
unless (exists $sampleIDs{$sampleID}){
$sampleIDs{$sampleID}=1;
}
#if coordinate is new, map it to genes and operons
unless (exists ($seen_coordinate{$coordinate})){
$seen_coordinate{$coordinate}=1;
########
# identify genes hit directly by coordinate
########
my $found=0;
my $found_operon=0;
for (my $check=0;$check<$number_of_genes; $check++){ #go through each pair in the left-right arrays
if (($left_end_percent[$check]<=$coordinate) && ($coordinate<=$right_end_percent[$check])){ #if hit is in proximal portion of gene
$found=1;
$found_operon=1;
push (@{$genes_hit_direct{$coordinate}}, $gene_number[$check]); #allows for single coordinate to hit multiple genes
#print "$coordinate mapped to gene $gene_number[$check]\n";
push (@{$genes_hit_polar{$coordinate}}, $gene_number[$check]);
} #end gene-hit section
} #end go through genes section
if ($found==0){
@{$genes_hit_direct{$coordinate}}[0]="none";
} #end not-found in gene section
##########
# identify genes hit by polar effect
##########
for (my $check_operon=0;$check_operon<$number_of_operons;$check_operon++) { #go through each operon
if (($operon_left[$check_operon]<=$coordinate) && ($coordinate <=$operon_right[$check_operon])) {
if ($operon_strand[$check_operon] eq "+") {
for (my $b=$operon_left_gene_number[$check_operon]; $b<=$operon_right_gene_number[$check_operon]; $b++){ #go through each gene in the operon
if ($left_end[$b]>$coordinate){ #if gene starts downstream of coordinate
#add this gene to the list of genes hit polarly by this coordinate
$found_operon=1;
push (@{$genes_hit_polar{$coordinate}}, $gene_number[$b]);
#print "$coordinate mapped (polar) to gene $gene_number[$b]\n";
} #end gene-hit-polar section
} #end go through genes in operon section
} # end positive-strand operon section
if ($operon_strand[$check_operon] eq "-"){
for (my $j=$operon_right_gene_number[$check_operon]; $j>=$operon_left_gene_number[$check_operon];$j--){
if ($right_end[$j]<$coordinate){ #if gene starts downstream of coordinate
$found_operon=1;
push (@{$genes_hit_polar{$coordinate}}, $gene_number[$j]);
#print "$coordinate mapped (polar) to gene $gene_number[$j]\n";
} #end gene-hit-polar section
} #end go through genes in operon section
} # end minus-strand operon section
} # end coordinate in operon section
} # end of operon section
if ($found_operon==0){
@{$genes_hit_polar{$coordinate}}[0]="none";
} # end not-found-operon section
} #end "new coordinate" loop
########
# assign input-specific data to coordinate
########
$hitcount{$coordinate}->{$sampleID}=$norm_count;
unless ($genes_hit_direct{$coordinate}[0] eq "none") {
for (my $k=0; $k< scalar @{$genes_hit_direct{$coordinate}}; $k++) { #go through each gene in the array referenced by $genes_hit_direct{$coordinate} and add hitcount for this coordinate/input
$gene_hitcount_direct{$genes_hit_direct{$coordinate}[$k]}->{$sampleID}=0 if !defined $gene_hitcount_direct{$genes_hit_direct{$coordinate}[$k]}->{$sampleID};
$gene_hitcount_polar{$genes_hit_polar{$coordinate}[$k]}->{$sampleID}=0 if !defined $gene_hitcount_polar{$genes_hit_polar{$coordinate}[$k]}->{$sampleID};
$gene_unique_sites{$genes_hit_direct{$coordinate}[$k]}->{$sampleID}=0 if !defined $gene_unique_sites{$genes_hit_direct{$coordinate}[$k]}->{$sampleID};
$gene_hitcount_direct{$genes_hit_direct{$coordinate}[$k]}->{$sampleID}=$gene_hitcount_direct{$genes_hit_direct{$coordinate}[$k]}->{$sampleID}+$hitcount{$coordinate}->{$sampleID};
$gene_hitcount_polar{$genes_hit_polar{$coordinate}[$k]}->{$sampleID}=$gene_hitcount_polar{$genes_hit_polar{$coordinate}[$k]}->{$sampleID}+$hitcount{$coordinate}->{$sampleID};
$gene_unique_sites{$genes_hit_direct{$coordinate}[$k]}->{$sampleID}++;
} #end add direct hits
} #end coordinate hits direct genes
unless ($genes_hit_polar{$coordinate}[0] eq "none") {
for (my $l=0; $l<scalar @{$genes_hit_polar{$coordinate}}; $l++){ #go through each gene in the array referenced by $genes_hit_polar{$coordinate} and add hitcount for this coordinate/input
$gene_hitcount_polar{$genes_hit_polar{$coordinate}[$l]}->{$sampleID}=0 if !defined $gene_hitcount_polar{$genes_hit_polar{$coordinate}[$l]}->{$sampleID};
$gene_hitcount_polar{$genes_hit_polar{$coordinate}[$l]}->{$sampleID}=$gene_hitcount_polar{$genes_hit_polar{$coordinate}[$l]}->{$sampleID}+$hitcount{$coordinate}->{$sampleID};
} #end add polar hits
} #end coordinate hits polar genes
} #end "if line starts with ">" loop
} #end of "go through each line of input" loop
########
# Produce output files
########
my @columns;
foreach my $key (sort (keys %sampleIDs)){
push (@columns, $key);
}
my $output1=$ARGV[4]."_genes_direct.txt";
my $output2=$ARGV[4]."_genes_polar.txt";
my $output3=$ARGV[4]."_genes_sites.txt";
my $output4=$ARGV[4]."_insertions.txt";
open OUT, ">$output1";
open OUT2, ">$output2";
open OUT3, ">$output3";
open OUT4, ">$output4";
# write output files
# print header info
print OUT "GeneID\t";
print OUT2 "GeneID\t";
print OUT3 "GeneID\t";
print OUT4 "Coordinate\tGenes\t";
for (my $i=0; $i<scalar @columns; $i++){
print OUT "$columns[$i]\t";
print OUT2 "$columns[$i]\t";
print OUT3 "$columns[$i]\t";
print OUT4 "$columns[$i]\t";
}
print OUT "\n";
print OUT2 "\n";
print OUT3 "\n";
print OUT4 "\n";
for (my $j=0; $j<$number_of_genes; $j++){ #go through each gene
print OUT "$gene_number[$j]\t";
print OUT2 "$gene_number[$j]\t";
print OUT3 "$gene_number[$j]\t";
for (my $k=0; $k<scalar @columns; $k++){
$gene_hitcount_direct{$gene_number[$j]}->{$columns[$k]}=0 if !defined $gene_hitcount_direct{$gene_number[$j]}->{$columns[$k]};
$gene_hitcount_polar{$gene_number[$j]}->{$columns[$k]}=0 if !defined $gene_hitcount_polar{$gene_number[$j]}->{$columns[$k]};
$gene_unique_sites{$gene_number[$j]}->{$columns[$k]}=0 if !defined $gene_unique_sites{$gene_number[$j]}->{$columns[$k]};
print OUT "$gene_hitcount_direct{$gene_number[$j]}->{$columns[$k]}\t";
print OUT2 "$gene_hitcount_polar{$gene_number[$j]}->{$columns[$k]}\t";
print OUT3 "$gene_unique_sites{$gene_number[$j]}->{$columns[$k]}\t";
}
print OUT "$annotation[$j]\n";
print OUT2 "$annotation[$j]\n";
print OUT3 "$annotation[$j]\n";
}
#print out coordinate data to insertions.txt file
foreach my $key (keys %hitcount){ #go through each key (coordinate) in %hitcount
print OUT4 "$key\t";
for (my $k=0; $k< scalar @{$genes_hit_direct{$key}}; $k++) { #go through each gene in the array referenced by $genes_hit_direct{$key}
print OUT4 "$genes_hit_direct{$key}[$k]";
unless ($k==((scalar @{$genes_hit_direct{$key}})-1)){
print OUT4 ":";
}
}
print OUT4 "\t";
for (my $j=0; $j<scalar @columns; $j++){
$hitcount{$key}->{$columns[$j]} = 0 if !defined $hitcount{$key}->{$columns[$j]};
print OUT4 "$hitcount{$key}->{$columns[$j]}\t";
}
print OUT4 "\n";
}