https://github.com/Ettwiller/TSS
Tip revision: 5382267dc89460b2ec30e33508dff4479c71c3ef authored by Laurence Ettwiller on 14 February 2019, 18:51:53 UTC
updated to new samtools requirement
updated to new samtools requirement
Tip revision: 5382267
filter_tss.pl
#!/usr/bin/perl
use strict;
use Getopt::Long qw(GetOptions);
#this program takes two gtf files (output of bam2firstbasegtf.pl) corresponding to the control library (--control) and the cappable-seq library (--tss).
#and output the result (in gtf format into $OUT file (from --out)
my $error_sentence = "USAGE : perl $0 --tss tss_file --control control_file --out output.gtf\n OPTIONAL : --cutoff log2(ratio) --Rformat 1";
my $tssfile;
my $controlfile;
my $CUTOFF = 0;
my $Rformat =0;
my $OUT;
#================================
#getting all the options :
GetOptions ("tss=s" => \$tssfile, # numeric
"control=s" => \$controlfile, #gtf from bam2firstbasebam.pl
"cutoff=s" => \$CUTOFF, #enrichment score
"Rformat=s" => \$Rformat,
"out=s" => \$OUT #output file
) or die "$error_sentence";
#================================
#checking if everything is fine :
if (!$tssfile || !$controlfile || !$OUT) {die "$error_sentence\n";}
if(-e $OUT) { die "File $OUT Exists, please remove old file or rename output file (--out)"};
#=================================
my ($tss, $total_tss) = parse($tssfile);
my ($control, $total_control) = parse($controlfile);
#open the output file :
my($day, $month, $year)=(localtime)[3,4,5];
open(OUT, ">$OUT") or die "can't save result into $OUT\n";
print OUT "## File generated by $0 on $day-".($month+1)."-".($year+1900)."\n";
foreach my $chr(keys %$tss)
{
my $pos = $$tss{$chr};
foreach my $position (sort {$a<=>$b} keys %$pos)
{
my $dir = $$pos{$position};
foreach my $direction (keys %$dir)
{
my $enrichement_score="NaN"; my $score_control="NaN"; my $score_tss = "NaN";
if ($$control{$chr}{$position}{$direction})
{
$score_control = $$control{$chr}{$position}{$direction}{"score"};
$score_tss = $$tss{$chr}{$position}{$direction}{"score"};
$enrichement_score = log($score_tss/$score_control)/log(2);
}
else {
#no read for the control at this position -we add a pseudocount
$score_control = sprintf('%.3f',(1/$total_control)*1000000);
$score_tss = $$tss{$chr}{$position}{$direction}{"score"};
$enrichement_score = log($score_tss/$score_control)/log(2);
}
if ($enrichement_score >= $CUTOFF)
{
my $line = $$tss{$chr}{$position}{$direction}{"line"};
if ($Rformat ==0)
{
my @tmp = split /\t/, $line;
my $id= $tmp[2];
$id = $id.$enrichement_score;
$tmp[2]= $id;
my $new_line = join("\t", @tmp);
$new_line = $new_line.";enrichment_score=$enrichement_score;score_control=$score_control";
print OUT "$new_line\n";
}
if ($Rformat ==1)
{
print OUT "$score_tss\t$score_control\t$enrichement_score\n";
}
}
}
}
}
close OUT;
sub parse {
my ($file) =@_;
my $total_reads=0;
my %result;
open (FILE, $file) or die "can't open $file\n";
foreach my $line (<FILE>)
{
chomp $line;
if ($line !~ /^\#/) #not a header file !!
{
my @tmp = split /\t/, $line;
my $chr = $tmp[0];
my $pos = $tmp[3];
my $score = $tmp[5];
my $info = $tmp[8];
my $sens = $tmp[6];
$info =~/number_of_read=(\d+);total_number_of_read=(\d+)/;
my $read = $1;
$total_reads = $2;
$result{$chr}{$pos}{$sens}{"line"}=$line;
$result{$chr}{$pos}{$sens}{"score"}=$score;
$result{$chr}{$pos}{$sens}{"read"}=$read;
}
}
close FILE;
return(\%result, $total_reads);
}