https://github.com/Ettwiller/TSS
Raw File
Tip revision: 5382267dc89460b2ec30e33508dff4479c71c3ef authored by Laurence Ettwiller on 14 February 2019, 18:51:53 UTC
updated to new samtools requirement
Tip revision: 5382267
filter_tss.pl
#!/usr/bin/perl

use strict;
use Getopt::Long qw(GetOptions);


#this program takes two gtf files (output of bam2firstbasegtf.pl) corresponding to the control library (--control) and the cappable-seq library (--tss). 
#and output the result (in gtf format into $OUT file (from --out)



my $error_sentence = "USAGE : perl $0 --tss tss_file --control control_file --out output.gtf\n OPTIONAL : --cutoff log2(ratio) --Rformat 1";
my $tssfile;
my $controlfile;
my $CUTOFF = 0;
my $Rformat =0;
my $OUT;

#================================
#getting all the options :
GetOptions ("tss=s" => \$tssfile,    # numeric
	    "control=s" => \$controlfile, #gtf from bam2firstbasebam.pl
	    "cutoff=s" => \$CUTOFF, #enrichment score
	    "Rformat=s" => \$Rformat,
	     "out=s" => \$OUT #output file
    ) or die "$error_sentence";

#================================
#checking if everything is fine :
if (!$tssfile || !$controlfile || !$OUT) {die "$error_sentence\n";}
if(-e $OUT) { die "File $OUT Exists, please remove old file or rename output file (--out)"};
#=================================                                                                                                                         
my ($tss, $total_tss) = parse($tssfile);
my ($control, $total_control) = parse($controlfile);

#open the output file :
my($day, $month, $year)=(localtime)[3,4,5];  
open(OUT, ">$OUT") or die "can't save result into $OUT\n";
print OUT "## File generated by $0 on $day-".($month+1)."-".($year+1900)."\n";

foreach my $chr(keys %$tss)
{
    my $pos = $$tss{$chr};
    foreach my $position (sort {$a<=>$b} keys %$pos)
    {
	my $dir = $$pos{$position};
	foreach my $direction (keys %$dir)
	{
	    my $enrichement_score="NaN"; my $score_control="NaN"; my $score_tss = "NaN";
	    if ($$control{$chr}{$position}{$direction})
	    {
		$score_control = $$control{$chr}{$position}{$direction}{"score"};
		$score_tss = $$tss{$chr}{$position}{$direction}{"score"};


		$enrichement_score = log($score_tss/$score_control)/log(2);

	    }
	    else {
		#no read for the control at this position -we add a pseudocount
		$score_control = sprintf('%.3f',(1/$total_control)*1000000);
		$score_tss = $$tss{$chr}{$position}{$direction}{"score"};
		$enrichement_score = log($score_tss/$score_control)/log(2);
		
	    }
	    if ($enrichement_score >= $CUTOFF)
	    {
		my $line = $$tss{$chr}{$position}{$direction}{"line"};
		if ($Rformat ==0)
		{
		    my @tmp = split /\t/, $line;
		    my $id= $tmp[2];
		    $id = $id.$enrichement_score;
		    $tmp[2]= $id;
		    my $new_line = join("\t", @tmp);
		    $new_line = $new_line.";enrichment_score=$enrichement_score;score_control=$score_control";
		    print OUT "$new_line\n";
		}
		if ($Rformat ==1)
		{
		    print OUT "$score_tss\t$score_control\t$enrichement_score\n";
		}
	    }

	}
    }
    
}
close OUT;

sub parse {
    my ($file) =@_;
    my $total_reads=0;
    my %result;
    open (FILE, $file) or die "can't open $file\n";
    foreach my $line (<FILE>)
    {
	chomp $line;
	if ($line !~ /^\#/) #not a header file !!
	{
	    my @tmp = split /\t/, $line;
	    my $chr = $tmp[0];
	    my $pos = $tmp[3];
	    my $score = $tmp[5];
	    my $info = $tmp[8];
	    my $sens = $tmp[6];
	    $info =~/number_of_read=(\d+);total_number_of_read=(\d+)/;
	    my $read = $1;
	    $total_reads = $2;
	    
	    $result{$chr}{$pos}{$sens}{"line"}=$line;
	    $result{$chr}{$pos}{$sens}{"score"}=$score;
	    $result{$chr}{$pos}{$sens}{"read"}=$read;
	}
    }
   
    close FILE;
    return(\%result, $total_reads);
}
back to top