https://github.com/Ettwiller/TSS
Raw File
Tip revision: 5382267dc89460b2ec30e33508dff4479c71c3ef authored by Laurence Ettwiller on 14 February 2019, 18:51:53 UTC
updated to new samtools requirement
Tip revision: 5382267
bam2firstbasegtf.pl
#!/usr/bin/perl
use strict;
use Getopt::Long qw(GetOptions);
use File::Temp qw(tempfile);

my $error_sentence = "USAGE : perl $0 --bam bamfile --out output.gtf\nOPTIONAL : --lib_type FR (default F) --cutoff 1 (default 0) --absolute 1\n";

=comment
Paired reads:

RF: first read (/1) of fragment pair is sequenced as anti-sense (reverse(R)), and second read (/2) is in the sense strand (forward(F)); typical of the dUTP/UDG sequencing method.

FR: first read (/1) of fragment pair is sequenced as sense (forward), and second read (/2) is in the antisense strand (reverse)

Unpaired (single) reads:

F: the single read is in the sense (forward) orientation

R: will not work!
 INPUT :
bam file X.bam (preferably generated through local alignment such as Bowtie --local)
out output file name
#OUTPUT :
#gtf file
=cut 

#OPTIONS :
my $bamfile;
my $lib_type = "F";
my $CUTOFF = 0;
my $OUT; #output file name (gtf file)
my $absolute = 0;
#================================= 
#get options :
GetOptions ("bam=s" => \$bamfile,    # the bam file containing the mapped reads
	    "lib_type=s" => \$lib_type, #the type of RNA library used during the experiment
	    "cutoff=s" => \$CUTOFF,#read number cutoff (see paper)
	    "out=s" => \$OUT, #output file name.
	    "absolute=s" => \$absolute
    ) or die "USAGE : perl $0 $error_sentence";

#=================================
#if something went wrong, notify :
if (!$bamfile || !$OUT) {die "$error_sentence"};
if ($lib_type ne "FR" && $lib_type ne "RF" && $lib_type ne "F"){ die "USAGE : perl $0 $error_sentence";
}
#do not override a file 
if(-e $OUT) { die "File $OUT Exists, please remove old file or rename output file (--out)"};
#================================= 
#start the main program :
my $generic =  clean_name($bamfile);
my $resulting_bam;


if ($lib_type eq "F") #single read library with R1 being the most 5' end of the transcripts. 
{
    $resulting_bam = $bamfile;
}
elsif ($lib_type eq "FR")
{
    my $tmp = $generic."_R1";
    #extract R1 from bam and sort the reads
    my $library_command = "samtools view -f64 -b $bamfile | samtools sort - $tmp";
    $resulting_bam = $tmp.".bam";
    system($library_command);
}
elsif ($lib_type eq "RF")
{
    my $tmp = $generic."R2";
    #extract R2 from bam and sort the reads                                                                                                               
    my $library_command = "samtools view -f128 -b $bamfile | samtools sort - $tmp";
    $resulting_bam = $tmp.".bam";
    system($library_command);
}

#get the total number of reads that map to the genome.
my $count_mapped_read = `samtools view -c -F4 $resulting_bam`;
chomp $count_mapped_read;
print STDERR "number of reads mapping to genome = $count_mapped_read\n";

#create a tmp file containing the tmp bed. 
my $file_tmp = new File::Temp( UNLINK => 1 );
my $command = "bedtools bamtobed -cigar  -i $resulting_bam > $file_tmp"; 
system($command);
my $result = parse_bed($file_tmp);
unlink($file_tmp);
my $total_count = 0;

my($day, $month, $year)=(localtime)[3,4,5];  
open(OUT, ">$OUT") or die "can't save result into $OUT\n";
print OUT "## File generated by $0 on $day-".($month+1)."-".($year+1900)."\n";


foreach my $chr (sort keys %$result)
{
    my $tmp = $$result{$chr};
    foreach my $keys (sort {$a<=>$b} keys %$tmp)
    {
	if ($$result{$chr}{$keys}{"-"}{"count"})
        {
            #get the RRS score for the position key and orientation -                                                                                     
	    my $neg;
            my $relative_neg = sprintf('%.3f',($$result{$chr}{$keys}{"-"}{"count"}/$count_mapped_read)*1000000);
            my $abs_neg = $$result{$chr}{$keys}{"-"}{"count"};
	    if ($absolute ==0) {$neg = $relative_neg;}
            if ($absolute ==1) {$neg = $abs_neg;} #option to count the absolute number of reads instead of the relative # of reads
	    if ($neg >= $CUTOFF)
            {
                my $start = $keys;
                $total_count ++;
                my $id = $chr."_".$start."_".$abs_neg."_-_";
		

                print OUT "$chr\t$generic\t$id\t$start\t$start\t$relative_neg\t-\t.\tnumber_of_read=$abs_neg;total_number_of_read=$count_mapped_read\n";
            }
	    
        }
	
	
	if ($$result{$chr}{$keys}{"+"}{"count"})
	{
	    #get the RRS score for the position key and orientation +

	    my $pos;
	    my $relative_pos = sprintf('%.3f',($$result{$chr}{$keys}{"+"}{"count"}/$count_mapped_read)*1000000);
	    my $abs_pos = $$result{$chr}{$keys}{"+"}{"count"};
	    if ($absolute ==1) {$pos = $abs_pos;} #option to count the absolute number of reads instead of the relative # of reads  
	    if ($absolute ==0) {$pos= $relative_pos;}
	    if ($pos >= $CUTOFF)
	    {
		my $start = $keys +1;
		$total_count ++;

		my $id = $chr."_".$start."_".$abs_pos."_+_";		
		print OUT "$chr\t$generic\t$id\t$start\t$start\t$relative_pos\t+\t.\tnumber_of_read=$abs_pos;total_number_of_read=$count_mapped_read\n";
	    }
	}
	
	
    }
}
close OUT;

sub clean_name {
    my ($file)=@_;
    my $generic = $file;
    $generic =~ s/\.tss//;
    $generic =~ s/.*\///g;
    return $generic;
} 

sub parse_bed {
    my ($file)=@_;
    open (FILE, $file) or die;
    my %result;
    foreach my $line (<FILE>)
    {
	chomp $line;
	my ($start, $orientation)=undef;
	my @tmp = split /\t/, $line;
	my $chr = $tmp[0];
	$orientation = $tmp[5];
	if ($orientation eq "+")
	{
	    $start = $tmp[1];
	}
	elsif ($orientation eq "-"){
	    $start = $tmp[2];
	}
	$result{$chr}{$start}{$orientation}{"count"}++;

    
    }
    

    return \%result;
}
back to top