Content - 250897e87a4606c84968cbbbeb81dc6d0c4a2e76 - 89f85eb/rankOnCaddScore.pl

visit type:
Tip revision: 3b569742cedc2eded1d60bdaba1c2ee91134c8e1 authored by David A. Parry on 10 July 2020, 10:56:40 UTC
Update readme.md
Tip revision: 3b56974
rankOnCaddScore.pl
#!/usr/bin/env perl

use warnings;
use strict; 
use POSIX qw/strftime/;
use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
use Getopt::Long;
use Pod::Usage;
use Data::Dumper;
use Bio::DB::HTS::Tabix; 
use FindBin qw($RealBin);
use lib "$RealBin/lib";
use lib "$RealBin/lib/dapPerlGenomicLib";
use VcfReader 0.3;
use VcfhacksUtils;

my @cadd_files = ();
my @cadd_dirs = ();
my %opts = (cadd_file => \@cadd_files, dir => \@cadd_dirs);
GetOptions(
        \%opts,
        "input|i=s",
        "output|o=s",
        "cadd_file|c=s{,}",
        "dir|r=s{,}",
        "not_found|n=s",
        "do_not_rank|d",
        "filter|f=f",
        "keep_unscored|k",
        "progress|p",
        "help|h",
        "manual|m",
        ) or pod2usage(-message => "Syntax Error.", exitval => 2);

pod2usage(-verbose => 2, -exitval => 0) if $opts{manual};
pod2usage(-verbose => 1, -exitval => 0) if $opts{help};
pod2usage(-message => "-i/--input argument is required", exitval => 2) if not $opts{input};
pod2usage(-message => "Either -c/--cadd_file or -r/--dir argument is required", exitval => 2) if not @{$opts{cadd_file}} and not @{$opts{dir}};
pod2usage(-message => "-f/--filter argument cannot be less than 0", exitval => 2) if defined $opts{filter} && $opts{filter} < 0;

my $sortex;
if (not $opts{do_not_rank}){
    if (-s $opts{input} > (1024**2 * 50)){#use Sort::External if file size bigger than 50 MB
        eval "use Sort::External; 1" or print STDERR "Sort::External module was not found - ".
            "will attempt to sort in memory. For huge files it is recommended to install Sort::External.\n";
        if (not $@){
            my $scheme = sub {
                substr($Sort::External::b, 0, 6) <=> 
                substr($Sort::External::a, 0, 6) };#rev numeric sort CADD scores
            $sortex = Sort::External->new
            (
                mem_threshold => 1024**2 * 24, 
                sortsub => $scheme,
            );
        }
    }
}
my %cadd_iters = ();
foreach my $dir (@{$opts{dir}}){
    opendir (my $DIR, $dir) or die "Can't read directory $dir: $!\n";
    my @c = map {"$dir/$_"} grep { /\.(b)*gz$/ } readdir $DIR;
    die "No .gz or .bgz files found in directory $dir.\n" if not @c;
    push @{$opts{cadd_file}}, @c;
}
foreach my $cadd_file (@{$opts{cadd_file}}){
    checkCaddFile($cadd_file);
    $cadd_iters{$cadd_file} = Bio::DB::HTS::Tabix->new(filename =>  $cadd_file);
}

my ($header, $first_var, $VCF)  = VcfReader::getHeaderAndFirstVariant($opts{input});
die "Header not ok for input ($opts{input}) "
    if not VcfReader::checkHeader( header => $header );
my $NOT_FOUND;
if ($opts{not_found}){
    open ($NOT_FOUND, ">$opts{not_found}") or die "Can't open $opts{not_found} for writing: $!\n";
}
my $OUT;
if ($opts{output}){
    open ($OUT, ">$opts{output}") or die "Can't open $opts{output} for writing: $!\n";
}else{
    $OUT = \*STDOUT;
}

printHeader();

my @variants = ();
my $n         = 0; #variants/lines
my $scored    = 0;
my $not_found = 0;
my $filtered  = 0;
my $progressbar;
my $next_update = 0;
my $time = strftime("%H:%M:%S", localtime);
my $total_vcf = 0;
if (defined $opts{progress}){
    ($progressbar, $total_vcf) = VcfhacksUtils::getProgressBar(
        input  => $opts{input},
        name => "Annotating", 
    );
}else{
    print STDERR "CADD annotation commencing: $time\n";
}
processLine($first_var); 
while (my $line = <$VCF>){
    processLine($line); 
}

sub processLine{
    my $line = shift;
    next if $line =~ /^#/;
    $n++;
    chomp $line;
    my @split = split("\t", $line, 9); #only need first 8 fields, possible 
                                       #optimization for VERY long lines
    my %min_vars = VcfReader::minimizeAlleles(\@split);
    #get score for each allele or '-' if it can't be found
    my @cadd_scores = findCaddScore(\%min_vars, \%cadd_iters);
    my @nf_alts = getAltsWithoutScore(\@cadd_scores, \%min_vars);
    $scored += @cadd_scores - @nf_alts;
    foreach my $al (@nf_alts){
        next if $min_vars{$al}->{ALT} eq '*';
        $not_found++;
        if ($opts{not_found}){
            print $NOT_FOUND join
            (
                "\t", 
                $min_vars{$al}->{CHROM},
                $min_vars{$al}->{POS},
                '.',
                $min_vars{$al}->{REF},
                $min_vars{$al}->{ALT},
            ) . "\n";
        }
    }
    my $max = 
    (
        sort {$b <=> $a} 
        grep {! /^\.$/ } 
        @cadd_scores
    )[0];
    #if there is no score set $max to -1 so it goes to bottom of the pile
    $max = -1 if not defined $max;
    if ($opts{filter}){
        if($max < $opts{filter}){
            unless ($opts{keep_unscored} and $max == -1){
                $filtered++;
                next;
            }
        }
    }
    my $out_line = VcfReader::addVariantInfoField 
    (
        line  => \@split,
        id    => 'CaddPhredScore', 
        value => join(",", @cadd_scores),
    );
    if ($opts{do_not_rank}){
        print $OUT join("\t", @$out_line ) . "\n";
    }else{
        push @variants, sprintf("%-6s%s", $max, join("\t", @$out_line) );
        if ($sortex){#no Sort::External , sort in memory
            if (@variants > 99999){
                $sortex->feed(@variants);
                @variants = ();
            }
        }
    }
    updateProgress();
}
if ($progressbar){
    $progressbar->update($total_vcf) if $total_vcf >= $next_update;
}

print STDERR "\nDone annotating CADD scores.\n";
unless ($opts{do_not_rank}){
    $n = 0;
    $next_update = 0;
    if (defined $opts{progress} and $total_vcf){
        $progressbar = Term::ProgressBar->new
        (
            {
                name => "Writing", 
                count => $total_vcf, 
                ETA => "linear", 
            }
        );
    }
    if ($sortex){
        $sortex->feed(@variants) if @variants;
        $sortex->finish;
        print STDERR "Writing output...\n";
        while ( defined( $_ = $sortex->fetch ) ) {
            print $OUT substr($_, 6) ."\n";
            $n++;
            if ($progressbar){
               $next_update = $progressbar->update($n) if $n >= $next_update;
            }
        }
    }else{
        print STDERR "Performing sort in memory...\n";
        @variants = sort
        {
            substr($b, 0, 6) <=> substr($a, 0, 6) 
        } @variants;#rev numeric sort CADD scores
        print STDERR "Writing output...\n";
        foreach my $var (@variants){
            print $OUT substr($var, 6) ."\n";
            $n++;
            if ($progressbar){
               $next_update = $progressbar->update($n) if $n >= $next_update;
            }
        }
    }
    if ($progressbar and $total_vcf){
        $progressbar->update($total_vcf) if $total_vcf >= $next_update;
    }
}
$time = strftime("%H:%M:%S", localtime);
print STDERR "Time finished: $time\n";
print STDERR "$n variants processed\n";
print STDERR "$scored alleles scored\n";
print STDERR "$not_found alleles not found.\n";
if ($opts{filter}){
    print STDERR "$filtered variants filtered on CADD score.\n";
}

##########################
sub updateProgress{
    if ($progressbar) {
        if ($n >= $next_update){
            $next_update = $progressbar->update( $n )
        }
    }elsif($opts{progress}){
        VcfhacksUtils::simpleProgress($n, 0, " variants processed" );
    }
}

##########################
sub findCaddScore{
    my ($vars, $tabix_iter) = @_;
    my @scores = ();#return score for each allele
ALLELE: foreach my $al (sort {$a<=>$b} keys %{$vars}){
        foreach my $iter (keys %{$tabix_iter}){
            my $it = $tabix_iter->{$iter}->query
            ( 
                "$vars->{$al}->{CHROM}:$vars->{$al}->{POS}-" . 
                ($vars->{$al}->{POS} + 1) 
            );
            next if not defined $it;
            while (my $result = $it->next){
                chomp($result);
                my @res = split("\t", $result);
                next if $res[0] ne $vars->{$al}->{CHROM};
                my ($pos, $ref, $alt) = VcfReader::reduceRefAlt
                (
                    $res[1], 
                    $res[2], 
                    $res[3]
                );
                next if ($vars->{$al}->{POS} != $pos);
                next if ($vars->{$al}->{REF} ne $ref); #should we error here?
                next if ($vars->{$al}->{ALT} ne $alt); #diff alt allele
                push @scores, $res[5];
                next ALLELE;
            }
        }#not found in any of our tabix iterators
        push @scores, '.';
    }
    return @scores;
}

##########################
sub checkCaddFile{
    my ($file) = @_;
    if ($file !~ /\.gz$/){
        die "CADD file $file does not have a '.gz' extension and is ".
            "presumably not bgzip compressed. Please ensure to use a ".
            "bgzip compressed CADD file.\n";
    }
    my $FH = new IO::Uncompress::Gunzip $file or 
      die "IO::Uncompress::Gunzip failed while opening $file ".
      "for reading: \n$GunzipError";
    my $valid_header = 0;
    while (my $line = <$FH>){
        if ($line =~ /^##/){
            next;
        }elsif ($line =~ /^#/){
            if ($line !~ /^#Chrom\tPos\tRef\tAlt\t\w+Score\tPHRED/i){
                die "Invalid header line:\n\n$line\n\n".
                "Expected to find a header as follows:\n\n".
                "#Chrom\tPos\tRef\tAlt\tRawScore\tPHRED\n";
            }else{
                $valid_header++;
            }
        }else{
            last;
        }
    }
    close $FH;
    die "No valid header found for CADD file $file!\n" if not $valid_header;

    my $index = "$file.tbi";
    if (not -e $index ){
        warn "No index found for $file - attempting to index with tabix...\n";
        VcfReader::indexVcf($file);
        warn "Done.\n";
    }
}

##########################
sub getAltsWithoutScore{
    my ($cadd_scores) = @_;
    my @alts = ();
    for (my $i = 0; $i < @$cadd_scores; $i++ ){
        if ($cadd_scores->[$i] eq '.'){
            #allele 1 will be first (i.e. pos 0) in array etc. 
            #so increment by 1
            push @alts, $i + 1;
        }
    }
    return @alts;
}

##########################
sub printHeader{
    my %inf_h  = 
    (
        ID          =>  "CaddPhredScore",
        Number      =>  "A",
        Type        =>  "Float",
        Description =>  "Variant site CADD phred score. Score provided " . 
                        "for each allele and given as '.' if not found.",
    );
    my @headstring = grep {/^##/} @$header;
    push @headstring, VcfhacksUtils::getInfoHeader(%inf_h);
    push @headstring, VcfhacksUtils::getOptsVcfHeader(%opts);  
    push @headstring, "$header->[-1]";

    print $OUT join("\n", @headstring) . "\n"; 
}

##########################
=head1 NAME

rankOnCaddScore.pl - annotate, rank and/or filter variants using CADD PHRED scores.

=head1 SYNOPSIS

        rankOnCaddScore.pl -i [vcf file] -c [tabix indexed cadd file(s)] [options]
        rankOnCaddScore.pl -h (display help message)
        rankOnCaddScore.pl -m (display manual page)

=cut

=head1 ARGUMENTS

=over 8

=item B<-i    --input>

Input VCF file.

=item B<-c    --cadd_file>

No, not a murder-solving monk, but one or more bgzip compressed and tabix indexed file of CADD scores. CADD score files are available from http://cadd.gs.washington.edu/download and can also be generated by uploading your own data to http://cadd.gs.washington.edu/score.

=item B<-r    --dir>

One or more directories containing bgzip compressed and tabix indexed file of CADD scores as above. Only files with '.gz' or '.bgz' file extensions will be included.

=item B<-o    --output>

Output CADD score annotated/filtered file. Optional - default is STDOUT.

=item B<-n    --not_found>

Output file for variants that can't be found in CADD files.  This output can be uploaded to the CADD server (http://cadd.gs.washington.edu/score) for scoring. The output from CADD server can then be added to your CADD files used in analysing future data if desired.
        
=item B<-d    --do_not_rank>

Use this flag to annotate variants but not reorder your VCF.

=item B<-f    --filter>

CADD Phred score cut-off. Filter any variant that has a CADD score below this score. 

=item B<-k    --keep_unscored>

Use this flag when filtering on CADD score to retain any variants for which a CADD score can not be found in your output.

=item B<-p    --progress>

Use this flag to print % progress to STDERR.

=item B<-h    --help>

Display help message.

=item B<-m    --manual>

Show manual page.

=back 

=cut


=head1 EXAMPLES

  rankOnCaddScore.pl -i input.vcf -o cadd_ranked.vcf -c cadd_file1.tsv.gz cadd_file2.tsv.gz
  (rank input.vcf on CADD score and output to cadd_ranked.vcf)

  rankOnCaddScore.pl -i input.vcf -o cadd_ranked.vcf -c cadd_file1.tsv.gz cadd_file2.tsv.gz -f 15
  (rank input.vcf on CADD score and filter anything with a CADD PHRED score below 15)

  rankOnCaddScore.pl -i input.vcf -o cadd_annotated.vcf -c cadd_file1.tsv.gz cadd_file2.tsv.gz -d
  (annotate but don't rank)

  rankOnCaddScore.pl -i input.vcf -o cadd_annotated.vcf -c cadd_file1.tsv.gz cadd_file2.tsv.gz -d -f 15
  (annotate but don't rank and filter anything  with a CADD PHRED score below 15)

  rankOnCaddScore.pl -i input.vcf -o cadd_annotated.vcf -c cadd_file1.tsv.gz cadd_file2.tsv.gz -d -f 15 -n not_found.tsv
  (as above but outputting variants that don't have CADD scores to not_found.tsv)

  rankOnCaddScore.pl -i input.vcf -o cadd_annotated.vcf -r /path/to/cadd/dir -d -f 15 -n not_found.tsv
  (as above but specifying a directory containing CADD score files rather than individual files)

=head1 DESCRIPTION

This program reads a VCF file and searches the provided CADD score files for matching variants. Upon finding a corresponding variant in a CADD file it annotates the variant INFO field with "CaddPhredScore=" and a score for each allele (or . when a score is not found for an allele). By default variants are ordered by CADD PHRED score (or the highest CADD PHRED score for a multiallelic site) but this can be avoided using the --do_not_sort option. You may also filter on CADD score, but be aware that this is likely to be somewhat arbitrary. Advice from the authors suggests a PHRED score of 15 may be a good value to filter from because it is the median value for all possible canonical splice site changes and non-synonymous variants.

CADD files for all possible SNVs in the human genome can be downloaded from http://cadd.gs.washington.edu/download as well as indels present in 1000 genomes samples and ESP samples. When variants are not found you can use the --not_found option to output these to a file which can then be uploaded to http://cadd.gs.washington.edu/score to score these variants.  These can then be added to your database for future analyses after tabix indexing the output (tabix -s 1 -b 2 -e 2 cadd_output.tsv.gz). I would suggest collecting all your various CADD score files in one folder so that you can pass '/data/cadd_score/*.gz' to the --cadd_file option to analyze all the files in that folder.

=cut

=head1 AUTHOR

David A. Parry

=head1 COPYRIGHT AND LICENSE

Copyright 2014,2015  David A. Parry

This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.

=cut
Browse the archive

https://github.com/gantzgraf/vcfhacks