https://github.com/gantzgraf/vcfhacks
Tip revision: 3b569742cedc2eded1d60bdaba1c2ee91134c8e1 authored by David A. Parry on 10 July 2020, 10:56:40 UTC
Update readme.md
Update readme.md
Tip revision: 3b56974
filterGts.pl
#!/usr/bin/env perl
use strict;
use warnings;
use POSIX qw/strftime/;
use Data::Dumper;
use FindBin qw($RealBin);
use List::Util qw(first sum max);
use Getopt::Long;
use Pod::Usage;
use lib "$RealBin/lib";
use lib "$RealBin/lib/dapPerlGenomicLib";
use VcfReader 0.3;
use VcfhacksUtils;
my @ab = ();
my %opts = (a => \@ab);
GetOptions(
\%opts,
"i|input=s",
"o|output=s",
"a|ab=f{,}",
"d|depth=i",
"s|supporting_reads=i",
"p|progress",
"h|help|?",
"m|manual",
) or pod2usage( -exitval => 2, -message => "Syntax error" );
pod2usage( -verbose => 2, -exitval => 0 ) if $opts{m};
pod2usage( -verbose => 1, -exitval => 0 ) if $opts{h};
pod2usage( -exitval => 2, -message => "--input is required" ) if not $opts{i};
pod2usage
(
-exitval => 2,
-message => "--ab and/or --supporting_reads and/or --depth are required"
) if not @ab and not $opts{d} and not $opts{s};
informUser("Checking input VCF\n");
my ($header, $first_var, $VCF) = #this method means we can read from STDIN/pipe
VcfReader::getHeaderAndFirstVariant($opts{i});
die "VCF header not OK for $opts{i}\n"
if not VcfReader::checkHeader(header => $header);
my %samples_to_columns = VcfReader::getSamples
(
header => $header,
get_columns => 1
);
my @samp_order = sort {$samples_to_columns{$a} <=> $samples_to_columns{$b}} keys %samples_to_columns;
die "No samples found in VCF!\n" if not %samples_to_columns;
my $OUT;
if ( $opts{o} ) {
open( $OUT, ">$opts{o}" ) || die "Can't open $opts{o} for writing: $!";
}else {
$OUT = *STDOUT;
}
my $progressbar;
my $next_update = 0;
my $total_vcf = 0;
if ( $opts{p} ){
($progressbar, $total_vcf) = VcfhacksUtils::getProgressBar(
input => $opts{input},
name => "Analyzing",
);
}else{
print STDERR "Processing variants...\n";
}
print_header();
print $OUT filterGts($first_var);
my $n = 1;
updateProgress();
while (my $l = <$VCF>){
next if $l =~ /^#/;
$n++;
chomp $l;
print $OUT filterGts($l);
updateProgress();
}
close $VCF;
close $OUT;
################################################
sub updateProgress{
return if not $opts{p};
if ($progressbar) {
$next_update = $progressbar->update($n) if $n >= $next_update;
}else{
VcfhacksUtils::simpleProgress($n, 0, " variants processed" );
}
}
################################################
sub filterGts{
my $line = shift;
my @split = split( "\t", $line );
my @gts = ();
my @calls = VcfReader::getAllSampleVariants(\@split);
my $format = VcfReader::getVariantField(\@split, "FORMAT");
SAMP: for (my $i = 0; $i < @samp_order; $i++){
my @ads = VcfReader::getSampleAlleleDepths
(
line => \@split,
sample => $samp_order[$i],
sample_to_columns => \%samples_to_columns,
);
@ads = map { $_ =~ /^\d+$/ ? $_ : 0 } @ads;
my $gt = VcfReader::getSampleGenotypeField
(
line => \@split,
field => "GT",
sample => $samp_order[$i],
sample_to_columns => \%samples_to_columns,
);
my @alts = split(/[\/\|]/, $gt);
my $depth = sum(@ads);
if (not $depth){
#might be absence of AD field, but DP field might be present
$depth = VcfReader::getSampleGenotypeField
(
line => \@split,
field => 'DP',
sample => $samp_order[$i],
sample_to_columns => \%samples_to_columns,
);
}
$depth ||= 0;
if ($opts{d} and $opts{d} > $depth){
push @gts, filterGenotype($calls[$i], $format);
next SAMP;
}
if (@ab and $depth > 0){
my $al_b = 0;
if ($alts[0] eq '0' and $alts[1] =~ /[1-9]/){ #only analyze het samples where one allele is REF
$al_b = $ads[$alts[1]]/$depth;
if ($al_b < $ab[0]){
push @gts, filterGenotype($calls[$i], $format);
next SAMP;
}elsif (@ab > 1 and $al_b > $ab[1]){
push @gts, filterGenotype($calls[$i], $format);
next SAMP;
}
}
}
if ($opts{s}){
my @non_ref = grep { $_ ne '0' and $_ ne '.' } @alts;
my $x = 0;
foreach my $non (@non_ref){
$x++ if ($ads[$non] < $opts{s});#not enough reads to support allele
}
if (@non_ref and $x >= @non_ref){#all ALT alleles have to few supporting reads
push @gts, filterGenotype($calls[$i], $format);
next SAMP;
}
}
push @gts, $calls[$i];
}
return join("\t", @split[0..8], @gts) . "\n";
}
################################################
sub filterGenotype{
my ($call, $format) = @_;
my @c = split(":", $call);
my @f = split(":", $format);
my $g = 0;
$g++ until $f[$g] eq 'GT' or $g > $#f;
if ($g > $#f){
warn "Variant missing 'GT' field!\n";
return $call;
}
$c[$g] = './.';
return join(":", @c);
}
################################################
sub print_header{
my $meta_head = join("\n", grep {/^##/} @$header);
my $headstring = '';
my $optstring = VcfhacksUtils::getOptsVcfHeader(%opts) . "\n".$header->[-1] ."\n" ;
print $OUT "$meta_head\n";
print $OUT $optstring;
}
#################################################
sub informUser{
my $msg = shift;
my $time = strftime( "%H:%M:%S", localtime );
if ($progressbar){
$progressbar->message( "[INFO - $time] $msg" );
}else{
print STDERR "[INFO - $time] $msg";
}
}
###########################################################
=head1 NAME
filterGts.pl - set genotypes to no-calls if below specified allele depths/balance
=head1 SYNOPSIS
filterGts.pl -i [vcf file] -a [allele balance cutoff]
filterGts.pl -i [vcf file] -d [minimum total allele depth]
filterGts.pl -i [vcf file] -s [minimum supporting reads]
filterGts.pl -h (display help message)
filterGts.pl -m (display manual page)
=cut
=head1 ARGUMENTS
=over 8
=item B<-i --input>
Input VCF file. Required.
=item B<-o --output>
Output file.
=item B<-d --depth>
Minimum depth. Genotypes with depth below this value will be converted to
no-calls.
=item B<-b --allele_balance>
Minimum and optional maximum alt allele ratio per sample call. If one value is
provided this will act as the minimum allele balance cutoff. If a second value
is provided this will be used as a maximum allele balance cutoff. Only
heterozygous variants where one allele is the reference nucleotide will be
filtered using this option. Valid values are between 0.00 and 1.00.
=item B<-s --supporting_reads>
Minimum number of supporting reads for alternative alleles. If all
non-reference alleles for a genotype have fewer than this number of reads
supporting them, the genotype will be changed to a no-call.
=item B<-p, --progress>
Use this flag to show a progress bar while this program is running.
=item B<-h --help>
Display help message.
=item B<-m --manual>
Show manual page.
=back
=cut
=head1 DESCRIPTION
Changes called genotypes in a VCF to no-calls ('./.') while retaining other
format fields according to user-specified allele depth options.
=head1 AUTHOR
David A. Parry
=head1 COPYRIGHT AND LICENSE
Copyright 2016, David A. Parry
This program is free software: you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation, either version 3 of the License, or (at your option) any later
version. This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details. You should have received a copy of the GNU General Public License along
with this program. If not, see <http://www.gnu.org/licenses/>.
=cut