https://github.com/gantzgraf/vcfhacks
Tip revision: 3b569742cedc2eded1d60bdaba1c2ee91134c8e1 authored by David A. Parry on 10 July 2020, 10:56:40 UTC
Update readme.md
Update readme.md
Tip revision: 3b56974
getFunctionalVariants.pl
#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;
use POSIX qw/strftime/;
use Data::Dumper;
use List::Util qw ( first max min ) ;
use File::Temp qw/ tempfile /;
use FindBin qw($RealBin);
use lib "$RealBin/lib";
use lib "$RealBin/lib/dapPerlGenomicLib";
use VcfReader 0.3;
use VcfhacksUtils;
my $progressbar;
my @samples = ();
my @classes = ();
my %default_classes = ();
my @add_classes = ();
my @damaging = ();
my @biotypes = ();
my @custom_af = ();
my @eval_filters = ();
my %opts =
(
classes => \@classes,
add_classes => \@add_classes,
d => \@damaging,
s => \@samples,
biotype_filters => \@biotypes,
j => \@custom_af,
eval_filters => \@eval_filters,
);
GetOptions(
\%opts,
"add_classes=s{,}" ,
"a|af=f" ,
"biotype_filters=s{,}",
"b|progress" ,
"canonical_only" ,
"classes=s{,}" ,
'clinvar=s',
"consensus_splice_site",
"c|cadd_filter=f",
"d|damaging=s{,}" ,
"e|equal_genotypes",
"eval_filters=s{,}",
"f|find_shared_genes:s",
"g|gq=i",
"gene_counts=s",
"h|?|help" ,
"i|input=s" ,
"j|custom_af=s{,}",
"k|keep_any_damaging" ,
"maf=f" ,
"manual" ,
"m|mode=s" ,
"no_biotype_filtering",
"n|num_matching=i",
"o|output=s" ,
"pass_filters" ,
"pl=f",
"s|samples=s{,}",
"skip_unpredicted" ,
"t|target_genes=s",
"v|var_quality=i",
"u|max_sample_allele_frequency=f",
) or pod2usage(-message => "Syntax error", -exitval => 2);
pod2usage(-verbose => 2, -exitval => 0) if ($opts{manual});
pod2usage(-verbose => 1, -exitval => 0) if ($opts{h});
if (not $opts{i}){
pod2usage
(
-message => "Syntax error - an input file must be supplied.\n",
-exitval => 2
);
}
if (defined $opts{af} && ($opts{af} < 0 or $opts{af} > 1.0)){
pod2usage
(
-message => "-a/--allele_frequency option requires a value between 0.00 and 1.00 to ".
"filter on allele frequency.\n",
-exitval => 2
);
}
if (defined $opts{u} ){
if ($opts{u} < 0 or $opts{u} > 1.0){
pod2usage
(
-message => "-u/--max_sample_allele_frequency option requires a ".
"value between 0.00 and 1.00 to filter on allele ".
"frequency.\n",
-exitval => 2
);
}
if (not @samples){
informUser
(
"WARNING: No 'samples specified - all samples will be used for ".
"-u/--max_sample_allele_frequency calculation\n"
);
# pod2usage
# (
# -message => "-u/--max_sample_allele_frequency option requires ".
# "samples to be specified with the -s/--samples ".
# "argument.\n",
# -exitval => 2
# );
}
}
if (defined $opts{f}){
informUser
(
"WARNING: No 'samples specified - all samples will be used for ".
"--find_shared_genes option\n"
) if not @samples;
# pod2usage
# (
# -message => "--find_shared_genes option requires at least one " .
# "sample to be specified with the -s/--samples argument.\n",
# -exitval => 2
# ) if not @samples;
}
#GQs are >= 0
$opts{v} = defined $opts{v} ? $opts{v} : 0;
pod2usage(
-message => "SYNTAX ERROR: variant quality scores (-v/--var_quality)".
" must be 0 or greater.\n",
-exitval => 2
) if ( $opts{v} < 0 );
$opts{g} = defined $opts{g} ? $opts{g} : 20;
pod2usage(
-message => "SYNTAX ERROR: Genotype quality (-g/--gq) ".
"scores must be 0 or greater.\n",
-exitval => 2
) if $opts{g} < 0;
if (defined $opts{pl}){
pod2usage(
-message => "SYNTAX ERROR: Genotype quality (--pl) ".
"scores must be 0 or greater.\n",
-exitval => 2
) if $opts{pl} < 0;
}
#CADD phred score filter is >= 0 (0 means no CADD filtering)
if (defined $opts{c}){
pod2usage(
-message => "SYNTAX ERROR: -c/--cadd_filter cannot be a negative value.\n",
-exitval => 2
) if $opts{c} < 0;
}
#get and check header
my ($header, $first_var, $VCF) = VcfReader::getHeaderAndFirstVariant($opts{i});
die "ERROR: Invalid VCF header in $opts{i}\n"
if not VcfReader::checkHeader(header => $header);
my %sample_to_col = VcfReader::getSamples
(
header => $header,
get_columns => 1,
);
#check --gene_counts arguments
my %gene_count_opts = ();
if ($opts{gene_counts}){
checkGeneCountArgs();
}
if ( ( not @samples and (defined $opts{f} or $opts{u} or $opts{gene_counts}) )
or ( grep { /^all$/i } @samples )
){
# if not samples specified but using --find_shared_genes, --gene_counts
# or --max_sample_allele_frequency option
# or if 'all' is listed in --samples option
# add all samples
if (not %sample_to_col ){
if ( $gene_count_opts{count_mode} ne 'allele_counts'
){
die "No samples found in VCF header!\n";
}
}
@samples = ();#need to remove 'all' if nothing else
push @samples, keys %sample_to_col;
}
@samples = VcfhacksUtils::removeDups(@samples);
#get available INFO fields from header
my %info_fields = VcfReader::getInfoFields(header => $header);
#check mode
if ($opts{m}){
if ( lc($opts{m}) ne 'vep' and lc($opts{m}) ne 'snpeff' ){
die <<EOT
SYNTAX ERROR: Unrecognised value for --mode: '$opts{m}'.
Valid values are 'vep' or 'snpeff'.
EOT
;
}
$opts{m} = lc($opts{m});
}
#check VEP/SNPEFF header and get annotation order
# - will set $opts{m} if not already defined
my %csq_header = getAndCheckCsqHeader();
# hash of functional annotations from SnpEff/VEP to their annotation index
my $feature_id = $opts{m} eq 'vep' ? "feature" : "feature_id";
my $symbol_id = $opts{m} eq 'vep' ? "symbol" : "gene_name";
my $gene_id = $opts{m} eq 'vep' ? "gene" : "gene_id";
#set default consequence fields to retrieve
my @csq_fields = getCsqFields();#consequence fields to retrieve from VCF
#and check variant consequence classes are acceptable
my %class_filters = map { $_ => undef } getAndCheckClasses();
#check in silico prediction classes/scores are acceptable
my %in_silico_filters = getAndCheckInSilicoPred();
#hash of prediction program names and values to filter
my @eval_exp = getAndCheckEvalFilters();
#and check biotype classes are acceptable
my %biotype_filters = map { $_ => undef } getAndCheckBiotypes();
# get available allele frequency annotations if filtering on AF
# NOTE: we do not use VEP annotations as they do not necessarily match the variant allele
my %af_info_fields = (); #check available INFO fields for AF annotations
if ( defined $opts{a} ) {
%af_info_fields = getAfAnnotations(\%info_fields);
}
#by default keep variant if there's an associated pathogenic ClinVar field
my ($keep_clinvar, $clinvar_functional) = checkClinVarInfo();
#if filtering on CADD score check we have a CaddPhredScore header
if ( $opts{c} ){
if (not exists $info_fields{CaddPhredScore}){
informUser("WARNING: No 'CaddPhredScore' INFO field found in header ".
"- your input probably does not contain annotations for filtering on" .
" CADD score.\n");
}
}
#if using -t/--target_genes argument, check file
my %targets = ();
my %sargs = ();
my $current_target = '';
my ($TMP, $tmpout);#have to write to temp file, sort and dedup if using targets
#keys are Ensembl Gene IDs, values are hashes of chrom, start, end coordinates
if ($opts{t}){
if (not $opts{gene_counts}){
die "-t/--target_genes option can only be used when also using "
."--gene_counts option\n";
}
readTargetGenesFile();
%sargs = VcfReader::getSearchArguments($opts{i});
($TMP, $tmpout) = tempfile(UNLINK => 1);
}
#Get filehandle for output (STDOUT or file) and optionally gene list
#(STDERR or file) or gene counts (file)
my ($OUT, $LIST, $GENECOUNTS) = openOutput();
#write header and program run parameters
writeOptionsToHeader();
my %contigs = ();
#keep track of contigs we've seen so we can check input is (roughly) sorted
#... (only relevant if using --find_shared_genes option)
my %transcript_sample_vars = ();
#stores variant IDs meeting our criteria per (key) transcript
my %vcf_lines = ();
#hash of variant IDs to matching VCF line
my %transcript_to_symbol = ();
#transcript ID to gene symbol
my %genes_to_transcripts = ();
#gene symbols => transcripts with variants
my %gene_burden = ();
my %cumulative_burden = ();
#if using --gene_counts this has an entry for each gene with a functional
#variant. Entries are hashes where keys are sample names
my %allele_counts = ();
#if using --gene_counts in allele_counts mode
my %allele_number = ();
#if using --gene_counts in allele_counts mode
#progress bar vars
my $next_update = 0;
my $total_vars = 0;
my $var_count = 0;
my $functional_count = 0;
if ($opts{b}) {
$progressbar = VcfhacksUtils::getProgressBar(
input => $opts{i},
name => "filtering",
);
}
if ($opts{t}){
close $VCF;
processTargets();
}else{
processByLine();
}
if($opts{gene_counts}){
outputGeneCountTotals();
}
#################################################
sub processTargets{
foreach my $t (keys %targets){
$current_target = $t;
my @lines = VcfReader::searchByRegion
(
%sargs,
chrom => $targets{$t}->{chrom},
start => $targets{$t}->{start},
end => $targets{$t}->{end},
);
processLine($_) for @lines;
$var_count++;#for progress bar
updateProgressBar();
outputGeneCounts();
}
close $TMP;
my %contigs = VcfReader::getContigOrder($opts{i});#will already be indexed
informUser("Sorting and outputting variants...\n");
VcfReader::sortVcf
(
vcf => $tmpout,
output => $OUT,
contig_order => \%contigs,
);
informUser("Done\n");
}
#################################################
sub processByLine{
processLine($first_var);
updateProgressBar();
while (my $line = <$VCF>){
processLine($line);
updateProgressBar();
}
if (defined $opts{f}){
$functional_count += checkMatchingVariants();
}elsif($opts{gene_counts}){
outputGeneCounts();
}
close $VCF;
updateProgressBar();
outputGeneList();
close $OUT;
if ($opts{b} and not $progressbar){
print STDERR "\r$var_count variants processed...\n";
}
informUser
(
"$functional_count variants matching criteria found from $var_count total".
" variants.\n"
);
}
#################################################
sub processLine{
my $line = shift;
return if $line =~ /^#/;#skip header
$var_count++;#for progress bar
chomp $line;
my @split = split("\t", $line, 9);#do not need GT fields in the first
my ($chrom, $qual, $filter) = VcfReader::getMultipleVariantFields
(
\@split,
'CHROM',
'QUAL',
'FILTER',
);
#skip whole line if not passing filter or variant quality settings
if ($opts{pass_filters}){
return if $filter ne 'PASS';
}
if ($opts{v}){
return if $qual < $opts{v};
}
#get alleles to arrays of 'functional' consequences
my %allele_to_csq = getFunctionalCsq(\@split);
if (defined $opts{f}){
#if using matching variants go through each allele and each consequence and
#put transcript IDs into a hash to a hash of sample names to variant IDs
if (isNewChromosome($chrom)){
#if new chrom check genes and clear collected data
$functional_count += checkMatchingVariants();
}
recordMatchedSamples
(
\@split,
\%allele_to_csq,
);
}elsif( $opts{gene_counts}){
#if using gene_counts
if (isNewChromosome($chrom)){
outputGeneCounts();
}
recordGeneCounts
(
\@split,
\%allele_to_csq,
);
}else{
#if not doing gene counts or matching variants simply print if we've got a
#functional allele
if (%allele_to_csq){
my $line_ref = VcfReader::addVariantInfoField
(
line => \@split,
id => "getFunctionalVariantsMatchedAlleles",
value => join(",", (sort {$a <=> $b} keys %allele_to_csq)),
);
print $OUT join("\t", @$line_ref) ."\n";
$functional_count++;
}
}
}
#################################################
sub recordGeneCounts{
my $split = shift;
my $alleles = shift;
my @matched_samples = ();
return if not %{$alleles};
my $out_line_ref;
if ($gene_count_opts{count_mode} eq 'allele_counts'){
my @allele_counts = ();
foreach my $i (sort {$a <=> $b} keys %{$alleles}){
my ($sc, $sn) = getSampleCounts
(
$split,
$i,
);
push @allele_counts, $sc;
foreach my $annot (@{$alleles->{$i}}){
my $tr = $annot->{$feature_id};
my $symbol = $annot->{$symbol_id};
my $gene = $annot->{$gene_id};
$gene ||= $tr;
$symbol ||= $gene;
if ($current_target){
next if $gene ne $current_target;
}
$transcript_to_symbol{$gene} = $symbol;
$gene_burden{$gene}->{SC} += $sc;
$gene_burden{$gene}->{SN} ||= 0 ;
$gene_burden{$gene}->{SN} = $gene_burden{$gene}->{SN} > $sn ?
$gene_burden{$gene}->{SN} : $sn;
}
}
$out_line_ref = VcfReader::addVariantInfoField
(
line => $split,
id => "getFunctionalVariantsAlleleCounts",
value => join(",", @allele_counts),
);
}else{
my @biallelics = ();#array ref per allele of samples w/ findBiallelic tags
if ($gene_count_opts{model} eq 'recessive'){
#if wanting to use --gene_counts to find recessives
my @hom = split
(
",",
VcfReader::getVariantInfoField($split, "findBiallelicSamplesHom")
);
my @het = split
(
",",
VcfReader::getVariantInfoField($split, "findBiallelicSamplesHet")
);
#entries are per allele, with multiple samples separated by '|'
for (my $i = 0; $i < @hom; $i++){
my @s = grep { $_ ne '.' } split(/\|/, $hom[$i]);
push @s, grep { $_ ne '.' } split(/\|/, $het[$i]);
push @biallelics, \@s;
}
}
foreach my $i (sort {$a <=> $b} keys %{$alleles}){
my @samp_with_allele = ();
if ($gene_count_opts{model} eq 'recessive'){
if (@{$biallelics[$i - 1]}){
push @samp_with_allele, @{$biallelics[$i - 1]};
}
}else{#getting gene counts for specified inheritance model
my %samp_to_gt = VcfReader::getSampleCall
(
line => $split,
sample_to_columns => \%sample_to_col,
minGQ => $opts{g},
multiple => \@samples,
);
@samp_with_allele = addSamplesForGeneCounts
(
\%samp_to_gt,
$i,
$split,
);
}
if (@samp_with_allele){
push @matched_samples, join("|", @samp_with_allele);
#for INFO field of output line
foreach my $annot (@{$alleles->{$i}}){
my $tr = $annot->{$feature_id};
my $symbol = $annot->{$symbol_id};
my $gene = $annot->{$gene_id};
$gene ||= $tr;
$symbol ||= $gene;
if ($current_target){
next if $gene ne $current_target;
}
foreach my $sample (@samp_with_allele){
$gene_burden{$gene}->{$sample} = undef;
$transcript_to_symbol{$gene} = $symbol;
# keys for each %{gene_burden{$gene}} will be undef for
# counting burden (# samples with qualifying variant)
}
}
}else{
push @matched_samples, '.';
}
}
$out_line_ref = VcfReader::addVariantInfoField
(
line => $split,
id => "getFunctionalVariantsMatchedSamples",
value => join(",", @matched_samples),
);
}
$out_line_ref = VcfReader::addVariantInfoField
(
line => $out_line_ref,
id => "getFunctionalVariantsMatchedAlleles",
value => join(",", (sort {$a <=> $b} keys %{$alleles})),
);
my $FHOUT = $OUT;
if ($opts{t}){
$FHOUT = $TMP;
}
print $FHOUT join("\t", @$out_line_ref ) . "\n";
$functional_count++;
}
#################################################
sub recordMatchedSamples{
# this returns the samples with a matching allele/genotype for adding to INFO
# and also adds information to %matched_transcript hash to record samples with
# functional variants in a given transcript
my $split = shift;
my $alleles = shift;
splitRemainingFields($split);
return if not %{$alleles};
my @matched_samples = ();
my %samp_to_gt = VcfReader::getSampleCall
(
line => $split,
sample_to_columns => \%sample_to_col,
minGQ => $opts{g},
multiple => \@samples,
);
my ($chrom, $pos, $ref, $alt) = VcfReader::getMultipleVariantFields
(
$split,
'CHROM',
'POS',
'REF',
'ALT',
);
my $var_id = "$chrom:$pos-$ref,$alt";
my $store_this_line = 0;
#skip if not identical GTs in all affecteds and identical variants required
if ($opts{e}){
my @matched_gts, identicalGenotypes(\%samp_to_gt, $opts{n});
return if not @matched_gts;
my $i_match ;
foreach my $i (sort {$a <=> $b} keys %{$alleles}){
foreach my $matched_gt( @matched_gts ){
#if require equal genotypes skip if this is not a matched allele
my @m = split("/", $matched_gt);
if (first { $_ eq $i } @m ){#first is ok cos we never touch allele 0
$i_match++;
last;
}
}
if ($i_match){
if (my @samp_with_allele = addSamplesWithGt
(
\%samp_to_gt,
\@matched_gts,
$split,
)
){
push @matched_samples, join("|", @samp_with_allele);
recordTranscriptSamplesAndVariants
(
\@{$alleles->{$i}},
\@samp_with_allele,
$var_id,
);
$store_this_line++;
}else{
push @matched_samples, '.';
}
}else{
push @matched_samples, '.';
}
}
}else{
foreach my $i (sort {$a <=> $b} keys %{$alleles}){
if (my @samp_with_allele = addSamplesWithAllele
(
\%samp_to_gt,
$i,
$var_id,
)
){
push @matched_samples, join("|", @samp_with_allele);
recordTranscriptSamplesAndVariants
(
\@{$alleles->{$i}},
\@samp_with_allele,
$var_id,
);
$store_this_line++;
}else{
push @matched_samples, '.';
}
}
}
if ($store_this_line){
my $line_ref = VcfReader::addVariantInfoField
(
line => $split,
id => "getFunctionalVariantsMatchedSamples",
value => join(",", @matched_samples),
);
$line_ref = VcfReader::addVariantInfoField
(
line => $split,
id => "getFunctionalVariantsMatchedAlleles",
value => join(",", (sort {$a <=> $b} keys %{$alleles})),
);
$vcf_lines{$var_id} = join("\t", @$line_ref);
}
}
#################################################
sub recordTranscriptSamplesAndVariants{
my $annotations = shift;
my $samples = shift;
my $var_id = shift;
foreach my $annot (@$annotations){
my $tr = $annot->{$feature_id};
$transcript_to_symbol{$tr} = $annot->{$symbol_id};
foreach my $sample (@$samples){
push @{ $transcript_sample_vars{$tr}->{ $sample } }, $var_id;
}
}
}
#################################################
sub isNewChromosome{
my $chrom = shift;
return 0 if $opts{t};
if (exists $contigs{$chrom} ){
#require chroms to be ordered together
if ($contigs{$chrom} != scalar(keys %contigs) - 1){
die <<EOT
Encountered a variant for contig $chrom with at least one variant for a
different contig inbetween. Your VCF input must be sorted such that all contigs
are kept together when using the -f/--find_shared_genes or --gene_counts
options. Please sort your input and try again.
EOT
;
}
return 0;
}
#if new chrom record its contig index and return true
$contigs{$chrom} = scalar(keys%contigs);
return $contigs{$chrom} > 0;#return false if this is the first
#chrom we've encountered
}
#################################################
sub getFunctionalCsq{
my $split = shift;
my %allele_to_csq = (); #keys are allele numbers, values = array of
#'functional' csq
#get Allele frequency annotations if they exist and we're filtering on AF
my %af_info_values = ();
if (%af_info_fields){
%af_info_values = getAfInfoValues($split);
}
#get alleles and consequences for each allele
my @alleles = VcfReader::readAlleles(line => $split);
my @csq = getConsequences($split);
my @alts_to_vep_allele = ();#get VEP style alleles if needed
if ($opts{m} eq 'vep'){
@alts_to_vep_allele = VcfReader::altsToVepAllele
(
ref => $alleles[0],
alts => [ @alleles[1..$#alleles] ],
);
}
my @cadd_scores = ();
if ($opts{c}){
my $cadd = VcfReader::getVariantInfoField
(
$split,
'CaddPhredScore',
);
if ($cadd){
@cadd_scores = split(",", $cadd);
}
}
#assess each ALT allele (REF is index 0)
ALT: for (my $i = 1; $i < @alleles; $i++){
next ALT if $alleles[$i] eq '*';
#if CADD filtering skip if less than user-specified cadd filter
#if no CADD score for allele then we won't skip unless using --
if (@cadd_scores){
my $score = $cadd_scores[$i - 1];
if ($score ne '.'){
next ALT if $score < $opts{c};
}elsif ($opts{skip_unpredicted}){
next ALT;
}
}
#filter if AF annotations > MAF
if (%af_info_fields){ #check for annotateSnps.pl etc. frequencies
next ALT if ( alleleAboveMaf($i - 1, \%af_info_values) );
}
#get all consequences for current allele
my @a_csq = ();
if ($opts{m} eq 'vep'){
@a_csq = grep { $_->{allele} eq $alts_to_vep_allele[$i-1] } @csq;
}else{
@a_csq = grep { $_->{allele} eq $alleles[$i] } @csq;
}
#0 or 1 for each allele if pathogenic - i.e. we keep all annotations
# if allele is flagged as pathogenic in ClinVar
# ClinVar annotations come via annotateSnps.pl using a ClinVar file
my @clinvar_path = keepClinvar($split, scalar(@alleles)-1);
#skip if VEP/SNPEFF annotation is not the correct class
# (or not predicted damaging if using that option)
# and allele is not flagged as pathogenic in ClinVar
CSQ: foreach my $annot (@a_csq){
if ($current_target){
next CSQ if $annot->{$gene_id} ne $current_target
};
my $is_cvar_path = $clinvar_path[$i-1];
if ($clinvar_functional and $is_cvar_path){
#check it at least matches a default 'functional class for this gene
unless (consequenceMatchesClass($annot, $split, 1)){
$is_cvar_path = 0;
}
}
if ($is_cvar_path or consequenceMatchesClass($annot, $split)){
push @{$allele_to_csq{$i}}, $annot;
}
}
}#each ALT
return if not keys %allele_to_csq;#no functional variant
# WE LEAVE THESE SAMPLE FILTERS TIL LAST IN CASE VCF HAS A HUGE NUMBER OF
# SAMPLES, IN WHICH CASE THESE WILL BE THE LEAST EFFICIENT TESTS
#skip if no variant allele in affecteds
splitRemainingFields($split);
my %samp_to_gt = VcfReader::getSampleCall
(
line => $split,
sample_to_columns => \%sample_to_col,
minGQ => $opts{g},
multiple => \@samples,
);
return if not haveVariant(\%samp_to_gt);
#skip if not identical GTs in all affecteds and identical variants required
my @matched_gts = ();
if ($opts{e}){
push @matched_gts, identicalGenotypes(\%samp_to_gt, $opts{n});
return if not @matched_gts;
}
# get allele frequencies in @samples if using -u/--max_sample_allele_frequency
my %s_allele_counts = ();
my $allele_count;
if ($opts{u}){
%s_allele_counts = VcfReader::countAlleles
(
minGQ => $opts{g},
line => $split,
);
map { $allele_count += $s_allele_counts{$_} } keys %s_allele_counts;
# filter if frequency in @samples is greater or equal to
# -u/--max_sample_allele_frequency
foreach my $i (keys %allele_to_csq){
if ($opts{u} and $allele_count > 0){
#my $freq = $s_allele_counts{$i}/(@samples*2);
my $freq = $s_allele_counts{$i}/$allele_count;
delete $allele_to_csq{$i} if $freq >= $opts{u};
}
}
}
return %allele_to_csq;
}
#################################################
sub outputGeneCountTotals{
my $count = 0;
my $total = 0;
if ($gene_count_opts{sample_count}){
$total = $gene_count_opts{sample_count};
}else{
$total = @samples if @samples;
}
if ( %cumulative_burden ) {
if ($gene_count_opts{count_mode} eq 'allele_counts'){
$count = $cumulative_burden{SC};
$total = $cumulative_burden{SN};
}else{
$count = keys %cumulative_burden;
}
}
my $without = $total - $count;
$without = 0 if ($without < 0);
print $GENECOUNTS join
(
"\t",
"TOTAL",
"TOTAL",
$count,
$without,
) . "\n";
close $GENECOUNTS;
}
#################################################
sub outputGeneCounts{
if (not keys %gene_burden and $current_target){
#processing a target gene but no variants found
#output 0 if sample count was provided
if ($gene_count_opts{sample_count}){
print $GENECOUNTS join
(
"\t",
$current_target,
$targets{$current_target}->{symbol},
0,
$gene_count_opts{sample_count},
) . "\n";
}
}else{
foreach my $g (keys %gene_burden){
my $count = 0;
my $without = 0;
if ($gene_count_opts{count_mode} eq 'allele_counts'){
$count = $gene_burden{$g}->{SC};
$without = $gene_burden{$g}->{SN} - $count;
$without = $without > 0 ? $without : 0; #don't allow negative values
$cumulative_burden{SC} += $count;
if ($without > $cumulative_burden{SN}){
$cumulative_burden{SN} = $without;
}
}else{
$count = keys %{$gene_burden{$g}};
map {$cumulative_burden{$_} = undef } keys %{$gene_burden{$g}};
$without = @samples - $count;
}
print $GENECOUNTS join
(
"\t",
$g,
$transcript_to_symbol{$g},
$count,
$without,
) . "\n";
}
}
%gene_burden = ();
}
#################################################
sub getSampleCounts{
#return no. samples with allele and total number samples
my $line = shift;
my $allele = shift;
my $field = 'AC';
my $count = 0;
my $number = 0;
my $chrom = VcfReader::getVariantField($line, 'CHROM');
if (exists $info_fields{AC_Het} and exists $info_fields{AC_Hom}){
my $hemi = 0;
if ($chrom =~ /^(chr)X|Y/ and exists $info_fields{AC_hemi}){
my $hemis = VcfReader::getVariantInfoField($line, "AC_Hemi");
$hemi = (split ",", $hemis)[$allele - 1];
}
my $hets = VcfReader::getVariantInfoField($line, "AC_Het");
my $homs = VcfReader::getVariantInfoField($line, "AC_Hom");
my $het = (split ",", $hets)[$allele - 1];
my $hom = (split ",", $homs)[$allele - 1];
if ($gene_count_opts{model} eq 'dominant' and $hom > 0){
$count = 0;#discard if there's a homozygote for a dominant variant(?)
}else{
#no reliable way to find comp. hets from ExAC allele counts
$count = $het + $hemi + $hom;
}
}elsif (exists $info_fields{AC}){
my $ac = VcfReader::getVariantInfoField($line, 'AC');
$count = ((split ",", $ac)[$allele - 1])/2;#assume diploidy
}else{
die "AC INFO field is required for using --gene_counts in ".
"'allele_counts' mode\n";
}
if ($gene_count_opts{sample_count}){
if ($chrom =~ /^(chr)Y/ and exists $gene_count_opts{males}){
$number = $gene_count_opts{males};
}else{
$number = $gene_count_opts{sample_count};
}
}else{
my $an_field;
if (exists $info_fields{AN_Adj}){
$an_field = 'AN_Adj';
}elsif (exists $info_fields{AN}){
$an_field = 'AN';
}else{
die "AN INFO field is required for using --gene_counts in ".
"'allele_counts' mode\n";
}
my $an = VcfReader::getVariantInfoField($line, $an_field);
$number = int($an/2) + ($an % 2);#assume diploidy
}
return ($count, $number);
}
#################################################
sub addSamplesForGeneCounts{
#will already have added samples if using
#recessive gene count model
my ($gts, $allele, $line) = @_;
my @samp = ();
foreach my $s ( keys %{$gts} ){
if ($gene_count_opts{model} eq 'dominant'){
if ($gts->{$s} =~ /^$allele[\/\|]$allele$/){
#found a homozygote - discard all(?)
if (checkGtPl(["$allele/$allele"], $s, $line )){
return;
}
}
}
if ($gts->{$s} =~ /^($allele[\/\|]\d+|\d+[\/\|]$allele)$/){
if (checkAllelePl($allele, $s, $line)){
push @samp, $s;
}
}
}
return @samp;
}
#################################################
sub addSamplesWithGt{
my ($gts, $matched, $line) = @_;
my @samp = ();
foreach my $s ( keys %{$gts} ){
next if ($gts->{$s} =~ /^\.[\/\|]\.$/ );
(my $gt = $gts->{$s}) =~ s/[\|]/\//;
#don't let allele divider affect whether a genotype matches
if ($opts{pl}){
if (checkGtPl($matched, $s, $line )){
push @samp, $s;
}
}else{
if ( first { $_ eq $gt } @$matched ){
push @samp, $s ;
}
}
}
return @samp;
}
#################################################
sub addSamplesWithAllele{
my ($gts, $allele, $line) = @_;
my @samp = ();
foreach my $s ( keys %{$gts} ){
if ($gts->{$s} =~ /^($allele[\/\|]\d+|\d+[\/\|]$allele)$/){
if (checkAllelePl($allele, $s, $line)){
push @samp, $s;
}
}
}
return @samp;
}
#################################################
sub checkAllelePl{
#returns 0 if PL for any genotype that does not include $allele
# is lower than $opts{pl}. Otherwise returns 1.
return 1 if not $opts{pl} ;
my ($allele, $sample, $line) = @_;
my $pl = VcfReader::getSampleGenotypeField
(
line => $line,
field => "PL",
sample => $sample,
sample_to_columns => \%sample_to_col,
);
my @pls = split(",", $pl);
my @idxs = VcfReader::calculateOtherAlleleGindexes($line, $allele);
foreach my $i (@idxs){
if ($i > $#pls){
my ($chrom, $pos) = VcfReader::getMultipleVariantFields
(
$line,
'CHROM',
'POS'
);
informUser
(
"WARNING: Not enough PL scores ($pl) for allele " .
"'$i', sample $sample at $chrom:$pos. Your VCF may be malformed.\n"
);
return 1;
}
return 0 if $pls[$i] < $opts{pl};
}
return 1;
}
#################################################
sub checkGtPl{
#returns 0 if PL for any genotype other than those in @$gts
# is lower than $opts{pl}. Otherwise returns 1.
return 1 if not $opts{pl} ;
my ($gts, $sample, $line) = @_;
my $pl = VcfReader::getSampleGenotypeField
(
line => $line,
field => "PL",
sample => $sample,
sample_to_columns => \%sample_to_col,
);
my @pls = split(",", $pl);
my @idxs = ();
foreach my $gt (@$gts){
push @idxs, VcfReader::calculateGenotypeGindex($gt);
}
foreach my $idx (@idxs){
if ($idx > $#pls){
my ($chrom, $pos) = VcfReader::getMultipleVariantFields
(
$line,
'CHROM',
'POS'
);
informUser
(
"WARNING: Not enough PL scores ($pl) for genotypes '".
join(", ", @$gts) . "', sample $sample at $chrom:$pos.".
" Your VCF may be malformed.\n"
);
return 1;
}
}
for (my $i = 0; $i < @pls; $i ++){
next if grep {$_ == $i} @idxs;
return 0 if $pls[$i] < $opts{pl};
}
return 1;
}
#################################################
sub checkClinVarInfo{
my @cvar_args = $opts{clinvar} ? split(",", $opts{clinvar}) : () ;
if (grep { lc($_) eq 'disable'} @cvar_args){
return 0;
}
my $iclvar = 0;
my $func = 0;
foreach my $f ( qw / ClinVarPathogenic AS_CLNSIG / ) {
if (exists $info_fields{$f} and $info_fields{$f}->{Number} eq 'A'){
if (grep { lc($_) eq 'functional'} @cvar_args){
informUser
(
"Identified '$f' INFO field and '--clinvar functional' ".
"option is in use - alleles marked as pathogenic".
" will be considered as having a 'functional' consequence ".
"if they match any of the default 'functional' classes.\n"
);
$func++;
}else{
informUser
(
"Identified '$f' INFO field - alleles marked as pathogenic".
" will be considered as having a 'functional' consequence ".
"regardless of functional consequence. To change this ".
"behaviour run with the option '--clinvar disable'\n"
);
}
$iclvar++;
}elsif(exists $info_fields{$f}){
informUser
(
"WARNING: Ignoring INFO field '$f' because 'Number' is given ".
"as $info_fields{$f}->{Number} rather than the expected 'A'.\n"
);
delete $info_fields{$f};#remove so we don't try using this annotation later
}
}
if ($iclvar){
if (grep { lc($_) ne 'no_conflicted'
and lc($_) ne 'all'
and lc($_) ne 'functional' } @cvar_args
){
die "Unrecognised value '$opts{clinvar}' passed to --clinvar option.\n";
}elsif (grep { lc($_) eq 'all' } @cvar_args){
return ('all', $func);
}elsif (grep { lc($_) eq 'no_conflicted' } @cvar_args){
if (not exists $info_fields{ClinVarConflicted} and
not exists $info_fields{AS_CLNSIG}
#can glean from CLNSIG field if more than one type of assertion made
){
informUser
(
"WARNING: no ClinVarConflicted or CLNSIG INFO field found in header.".
" All variants with ClinVarPathogenic annotations will be".
" kept.\n"
);
return ('all', $func);
}
return ('no_conflicted', $func);
}else{
return ('all', $func);
}
}else{
if ($opts{clinvar}){
informUser
(
"WARNING: neither ClinVarPathogenic nor AS_CLNSIG INFO fields".
" were found in header. ClinVarPathogenic variants will not be".
" identified.\n"
);
}
return (0, 0);
}
}
#################################################
sub keepClinvar{
#returns an array with a value for each allele
#values are 1 if pathogenic and and 0 if not
my $v = shift;
my $n_alts = shift;
if (not $keep_clinvar){
return map { 0 } 0..$n_alts
}
my @c_path = ();
my @d_path = ();
my @c_conf = ();
my @d_conf = ();
if (exists $info_fields{ClinVarPathogenic}){
my $path = VcfReader::getVariantInfoField
(
$v,
'ClinVarPathogenic',
);
if (defined $path){
@c_path = split(",", $path);
if ($keep_clinvar eq 'no_conflicted'){
@c_conf = split(",", VcfReader::getVariantInfoField
(
$v,
'ClinVarConflicted',
)
);
@c_path = map {$c_path[$_] == 1 and $c_conf[$_] == 0} 0..$#c_path;
}
}
}
if (exists $info_fields{AS_CLNSIG}){
my $path = VcfReader::getVariantInfoField
(
$v,
'AS_CLNSIG',
);
if (defined $path){
my @csig = split(",", $path);
foreach my $c (@csig){
my @path = split(/\|/, $c);
if (grep {$_ eq '4' or $_ eq '5'} @path){
push @d_path, 1;
if (grep {$_ eq '2' or $_ eq '3'} @path){
push @d_conf, 1;
}else{
push @d_conf, 0;
}
}else{
push @d_path, 0;
push @d_conf, 0;
}
}
if ($keep_clinvar eq 'no_conflicted'){
@d_path = map {$d_path[$_] eq '1' and $d_conf[$_] eq '0'} 0..$#d_path;
}
}
}
if (@c_path){
if (@d_path){
if ($keep_clinvar eq 'no_conflicted'){
#an annotation of 0 from the ClinVarPathogenic might mean
#no info rather than designated as benign, so only consider as
#conflicted if ClinVarConflicted annotation is 1 or $d_path[$_] == 0
#in this case, if we have annotations from VCF in @d_path,
#about what is in @c_path
return map { $d_path[$_] eq '1' and $c_conf[$_] ne '1'} 0..$#d_path;
}else{
#don't care about conflicted annotations
#keep if either source is flagged as pathogenic
return map {$d_path[$_] eq '1' or $c_path[$_] eq '1'} 0..$#d_path;
}
}else{
return @c_path;
}
}elsif(@d_path){
return @d_path;
}else{
return map { 0 } 0..$n_alts
}
}
#################################################
sub outputGeneList{
return if not $LIST;
foreach my $k (sort keys %genes_to_transcripts){
print $LIST "$k:\t" . join
(
",",
@{ $genes_to_transcripts{$k} }
) . "\n";
}
close $LIST;
}
#################################################
sub checkMatchingVariants{
my $count = 0;
my $min_matching = $opts{n} ? $opts{n} : scalar @samples;
my %lines_to_print = ();
foreach my $tr (keys %transcript_sample_vars){
if (scalar (keys %{ $transcript_sample_vars{$tr} }) >= $min_matching ){
my %new_lines = map { $vcf_lines{$_} => undef }
map { @{$transcript_sample_vars{$tr}->{$_}} }
keys %{ $transcript_sample_vars{$tr} } ;
%lines_to_print = (%new_lines, %lines_to_print);
my $symbol = $transcript_to_symbol{$tr};
push @{$genes_to_transcripts{$symbol}}, $tr;
}
}
#convert lines to split array refs for sorting
my @lines_out = map { [ split("\t", $_ )] } keys %lines_to_print;
@lines_out = VcfReader::sortByPos( \@lines_out );
foreach my $l (@lines_out){
print $OUT join("\t", @$l) . "\n";
$count++;
}
%transcript_sample_vars = ();
%vcf_lines = ();
%transcript_to_symbol = ();
return $count;
}
#################################################
sub readTargetGenesFile{
return if not $opts{t};
open (my $TARGETS, "<", $opts{t}) or die "Could not open -t/--target_genes"
. " file '$opts{t}': $!\n";
my @cols =
(
"Ensembl Gene ID",
"Chromosome Name",
"Gene Start (bp)",
"Gene End (bp)",
"Associated Gene Name",
);
my $header = <$TARGETS>;
chomp $header;
my @split = split("\t", $header);
my %h = ();
{
no warnings 'uninitialized';
foreach my $c (@cols){
my $i = 0;
$i++ until uc($split[$i]) eq uc($c) or $i > $#split;
if ($i > $#split){
die <<EOT
Required column '$c' not found in -t/--target_genes file '$opts{t}'. Appropriate
files can be obtained by outputting your gene lists from Ensembl's biomart with
the attributes "Ensembl Gene ID", "Chromosome Name", "Gene Start (bp)", "Gene
End (bp)" and "Associated Gene Name".
EOT
;
}else{
$h{$c} = $i;
}
}
}
while (my $l = <$TARGETS>){
chomp $l;
my @s = split("\t", $l);
my $id = $s[$h{"Ensembl Gene ID"}];
$targets{$id}->{chrom} = $s[$h{"Chromosome Name"}];
$targets{$id}->{start} = $s[$h{"Gene Start (bp)"}];
$targets{$id}->{end} = $s[$h{"Gene End (bp)"}];
$targets{$id}->{symbol} = $s[$h{"Associated Gene Name"}];
}
}
#################################################
sub checkGeneCountArgs{
informUser
(
"WARNING: No 'samples specified - all samples will be used for ".
"--gene_counts option\n"
) if not @samples;
my @keys = qw / file model count_mode sample_count /;
my @split = split(',', $opts{gene_counts});
@gene_count_opts{@keys} = @split;
$gene_count_opts{model} ||= 'dominant';
$gene_count_opts{count_mode} ||= 'genotypes';
if ( $gene_count_opts{model} ne 'dominant' and
$gene_count_opts{model} ne 'recessive' and
$gene_count_opts{model} ne 'both'
){
die "ERROR: Unrecognised model '$gene_count_opts{model}' for --gene_counts argument\n";
}
if ($gene_count_opts{model} eq 'recessive'){
foreach my $f (qw /findBiallelicSamplesHom findBiallelicSamplesHet/){
if (not exists $info_fields{$f}){
die "ERROR: Could not find '$f' INFO field entry in VCF header".
" In order to use recessive mode with the --gene_counts ".
"please run findBiallelic.pl on your input first.\n";
}
}
}
}
#################################################
sub getAndCheckCsqHeader{
my %csq_head = ();
if (not $opts{m}){
eval {
%csq_head = VcfReader::readVepHeader
(
header => $header
);
} ;
if (not $@){
$opts{m} = 'vep';
informUser("Found VEP header - using VEP 'CSQ' annotations.\n");
}else{
informUser("No VEP header found. Trying SnpEff...\n");
eval {
%csq_head = VcfReader::readSnpEffHeader
(
header => $header
);
} ;
if (not $@){
$opts{m} = 'snpeff';
informUser("Found SnpEff header - using SnpEff 'ANN' annotations.\n");
}else{
die "ERROR: Could not find VEP or SnpEff headers in input. ".
"Please annotate your input with either program and try again.\n";
}
}
}else{
if ($opts{m} eq 'vep'){
%csq_head = VcfReader::readVepHeader
(
header => $header
);
}else{
%csq_head = VcfReader::readSnpEffHeader
(
header => $header
);
}
}
return %csq_head;
}
#################################################
sub getCsqFields{
my @fields = ();
if ($opts{m} eq 'vep'){
@fields =
qw(
allele
gene
feature
feature_type
consequence
symbol
biotype
);
if ($opts{canonical_only}){
push @fields, "canonical";
}
if ($opts{consensus_splice_site}){
push @fields, "splice_consensus";
}
}else{
@fields =
qw(
allele
annotation
annotation_impact
gene_name
gene_id
feature_type
feature_id
transcript_biotype
);
if ($opts{canonical_only}){
informUser
(
"WARNING: --canonical_only option is ignored when ".
"working with SnpEff annotations.\n"
);
delete $opts{canonical_only};
}
if ($opts{consensus_splice_site}){
informUser
(
"WARNING: --consensus_splice_site option is ignored when ".
"working with SnpEff annotations.\n"
);
delete $opts{consensus_splice_site};
}
}
foreach my $f (@fields){
if (not exists $csq_header{$f}){
if ($f eq 'splice_consensus'){
die "Could not find 'splice_consensus' field in header " .
"- please ensure you use the SpliceConsensus plugin when ".
"running VEP.\n";
}
die "Could not find '$f' field in $opts{m} consequence header " .
"- please ensure you have annotated your file including the ".
"appropriate fields.\n";
}
}
return @fields;
}
#################################################
sub getAndCheckClasses{
my %all_classes = ();
if ($opts{m} eq 'vep'){
%all_classes = VcfhacksUtils::readVepClassesFile();
}else{
%all_classes = VcfhacksUtils::readSnpEffClassesFile();
}
grep { $all_classes{$_} eq 'default' } keys %all_classes;
%default_classes =
map { $_ => undef }
grep { $all_classes{$_} eq 'default' }
keys %all_classes;
if (not @classes){
@classes = keys %default_classes;
}
push @classes, @add_classes if (@add_classes);
if ($opts{m} eq 'vep' and $opts{consensus_splice_site}){
push @classes, "splice_region_variant" ;
}
@classes = map { lc($_) } @classes;
@classes = VcfhacksUtils::removeDups(@classes);
foreach my $class (@classes) {
die "Error - variant class '$class' not recognised.\n"
if not exists $all_classes{lc($class)} ;
}
return @classes;
}
#################################################
sub getAndCheckBiotypes{
return if $opts{no_biotype_filtering};
my %all_biotypes = VcfhacksUtils::readBiotypesFile();
if (not @biotypes){
@biotypes = grep { $all_biotypes{$_} eq 'filter' } keys %all_biotypes;
}
@biotypes = map { lc($_) } @biotypes;
@biotypes = VcfhacksUtils::removeDups(@biotypes);
foreach my $biotyp (@biotypes) {
die "Error - biotype '$biotyp' not recognised.\n"
if not exists $all_biotypes{lc($biotyp)} ;
}
return @biotypes;
}
#################################################
sub getAndCheckInSilicoPred{
return if not @damaging;
my %filters = VcfhacksUtils::getAndCheckInSilicoPred($opts{m}, \@damaging);
if ($opts{m} eq 'vep'){#VEP prediction results will be in CSQ field
foreach my $d (keys %filters){
if ($d ne 'all' and not exists $csq_header{$d}){
informUser
(
"WARNING: No '$d' field found in CSQ header of VCF. ".
"No in silico filtering will be performed for $d.\n"
);
}
}
%filters = map { $_ => $filters{$_} }
grep { exists $csq_header{$_} }
keys %filters;
push @csq_fields, keys %filters;
}else{#SnpEff predictions will be added via SnpSift
foreach my $d (keys %filters){
if (not exists $info_fields{$d}){
informUser
(
"WARNING: No '$d' field found in INFO fields of VCF. ".
"No in silico filtering will be performed for $d. Did ".
"you annotate with SnpSift?\n"
);
}
%filters = map { $_ => $filters{$_} }
grep { exists $info_fields{$_} }
keys %filters;
}
}
return %filters;
}
#################################################
sub getAndCheckEvalFilters{
return if not @eval_filters;
my @filters = ();
my %csq_add = ();
FLT: foreach my $s (@eval_filters){
my %f = VcfhacksUtils::getEvalFilter($s);
foreach my $fld (@{$f{field}}){
(my $fb = $fld) =~ s/^\(//;#we may have used ( to specify precedence
if (not exists $csq_header{lc($fb)}){
informUser
(
"WARNING: No '$fb' field found in CSQ header of ".
"VCF. Cannot use --eval_filter expression '$s' ".
"for filtering.\n"
);
next FLT;
}
$csq_add{lc($fb)}++;
}
push @filters, \%f;
}
push @csq_fields, keys %csq_add;
return @filters;
}
#################################################
sub splitRemainingFields{
my $s = shift;#array ref to modify in place
my $l = pop(@$s);
push @$s, split("\t", $l);
}
#################################################
sub openOutput{
my $OUT_FH;
my $LIST_FH;
my $COUNT_FH;
if ($opts{o}) {
open( $OUT_FH, ">", $opts{o} ) || die "Can't open $opts{o} for writing: $!\n";
}
else {
$OUT_FH = \*STDOUT;
}
if (defined $opts{f}) {
if ( $opts{f} eq '' ){
#user specified --list option but provided no argument
$LIST_FH = \*STDERR;
}else{
open( $LIST_FH, ">", $opts{f} )
or die "Can't open $opts{f} for writing: $!\n";
}
}
if ($opts{gene_counts}){
open( $COUNT_FH, ">", $gene_count_opts{file} )
or die "Can't open $gene_count_opts{file} for writing: $!\n";
}
return ($OUT_FH, $LIST_FH, $COUNT_FH);
}
#################################################
sub writeOptionsToHeader{
#print meta header lines
my $FH = $OUT;
if ($TMP){
$FH = $TMP;
}
my %al_info =
(
ID => "getFunctionalVariantsMatchedAlleles",
Number => ".",
Type => "String",
Description => "Alleles meeting criteria from getFunctionalVariants.pl"
);
my %ms_info =
(
ID => "getFunctionalVariantsMatchedSamples",
Number => "A",
Type => "String",
Description => "Alleles meeting criteria from getFunctionalVariants.pl"
);
my %ct_info =
(
ID => "getFunctionalVariantsAlleleCounts",
Number => "A",
Type => "String",
Description => "Alleles meeting criteria from getFunctionalVariants.pl"
);
#add existing meta header lines
print $FH join("\n", grep { /^##/ } @$header) . "\n" ;
#add new INFO fields
print $FH VcfhacksUtils::getInfoHeader(%al_info) . "\n";
if (defined $opts{f}){
print $FH VcfhacksUtils::getInfoHeader(%ms_info) . "\n";
}
if ($opts{gene_counts}){
if ($gene_count_opts{count_mode} eq 'allele_counts'){
print $FH VcfhacksUtils::getInfoHeader(%ct_info) . "\n";
}else{
print $FH VcfhacksUtils::getInfoHeader(%ms_info) . "\n";
}
}
#add header line detailing program options
print $FH VcfhacksUtils::getOptsVcfHeader(%opts) . "\n";
print $FH "$header->[-1]\n";
}
#################################################
sub informUser{
my $msg = shift;
my $time = strftime( "%H:%M:%S", localtime );
if ($progressbar){
$progressbar->message( "[INFO - $time] $msg" );
}else{
print STDERR "[INFO - $time] $msg";
}
}
#################################################
sub updateProgressBar{
if ($progressbar) {
if ($var_count >= $next_update){
$next_update = $progressbar->update( $var_count )
}
}elsif($opts{b}){
VcfhacksUtils::simpleProgress($var_count);
}
}
#################################################
sub haveVariant{
return 1 if not @samples;
my $gts = shift;
foreach my $k (keys %$gts){
if ($gts->{$k} =~ /(\d+)[\/\|](\d+)/){
return 1 if ( $1 > 0 or $2 > 0 );
}
}
return 0;
}
#################################################
sub identicalGenotypes{
my $gts = shift;
my $min_matching = shift;
$min_matching = defined $min_matching ? $min_matching : scalar keys %$gts;
my @matched = ();
my %matches = (); #keys are genotypes, values are arrays of samples with GT
foreach my $s (keys %$gts){
if ( $gts->{$s} =~ /^\.[\/\|]\.$/ ){
next;
}elsif ( $gts->{$s} =~ /^0[\/\|]0$/ ){
next;
}
(my $current_geno = $gts->{$s}) =~ s/[\|]/\//;
#don't let allele divider affect whether a genotype matches or not
push @{ $matches{$current_geno} } , $s;
if ($min_matching == scalar keys %$gts){
#if we require all to match, bail out if there's more than one genotype
return 0 if keys %matches > 1;
}
}
foreach my $k (keys %matches){
my $m = 0;
foreach my $s (@{$matches{$k}}){
$m++;
}
push @matched, $k if $m >= $min_matching;
}
return @matched;
}
#################################################
sub getAfAnnotations{
my $info_fields = shift;
my %af_found = ();
my @af_fields = qw (
AS_CAF
AS_G5A
AS_G5
AS_COMMON
EVS_EA_AF
EVS_AA_AF
EVS_ALL_AF
);
foreach my $c (@custom_af){
if (not exists $info_fields->{$c}){
informUser( "WARNING: User specified custom allele frequency ".
"(-j/--custom_af) field '$c' not found in VCF header. This ".
"field may not exist in your VCF.\n");
}
}
push @af_fields, @custom_af;
foreach my $key (keys %$info_fields){
my $warning = "WARNING: Found expected frequency annotation ($key) in ".
"INFO fields, but 'Number' field is $info_fields->{$key}->{Number}, ".
"expected 'A'. Ignoring this field.\n";
my $info = "Found allele frequency annotation: $key. ".
"This will be used for filtering on allele frequency.\n";
if (grep { $key eq $_ } @af_fields){
if ($info_fields->{$key}->{Number} ne 'A'){
informUser($warning);
}else{
informUser($info);
$af_found{$key} = $info_fields->{$key};
}
}else{
if ($key =~ /^FVOV_AF_\S+$/){
if ($info_fields->{$key}->{Number} ne 'A'){
informUser($warning);
}else{
informUser($info);
$af_found{$key} = $info_fields->{$key};
}
}
}
}
return %af_found;
}
#################################################
sub alleleAboveMaf{
my $i = shift; #1-based index of alt allele to assess
my $af_values = shift; #hash ref of INFO fields to their values
foreach my $k (keys %{$af_values}){
next if not $af_values->{$k};
next if $af_values->{$k} eq '.';
my @split = split(",", $af_values->{$k});
next if $split[$i] eq '.';
if ($k eq "AS_G5" or $k eq "AS_G5A"){
if ($opts{a} <= 0.05 and $split[$i] > 0){
return 1;
}
}elsif ($k eq "AS_COMMON"){
if ($opts{a} <= 0.01 and $split[$i] > 0){
return 1;
}
}else{#should be a standard allele freq now
if ($af_info_fields{$k}->{Type} eq 'Float' or $k eq 'AS_CAF'){
return 1 if $split[$i] >= $opts{a};
}else{
inform_user("WARNING: Don't know how to parse INFO field: $k.\n");
}
}
}
return 0;
}
#################################################
sub getAfInfoValues{
my $l = shift;
my %values = ();
foreach my $k (keys %af_info_fields){
$values{$k} = VcfReader::getVariantInfoField($l, $k);
}
return %values;
}
#################################################
sub getConsequences{
my $line = shift;
if ($opts{m} eq 'vep'){
return VcfReader::getVepFields
(
line => $line,
field => \@csq_fields,
vep_header => \%csq_header,
);
}else{
return VcfReader::getSnpEffFields
(
line => $line,
field => \@csq_fields,
snpeff_header => \%csq_header,
);
}
}
#################################################
sub consequenceMatchesClass{
my $annot = shift;
my $l = shift;
my $use_default_classes = shift;
if ($opts{m} eq 'vep'){
return consequenceMatchesVepClass($annot, $use_default_classes);
}else{
return consequenceMatchesSnpEffClass($annot, $l, $use_default_classes);
}
}
#################################################
sub consequenceMatchesVepClass{
my $annot = shift;
my $use_default = shift;
#intergenic variants have no feature associated with them - skip
return 0 if $annot->{consequence} eq "intergenic_variant";
#skip unwanted biotypes
return 0 if (exists $biotype_filters{$annot->{biotype}}) ;
#skip non-canonical transcripts if --canonical_only selected
if ($opts{canonical_only}) {
return 0 if ( not $annot->{canonical} );
}
my @anno_csq = split( /\&/, $annot->{consequence} );
#skip NMD transcripts
return 0 if ( grep { /NMD_transcript_variant/i } @anno_csq );
#eval filters trump annotation class
foreach my $evf (@eval_exp){
return 1 if VcfhacksUtils::evalFilter($evf, $annot);
}
#score filters trump annotation class
ANNO: foreach my $ac (@anno_csq){
$ac = lc($ac);#we've already converted %class_filters to all lowercase
if ($use_default){
return exists $default_classes{$ac} ;
}
if ( exists $class_filters{$ac} ){
if ($ac eq 'missense_variant' and %in_silico_filters){
return 1 if damagingMissenseVep($annot);
}elsif ( lc $ac eq 'splice_region_variant'
and $opts{consensus_splice_site}){
my $consensus = $annot->{splice_consensus};
next if not $consensus;
if ( $consensus !~ /SPLICE_CONSENSUS\S+/i ) {
inform_user(
"WARNING: SPLICE_CONSENSUS annotation '$consensus' is not " .
"recognised as an annotation from the SpliceConsensus VEP plugin.\n");
}else{
return 1;
}
}else{
return 1;
}
}
}
return 0;#no annotation matching %class_filters
}
#################################################
sub consequenceMatchesSnpEffClass{
my $annot = shift;
my $l = shift;
my $use_default = shift;
#skip variants with undef features (intergenic variants)
return if not defined $annot->{feature_id};
#skip unwanted biotypes
return 0 if exists $biotype_filters{lc $annot->{transcript_biotype} };
#eval filters trump annotation class
foreach my $evf (@eval_exp){
return 1 if VcfhacksUtils::evalFilter($evf, $annot);
}
my @anno_csq = split( /\&/, $annot->{annotation} );
ANNO: foreach my $ac (@anno_csq){
$ac = lc($ac);#we've already converted %class_filters to all lowercase
if ($use_default){
return exists $default_classes{$ac} ;
}
if ( exists $class_filters{$ac} ){
if ($ac eq 'missense_variant' and %in_silico_filters){
return 1 if damagingMissenseSnpEff($l);
}else{
return 1;
}
}
}
return 0;#no annotation matching %class_filters
}
#################################################
sub damagingMissenseVep{
#returns 1 if variant is damaging and should be kept
#returns 0 if variant is benign and should be filtered
my $anno = shift;
my %filter_matched = ();
PROG: foreach my $k ( sort keys %in_silico_filters) {
my $score = $anno->{ lc $k };
if ( not $score or $score =~ /^unknown/i ){
#don't filter if score is not available for this variant
#unless skip_unpredicted is in effect
$filter_matched{$k}++ unless $opts{skip_unpredicted};
next;
}
SCORE: foreach my $f ( @{ $in_silico_filters{$k} } ) {
my $do_filter = 0;
if ($f =~ /^(polyphen|sift|condel)$/){
$do_filter = isDamagingVepInsilico($k, $f, $score);
}else{
$do_filter = isDamagingDbnsfpInsilico($k, $f, $score);
}
if ($do_filter){
return 1 if $opts{keep_any_damaging};
$filter_matched{$k}++;
}
}
}
foreach my $k ( keys %in_silico_filters ) {
#filter if any of sift/condel/polyphen haven't matched our deleterious settings
return 0 if not exists $filter_matched{$k};
}
return 1;
}
#################################################
sub isDamagingDbnsfpInsilico{
my ($tool, $f, $score) = @_;
my @pred = split("&", $score); #multiple scores are separated by '&'
if ( $f =~ /^\d+(\.\d+)*$/ ) {#if we want to filter on score
if ($tool =~ /^(fathmm_score|provean_score|sift_score)$/){
#lower is more damaging for these tools
my $min = min(@pred);
return $min <= $f;
}else{
#higher = more damaging
my $max = max(@pred);
return $max >= $f;
}
}else{#if we want to filter on prediction
return grep { lc($_) eq lc($f) } @pred;
}
}
#################################################
sub isDamagingVepInsilico{
my ($tool, $f, $score) = @_;
if ( $f =~ /^\d(\.\d+)*$/ ) {#if we want to filter on score
my $prob;
if ( $score =~ /\((\d(\.\d+)*)\)/ ) {#get score
$prob = $1;
}else {
return 0;
}
if ( lc $tool eq 'polyphen'){
return $prob >= $f;
#higher is more damaging for polyphen
}else{
return $prob <= $f;
#lower is more damaging for sift and condel
}
}else{#if we want to filter on prediction
$score =~ s/\(.*\)//;#get prediction
return lc $f eq lc $score ; #damaging if match
}
}
#################################################
sub damagingMissenseSnpEff{
#returns 1 if variant is damaging and should be kept
#returns 0 if variant is benign and should be filtered
my $l = shift;
my %filter_matched = ();
PROG: foreach my $k ( sort keys %in_silico_filters) {
my $pred = VcfReader::getVariantInfoField($l, $k);
if (not defined $pred){
$filter_matched{$k}++ unless $opts{skip_unpredicted};
next;
}
my @scores = split(",", $pred);
#snpsift does not annotate properly per allele...
#... so as a fudge we have to just count any matching value
foreach my $score (@scores){
next if ( not $score or $score eq '.' );
foreach my $f ( @{ $in_silico_filters{$k} } ) {
if ( lc $f eq lc $score ) { #damaging
return 1 if $opts{k};
$filter_matched{$k}++;
next PROG;
}
}
}
}
foreach my $k ( keys %in_silico_filters ) {
#filter if any of sift/condel/polyphen haven't matched our deleterious settings
return 0 if not exists $filter_matched{$k};
}
return 1;
}
#################################################
=head1 NAME
getFunctionalVariants.pl - retrieve variants according to their functional annotation
=head1 SYNOPSIS
getFunctionalVariants.pl -i <variants.vcf> [options]
getFunctionalVariants.pl --help (show help message)
getFunctionalVariants.pl --manual (show manual page)
=cut
=head1 ARGUMENTS
=over 8
=item B<-i --input>
VCF file annotated with Ensembl's variant_effect_predictor.pl (VEP) script or SnpEff.
=item B<-o --output>
File to print output (optional). Will print to STDOUT by default.
=item B<-m --mode>
This program will attempt to detect the format of your input automatically by looking for VEP or SnpEff annotations, but you may specify either 'vep' or 'snpeff' with this option to select the mode employed by the script if you have a file with both VEP and SnpEff annotations. By default, if both annotations are present and the program is run without this option, VEP annotations will be used. NOTE: Only SnpEff annotations using the more recent 'ANN' style annotations rather than the older 'EFF' style are recognised by this program.
=item B<-s --samples>
Only keep variants present in one of these samples. You may specify 'all' (without quotes) instead of a sample name to select all samples in the VCF (potentially useful if used in confunction with --find_shared_genes). If you only want to identify variants present in a subset of these samples you can use the --num_matching argument to specify a minimum number of samples that must carry a variant.
=item B<-f --find_shared_genes>
Use this switch to only output variants that make up 'functional' variants in the same genes for the samples specified by the -s/--samples argument. If -s/--samples option is not specified this will act as if ALL samples in the VCF were specified. This will also return a list of genes containing 'functional' variants in these samples. If a filename is passed to this option the list will be printed to file, otherwise the list will be printed to STDERR.
=item B<-n --num_matching>
If s/--samples or -f/--find_shared_genes arguments are specified use this option to specify the minimum number of samples with variants in the same gene before outputting variants (and genes).
=item B<-e --equal_genotype>
If -s/--samples argument is specified use this flag if you only want to keep variants with identical genotypes in each sample (or a minimum number of samples as specified by the -n/--num_matching option).
=item B<--gene_counts>
Give a filename for outputting the counts of gene IDs vs number of samples with qualifying variants (e.g. for input to a burden test). Optionally the user may also add the positional arguments 'model', 'count mode' and 'sample count' separated by a commas after the filename (e.g. --gene_counts gene_counts.txt,recessive).
Valid values for B<model> are 'dominant' (only samples with heterozygous variants counted), 'recessive' (requires annotations from findBiallelic.pl to identify samples with compound het or homozygous variants) or 'both' (count a sample regardless of whether it is heterozygous or homozygous). If using the 'recessive' model you will need to annotate your variants with findBiallelic.pl first and use consistent settings for functional/in silico filters between both programs.
Valid values for B<count mode> are 'genotypes' or 'allele_counts'. The 'genotypes' setting is the default, where sample numbers are analyzed from genotype calls. The 'allele_counts' setting involves inferring sample numbers from INFO field annotations such as 'AC' or standard ExAC style het/hom counts.
The B<sample count> argument is a way of specifying the total number of samples. The main reason for using this would be when using 'allele_counts' to infer the number of samples where the total number of samples in the cohort would otherwise be unknown or could only be guessed from 'AN' style annotations.
An example use for this argument with a VEP annotated ExAC VCF might is given below:
--gene_counts ExAC.r0.3.gene_counts.txt,both,allele_counts,60706
=item B<--classes>
One or more mutation classes to retrieve. By default only variants labelled with one of the following classes will count as 'functional' variants:
B<VEP:>
TFBS_ablation
TFBS_amplification
frameshift_variant
inframe_deletion
inframe_insertion
initiator_codon_variant
missense_variant
protein_altering_variant
regulatory_region_ablation
regulatory_region_amplification
splice_acceptor_variant
splice_donor_variant
stop_gained
stop_lost
transcript_ablation
transcript_amplification
B<SnpEff:>
chromosome
coding_sequence_variant
inframe_insertion
disruptive_inframe_insertion
inframe_deletion
disruptive_inframe_deletion
exon_loss_variant
frameshift_variant
missense_variant
rare_amino_acid_variant
splice_acceptor_variant
splice_donor_variant
stop_lost
5_prime_UTR_premature_start_codon_gain_variant
start_lost
stop_gained
exon_loss
Available classes that can be chosen instead of (or in addition to - see below) these classes can be found in the data/vep_classes.tsv and data/snpeff_classes.tsv files respectively.
=item B<--add_classes>
Specify one or more classes, separated by spaces, to add to the default mutation classes used for finding functional variants.
=item B<--consensus_splice_site>
Use this flag in order to keep splice_region_variant classes only if they are in a splice consensus region as defined by the SpliceConsensus VEP plugin. You do not need to specify 'splice_region_variant' using --classes or --add_classes options when using this flag. You B<MUST> have used the SpliceConsensus plugin when running the VEP for this option to work correctly. This option is only used when running on VEP annotations.
=item B<--canonical_only>
Only consider canonical transcripts (VEP annotations only).
=item B<-c --cadd_filter>
If you have annotated CADD phred scores for alleles using rankOnCaddScore.pl you may use this option to specify a CADD phred score threshold for variants. Any alleles that have a CADD phred score below the value specified here will be filtered. Variants without a CADD phred score will not be filtered unless using the --skip_unpredicted option. You should have run rankOnCaddScore.pl with the --do_not_sort option to maintain chromosome order of your variants if you use this option with the -f/--find_shared_genes option or else you will need to sort your VCF before running getFunctionalVariants.pl.
=item B<--clinvar>
Specify the mode for dealing with ClinVar annotations. By default, if ClinVar annotations added by annotateSnps.pl are found, variants with 'pathogenic' or 'likely pathogenic' annotations are kept regardless of their functional consequence. Here you may specify the following values:
=over 12
=item B<all>
default behaviour, any variant with a ClinVar 'pathogenic' or 'likely pathogenic' will be kept
=item B<no_conflicted>
as above except that variants with conflicting annotations (e.g. 'benign' and 'likely pathogenic') will not be automatically kept
=item B<functional>
only keep ClinVar 'pathogenic' or 'likely pathogenic variants if they match one of the default 'functional' classes (see --classes argument). This option can be combined with 'all' or 'no_conflicted' options, separate the values with a comma.
=item B<disable>
turns off automatic retention of variants with ClinVar annotations.
=back
=item B<-d --damaging>
Specify in silico prediction scores to filter on.
If running on VEP annotations you may either use PolyPhen, SIFT and/or Condel scores given by VEP or annotations from dbNSFP added using the dbNSFP VEP plugin. If running on SnpEff, annotations scores provided by SnpSift's 'dbnsfp' mode will be used.
NOTE: when using SnpEff annotations, prediction program names are case-sensitive.
=over 12
=item B<VEP mode:>
Specify SIFT, PolyPhen or Condel labels or scores to filter on. Add the names of the programs you want to use, separated by spaces, after the --damaging option. By default SIFT will keep variants labelled as 'deleterious', Polyphen will keep variants labelled as 'possibly_damaging' or 'probably_damaging' and Condel will keep variants labelled as 'deleterious'.
If you want to filter on custom values specify values after each program name in the like so:
'polyphen=probably_damaging'
Seperate multiple values with commas - e.g.
'polyphen=probably_damaging,possibly_damaging,unknown'
You may specify scores between 0 and 1 to filter on scores rather than labels - e.g.
'sift=0.3'
For polyphen, variants with scores lower than this score are considered benign and filtered, for SIFT and Condel higher scores are considered benign.
B<Valid labels for SIFT:> deleterious, tolerated
B<Valid labels for Polyphen:> probably_damaging, possibly_damaging, benign, unknown
B<Valid labels for Condel:> deleterious, neutral
To use default values for all three programs you may use 'all' (i.e. '--damaging all') BUT PLEASE NOTE: if you have added dbNSFP annotations to your input VCF these will also be included (see below).
=item B<SnpEff or VEP dbNSFP mode:>
Your input must have been annotated using SnpSift's dbnsfp option (if using SnpEff annotations) or using the dbNSFP VEP plugin (if using VEP annotations). Recognised annotations are:
fathmm-MKL_coding_pred
FATHMM_pred
LRT_pred
MetaLR_pred
MetaSVM_pred
MutationAssessor_pred
MutationTaster_pred
PROVEAN_pred
Polyphen2_HDIV_pred
Polyphen2_HVAR_pred
SIFT_pred
You may instead choose to use the 'score' or dbNSFP 'rankscore' annotations for these tools (e.g. 'fathmm-MKL_coding_score').
You may specify 'all' to use the default values for all programs. Choosing custom values is also performed in the same way as for VEP annotations above (e.g. dbNSFP_MutationAssessor_pred=H,M,L).
=back
For more details of the available and default settings for these programs please see the files 'data/snpeff_insilico_pred.tsv' or 'data/vep_insilico_pred.tsv'.
The default behaviour is to only keep variants predicted as damaging by ALL programs specified, although if the value is not available for one or more programs than that program will be ignored for filtering purposes. See the next two options for alternative behaviours.
=item B<-k --keep_any_damaging>
If using multiple programs for filters for --damaging argument use this flag to keep variants predicted to be damaging according to ANY of these programs.
=item B<--skip_unpredicted>
Skip alleles that do not have a score from one or more programs specified by the -d/--damaging argument or if using the -c/--cadd_filter option and an allele has no CADD phred score. The --keep_any_damaging argument will override this behaviour for -d/--damaging predictions if any of the available predictions are considered damaging.
=item B<--biotype_filters>
By default features/transcripts with the following biotypes are ignored:
IG_C_pseudogene
IG_J_pseudogene
IG_V_pseudogene
nonsense_mediated_decay
non_stop_decay
processed_pseudogene
pseudogene
retained_intron
transcribed_processed_pseudogene
transcribed_unprocessed_pseudogene
TR_J_pseudogene
TR_V_pseudogene
unitary_pseudogene
unprocessed_pseudogene
You may override this filter by specifying biotypes to filter with this option or prevent biotype filtering using the --no_biotype_filtering option (see below).
Filtering will not occur if any non-filtered biotype contains a 'functional' variant. For example, with the default settings, if a splice_donor_variant only affects a 'pseudogene' transcript then this variant will be filtered, but if it also affects a 'protein_coding' transcript then it will not be filtered.
The 'data/biotypes.tsv' file contains a list of valid biotypes and the default behaviour of this program (i.e. 'keep' or 'filter').
=item B<--no_biotype_filtering>
Use this flag to consider consequences affecting ALL biotypes.
=item B<-a --allele_frequency>
Use a value between 0.00 and 1.00 to specify allele frequencey filtering for annotations from annotateSnps.pl, filterOnEvsMaf.pl or filterVcfOnVcf.pl if you've previously run these programs to annotate your VCF. If an allele frequency is available for an allele it will be filtered if equal to or greater than the value specfied here.
Note: allele frequencies added by VEP are not used for filtering as they check the allele frequency at the site, not of the specific alleles in your variant call.
=item B<-j --custom_af>
If using the --af/--allele_frequency option and your data contains allele frequency fields from sources not recognised by this program, you may give the name of these allele frequency INFO fields here and they will be used for filtering in addition to the default fields. Note that these annotations must contain an annotation per ALT allele (i.e. the INFO field header must specify 'Number=A') to work properly and the allele frequency should be expressed as a number between 0.00 and 1.00 in order to be compatible with the default allele frequency fields recognised by this program.
=item B<-u --max_sample_allele_frequency>
Use this option to specify an allele frequency (between 0.00 and 1.00) for filtering alleles in your VCF. Alleles present at this frequency or higher in your samples of interest will be filtered. If -s/--samples argument is specified, only these samples will be used for calculating the allele frequency, otherwise all samples in your VCF will be used.
=item B<-v --var_qual>
Minimum variant Phred-like quality score to consider. Variants with a QUAL field lower than this value will be filtered. Default is 20.
=item B<-g --gq>
Minimum genotype qualities to consider. Only applicable when using the -s/--samples option. Any genotype call below this threshold will be considered a no call. Default is 20
=item B<--pl>
Minimum 0-based phred-scale genotype likelihood (see the VCF spec for details) for alternative genotypes. Only applicable when using the -s/--samples option. When considering a given allele, if the sample has a PL below this value for a genotype not including this allele, the sample will not be considered a carrier of that allele. Default - not used.
=item B<--pass_filters>
Only consider variants with a PASS filter field. If the FILTER field for variant is not PASS the variant will be skipped.
=item B<--eval_filters>
Use this option to create custom filters for KEEPING variants on the basis of values in the VEP or SnpEff consequence fields.
Expressions must take the format of 'field name' 'comparator' 'value to compare' separated by white space. Multiple expressions can be used together along with the logical operators 'and', 'or' or 'xor'. The value for 'field name' will be used to extract the value for the given field from the VEP/SnpEff consequence INFO field. The resulting expression is evaluated using perl's built-in 'eval' function.
For example:
--eval_filters "LoF eq 'HC'"
#keeps any variant with a LoF annotation of 'HC'
--eval_filters "(ada_score >= 0.6 and rf_score >= 0.6) or maxentscan_diff > 5"
#keeps variants with ada_score and rf_scores of 0.6 or higher or with
#maxenstscan diff of 5 or higher
=item B<-b --progress>
Show a progress bar while working.
=item B<-h --help>
Show the program's help message.
=item B<--manual>
Show the program's manual page.
=back
=cut
=head1 EXAMPLES
getFunctionalVariants.pl -i input.vcf -o out.vcf
#output variants with a default 'functional' consequence (annotated by VEP or SnpEff)
getFunctionalVariants.pl -i input.vcf -o out.vcf -s sample1 sample2 -f shared_genes.txt
#as above but only for variants where sample1 or sample2 contain a variant allele with a 'functional' consequence
getFunctionalVariants.pl -s all -i input.vcf -o out.vcf -n 2 -af 0.001 -f shared_genes.txt
#
=cut
=head1 DESCRIPTION
In its simplest form this program will print specific variant classes from a VCF file annotated with either Ensembl's variant_effect_predictor.pl program or SnpEff and filter out other variant classes. Input must be a VCF annotated by the variant_effect_predictor.pl program using the '--vcf' option or a VCF annotated with SnpEff using the (now default) ANN style annotations.
As well as outputting variants on the basis of functional annotation, this program can identify genes with functional variants in specific samples using the --samples and --find_shared genes options.
=cut
=head1 AUTHOR
David A. Parry
University of Edinburgh
=head1 COPYRIGHT AND LICENSE
Copyright 2015, David A. Parry
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
=cut