https://github.com/gantzgraf/vcfhacks
Tip revision: 3f98bcf4a42745fdc4d47b654788e7ec10c497e9 authored by David A. Parry on 08 May 2015, 13:22:58 UTC
update readme files for new release
update readme files for new release
Tip revision: 3f98bcf
getFunctionalVariantsVep.pl
#!/usr/bin/perl
#
use warnings;
use strict;
use Getopt::Long;
use Pod::Usage;
use Data::Dumper;
use Term::ProgressBar;
use FindBin;
use lib "$FindBin::Bin/lib";
use ParseVCF;
my $help;
my $manual;
my $infile;
my $outfile;
my @classes;
my $no_head;
my $progress;
my $pass;
my $total_lines;
my @add;
my $canonical_only;
my $gmaf;#filter GMAF if present if equal to or above this value (0.0 - 0.5)
my $any_maf;#filter MAF if present if equal to or above this value in any population
my @damaging;
my $keep_any_damaging;
my $filter_unpredicted;
my @gene_lists; #array of files containing Ensembl Gene IDs to filter
my @samples;
my $in_every_sample;
my $matching_genes = 0;#will change to '' if user specifies --find_shared_genes but provides no argument, in which case print to STDERR
my $splice_consensus = 0; #use this flag to check for SpliceConsensus VEP plugin annotations
my %opts = ("help" => \$help,
"manual" => \$manual,
"pass" => \$pass,
"progress" => \$progress,
"input" => \$infile,
"output" => \$outfile,
"classes" => \@classes,
"additional_classes" => \@add,
"remove_headers" => \$no_head,
"canonical_only" => \$canonical_only,
"maf" => \$any_maf,
"gmaf" => \$gmaf,
"damaging" => \@damaging,
"keep_any_damaging" => \$keep_any_damaging,
"unpredicted_missense" => \$filter_unpredicted,
"list" => \@gene_lists,
"samples" => \@samples,
"each_sample" => \$in_every_sample,
"find_shared_genes" => \$matching_genes,
"consensus_splice_site" => \$splice_consensus,
);
GetOptions(\%opts,
"help" ,
"manual" ,
"pass" ,
"progress" ,
"input=s" ,
"output=s" ,
"classes=s{,}" ,
"additional_classes=s{,}" ,
"remove_headers" ,
"canonical_only" ,
"maf=f" ,
"gmaf=f" ,
"damaging=s{,}" ,
"keep_any_damaging" ,
"unpredicted_missense" ,
"list=s{,}",
"samples=s{,}",
"each_sample",
"find_shared_genes:s",
"consensus_splice_site",
) or pod2usage(-message => "Syntax error", -exitval => 2);
pod2usage(-verbose => 2) if ($manual);
pod2usage(-verbose => 1) if ($help);
pod2usage(-message =>
"Syntax error - an input file must be supplied.\n",
-exitval => 2) if (not $infile);
pod2usage(-message =>
"--gmaf option requires a value between 0.00 and 0.50 to filter on global minor allele frequency.\n",
-exitval => 2) if (defined $gmaf && ($gmaf < 0 or $gmaf > 0.5));
if ($matching_genes){
pod2usage(-message =>
"--find_shared_genes option requires at least one sample to be specified with the --samples argument.\n",
-exitval => 2) if not @samples;
}elsif ($matching_genes eq ''){
pod2usage(-message =>
"--find_shared_genes option requires at least one sample to be specified with the --samples argument.\n",
-exitval => 2) if not @samples;
}
my @valid = qw (transcript_ablation
splice_donor_variant
splice_acceptor_variant
stop_gained
frameshift_variant
stop_lost
initiator_codon_variant
inframe_insertion
inframe_deletion
missense_variant
transcript_amplification
splice_region_variant
incomplete_terminal_codon_variant
synonymous_variant
stop_retained_variant
coding_sequence_variant
mature_miRNA_variant
5_prime_UTR_variant
3_prime_UTR_variant
intron_variant
NMD_transcript_variant
non_coding_exon_variant
nc_transcript_variant
upstream_gene_variant
downstream_gene_variant
TFBS_ablation
TFBS_amplification
TF_binding_site_variant
regulatory_region_variant
regulatory_region_ablation
regulatory_region_amplification
feature_elongation
feature_truncation
intergenic_variant);
if (not @classes){
@classes = qw (transcript_ablation
splice_donor_variant
splice_acceptor_variant
stop_gained
frameshift_variant
stop_lost
initiator_codon_variant
inframe_insertion
inframe_deletion
missense_variant
transcript_amplification
TFBS_ablation
TFBS_amplification
regulatory_region_ablation
regulatory_region_amplification);
}
push (@classes, @add) if (@add);
push @classes, "splice_region_variant" if $splice_consensus;
foreach my $class (@classes){
die "Error - variant class '$class' not recognised.\n" if not grep {/^$class$/i} @valid;
}
my $OUT;#output filehandle
if ($outfile){
open ($OUT, ">$outfile") || die "Can't open file $outfile for writing.\n";
}else{
$OUT = \*STDOUT;
}
my $LIST; #output filehandle for matching genes
if ($matching_genes){
open ($LIST, ">$matching_genes") || die "Can't open $matching_genes for writing: $!\n";
}elsif($matching_genes eq ''){#user specified --list option but provided no argument
$LIST = \*STDERR;
$matching_genes = 1;
}
my @csq_fields = qw (allele gene feature feature_type consequence hgnc);#default fields to retrieve from CSQ INFO field
my %damage_filters = (); #hash of prediction program names and values to filter
if (@damaging){
my %valid_damaging = (sift => ["deleterious", "tolerated"], polyphen => ["probably_damaging", "possibly_damaging", "benign", "unknown"], condel => ["deleterious", "neutral"]);
my %default_damaging = (sift => ["deleterious", ], polyphen => ["probably_damaging", "possibly_damaging",], condel => ["deleterious", ]);
foreach my $d (@damaging){
my ($prog, $label) = split("=", $d);
if (exists $valid_damaging{lc$prog}){
push @csq_fields, lc($prog);
no warnings 'uninitialized';
my @filters = add_damaging_filters(lc$prog, lc$label, \%valid_damaging, \%default_damaging);
push @{$damage_filters{lc$prog}}, @filters;
}elsif (lc($prog) eq 'all'){
%damage_filters = %default_damaging;
push @csq_fields, qw(sift polyphen condel);
}else{
die "Unrecognised value ($d) passed to --damaging argument. See --help for more info.\n";
}
}
}else{
if ($keep_any_damaging and $filter_unpredicted){
die "--keep_any_damaging and --unpredicted_missense arguments can only be used when --damaging argument is in effect.\n";
}elsif ($keep_any_damaging){
die "--keep_any_damaging argument can only be used when --damaging argument is in effect.\n";
}elsif ($filter_unpredicted){
die "--unpredicted_missense argument can only be used when --damaging argument is in effect.\n";
}
}
if ($canonical_only){
push @csq_fields, 'canonical';
}
if (defined $gmaf or defined $any_maf){
push @csq_fields, 'gmaf';
}
if ($splice_consensus){
push @csq_fields, 'splice_consensus';
}
my @genes_to_filter;
if (@gene_lists){
foreach my $gene_file (@gene_lists){
open (my $GENE, $gene_file) or die "Can't open gene list file '$gene_file': $!\n";
chomp (my @head = split("\t", <$GENE>));
no warnings 'uninitialized';
my $id_field = 0;
$id_field++ until $head[$id_field] eq 'Ensembl Gene ID' or $id_field > $#head;
if ($id_field > $#head){
die "Can't find 'Ensembl Gene ID' field in header of gene list file '$gene_file'.\n";
}
while (chomp (my $line = <$GENE>)){
my @split = split("\t", $line);
push @genes_to_filter, $split[$id_field];
}
}
my %seen = ();
@genes_to_filter = grep { ! $seen{$_}++ } @genes_to_filter;
@genes_to_filter = sort @genes_to_filter;
}
my $file = 0;
my $got = 0;
my $count = 0 ;
my %genes_per_sample = (); # will take on the format of $genes_per_sample{sample} = anonymous hash with keys = transcript id values = gene symbol
# e.g. $genes_per_sample{sample}->{ENST0012345} = {AGENE}
my @available_mafs = ();
my $vcf_obj = ParseVCF->new(file=> $infile);
my $vep_header = $vcf_obj->readVepHeader();
die "No 'consequence' field identified in header for file $infile - " .
"please annotate with Ensembl's variant_effect_precictor.pl script.\n" if (not exists $vep_header->{consequence});
die "No GMAF field in header for file $infile - please annotate GMAF with Ensembl's variant_effect_precictor.pl script.\n"
if (defined $gmaf and not exists $vep_header->{gmaf});
if (defined $any_maf){
foreach my $key (keys %{$vep_header}){
if ($key =~ /\w_MAF$/){
push @available_mafs, $key;
push @csq_fields, $key;
}
}
}
if (@csq_fields > 1){
my %seen = ();
@csq_fields = grep { ! $seen{$_}++ } @csq_fields;
}
foreach my $c (@csq_fields){
if (not exists $vep_header->{$c}){
die "Couldn't find '$c' VEP field in header - please ensure your VCF is annotated with " .
"Ensembl's variant effect precictor specifying the appropriate annotations.\n";
}
}
my $progressbar;
my $line_count = 0;
my $next_update = 0;
if ($progress){
if ($infile eq "-"){
print STDERR "Can't use --progress option when input is from STDIN\n";
$progress = 0;
}else{
$progressbar = Term::ProgressBar->new({name => "Retrieving:", count => $vcf_obj->countLines("variants"), ETA => "linear", });
}
}
unless ($no_head){
print $OUT $vcf_obj->getHeader(0) ."##getFunctionalVariantsVep.pl=\"";
my @opt_string = ();
foreach my $k (sort keys %opts){
if (not ref $opts{$k}){
push @opt_string, "$k=$opts{$k}";
}elsif (ref $opts{$k} eq 'SCALAR'){
if (defined ${$opts{$k}}){
push @opt_string, "$k=${$opts{$k}}";
}else{
push @opt_string, "$k=undef";
}
}elsif (ref $opts{$k} eq 'ARRAY'){
if (@{$opts{$k}}){
push @opt_string, "$k=" .join(",", @{$opts{$k}});
}else{
push @opt_string, "$k=undef";
}
}
}
print $OUT join(" ", @opt_string) . "\"\n" . $vcf_obj->getHeader(1);
}
LINE: while (my $line = $vcf_obj->readLine){
$count++;
$line_count++;
my $printed_line = 0;
if ($progress){
$next_update = $progressbar->update($line_count) if $line_count >= $next_update;
}
if ($pass){
next LINE if $vcf_obj->getVariantField("FILTER") ne 'PASS';
}
#get vep consequence fields
my @csq = $vcf_obj->getVepFields(\@csq_fields); #returns array of hashes e.g. $csq[0]->{Gene} = 'ENSG000012345' ; $csq[0]->{Consequence} = 'missense_variant'
die "No consequence field found for line:\n$line\nPlease annotated your VCF file with ensembl's variant effect precictor before running this program.\n" if not @csq;
#assign each ALT genotype to a VEP allele (VEP cuts off the first nt of insertion or deletion alt alleles)
my %vep_alleles = ();#hash of VEP alleles to VCF's ALT alleles
my @alts = $vcf_obj->readAlleles(alt_alleles => 1);
my $ref = $vcf_obj->getVariantField("REF");
my @v_all = $vcf_obj->altsToVepAllele( ref => $ref, alt => \@alts);
@vep_alleles{ @v_all } = @alts;#$vep{$allele} now corresponds to GT value
#START FILTERING on CSQ fields
#check whether canonical transcript
ANNOT: foreach my $annot (@csq){
my @anno_csq = split(/\&/, $annot->{consequence});
#skip NMD transcripts
if (grep {/NMD_transcript_variant/i} @anno_csq){
next ANNOT;
}
#check against gene filter list
if (@genes_to_filter){#this will be empty if --list argument wasn't specified
my $i = binsearch($annot->{gene}, \@genes_to_filter);
if ($i > -1){
next ANNOT;
}
}
if($canonical_only){
next if (not $annot->{canonical});
}
#check minor allele frequency
if (defined $gmaf){
if ($annot->{gmaf}){
if ($annot->{gmaf} =~ /\w+:(\d\.\d+)/){
next if $1 >= $gmaf;
}
}
}
if (defined $any_maf){
foreach my $some_maf (@available_mafs){
if ($annot->{$some_maf}){
if ($annot->{$some_maf} =~ /\w+:(\d\.\d+)/){
next if $1 >= $any_maf;
}
}
}
}
#check whether annotation is present in @samples if @samples specified
my @found_in_sample = ();
foreach my $sample (@samples){
my @sample_alleles = $vcf_obj->getSampleActualGenotypes(sample => $sample, return_alleles_only => 1);
if ($in_every_sample){
next ANNOT if not grep {/^$vep_alleles{$annot->{allele}}$/} @sample_alleles;
push @found_in_sample, $sample;
}else{
if (grep {/^$vep_alleles{$annot->{allele}}$/} @sample_alleles){
push @found_in_sample, $sample;
}
}
}
if (@samples){
next ANNOT if not @found_in_sample;
}
#check vep consequence class
CLASS: foreach my $class (@classes){
foreach my $ac (@anno_csq){
if (lc$ac eq lc$class){
if (lc$class eq 'missense_variant' and %damage_filters){
next if (filter_missense($annot, \%damage_filters, $keep_any_damaging, $filter_unpredicted));
}elsif(lc$class eq 'splice_region_variant' and $splice_consensus){
my $consensus = $annot->{splice_consensus};
next if not $consensus;
if ($consensus !~/SPLICE_CONSENSUS\S+/i){
print STDERR "WARNING - SPLICE_CONSENSUS annotation $consensus is".
" not recognised as an annotation from the SpliceConsensus VEP plugin.\n";
next;
}
}
print $OUT "$line\n" if not $printed_line;
$printed_line++;
$got++;
#need to cycle through each annotation if checking for matching genes between samples
#otherwise we can skip to next line
if ($matching_genes){
my $symbol ;
if ($annot->{hgnc}){
$symbol = $annot->{hgnc};
}elsif($annot->{gene}){
$symbol = $annot->{gene};
}else{
$symbol = $annot->{feature};
}
foreach my $sample (@found_in_sample){
$genes_per_sample{$sample}->{$annot->{feature}} = $symbol;
}
}else{
next LINE;
}
}
}
}
}
}
close $OUT;
print STDERR "$got variants of $count identified.\n";
if ($matching_genes){
print STDERR "Identifying matching genes between samples...\n";
my %matches = ();#symbol = key, value = array of transcript ids
my $transcript_count = 0;
TRANSCRIPT: foreach my $transcript ( keys %{$genes_per_sample{$samples[0]}}){
#check all transcripts from first sample in all other samples
for (my $i = 1; $i < @samples; $i++){
if (not $genes_per_sample{$samples[$i]}->{$transcript}){
next TRANSCRIPT;
}
}
#if we haven't hit 'next TRANSCRIPT' gene must be shared in all samples
$transcript_count++;
push @{$matches{$genes_per_sample{$samples[0]}->{$transcript}}}, $transcript;
}
if (keys %matches){
foreach my $gene (sort keys %matches){
my %seen = ();
@{$matches{$gene}} = grep {! $seen{$_}++ } @{$matches{$gene}};#remove duplicates
print $LIST "$gene:" . join(",", @{$matches{$gene}}) . "\n";
}
print STDERR "$transcript_count matching transcripts found between samples.\n";
print STDERR scalar (keys %matches) . " matching genes found between samples.\n";
}else{
print STDERR "0 matching transcripts found between samples.\n";
}
}
###################
###########
sub filter_missense{
#returns 1 if missense should be filtered
#otherwise returns 0;
# uses $keep_any setting to return 0 if any of these predictions match, otherwise all available
# scores must be deleterious/damaging
# if $filter_missing is used a variant will be filtered if no score is available (overriden by $keep_any setting)
my ($anno, $filter_hash, $keep_any, $filter_missing) = @_;
#my %default_damaging = (sift => ["deleterious", ], polyphen => ["probably_damaging", "possibly_damaging",], condel => ["deleterious", ]);
my %filter_matched = ();
PROG: foreach my $k (sort keys %$filter_hash){
my $score = $anno->{lc$k};
if (not $score or $score =~ /^unknown/i){ #don't filter if score is not available for this variant unless $filter_missing is in effect
$filter_matched{$k}++ unless $filter_missing;
next;
}
SCORE: foreach my $f (@{$filter_hash->{$k}}){
if ($f =~ /^\d(\.\d+)*$/){
my $prob;
if ($score =~ /^(\d(\.\d+)*)/){
$prob = $1;
}else{
next SCORE;#if score not available for this feature ignore and move on
}
if (lc$k eq 'polyphen'){
if ($prob >= $f){#higher is more damaging for polyphen - damaging
return 0 if $keep_any;
$filter_matched{$k}++;
next PROG;
}else{#benign
}
}else{
if ($prob <= $f){#lower is more damaging for sift and condel - damaging
return 0 if $keep_any;
$filter_matched{$k}++;
next PROG;
}else{#benign
}
}
}else{
$score =~ s/\(.*\)//;
if (lc$f eq lc$score){#damaging
return 0 if $keep_any;
$filter_matched{$k}++;
next PROG;
}
}
}
}
foreach my $k (sort keys %$filter_hash){
#filter if any of sift/condel/polyphen haven't matched our deleterious settings
return 1 if not exists $filter_matched{$k};
}
return 0;
}
###########
sub add_damaging_filters{
my ($prog, $label, $valid_hash, $default_hash) = @_;
if ($label){
if ($label =~ /^\d(\.\d+)*$/){
die "Numeric values for condel, polyphen or sift filtering must be between 0 and 1.\n" if ( $label < 0 or $label > 1);
return $label;
}else{
my @lb = split(",", $label);
foreach my $l (@lb){
die "Invalid filter parameter '$l' used with --damaging argument. See --help for valid arguments.\n" if not grep{/^$l$/} @{$valid_hash->{$prog}};
}
return @lb;
}
}else{#give default values for $prog
return @{$default_hash->{$prog}};
}
}
###########
sub binsearch{
my ($x, $ar) = @_;
my $u = $#{$ar};
my $l = 0;
while ($l <= $u){
my $i = int(($l + $u)/2);
if ($x lt $ar->[$i]){
$u = $i -1;
}elsif ($x gt $ar->[$i]){
$l = $i +1;
}else{
return $i;
}
}
return -1;
}
=head1 NAME
getFunctionalVariantsVep.pl - retrieve specific variant classes from an annotated vcf file.
=head1 SYNOPSIS
=over
=item getFunctionalVariantsVep.pl --input <annotated vcf file> [options]
=item getFunctionalVariantsVep.pl --help for help message
=item getFunctionalVariantsVep.pl --manual for manual page
=back
=cut
=head1 ARGUMENTS
=over 8
=item B<-i --input>
VCF file with functional annotations from variant_effect_predictor.pl script.
=item B<-o --output>
File to write output. Default is STDOUT.
=item B<--canonical_only>
Only look at consequences for canonical transcripts.
=item B<--classes>
If you wish to specify a custom set of variant classes to retrieve enter them here separated by spaces.
Default classes are:
transcript_ablation
splice_donor_variant
splice_acceptor_variant
stop_gained
frameshift_variant
stop_lost
initiator_codon_variant
inframe_insertion
inframe_deletion
missense_variant
transcript_amplification
TFBS_ablation
TFBS_amplification
regulatory_region_ablation
regulatory_region_amplification
The user can specify one or more of the following classes instead:
transcript_ablation
splice_donor_variant
splice_acceptor_variant
stop_gained
frameshift_variant
stop_lost
initiator_codon_variant
inframe_insertion
inframe_deletion
missense_variant
transcript_amplification
splice_region_variant
incomplete_terminal_codon_variant
synonymous_variant
stop_retained_variant
coding_sequence_variant
mature_miRNA_variant
5_prime_UTR_variant
3_prime_UTR_variant
intron_variant
NMD_transcript_variant
non_coding_exon_variant
nc_transcript_variant
upstream_gene_variant
downstream_gene_variant
TFBS_ablation
TFBS_amplification
TF_binding_site_variant
regulatory_region_variant
regulatory_region_ablation
regulatory_region_amplification
feature_elongation
feature_truncation
intergenic_variant
=item B<-a --add_classes>
Specify one or more classes, separated by spaces, to add to the default mutation classes used.
=item B<--consensus_splice_site>
Use this flag in order to keep splice_region_variant classes only if they are in a splice consensus region as defined by the SpliceConsensus plugin. You do not need to specify 'splice_region_variant' using --classes or --add_classes options when using this flag. You B<MUST> have used the SpliceConsensus plugin when running the VEP for this option to work correctly.
=item B<-g --gmaf>
Use a value between 0.00 and 0.50 to specify global minor allele frequencey filtering. If GMAF is available for variant it will be filtered if equal to or greater than the value specfied here.
=item B<--maf>
Like gmaf but filter on any population specific minor allele frequency annotated by the VEP as well as the GMAF.
=item B<-d --damaging>
Specify SIFT, PolyPhen or Condel labels or scores to filter on. Add the names of the programs you want to use, separated by spaces, after the --damaging option. By default SIFT will keep variants labelled as 'deleterious', Polyphen will keep variants labelled as 'possibly_damaging' or 'probably_damaging' and Condel will keep variants labelled as 'deleterious'.
If you want to filter on custom values specify values after each program name in the like so: 'polyphen=probably_damaging'. Separate multiple values with commas - e.g. 'polyphen=probably_damaging,possibly_damaging,unknown'. You may specify scores between 0 and 1 to filter on scores rather than labels - e.g. 'sift=0.3'. For polyphen, variants with scores lower than this score are considered benign and filtered, for SIFT and Condel higher scores are considered benign.
=over
=item Valid labels for SIFT: deleterious, tolerated
=item Valid labels for Polyphen: probably_damaging, possibly_damaging, benign, unknown
=item Valid labels for Condel : deleterious, neutral
=back
To use default values for all three programs use 'all' (i.e. '--damaging all').
The default behaviour is to only keep variants predicted as damaging by ALL programs specified, although if the value is not available for one or more programs than that program will be ignored for filtering purposes. For best results/maximum flexibility it is recommended that you annotate both 'scores' and 'predictions' when generating your input VCF with variant_effect_predictor.pl.
=item B<-k --keep_any_damaging>
If using multiple programs with the --damaging argument use this flag to keep variants predicted to be damaging according to ANY of these programs.
=item B<-u --unpredicted_missense>
Skip variants that do not have a score from one or more programs specified by the --damaging argument. The --keep_any_damaging argument will override this behaviour if any of the available predictions are considered damaging.
=item B<-l --list>
File containing a list of genes that should be filtered (e.g. LOF-tolerant genes). The program expects a tab delimited list with a header line specifying 'Ensembl Gene ID' for the column containing gene identifiers.
=item B<-r --remove_headers>
Use this flag to prevent printing of headers to output.
=item B<--pass>
Keep only variants passing filters (i.e. FILTER field is PASS).
=item B<-s --samples>
Only keep variants present in at least one of these samples. Can change behaviour to keep only variants present in ALL of these samples using the --each_sample switch.
=item B<-e --each_sample>
When --samples arguments are specified this switch will cause the program to only return variants present in ALL of the samples specified by the --samples argument.
=item B<-f --find_shared_genes>
If --samples argument is specified this switch will return a list of genes shared by each sample according to the filtering criteria. If a filename is given the list will be printed to file, otherwise the list will be printed to STDERR.
=item B<--progress>
Display progress bar.
=item B<-h --help>
Display help message.
=item B<--manual>
Show manual page.
=back
=cut
=head1 EXAMPLES
=over
=item getFunctionalVariantsVep.pl -i var.vcf
(filter on default set of consequences, print output to STDOUT.)
=item getFunctionalVariantsVep.pl -i var.vcf --classes stop_gained stop_lost
(filter on 'stop_gained' and 'stop_lost' classes only.)
=item getFunctionalVariantsVep.pl -i var.vcf --maf 0.01
(filter on default set of consequences, filter anything with a MAF greater than or equal to 0.01 [1 %].)
=item getFunctionalVariantsVep.pl -i var.vcf --maf 0.01 --damaging all
(As above but only keep missense variation predicted damaging by all annotation programs.)
=item getFunctionalVariantsVep.pl -i var.vcf --maf 0.01 --damaging all --list gene_list.txt
(As above but also filter genes in gene_list.txt file.)
=item getFunctionalVariantsVep.pl -i var.vcf --maf 0.01 --damaging all --list gene_list.txt -o output.vcf
(As above but specifying output file)
=back
=head1 DESCRIPTION
In its simplest form this program will print specific variant classes from a VCF file annotated with Ensembl's variant_effect_predictor.pl program and filter out others. Input must be a VCF annotated by the variant_effect_predictor.pl program using the '--vcf' option, annotations are read from the INFO field of the VCF. By default the following classes are kept:
transcript_ablation
splice_donor_variant
splice_acceptor_variant
stop_gained
frameshift_variant
stop_lost
initiator_codon_variant
inframe_insertion
inframe_deletion
missense_variant
transcript_amplification
TFBS_ablation
TFBS_amplification
regulatory_region_ablation
regulatory_region_amplification
You can add extra classes using the --add option or specify a totally different set of classes using the --classes option. Options are also available to additionally filter on Polyphen/SIFT/Condel prediction scores, GMAF, canonical transcripts or gene lists as detailed above.
=head1 AUTHOR
David A. Parry
University of Leeds
=head1 COPYRIGHT AND LICENSE
Copyright 2013 David A. Parry
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.