Revision 469689991e7dbce2be4c4f618584304d91841c49 authored by Chase W. Nelson on 01 October 2020, 17:39:04 UTC, committed by Chase W. Nelson on 01 October 2020, 17:39:04 UTC
1 parent 0b9f8cb
filter_VCF.py
#! /usr/bin/env python
###############################################################################
## LICENSE
##
## Copyright (C) 2020
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
###############################################################################
# DATE: 2020
# AUTHOR: Chase W. Nelson
# CONTACT: cnelson@gate.sinica.edu.tw
# CITATION: https://github.com/chasewnelson/SARS-CoV-2-ORF3d
from Bio import SeqIO
from scipy.stats import binom
import numpy as np
import os
import random
import re
import sys
Usage = "Script to dynamically filter within-host variants to control for a user-defined false-discovery rate"
#n = total reads
#p = error rate / 3
#x = number of reads of minor allele
#FDR = (1 - binomcdf(n, p, x -1) ) * length_of_the_genome
# Check argument number
if not len(sys.argv) == 6:
raise SystemExit("\n### TERMINATED: need 5 unnamed arguments:\n" + \
#" # (1) .VCF SNP report\n" + \
" # (1) analysis-wide FDR cutoff\n" + \
" # (2) minor allele frequency cutoff (min allowed)\n" + \
" # (3) genome length\n" + \
" # (4) sequencing error rate per site (assumes all nts equally probable)\n" + \
" # (5) number of samples in analysis\n\n")
#infile_VCF = sys.argv[1]
FDR_cutoff = float(sys.argv[1])
MAF_cutoff = float(sys.argv[2])
genome_len = int(sys.argv[3])
error_per_site = float(sys.argv[4])
num_samples = int(sys.argv[5])
###############################################################################
### Gather VCF files in working directory
VCF_filenames = [name for name in os.listdir(".") if os.path.isfile(name) and name.endswith(".vcf")]
print("Analysis-wide FDR cutoff: " + str(FDR_cutoff))
print("Minor allele frequency cutoff (min allowed): " + str(MAF_cutoff))
print("Genome length: " + str(genome_len))
print("Sequencing error rate per site (assumes all nts equally probable): " + str(error_per_site))
print("Number of samples in analysis: " + str(num_samples))
print("VCF files to examine, gathered from working directory: " + str(VCF_filenames))
print("Will write output to files with names of the form: <VCF_name>_filtered.vcf\n")
###############################################################################
### Extract SNP data from VCF
# Prepare regex
VCF_DP_pattern = r"DP=\d+" # DP=266
VCF_DP_regex = re.compile(VCF_DP_pattern)
VCF_AF_pattern = r"AF=[\d\.]+" # AF=0.015038
VCF_AF_regex = re.compile(VCF_AF_pattern)
sample_count = 0
total_variant_count = 0
total_variant_pass_count = 0;
variants_pass_freqs = []
variants_pass_ACs = []
variants_pass_DPs = []
variants_fail_freqs = []
variants_fail_ACs = []
variants_fail_DPs = []
for this_VCF in VCF_filenames:
# Check file exists
if os.path.isfile(this_VCF): # this_VCF
# Create outfile name
this_outfile_prefix = str(this_VCF)
this_outfile_prefix = this_outfile_prefix.replace(".vcf", "")
this_outfile_name = this_outfile_prefix + "_filtered.vcf"
print("\nFiltered VCF output to: " + this_outfile_name + "\nEliminated records:")
# Open output file for writing
this_outfile_hdl = open(this_outfile_name, "w")
new_metadata_lines = "##FILTER=<ID=MAF_cutoff,Description=\"Minor allele frequency cutoff (min allowed): " + \
str(round(MAF_cutoff, 5)) + "\">\n" + \
"##FILTER=<ID=FDR_cutoff,Description=\"Analysis-wide FDR cutoff (max): " + \
str(round(FDR_cutoff, 5)) + "\">\n" + \
"##FILTER=<ID=error_per_site,Description=\"Sequencing error rate per site: " + \
str(round(error_per_site, 5)) + "\">" # no newline
# Keep track of number pass
variant_count = 0
variant_pass_count = 0;
with open(this_VCF, "r") as f:
lines = (line.rstrip() for line in f)
this_VCF_ID = str(this_VCF).rstrip(".vcf")
sample_count += 1
for line in lines:
if line.startswith("##"): # simply re-print existing metadata headers
# Write to FASTA file
this_outfile_hdl.write(line + "\n")
elif line.startswith("#"): # add new metadata, then print header line
this_outfile_hdl.write(new_metadata_lines + "\n")
this_outfile_hdl.write(line + "\n")
else:
variant_count += 1
total_variant_count += 1
line_list = line.split("\t")
# Extract information for this SNP
this_CHROM = line_list[0]
this_POS = int(line_list[1])
this_INFO = line_list[7]
this_DP = VCF_DP_regex.search(this_INFO)
this_DP = this_INFO[this_DP.start():this_DP.end()]
this_DP = int(this_DP.replace("DP=", ""))
this_AF = VCF_AF_regex.search(this_INFO)
this_AF = this_INFO[this_AF.start():this_AF.end()] # AF=0.367003
this_AF = float(this_AF.replace("AF=", ""))
# Make sure it's the MINOR allele frequency
this_AF_minor = float(this_AF)
if this_AF_minor > 0.5:
this_AF_minor = 1 - this_AF_minor
# Allele count to be the nearest integer
this_AC_minor = round(this_DP * this_AF_minor)
# Calculate analysis-wide FDR for this allele count and coverage
FDR_per_genome_dataset = (1 - binom.cdf(this_AC_minor - 1, this_DP, error_per_site / 3)) * genome_len * num_samples
if FDR_per_genome_dataset > FDR_cutoff or this_AF_minor < MAF_cutoff: # exclude
variants_fail_freqs.append(this_AF_minor)
variants_fail_ACs.append(this_AC_minor)
variants_fail_DPs.append(this_DP)
print("Excluded the following record: " + str(this_VCF) + ", " + str(this_CHROM) + ":" + \
str(this_POS) + ",DP=" + str(this_DP) + ",AC=" + str(this_AC_minor) + \
",FDR=" + str(round(FDR_per_genome_dataset, 5)) + ",MAF=" + \
str(round(this_AF_minor, 5)))
else: # include (print)
variants_pass_freqs.append(this_AF_minor)
variants_pass_ACs.append(this_AC_minor)
variants_pass_DPs.append(this_DP)
variant_pass_count += 1
total_variant_pass_count += 1
this_outfile_hdl.write(line + "\n")
this_outfile_hdl.close()
print("Variants pass: " + str(variant_pass_count) + "/" + str(variant_count))
else:
raise SystemExit("\n### TERMINATED: file doesn't exist: " + str(this_VCF) + "\n")
###############################################################################
### FINISH
mean_freq_pass = np.mean(np.array(variants_pass_freqs))
median_freq_pass = np.median(np.array(variants_pass_freqs))
min_freq_pass = np.min(np.array(variants_pass_freqs))
max_freq_pass = np.max(np.array(variants_pass_freqs))
mean_AC_pass = np.mean(np.array(variants_pass_ACs))
median_AC_pass = np.median(np.array(variants_pass_ACs))
min_AC_pass = np.min(np.array(variants_pass_ACs))
max_AC_pass = np.max(np.array(variants_pass_ACs))
mean_DP_pass = np.mean(np.array(variants_pass_DPs))
median_DP_pass = np.median(np.array(variants_pass_DPs))
min_DP_pass = np.min(np.array(variants_pass_DPs))
max_DP_pass = np.max(np.array(variants_pass_DPs))
mean_freq_fail = np.mean(np.array(variants_fail_freqs))
median_freq_fail = np.median(np.array(variants_fail_freqs))
min_freq_fail = np.min(np.array(variants_fail_freqs))
max_freq_fail = np.max(np.array(variants_fail_freqs))
mean_AC_fail = np.mean(np.array(variants_fail_ACs))
median_AC_fail = np.median(np.array(variants_fail_ACs))
min_AC_fail = np.min(np.array(variants_fail_ACs))
max_AC_fail = np.max(np.array(variants_fail_ACs))
mean_DP_fail = np.mean(np.array(variants_fail_DPs))
median_DP_fail = np.median(np.array(variants_fail_DPs))
min_DP_fail = np.min(np.array(variants_fail_DPs))
max_DP_fail = np.max(np.array(variants_fail_DPs))
print("\n")
print("###############################################################################")
print("Total samples examined: " + str(sample_count))
print("Total variants examined: " + str(total_variant_count))
print("Total variants pass: " + str(total_variant_pass_count))
print("Fraction variants pass: " + str(total_variant_pass_count / total_variant_count))
print("min / mean / median / max FREQUENCY of passing alleles: " + str(min_freq_pass) + " / " + \
str(mean_freq_pass) + " / " + str(median_freq_pass) + " / " + str(max_freq_pass))
print("min / mean / median / max AC of passing alleles: " + str(min_AC_pass) + " / " + \
str(mean_AC_pass) + " / " + str(median_AC_pass) + " / " + str(max_AC_pass))
print("min / mean / median / max DP of passing alleles: " + str(min_DP_pass) + " / " + \
str(mean_DP_pass) + " / " + str(median_DP_pass) + " / " + str(max_DP_pass))
print("min / mean / median / max FREQUENCY of failing alleles: " + str(min_freq_fail) + " / " + \
str(mean_freq_fail) + " / " + str(median_freq_fail) + " / " + str(max_freq_fail))
print("min / mean / median / max AC of failing alleles: " + str(min_AC_fail) + " / " + \
str(mean_AC_fail) + " / " + str(median_AC_fail) + " / " + str(max_AC_fail))
print("min / mean / median / max DP of failing alleles: " + str(min_DP_fail) + " / " + \
str(mean_DP_fail) + " / " + str(median_DP_fail) + " / " + str(max_DP_fail))
print("###############################################################################\n")
raise SystemExit("\n### All done; we stopped here, dear.\n")
Computing file changes ...