Revision 469689991e7dbce2be4c4f618584304d91841c49 authored by Chase W. Nelson on 01 October 2020, 17:39:04 UTC, committed by Chase W. Nelson on 01 October 2020, 17:39:04 UTC
1 parent 0b9f8cb
extract_positions_by_timepoint.py
#! /usr/bin/env python
###############################################################################
## LICENSE
##
## Copyright (C) 2020
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
###############################################################################
# DATE: 2020
# AUTHOR: Chase W. Nelson
# CONTACT: cnelson@gate.sinica.edu.tw
# CITATION: https://github.com/chasewnelson/
from Bio import SeqIO
from datetime import date, datetime, time
import sys
import os
Usage = "Track the frequency of alleles at specific sites in a sliding time window"

# Exactly 6 positional arguments are required (argv[0] is the script itself).
if len(sys.argv) != 7:
    raise SystemExit(
        "\n### TERMINATED: need 6 unnamed arguments:\n"
        " # (1) .TSV with date metadata\n"
        " # (2) .FASTA alignment\n"
        " # (3) .TSV with a single column of sites to track\n"
        " # (4) name of desired results file (.TSV)\n"
        " # (5) window size\n"
        " # (6) step size\n\n")

infile_metadata, infile_FASTA, infile_sites, outfile_name = sys.argv[1:5]
window_size = int(sys.argv[5])
step_size = int(sys.argv[6])

print(f"Date metadata from {infile_metadata}")
print(f"FASTA alignment from {infile_FASTA}")
print(f"Sites to track from {infile_sites}")
print(f"RESULTS will be placed in {outfile_name}")
print(f"Will perform a time-based sliding window: window_size={window_size}, step_size={step_size} (days)\n")

# Bail out early if any of the three input files is missing.
for in_path, missing_msg in ((infile_metadata, "infile 1 (IDs) does not exist"),
                             (infile_FASTA, "infile 2 (FASTA) does not exist"),
                             (infile_sites, "infile 3 (one column of sites) does not exist")):
    if not os.path.isfile(in_path):
        raise SystemExit(f"\n### TERMINATED: {missing_msg}\n")

# Create outfile_prefix name
print(f"Will write output to: {outfile_name}\n")
# Build a dictionary mapping sequence ID -> collection date (datetime).
# Only sequences with a parseable date on/after min_date are kept.
ID_to_date = {}
min_date = datetime(2019, 12, 20)  # earliest collection date accepted
# ASSUMES the metadata TSV has col 1 = "Accession ID" and col 4 = "Collection date".
line_num = 0
with open(infile_metadata, "r") as f:
    lines = (line.rstrip() for line in f)
    for line in lines:
        line_num += 1
        line_list = line.split("\t")
        this_date = datetime(2019, 12, 20)
        if line_num == 1:
            # Header sanity check. BUGFIX: the original condition was
            # `not line_list[0] == "Accession ID" and line_list[3] == ...`,
            # which by precedence only terminated when col 1 was wrong AND
            # col 4 was right; both columns must be validated together.
            if not (line_list[0] == "Accession ID" and line_list[3] == "Collection date"):
                raise SystemExit(
                    "\n### TERMINATED: header must be col1=\"Accession ID\" and col4=\"Collection date\"\n")
        else:
            date_worked = False
            seq_ID = line_list[0]
            seq_date = line_list[3]
            # Accept either ISO (YYYY-MM-DD) or US-style (MM/DD/YY) dates.
            # Narrowed from bare `except:` so real errors (KeyboardInterrupt,
            # IndexError, etc.) are no longer silently swallowed.
            try:
                this_date = datetime.strptime(seq_date, "%Y-%m-%d")  # shortcut %F didn't work
                date_worked = True
            except ValueError:
                try:
                    this_date = datetime.strptime(seq_date, "%m/%d/%y")  # shortcut %D didn't work
                    date_worked = True
                except ValueError:
                    print("Improper date: seq_ID=" + str(seq_ID) + ",seq_date=" + str(seq_date))
            if this_date >= min_date and date_worked:
                ID_to_date[seq_ID] = this_date
print("\n")
# Find day 0: the earliest collection date across all kept sequences,
# plus the ID of the first sequence (in insertion order) carrying it.
day0_date = min(ID_to_date.values())
# min() with a key replaces the original O(n)+O(n)
# list(keys())[list(values()).index(...)] lookup; first match wins either way.
day0_ID = min(ID_to_date, key=ID_to_date.get)
# Map each sequence ID to whole days elapsed since day 0 (timedelta.days).
ID_to_days_elapsed = {this_seq_ID: (this_date - day0_date).days
                      for this_seq_ID, this_date in ID_to_date.items()}
### Gather sites to track: one 1-based alignment position per line.
# NOTE: a blank or non-numeric line will raise ValueError, as before.
with open(infile_sites) as infile_sites_hdl:
    tracked_sites_list = [int(x.rstrip()) for x in infile_sites_hdl]
print("Sites to track=" + ",".join(map(str, tracked_sites_list)))
print("Sequence counts per window:\n")
# window midpoint (days) -> {site -> concatenated string of observed nucleotides}
time_site_nts_dict = {}
# Slide a window of `window_size` days across the timeline, advancing
# `step_size` days per iteration, and record the nucleotide column observed
# at each tracked site for sequences collected inside the window.
this_window_start = 0
last_day = max(ID_to_days_elapsed.values())  # loop-invariant; dict is not modified below
while this_window_start + window_size <= last_day:
    this_window_end = this_window_start + window_size
    this_window_midpoint = (this_window_start + this_window_end) / 2

    window_columns = {}  # site -> concatenated nucleotides for this window
    time_site_nts_dict[this_window_midpoint] = window_columns
    this_window_seq_count = 0

    # Re-stream the alignment for every window (SeqIO records are consumed once).
    for rec in SeqIO.parse(infile_FASTA, "fasta"):
        rec_ID = str(rec.id)
        if rec_ID not in ID_to_days_elapsed:
            continue
        rec_days_elapsed = ID_to_days_elapsed[rec_ID]
        # Window bounds are inclusive on both ends.
        if this_window_start <= rec_days_elapsed <= this_window_end:
            this_window_seq_count += 1
            rec_seq = str(rec.seq).upper()
            # Append this sequence's nucleotide to each tracked column (1-based sites).
            for rec_site in tracked_sites_list:
                rec_nt = str(rec_seq[rec_site - 1])
                if rec_site in window_columns:
                    window_columns[rec_site] += rec_nt
                else:
                    window_columns[rec_site] = rec_nt

    print(f"this_window_start={this_window_start},this_window_end={this_window_end},"
          f"this_window_midpoint={this_window_midpoint},this_window_seq_count={this_window_seq_count}")

    # Advance to the next window
    this_window_start += step_size
# Write the per-window, per-site allele summary as a TSV.
# `with` guarantees the handle is closed even if a row raises.
with open(outfile_name, "w") as outfile_hdl:
    # Header
    outfile_hdl.write("\t".join(map(str, ['this_midpoint', 'this_site', 'defined_allele_count', 'defined_nt_count', \
        'major_nt', 'major_nt_count', 'major_nt_freq', 'minor_nt', 'minor_nt_count', 'minor_nt_freq', \
        "A_count", "C_count", "G_count", "T_count", \
        "A_freq", "C_freq", "G_freq", "T_freq"])) + \
        "\n")
    for this_midpoint in time_site_nts_dict:
        for this_site in time_site_nts_dict[this_midpoint]:
            this_col_letters = str(time_site_nts_dict[this_midpoint][this_site])
            # Count every observed character; A/C/G/T are pre-seeded so the
            # count/freq output columns below can never KeyError.
            this_col_nt_counts = {"A": 0, "C": 0, "G": 0, "T": 0}
            for this_letter in this_col_letters:
                this_col_nt_counts[this_letter] = this_col_nt_counts.get(this_letter, 0) + 1
            minor_nt = '-'
            minor_nt_count = sum(this_col_nt_counts.values())  # max it out first
            major_nt = '-'
            major_nt_count = 0
            defined_nt_count = 0
            defined_allele_count = 0
            # Only the four defined bases participate; ambiguous characters
            # (N, -, etc.) are counted above but excluded here.
            for nt in ("A", "C", "G", "T"):
                nt_count = this_col_nt_counts[nt]
                defined_nt_count += nt_count
                # BUGFIX: the original incremented for every one of the four
                # pre-seeded bases, so defined_allele_count was always 4;
                # count only alleles actually observed in the column.
                if nt_count > 0:
                    defined_allele_count += 1
                if 0 < nt_count < minor_nt_count:
                    minor_nt = nt
                    minor_nt_count = nt_count
                if nt_count > major_nt_count:
                    major_nt = nt
                    major_nt_count = nt_count
            # ROBUSTNESS: a column of only undefined characters (e.g. all Ns)
            # previously raised ZeroDivisionError; report frequencies of 0.
            if defined_nt_count > 0:
                major_nt_freq = major_nt_count / defined_nt_count
                minor_nt_freq = minor_nt_count / defined_nt_count
            else:
                major_nt_freq = 0
                minor_nt_freq = 0
            this_col_nt_freqs = {"A": 0, "C": 0, "G": 0, "T": 0}
            if defined_nt_count > 0:
                for nt in ("A", "C", "G", "T"):
                    this_col_nt_freqs[nt] = this_col_nt_counts[nt] / defined_nt_count
            # Write to TSV file
            outfile_hdl.write("\t".join(map(str, [this_midpoint, this_site, defined_allele_count, defined_nt_count, \
                major_nt, major_nt_count, major_nt_freq, minor_nt, minor_nt_count, minor_nt_freq, \
                this_col_nt_counts["A"], this_col_nt_counts["C"], this_col_nt_counts["G"], this_col_nt_counts["T"], \
                this_col_nt_freqs["A"], this_col_nt_freqs["C"], this_col_nt_freqs["G"], this_col_nt_freqs["T"]])) + \
                "\n")
# BUGFIX: the success path used `raise SystemExit(msg)`, which prints the
# message to stderr and exits with status 1; print + fall off the end
# instead so a successful run exits with status 0.
print("\n### All done; we stopped here, dear.\n")
Computing file changes ...