Revision 469689991e7dbce2be4c4f618584304d91841c49 authored by Chase W. Nelson on 01 October 2020, 17:39:04 UTC, committed by Chase W. Nelson on 01 October 2020, 17:39:04 UTC
1 parent 0b9f8cb
Raw File
SARS-CoV-2_locate_genes.pl
#! /usr/bin/perl

# PROGRAM: Locate gene start and stop sites by finding the first sequences beginning
# and ending with the pre-specified sequences, hardcoded below, taken from the reference
# sequence: https://www.ncbi.nlm.nih.gov/nuccore/MN908947.3

#########################################################################################
## LICENSE
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program.  If not, see <http://www.gnu.org/licenses/>.
#########################################################################################

# AUTHOR: Chase W. Nelson
# Copyright (C) 2019 Chase W. Nelson
# DATE CREATED: April 2019

# CONTACT1: cnelson@amnh.org
# CONTACT2: cwnelson88@gmail.com

# AFFILIATION: Sackler Institute for Comparative Genomics, American Museum of Natural
#     History, New York, NY 10024, USA

# CITATION1: OLGenie, https://github.com/chasewnelson/OLGenie
# CITATION2: Nelson CW, Ardern Z, Wei X. OLGenie: detecting natural selection to identify functional overlapping genes. In preparation.
# CITATION2: Nelson CW, Moncla LH, Hughes AL (2015) SNPGenie: estimating evolutionary 
#	parameters to detect natural selection using pooled next-generation sequencing data. 
#	Bioinformatics 31(22):3709-11, doi: 10.1093/bioinformatics/btv449.

use strict;
use Data::Dumper;

# Get the time
my $time1 = time;
my $local_time1 = localtime;

STDOUT->autoflush(1);

my $fasta_file = $ARGV[0];

unless(-f "$fasta_file") {
	die "\n### DIE: no fasta file.\n\n";
}

# Read in the group of sequences from the fasta file
my %header2sequence;
my $seq = '';
my $header = '';
my @headers_arr;
my $seq_num = 0;
my $last_seq_length;

open(IN_FASTA, "$fasta_file") or die "Could not open file $fasta_file\n";

while(<IN_FASTA>) {
	chomp;
	if(/>/) {
		if($seq_num == 0) {
			$header = $_;
			$header =~ s/^>//; # get rid of FASTA header indicator
			$header =~ tr/\|/_/; # convert pipes (|) to underscores (_), following IQTree convention
			$header =~ s/\s.*$//; # trim anything including and after the first whitespace
			
			$seq_num ++;
		} else {
			$seq = uc($seq);
			$seq =~ tr/U/T/;
			
			if($header =~ /[^\w^\-^\.]/) {
				die "\n### TAXA NAMES CONTAIN INAPPROPRIATE CHARACTERS IN FASTA FILE: $header.\n" .
					"### Only alphanumeric characters (a-z, A-Z, 0-9), underscores (_), dashes (-), and periods (.) may be used. SCRIPT TERMINATED.\n\n";
			}
			
			if(exists $header2sequence{$header}) {
				die "\n\n### DIE: Each sequence in the FASTA multiple sequence alignment must have a unique header (up to first whitespace) for identification. TERMINATED.\n\n";
			} else {
				$header2sequence{$header} = $seq;
				push(@headers_arr, $header);
			}
			
			$header = $_;
			$header =~ s/^>//; # get rid of FASTA header indicator
			$header =~ tr/\|/_/; # convert pipes (|) to underscores (_), following IQTree convention
			$header =~ s/\s.*$//; # trim anything including and after the first whitespace
			
			$seq_num ++;
			
			my $this_seq_length = length($seq);
			
			if($last_seq_length && ($last_seq_length != $this_seq_length)) {
				die "\n\n### DIE: The sequences must be aligned, i.e., must be the same length. TERMINATED.\n\n";
			} else {
				$last_seq_length = $this_seq_length;
				$seq = '';
			}
		}
	} else {
		$seq .= $_;
	}
}

close IN_FASTA;

$seq = uc($seq);
$seq =~ tr/U/T/;

if($header =~ /^[^\w^\-^\.]/) {
	die "\n### TAXA NAMES CONTAIN INAPPROPRIATE CHARACTERS IN FASTA FILE: $header.\n" .
		"### Only alphanumeric characters (a-z, A-Z, 0-9), underscores (_), dashes (-), and periods (.) may be used. SCRIPT TERMINATED.\n\n";
}
$header2sequence{$header} = $seq;
push(@headers_arr, $header);


##########################################################################################
# Hard-code gene beginnings and endings
my %gene_to_termini;

# Hard-code

# ORF1ab: full gene
$gene_to_termini{'ORF1ab_1'}->{'start'} = 'ATGGAGAGCCTTGTCCCTGGTTTCAA';
$gene_to_termini{'ORF1ab_1'}->{'end'} = 'TCAGCTGATGCACAATCGTTTTTAAAC';

$gene_to_termini{'ORF1ab_2'}->{'start'} = 'CGGGTTTGCGGTGTAAGTGCAGCCCG';
$gene_to_termini{'ORF1ab_2'}->{'end'} = 'TAGTGATGTTCTTGTTAACAACTAA';

# ORF1ab_1: NOL (Wuhan-Hu-1 266-13465)
$gene_to_termini{'ORF1ab_1_NOL'}->{'start'} = 'ATGGAGAGCCTTGTCCCTGGTTTCAAC';
$gene_to_termini{'ORF1ab_1_NOL'}->{'end'} = 'AGCTGATGCACAATCGTTTTTA';

# ORF1ab: OL (Wuhan-Hu-1 13469-13483)
$gene_to_termini{'ORF1ab_1_OL_ORF1ab_2'}->{'start'} = 'GGGTTTGCGGTGTAA'; # whole thing
$gene_to_termini{'ORF1ab_1_OL_ORF1ab_2'}->{'end'} = 'GGGTTTGCGGTGTAA';

# ORF1ab_2: NOL (Wuhan-Hu-1 266-13465)
$gene_to_termini{'ORF1ab_2_NOL'}->{'start'} = 'GCAGCCCGTCTTACACCGTGCGGCACAGGC';#
$gene_to_termini{'ORF1ab_2_NOL'}->{'end'} = 'ATGTTCTTGTTAACAACTAA';

# ORF1a: full gene
$gene_to_termini{'ORF1a'}->{'start'} = 'ATGGAGAGCCTTGTCCCTGGTTTCAACGA';
$gene_to_termini{'ORF1a'}->{'end'} = 'CGGGTTTGCGGTGTAA';

# Nonstructural proteins: mature peptides
$gene_to_termini{'nsp1'}->{'start'} = 'ATGGAGAGCCTTGTCCCTGGTTTCAACGAGA';
$gene_to_termini{'nsp1'}->{'end'} = 'CGTGAACTCATGCGTGAGCTTAACGGAGGG';

$gene_to_termini{'nsp2'}->{'start'} = 'GCATACACTCGCTATGTCGATAACAACTTCT';
$gene_to_termini{'nsp2'}->{'end'} = 'ACAATACCTTCACACTCAAAGGCGGT';

$gene_to_termini{'nsp3'}->{'start'} = 'GCACCAACAAAG';
$gene_to_termini{'nsp3'}->{'end'} = 'CAAAGATAGCACTTAAGGGTGGT';

$gene_to_termini{'nsp4'}->{'start'} = 'AAAATTGTTAAT';
$gene_to_termini{'nsp4'}->{'end'} = 'CAAACCTCTATCACCTCAGCTGTTTTGCAG';

$gene_to_termini{'nsp5'}->{'start'} = 'AGTGGTTTTAGAAAAATGGCATTCCCATC';
$gene_to_termini{'nsp5'}->{'end'} = 'CAATGCTCAGGTGTTACTTTCCAA';

$gene_to_termini{'nsp6'}->{'start'} = 'AGTGCAGTGAAAAGAACAATCAAGGGTAC';
$gene_to_termini{'nsp6'}->{'end'} = 'CTTGTATCAAAGTAGCCACTGTACAG';

$gene_to_termini{'nsp7'}->{'start'} = 'TCTAAAATGTCAGATGTAAAGTGCACATCA';
$gene_to_termini{'nsp7'}->{'end'} = 'TGCTGGACAACAGGGCAACCTTACAA';

$gene_to_termini{'nsp8'}->{'start'} = 'GCTATAGCCTCAGAGTTTAGTTCCCTTCCA';
$gene_to_termini{'nsp8'}->{'end'} = 'CAATTCTGCTGTCAAATTACAG';

$gene_to_termini{'nsp9'}->{'start'} = 'AATAATGAGCTTAGTCCTGTTGCACTACGA';
$gene_to_termini{'nsp9'}->{'end'} = 'AGTTTAGCTGCCACAGTACGTCTACAA';

$gene_to_termini{'nsp10'}->{'start'} = 'GCTGGTAATGCAACAGAAGTGCCTGCCAA';
$gene_to_termini{'nsp10'}->{'end'} = 'GTGATCAACTCCGCGAACCCATGCTTCAG';

$gene_to_termini{'nsp11'}->{'start'} = 'TCAGCTGATGCACAATCGTTTTTAAAC';
$gene_to_termini{'nsp11'}->{'end'} = 'CGGGTTTGCGGTG';

$gene_to_termini{'nsp12_1'}->{'start'} = 'TCAGCTGATGCACAATCGTTTTTA';
$gene_to_termini{'nsp12_1'}->{'end'} = 'TCAGCTGATGCACAATCGTTTTTAAAC';

$gene_to_termini{'nsp12_2'}->{'start'} = 'CGGGTTTGCGGTGTAAGTGCAGCCC';
$gene_to_termini{'nsp12_2'}->{'end'} = 'TACACACCGCATACAGTCTTACAG';

$gene_to_termini{'nsp13'}->{'start'} = 'GCTGTTGGGGCTTGTGTTCTTTGCAATT';
$gene_to_termini{'nsp13'}->{'end'} = 'AATTCCACGTAGGAATGTGGCAACTTTACAA';

$gene_to_termini{'nsp14'}->{'start'} = 'GCTGAAAATGTAACAGGACTCTTTAAA';
$gene_to_termini{'nsp14'}->{'end'} = 'CTGGAACACTTTTACAAGACTTCAG';

$gene_to_termini{'nsp15'}->{'start'} = 'AGTTTAGAAAATGTGGCTTTTAATGTT';
$gene_to_termini{'nsp15'}->{'end'} = 'TAGAAACATTTTACCCAAAATTACAA';

$gene_to_termini{'nsp16'}->{'start'} = 'TCTAGTCAAGCGTGGCAACCGGGTGT';
$gene_to_termini{'nsp16'}->{'end'} = 'CTAGTGATGTTCTTGTTAACAACTAA';

# Structural and accessory
$gene_to_termini{'S'}->{'start'} = 'ATGTTTGTTTTTCTTGTT';
$gene_to_termini{'S'}->{'end'} = 'GTCAAATTACATTACACATAA';

$gene_to_termini{'ORF3a'}->{'start'} = 'ATGGATTTGTTTATGAGAATCTTCAC';
$gene_to_termini{'ORF3a'}->{'end'} = 'GACTACTAGCGTGCCTTTGTAA';

# ORF3a: NOL1 (Wuhan-Hu-1 25393-25521)
$gene_to_termini{'ORF3a_NOL1'}->{'start'} = 'ATGGATTTGTTTATGAGAATCTTCAC';
$gene_to_termini{'ORF3a_NOL1'}->{'end'} = 'GATACAAGCCTCACTCCCTTTC';

# ORF3a: OL (Wuhan-Hu-1 25522-25698)
$gene_to_termini{'ORF3a_OL_ORF3d'}->{'start'} = 'GGATGGCTTATTGTTGGCGTTGCA';
$gene_to_termini{'ORF3a_OL_ORF3d'}->{'end'} = 'CTCGTTGCTGCTGGCCTTGAA';

# ORF3a: NOL2 (Wuhan-Hu-1 25699-26220)
$gene_to_termini{'ORF3a_NOL2'}->{'start'} = 'GCCCCTTTTCTCTATCTTTATGCTTT';
$gene_to_termini{'ORF3a_NOL2'}->{'end'} = 'GACTACTAGCGTGCCTTTGTAA';

$gene_to_termini{'ORF3d'}->{'start'} = 'ATGGCTTATTGTTGGCGTTGCACTTC';
$gene_to_termini{'ORF3d'}->{'end'} = 'TTTGCTCGTTGCTGCTGGCCTTGA';

$gene_to_termini{'E'}->{'start'} = 'ATGTACTCATTCGTTTCGGAAGAGACAGGT';
$gene_to_termini{'E'}->{'end'} = 'GTTCCTGATCTTCTGGTCTAA';

$gene_to_termini{'M'}->{'start'} = 'ATGGCAGATTCCAACGGTACTATTACCGT';
$gene_to_termini{'M'}->{'end'} = 'GCTTTGCTTGTACAGTAA';

# M_OL_MB: TTATGGCCAGTAACTTTAGCTTGTTTTGTGCTT .. TCCATGTGGTCATTCAATCCAGAAACTAAC

$gene_to_termini{'ORF6'}->{'start'} = 'ATGTTTCATCTCGTTGACTTTCAGG';
$gene_to_termini{'ORF6'}->{'end'} = 'AGCAACCAATGGAGATTGATTAA';

$gene_to_termini{'ORF7a'}->{'start'} = 'ATGAAAATTATTCTTTTCTTGGCA';
$gene_to_termini{'ORF7a'}->{'end'} = 'CAAAAGAAAGACAGAATGA';

# ORF7a: NOL (Wuhan-Hu-1 27394-27753)
$gene_to_termini{'ORF7a_NOL'}->{'start'} = 'ATGAAAATTATTCTTTTCTTGGCA';
$gene_to_termini{'ORF7a_NOL'}->{'end'} = 'CACTCAAAAGAAAGACA';

$gene_to_termini{'ORF7b'}->{'start'} = 'ATGATTGAACTTTCATTAATTGAC';
$gene_to_termini{'ORF7b'}->{'end'} = 'GAAACTTGTCACGCCTAA';

# ORF7b: NOL (Wuhan-Hu-1: 27762-27887)
$gene_to_termini{'ORF7b_NOL'}->{'start'} = 'GAACTTTCATTAATTGACTTCT';
$gene_to_termini{'ORF7b_NOL'}->{'end'} = 'GAAACTTGTCACGCCTAA';

# ORF8 starts identically to ORF8'; probably the same
$gene_to_termini{'ORF8'}->{'start'} = 'ATGAAATTTCTTGTTTTCTTAGGAA';
$gene_to_termini{'ORF8'}->{'end'} = 'TTGTTTTAGATTTCATCTAA';

# ORF8a and ORF8b?

$gene_to_termini{'N'}->{'start'} = 'ATGTCTGATAATGGACCCCAAAATCAGCG';
$gene_to_termini{'N'}->{'end'} = 'GCTGACTCAACTCAGGCCTAA';

# N: NOL1 (Wuhan-Hu-1 28274-28282)
$gene_to_termini{'N_NOL1'}->{'start'} = 'ATGTCTGAT'; # whole thing
$gene_to_termini{'N_NOL1'}->{'end'} = 'ATGTCTGAT';

# N: OL ORF9b (Wuhan-Hu-1 28283-28579)
$gene_to_termini{'N_OL_ORF9b'}->{'start'} = 'AATGGACCCCAAAATCAG';
$gene_to_termini{'N_OL_ORF9b'}->{'end'} = 'GTGACGGTAAAATGAAA';

# N: NOL2 (Wuhan-Hu-1 28580-28732)
$gene_to_termini{'N_NOL2'}->{'start'} = 'GATCTCAGTCCAAGATGGTATTT';
$gene_to_termini{'N_NOL2'}->{'end'} = 'ACCCGCAATCCTGCTAAC';

# N: OL ORF9c (Wuhan-Hu-1 28733-28957)
$gene_to_termini{'N_OL_ORF9c'}->{'start'} = 'AATGCTGCAATCGTGCTACAACTTCCTC';
$gene_to_termini{'N_OL_ORF9c'}->{'end'} = 'CTGCTGCTTGACAGATTGAAC';

# N: NOL3 (Wuhan-Hu-1 28958-29533)
$gene_to_termini{'N_NOL3'}->{'start'} = 'CAGCTTGAGAGCAAAATGTCTGG';
$gene_to_termini{'N_NOL3'}->{'end'} = 'GCTGACTCAACTCAGGCCTAA';

$gene_to_termini{'ORF9b'}->{'start'} = 'ATGGACCCCAA'; #AATCAGCGAAATG';
$gene_to_termini{'ORF9b'}->{'end'} = 'ACCAGACGAATTCGTGGTGGTGACGGTAAAATGA';

$gene_to_termini{'ORF9c'}->{'start'} = 'ATGCTGCAATCGTGCTACAACTT';
$gene_to_termini{'ORF9c'}->{'end'} = 'CTGCTGCTTGACAGATTGA';

$gene_to_termini{'ORF10'}->{'start'} = 'ATGGGCTATATAAACGTTTTCGC';
$gene_to_termini{'ORF10'}->{'end'} = 'TAACTTTAATCTCACATAG';


#print "\n################################################################################";
#print "\nANALYSIS MODE=$mode\...\n";
my %gene_to_sites;

my @gene_names_sorted = qw/ORF1ab_1 ORF1ab_2 ORF1ab_1_NOL ORF1ab_1_OL_ORF1ab_2 ORF1ab_2_NOL 
						ORF1a nsp1 nsp2 nsp3 nsp4 nsp5 nsp6 nsp7 nsp8 nsp9 nsp10 nsp11 
						nsp12_1 nsp12_2 nsp13 nsp14 nsp15 nsp16 S ORF3a ORF3a_NOL1 
						ORF3a_OL_ORF3d ORF3a_NOL2 ORF3d E M ORF6 ORF7a ORF7a_NOL ORF7b ORF7b_NOL 
						ORF8p N N_NOL1 N_OL_ORF9b N_NOL2 N_OL_ORF9c N_NOL3 ORF9b ORF9c 
						ORF10/;
						
#my @gene_names_sorted = qw/ORF1ab_1 ORF1ab_2 ORF1a nsp1 nsp2 nsp3 nsp4 nsp5 nsp6 nsp7 nsp8 nsp9 nsp10 nsp11 nsp12_1 nsp12_2 
#						nsp13 nsp14 nsp15 nsp16 S S_confirm ORF3a ORF3d E M ORF6 ORF7a ORF7b ORF8p N ORF9b ORF9c ORF10/;


GENE_LOOP: foreach my $gene_name (@gene_names_sorted) {
#GENE_LOOP: foreach my $gene_name (sort keys %gene_to_termini) {
	my $start_seq = $gene_to_termini{$gene_name}->{'start'};
	my $end_seq = $gene_to_termini{$gene_name}->{'end'};
	
	SEQ_LOOP: foreach my $sequence_name (sort keys %header2sequence) {
		my $sequence = $header2sequence{$sequence_name};
		
		# Find index of start
		my $start_index = index($sequence, $start_seq);
		
		# Find index of end
		my $end_index = index($sequence, $end_seq);
		
		if($start_index > -1 && $end_index > -1) {
			# Compute positions
			my $start_site = $start_index + 1;
			my $end_site = $end_index + length($end_seq);
			
			# Print GTF line, go to next gene
			print "SARSCoV2_exhaustive\tSNPGenie\tCDS\t$start_site\t$end_site\t.\t+\t0\tgene_id \"$gene_name\";\n";
			
			# If S gene, print the 9 preceding codons
			if($gene_name eq 'S') {
				print "SARSCoV2_exhaustive\tSNPGenie\tCDS\t" . 
					($start_site - (9 * 3)) . "\t" .
					($start_site - 1) . "\t" .
					".\t+\t0\tgene_id \"$gene_name\_altStart\";\n";
			}
			
			next GENE_LOOP;
		}
		
	}
	
	print "\n### WARNING: didn't find $gene_name\n";
}

exit;


back to top