Revision 469689991e7dbce2be4c4f618584304d91841c49 authored by Chase W. Nelson on 01 October 2020, 17:39:04 UTC, committed by Chase W. Nelson on 01 October 2020, 17:39:04 UTC
1 parent 0b9f8cb
extract_fasta_by_sites.pl
#! /usr/bin/perl
#########################################################################################
# EXAMPLE CALL:
#########################################################################################
# extract_fasta_by_sites.pl <multiple_aligned_seqs.fasta> <gene_coordinates_to_extract.gtf>
#########################################################################################
# Copyright (C) 2014 Chase W. Nelson
# Date created: 07 December 2014
# AUTHOR: Chase W. Nelson
# Affiliation1: Austin L. Hughes lab, University of South Carolina (Columbia, SC 29208, USA)
# Affiliation2: Wen-Hsiung Li lab, Academia Sinica (Taipei, Taiwan)
# Contact1: cwnelson88@gmail.com
# Contact2: nelsoncw@email.sc.edu
# ACKNOWLEDGMENTS: Written by C.W.N. with support from a National Science Foundation
# Graduate Research Fellowship (DGE-0929297), a National Science Foundation East Asian
# and Pacific Summer Institutes Fellowship, and a University of South Carolina
# Presidential Fellowship.
use strict;
use warnings;
# Check that an argument is given
if(! $ARGV[1]) {
die "\n###WARNING: Two arguments must be supplied: (1) <multiple_aligned_seqs.fasta>; ".
"(2) <gene_coordinates_to_extract.gtf>\n\n";
}
my $fasta_file = $ARGV[0];
my $gtf_file = $ARGV[1];
# Get product names first
my %products_hash;
open (GTF_FILE, $gtf_file);
while (<GTF_FILE>) {
if($_ =~ /CDS\t\d+\t\d+\t[\.\d+]\t\+/) { # Must be on the + strand
if($_ =~ /\s*gene_id\s*\"gene\:([\w\s\.\-\:']+)\"/) {
$products_hash{$1} = 1;
} elsif($_ =~ /\s*gene_id\s*\"([\w\s\.\-\:']+ [\w\s\.\-\:']+)\"/) {
$products_hash{$1} = 1;
} elsif($_ =~/\s*gene_id\s*\"([\w\s\.\-\:']+)\"/) {
$products_hash{$1} = 1;
}
}
}
close GTF_FILE;
my @product_names = keys %products_hash;
# Now get product coordinates
my %products_sites_hh;
foreach my $product (@product_names) {
my $start_site_1;
my $stop_site_1;
my $start_site_2;
my $stop_site_2;
open (CURRINFILE, $gtf_file);
while (<CURRINFILE>) {
if($_ =~ /CDS\t\d+\t\d+\t[\.\d+]\t\+/) { # Must be on the + strand
chomp;
if (! $start_site_1) {
if ($_ =~/gene_id "$product";/) {
if ($_ =~/CDS\t(\d+)\t(\d+)/) {
$start_site_1 = $1;
$stop_site_1 = $2;
}
} elsif($_ =~/gene_id "gene\:$product";/) {
if ($_ =~/CDS\t(\d+)\t(\d+)/) {
$start_site_1 = $1;
$stop_site_1 = $2;
}
}
} else {
if ($_ =~/gene_id "$product";/) {
if ($_ =~/CDS\t(\d+)\t(\d+)/) {
$start_site_2 = $1;
$stop_site_2 = $2;
last; # This might be changed if we go on to add more segments
}
} elsif($_ =~/gene_id "gene\:$product";/) {
if ($_ =~/CDS\t(\d+)\t(\d+)/) {
$start_site_2 = $1;
$stop_site_2 = $2;
last; # This might be changed if we go on to add more segments
}
}
}
}
}
close CURRINFILE;
if ((!$start_site_2) && (($stop_site_1 - $start_site_1 + 1) % 3) != 0) {
die "\n\n## WARNING: The CDS coordinates for gene $product in the gtf file ".
"do not yield a set of complete codons,\n".
"## or are absent from the file. The number of nucleotides must ".
"be a multiple of 3.\n## SNPGenie terminated.\n\n";
}
if ($start_site_2 && (((($stop_site_1 - $start_site_1 + 1) + ($stop_site_2 - $start_site_2 + 1)) % 3) != 0)) {
die "\n\n## WARNING: The CDS coordinates for gene $product in the gtf file ".
"do not yield a set of complete codons,\n".
"## or are absent from the file. The number of nucleotides must ".
"be a multiple of 3.\n## SNPGenie terminated.\n\n";
}
my @coord_arr;
if (! $start_site_2) {
$products_sites_hh{$product}->{start}=$start_site_1;
$products_sites_hh{$product}->{stop}=$stop_site_1;
} else {
if ($start_site_1 < $start_site_2) {
$products_sites_hh{$product}->{start}=$start_site_1;
$products_sites_hh{$product}->{stop}=$stop_site_1;
$products_sites_hh{$product}->{start2}=$start_site_2;
$products_sites_hh{$product}->{stop2}=$stop_site_2;
} else {
$products_sites_hh{$product}->{start}=$start_site_2;
$products_sites_hh{$product}->{stop}=$stop_site_2;
$products_sites_hh{$product}->{start2}=$start_site_1;
$products_sites_hh{$product}->{stop2}=$stop_site_1;
}
}
}
# Generate new prefix
my $new_file_prefix;
if($fasta_file =~/\.fasta/) {
$new_file_prefix = $`;
} elsif($fasta_file =~/\.txt/) {
$new_file_prefix = $`;
} elsif($fasta_file =~/\.fa/) {
$new_file_prefix = $`;
} else {
$new_file_prefix = "new_file";
}
my $seq_counter = 0;
my $motif_counter = 0;
my $seq = '';
my $header = '';
my @seqs_arr;
my @headers_arr;
#my @id_arr;
my @seq_lengths;
my $last_seq_length;
open(FASTA_FILE, "$fasta_file") or die "Could not open file $fasta_file\n";
print "\n\nBuilding sequences...\n\n";
while(<FASTA_FILE>) {
chomp;
if(/>/) {
if($seq_counter == 0) {
$header = $_;
$seq_counter ++;
} else {
push(@seqs_arr,$seq);
push(@headers_arr,$header);
$header = $_;
$seq_counter ++;
my $this_seq_length = length($seq);
push(@seq_lengths,$this_seq_length);
if($last_seq_length && ($last_seq_length != $this_seq_length)) {
die "\n\nDIE: The sequences must be aligned, i.e., must be the same length. TERMINATED.\n\n";
} else {
$last_seq_length = $this_seq_length;
$seq = '';
}
}
} else {
$seq .= $_;
}
}
push(@seqs_arr,$seq);
push(@seq_lengths,$last_seq_length);
push(@headers_arr,$header);
my $this_seq_length = length($seq);
push(@seq_lengths,$this_seq_length);
if($last_seq_length && ($last_seq_length != $this_seq_length)) {
die "\n\nDIE: The sequences must be aligned, i.e., must be the same length. TERMINATED.\n\n";
} else {
$last_seq_length = $this_seq_length;
$seq = '';
}
close FASTA_FILE;
foreach my $product (@product_names) {
my $this_file_name = "$new_file_prefix\_$product\.fasta";
print "Printing $this_file_name\...\n\n";
open(OUTPUT_FILE, ">>$this_file_name");
if($products_sites_hh{$product}->{start2}) { # if there's a second segment
print "\nGot here 2\n";
my $start = $products_sites_hh{$product}->{start};
my $start_index = ($start - 1);
my $stop = $products_sites_hh{$product}->{stop};
my $start2 = $products_sites_hh{$product}->{start2};
my $start2_index = ($start2 - 1);
my $stop2 = $products_sites_hh{$product}->{stop2};
my $extracted_length1 = ($stop - $start + 1);
my $extracted_length2 = ($stop2 - $start2 + 1);
my $extracted_length_total = $extracted_length1 + $extracted_length2;
if($start > $stop) {
die "\nThe start site must occur before the stop site\n\n";
} elsif($start2 > $stop2) {
die "\nThe start site must occur before the stop site\n\n";
}
if($extracted_length_total % 3 != 0) {
print "\n\nWARNING: extracted region is not a multiple of 3 (will not form whole \n".
"codons if this is a coding region.\n\n";
}
# FOREACH SEQ
for(my $seq_index = 0; $seq_index < @seqs_arr; $seq_index++) {
my $seq = $seqs_arr[$seq_index];
my $extract_seq1 = substr($seq, $start_index, $extracted_length1);
my $extract_seq2 = substr($seq, $start2_index, $extracted_length2);
my $extract_seq_total = $extract_seq1 . $extract_seq2;
my $new_header = $headers_arr[$seq_index];
print OUTPUT_FILE "$new_header\n";
print OUTPUT_FILE "$extract_seq_total\n";
}
} else {
my $start = $products_sites_hh{$product}->{start};
my $start_index = ($start - 1);
my $stop = $products_sites_hh{$product}->{stop};
my $extracted_length = ($stop - $start + 1);
if($start > $stop) {
die "\nThe start site must occur before the stop site\n\n";
}
if($extracted_length % 3 != 0) {
print "\n\nWARNING: extracted region is not a multiple of 3 (will not form whole \n".
"codons if this is a coding region.\n\n";
}
# FOREACH SEQ
for(my $seq_index = 0; $seq_index < @seqs_arr; $seq_index++) {
my $seq = $seqs_arr[$seq_index];
my $extract_seq = substr($seq, $start_index, $extracted_length);
my $new_header = $headers_arr[$seq_index];
print OUTPUT_FILE "$new_header\n";
print OUTPUT_FILE "$extract_seq\n";
}
}
close OUTPUT_FILE;
}
print "\n### DONE ###\n\n";
exit;
Computing file changes ...