https://github.com/skingan/PBsnp2fa.pl
Tip revision: bd42ba487dd45e7f4d6f292aa3f3675b355fbb72 authored by Sarah B. Kingan on 30 October 2015, 12:32:23 UTC
Update PBsnp2fa.pl
Update PBsnp2fa.pl
Tip revision: bd42ba4
PBsnp2fa.pl
#!/usr/bin/perl -w
####################################################################################################
#
# Sarah B. Kingan
# University of Rochester
# 1 October 2014
# updated 30 October 2015
#
# Title: PBsnp2fa.pl
#
#
# This program generates fasta alignments of samples
# from user-defined coordinates.
#
# Input:
# 1. popbam snp outfile, bgz compressed and tabix indexed
# 2. reference sequence in fastA format
# 3. coordinates given at command line (e.g. 2L:1000-2000)
# 4. optional text file with list of sample IDs, one ID per line. Sample order
# should be the same as in the BAM file header. If omitted, ID's are
# "sample_1", "sample_2" etc.
#
# Output:
# fasta file of region for all samples.
#
#
####################################################################################################
use strict;
use Bio::Seq;
use Bio::DB::Fasta;
use Tabix;
my $usage = "PBsnp2fa.pl <snp.bgz> <ref.fa> <chrom:start-end> <OPT:sample_list.txt>\n";
# Check that user has write permissions for SNP file directory
my $SNP_infile = $ARGV[0] or die $usage;
my @path_array = split("/", $SNP_infile);
pop @path_array;
my $path = join("/", @path_array);
my $r; my $w; my $x;
($r,$w,$x) = (-r $path, -w _, -x _);
unless ($w == 1) {
print "Error: you do not have write permissions for SNP file directory!\n";
exit;
}
# Setup tabix-index snp file
my $SNP_tabix = Tabix->new(-data => $SNP_infile);
# check if snp file is bgzipped
unless ($SNP_infile =~ /.bgz$/) {
print "snp file must be compressed with bgzip!\n";
print "try: bgzip -c myfile.snp > myfile.snp.bgz\n"
}
# check for tabix index file
unless (-e $SNP_infile.'.tbi') {
print "snp file must be indexed with Tabix!\n";
print "try: tabix -b 2 -e 2 -s 1 myfile.snp.bgz\n";
}
# Setup reference fasta database
my $ref_fasta_file = $ARGV[1] or die $usage;
my $ref_fastaDB = Bio::DB::Fasta->new($ref_fasta_file);
# Get region coordinates
my $coordinates = $ARGV[2] or die $usage;
my $chrom;
my $start;
my $end;
if ($coordinates =~ /(\S+):([0-9]+)-([0-9]+)/) {
$chrom = $1;
$start = $2;
$end = $3;
}
else {
print "wrong coordinates format!\n";
print $usage;
exit;
}
# Fetch reference sequence for interval
my $ref_seq = $ref_fastaDB->seq("$chrom:$start,$end");
# load SNP data for interval into hash
# key = position
# value = array of consensus bases for each sample
my %interval_SNP_hash;
my $SNP_interval = $SNP_tabix->query($chrom, $start, $end);
my $ncolumns;
while (my $SNP_line = $SNP_tabix->read($SNP_interval)) {
my @SNP_line_array = split("\t", $SNP_line);
$ncolumns = scalar(@SNP_line_array);
my $position = $SNP_line_array[1];
my @consensus_base_array = makeSNParray(@SNP_line_array);
@{$interval_SNP_hash{$position}} = @consensus_base_array;
}
my $nsam = ($ncolumns - 3)/4;
# create sample name array from user input file
my @sample_array;
if ($ARGV[3]) {
my $sample_list = $ARGV[3];
@sample_array = makeSampleArray($sample_list);
# check that number of samples provided matches snp file dimensions
if (scalar@sample_array != $nsam) {
print "sample list contains incorrect number of samples!\n";
die;
}
}
# create generic sample name array
else {
for (my $n = 1; $n<=$nsam; $n++) {
push(@sample_array, 'sample_'.$n);
}
}
# print sequence for each sample
for (my $i = 0; $i<scalar@sample_array; $i++) {
my $seq = $ref_seq;
foreach my $position (sort {$a<=>$b} keys %interval_SNP_hash) {
substr($seq, ($position-$start), 1, ${$interval_SNP_hash{$position}}[$i]);
}
print ">$sample_array[$i]|$chrom:$start-$end\n";
print $seq, "\n";
}
#### SUBROUTINES ####
# create hash of SNP data
##########################
sub makeSNParray {
##########################
my (@SNP_array) = @_;
my @consensus_base_array;
for (my $i = 3; $i<scalar(@SNP_array); $i+=4) {
my $sample_index = ($i-3)/4;
if (($SNP_array[$i+1] > 25) # snp quality
&& ($SNP_array[$i+2] > 25) # RMS quality
&& ($SNP_array[$i+3] > 3)) { # read depth
$consensus_base_array[$sample_index] = $SNP_array[$i];
}
else {
$consensus_base_array[$sample_index] = 'N';
}
}
return @consensus_base_array;
}
# create array of sample names
##########################
sub makeSampleArray {
##########################
my ($sample_list) = @_; # text_file
my @sample_array;
open (LIST, $sample_list);
while (my $line = <LIST>) {
chomp$line;
unless ($line =~ /^\s*$/) {
push(@sample_array, $line)
}
}
return @sample_array;
}
exit;