#!/usr/bin/perl
###############################################################################
#
# app_make_results.pl
#
# Normalise and complete the QIIME pipeline
#
# Copyright (C) 2011 Michael Imelfort and Paul Dennis
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
###############################################################################
#pragmas
use strict;
use warnings;
#core Perl modules
use Getopt::Long;
#CPAN modules
use File::Basename;
use Statistics::R;
use Data::Dumper;
#locally-written modules
#load the pretty names for the fields
use AppConfig;
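# turn on autoflush for STDOUT and STDERR so progress messages appear
# as they are printed rather than being buffered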
BEGIN {
select(STDERR);
$| = 1;
select(STDOUT);
$| = 1;
}
# get input params and print copyright
printAtStart();
my $options = checkParams();
######################################################################
# CODE HERE
######################################################################
#### GLOBALS
#### Get the conf base
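# e.g. a config file named "app_mydata.conf" yields a conf base of "mydata"
# (assumes the app_<jobname>.<extension> naming convention; "mydata" is a
# made-up example)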
my $global_conf_base = basename($options->{'config'});
my @cb_1 = split /_/, $global_conf_base;
my @cb_2 = split /\./, $cb_1[1];
$global_conf_base = $cb_2[0];
#### GET THE WORKING DIR
getWorkingDirs($options->{'config'});
#### make these dirs
makeResultsDirs(0);
# sample IDs
my %global_samp_ID_list = ();
my $global_num_samples = 0;
# we can compare sequences to the greengenes or the SILVA dbs
my $global_comp_DB_type = "GG";
# there are a number of different ways to normalise
# by default don't normalise
my $global_norm_style = "TABLE";
my $global_norm_sample_size = 0;
# how many times are we going to resample sequences (by default)
# in second round normalisation
my $global_norm_seq_norm_rounds = 7;
# defaults for rarefaction
my $global_rare_M = 50;
# This is made dynamic in parse below
my $global_rare_X = -1;
my $global_rare_S = 50;
my $global_rare_N = 50;
my $global_min_sample_size = 100000000000;
my $global_max_sample_size = -1;
# defaults for otu similarity
my $global_similarity_setting = 0.97;
# defaults for assign_taxonomy
my $global_e_value = 0.001;
# how many reps to do when normalising the OTU table
my $global_norm_num_reps = 1000;
# we only need a subset of these guys to do jackknifing
my $global_JN_file_count = 100;
if($global_JN_file_count > $global_norm_num_reps) { $global_JN_file_count = $global_norm_num_reps - 1; }
#### Override defaults from config or user
if(exists $options->{'identity'}) { $global_similarity_setting = $options->{'identity'}; }
if(exists $options->{'e'}) { $global_e_value = $options->{'e'}; }
print "Checking is all the config checks out...\t\t";
parse_config_results();
# update our databases (GG by default)
my $TAX_tax_file = $QIIME_TAX_tax_file;
my $TAX_blast_file = $QIIME_TAX_blast_file;
my $imputed_file = $QIIME_imputed_file;
if($global_comp_DB_type eq "SILVA")
{
$TAX_tax_file = $SILVA_TAX_tax_file;
$TAX_blast_file = $SILVA_TAX_blast_file;
$imputed_file = $SILVA_imputed_file;
}
#### Start the results pipeline!
print "All good!\n";
#### Create a communication bridge with R and start R
my $global_R_instance = Statistics::R->new();
$global_R_instance->start();
####
#### NN DATA SET PROCESSING!
####
print "----------------------------------------------------------------\n";
print "Start TABLE BASED NORMALISATION data set processing...\n";
print "----------------------------------------------------------------\n";
print "Copying reads for analysis...\n";
copy_read_subset("$global_QA_dir/$global_acacia_output_dir/$ACACIA_out_file","$global_TB_processing_dir/$nn_fasta_file");
print "Picking OTUs for non normalised data set...\n";
`pick_otus.py -i $global_TB_processing_dir/$nn_fasta_file -s $global_similarity_setting -o $global_TB_processing_dir/uclust_picked_otus`;
print "Gettting a representitive set...\n";
`pick_rep_set.py -i $global_TB_processing_dir/uclust_picked_otus/$nn_otus_file -f $global_TB_processing_dir/$nn_fasta_file`;
# if we are doing OTU_AVERAGE (or if we've been asked to) then we need to assign taxonomy here
print "Assigning taxonomy for non normalised data set...\n";
my $nn_rep_set_fasta = "$global_TB_processing_dir/".$nn_fasta_file."_rep_set.fasta";
`assign_taxonomy.py -i $nn_rep_set_fasta -t $TAX_tax_file -b $TAX_blast_file -m blast -e $global_e_value -o $global_TB_processing_dir`;
print "Treeing non normalised data set...\n";
`align_seqs.py -i $nn_rep_set_fasta -t $imputed_file -p 0.6 -o $global_TB_processing_dir/pynast_aligned`;
my $nn_rep_set_aligned = "$global_TB_processing_dir/pynast_aligned/".$nn_fasta_file."_rep_set_aligned.fasta";
`filter_alignment.py -i $nn_rep_set_aligned -o $global_TB_processing_dir`;
my $nn_rep_set_aligned_filtered = "$global_TB_processing_dir/".$nn_fasta_file."_rep_set_aligned_pfiltered.fasta";
`make_phylogeny.py -i $nn_rep_set_aligned_filtered -r midpoint`;
my $nn_rep_set_aligned_filtered_tree = "$global_TB_processing_dir/".$nn_fasta_file."_rep_set_aligned_pfiltered.tre";
`mv $nn_rep_set_aligned_filtered_tree $nn_tree_file`;
print "Making NON NORMALISED otu table...\n";
my $nn_rep_set_tax_assign = "$global_TB_processing_dir/".$nn_fasta_file."_rep_set_tax_assignments.txt";
`make_otu_table.py -i $global_TB_processing_dir/uclust_picked_otus/$nn_otus_file -t $nn_rep_set_tax_assign -o $nn_otu_table_file`;
# do rarefaction for unnormalised data
print "Rarefaction and diversity...\n";
`multiple_rarefactions.py -i $nn_otu_table_file -o $global_TB_processing_dir/rarefied_otu_tables/ -m $global_rare_M -x $global_rare_X -s $global_rare_S -n $global_rare_N`;
`alpha_diversity.py -i $global_TB_processing_dir/rarefied_otu_tables/ -t $nn_tree_file -o $global_TB_processing_dir/alpha_div/ -m chao1,chao1_confidence,PD_whole_tree,observed_species,simpson,shannon,fisher_alpha`;
`collate_alpha.py -i $global_TB_processing_dir/alpha_div/ -o $global_TB_processing_dir/alpha_div_collated/`;
# make the same image twice (two different formats)
`make_rarefaction_plots.py -i $global_TB_processing_dir/alpha_div_collated/ -m $global_QA_dir/qiime_mapping.txt -o $global_TB_results_dir/alpha_diversity/ --resolution 300 --imagetype svg`;
`make_rarefaction_plots.py -i $global_TB_processing_dir/alpha_div_collated/ -m $global_QA_dir/qiime_mapping.txt -o $global_TB_results_dir/alpha_diversity/ --resolution 300 --imagetype png`;
# normalise the non normalised OTU table
print "Normalizing non normalised table at $global_norm_sample_size sequences... [$global_norm_sample_size, $global_norm_num_reps]\n";
`multiple_rarefactions_even_depth.py -i $nn_otu_table_file -o $global_TB_processing_dir/rare_tables/ -d $global_norm_sample_size -n $global_norm_num_reps --lineages_included --k`;
my $centroid_index = find_centroid_table("$global_TB_processing_dir/rare_tables/", $global_norm_sample_size, $tn_dist_file, $tn_log_file);
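# the rarefied tables are named rarefaction_<depth>_<index>.txt, so the
# centroid table can be copied straight from its index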
my $cp_str = "cp $global_TB_processing_dir/rare_tables/rarefaction_$global_norm_sample_size"."_$centroid_index".".txt $tn_otu_table_file";
`$cp_str`;
print "Summarizing by taxa.....\n";
`summarize_taxa.py -i $tn_otu_table_file -o $global_TB_results_dir/breakdown_by_taxonomy/`;
# copy a subset ($global_JN_file_count + 1 of the $global_norm_num_reps tables just produced) into a new folder for jackknifing
my $jack_knife_folder = "$global_TB_processing_dir/rare_tables/JN";
`mkdir -p $jack_knife_folder`;
foreach my $jn_file_counter (0..$global_JN_file_count)
{
my $jn_from_file = "rarefaction_".$global_norm_sample_size."_".$jn_file_counter.".txt";
`cp $global_TB_processing_dir/rare_tables/$jn_from_file $jack_knife_folder/`;
}
print "Jacknifed beta diversity....\n";
jack_knifing("weighted_unifrac", $nn_otu_table_file, $nn_tree_file, $jack_knife_folder);
jack_knifing("unweighted_unifrac", $nn_otu_table_file, $nn_tree_file, $jack_knife_folder);
jack_knifing("euclidean", $nn_otu_table_file, $nn_tree_file, $jack_knife_folder);
jack_knifing("hellinger", $nn_otu_table_file, $nn_tree_file, $jack_knife_folder);
# beta for the normalised table
my @beta_methods = ('weighted_unifrac','unweighted_unifrac','euclidean','hellinger','binary_euclidean','chord');
foreach my $matrix_type (@beta_methods)
{
my $beta_str = "beta_diversity.py -i $tn_otu_table_file -o $global_TB_results_dir/beta_diversity/$tn_prefix/$matrix_type -m $matrix_type";
# qiime spews when you give it a tree for some methods
if(!($matrix_type =~ /euclid/))
{
$beta_str .= " -t $nn_tree_file";
}
`$beta_str`;
my $beta_file = $matrix_type."_".$tn_prefix."_otu_table.txt";
my $upgma_file = $matrix_type."_".$tn_prefix."_otu_table_upgma.tre";
my $pcoa_file = $matrix_type."_".$tn_prefix."_pcoa.txt";
# Perform UPGMA clustering on rarefied distance matrices
`upgma_cluster.py -i $global_TB_results_dir/beta_diversity/$tn_prefix/$matrix_type/$beta_file -o $global_TB_results_dir/beta_diversity/$tn_prefix/$matrix_type/$upgma_file`;
# Compute principal coordinates
`principal_coordinates.py -i $global_TB_results_dir/beta_diversity/$tn_prefix/$matrix_type/$beta_file -o $global_TB_results_dir/beta_diversity/$tn_prefix/$matrix_type/$pcoa_file`;
}
# shuffle the results around
####
#### END NN DATA SET PROCESSING
####
####
#### SEQ NORMALISED DATA SET PROCESSING
####
# these files will exist if we have used the seq centroid method
if($global_norm_style eq "SEQ")
{
print "----------------------------------------------------------------\n";
print "Start SEQ normalised data processing...\n";
print "----------------------------------------------------------------\n";
# make the seq processing dir if we need to
makeResultsDirs(1);
# find centroid sequences
find_centroid_sequences($global_SB_processing_dir."/norm_tables/", $global_norm_seq_norm_rounds, "$global_TB_processing_dir/$nn_fasta_file", "$global_SB_processing_dir/$sn_fasta_file", $sn_dist_file, $sn_log_file);
# once we've found our guy, lets make an otu table with assigned taxonomies
print "Picking OTUs for SEQ normalised data set...\n";
`pick_otus.py -i "$global_SB_processing_dir/$sn_fasta_file" -s $global_similarity_setting -o $global_SB_processing_dir/uclust_picked_otus`;
print "Gettting a representitive set...\n";
`pick_rep_set.py -i $global_SB_processing_dir/uclust_picked_otus/$sn_otus_file -f "$global_SB_processing_dir/$sn_fasta_file"`;
# if we are doing OTU_AVERAGE (or if we've been asked to) then we need to assign taxonomy here
print "Assigning taxonomy for SEQ normalised data set...\n";
my $sn_rep_set_fasta = "$global_SB_processing_dir/".$sn_fasta_file."_rep_set.fasta";
`assign_taxonomy.py -i $sn_rep_set_fasta -t $TAX_tax_file -b $TAX_blast_file -m blast -e $global_e_value -o $global_SB_processing_dir`;
print "Treeing SEQ normalised data set...\n";
`align_seqs.py -i $sn_rep_set_fasta -t $imputed_file -p 0.6 -o $global_SB_processing_dir/pynast_aligned`;
my $sn_rep_set_aligned = "$global_SB_processing_dir/pynast_aligned/".$sn_fasta_file."_rep_set_aligned.fasta";
`filter_alignment.py -i $sn_rep_set_aligned -o $global_SB_processing_dir`;
my $sn_rep_set_aligned_filtered = "$global_SB_processing_dir/".$sn_fasta_file."_rep_set_aligned_pfiltered.fasta";
`make_phylogeny.py -i $sn_rep_set_aligned_filtered -r midpoint`;
my $sn_rep_set_aligned_filtered_tree = "$global_SB_processing_dir/".$sn_fasta_file."_rep_set_aligned_pfiltered.tre";
`mv $sn_rep_set_aligned_filtered_tree $sn_tree_file`;
print "Making SEQ SEQ normalised otu table...\n";
my $sn_rep_set_tax_assign = "$global_SB_processing_dir/".$sn_fasta_file."_rep_set_tax_assignments.txt";
`make_otu_table.py -i $global_SB_processing_dir/uclust_picked_otus/$sn_otus_file -t $sn_rep_set_tax_assign -o $sn_otu_table_file`;
print "Alpha and beta diversity for SEQ normalised otu table...\n";
# beta
my @beta_methods = ('weighted_unifrac','unweighted_unifrac','euclidean','hellinger','binary_euclidean','chord');
foreach my $matrix_type (@beta_methods)
{
# basic beta diversity call
my $beta_str = "beta_diversity.py -i $sn_otu_table_file -o $global_SB_results_dir/beta_diversity/$matrix_type -m $matrix_type";
# qiime spews when you give it a tree for some methods
if(!($matrix_type =~ /euclid/))
{
$beta_str .= " -t $sn_tree_file";
}
`$beta_str`;
my $beta_file = $matrix_type."_".$sn_prefix."_otu_table.txt";
my $upgma_file = $matrix_type."_".$sn_prefix."_otu_table_upgma.tre";
my $pcoa_file = $matrix_type."_".$sn_prefix."_pcoa.txt";
# Perform UPGMA clustering on rarefied distance matrices
`upgma_cluster.py -i $global_SB_results_dir/beta_diversity/$matrix_type/$beta_file -o $global_SB_results_dir/beta_diversity/$matrix_type/$upgma_file`;
# Compute principal coordinates
`principal_coordinates.py -i $global_SB_results_dir/beta_diversity/$matrix_type/$beta_file -o $global_SB_results_dir/beta_diversity/$matrix_type/$pcoa_file`;
}
# alpha
`alpha_diversity.py -i $sn_otu_table_file -t $sn_tree_file -o $global_SB_results_dir/alpha_diversity.txt -m chao1,chao1_confidence,PD_whole_tree,observed_species,simpson,shannon,fisher_alpha`;
print "Summarizing by taxa.....\n";
`summarize_taxa.py -i $sn_otu_table_file -o $global_SB_results_dir/breakdown_by_taxonomy/`;
}
print "Results are located in: $global_working_dir/results/\n";
####
#### END SEQ NORMALISED DATA SET PROCESSING
####
# stop the interpreter
$global_R_instance->stop();
######################################################################
# CUSTOM SUBS
######################################################################
sub jack_knifing
{
#-----
# Do jackknifing for a specific type of matrix
#
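# Pipeline: beta_diversity.py on the full table, UPGMA cluster the result,
# then beta_diversity.py + UPGMA on each jackknifed (rarefied) table,
# tree_compare.py to attach jackknife support to the master tree,
# principal_coordinates.py for ordination, and finally a support-labelled
# tree PDF via make_bootstrapped_tree.py
#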
my ($matrix_type, $raw_otu_table, $raw_tree, $jack_knife_folder ) = @_;
# Produce distance matrix reflecting beta diversity in non-normalised OTU table
`beta_diversity.py -i $raw_otu_table -o $global_TB_results_dir/beta_diversity/$nn_prefix/$matrix_type -m $matrix_type -t $raw_tree`;
# Perform UPGMA clustering on non-normalised distance matrix
my $beta_otu_table = $matrix_type."_".$nn_prefix."_otu_table.txt";
my $upgma_cluster_tree = $matrix_type."_".$nn_prefix."_otu_table_upgma.tre";
`upgma_cluster.py -i $global_TB_results_dir/beta_diversity/$nn_prefix/$matrix_type/$beta_otu_table -o $global_TB_results_dir/beta_diversity/$nn_prefix/$matrix_type/$upgma_cluster_tree`;
# Produce distance matrices reflecting beta diversity in the rarefied OTU tables
`beta_diversity.py -i $jack_knife_folder -o $global_TB_results_dir/beta_diversity/$nn_prefix/$matrix_type/rare_dm/ -m $matrix_type -t $raw_tree`;
# Perform UPGMA clustering on rarefied distance matrices
`upgma_cluster.py -i $global_TB_results_dir/beta_diversity/$nn_prefix/$matrix_type/rare_dm/ -o $global_TB_results_dir/beta_diversity/$nn_prefix/$matrix_type/rare_upgma/`;
# Compare the consensus tree to the beta-derived trees
`tree_compare.py -s $global_TB_results_dir/beta_diversity/$nn_prefix/$matrix_type/rare_upgma/ -m $global_TB_results_dir/beta_diversity/$nn_prefix/$matrix_type/$upgma_cluster_tree -o $global_TB_results_dir/beta_diversity/$nn_prefix/$matrix_type/upgma_cmp/`;
# Compute principal coordinates
`principal_coordinates.py -i $global_TB_results_dir/beta_diversity/$nn_prefix/$matrix_type/rare_dm/ -o $global_TB_results_dir/beta_diversity/$nn_prefix/$matrix_type/pcoa/`;
# Make PDF of Jackknife tree with labeled support: weighted unifrac command
my $output_pdf = "$global_TB_results_dir/$matrix_type"."_betadiv_jackknife_tree.pdf";
`make_bootstrapped_tree.py -m $global_TB_results_dir/beta_diversity/$nn_prefix/$matrix_type/upgma_cmp/master_tree.tre -s $global_TB_results_dir/beta_diversity/$nn_prefix/$matrix_type/upgma_cmp/jackknife_support.txt -o $output_pdf`;
}
sub run_R_cmd
{
#-----
# Wrapper for running R commands
#
my ($cmd) = @_;
#print "$cmd\n";
$global_R_instance->run($cmd);
}
sub find_centroid_table
{
#-----
# Find the centroid OTU table from a set of rarefied tables (do this in RRRRRR...)
#
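# Approach: for each rarefied table build the sample x sample Euclidean
# distance matrix, average all the matrices, then return the index of the
# table whose matrix lies closest to that average. The per-table step in R:
#   u <- read.table(name, sep="\t", row.names=1)  # rows: OTUs, cols: samples (+ lineage)
#   u[, nsamples + 1] <- NULL                     # drop the lineage column
#   m <- as.matrix(dist(t(u), upper=TRUE, diag=TRUE))
#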
my($path, $norm_size, $dist_file, $log_file) = @_;
print "Calculating centroid OTU table from tables in $path...\n";
$global_R_instance->start();
my $sampl_p1 = $global_num_samples + 1;
# read in the list of rarefied OTU tables
run_R_cmd(qq`library(foreign);`);
run_R_cmd(qq`a<-list.files("$path", "*.txt");`);
# work out how many there are and allocate an array
run_R_cmd(qq`len_a <- length(a);`);
run_R_cmd(qq`big_frame <- array(0,dim=c($global_num_samples,$global_num_samples,len_a));`);
print " --start loading data...\n";
# load each file individually into a big frame
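# tables are named rarefaction_<norm_size>_<j>.txt with j counting from 0,
# which is why the centroid index is decremented after the R lookup below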
my $r_str = "for (i in c(1:len_a)) { j <- i - 1; name <- paste(\"$path\",\"rarefaction_$norm_size\",\"_\",j,\".txt\",sep=\"\"); u<-read.table(name,sep=\"\\t\",row.names=1); u[,$sampl_p1]<-NULL; big_frame[,,i]<-as.matrix(dist(t(u), upper=TRUE, diag=TRUE)); i<-i+1; }";
run_R_cmd($r_str);
print " --data loaded, calculating centroid...\n";
# find the average matrix
run_R_cmd(qq`ave <- big_frame[,,1];`);
run_R_cmd(qq`for (i in c(2:len_a)) { ave <- ave + big_frame[,,i]; }`);
run_R_cmd(qq`ave <- ave/len_a;`);
print " --calculating didtances of tables to centroid...\n";
# find the euclidean (Frobenius) distance of each matrix from the average;
# note the parentheses: we sum the squared differences, not square the sum
run_R_cmd(qq`dist<-array(0,dim=c(len_a));`);
run_R_cmd(qq`for (i in c(1:len_a)) { dist[i] <- sqrt(sum((big_frame[,,i]-ave)^2)); }`);
# find the min value
run_R_cmd(qq`min_index <- which.min(dist);`);
my $centroid_otu_index = $global_R_instance->get('min_index');
# R indexes from 1 but the rarefaction files are numbered from 0
$centroid_otu_index--;
print " --table: $centroid_otu_index is the centroid table\n";
# make stats on the distances
# and log what we did
open my $log_fh, ">", $log_file or die "Could not open log file: $log_file : $!\n";
run_R_cmd(qq`max_dist <- max(dist);`);
run_R_cmd(qq`min_dist <- min(dist);`);
run_R_cmd(qq`range_dist <- max_dist - min_dist;`);
run_R_cmd(qq`mean_dist <- mean(dist);`);
run_R_cmd(qq`median_dist <- median(dist);`);
print $log_fh "---------------------------------------------------\n";
print $log_fh " Centroid OTU table based normalised statistics\n";
print $log_fh "---------------------------------------------------\n";
print $log_fh "Max dist:\t".$global_R_instance->get('max_dist')."\n";
print $log_fh "Min dist:\t".$global_R_instance->get('min_dist')."\n";
print $log_fh "Range:\t".$global_R_instance->get('range_dist')."\n";
print $log_fh "Mean:\t".$global_R_instance->get('mean_dist')."\n";
print $log_fh "Median:\t".$global_R_instance->get('median_dist')."\n";
if(2 < $global_num_samples)
{
run_R_cmd(qq`library(permute);`);
run_R_cmd(qq`library(vegan);`);
run_R_cmd(qq`mantel.otu <- mantel(ave,big_frame[,,min_index]);`);
run_R_cmd(qq`m_stat <- mantel.otu\$statistic;`);
run_R_cmd(qq`m_sig <- mantel.otu\$signif;`);
print $log_fh "Mantel P stat:\t".$global_R_instance->get('m_sig')."\n";
print $log_fh "Mantel R stat:\t".$global_R_instance->get('m_stat')."\n";
}
else
{
print "Too few samples to perform a mantel test.\n";
}
close $log_fh;
# print all the distances to a file so we can make purdy pictures from them later
open my $dist_fh, ">", $dist_file or die "Could not open distance file: $dist_file : $!\n";
my $num_tables = $global_R_instance->get('len_a');
foreach my $counter (1..$num_tables)
{
print $dist_fh $global_R_instance->get("dist[$counter]")."\n"
}
close $dist_fh;
# let the user know the result
return $centroid_otu_index;
}
sub find_centroid_sequences
{
#-----
# Find the normalised read set whose OTU table sits closest to the centroid
#
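# Approach (a sketch of what follows): write $num_reps read sets normalised
# to $global_norm_sample_size reads, build an OTU table and a sample x sample
# distance matrix for each, then keep the read set whose matrix is closest to
# the centroid matrix 'ave' left in the R session by find_centroid_table
#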
my($path, $num_reps, $seqs, $final_seq_file, $dist_file, $log_file) = @_;
print "Normalising read counts...\n";
print "Input: $seqs\n";
print "Num reps: $num_reps\n";
# make a bunch of normalised read files
`mkdir -p $path`;
foreach my $file_counter (1..$num_reps)
{
print ".";
# make a normalised file
my $norm_seq_file_root = $path."norm_".$file_counter;
my $norm_seq_file = $norm_seq_file_root.".fa";
my $norm_otus_file = $norm_seq_file_root."_otus.txt";
my $norm_otu_table_file = $norm_seq_file_root."_otu_table.txt";
`app_normalise_reads.pl -in $seqs -n $global_norm_sample_size -o $norm_seq_file`;
# make an otu table for each guy
`pick_otus.py -i $norm_seq_file -s $global_similarity_setting -o $path`;
`make_otu_table.py -i $norm_otus_file -o $norm_otu_table_file`;
}
print "\n";
# find which normalised table is closest to the centroid ('ave') computed previously; use R
# the variables from find_centroid_table should still be in the R session's memory
my $sampl_p1 = $global_num_samples + 1;
# make a 3d data frame again
$global_R_instance->run(qq`new_frame <- array(0,dim=c($global_num_samples,$global_num_samples,$num_reps));`);
# load each file individually into a big frame
my $r_str = "for (i in c(1:$num_reps)) { name <- paste(\"$path"."norm_\",i,\"_otu_table.txt\",sep=\"\"); u<-read.table(name,sep=\"\\t\",row.names=1); u[,$sampl_p1]<-NULL; new_frame[,,i]<-as.matrix(dist(t(u), upper=TRUE, diag=TRUE)); i<-i+1; }";
$global_R_instance->run($r_str);
# find the euclidean (Frobenius) distance of each matrix from the average
# (summing squared differences, as in find_centroid_table)
$global_R_instance->run(qq`dist_seq<-array(0,dim=c($num_reps));`);
$global_R_instance->run(qq`for (i in c(1:$num_reps)) { dist_seq[i] <- sqrt(sum((new_frame[,,i]-ave)^2)); }`);
# find the min value
$global_R_instance->run(qq`closest_norm <- which.min(dist_seq);`);
my $closest_norm_index = $global_R_instance->get('closest_norm');
# copy the sequence file over that we wish to use
my $cp_cmd = "cp $path"."norm_$closest_norm_index".".fa ".$final_seq_file;
`$cp_cmd`;
# make stats on the distances
# and log what we did
open my $log_fh, ">", $log_file or die "Could not open log file: $log_file : $!\n";
$global_R_instance->run(qq`max_dist <- max(dist_seq);`);
$global_R_instance->run(qq`min_dist <- min(dist_seq);`);
$global_R_instance->run(qq`range_dist <- max_dist - min_dist;`);
$global_R_instance->run(qq`mean_dist <- mean(dist_seq);`);
$global_R_instance->run(qq`median_dist <- median(dist_seq);`);
print $log_fh "-----------------------------------------------\n";
print $log_fh " Read based normalised statistics\n";
print $log_fh "-----------------------------------------------\n";
print $log_fh "Max dist:\t".$global_R_instance->get('max_dist')."\n";
print $log_fh "Min dist:\t".$global_R_instance->get('min_dist')."\n";
print $log_fh "Range:\t".$global_R_instance->get('range_dist')."\n";
print $log_fh "Mean:\t".$global_R_instance->get('mean_dist')."\n";
print $log_fh "Median:\t".$global_R_instance->get('median_dist')."\n";
if(2 < $global_num_samples)
{
$global_R_instance->run(qq`mantel.otu <- mantel(ave,new_frame[,,closest_norm]);`);
$global_R_instance->run(qq`m_stat <- mantel.otu\$statistic;`);
$global_R_instance->run(qq`m_sig <- mantel.otu\$signif;`);
print $log_fh "Mantel P stat:\t".$global_R_instance->get('m_sig')."\n";
print $log_fh "Mantel R stat:\t".$global_R_instance->get('m_stat')."\n";
}
else
{
print "Too few samples to perform a mantel test.\n";
}
# print all the distances to a file so we can make purdy pictures from them later
open my $dist_fh, ">", $dist_file or die "Could not open distance file: $dist_file : $!\n";
foreach my $counter (1..$num_reps)
{
print $dist_fh $global_R_instance->get("dist_seq[$counter]")."\n"
}
close $dist_fh;
print "Done\n";
}
sub find_global_norm_sample_size
{
#-----
# pick a sane minimum number of reads
# Take the nearest multiple of 50 below the minimum sample size... ...more or less
#
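# e.g. a minimum sample of 1337 reads: nearest fifty is 1300, which is already
# more than 20 under, so normalise to 1300; a minimum of 1510 gives nearest
# fifty 1500, within 20 of the minimum, so step down and normalise to 1450
#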
my $twenny_under = $global_min_sample_size - 20;
my $nearest_fifty = int($global_min_sample_size / 50) * 50;
$global_norm_sample_size = $nearest_fifty;
if($nearest_fifty > $twenny_under)
{
$global_norm_sample_size -= 50;
}
if($global_norm_sample_size <= 0)
{
die "Your least abundant sample with $global_min_sample_size sequences is too small to continue!\nPlease remove this sample (and any with similar numbers) and try again.\n";
}
print "Normalised sample size calculated at: $global_norm_sample_size reads\n";
return $global_norm_sample_size;
}
sub copy_read_subset
{
#-----
# copy over the denoised reads into the processing dir
# only take reads whose IDs are in the $global_samp_ID_list
#
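# headers are assumed to look like ">sampleID_readnum ..." (sample ID before
# the first underscore) and sequences are assumed to sit on a single line,
# as in the denoised Acacia output
#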
my($source_fasta, $target_fasta) = @_;
open my $s_fh, "<", $source_fasta or die "**ERROR: could not open file: $source_fasta $!\n";
open my $t_fh, ">", $target_fasta or die "**ERROR: could not open file: $target_fasta $!\n";
my $good_seq = 0;
my %seen_seqs = ();
while(<$s_fh>)
{
if($_ =~ /^>/)
{
# header
my @components = split / /, $_;
# we need to ignore this guy if he's a duplicate
if(!exists $seen_seqs{$components[0]})
{
my $header = $components[0];
$header =~ s/>([^_]*)_.*/$1/;
if(1 == $global_samp_ID_list{$header})
{
$good_seq = 1;
print $t_fh $_;
}
$seen_seqs{$components[0]} = 1;
}
}
elsif(1 == $good_seq)
{
print $t_fh $_;
$good_seq = 0;
}
}
close $s_fh;
close $t_fh;
}
sub parse_config_results
{
#-----
# parse the app config file and produce a qiime mappings file
#
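# assumed layout: one tab-separated line per sample (fields indexed via %FNA,
# e.g. SampleID, BarcodeSequence, LinkerPrimerSequence, USE, ACC, Description)
# up to a line starting with '@', then KEY=VALUE user options such as
# DB=SILVA or NORMALISE=TABLE,1000
#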
open my $conf_fh, "<", $options->{'config'} or die $!;
open my $mapping, ">", $global_mapping_file or die $!;
print $mapping "$FNB_HEADER\n";
while(<$conf_fh>)
{
next if($_ =~ /^#/);
last if($_ =~ /^@/);
chomp $_;
my @fields = split /\t/, $_;
# save the MID for later
$global_samp_ID_list{$fields[$FNA{'SampleID'}]} = $fields[$FNA{'USE'}];
# we need to find the globally maximal number of sequences for any USED sample
if("1" eq int $fields[$FNA{'USE'}])
{
$global_num_samples++;
my $sample_size = int $fields[$FNA{'ACC'}];
if($sample_size > $global_rare_X)
{
$global_rare_X = $sample_size;
}
if($sample_size < $global_min_sample_size)
{
$global_min_sample_size = $sample_size;
}
print $mapping "$fields[$FNA{'SampleID'}]\t$fields[$FNA{'BarcodeSequence'}]\t$fields[$FNA{'LinkerPrimerSequence'}]\t$fields[$FNA{'Description'}]\n";
}
}
close $mapping;
print "\t...Processing $global_num_samples samples\n";
# user options section
while(<$conf_fh>)
{
chomp $_;
my @fields = split /=/, $_;
if($#fields > 0)
{
if($fields[0] eq "DB")
{
if($fields[1] eq "SILVA")
{
$global_comp_DB_type = "SILVA";
}
}
elsif($fields[0] eq "NORMALISE")
{
# is this guy set?
if($fields[1] ne "")
{
chomp $fields[1];
my @norm_fields = split /,/, $fields[1];
$global_norm_style = $norm_fields[0];
# check the normalisation style is sane
if($global_norm_style ne "SEQ" and $global_norm_style ne "TABLE")
{
die "You must specify 'SEQ' or 'TABLE' as normalisation methods (if you specify anything)\n";
}
# see if the user decided on how many sequences to normalise to!
if($#norm_fields == 0)
{
# user did not specify an amount to normalise by
# select automatically
print "Finding normalisation size automatically\n";
find_global_norm_sample_size();
}
else
{
# user selected an amount, use this
$global_norm_sample_size = int $norm_fields[1];
print "Using user defined sample size of: $global_norm_sample_size\n";
if($global_norm_sample_size <= 0)
{
die "You need to specify a number greater than or equal to zero (or none at all).\nHint: try 'NORMALISE=$global_norm_style' or 'NORMALISE=$global_norm_style,XXX' where XXX is the number of sequences to normalise to\n";
}
}
}
}
elsif($fields[0] eq "MUL_RARE_M")
{
# is this guy set?
if($fields[1] ne "")
{
my $user_min = int($fields[1]);
if($user_min > 0)
{
$global_rare_M = $user_min;
}
}
}
elsif($fields[0] eq "MUL_RARE_X")
{
# is this guy set?
if($fields[1] ne "")
{
my $user_max = int($fields[1]);
if($user_max < $global_rare_X)
{
$global_rare_X = $user_max;
}
}
}
elsif($fields[0] eq "MUL_RARE_S")
{
# is this guy set?
if($fields[1] ne "")
{
my $user_step = int($fields[1]);
if($user_step > 0)
{
$global_rare_S = $user_step;
}
}
}
elsif($fields[0] eq "MUL_RARE_N")
{
# is this guy set?
if($fields[1] ne "")
{
my $user_num = int($fields[1]);
if($user_num > 0)
{
$global_rare_N = $user_num;
}
}
}
}
}
close $conf_fh;
# finally, check to see if we've picked a normalisation value
if(0 == $global_norm_sample_size)
{
# user did not specify an amount to normalise by
# select automatically
print "Finding normalisation size automatically\n";
find_global_norm_sample_size();
}
}
######################################################################
# TEMPLATE SUBS
######################################################################
sub checkParams {
# identity and the e-value are fractional, so parse them as floats
my @standard_options = ( "help|h+", "config|c:s", "identity|i:f", "e:f");
my %options;
# Add any other command line options, and the code to handle them
#
GetOptions( \%options, @standard_options );
# if no arguments supplied print the usage and exit
#
exec("pod2usage $0") if (0 == (keys (%options) ));
# If the -help option is set, print the usage and exit
#
exec("pod2usage $0") if $options{'help'};
# Compulsory items
if(!exists $options{'config'} ) { print "**ERROR: you MUST give a config file\n"; exec("pod2usage $0"); }
if(exists $options{'identity'})
{
if(($options{'identity'} <= 0) || ($options{'identity'} > 1))
{
die "Identity must be an integer greater than 0 and no greater than 1\n";
}
}
#if(!exists $options{''} ) { print "**ERROR: \n"; exec("pod2usage $0"); }
return \%options;
}
sub printAtStart {
print<<"EOF";
----------------------------------------------------------------
$0
Copyright (C) 2011 Michael Imelfort and Paul Dennis
This program comes with ABSOLUTELY NO WARRANTY;
This is free software, and you are welcome to redistribute it
under certain conditions: See the source for more details.
----------------------------------------------------------------
EOF
}
__DATA__
=head1 NAME
app_make_results.pl
=head1 COPYRIGHT
copyright (C) 2011 Michael Imelfort and Paul Dennis
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
=head1 DESCRIPTION
app_make_results.pl takes an APP config file and completes the QIIME pipeline for the corresponding data set. It copies the denoised reads, picks OTUs and a representative set, assigns taxonomy against greengenes (the default) or SILVA, builds a phylogenetic tree and an OTU table, normalises the table by selecting the rarefied table closest to the centroid of many rarefactions (or, optionally, normalises at the sequence level), and writes alpha and beta diversity results, rarefaction plots and jackknifed UPGMA trees into the results directory.
=head1 SYNOPSIS
app_make_results.pl -c|config CONFIG_FILE [-i|identity VALUE] [-e EVALUE] [-help|h]
 -c CONFIG_FILE App config file to be processed
 [-i|identity VALUE] Set blast identity [default: 0.97]
 [-e EVALUE] Set e-value for blast (assign taxonomy) [default: 0.001]
 [-help|h] Displays basic usage information
=cut