https://github.com/CuppenResearch/IAP
Tip revision: 8231e0cfdf0b5c1f092ecf5062c1aa344800267a authored by rernst on 12 June 2015, 14:11:51 UTC
Merge branch 'hotfix-v1.2.4'
Merge branch 'hotfix-v1.2.4'
Tip revision: 8231e0c
illumina_baseRecal.pm
#!/usr/bin/perl -w
##################################################################################################################################################
###This script is designed to run GATK baseRecalibration using GATK Queue
###
###
###Author: R.F.Ernst
###Latest change: Created skeleton
###
###
##################################################################################################################################################
package illumina_baseRecal;
use strict;
use POSIX qw(tmpnam);
sub runBaseRecalibration {
my $configuration = shift;
my %opt = %{$configuration};
print "Running base recalibration for the following BAM-files:\n";
foreach my $sample (@{$opt{SAMPLES}}){
### Check input .bam files
my $inBam = $opt{BAM_FILES}->{$sample};
(my $inFlagstat = $inBam) =~ s/bam/flagstat/;
(my $outBam = $inBam) =~ s/bam/recalibrated.bam/;
(my $outBai = $inBam) =~ s/bam/recalibrated.bai/;
(my $outFlagstat = $inBam) =~ s/bam/recalibrated.flagstat/;
print "\t$opt{OUTPUT_DIR}/$sample/mapping/$inBam\n";
$opt{BAM_FILES}->{$sample} = $outBam;
### Check output .bam files
if (-e "$opt{OUTPUT_DIR}/$sample/logs/BaseRecalibration_$sample.done"){
print "\t WARNING: $opt{OUTPUT_DIR}/$sample/logs/BaseRecalibration_$sample.done exists, skipping \n";
next;
}
### Build Queue command
my $javaMem = $opt{BASERECALIBRATION_MASTERTHREADS} * $opt{BASERECALIBRATION_MEM};
my $command = "java -Xmx".$javaMem."G -Xms".$opt{BASERECALIBRATION_MEM}."G -jar $opt{QUEUE_PATH}/Queue.jar ";
# cluster options
$command .= "-jobQueue $opt{BASERECALIBRATION_QUEUE} -jobNative \"-pe threaded $opt{BASERECALIBRATION_THREADS}\" -jobRunner GridEngine -jobReport $opt{OUTPUT_DIR}/$sample/logs/BaseRecalibration.jobReport.txt "; #Queue options
# baseRecalibration options
$command .= "-S $opt{BASERECALIBRATION_SCALA} -R $opt{GENOME} -I $opt{OUTPUT_DIR}/$sample/mapping/$inBam -mem $opt{BASERECALIBRATION_MEM} -nct $opt{BASERECALIBRATION_THREADS} -nsc $opt{BASERECALIBRATION_SCATTER} ";
### Parsing known files and add them to $command.
my @knownFiles;
if($opt{BASERECALIBRATION_KNOWN}) {
@knownFiles = split('\t', $opt{BASERECALIBRATION_KNOWN});
foreach my $knownFile (@knownFiles) {
if(! -e $knownFile){ die"ERROR: $knownFile does not exist\n" }
else { $command .= "-knownSites $knownFile " }
}
}
### retry option
if($opt{QUEUE_RETRY} eq 'yes'){
$command .= "-retry 1 ";
}
$command .= "-run";
### Create bash script
my $jobID = "BR_".$sample."_".get_job_id();
my $bashFile = $opt{OUTPUT_DIR}."/".$sample."/jobs/".$jobID.".sh";
my $logDir = $opt{OUTPUT_DIR}."/".$sample."/logs";
open BASERECAL_SH, ">$bashFile" or die "cannot open file $bashFile \n";
print BASERECAL_SH "#!/bin/bash\n\n";
print BASERECAL_SH "bash $opt{CLUSTER_PATH}/settings.sh\n\n";
print BASERECAL_SH "cd $opt{OUTPUT_DIR}/$sample/tmp/\n";
print BASERECAL_SH "echo \"Start base recalibration\t\" `date` \"\t$inBam\t\" `uname -n` >> ../logs/$sample.log\n\n";
print BASERECAL_SH "if [ -f $opt{OUTPUT_DIR}/$sample/mapping/$inBam ]\n";
print BASERECAL_SH "then\n";
print BASERECAL_SH "\t$command\n";
print BASERECAL_SH "else\n";
print BASERECAL_SH "\techo \"ERROR: $opt{OUTPUT_DIR}/$sample/mapping/$inBam does not exist.\" >&2\n";
print BASERECAL_SH "fi\n";
close BASERECAL_SH;
### Submit baserecal bash script
if ( @{$opt{RUNNING_JOBS}->{$sample}} ){
system "qsub -q $opt{BASERECALIBRATION_MASTERQUEUE} -m a -M $opt{MAIL} -pe threaded $opt{BASERECALIBRATION_MASTERTHREADS} -o $logDir/BaseRecalibration_$sample.out -e $logDir/BaseRecalibration_$sample.err -N $jobID -hold_jid ".join(",",@{$opt{RUNNING_JOBS}->{$sample}})." $bashFile";
} else {
system "qsub -q $opt{BASERECALIBRATION_MASTERQUEUE} -m a -M $opt{MAIL} -pe threaded $opt{BASERECALIBRATION_MASTERTHREADS} -o $logDir/BaseRecalibration_$sample.out -e $logDir/BaseRecalibration_$sample.err -N $jobID $bashFile";
}
### Create flagstat bash script
my $jobIDFS = "BRFS_".$sample."_".get_job_id();
my $bashFileFS = $opt{OUTPUT_DIR}."/".$sample."/jobs/".$jobIDFS.".sh";
open BASERECALFS_SH, ">$bashFileFS" or die "cannot open file $bashFileFS \n";
### Generate FlagStats if gatk .done file present
print BASERECALFS_SH "cd $opt{OUTPUT_DIR}/$sample/tmp/\n";
print BASERECALFS_SH "if [ -f $opt{OUTPUT_DIR}/$sample/tmp/.$outBam.done ]\n";
print BASERECALFS_SH "then\n";
print BASERECALFS_SH "\t$opt{SAMBAMBA_PATH}/sambamba flagstat -t $opt{FLAGSTAT_THREADS} $opt{OUTPUT_DIR}/$sample/tmp/$outBam > $opt{OUTPUT_DIR}/$sample/mapping/$outFlagstat\n";
print BASERECALFS_SH "fi\n\n";
### Check FlagStats and move files if correct else print error
print BASERECALFS_SH "if [ -s $opt{OUTPUT_DIR}/$sample/mapping/$inFlagstat ] && [ -s $opt{OUTPUT_DIR}/$sample/mapping/$outFlagstat ]\n";
print BASERECALFS_SH "then\n";
print BASERECALFS_SH "\tFS1=\`grep -m 1 -P \"\\d+ \" $opt{OUTPUT_DIR}/$sample/mapping/$inFlagstat | awk '{{split(\$0,columns , \"+\")} print columns[1]}'\`\n";
print BASERECALFS_SH "\tFS2=\`grep -m 1 -P \"\\d+ \" $opt{OUTPUT_DIR}/$sample/mapping/$outFlagstat | awk '{{split(\$0,columns , \"+\")} print columns[1]}'\`\n";
print BASERECALFS_SH "\tif [ \$FS1 -eq \$FS2 ]\n";
print BASERECALFS_SH "\tthen\n";
print BASERECALFS_SH "\t\tmv $opt{OUTPUT_DIR}/$sample/tmp/$outBam $opt{OUTPUT_DIR}/$sample/mapping/\n";
print BASERECALFS_SH "\t\tmv $opt{OUTPUT_DIR}/$sample/tmp/$outBai $opt{OUTPUT_DIR}/$sample/mapping/\n";
print BASERECALFS_SH "\t\ttouch $opt{OUTPUT_DIR}/$sample/logs/BaseRecalibration_$sample.done\n";
print BASERECALFS_SH "\telse\n";
print BASERECALFS_SH "\t\techo \"ERROR: $opt{OUTPUT_DIR}/$sample/mapping/$inFlagstat and $opt{OUTPUT_DIR}/$sample/mapping/$outFlagstat do not have the same read counts\" >>../logs/BaseRecalibration_$sample.err\n";
print BASERECALFS_SH "\tfi\n";
print BASERECALFS_SH "else\n";
print BASERECALFS_SH "\techo \"ERROR: Either $opt{OUTPUT_DIR}/$sample/mapping/$inFlagstat or $opt{OUTPUT_DIR}/$sample/mapping/$outFlagstat is empty.\" >> ../logs/BaseRecalibration_$sample.err\n";
print BASERECALFS_SH "fi\n\n";
print BASERECALFS_SH "echo \"End base recalibration\t\" `date` \"\t$inBam\t\" `uname -n` >> ../logs/$sample.log\n";
close BASERECALFS_SH;
### Submit flagstat bash script
system "qsub -q $opt{FLAGSTAT_QUEUE} -m a -M $opt{MAIL} -pe threaded $opt{FLAGSTAT_THREADS} -o $logDir/BaseRecalibrationFS_$sample.out -e $logDir/BaseRecalibrationFS_$sample.err -N $jobIDFS -hold_jid $jobID $bashFileFS";
push(@{$opt{RUNNING_JOBS}->{$sample}}, $jobID);
push(@{$opt{RUNNING_JOBS}->{$sample}}, $jobIDFS);
}
return \%opt;
}
############
sub get_job_id {
my $id = tmpnam();
$id=~s/\/tmp\/file//;
return $id;
}
############
1;