https://github.com/mattmattmattmatt/DeepSNVMiner
Tip revision: a98fb6f4225efc43c74545c460543612284d96cb authored by mattmattmattmatt on 24 July 2020, 02:23:15 UTC
Added correct citation
Added correct citation
Tip revision: a98fb6f
run_deepseq.pl
#! /usr/bin/perl -w
use strict;
use FindBin;
use lib "$FindBin::Bin";
use modules::Deepseq;
use modules::Exception;
use modules::SystemCall;
use Data::Dumper;
use File::Basename;
use Cwd 'abs_path';
use Pod::Usage;
use Getopt::Long;
use vars qw(%OPT);
# Options from command line:
GetOptions(
\%OPT,
"help|h",
"man|m",
"filename_stub=s",
"read1_fastq=s",
"read2_fastq=s",
"start_command=s",
"end_command=s",
"working_dir=s",
"no_adaptor",
"no_uid",
"uid_len1=i",
"uid_len2=i",
"coord_bed=s",
"ref_fasta=s",
"bwa=s",
"samtools=s",
"sm_count=i",
"sm_portion=f",
"threads=i",
"graph",
"min_seqlen=i",
"min_group=i",
"conf_file=s",
"no_pair_match",
"cut_length=i",
"no_revcom",
"combine_reads",
"uid_done"
) || modules::Exception->throw("ERROR: Problem with command line arguments");;
pod2usage(-verbose => 2) if $OPT{man};
pod2usage(1) if ($OPT{help} || !$OPT{filename_stub} || !$OPT{read1_fastq} || !$OPT{read2_fastq} || !$OPT{coord_bed}) ;
=pod
=head1 SYNOPSIS
run_deepseq.pl -filename_stub unique_samplename -read1_fastq fastq1 -read2_fastq fastq2 -start_command command_to_start_from(default=first_command) -end_command last_command_to_run(default=last_command) -working_dir working_dir(default=pwd/filename_stub_RAND) -no_adaptor no_adaptor_sequence(default=present) -uid_len1 length_of_5prime_uid_length(default=10) -uid_len2 length_of_3prime_uid_length(default=0) -bwa full_bwa_path(default=/usr/bin/bwa) -samtools full_samtools_path(default=/usr/bin/samtools) -ref_fasta full_path_to_reference_fasta(must contain bwa index files as well) -coord_bed bed_file_containing_target_coordinates -sm_count min_number_of_reads_for_supermutant(default=10) -sm_portion min_fraction_of_variant_bases_within_UID_reads(default=0.90) -threads num_threads_for_bwa(default=1) -config create_config_file -graph generate_variant_graphs_for_each_genomic_region(requires R) -min_seqlen minimum_sequence_length_to_keep_seq(default=0) -min_group min_num_of_passing_groups_to_qualify_for_supermutant(default=2) -conf_file deepseq_conf_fiel(default=deepseq.conf) -no_pair_match don't_require_barcodes_in_read_pairs_to_match -cut_length remove_this_many_bases(default=uid1+uid2) -no_revcom no_revcom_for_read2 -combine_reads combine_barcodes_from_R1_and_R2 -uid_done uid_already_in_fastq(format_is_">read1:UID=SEQ")
Required flags: -filename_stub -read1_fastq -read2_fastq -coord_bed
=head1 OPTIONS
-help brief help message
-man full documentation
=head1 NAME
run_deepseq.pl -> Wrapper script for deepseq run
=head1 DESCRIPTION
date
a script that ...
=head1 AUTHOR
Matthew Field
=head1 EXAMPLE
sample -f file
=cut
my $bwa;
my $bwa_index;
my $samtools;
my $ref_fasta;
my $PRINT_TO_LOGFILE = 1;
my $PRINT_TO_STDOUT = 1;
my (undef,$base) = fileparse(abs_path($0));
my $conf_file = defined $OPT{conf_file}?$OPT{conf_file}:$base.'/deepseq.conf';
my $sys_call = modules::SystemCall->new();
#Set the variables either using a conf file or parse them from the command line
if (-e $conf_file) {
#Read the config file to get paths needed
open(FILE,"$conf_file") || modules::Exception->throw("Can't open file $conf_file\n");
while (<FILE>) {
chomp;
if (/samtools=(.+)$/) {
$samtools = $1;
if ( !-e $samtools ) {
modules::Exception->throw("Samtools $samtools doesn't exist");
}
} elsif (/bwa=(.+)$/) {
$bwa = $1;
if ( !-e $bwa ) {
modules::Exception->throw("BWA $bwa doesn't exist");
}
} elsif (/ref_fasta=(.+)$/) {
$ref_fasta = $1;
if ( !-e $ref_fasta ) {
modules::Exception->throw("Ref fasta $ref_fasta doesn't exist");
}
} elsif (/bwa_index=(.+)$/) {
$bwa_index = $1;
if ( !-e $bwa_index ) {
if (!-e $bwa_index.'.sa') {
modules::Exception->throw("BWA index file $bwa_index doesn't exist");
}
}
}
}
} else {
#otherwise all the variables need to be defined on the command line or available in /usr/bin
if (!$OPT{ref_fasta}) {
modules::Exception->throw("ERROR: Without config file must use -ref_fasta");
} else {
$ref_fasta = $OPT{ref_fasta};
}
my $ref_fasta_abs = abs_path($ref_fasta);
#Check the bwa index is present....
my $bwa_index_file = $ref_fasta_abs . '.sa';
if (!-e $bwa_index_file) {
#Sometimes it ref.fa.sa but sometimes it's ref.sa so check both
print "Can't find the index file $bwa_index_file\n";
($bwa_index_file = $ref_fasta_abs) =~ s/\.fa$/\.sa/;
print "Checking for index file $bwa_index_file\n";
if (!-e $bwa_index_file) {
modules::Exception->throw("ERROR: Need to generate the index files for bwa in the referece directory; either 1) run bwa beforehand with 'bwa index -a bwtsw ref.fa' OR 2) run configure_deepseq.pl first");
}
}
#bwa only requires the file prefix name
($bwa_index = $bwa_index_file) =~ s/\.sa$//;
if ($OPT{bwa}) {
$bwa = $OPT{bwa};
} elsif (-e '/usr/bin/bwa') {
$bwa = '/usr/bin/bwa';
} else {
modules::Exception->throw("ERROR: Without config file must use -bwa or have bwa at /usr/bin/bwa");
}
if ($OPT{samtools}) {
$samtools = $OPT{samtools};
} elsif (-e '/usr/bin/samtools') {
$samtools = '/usr/bin/samtools';
} else {
modules::Exception->throw("ERROR: Without config file must use -samtools or have samtools at /usr/bin/samtools");
}
}
#Check the fasta is available
if (!-e $ref_fasta) {
modules::Exception->throw("ERROR: Problem with $ref_fasta");
}
my $ref_fasta_abs = abs_path($ref_fasta);
my $pwd = `pwd`;
chomp $pwd;
#now get the sequence specific variables
my $filename_stub = $OPT{filename_stub};
my $read1_fastq = $OPT{read1_fastq};
my $read2_fastq = $OPT{read2_fastq};
if ($read1_fastq =~ /bz2$/ || $read1_fastq =~ /gz$/ || $read1_fastq =~ /zip$/) {
modules::Exception->throw("ERROR: Can't work with compressed files; please uncompress first");
}
my $working_dir;
if (defined $OPT{working_dir}) {
$working_dir = $OPT{working_dir};
} else {
$working_dir = $pwd . '/' . $filename_stub . '_' . $$;
if (! -e $working_dir){
`mkdir $working_dir`;
}
}
if ($OPT{graph}) {
if (!-e "$working_dir/graphs") {
`mkdir $working_dir/graphs`;
}
}
my $adaptor = defined $OPT{no_adaptor}?0:1; #default is that there is adaptor sequence info
my $uid = defined $OPT{no_uid}?0:1; #default is that there is uid info
my $uid_len1 = defined $OPT{uid_len1}?$OPT{uid_len1}:10;
my $uid_len2 = defined $OPT{uid_len2}?$OPT{uid_len2}:0;
my $coord_bed = $OPT{coord_bed};
if ( !-e $coord_bed ) {
modules::Exception->throw("File $coord_bed doesn't exist");
}
#Change all directories files to absolute path
my $working_dir_abs = abs_path($working_dir);
my $bwa_abs = abs_path($bwa);
my $samtools_abs = abs_path($samtools);
my $coord_bed_abs = abs_path($coord_bed);
my $read1_fastq_abs = abs_path($read1_fastq);
my $read2_fastq_abs = abs_path($read2_fastq);
#Check they all exist
if ( !-e $ref_fasta_abs ) {
modules::Exception->throw("File $ref_fasta_abs doesn't exist");
}
if ( !-e $read1_fastq_abs || !-e $read2_fastq_abs ) {
modules::Exception->throw("File $read1_fastq_abs || !-e $read2_fastq_abs doesn't exist");
}
if ( !-e $working_dir_abs ) {
modules::Exception->throw("File $working_dir_abs doesn't exist");
}
if ( !-e $bwa_abs ) {
modules::Exception->throw("File $bwa_abs doesn't exist");
}
if ( !-e $samtools_abs ) {
modules::Exception->throw("File $samtools_abs doesn't exist");
}
if ( !-e $coord_bed_abs ) {
modules::Exception->throw("File $coord_bed_abs doesn't exist");
}
my $read1_fastq_short = my $read2_fastq_short;
unless (chdir $working_dir_abs) {
modules::Exception->throw("Didn't manage to change to working dir [$working_dir_abs]. Still in this directory [$pwd]");
#unless $pwd eq $working_dir;
} else {
#Create symlinks for read files (don't copy; too time consuming)
`ln -s $read1_fastq_abs .`;
`ln -s $read2_fastq_abs .`;
#pass these as we've created symlinks now
($read1_fastq_short) = basename($read1_fastq_abs);
($read2_fastq_short) = basename($read2_fastq_abs);
}
#Set the required arguments
my %args = (
-filename_stub => $filename_stub,
-read1_fastq => $read1_fastq_short,
-read2_fastq => $read2_fastq_short,
-working_dir => $working_dir_abs,
-uid_len1 => $uid_len1,
-uid_len2 => $uid_len2,
-coord_bed => $coord_bed_abs,
-ref_fasta => $ref_fasta_abs,
-bwa => $bwa_abs,
-bwa_index => $bwa_index,
-samtools => $samtools_abs,
-base=>$base
);
#Optional arguments
if (!$uid) {
$args{-no_uid} = 1;
}
if ($OPT{no_pair_match}) {
$args{-no_pair_match} = 1;
}
if ($OPT{no_revcom}) {
$args{-no_revcom} = 1;
}
if ($OPT{combine_reads}) {
$args{-combine_reads} = 1;
}
if ($OPT{cut_length}) {
$args{-cut_length} = $OPT{cut_length};
}
if (defined $OPT{sm_count}) {
$args{-sm_count} = $OPT{sm_count};
}
if (defined $OPT{sm_portion}) {
$args{-sm_portion} = $OPT{sm_portion};
}
if (defined $OPT{threads}) {
$args{-num_threads} = $OPT{threads};
}
if (defined $OPT{min_seqlen}) {
$args{-min_seqlen} = $OPT{min_seqlen};
}
if (defined $OPT{min_group}) {
$args{-min_group} = $OPT{min_group};
}
if (defined $OPT{uid_done}) {
$args{-uid_done} = 1;
}
my $start_command = defined $OPT{'start_command'}?$OPT{start_command}:'check_fastq';
my $start_number = 1;
my $end_command = defined $OPT{'end_command'}?$OPT{end_command}:'graph';
# Start a log/results file
my $logfile = $filename_stub . '.log';
my $OUT;
if (! -e $logfile){
#$logfile = $working_dir_abs . '/' . $filename_stub . '.log';
open($OUT, ">$logfile")
or modules::Exception->throw("Unable to start logfile [$logfile] : $!\n");
} else {
open($OUT, ">>$logfile")
or modules::Exception->throw("Unable to start logfile [$logfile] : $!\n");
}
### select $OUT; # write as much blurb to logfile instead of STDOUT (turn this back to STDOUT with 'select STDOUT;')
print $OUT "\n### Log edited : " . &low_rent_timestamp() . " ###\n\n" if $PRINT_TO_LOGFILE;
print STDOUT "Logfile (check here also for error messages) : $logfile\n" if $PRINT_TO_STDOUT;
print $OUT "Working directory : $working_dir\n" if $PRINT_TO_LOGFILE;
# Create deepseq object
my $deepseq = modules::Deepseq->new(
%args
);
if ($start_command ne 'check_fastq') {
if (!$deepseq->check_step($start_command)) {
modules::Exception->throw("ERROR: Can't start at step $start_command; options are check_fastq,pool_reads,bwa,call_variants,report,or graph");
}
$start_number = $deepseq->get_step_number($start_command);
}
if ($end_command ne 'graph') {
if (!$deepseq->check_step($end_command)) {
modules::Exception->throw("ERROR: Can't end at step $end_command; options are check_fastq,pool_reads,bwa,call_variants,report,or graph");
}
}
#If there is adaptor sequence(s) get it from the fastq and mark it in each sequence to avoid creating supergroup of adaptors
if ($adaptor) {
if (my $adaptor_seqs = $deepseq->get_adaptor()) {
my $adaptor_str = join(",",@{$adaptor_seqs});
print "Will remove the following adaptors:\n$adaptor_str\n";
$deepseq->set_adaptor($adaptor_seqs);
}
}
# Retrieve commands from config file
my $commands = $deepseq->get_commands();
# Try to run each command in order
for my $command_count (sort keys %{$commands}) {
my @commands = @{$commands->{$command_count}{commands}};
my @outs = @{$commands->{$command_count}{stdout}};
if (@commands != @outs) {
modules::Exception->throw("ERROR: Different number of commands and stdout flags");
}
my $block = $commands->{$command_count}{block};
if ($start_number > 1 && $command_count < $start_number) {
next;
}
if ($block eq 'graph') {
next unless $OPT{graph};
}
print "\n\nRunning block $block\n\n";
for (my $i = 0; $i < @commands; $i++) {
print $OUT "\tRun " . $commands[$i] . "\n" if $PRINT_TO_LOGFILE;
if ($outs[$i] == 1) {
my $output = `$commands[$i]`;
print $OUT "$output" if $PRINT_TO_LOGFILE;
} else {
$sys_call->run($commands[$i]);
}
}
if ($end_command eq $block) {
exit;
}
}
# Close logfile
close($OUT);
# Return to previous directory
chdir $pwd;
sub low_rent_timestamp() {
my ($sec,
$min,
$hour,
$mday,
$month,
$year,
$wday,
$yday,
$isdst) = localtime(time);
$year += 1900;
$month += 1;
my $timestamp_str = $year . '-' . $month . '-' . $mday . ' ' . $hour . ':' . $min . ':' . $sec;
return $timestamp_str;
}