https://github.com/STAR-Fusion/STAR-Fusion
Raw File
Tip revision: 59a87fdce99b5dcf4645046fbd3cbd67c524dbd9 authored by Brian Haas on 20 July 2017, 13:41:25 UTC
FI update
Tip revision: 59a87fd
STAR-Fusion
#!/usr/bin/env perl

# contributed by Brian Haas, Broad Institute, 2015

use strict;
use warnings;
use Carp;
use Cwd;
use FindBin;
use lib ("$FindBin::Bin/PerlLib");
use Pipeliner;
use Set::IntervalTree;
use File::Basename;
use Process_cmd;

use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through);

my $VERSION = "0.8.0";


my $BASEDIR = "$FindBin::Bin";
my $UTILDIR = "$BASEDIR/util";

## Options
my $output_dir = "STAR-Fusion_outdir";
my $out_prefix = "star-fusion";
my $chimeric_junction_file;
my $help_flag;
my $MIN_NOVEL_JUNCTION_SUPPORT = 3;
my $MIN_ALT_PCT_JUNCTION = 10.0;
my $AGGREGATE_NOVEL_JUNCTION_DIST = 5;
my $Evalue = 1e-3;
my $tmpdir = "/tmp";
my $verbose_level = 2;
my $MIN_JUNCTION_READS = 1;
my $MIN_SUM_FRAGS = 2;
my $MAX_PROMISCUITY = 3;  # perhaps a poor choice of words, but still a best fit IMHO.
my $genome_lib_dir;
my $CPU = 4; 
my $REQUIRE_LDAS = 1;

my $STAR_MAX_MATE_DIST = 100000;

my $usage = <<__EOUSAGE__;


###################################################################################
#
#  Required:
#
#  To include running STAR:
#
#      --left_fq <string>                    left.fq file (or single.fq)
#
#      --right_fq <string>                   right.fq file  (actually optional, but highly recommended)
#
#  Or use output from earlier STAR run:
#
#      --chimeric_junction|J <string>        Chimeric.out.junction file
#
#
#    --genome_lib_dir <string>             directory containing genome lib (see http://STAR-Fusion.github.io)
#
#    --output_dir|O <string>               output directory  (default: $output_dir) 
#
#
#
#  Optional:
#
#    --annotate                            annotate fusions based on known cancer fusions and those found in normal tissues
#
#    --examine_coding_effect               explore impact of fusions on coding sequences
#
#    --extract_fusion_reads                retrieves the fusion supporting reads from the fastq files
#
#    --FusionInspector <inspect|validate>  include FusionInspector, options:
#                                               'inspect' - considers only StarFusion-identified fusion reads in context of identified fusions (fast)
#                                               'validate' - examines all reads, recovers evidence, computes fusion allele fractions (slow) 
#
#    --denovo_reconstruct                  attempt to reconstruct fusion transcripts using de novo assembly (requires --FusionInspector)
#
#    --CPU <int>                           number of threads for running STAR (default: $CPU)
#
#    --min_junction_reads <int>            minimum number of junction-spanning reads required. Default: $MIN_JUNCTION_READS
#
#    --min_sum_frags <int>                 minimum fusion support = ( # junction_reads + # spanning_frags ) Default: $MIN_SUM_FRAGS
#
#    --require_LDAS 0|1                    require long double anchor support for split reads
#                                           when no spanning frags are found (default: 1)
#
#    --max_promiscuity <int>               maximum number of partners allowed for a given fusion. Default: $MAX_PROMISCUITY
#
#
#    --min_novel_junction_support <int>    default: $MIN_NOVEL_JUNCTION_SUPPORT  (minimum of $MIN_NOVEL_JUNCTION_SUPPORT junction reads required if breakpoint
#                                                        lacks involvement of only reference junctions)
#
#    --min_alt_pct_junction <float>        default: $MIN_ALT_PCT_JUNCTION  ($MIN_ALT_PCT_JUNCTION % of the dominant isoform junction support)
#
#    --aggregate_novel_junction_dist <int>  default: $AGGREGATE_NOVEL_JUNCTION_DIST (non-ref junctions within $AGGREGATE_NOVEL_JUNCTION_DIST are merged into single calls)
#
#    -E <float>                            E-value threshold for blast searches (default: 0.001)
#
#    --tmpdir <string>                     file for temporary files (default: /tmp)
#
#    --verbose_level <int>                 verbosity (default: $verbose_level, max=2)
#
#    --no_filter                           do not filter predictions.
# 
#    --STAR_use_shared_memory              use shared memory among multiple processes for the STAR alignment step
#                                           # note, when this option is used, twopass mode is disabled, as it's incompatible w/ shared mem.
#                                          ** (when all your jobs are done, be sure to run: 'STAR --genomeLoad Remove' to unload it) **
#
#    --STAR_max_mate_dist <int>            maximum distance between mates  (also used for the maximum intron length value)  default:  $STAR_MAX_MATE_DIST
#
#    --version                             report version ($VERSION)
#
###################################################################################


__EOUSAGE__

    ;


my $no_filter = 0;


my $left_fq_filename = "";
my $right_fq_filename = "";
my $chimeric_out_junctions_file = "";

my $REPORT_VERSION = 0;
my $USE_SHARED_MEMORY = 0;

my $EXTRACT_FUSION_READS = 0;
my $FUSIONINSPECTOR_MODE = 0;
my $DENOVO_RECONSTRUCT = 0;
my $ANNOTATE = 0;
my $EXAMINE_CODING_EFFECT = 0;

&GetOptions ( 'help|h' => \$help_flag,
              
              'left_fq=s' => \$left_fq_filename,
              'right_fq=s' => \$right_fq_filename,
              
              'chimeric_junction|J=s' => \$chimeric_out_junctions_file,

              'min_junction_reads=i' => \$MIN_JUNCTION_READS,
              'min_sum_frags=i' => \$MIN_SUM_FRAGS,
              'max_promiscuity=i' => \$MAX_PROMISCUITY,
              
              'require_LDAS=i' => \$REQUIRE_LDAS,
              
              'min_novel_junction_support=i' => \$MIN_NOVEL_JUNCTION_SUPPORT,
              'min_alt_pct_junction=f' => \$MIN_ALT_PCT_JUNCTION,
              'aggregate_novel_junction_dist=i' => \$AGGREGATE_NOVEL_JUNCTION_DIST,
              'output_dir|O=s' => \$output_dir,
              
              'E=f' => \$Evalue,
              'tmpdir=s' => \$tmpdir,
              'verbose_level=i' => \$verbose_level,

              'annotate' => \$ANNOTATE,
              'examine_coding_effect' => \$EXAMINE_CODING_EFFECT,
              
              'no_filter' => \$no_filter,
    
              'genome_lib_dir=s' => \$genome_lib_dir,
              
              'version' => \$REPORT_VERSION,
              
              'CPU=i' => \$CPU,

              'STAR_use_shared_memory' => \$USE_SHARED_MEMORY,
              'STAR_MAX_MATE_DIST=i' => \$STAR_MAX_MATE_DIST,

              'extract_fusion_reads' => \$EXTRACT_FUSION_READS,
              'FusionInspector=s' => \$FUSIONINSPECTOR_MODE,
              'denovo_reconstruct' => \$DENOVO_RECONSTRUCT,
              

    );


if ($help_flag) {
    die $usage;
}

if ($REPORT_VERSION) {
    print "\n\nSTAR-Fusion version: $VERSION\n\n";
    exit(0);
}

unless ( ($left_fq_filename || $chimeric_out_junctions_file) && $genome_lib_dir) {
    die $usage;
}


if ($FUSIONINSPECTOR_MODE) {
    unless ($FUSIONINSPECTOR_MODE =~ /^(inspect|validate)$/) {
        die "Error, --FusionInspector option can be 'inspect' or 'validate', [$FUSIONINSPECTOR_MODE] not recognized ";
    }
    $EXTRACT_FUSION_READS = 1; # just in case.
}

if ($EXTRACT_FUSION_READS) {
    unless ($left_fq_filename) {
        die "Error, need --left_fq (and --right_fq for PE reads) set with --FusionInspector or --extract_fusion_reads ";
    }
}

if ($DENOVO_RECONSTRUCT && ! $FUSIONINSPECTOR_MODE) {
    die "Error, --denovo_reconstruct requires that you employ --FusionInspector ";
}


if (@ARGV) {
    die "Error, don't understand arguments: [@ARGV] ";
}


unless (-d $genome_lib_dir) {
    die "Error, cannot locate genome_lib_dir: $genome_lib_dir";
}

main: {
    
    $output_dir = &ensure_full_path($output_dir);
    $genome_lib_dir = &ensure_full_path($genome_lib_dir);
    $left_fq_filename = &ensure_full_path($left_fq_filename) if $left_fq_filename;
    $right_fq_filename = &ensure_full_path($right_fq_filename) if $right_fq_filename;
    $chimeric_out_junctions_file = &ensure_full_path($chimeric_out_junctions_file) if $chimeric_out_junctions_file;

    unless (-d $output_dir) {
        &process_cmd("mkdir -p $output_dir");
    }
    
    chdir $output_dir or die "Error, cannot cd to $output_dir";

    my $checkpoints_dir = "$output_dir/_starF_checkpoints";
    unless (-d $checkpoints_dir) {
        &process_cmd("mkdir -p $checkpoints_dir");
    }
    
    my $preliminary_outdir = "$output_dir/star-fusion.preliminary";
    unless (-d $preliminary_outdir) {
        &process_cmd("mkdir -p $preliminary_outdir");
    }
    
    if ($out_prefix =~ m|/$|) {
        # not allowing to end as a directory, must be a filename
        die "Error, --out_prefix must be a file name and not a directory name, although you can include directories in the path to the file name to be created.";
    }
    
    my $out_prefix_basename = basename($out_prefix);
    
    my $pipeliner = new Pipeliner(-verbose => $verbose_level);
    
    unless ($chimeric_out_junctions_file) {
        $chimeric_out_junctions_file = "Chimeric.out.junction";
        
        my $star_index_dir = "$genome_lib_dir/ref_genome.fa.star.idx";
        unless (-d $star_index_dir) {
            die "Error, cannot locate star index at $star_index_dir";
        }
        
        ## run STAR to align reads:
        my $cmd = "STAR --genomeDir $star_index_dir "
            . " --readFilesIn $left_fq_filename $right_fq_filename "
            . " --outReadsUnmapped None "
            . " --chimSegmentMin 12 "
            . " --chimJunctionOverhangMin 12 "
            . " --alignSJDBoverhangMin 10 "   
            . " --alignMatesGapMax $STAR_MAX_MATE_DIST "
            . " --alignIntronMax $STAR_MAX_MATE_DIST "
            . " --chimSegmentReadGapMax 3 "
            . " --alignSJstitchMismatchNmax 5 -1 5 5 "  #which allows for up to 5 mismatches for non-canonical GC/AG, and AT/AC junctions, and any number of mismatches for canonical junctions (the default values 0 -1 0 0 replicate the old behavior (from AlexD)
            . " --runThreadN $CPU"
            . " --limitBAMsortRAM 31532137230 "
            . " --outSAMstrandField intronMotif "
            . " --outSAMtype BAM SortedByCoordinate ";
        
        if ($USE_SHARED_MEMORY) {
            $cmd .= " --genomeLoad LoadAndKeep ";
        }
        else {
            # note, twopassMode is incompatible with shared memory (--genomeLoad LoadAndKeep)
            $cmd .= " --twopassMode Basic ";
        }
        
        if ($left_fq_filename =~ /\.gz$/) {
            $cmd .= " --readFilesCommand zcat ";
        }
        
        $pipeliner->add_commands(Command->new($cmd, "$checkpoints_dir/star.ok"));
    }
    
    ## predict fusions

    
    my $cmd = "$UTILDIR/STAR-Fusion.predict "
        . " -J $chimeric_out_junctions_file "
        . " --genome_lib_dir $genome_lib_dir "
        . " --min_junction_reads $MIN_JUNCTION_READS "
        . " --min_sum_frags $MIN_SUM_FRAGS "
        . " --min_novel_junction_support $MIN_NOVEL_JUNCTION_SUPPORT "
        . " -O $preliminary_outdir/$out_prefix ";
    
    $pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/$out_prefix.STAR-Fusion.predict.ok"));
    
    my $predicted_fusions_file = "$preliminary_outdir/$out_prefix.fusion_candidates.preliminary";
    
    ## add breakpoint / splice junction info

    my $predicted_fusions_with_splice_info = "$predicted_fusions_file.wSpliceInfo";

    $cmd = "$UTILDIR/append_breakpoint_junction_info.pl $predicted_fusions_file $genome_lib_dir > $predicted_fusions_with_splice_info";

    $pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/predicted_fusions_with_splice_info.ok"));

    my $prelim_fusion_file = $predicted_fusions_with_splice_info;
    
    unless ($no_filter) {
           
        ## filter fusions
        
        $cmd = "$UTILDIR/STAR-Fusion.filter "
            . " --fusion_preds $predicted_fusions_with_splice_info "
            . " -E $Evalue "
            . " --tmpdir $tmpdir "
            . " --min_junction_reads $MIN_JUNCTION_READS "
            . " --min_sum_frags $MIN_SUM_FRAGS "
            . " --require_LDAS $REQUIRE_LDAS "
            . " --max_promiscuity $MAX_PROMISCUITY "
            . " --min_novel_junction_support $MIN_NOVEL_JUNCTION_SUPPORT "
            . " --min_alt_pct_junction $MIN_ALT_PCT_JUNCTION "
            . " --aggregate_novel_junction_dist $AGGREGATE_NOVEL_JUNCTION_DIST "
            . " --genome_lib_dir $genome_lib_dir "
            . " --out_prefix $preliminary_outdir/$out_prefix";
        
        $pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/STAR-Fusion.filter.ok"));
        

        $prelim_fusion_file = "$preliminary_outdir/$out_prefix.fusion_candidates.preliminary.filtered";
        
        
    }

    if ($left_fq_filename) {
        ## convert vals to FFPM
        $cmd = "$UTILDIR/incorporate_FFPM_into_final_report.pl $left_fq_filename $prelim_fusion_file > $prelim_fusion_file.FFPM";
        $pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/FFPM.ok") );

        $prelim_fusion_file = "$prelim_fusion_file.FFPM";
    }

    # generate final fusion prediction output files:
    my $final_output_fusions = "$out_prefix.fusion_predictions.tsv";
    $pipeliner->add_commands(new Command("cp $prelim_fusion_file $final_output_fusions", "$checkpoints_dir/cp_final.ok"));
    
    ## make abridged versions:
    my $abridged_final_output = $final_output_fusions;
    $abridged_final_output =~ s/\.tsv$/.abridged.tsv/;
    $pipeliner->add_commands(new Command("$UTILDIR/column_exclusions.pl $final_output_fusions JunctionReads,SpanningFrags > $abridged_final_output", "$checkpoints_dir/abridged_final"));

    
    if ($ANNOTATE) {
        ## annotate fusions.

        my $annotated_fusions_filename = $abridged_final_output;
        $annotated_fusions_filename =~ s/\.tsv$/.annotated.tsv/;
        $cmd = "$BASEDIR/FusionAnnotator/FusionAnnotator --genome_lib_dir $genome_lib_dir --annotate $abridged_final_output > $annotated_fusions_filename";
        $pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/fusion_annotator.ok"));
        $abridged_final_output = $annotated_fusions_filename;
    }
    
    if ($EXAMINE_CODING_EFFECT) {
        
        my $coding_eff_filename = $abridged_final_output;
        $coding_eff_filename =~ s/\.tsv$/.coding_effect.tsv/; 
        
        $cmd = "$BASEDIR/FusionAnnotator/util/fusion_to_coding_region_effect.pl  --fusions $abridged_final_output --genome_lib_dir $genome_lib_dir > $coding_eff_filename";
        $pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/coding_eff.ok"));
        $abridged_final_output = $coding_eff_filename;
    }
    
    if ($EXTRACT_FUSION_READS) {
        my $cmd = "$UTILDIR/get_FUSION_EVIDENCE_fastqs.pl --fusions $out_prefix.fusion_predictions.tsv --output_prefix $out_prefix --left_fq $left_fq_filename";
        if ($right_fq_filename) {
            $cmd .= " --right_fq $right_fq_filename";
        }
        $pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/extract_fusion_reads.ok"));
    }
    
    $pipeliner->run();
    
    if ($no_filter) {
         
        print STDERR "\n\n *** Note: --no_filtering in effect, so outputs have not been filtered for likely false positives.\n";
        print STDERR "\n\tSee output: $predicted_fusions_with_splice_info\n\n";
    }
    else {
        
        print STDERR "\n\n\t* STAR-Fusion complete.  See output: $out_prefix.fusion_candidates.final (or .abridged version)\n\n\n";
    }
    
    
    if ($FUSIONINSPECTOR_MODE) {

        my $FI_outdir = "$output_dir/FusionInspector";

        my $FI_cmd = "";
        
        if ($FUSIONINSPECTOR_MODE eq 'inspect') {
            ## use the extracted fusion evidence reads
            my $FI_left_fq = "$out_prefix.fusion_evidence_reads_1.fq";
            $FI_cmd = "$BASEDIR/FusionInspector/FusionInspector --fusions $abridged_final_output "
                . " --out_prefix finspector --min_junction_reads $MIN_JUNCTION_READS --min_novel_junction_support $MIN_NOVEL_JUNCTION_SUPPORT "
                . " --prep_for_IGV --max_promiscuity $MAX_PROMISCUITY --out_dir $FI_outdir "
                . " --genome_lib_dir $genome_lib_dir --only_fusion_reads --fusion_contigs_only "
                . " --left_fq $FI_left_fq";
            if ($right_fq_filename) {
                my $FI_right_fq = "$out_prefix.fusion_evidence_reads_2.fq";
                $FI_cmd .= " --right_fq $FI_right_fq ";
            }
            
            $pipeliner->add_commands(new Command($cmd, "_fi_inspect.ok"));
        }
        else {
            # validate mode
            # use all original reads.
            $FI_cmd = "$BASEDIR/FusionInspector/FusionInspector --fusions $abridged_final_output "
                . " --out_prefix finspector --min_junction_reads $MIN_JUNCTION_READS --min_novel_junction_support $MIN_NOVEL_JUNCTION_SUPPORT "
                . " --prep_for_IGV --max_promiscuity $MAX_PROMISCUITY --out_dir $FI_outdir "
                . " --genome_lib_dir $genome_lib_dir "
                . " --left_fq $left_fq_filename";
            if ($right_fq_filename) {
                $FI_cmd .= " --right_fq $right_fq_filename ";
            }
        }

        if ($DENOVO_RECONSTRUCT) {
            $FI_cmd .= " --include_Trinity ";
        }
        if ($ANNOTATE) {
            $FI_cmd .= " --annotate ";
        }
        if ($EXAMINE_CODING_EFFECT) {
            $FI_cmd .= " --examine_coding_effect ";
        }
                
        $pipeliner->add_commands(new Command($FI_cmd, "_fi_${FUSIONINSPECTOR_MODE}_" . length($FI_cmd) . ".ok"));
        
        
        $pipeliner->run();
    }
    
    
    exit(0);
    
}


####
sub missing_required_program_installed {
    my (@required_progs) = @_;

    my $missing = 0;
    
    foreach my $prog (@required_progs) {

        my $path = `which $prog`;
        chomp $path;
        unless ($path && $path =~ /\w/) {
            $missing++;
            print STDERR "Error, cannot locate required program: $prog\n";
        }
        else {
            print STDERR "-found prog $prog at $path\n";
        }
    }
    
    return($missing);
}

back to top