https://github.com/STAR-Fusion/STAR-Fusion
Raw File
Tip revision: e522c236a1b068ab1217855209b4706e2f8ef153 authored by Brian Haas on 27 October 2023, 18:40:42 UTC
fusionfilter update
Tip revision: e522c23
STAR-Fusion
#!/usr/bin/env perl

use strict;
use warnings;
use Carp;
use Cwd;
use FindBin;
use lib ("$FindBin::Bin/PerlLib");
use Pipeliner;
use File::Basename;
use Process_cmd;

use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through);

my $VERSION = "1.13.0";
#my $VERSION = "__BLEEDING_EDGE__ Do not use - pull versioned release from: https://github.com/STAR-Fusion/STAR-Fusion/releases";


my @MIN_STAR_MAJOR_MINOR_VERSION = (2, 7, 8, 'a');


my $BASEDIR = "$FindBin::Bin";
my $UTILDIR = "$BASEDIR/util";
my $PLUGINSDIR = "$BASEDIR/plugins";

## Options
my $output_dir = "STAR-Fusion_outdir";
my $out_prefix = "star-fusion";
my $help_flag;
my $MIN_NOVEL_JUNCTION_SUPPORT = 3;
my $MIN_ALT_PCT_JUNCTION = 10.0;
my $AGGREGATE_NOVEL_JUNCTION_DIST = 5;
my $tmpdir = "/tmp";
my $verbose_level = 2;

my $MIN_FFPM = 0.1;

my $MIN_JUNCTION_READS = 1;
my $MIN_SUM_FRAGS = 2; # requires at least one junction read, else see min_spanning_frags_only below.
my $MIN_SPANNING_FRAGS_ONLY = 5;

my $MAX_PROMISCUITY = 10;
my $MIN_PCT_DOM_PROM = 20;

my $genome_lib_dir = $ENV{CTAT_GENOME_LIB};
my $CPU = 4;
my $outTmpDir=undef;
my $REQUIRE_LDAS = 1;


my $STAR_peOverlapNbasesMin = 12; # minimum number of bases in the overlap
my $STAR_peOverlapMMp = 0.1; # the max proportion of mismatches in the overlap


my $STAR_chimMultimapScoreRange = 3;
my $STAR_chimMultimapNmax = 20;
my $STAR_chimNonchimScoreDropMin = 10;
my $STAR_chimScoreJunctionNonGTAG = -4;


my $STAR_MAX_MATE_DIST = 100000;

my $STAR_SJDB_OVERHANG_MIN = 10;

my $MIN_PCT_MM_NONSPECIFIC = 50;

my $STAR_SortedByCoordinate = 0;
my $STAR_limitBAMsortRAM = "40G";

my $NO_REMOVE_DUPS = 0;

my $RUN_STAR_ONLY_FLAG = 0;

my $SKIP_EM_FLAG = 0;
my $SKIP_FFPM_FLAG = 0;


my $logo='
############################################################################################
#
#      _________________________ __________         ___________           .__
#     /   _____/\__    ___/  _  \\\______   \        \_   _____/_ __  _____|__| ____   ____
#     \_____  \   |    | /  /_\  \|       _/  ______ |    __)|  |  \/  ___/  |/  _ \ /    \
#     /        \  |    |/    |    \    |   \ /_____/ |     \ |  |  /\___ \|  (  <_> )   |  \
#    /_______  /  |____|\____|__  /____|_  /         \___  / |____//____  >__|\____/|___|  /
#            \/                 \/       \/              \/             \/               \/
#
#
############################################################################################';

my $usage = <<__EOUSAGE__;

$logo
#
#  Required:
#
#  To include running STAR:
#
#      --left_fq <string>                    left.fq file (or single.fq)
#
#      --right_fq <string>                   right.fq file  (actually optional, but highly recommended)
#

#  Or use output from earlier STAR run:
#
#      --chimeric_junction|J <string>        Chimeric.out.junction file
#
#
#    --genome_lib_dir <string>             directory containing genome lib (see http://STAR-Fusion.github.io)
#                                          (required to specify, unless env var CTAT_GENOME_LIB is set to it)
#                                          Easiest - get plug-n-play version from:
#                                            < https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/ >
#
#  Optional:
#
#    --CPU <int>                           number of threads for running STAR (default: $CPU)
#
#    --output_dir|O <string>               output directory  (default: $output_dir)
#
#    --show_full_usage_info                provide full usage info.
#
#
#######################
#
#   Quick guide to running:
#
#       STAR-Fusion --left_fq reads_1.fq  --right_fq reads_2.fq --genome_lib_dir /path/to/ctat_genome_lib_build_dir
#
############################################################################################

__EOUSAGE__

    ;


my $extended_usage_info = <<__EXTENDED_USAGE__
#
# STAR program configurations:
#
#    --STAR_max_mate_dist <int>            maximum distance between mates  (also used for the maximum intron length value)  default:  $STAR_MAX_MATE_DIST
#
#    --STAR_SJDBoverhangMin <int>          minimum overhang for annotated junctions default:  $STAR_SJDB_OVERHANG_MIN
#
#    --STAR_SortedByCoordinate             have STAR sort resulting bam file by coordinate
#    --STAR_limitBAMsortRAM <string>       num GB of RAM, default:  ${STAR_limitBAMsortRAM}
#
#    --STAR_PATH <string>                  /path/to/STAR  (default: uses STAR found in PATH setting).
#
#    --run_STAR_only                       stop after running STAR aligner (essentially just generating the Chimeric.out.junction file)
#
#    --STAR_twopass                        do STAR two-pass instead of two pass (one pass is default)
#
#
#   Stitching overlapping reads settings:
#          --STAR_peOverlapNbasesMin <int>    minimum number of bases in the overlap (default: $STAR_peOverlapNbasesMin)
#          --STAR_peOverlapMMp <int>          the max proportion of mismatches in the overlap (default: $STAR_peOverlapMMp)
#
#   STAR chim multi-map opts:
#          --STAR_chimMultimapScoreRange <int>     the score range for multi-mapping chimeras below the best chimeric score. (default: $STAR_chimMultimapScoreRange)
#          --STAR_chimMultimapNmax <int>           maximum number of multi-alignments for the chimera (default: $STAR_chimMultimapNmax)
#          --STAR_chimNonchimScoreDropMin <int>    to trigger chimeric detection, the drop in the best non-chimeric alignment score with respect to the read length has to be smaller than this value (default: $STAR_chimNonchimScoreDropMin)
#          --STAR_outSAMattrRGline <str>           pass through string value to STAR as --outSAMattrRGline  for setting read group name info in bam.  ie. --STAR_outSAMattrRGline "ID:myGRP"
#
# Chimeric read filtering parameters:
#
#    --min_pct_MM_nonspecific|M <int>     min pct of multimapping reads that should map to 2 genes
#                                         (avoids multimapping reads that lack specificity)
#                                           value must be between [1,100] (default: $MIN_PCT_MM_NONSPECIFIC)
##
# Fusion transcript filtering:
#
#    --no_filter                           do not filter predictions. (stops execution prior to the filtering stage).
#
#    --outTmpDir <string>	           passed to STAR (very useful if local disks are faster than network disks)
#
#    --min_junction_reads <int>            minimum number of junction-spanning reads required. Default: $MIN_JUNCTION_READS
#
#    --min_sum_frags <int>                 minimum fusion support = ( # junction_reads + # spanning_frags ) Default: $MIN_SUM_FRAGS
#
#    --require_LDAS 0|1                    require long double anchor support for split reads
#                                           when no spanning frags are found (default: 1)
#
#
#    --max_promiscuity <int>               maximum number of partners allowed for a given fusion. Default: $MAX_PROMISCUITY
#
#    --min_pct_dom_promiscuity <int>       for promiscuous fusions, those with less than this support of the dominant scoring pair
#                                          are filtered prior to applying the max_promiscuity filter.
#                                          (default: $MIN_PCT_DOM_PROM)
#
#    --aggregate_novel_junction_dist <int>  default: $AGGREGATE_NOVEL_JUNCTION_DIST (non-ref junctions within $AGGREGATE_NOVEL_JUNCTION_DIST
#                                           are merged into single calls)
#
#    --min_novel_junction_support <int>    default: $MIN_NOVEL_JUNCTION_SUPPORT  (minimum of $MIN_NOVEL_JUNCTION_SUPPORT
#                                          junction reads required if breakpoint
#                                                        lacks involvement of only reference junctions)
#
#    --min_spanning_frags_only <int>        minimum number of rna-seq fragments required as fusion evidence if
#                                          there are no junction reads (default: $MIN_SPANNING_FRAGS_ONLY)
#
#    --min_alt_pct_junction <float>        default: $MIN_ALT_PCT_JUNCTION  ($MIN_ALT_PCT_JUNCTION % of the dominant isoform
#                                          junction support)
#
#    --min_FFPM <float>                    minimum FFPM (fusion fragments per million rna-seq frags)  (default: $MIN_FFPM)
#
#    --no_remove_dups                      do not remove duplicate reads
#
#    --skip_EM                             skips expectation maximization based fractional assignment of spanning fragments across multiple splice breakpoint fusion calls.
#
#    --skip_FFPM                           skip FFPM computation.  FFPM is only computed correctly and meaningfully on per-sample analysis, or use of --samples_file where all entries in the samples file correspond to a single biological sample (ie. multiple lanes of seq, or replicates that are to be run as a single entity).
#
#   **  Turn off specific fusion filters:
#
#    --no_annotation_filter                exclude filtering of fusion predictions based on annotations
#                                               (ie. by default, removing fusions found in normal (non-cancer) samples))
#
#    --no_RT_artifact_filter               exclude filtering of likely RT artifacts
#                                               (by default, fusions w/o GT-AG, GC-AG, or AT-AC splice junctions at fusion breakpoint are excluded
#                                                as likely artifacts, with few exceptions of known fusions that involve intra-exon breakpoints. ie. IGH-fusions)
#
#    --no_single_fusion_per_breakpoint     exclude filtering of potentially superfluous fusions involving different gene
#                                                annotations overlapping the same fusion breakpoint.
#
# Downstream analysis of fusion candidates:
#
#    --examine_coding_effect               explore impact of fusions on coding sequences
#
#    --extract_fusion_reads                retrieves the fusion supporting reads from the fastq files
#
#    --FusionInspector <inspect|validate>  include FusionInspector, options:
#                                               'inspect' - considers only StarFusion-identified fusion reads in context of identified fusions (fast)
#                                               'validate' - examines all reads, recovers evidence, computes fusion allele fractions (slow)
#
#    --denovo_reconstruct                  attempt to reconstruct fusion transcripts using Trinity de novo assembly (requires --FusionInspector)
#
#    --misc_FI_opts <string>               additional FI-specific options to pass on to FusionInspector
#
#    --version                             report version ($VERSION)
#
# STAR shared memory parameters (see STAR-Fusion usage documentation for details):
#
#    --STAR_use_shared_memory              use shared memory among multiple processes for the STAR alignment step
#                                           # note, when this option is used, twopass mode is disabled, as it's incompatible w/ shared mem.
#                                          ** (when all your jobs are done, be sure to run: 'STAR-Fusion --STAR_Remove' to unload it) **
#
#    --STAR_LoadAndExit                    runs STAR to load the genome index into shared RAM, then exits.
#
#    --STAR_Remove                         removes genome from shared RAM and exits.
#
#
## Support for single-cell data
#
#  ## SmartSeq2
#
#      --samples_file <string>               file containing list of targeted files, separate fastqs for each cell.
#                                            (with format:
#                                                sample_name(tab)/path/to/left.fq(tab)/path/to/right.fq
#                                             and for single-end reads, exclude the 2nd read path. )
#
# Remaining misc. options:
#
#    --tmpdir <string>                     file for temporary files (default: /tmp)
#
#    --verbose_level <int>                 verbosity (default: $verbose_level, max=2)
#
#    --max_sensitivity                     includes options: --min_junction_reads 0 --min_sum_frags 1 --require_LDAS 0 --min_spanning_frags_only 1 --min_novel_junction_support 1 --skip_FFPM --no_single_fusion_per_breakpoint --skip_EM
#
#    --full_Monty                          includes max sensitivity and disables filters via: --max_promiscuity 1000000 --min_pct_dom_promiscuity 1 --min_alt_pct_junction 1e-3 --no_annotation_filter --no_RT_artifact_filter
#
#
###################################################################################


__EXTENDED_USAGE__

    ;


my $no_filter = 0;
my $MAX_SENSITIVITY_FLAG = 0;
my $FULL_MONTY_FLAG = 0;

my $left_fq_filename = "";
my $right_fq_filename = "";

my $samples_file = "";

my $chimeric_out_junctions_file = "";

my $REPORT_VERSION = 0;
my $USE_SHARED_MEMORY = 0;

my $EXTRACT_FUSION_READS = 0;
my $FUSIONINSPECTOR_MODE = 0;
my $DENOVO_RECONSTRUCT = 0;
my $EXAMINE_CODING_EFFECT = 0;

my $STAR_PROG;
my $DEVEL_STAR;

my $STAR_LOAD_AND_EXIT;
my $STAR_REMOVE;
my $STAR_twopass = 0;

my $NO_ANNOTATION_FILTER = 0;
my $NO_RT_ARTIFACT_FILTER = 0;
my $NO_SINGLE_FUSION_PER_BREAKPOINT = 0;


my $SHOW_FULL_USAGE_INFO;

my $MISC_FI_OPTS = "";

my $outSAMattrRGline = "";



&GetOptions ( 'help|h' => \$help_flag,

              'left_fq=s' => \$left_fq_filename,
              'right_fq=s' => \$right_fq_filename,
              'samples_file=s' => \$samples_file,

              'chimeric_junction|J=s' => \$chimeric_out_junctions_file,

              'min_junction_reads=i' => \$MIN_JUNCTION_READS,
              'min_sum_frags=i' => \$MIN_SUM_FRAGS,
              'max_promiscuity=i' => \$MAX_PROMISCUITY,
              'min_pct_dom_promiscuity=f' => \$MIN_PCT_DOM_PROM,
              'min_spanning_frags_only=i' => \$MIN_SPANNING_FRAGS_ONLY,
              'min_pct_MM_nonspecific|M=i' => \$MIN_PCT_MM_NONSPECIFIC,
              
              'require_LDAS=i' => \$REQUIRE_LDAS,

              'min_novel_junction_support=i' => \$MIN_NOVEL_JUNCTION_SUPPORT,
              'min_alt_pct_junction=f' => \$MIN_ALT_PCT_JUNCTION,
              'aggregate_novel_junction_dist=i' => \$AGGREGATE_NOVEL_JUNCTION_DIST,
              'output_dir|O=s' => \$output_dir,

              'tmpdir=s' => \$tmpdir,
              'verbose_level=i' => \$verbose_level,

              'examine_coding_effect' => \$EXAMINE_CODING_EFFECT,

              'min_FFPM=f' => \$MIN_FFPM,

              'no_filter' => \$no_filter,

              'no_annotation_filter' => \$NO_ANNOTATION_FILTER,
              'no_RT_artifact_filter' => \$NO_RT_ARTIFACT_FILTER,
              'no_single_fusion_per_breakpoint' => \$NO_SINGLE_FUSION_PER_BREAKPOINT,

              'genome_lib_dir=s' => \$genome_lib_dir,

              'version' => \$REPORT_VERSION,

              'CPU=i' => \$CPU,
              'outTmpDir=s' => \$outTmpDir,

              # shared memory options
              'STAR_use_shared_memory' => \$USE_SHARED_MEMORY,
              'STAR_LoadAndExit' => \$STAR_LOAD_AND_EXIT,
              'STAR_Remove' => \$STAR_REMOVE,

              'STAR_max_mate_dist=i' => \$STAR_MAX_MATE_DIST,
              'STAR_SJDBoverhangMin=i' => \$STAR_SJDB_OVERHANG_MIN,


              'STAR_SortedByCoordinate' => \$STAR_SortedByCoordinate,
              'STAR_limitBAMsortRAM=s' => \$STAR_limitBAMsortRAM,

              'STAR_PATH=s' => \$STAR_PROG,

              # other STAR settings
              'STAR_peOverlapNbasesMin=i' => \$STAR_peOverlapNbasesMin,
              'STAR_peOverlapMMp=i' => \$STAR_peOverlapMMp,

              'STAR_chimMultimapScoreRange=i' => \$STAR_chimMultimapScoreRange,
              'STAR_chimMultimapNmax=i' => \$STAR_chimMultimapNmax,
              'STAR_chimNonchimScoreDropMin=i' => \$STAR_chimNonchimScoreDropMin,

              "STAR_outSAMattrRGline=s" => \$outSAMattrRGline,
              
              "STAR_twopass" => \$STAR_twopass,

              'extract_fusion_reads' => \$EXTRACT_FUSION_READS,
              'FusionInspector=s' => \$FUSIONINSPECTOR_MODE,
              'denovo_reconstruct' => \$DENOVO_RECONSTRUCT,

              'no_remove_dups' => \$NO_REMOVE_DUPS,

              'DEVEL_STAR' => \$DEVEL_STAR,

              'show_full_usage_info' => \$SHOW_FULL_USAGE_INFO,

              'run_STAR_only' => \$RUN_STAR_ONLY_FLAG,

              'max_sensitivity' => \$MAX_SENSITIVITY_FLAG,

              'full_Monty' => \$FULL_MONTY_FLAG,

              'skip_EM' => \$SKIP_EM_FLAG,
              'skip_FFPM' => \$SKIP_FFPM_FLAG,

              'misc_FI_opts=s' => \$MISC_FI_OPTS,

              
    );



if ($SHOW_FULL_USAGE_INFO) {
    print "$usage$extended_usage_info";
    exit(0);
}


if ($help_flag) {
    print "$usage";
    exit(0);
}

if ($REPORT_VERSION) {
    print "\n\nSTAR-Fusion version: $VERSION\n\n";
    exit(0);
}


if ($FULL_MONTY_FLAG) {
    $MAX_SENSITIVITY_FLAG = 1;
    # includes max sensitivity and disables filters via: --max_promiscuity 1000000 --min_pct_dom_promiscuity 1 --min_alt_pct_junction 1e-3 --no_annotation_filter --no_RT_artifact_filter

    $MAX_PROMISCUITY = 1000000;
    $MIN_PCT_DOM_PROM = 1;
    $MIN_ALT_PCT_JUNCTION = 1e-3;
    $NO_ANNOTATION_FILTER = 1;
    $NO_RT_ARTIFACT_FILTER = 1;

}
if ($MAX_SENSITIVITY_FLAG) {
    # includes options: --min_junction_reads 0 --min_sum_frags 1 --require_LDAS 0 --min_spanning_frags_only 1 --min_novel_junction_support 1 --min_FFPM 0 --no_single_fusion_per_breakpoint
    $MIN_JUNCTION_READS = 0;
    $MIN_SUM_FRAGS = 1;
    $REQUIRE_LDAS = 0;
    $MIN_SPANNING_FRAGS_ONLY = 1;
    $MIN_NOVEL_JUNCTION_SUPPORT = 1;
    $MIN_FFPM = 0;
    $NO_SINGLE_FUSION_PER_BREAKPOINT = 1;
    $SKIP_EM_FLAG = 1;
    $SKIP_FFPM_FLAG = 1;
}


if ($STAR_PROG) {
    unless (-e $STAR_PROG) {
        die "Error, STAR program set to $STAR_PROG but can't be found";
    }
    # adding path to star to the PATH setting in case it needs to be used for FusionInspector later on.
    my $star_basedir = &ensure_full_path(dirname($STAR_PROG));
    $ENV{PATH} = "$star_basedir:$ENV{PATH}";
}
else {
    $STAR_PROG = `which STAR`;
    chomp $STAR_PROG;
    unless ($STAR_PROG =~ /\w/) {
        die "Error, cannot locate STAR program in the PATH setting";
    }
}

&check_compatible_STAR_version($STAR_PROG, \@MIN_STAR_MAJOR_MINOR_VERSION);


if ($STAR_limitBAMsortRAM =~ /^(\d+)G$/) {
    $STAR_limitBAMsortRAM = $1 * (1024**3);
}
else {
    die "Error, cannot parse G of RAM from: $STAR_limitBAMsortRAM ";
}


if ($FUSIONINSPECTOR_MODE) {
    unless ($FUSIONINSPECTOR_MODE =~ /^(inspect|validate)$/) {
        die "Error, --FusionInspector option can be 'inspect' or 'validate', [$FUSIONINSPECTOR_MODE] not recognized ";
    }
    if ($FUSIONINSPECTOR_MODE =~ /inspect/) {
        $EXTRACT_FUSION_READS = 1;
    }
}

if ($EXTRACT_FUSION_READS) {
    unless ($left_fq_filename || $samples_file) {
        die "Error, need --left_fq (and --right_fq for PE reads) or --samples_file enabled with --FusionInspector or --extract_fusion_reads ";
    }
}

if ($DENOVO_RECONSTRUCT && ! $FUSIONINSPECTOR_MODE) {
    die "Error, --denovo_reconstruct requires that you employ --FusionInspector ";
}


if (@ARGV) {
    die "Error, don't understand arguments: [@ARGV] ";
}

unless ($genome_lib_dir) {
    # neither env var or the --genome_lib_dir param is set.
    die $usage;
}

unless (-d $genome_lib_dir) {
    die "Error, cannot locate genome_lib_dir: $genome_lib_dir";
}

## reset it just in case it's different or unset to begin with:
$ENV{CTAT_GENOME_LIB} = &ensure_full_path($genome_lib_dir);

## ensure that we can locate the AnnotFilterRule in the CTAT_GENOME_LIB
my $annot_filt_module = "$genome_lib_dir/AnnotFilterRule.pm";
unless (-s $annot_filt_module) {
    die "Error, cannot locate required $annot_filt_module  ... be sure to use a more modern version of the companion CTAT_GENOME_LIB ";
}



# ensure full paths:

$genome_lib_dir = &ensure_full_path($genome_lib_dir);
my $star_index_dir = "$genome_lib_dir/ref_genome.fa.star.idx";

if ($STAR_LOAD_AND_EXIT) {
    my $cmd = "$STAR_PROG --genomeDir $star_index_dir --genomeLoad LoadAndExit";
    &process_cmd($cmd);
    print STDERR "-done loading genome into shared memory.\n";
    exit(0);
}
elsif ($STAR_REMOVE) {
    my $cmd = "$STAR_PROG --genomeDir $star_index_dir --genomeLoad Remove";
    &process_cmd($cmd);
    print STDERR "-done removing genome from shared memory.\n";
    exit(0);
}


unless ( ($left_fq_filename || $samples_file || $chimeric_out_junctions_file) && $genome_lib_dir) {
    die $usage;
}


$output_dir = &ensure_full_path($output_dir);
$left_fq_filename = &ensure_full_path($left_fq_filename) if $left_fq_filename;
$right_fq_filename = &ensure_full_path($right_fq_filename) if $right_fq_filename;
$chimeric_out_junctions_file = &ensure_full_path($chimeric_out_junctions_file) if $chimeric_out_junctions_file;
$samples_file = &ensure_full_path($samples_file) if $samples_file;

my $checkpoints_dir = "$output_dir/_starF_checkpoints";


if ($samples_file && $left_fq_filename) {
    die "Error, specify --samples_file or --left_fq but not both.  These are mutually exclusive options";
}

my $read_group_ids = "";

main: {


    &validate_ctat_genome_lib($genome_lib_dir);


    unless (-d $output_dir) {
        &process_cmd("mkdir -p $output_dir");
    }

    if ($samples_file) {
        ($read_group_ids, $left_fq_filename, $right_fq_filename) = &parse_samples_file($samples_file);

        # in case there are relative paths
        my $new_samples_file = "$output_dir/starF.target_samples.txt";
        &write_new_samples_file($new_samples_file, $read_group_ids, $left_fq_filename, $right_fq_filename);
        $samples_file = $new_samples_file;
    }



    chdir $output_dir or die "Error, cannot cd to $output_dir";

    unless (-d $checkpoints_dir) {
        &process_cmd("mkdir -p $checkpoints_dir");
    }

    my $preliminary_outdir = "$output_dir/star-fusion.preliminary";
    unless (-d $preliminary_outdir) {
        &process_cmd("mkdir -p $preliminary_outdir");
    }

    if ($out_prefix =~ m|/$|) {
        # not allowing to end as a directory, must be a filename
        die "Error, --out_prefix must be a file name and not a directory name, although you can include directories in the path to the file name to be created.";
    }

    my $out_prefix_basename = basename($out_prefix);

    my $pipeliner = new Pipeliner(-verbose => $verbose_level);

    unless ($chimeric_out_junctions_file) {
        $chimeric_out_junctions_file = "Chimeric.out.junction";


        unless (-d $star_index_dir) {
            die "Error, cannot locate star index at $star_index_dir";
        }

        &run_STAR($pipeliner, $left_fq_filename, $right_fq_filename, $read_group_ids);

    }


    if ($RUN_STAR_ONLY_FLAG) {
        print STDERR "Option --run_STAR_only flag is set. Stopping here.\n";
        exit(0);
    }


    ## Get total number of rnaseq fragments reported by STAR:
    my $num_total_rnaseq_frags = &get_num_total_rnaseq_frags($chimeric_out_junctions_file);
    print STDERR "-sample contains $num_total_rnaseq_frags rnaseq fragments\n";


    ########################################
    ## Map chimeric reads to genome features

    my $cmd = "$UTILDIR/STAR-Fusion.map_chimeric_reads_to_genes "
        . " --genome_lib_dir $genome_lib_dir "
        . " -J $chimeric_out_junctions_file "
        . " -O $preliminary_outdir/star-fusion.junction_breakpts_to_genes.txt"
        . " --CPU $CPU ";
    
    $pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/$out_prefix.map_chim_to_genes.ok"));


    #############################################
    ## Filter out non-specific multimapping reads

    $cmd = "$UTILDIR/STAR-Fusion.handle_multimapping_reads "
        . " -J $preliminary_outdir/star-fusion.junction_breakpts_to_genes.txt "
        . " --genome_lib_dir $genome_lib_dir "
        . " --filt_file $preliminary_outdir/star-fusion.junction_breakpts_to_genes.txt.fail "
        . " -M $MIN_PCT_MM_NONSPECIFIC "
        . " > $preliminary_outdir/star-fusion.junction_breakpts_to_genes.txt.pass ";

    $pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/$out_prefix.handle_multimappers.ok"));


    ###################################
    ## predict preliminary fusion pairs

    $cmd = "$UTILDIR/STAR-Fusion.predict "
        . " -J $preliminary_outdir/star-fusion.junction_breakpts_to_genes.txt.pass "
        . " -O $preliminary_outdir/$out_prefix ";

    if ($NO_REMOVE_DUPS) {
        $cmd .= " --no_remove_dups ";
    }

    $pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/$out_prefix.STAR-Fusion.predict.ok"));

    my $predicted_fusions_file = "$preliminary_outdir/$out_prefix.fusion_candidates.preliminary";


    my $prelim_fusion_file = $predicted_fusions_file;

    unless ($no_filter) {

        ## filter fusions

        $cmd = "$UTILDIR/STAR-Fusion.filter "
            . " --fusion_preds $prelim_fusion_file "
            . " -J $chimeric_out_junctions_file "
            . " --tmpdir $tmpdir "
            . " --min_junction_reads $MIN_JUNCTION_READS "
            . " --min_sum_frags $MIN_SUM_FRAGS "
            . " --require_LDAS $REQUIRE_LDAS "
            . " --max_promiscuity $MAX_PROMISCUITY "
            . " --min_pct_dom_promiscuity $MIN_PCT_DOM_PROM "
            . " --min_novel_junction_support $MIN_NOVEL_JUNCTION_SUPPORT "
            . " --min_alt_pct_junction $MIN_ALT_PCT_JUNCTION "
            . " --aggregate_novel_junction_dist $AGGREGATE_NOVEL_JUNCTION_DIST "
            . " --min_spanning_frags_only $MIN_SPANNING_FRAGS_ONLY "
            . " --genome_lib_dir $genome_lib_dir "
            . " --out_prefix $preliminary_outdir/$out_prefix";

        if ($NO_SINGLE_FUSION_PER_BREAKPOINT) {
            $cmd .= " --no_single_fusion_per_breakpoint ";
        }
        if ($SKIP_EM_FLAG) {
            $cmd .= " --skip_EM ";
        }

        $pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/STAR-Fusion.filter.ok"));


        $prelim_fusion_file = "$preliminary_outdir/$out_prefix.fusion_candidates.preliminary.filtered";


    }

    
    if ( $num_total_rnaseq_frags > 0 && (! $SKIP_FFPM_FLAG)) {
        ## convert vals to FFPM
        $cmd = "$UTILDIR/incorporate_FFPM_into_final_report.pl $prelim_fusion_file $num_total_rnaseq_frags > $prelim_fusion_file.FFPM";
        $pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/FFPM.ok") );

        $prelim_fusion_file = "$prelim_fusion_file.FFPM";
    }

    ## add breakpoint / splice junction info

    my $predicted_fusions_with_splice_info = "$predicted_fusions_file.wSpliceInfo";

    $cmd = "$UTILDIR/append_breakpoint_junction_info.pl $prelim_fusion_file $genome_lib_dir > $predicted_fusions_with_splice_info";

    $pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/predicted_fusions_with_splice_info.ok"));




    ## annotate fusions.

    my $annotated_fusions_filename = "$predicted_fusions_with_splice_info.wAnnot";
    $cmd = "$BASEDIR/FusionAnnotator/FusionAnnotator --genome_lib_dir $genome_lib_dir --annotate $predicted_fusions_with_splice_info > $annotated_fusions_filename";
    $pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/fusion_annotator.ok"));



    ## various downstream fusion filters
    $prelim_fusion_file = $annotated_fusions_filename;

    # filter based on annotations
    unless ($NO_ANNOTATION_FILTER) {

        ## filter by annotations
        $cmd = "$BASEDIR/FusionFilter/util/filter_by_annotation_rules.pl --fusions $annotated_fusions_filename --genome_lib_dir $genome_lib_dir";
        $pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/annot_filter.ok"));
        $prelim_fusion_file = "$annotated_fusions_filename.annot_filter.pass";
    }

    unless ($NO_RT_ARTIFACT_FILTER) {

        ## filter out likely RT artifacts
        $cmd = "$UTILDIR/filter_likely_RT_artifacts.pl --fusions $prelim_fusion_file --genome_lib_dir $genome_lib_dir";
        $pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/RTartifact_filter.ok"));
        $prelim_fusion_file = "$prelim_fusion_file.RTartifact.pass";

    }

    
    if ( ($num_total_rnaseq_frags > 0) && (! $SKIP_FFPM_FLAG) && $MIN_FFPM > 0) {

        # filter based on min FFPM value

        $cmd = "$UTILDIR/filter_by_min_FFPM.pl $prelim_fusion_file $MIN_FFPM";
        $pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/FFPM_filter.ok") );

        $prelim_fusion_file = "$prelim_fusion_file.minFFPM.$MIN_FFPM.pass";
    }


    # generate final fusion prediction output files:
    my $final_output_fusions = "$out_prefix.fusion_predictions.tsv";
    $pipeliner->add_commands(new Command("cp $prelim_fusion_file $final_output_fusions", "$checkpoints_dir/cp_final.ok"));


    ## if samples file for sc, separate by cell:
    if ($samples_file) {
        $pipeliner->add_commands(new Command("$UTILDIR/sc/starF_partition_final_by_sc.pl $final_output_fusions > $final_output_fusions.samples_deconvolved.tsv",
                                             "$checkpoints_dir/samples_deconvolved.ok") );

        ## make abridged version.
        $pipeliner->add_commands(new Command("$UTILDIR/column_exclusions.pl $final_output_fusions.samples_deconvolved.tsv JunctionReads,SpanningFrags > $final_output_fusions.samples_deconvolved.abridged.tsv",
                                             "$checkpoints_dir/samples_deconvolved.abridged.ok") );

    }


    ## make abridged versions:
    my $abridged_final_output = $final_output_fusions;
    $abridged_final_output =~ s/\.tsv$/.abridged.tsv/;
    $pipeliner->add_commands(new Command("$UTILDIR/column_exclusions.pl $final_output_fusions JunctionReads,SpanningFrags > $abridged_final_output", "$checkpoints_dir/abridged_final"));

    if ($EXAMINE_CODING_EFFECT) {

        my $coding_eff_filename = $abridged_final_output;
        $coding_eff_filename =~ s/\.tsv$/.coding_effect.tsv/;

        $cmd = "$BASEDIR/FusionAnnotator/util/fusion_to_coding_region_effect.pl  --fusions $abridged_final_output --genome_lib_dir $genome_lib_dir > $coding_eff_filename";
        $pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/coding_eff.ok"));
        $abridged_final_output = $coding_eff_filename;
    }

    if ($EXTRACT_FUSION_READS) {
        my $cmd = "$UTILDIR/get_FUSION_EVIDENCE_fastqs.pl --fusions $out_prefix.fusion_predictions.tsv --output_prefix $out_prefix ";
        if ($samples_file) {
            $cmd .= " --samples_file $samples_file ";
        }
        else {
            $cmd .= " --left_fq $left_fq_filename";
            if ($right_fq_filename) {
                $cmd .= " --right_fq $right_fq_filename";
            }
        }
        $pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/extract_fusion_reads.ok"));
    }

    $pipeliner->run();

    if ($no_filter) {

        print STDERR "\n\n *** Note: --no_filtering in effect, so outputs have not been filtered for likely false positives.\n";
        print STDERR "\n\tSee output: $prelim_fusion_file\n\n";
    }
    else {

        print STDERR "\n\n\t* STAR-Fusion complete.  See output: $output_dir/$out_prefix.fusion_predictions.tsv (or .abridged.tsv version)\n\n\n";
    }


    if ($FUSIONINSPECTOR_MODE) {

        my $FI_outdir = "$output_dir/FusionInspector-${FUSIONINSPECTOR_MODE}";

        my $FI_cmd = "";
        $FI_cmd = "$BASEDIR/FusionInspector/FusionInspector --fusions $abridged_final_output "
                . " --out_prefix finspector "
                . " --min_junction_reads $MIN_JUNCTION_READS "
                . " --min_novel_junction_support $MIN_NOVEL_JUNCTION_SUPPORT "
                . " --min_spanning_frags_only $MIN_SPANNING_FRAGS_ONLY "
                . " --max_mate_dist $STAR_MAX_MATE_DIST "
                . " --vis "
                . " --max_promiscuity $MAX_PROMISCUITY "
                . " --output_dir $FI_outdir "
                . " --genome_lib_dir $genome_lib_dir "
                .  " --CPU $CPU ";

        if ($FUSIONINSPECTOR_MODE eq 'inspect') {
            ## use the extracted fusion evidence reads
            my $FI_left_fq = "$out_prefix.fusion_evidence_reads_1.fq";

            $FI_cmd .= " --only_fusion_reads "
                    .  "--fusion_contigs_only "
                    .  " --left_fq $FI_left_fq";

            if ($right_fq_filename) {
                my $FI_right_fq = "$out_prefix.fusion_evidence_reads_2.fq";
                $FI_cmd .= " --right_fq $FI_right_fq ";
            }
            $FI_cmd .= " --no_FFPM "; # FFPM would be meaningless in this context of fusion-only reads.
        }
        else {
            # validate mode
            # use all original reads.

            if ($samples_file) {
                $FI_cmd .= " --samples_file $samples_file ";
            }
            else {

                $FI_cmd .=  " --left_fq $left_fq_filename";
                if ($right_fq_filename) {
                    $FI_cmd .= " --right_fq $right_fq_filename ";
                }
            }
        }

        if ($DENOVO_RECONSTRUCT) {
            $FI_cmd .= " --include_Trinity ";
        }

        $FI_cmd .= " --annotate "; # just do it by default

        if ($EXAMINE_CODING_EFFECT) {
            $FI_cmd .= " --examine_coding_effect ";
        }

        if ($NO_REMOVE_DUPS) {
            $FI_cmd .= " --no_remove_dups ";
        }

        if ($MISC_FI_OPTS) {
            $FI_cmd .= " $MISC_FI_OPTS ";
        }
        
        $pipeliner->add_commands(new Command($FI_cmd, "_fi_${FUSIONINSPECTOR_MODE}_" . length($FI_cmd) . ".ok"));


        $pipeliner->run();
    }


    exit(0);

}


####
sub missing_required_program_installed {
    my (@required_progs) = @_;

    my $missing = 0;

    foreach my $prog (@required_progs) {

        my $path = `which $prog`;
        chomp $path;
        unless ($path && $path =~ /\w/) {
            $missing++;
            print STDERR "Error, cannot locate required program: $prog\n";
        }
        else {
            print STDERR "-found prog $prog at $path\n";
        }
    }

    return($missing);
}



####
sub run_STAR {
    my ($pipeliner, $left_fq_filename, $right_fq_filename, $read_group_ids) = @_;

    my $maybe_tmpdir= defined($outTmpDir)? " --outTmpDir $outTmpDir " : "";

    ## run STAR to align reads:
    my $cmd = "$STAR_PROG --genomeDir $star_index_dir "
        . " --outReadsUnmapped None "
        . " --chimSegmentMin 12 "
        . " --chimJunctionOverhangMin 8 "
        . " --chimOutJunctionFormat 1 "  # new option for STAR 2.6.1a
        . " --alignSJDBoverhangMin $STAR_SJDB_OVERHANG_MIN "
        . " --alignMatesGapMax $STAR_MAX_MATE_DIST "
        . " --alignIntronMax $STAR_MAX_MATE_DIST "
        . " --alignSJstitchMismatchNmax 5 -1 5 5 "  #which allows for up to 5 mismatches for non-canonical GC/AG, and AT/AC junctions, and any number of mismatches for canonical junctions (the default values 0 -1 0 0 replicate the old behavior (from AlexD)

        . " --runThreadN $CPU"
        . $maybe_tmpdir
        . " --outSAMstrandField intronMotif "
        . " --outSAMunmapped Within "

        . " --alignInsertionFlush Right "
        . " --alignSplicedMateMapLminOverLmate 0 "
        . " --alignSplicedMateMapLmin 30 "


        ;


    # from: https://github.com/alexdobin/STAR/issues/744
    # --chimSegmentMin controls the minimum total length of the chimeric segment, which is calculated as a sum of alignment blocks of the segment.
    #  --chimJunctionOverhangMin, on the other hand, controls the minimum length of the block adjacent to the chimeric junction. This prevents chimeric junction from happening very close to the normal junction.



    
    if ($STAR_SortedByCoordinate) {

        $cmd .= " --limitBAMsortRAM $STAR_limitBAMsortRAM "
            . " --outSAMtype BAM SortedByCoordinate ";

    }
    else {
        $cmd .= " --outSAMtype BAM Unsorted "; # default, don't spend time and RAM sorting the data as not required for fusion-finding
    }

    my $reads = "$left_fq_filename $right_fq_filename";
    if (length($reads) > 1000) {
        my $star_params_file = "star_params." . time() . ".$$.txt";
        open(my $ofh, ">$star_params_file") or die "Error, cannot write to file: $star_params_file";
        print $ofh "readFilesIn $reads\n";
        if ($read_group_ids) {
            print $ofh "outSAMattrRGline $read_group_ids\n";
        }
        elsif ($outSAMattrRGline) {
            print $ofh "outSAMattrRGline $outSAMattrRGline\n";
        }
        else {
            print $ofh "outSAMattrRGline ID:GRPundef\n";
        }
        close $ofh;

        $cmd .= " --parametersFiles $star_params_file ";
    }
    else {
        $cmd .= " --readFilesIn $reads ";
        if ($read_group_ids) {
            $cmd .= " --outSAMattrRGline $read_group_ids ";
        }
        elsif ($outSAMattrRGline) {
            $cmd .= "--outSAMattrRGline $outSAMattrRGline";
        }
        else {
            $cmd .= " --outSAMattrRGline ID:GRPundef ";
        }
    }

    # chim read multimapping settings
    $cmd .= " --chimMultimapScoreRange $STAR_chimMultimapScoreRange "
          . " --chimScoreJunctionNonGTAG $STAR_chimScoreJunctionNonGTAG "
          . " --chimMultimapNmax $STAR_chimMultimapNmax "
          . " --chimOutType Junctions WithinBAM "
          . " --chimNonchimScoreDropMin $STAR_chimNonchimScoreDropMin ";

    # merge overlapping reads
    $cmd .= " --peOverlapNbasesMin $STAR_peOverlapNbasesMin --peOverlapMMp $STAR_peOverlapMMp ";


    if ($USE_SHARED_MEMORY) {
        $cmd .= " --genomeLoad LoadAndKeep ";
    }
    else {
        # note, twopassMode is incompatible with shared memory (--genomeLoad LoadAndKeep)
        $cmd .= " --genomeLoad NoSharedMemory ";
        
        if ($STAR_twopass) {
            $cmd .= " --twopassMode Basic ";
        } 
        else {
            $cmd .= " --twopassMode None ";
        }
    }
    
    if ($left_fq_filename =~ /\.gz$/) {
        $cmd .= " --readFilesCommand 'gunzip -c' ";
    }

    $cmd .= " --quantMode GeneCounts";

    #$cmd .=  " --limitOutSJcollapsed 10000000 ";

    $pipeliner->add_commands(Command->new($cmd, "$checkpoints_dir/run_star_aligner.ok"));

    $pipeliner->run();


    return;
}



####
sub run_BBMerge {
    my ($pipeliner, $left_fq_filename, $right_fq_filename, $read_group_ids) = @_;

    my @left_fq_files = split(/,/, $left_fq_filename);
    my @right_fq_files = split(/,/, $right_fq_filename);
    my @read_groups = split(/ , /, $read_group_ids);

    my @bbmerged_files;

    while (@left_fq_files) {

        my $left_fq_filename = shift @left_fq_files;
        my $right_fq_filename = shift @right_fq_files;


        unless ($right_fq_filename) {
            confess "Error, need right fq filename corresponding to left fq file: $left_fq_filename";
        }
        my $read_group = shift @read_groups;
        if ($read_group) {
            $read_group =~ s/^ID://;
        }
        else {
            $read_group = "singlesample";
        }

        my $cmd = "$PLUGINSDIR/bbmap/bbmerge.sh in1=$left_fq_filename in2=$right_fq_filename out=$read_group.bbmerge.fq";

        $pipeliner->add_commands(Command->new($cmd, "$checkpoints_dir/$read_group.bbmerge.ok"));

        # adjust the fq record names so we know these are the merged records later on.
        #$pipeliner->add_commands(Command->new("$UTILDIR/make_merged_prefix_fq.pl bbmerged.fq > bbmerged.adj.fq", "$checkpoints_dir/bbmerge.add_merged_token.ok"));

        push (@bbmerged_files, "$read_group.bbmerge.fq");
    }

    my $bbmerged_fq_files = join(",", @bbmerged_files);

    return($bbmerged_fq_files);
}


####
sub parse_samples_file {
    my ($samples_file) = @_;

    my @read_groups;
    my @left_read_files;
    my @right_read_files;

    open(my $fh, "$samples_file") or die "Error, cannot open file: $samples_file";
    while (<$fh>) {
        chomp;
        my ($sample_name, $left_path, $right_path) = split(/\t/);
        unless (-s $left_path) {
            die "Error, cannot find non-emtpy file: $left_path";
        }
        push (@read_groups, "ID:$sample_name");
        push (@left_read_files, &ensure_full_path($left_path));

        if ($right_path) {
            unless (-s $right_path) {
                die "Error, cannot find non-empty file: $right_path";
            }
            push (@right_read_files, &ensure_full_path($right_path));
        }

    }

    my $read_groups_str = join(" , ", @read_groups);
    my $left_reads_str = join(",", @left_read_files);
    my $right_reads_str = join(",", @right_read_files);

    return($read_groups_str, $left_reads_str, $right_reads_str);
}

####
sub write_new_samples_file {
    my ($new_samples_file, $read_group_ids, $left_fq_filename, $right_fq_filename) = @_;

    open(my $ofh, ">$new_samples_file") or die "Error, cannot write to $new_samples_file";

    my @read_group_ids = split(/ , /, $read_group_ids);
    my @left_fq_files = split(/,/, $left_fq_filename);
    my @right_fq_files = split(/,/, $right_fq_filename);

    while (@read_group_ids) {
        my $read_group = shift @read_group_ids;
        my $left_fq = shift @left_fq_files;
        my $right_fq = shift @right_fq_files;

        $read_group =~ s/^ID://;

        my $outline = join("\t", $read_group, $left_fq);
        if ($right_fq) {
            $outline .= "\t$right_fq";
        }
        $outline .= "\n";
        print $ofh $outline;
    }
    close $ofh;

    return;
}

####
sub check_compatible_STAR_version {
    my ($star_prog, $min_star_major_minor_aref) = @_;

    my $star_version_info = `$star_prog --version`;


    chomp $star_version_info;

    my @MIN_VERSION = @$min_star_major_minor_aref;
    my $min_letter = pop @MIN_VERSION;

    my $errmsg = "\n\n\tSorry, version [$star_version_info] is not supported. Please use at least STAR version " . join(".", @MIN_VERSION) . "$min_letter\n";
    $errmsg .= "\tTo upgrade, obtain the latest STAR here: https://github.com/alexdobin/STAR/releases\n\n";

    if ($star_version_info =~ /^(\d+)\.(\d+)\.(\d+)([a-z]?)/) {

        my $major_ver = $1;
        my $minor_ver = $2;
        my $patch_ver = $3;
        my $letter_ver = $4;

        if ($major_ver < $min_star_major_minor_aref->[0]
            ||
            ($major_ver == $min_star_major_minor_aref->[0] && $minor_ver < $min_star_major_minor_aref->[1])
            ||
            ($major_ver == $min_star_major_minor_aref->[0] && $minor_ver == $min_star_major_minor_aref->[1] && $patch_ver < $min_star_major_minor_aref->[2])
            ||
            ($major_ver == $min_star_major_minor_aref->[0] && $minor_ver == $min_star_major_minor_aref->[1] && $patch_ver == $min_star_major_minor_aref->[2] && ord($letter_ver) < ord($min_star_major_minor_aref->[3]))
            ) {

            print STDERR $errmsg;
            exit(1);
        }
        else {
            # all good. :-)
        }
    }
    else {
        print STDERR $errmsg;
        exit(1);
    }

    return;
}


####
sub get_num_total_rnaseq_frags {
    my ($chimeric_out_junctions_file) = @_;

    my $read_count_line = `tail -n1 $chimeric_out_junctions_file`;

    if ($read_count_line =~ /^\# Nreads (\d+)\tNreadsUnique (\d+)\tNreadsMulti (\d+)/) {
        my $total_reads = $1;
        my $num_reads_unique = $2;
        my $num_reads_multi = $3;


        return ($total_reads);

    }
    else {
        my $v_letter = pop @MIN_STAR_MAJOR_MINOR_VERSION;
        confess "Error, file: $chimeric_out_junctions_file doesn't have expected formatting.  Please be sure to use the STAR aligner with min version: " . join(".", @MIN_STAR_MAJOR_MINOR_VERSION) . $v_letter;
    }
}


####
sub validate_ctat_genome_lib {
    my ($genome_lib_dir) = @_;

    my $cmd = "$UTILDIR/validate_ctat_genome_lib.pl $genome_lib_dir";

    my $ret = system($cmd);
    if ($ret) {
        die "\n\n\tERROR, ctat genome lib: $genome_lib_dir does not validate and may require indices to be rebuilt.\n\n"
            . "\tVisit: https://github.com/STAR-Fusion/STAR-Fusion/wiki/rebuild_ctat_genome_lib_indices\n\tfor details.\n\n";
    }

    return;
}
back to top