https://github.com/STAR-Fusion/STAR-Fusion
Tip revision: e522c236a1b068ab1217855209b4706e2f8ef153 authored by Brian Haas on 27 October 2023, 18:40:42 UTC
fusionfilter update
fusionfilter update
Tip revision: e522c23
STAR-Fusion
#!/usr/bin/env perl
use strict;
use warnings;
use Carp;
use Cwd;
use FindBin;
use lib ("$FindBin::Bin/PerlLib");
use Pipeliner;
use File::Basename;
use Process_cmd;
use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through);
my $VERSION = "1.13.0";
#my $VERSION = "__BLEEDING_EDGE__ Do not use - pull versioned release from: https://github.com/STAR-Fusion/STAR-Fusion/releases";
my @MIN_STAR_MAJOR_MINOR_VERSION = (2, 7, 8, 'a');
my $BASEDIR = "$FindBin::Bin";
my $UTILDIR = "$BASEDIR/util";
my $PLUGINSDIR = "$BASEDIR/plugins";
## Options
my $output_dir = "STAR-Fusion_outdir";
my $out_prefix = "star-fusion";
my $help_flag;
my $MIN_NOVEL_JUNCTION_SUPPORT = 3;
my $MIN_ALT_PCT_JUNCTION = 10.0;
my $AGGREGATE_NOVEL_JUNCTION_DIST = 5;
my $tmpdir = "/tmp";
my $verbose_level = 2;
my $MIN_FFPM = 0.1;
my $MIN_JUNCTION_READS = 1;
my $MIN_SUM_FRAGS = 2; # requires at least one junction read, else see min_spanning_frags_only below.
my $MIN_SPANNING_FRAGS_ONLY = 5;
my $MAX_PROMISCUITY = 10;
my $MIN_PCT_DOM_PROM = 20;
my $genome_lib_dir = $ENV{CTAT_GENOME_LIB};
my $CPU = 4;
my $outTmpDir=undef;
my $REQUIRE_LDAS = 1;
my $STAR_peOverlapNbasesMin = 12; # minimum number of bases in the overlap
my $STAR_peOverlapMMp = 0.1; # the max proportion of mismatches in the overlap
my $STAR_chimMultimapScoreRange = 3;
my $STAR_chimMultimapNmax = 20;
my $STAR_chimNonchimScoreDropMin = 10;
my $STAR_chimScoreJunctionNonGTAG = -4;
my $STAR_MAX_MATE_DIST = 100000;
my $STAR_SJDB_OVERHANG_MIN = 10;
my $MIN_PCT_MM_NONSPECIFIC = 50;
my $STAR_SortedByCoordinate = 0;
my $STAR_limitBAMsortRAM = "40G";
my $NO_REMOVE_DUPS = 0;
my $RUN_STAR_ONLY_FLAG = 0;
my $SKIP_EM_FLAG = 0;
my $SKIP_FFPM_FLAG = 0;
my $logo='
############################################################################################
#
# _________________________ __________ ___________ .__
# / _____/\__ ___/ _ \\\______ \ \_ _____/_ __ _____|__| ____ ____
# \_____ \ | | / /_\ \| _/ ______ | __)| | \/ ___/ |/ _ \ / \
# / \ | |/ | \ | \ /_____/ | \ | | /\___ \| ( <_> ) | \
# /_______ / |____|\____|__ /____|_ / \___ / |____//____ >__|\____/|___| /
# \/ \/ \/ \/ \/ \/
#
#
############################################################################################';
my $usage = <<__EOUSAGE__;
$logo
#
# Required:
#
# To include running STAR:
#
# --left_fq <string> left.fq file (or single.fq)
#
# --right_fq <string> right.fq file (actually optional, but highly recommended)
#
# Or use output from earlier STAR run:
#
# --chimeric_junction|J <string> Chimeric.out.junction file
#
#
# --genome_lib_dir <string> directory containing genome lib (see http://STAR-Fusion.github.io)
# (required to specify, unless env var CTAT_GENOME_LIB is set to it)
# Easiest - get plug-n-play version from:
# < https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/ >
#
# Optional:
#
# --CPU <int> number of threads for running STAR (default: $CPU)
#
# --output_dir|O <string> output directory (default: $output_dir)
#
# --show_full_usage_info provide full usage info.
#
#
#######################
#
# Quick guide to running:
#
# STAR-Fusion --left_fq reads_1.fq --right_fq reads_2.fq --genome_lib_dir /path/to/ctat_genome_lib_build_dir
#
############################################################################################
__EOUSAGE__
;
my $extended_usage_info = <<__EXTENDED_USAGE__
#
# STAR program configurations:
#
# --STAR_max_mate_dist <int> maximum distance between mates (also used for the maximum intron length value) default: $STAR_MAX_MATE_DIST
#
# --STAR_SJDBoverhangMin <int> minimum overhang for annotated junctions default: $STAR_SJDB_OVERHANG_MIN
#
# --STAR_SortedByCoordinate have STAR sort resulting bam file by coordinate
# --STAR_limitBAMsortRAM <string> num GB of RAM, default: ${STAR_limitBAMsortRAM}
#
# --STAR_PATH <string> /path/to/STAR (default: uses STAR found in PATH setting).
#
# --run_STAR_only stop after running STAR aligner (essentially just generating the Chimeric.out.junction file)
#
# --STAR_twopass do STAR two-pass instead of two pass (one pass is default)
#
#
# Stitching overlapping reads settings:
# --STAR_peOverlapNbasesMin <int> minimum number of bases in the overlap (default: $STAR_peOverlapNbasesMin)
# --STAR_peOverlapMMp <int> the max proportion of mismatches in the overlap (default: $STAR_peOverlapMMp)
#
# STAR chim multi-map opts:
# --STAR_chimMultimapScoreRange <int> the score range for multi-mapping chimeras below the best chimeric score. (default: $STAR_chimMultimapScoreRange)
# --STAR_chimMultimapNmax <int> maximum number of multi-alignments for the chimera (default: $STAR_chimMultimapNmax)
# --STAR_chimNonchimScoreDropMin <int> to trigger chimeric detection, the drop in the best non-chimeric alignment score with respect to the read length has to be smaller than this value (default: $STAR_chimNonchimScoreDropMin)
# --STAR_outSAMattrRGline <str> pass through string value to STAR as --outSAMattrRGline for setting read group name info in bam. ie. --STAR_outSAMattrRGline "ID:myGRP"
#
# Chimeric read filtering parameters:
#
# --min_pct_MM_nonspecific|M <int> min pct of multimapping reads that should map to 2 genes
# (avoids multimapping reads that lack specificity)
# value must be between [1,100] (default: $MIN_PCT_MM_NONSPECIFIC)
##
# Fusion transcript filtering:
#
# --no_filter do not filter predictions. (stops execution prior to the filtering stage).
#
# --outTmpDir <string> passed to STAR (very useful if local disks are faster than network disks)
#
# --min_junction_reads <int> minimum number of junction-spanning reads required. Default: $MIN_JUNCTION_READS
#
# --min_sum_frags <int> minimum fusion support = ( # junction_reads + # spanning_frags ) Default: $MIN_SUM_FRAGS
#
# --require_LDAS 0|1 require long double anchor support for split reads
# when no spanning frags are found (default: 1)
#
#
# --max_promiscuity <int> maximum number of partners allowed for a given fusion. Default: $MAX_PROMISCUITY
#
# --min_pct_dom_promiscuity <int> for promiscuous fusions, those with less than this support of the dominant scoring pair
# are filtered prior to applying the max_promiscuity filter.
# (default: $MIN_PCT_DOM_PROM)
#
# --aggregate_novel_junction_dist <int> default: $AGGREGATE_NOVEL_JUNCTION_DIST (non-ref junctions within $AGGREGATE_NOVEL_JUNCTION_DIST
# are merged into single calls)
#
# --min_novel_junction_support <int> default: $MIN_NOVEL_JUNCTION_SUPPORT (minimum of $MIN_NOVEL_JUNCTION_SUPPORT
# junction reads required if breakpoint
# lacks involvement of only reference junctions)
#
# --min_spanning_frags_only <int> minimum number of rna-seq fragments required as fusion evidence if
# there are no junction reads (default: $MIN_SPANNING_FRAGS_ONLY)
#
# --min_alt_pct_junction <float> default: $MIN_ALT_PCT_JUNCTION ($MIN_ALT_PCT_JUNCTION % of the dominant isoform
# junction support)
#
# --min_FFPM <float> minimum FFPM (fusion fragments per million rna-seq frags) (default: $MIN_FFPM)
#
# --no_remove_dups do not remove duplicate reads
#
# --skip_EM skips expectation maximization based fractional assignment of spanning fragments across multiple splice breakpoint fusion calls.
#
# --skip_FFPM skip FFPM computation. FFPM is only computed correctly and meaningfully on per-sample analysis, or use of --samples_file where all entries in the samples file correspond to a single biological sample (ie. multiple lanes of seq, or replicates that are to be run as a single entity).
#
# ** Turn off specific fusion filters:
#
# --no_annotation_filter exclude filtering of fusion predictions based on annotations
# (ie. by default, removing fusions found in normal (non-cancer) samples))
#
# --no_RT_artifact_filter exclude filtering of likely RT artifacts
# (by default, fusions w/o GT-AG, GC-AG, or AT-AC splice junctions at fusion breakpoint are excluded
# as likely artifacts, with few exceptions of known fusions that involve intra-exon breakpoints. ie. IGH-fusions)
#
# --no_single_fusion_per_breakpoint exclude filtering of potentially superfluous fusions involving different gene
# annotations overlapping the same fusion breakpoint.
#
# Downstream analysis of fusion candidates:
#
# --examine_coding_effect explore impact of fusions on coding sequences
#
# --extract_fusion_reads retrieves the fusion supporting reads from the fastq files
#
# --FusionInspector <inspect|validate> include FusionInspector, options:
# 'inspect' - considers only StarFusion-identified fusion reads in context of identified fusions (fast)
# 'validate' - examines all reads, recovers evidence, computes fusion allele fractions (slow)
#
# --denovo_reconstruct attempt to reconstruct fusion transcripts using Trinity de novo assembly (requires --FusionInspector)
#
# --misc_FI_opts <string> additional FI-specific options to pass on to FusionInspector
#
# --version report version ($VERSION)
#
# STAR shared memory parameters (see STAR-Fusion usage documentation for details):
#
# --STAR_use_shared_memory use shared memory among multiple processes for the STAR alignment step
# # note, when this option is used, twopass mode is disabled, as it's incompatible w/ shared mem.
# ** (when all your jobs are done, be sure to run: 'STAR-Fusion --STAR_Remove' to unload it) **
#
# --STAR_LoadAndExit runs STAR to load the genome index into shared RAM, then exits.
#
# --STAR_Remove removes genome from shared RAM and exits.
#
#
## Support for single-cell data
#
# ## SmartSeq2
#
# --samples_file <string> file containing list of targeted files, separate fastqs for each cell.
# (with format:
# sample_name(tab)/path/to/left.fq(tab)/path/to/right.fq
# and for single-end reads, exclude the 2nd read path. )
#
# Remaining misc. options:
#
# --tmpdir <string> file for temporary files (default: /tmp)
#
# --verbose_level <int> verbosity (default: $verbose_level, max=2)
#
# --max_sensitivity includes options: --min_junction_reads 0 --min_sum_frags 1 --require_LDAS 0 --min_spanning_frags_only 1 --min_novel_junction_support 1 --skip_FFPM --no_single_fusion_per_breakpoint --skip_EM
#
# --full_Monty includes max sensitivity and disables filters via: --max_promiscuity 1000000 --min_pct_dom_promiscuity 1 --min_alt_pct_junction 1e-3 --no_annotation_filter --no_RT_artifact_filter
#
#
###################################################################################
__EXTENDED_USAGE__
;
my $no_filter = 0;
my $MAX_SENSITIVITY_FLAG = 0;
my $FULL_MONTY_FLAG = 0;
my $left_fq_filename = "";
my $right_fq_filename = "";
my $samples_file = "";
my $chimeric_out_junctions_file = "";
my $REPORT_VERSION = 0;
my $USE_SHARED_MEMORY = 0;
my $EXTRACT_FUSION_READS = 0;
my $FUSIONINSPECTOR_MODE = 0;
my $DENOVO_RECONSTRUCT = 0;
my $EXAMINE_CODING_EFFECT = 0;
my $STAR_PROG;
my $DEVEL_STAR;
my $STAR_LOAD_AND_EXIT;
my $STAR_REMOVE;
my $STAR_twopass = 0;
my $NO_ANNOTATION_FILTER = 0;
my $NO_RT_ARTIFACT_FILTER = 0;
my $NO_SINGLE_FUSION_PER_BREAKPOINT = 0;
my $SHOW_FULL_USAGE_INFO;
my $MISC_FI_OPTS = "";
my $outSAMattrRGline = "";
&GetOptions ( 'help|h' => \$help_flag,
'left_fq=s' => \$left_fq_filename,
'right_fq=s' => \$right_fq_filename,
'samples_file=s' => \$samples_file,
'chimeric_junction|J=s' => \$chimeric_out_junctions_file,
'min_junction_reads=i' => \$MIN_JUNCTION_READS,
'min_sum_frags=i' => \$MIN_SUM_FRAGS,
'max_promiscuity=i' => \$MAX_PROMISCUITY,
'min_pct_dom_promiscuity=f' => \$MIN_PCT_DOM_PROM,
'min_spanning_frags_only=i' => \$MIN_SPANNING_FRAGS_ONLY,
'min_pct_MM_nonspecific|M=i' => \$MIN_PCT_MM_NONSPECIFIC,
'require_LDAS=i' => \$REQUIRE_LDAS,
'min_novel_junction_support=i' => \$MIN_NOVEL_JUNCTION_SUPPORT,
'min_alt_pct_junction=f' => \$MIN_ALT_PCT_JUNCTION,
'aggregate_novel_junction_dist=i' => \$AGGREGATE_NOVEL_JUNCTION_DIST,
'output_dir|O=s' => \$output_dir,
'tmpdir=s' => \$tmpdir,
'verbose_level=i' => \$verbose_level,
'examine_coding_effect' => \$EXAMINE_CODING_EFFECT,
'min_FFPM=f' => \$MIN_FFPM,
'no_filter' => \$no_filter,
'no_annotation_filter' => \$NO_ANNOTATION_FILTER,
'no_RT_artifact_filter' => \$NO_RT_ARTIFACT_FILTER,
'no_single_fusion_per_breakpoint' => \$NO_SINGLE_FUSION_PER_BREAKPOINT,
'genome_lib_dir=s' => \$genome_lib_dir,
'version' => \$REPORT_VERSION,
'CPU=i' => \$CPU,
'outTmpDir=s' => \$outTmpDir,
# shared memory options
'STAR_use_shared_memory' => \$USE_SHARED_MEMORY,
'STAR_LoadAndExit' => \$STAR_LOAD_AND_EXIT,
'STAR_Remove' => \$STAR_REMOVE,
'STAR_max_mate_dist=i' => \$STAR_MAX_MATE_DIST,
'STAR_SJDBoverhangMin=i' => \$STAR_SJDB_OVERHANG_MIN,
'STAR_SortedByCoordinate' => \$STAR_SortedByCoordinate,
'STAR_limitBAMsortRAM=s' => \$STAR_limitBAMsortRAM,
'STAR_PATH=s' => \$STAR_PROG,
# other STAR settings
'STAR_peOverlapNbasesMin=i' => \$STAR_peOverlapNbasesMin,
'STAR_peOverlapMMp=i' => \$STAR_peOverlapMMp,
'STAR_chimMultimapScoreRange=i' => \$STAR_chimMultimapScoreRange,
'STAR_chimMultimapNmax=i' => \$STAR_chimMultimapNmax,
'STAR_chimNonchimScoreDropMin=i' => \$STAR_chimNonchimScoreDropMin,
"STAR_outSAMattrRGline=s" => \$outSAMattrRGline,
"STAR_twopass" => \$STAR_twopass,
'extract_fusion_reads' => \$EXTRACT_FUSION_READS,
'FusionInspector=s' => \$FUSIONINSPECTOR_MODE,
'denovo_reconstruct' => \$DENOVO_RECONSTRUCT,
'no_remove_dups' => \$NO_REMOVE_DUPS,
'DEVEL_STAR' => \$DEVEL_STAR,
'show_full_usage_info' => \$SHOW_FULL_USAGE_INFO,
'run_STAR_only' => \$RUN_STAR_ONLY_FLAG,
'max_sensitivity' => \$MAX_SENSITIVITY_FLAG,
'full_Monty' => \$FULL_MONTY_FLAG,
'skip_EM' => \$SKIP_EM_FLAG,
'skip_FFPM' => \$SKIP_FFPM_FLAG,
'misc_FI_opts=s' => \$MISC_FI_OPTS,
);
if ($SHOW_FULL_USAGE_INFO) {
print "$usage$extended_usage_info";
exit(0);
}
if ($help_flag) {
print "$usage";
exit(0);
}
if ($REPORT_VERSION) {
print "\n\nSTAR-Fusion version: $VERSION\n\n";
exit(0);
}
if ($FULL_MONTY_FLAG) {
$MAX_SENSITIVITY_FLAG = 1;
# includes max sensitivity and disables filters via: --max_promiscuity 1000000 --min_pct_dom_promiscuity 1 --min_alt_pct_junction 1e-3 --no_annotation_filter --no_RT_artifact_filter
$MAX_PROMISCUITY = 1000000;
$MIN_PCT_DOM_PROM = 1;
$MIN_ALT_PCT_JUNCTION = 1e-3;
$NO_ANNOTATION_FILTER = 1;
$NO_RT_ARTIFACT_FILTER = 1;
}
if ($MAX_SENSITIVITY_FLAG) {
# includes options: --min_junction_reads 0 --min_sum_frags 1 --require_LDAS 0 --min_spanning_frags_only 1 --min_novel_junction_support 1 --min_FFPM 0 --no_single_fusion_per_breakpoint
$MIN_JUNCTION_READS = 0;
$MIN_SUM_FRAGS = 1;
$REQUIRE_LDAS = 0;
$MIN_SPANNING_FRAGS_ONLY = 1;
$MIN_NOVEL_JUNCTION_SUPPORT = 1;
$MIN_FFPM = 0;
$NO_SINGLE_FUSION_PER_BREAKPOINT = 1;
$SKIP_EM_FLAG = 1;
$SKIP_FFPM_FLAG = 1;
}
if ($STAR_PROG) {
unless (-e $STAR_PROG) {
die "Error, STAR program set to $STAR_PROG but can't be found";
}
# adding path to star to the PATH setting in case it needs to be used for FusionInspector later on.
my $star_basedir = &ensure_full_path(dirname($STAR_PROG));
$ENV{PATH} = "$star_basedir:$ENV{PATH}";
}
else {
$STAR_PROG = `which STAR`;
chomp $STAR_PROG;
unless ($STAR_PROG =~ /\w/) {
die "Error, cannot locate STAR program in the PATH setting";
}
}
&check_compatible_STAR_version($STAR_PROG, \@MIN_STAR_MAJOR_MINOR_VERSION);
if ($STAR_limitBAMsortRAM =~ /^(\d+)G$/) {
$STAR_limitBAMsortRAM = $1 * (1024**3);
}
else {
die "Error, cannot parse G of RAM from: $STAR_limitBAMsortRAM ";
}
if ($FUSIONINSPECTOR_MODE) {
unless ($FUSIONINSPECTOR_MODE =~ /^(inspect|validate)$/) {
die "Error, --FusionInspector option can be 'inspect' or 'validate', [$FUSIONINSPECTOR_MODE] not recognized ";
}
if ($FUSIONINSPECTOR_MODE =~ /inspect/) {
$EXTRACT_FUSION_READS = 1;
}
}
if ($EXTRACT_FUSION_READS) {
unless ($left_fq_filename || $samples_file) {
die "Error, need --left_fq (and --right_fq for PE reads) or --samples_file enabled with --FusionInspector or --extract_fusion_reads ";
}
}
if ($DENOVO_RECONSTRUCT && ! $FUSIONINSPECTOR_MODE) {
die "Error, --denovo_reconstruct requires that you employ --FusionInspector ";
}
if (@ARGV) {
die "Error, don't understand arguments: [@ARGV] ";
}
unless ($genome_lib_dir) {
# neither env var or the --genome_lib_dir param is set.
die $usage;
}
unless (-d $genome_lib_dir) {
die "Error, cannot locate genome_lib_dir: $genome_lib_dir";
}
## reset it just in case it's different or unset to begin with:
$ENV{CTAT_GENOME_LIB} = &ensure_full_path($genome_lib_dir);
## ensure that we can locate the AnnotFilterRule in the CTAT_GENOME_LIB
my $annot_filt_module = "$genome_lib_dir/AnnotFilterRule.pm";
unless (-s $annot_filt_module) {
die "Error, cannot locate required $annot_filt_module ... be sure to use a more modern version of the companion CTAT_GENOME_LIB ";
}
# ensure full paths:
$genome_lib_dir = &ensure_full_path($genome_lib_dir);
my $star_index_dir = "$genome_lib_dir/ref_genome.fa.star.idx";
if ($STAR_LOAD_AND_EXIT) {
my $cmd = "$STAR_PROG --genomeDir $star_index_dir --genomeLoad LoadAndExit";
&process_cmd($cmd);
print STDERR "-done loading genome into shared memory.\n";
exit(0);
}
elsif ($STAR_REMOVE) {
my $cmd = "$STAR_PROG --genomeDir $star_index_dir --genomeLoad Remove";
&process_cmd($cmd);
print STDERR "-done removing genome from shared memory.\n";
exit(0);
}
unless ( ($left_fq_filename || $samples_file || $chimeric_out_junctions_file) && $genome_lib_dir) {
die $usage;
}
$output_dir = &ensure_full_path($output_dir);
$left_fq_filename = &ensure_full_path($left_fq_filename) if $left_fq_filename;
$right_fq_filename = &ensure_full_path($right_fq_filename) if $right_fq_filename;
$chimeric_out_junctions_file = &ensure_full_path($chimeric_out_junctions_file) if $chimeric_out_junctions_file;
$samples_file = &ensure_full_path($samples_file) if $samples_file;
my $checkpoints_dir = "$output_dir/_starF_checkpoints";
if ($samples_file && $left_fq_filename) {
die "Error, specify --samples_file or --left_fq but not both. These are mutually exclusive options";
}
my $read_group_ids = "";
main: {
&validate_ctat_genome_lib($genome_lib_dir);
unless (-d $output_dir) {
&process_cmd("mkdir -p $output_dir");
}
if ($samples_file) {
($read_group_ids, $left_fq_filename, $right_fq_filename) = &parse_samples_file($samples_file);
# in case there are relative paths
my $new_samples_file = "$output_dir/starF.target_samples.txt";
&write_new_samples_file($new_samples_file, $read_group_ids, $left_fq_filename, $right_fq_filename);
$samples_file = $new_samples_file;
}
chdir $output_dir or die "Error, cannot cd to $output_dir";
unless (-d $checkpoints_dir) {
&process_cmd("mkdir -p $checkpoints_dir");
}
my $preliminary_outdir = "$output_dir/star-fusion.preliminary";
unless (-d $preliminary_outdir) {
&process_cmd("mkdir -p $preliminary_outdir");
}
if ($out_prefix =~ m|/$|) {
# not allowing to end as a directory, must be a filename
die "Error, --out_prefix must be a file name and not a directory name, although you can include directories in the path to the file name to be created.";
}
my $out_prefix_basename = basename($out_prefix);
my $pipeliner = new Pipeliner(-verbose => $verbose_level);
unless ($chimeric_out_junctions_file) {
$chimeric_out_junctions_file = "Chimeric.out.junction";
unless (-d $star_index_dir) {
die "Error, cannot locate star index at $star_index_dir";
}
&run_STAR($pipeliner, $left_fq_filename, $right_fq_filename, $read_group_ids);
}
if ($RUN_STAR_ONLY_FLAG) {
print STDERR "Option --run_STAR_only flag is set. Stopping here.\n";
exit(0);
}
## Get total number of rnaseq fragments reported by STAR:
my $num_total_rnaseq_frags = &get_num_total_rnaseq_frags($chimeric_out_junctions_file);
print STDERR "-sample contains $num_total_rnaseq_frags rnaseq fragments\n";
########################################
## Map chimeric reads to genome features
my $cmd = "$UTILDIR/STAR-Fusion.map_chimeric_reads_to_genes "
. " --genome_lib_dir $genome_lib_dir "
. " -J $chimeric_out_junctions_file "
. " -O $preliminary_outdir/star-fusion.junction_breakpts_to_genes.txt"
. " --CPU $CPU ";
$pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/$out_prefix.map_chim_to_genes.ok"));
#############################################
## Filter out non-specific multimapping reads
$cmd = "$UTILDIR/STAR-Fusion.handle_multimapping_reads "
. " -J $preliminary_outdir/star-fusion.junction_breakpts_to_genes.txt "
. " --genome_lib_dir $genome_lib_dir "
. " --filt_file $preliminary_outdir/star-fusion.junction_breakpts_to_genes.txt.fail "
. " -M $MIN_PCT_MM_NONSPECIFIC "
. " > $preliminary_outdir/star-fusion.junction_breakpts_to_genes.txt.pass ";
$pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/$out_prefix.handle_multimappers.ok"));
###################################
## predict preliminary fusion pairs
$cmd = "$UTILDIR/STAR-Fusion.predict "
. " -J $preliminary_outdir/star-fusion.junction_breakpts_to_genes.txt.pass "
. " -O $preliminary_outdir/$out_prefix ";
if ($NO_REMOVE_DUPS) {
$cmd .= " --no_remove_dups ";
}
$pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/$out_prefix.STAR-Fusion.predict.ok"));
my $predicted_fusions_file = "$preliminary_outdir/$out_prefix.fusion_candidates.preliminary";
my $prelim_fusion_file = $predicted_fusions_file;
unless ($no_filter) {
## filter fusions
$cmd = "$UTILDIR/STAR-Fusion.filter "
. " --fusion_preds $prelim_fusion_file "
. " -J $chimeric_out_junctions_file "
. " --tmpdir $tmpdir "
. " --min_junction_reads $MIN_JUNCTION_READS "
. " --min_sum_frags $MIN_SUM_FRAGS "
. " --require_LDAS $REQUIRE_LDAS "
. " --max_promiscuity $MAX_PROMISCUITY "
. " --min_pct_dom_promiscuity $MIN_PCT_DOM_PROM "
. " --min_novel_junction_support $MIN_NOVEL_JUNCTION_SUPPORT "
. " --min_alt_pct_junction $MIN_ALT_PCT_JUNCTION "
. " --aggregate_novel_junction_dist $AGGREGATE_NOVEL_JUNCTION_DIST "
. " --min_spanning_frags_only $MIN_SPANNING_FRAGS_ONLY "
. " --genome_lib_dir $genome_lib_dir "
. " --out_prefix $preliminary_outdir/$out_prefix";
if ($NO_SINGLE_FUSION_PER_BREAKPOINT) {
$cmd .= " --no_single_fusion_per_breakpoint ";
}
if ($SKIP_EM_FLAG) {
$cmd .= " --skip_EM ";
}
$pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/STAR-Fusion.filter.ok"));
$prelim_fusion_file = "$preliminary_outdir/$out_prefix.fusion_candidates.preliminary.filtered";
}
if ( $num_total_rnaseq_frags > 0 && (! $SKIP_FFPM_FLAG)) {
## convert vals to FFPM
$cmd = "$UTILDIR/incorporate_FFPM_into_final_report.pl $prelim_fusion_file $num_total_rnaseq_frags > $prelim_fusion_file.FFPM";
$pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/FFPM.ok") );
$prelim_fusion_file = "$prelim_fusion_file.FFPM";
}
## add breakpoint / splice junction info
my $predicted_fusions_with_splice_info = "$predicted_fusions_file.wSpliceInfo";
$cmd = "$UTILDIR/append_breakpoint_junction_info.pl $prelim_fusion_file $genome_lib_dir > $predicted_fusions_with_splice_info";
$pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/predicted_fusions_with_splice_info.ok"));
## annotate fusions.
my $annotated_fusions_filename = "$predicted_fusions_with_splice_info.wAnnot";
$cmd = "$BASEDIR/FusionAnnotator/FusionAnnotator --genome_lib_dir $genome_lib_dir --annotate $predicted_fusions_with_splice_info > $annotated_fusions_filename";
$pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/fusion_annotator.ok"));
## various downstream fusion filters
$prelim_fusion_file = $annotated_fusions_filename;
# filter based on annotations
unless ($NO_ANNOTATION_FILTER) {
## filter by annotations
$cmd = "$BASEDIR/FusionFilter/util/filter_by_annotation_rules.pl --fusions $annotated_fusions_filename --genome_lib_dir $genome_lib_dir";
$pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/annot_filter.ok"));
$prelim_fusion_file = "$annotated_fusions_filename.annot_filter.pass";
}
unless ($NO_RT_ARTIFACT_FILTER) {
## filter out likely RT artifacts
$cmd = "$UTILDIR/filter_likely_RT_artifacts.pl --fusions $prelim_fusion_file --genome_lib_dir $genome_lib_dir";
$pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/RTartifact_filter.ok"));
$prelim_fusion_file = "$prelim_fusion_file.RTartifact.pass";
}
if ( ($num_total_rnaseq_frags > 0) && (! $SKIP_FFPM_FLAG) && $MIN_FFPM > 0) {
# filter based on min FFPM value
$cmd = "$UTILDIR/filter_by_min_FFPM.pl $prelim_fusion_file $MIN_FFPM";
$pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/FFPM_filter.ok") );
$prelim_fusion_file = "$prelim_fusion_file.minFFPM.$MIN_FFPM.pass";
}
# generate final fusion prediction output files:
my $final_output_fusions = "$out_prefix.fusion_predictions.tsv";
$pipeliner->add_commands(new Command("cp $prelim_fusion_file $final_output_fusions", "$checkpoints_dir/cp_final.ok"));
## if samples file for sc, separate by cell:
if ($samples_file) {
$pipeliner->add_commands(new Command("$UTILDIR/sc/starF_partition_final_by_sc.pl $final_output_fusions > $final_output_fusions.samples_deconvolved.tsv",
"$checkpoints_dir/samples_deconvolved.ok") );
## make abridged version.
$pipeliner->add_commands(new Command("$UTILDIR/column_exclusions.pl $final_output_fusions.samples_deconvolved.tsv JunctionReads,SpanningFrags > $final_output_fusions.samples_deconvolved.abridged.tsv",
"$checkpoints_dir/samples_deconvolved.abridged.ok") );
}
## make abridged versions:
my $abridged_final_output = $final_output_fusions;
$abridged_final_output =~ s/\.tsv$/.abridged.tsv/;
$pipeliner->add_commands(new Command("$UTILDIR/column_exclusions.pl $final_output_fusions JunctionReads,SpanningFrags > $abridged_final_output", "$checkpoints_dir/abridged_final"));
if ($EXAMINE_CODING_EFFECT) {
my $coding_eff_filename = $abridged_final_output;
$coding_eff_filename =~ s/\.tsv$/.coding_effect.tsv/;
$cmd = "$BASEDIR/FusionAnnotator/util/fusion_to_coding_region_effect.pl --fusions $abridged_final_output --genome_lib_dir $genome_lib_dir > $coding_eff_filename";
$pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/coding_eff.ok"));
$abridged_final_output = $coding_eff_filename;
}
if ($EXTRACT_FUSION_READS) {
my $cmd = "$UTILDIR/get_FUSION_EVIDENCE_fastqs.pl --fusions $out_prefix.fusion_predictions.tsv --output_prefix $out_prefix ";
if ($samples_file) {
$cmd .= " --samples_file $samples_file ";
}
else {
$cmd .= " --left_fq $left_fq_filename";
if ($right_fq_filename) {
$cmd .= " --right_fq $right_fq_filename";
}
}
$pipeliner->add_commands(new Command($cmd, "$checkpoints_dir/extract_fusion_reads.ok"));
}
$pipeliner->run();
if ($no_filter) {
print STDERR "\n\n *** Note: --no_filtering in effect, so outputs have not been filtered for likely false positives.\n";
print STDERR "\n\tSee output: $prelim_fusion_file\n\n";
}
else {
print STDERR "\n\n\t* STAR-Fusion complete. See output: $output_dir/$out_prefix.fusion_predictions.tsv (or .abridged.tsv version)\n\n\n";
}
if ($FUSIONINSPECTOR_MODE) {
my $FI_outdir = "$output_dir/FusionInspector-${FUSIONINSPECTOR_MODE}";
my $FI_cmd = "";
$FI_cmd = "$BASEDIR/FusionInspector/FusionInspector --fusions $abridged_final_output "
. " --out_prefix finspector "
. " --min_junction_reads $MIN_JUNCTION_READS "
. " --min_novel_junction_support $MIN_NOVEL_JUNCTION_SUPPORT "
. " --min_spanning_frags_only $MIN_SPANNING_FRAGS_ONLY "
. " --max_mate_dist $STAR_MAX_MATE_DIST "
. " --vis "
. " --max_promiscuity $MAX_PROMISCUITY "
. " --output_dir $FI_outdir "
. " --genome_lib_dir $genome_lib_dir "
. " --CPU $CPU ";
if ($FUSIONINSPECTOR_MODE eq 'inspect') {
## use the extracted fusion evidence reads
my $FI_left_fq = "$out_prefix.fusion_evidence_reads_1.fq";
$FI_cmd .= " --only_fusion_reads "
. "--fusion_contigs_only "
. " --left_fq $FI_left_fq";
if ($right_fq_filename) {
my $FI_right_fq = "$out_prefix.fusion_evidence_reads_2.fq";
$FI_cmd .= " --right_fq $FI_right_fq ";
}
$FI_cmd .= " --no_FFPM "; # FFPM would be meaningless in this context of fusion-only reads.
}
else {
# validate mode
# use all original reads.
if ($samples_file) {
$FI_cmd .= " --samples_file $samples_file ";
}
else {
$FI_cmd .= " --left_fq $left_fq_filename";
if ($right_fq_filename) {
$FI_cmd .= " --right_fq $right_fq_filename ";
}
}
}
if ($DENOVO_RECONSTRUCT) {
$FI_cmd .= " --include_Trinity ";
}
$FI_cmd .= " --annotate "; # just do it by default
if ($EXAMINE_CODING_EFFECT) {
$FI_cmd .= " --examine_coding_effect ";
}
if ($NO_REMOVE_DUPS) {
$FI_cmd .= " --no_remove_dups ";
}
if ($MISC_FI_OPTS) {
$FI_cmd .= " $MISC_FI_OPTS ";
}
$pipeliner->add_commands(new Command($FI_cmd, "_fi_${FUSIONINSPECTOR_MODE}_" . length($FI_cmd) . ".ok"));
$pipeliner->run();
}
exit(0);
}
####
sub missing_required_program_installed {
my (@required_progs) = @_;
my $missing = 0;
foreach my $prog (@required_progs) {
my $path = `which $prog`;
chomp $path;
unless ($path && $path =~ /\w/) {
$missing++;
print STDERR "Error, cannot locate required program: $prog\n";
}
else {
print STDERR "-found prog $prog at $path\n";
}
}
return($missing);
}
####
sub run_STAR {
my ($pipeliner, $left_fq_filename, $right_fq_filename, $read_group_ids) = @_;
my $maybe_tmpdir= defined($outTmpDir)? " --outTmpDir $outTmpDir " : "";
## run STAR to align reads:
my $cmd = "$STAR_PROG --genomeDir $star_index_dir "
. " --outReadsUnmapped None "
. " --chimSegmentMin 12 "
. " --chimJunctionOverhangMin 8 "
. " --chimOutJunctionFormat 1 " # new option for STAR 2.6.1a
. " --alignSJDBoverhangMin $STAR_SJDB_OVERHANG_MIN "
. " --alignMatesGapMax $STAR_MAX_MATE_DIST "
. " --alignIntronMax $STAR_MAX_MATE_DIST "
. " --alignSJstitchMismatchNmax 5 -1 5 5 " #which allows for up to 5 mismatches for non-canonical GC/AG, and AT/AC junctions, and any number of mismatches for canonical junctions (the default values 0 -1 0 0 replicate the old behavior (from AlexD)
. " --runThreadN $CPU"
. $maybe_tmpdir
. " --outSAMstrandField intronMotif "
. " --outSAMunmapped Within "
. " --alignInsertionFlush Right "
. " --alignSplicedMateMapLminOverLmate 0 "
. " --alignSplicedMateMapLmin 30 "
;
# from: https://github.com/alexdobin/STAR/issues/744
# --chimSegmentMin controls the minimum total length of the chimeric segment, which is calculated as a sum of alignment blocks of the segment.
# --chimJunctionOverhangMin, on the other hand, controls the minimum length of the block adjacent to the chimeric junction. This prevents chimeric junction from happening very close to the normal junction.
if ($STAR_SortedByCoordinate) {
$cmd .= " --limitBAMsortRAM $STAR_limitBAMsortRAM "
. " --outSAMtype BAM SortedByCoordinate ";
}
else {
$cmd .= " --outSAMtype BAM Unsorted "; # default, don't spend time and RAM sorting the data as not required for fusion-finding
}
my $reads = "$left_fq_filename $right_fq_filename";
if (length($reads) > 1000) {
my $star_params_file = "star_params." . time() . ".$$.txt";
open(my $ofh, ">$star_params_file") or die "Error, cannot write to file: $star_params_file";
print $ofh "readFilesIn $reads\n";
if ($read_group_ids) {
print $ofh "outSAMattrRGline $read_group_ids\n";
}
elsif ($outSAMattrRGline) {
print $ofh "outSAMattrRGline $outSAMattrRGline\n";
}
else {
print $ofh "outSAMattrRGline ID:GRPundef\n";
}
close $ofh;
$cmd .= " --parametersFiles $star_params_file ";
}
else {
$cmd .= " --readFilesIn $reads ";
if ($read_group_ids) {
$cmd .= " --outSAMattrRGline $read_group_ids ";
}
elsif ($outSAMattrRGline) {
$cmd .= "--outSAMattrRGline $outSAMattrRGline";
}
else {
$cmd .= " --outSAMattrRGline ID:GRPundef ";
}
}
# chim read multimapping settings
$cmd .= " --chimMultimapScoreRange $STAR_chimMultimapScoreRange "
. " --chimScoreJunctionNonGTAG $STAR_chimScoreJunctionNonGTAG "
. " --chimMultimapNmax $STAR_chimMultimapNmax "
. " --chimOutType Junctions WithinBAM "
. " --chimNonchimScoreDropMin $STAR_chimNonchimScoreDropMin ";
# merge overlapping reads
$cmd .= " --peOverlapNbasesMin $STAR_peOverlapNbasesMin --peOverlapMMp $STAR_peOverlapMMp ";
if ($USE_SHARED_MEMORY) {
$cmd .= " --genomeLoad LoadAndKeep ";
}
else {
# note, twopassMode is incompatible with shared memory (--genomeLoad LoadAndKeep)
$cmd .= " --genomeLoad NoSharedMemory ";
if ($STAR_twopass) {
$cmd .= " --twopassMode Basic ";
}
else {
$cmd .= " --twopassMode None ";
}
}
if ($left_fq_filename =~ /\.gz$/) {
$cmd .= " --readFilesCommand 'gunzip -c' ";
}
$cmd .= " --quantMode GeneCounts";
#$cmd .= " --limitOutSJcollapsed 10000000 ";
$pipeliner->add_commands(Command->new($cmd, "$checkpoints_dir/run_star_aligner.ok"));
$pipeliner->run();
return;
}
####
sub run_BBMerge {
my ($pipeliner, $left_fq_filename, $right_fq_filename, $read_group_ids) = @_;
my @left_fq_files = split(/,/, $left_fq_filename);
my @right_fq_files = split(/,/, $right_fq_filename);
my @read_groups = split(/ , /, $read_group_ids);
my @bbmerged_files;
while (@left_fq_files) {
my $left_fq_filename = shift @left_fq_files;
my $right_fq_filename = shift @right_fq_files;
unless ($right_fq_filename) {
confess "Error, need right fq filename corresponding to left fq file: $left_fq_filename";
}
my $read_group = shift @read_groups;
if ($read_group) {
$read_group =~ s/^ID://;
}
else {
$read_group = "singlesample";
}
my $cmd = "$PLUGINSDIR/bbmap/bbmerge.sh in1=$left_fq_filename in2=$right_fq_filename out=$read_group.bbmerge.fq";
$pipeliner->add_commands(Command->new($cmd, "$checkpoints_dir/$read_group.bbmerge.ok"));
# adjust the fq record names so we know these are the merged records later on.
#$pipeliner->add_commands(Command->new("$UTILDIR/make_merged_prefix_fq.pl bbmerged.fq > bbmerged.adj.fq", "$checkpoints_dir/bbmerge.add_merged_token.ok"));
push (@bbmerged_files, "$read_group.bbmerge.fq");
}
my $bbmerged_fq_files = join(",", @bbmerged_files);
return($bbmerged_fq_files);
}
####
sub parse_samples_file {
my ($samples_file) = @_;
my @read_groups;
my @left_read_files;
my @right_read_files;
open(my $fh, "$samples_file") or die "Error, cannot open file: $samples_file";
while (<$fh>) {
chomp;
my ($sample_name, $left_path, $right_path) = split(/\t/);
unless (-s $left_path) {
die "Error, cannot find non-emtpy file: $left_path";
}
push (@read_groups, "ID:$sample_name");
push (@left_read_files, &ensure_full_path($left_path));
if ($right_path) {
unless (-s $right_path) {
die "Error, cannot find non-empty file: $right_path";
}
push (@right_read_files, &ensure_full_path($right_path));
}
}
my $read_groups_str = join(" , ", @read_groups);
my $left_reads_str = join(",", @left_read_files);
my $right_reads_str = join(",", @right_read_files);
return($read_groups_str, $left_reads_str, $right_reads_str);
}
####
sub write_new_samples_file {
my ($new_samples_file, $read_group_ids, $left_fq_filename, $right_fq_filename) = @_;
open(my $ofh, ">$new_samples_file") or die "Error, cannot write to $new_samples_file";
my @read_group_ids = split(/ , /, $read_group_ids);
my @left_fq_files = split(/,/, $left_fq_filename);
my @right_fq_files = split(/,/, $right_fq_filename);
while (@read_group_ids) {
my $read_group = shift @read_group_ids;
my $left_fq = shift @left_fq_files;
my $right_fq = shift @right_fq_files;
$read_group =~ s/^ID://;
my $outline = join("\t", $read_group, $left_fq);
if ($right_fq) {
$outline .= "\t$right_fq";
}
$outline .= "\n";
print $ofh $outline;
}
close $ofh;
return;
}
####
sub check_compatible_STAR_version {
my ($star_prog, $min_star_major_minor_aref) = @_;
my $star_version_info = `$star_prog --version`;
chomp $star_version_info;
my @MIN_VERSION = @$min_star_major_minor_aref;
my $min_letter = pop @MIN_VERSION;
my $errmsg = "\n\n\tSorry, version [$star_version_info] is not supported. Please use at least STAR version " . join(".", @MIN_VERSION) . "$min_letter\n";
$errmsg .= "\tTo upgrade, obtain the latest STAR here: https://github.com/alexdobin/STAR/releases\n\n";
if ($star_version_info =~ /^(\d+)\.(\d+)\.(\d+)([a-z]?)/) {
my $major_ver = $1;
my $minor_ver = $2;
my $patch_ver = $3;
my $letter_ver = $4;
if ($major_ver < $min_star_major_minor_aref->[0]
||
($major_ver == $min_star_major_minor_aref->[0] && $minor_ver < $min_star_major_minor_aref->[1])
||
($major_ver == $min_star_major_minor_aref->[0] && $minor_ver == $min_star_major_minor_aref->[1] && $patch_ver < $min_star_major_minor_aref->[2])
||
($major_ver == $min_star_major_minor_aref->[0] && $minor_ver == $min_star_major_minor_aref->[1] && $patch_ver == $min_star_major_minor_aref->[2] && ord($letter_ver) < ord($min_star_major_minor_aref->[3]))
) {
print STDERR $errmsg;
exit(1);
}
else {
# all good. :-)
}
}
else {
print STDERR $errmsg;
exit(1);
}
return;
}
####
sub get_num_total_rnaseq_frags {
my ($chimeric_out_junctions_file) = @_;
my $read_count_line = `tail -n1 $chimeric_out_junctions_file`;
if ($read_count_line =~ /^\# Nreads (\d+)\tNreadsUnique (\d+)\tNreadsMulti (\d+)/) {
my $total_reads = $1;
my $num_reads_unique = $2;
my $num_reads_multi = $3;
return ($total_reads);
}
else {
my $v_letter = pop @MIN_STAR_MAJOR_MINOR_VERSION;
confess "Error, file: $chimeric_out_junctions_file doesn't have expected formatting. Please be sure to use the STAR aligner with min version: " . join(".", @MIN_STAR_MAJOR_MINOR_VERSION) . $v_letter;
}
}
####
sub validate_ctat_genome_lib {
my ($genome_lib_dir) = @_;
my $cmd = "$UTILDIR/validate_ctat_genome_lib.pl $genome_lib_dir";
my $ret = system($cmd);
if ($ret) {
die "\n\n\tERROR, ctat genome lib: $genome_lib_dir does not validate and may require indices to be rebuilt.\n\n"
. "\tVisit: https://github.com/STAR-Fusion/STAR-Fusion/wiki/rebuild_ctat_genome_lib_indices\n\tfor details.\n\n";
}
return;
}