https://github.com/smangul1/ROP
Revision a440e8016a3e20986809d19108be8b612e9bfa4f authored by Kevin Hsieh on 27 January 2020, 06:45:33 UTC, committed by Kevin Hsieh on 27 January 2020, 06:45:33 UTC
1 parent 2e08815
Raw File
Tip revision: a440e8016a3e20986809d19108be8b612e9bfa4f authored by Kevin Hsieh on 27 January 2020, 06:45:33 UTC
Remove broken conda clean option
Tip revision: a440e80
rop.sh
#!/usr/bin/env bash

# ------------------------------------------------------------------------------
# INTRO
# ------------------------------------------------------------------------------

set -e
echo '--------------------------------------------------------------------------------'
echo 'Read Origin Protocol: Main Program'
echo '--------------------------------------------------------------------------------'
DIR=`dirname $(readlink -f "$0")`
sed '/##/ q' "$DIR/README.md" | head -n -2 | tail -n +3
echo '--------------------------------------------------------------------------------'

# Add MiniConda to PATH if it's available.
if [ -d "$DIR/tools/MiniConda/bin" ]; then
    export PATH="$DIR/tools/MiniConda/bin:$PATH"
fi

# ------------------------------------------------------------------------------
# PARSE OPTIONS
# ------------------------------------------------------------------------------

# Test for getopt availability.
set +e
getopt --test
if [ $? -ne 4 ]; then
    echo "Error: Environment doesn't support getopt." >&2
    exit 1
fi
set -e

# Call getopt.
SHORT_OPTIONS='o:s:abzdfimpqxh'
LONG_OPTIONS='organism:,steps:,fasta,bam,gzip,dev,force,ignore-extensions,max,\
pe,quiet,commands,help'
set +e
PARSED=`getopt --options="$SHORT_OPTIONS" --longoptions="$LONG_OPTIONS" \
--name "$0" -- "$@"`
if [ $? -ne 0 ]; then
    exit 1  # getopt will have printed the error message
fi
set -e
eval set -- "$PARSED"

# Set default options.
ORGANISM='human'
STEPS='rdna reference repeats immune metaphlan viral fungi protozoa'
    # Non-default: lowq
    # Disabled: circrna bacteria
FASTA=false
BAM=false
GZIP=false
DEV=false
FORCE=false
IGNORE_EXTENSIONS=false
MAX=''
PE=''
QUIET=false
COMMANDS=false
UNMAPPED_READS=''
OUTPUT_DIR=''

# Review parsed options.
while true; do
    case "$1" in
        -o|--organism)
            # Organism to run ROP for.
            ORGANISM="$2"
            shift 2
            ;;
        -s|--steps)
            # Select the analysis modes to use.
            STEPS=`tr ',' ' ' <<<"$2"`
            shift 2
            ;;
        -a|--fasta)
            # Input unmapped reads in .fasta format instead of .fastq format.
            # Forcibly disables low-quality read filtering.
            FASTA=true
            shift
            ;;
        -b|--bam)
            # Input unmapped reads in .bam format instead of .fastq format.
            BAM=true
            shift
            ;;
        -z|--gzip)
            # gunzip the input file.
            GZIP=true
            shift
            ;;
        -d|--dev)
            # Keep intermediate FASTA files.
            DEV=true
            shift
            ;;
        -f|--force)
            # Overwrite the analysis destination directory.
            FORCE=true
            shift
            ;;
        -i|--ignore-extensions)
            # Ignore incorrect .fastq/.fq/.fasta/.fa file extensions.
            # Does not ignore incorrect .gz/.bam file extensions.
            IGNORE_EXTENSIONS=true
            shift
            ;;
        -m|--max)
            # Use a liberal threshold when remapping to reference.
            MAX='--max'
            shift
            ;;
        -p|--pe)
            # Not implemented (usage unclear).
            # Report the number of discordant read pairs, with reads from the
            # same pair classified into different classes.
            PE='--pe'
            shift
            ;;
        -q|--quiet)
            # Not implemented (usage unclear).
            # Suppress progress report and warnings.
            QUIET=true
            shift
            ;;
        -x|--commands)
            # Print all commands.
            COMMANDS=true
            shift
            ;;
        -h|--help)
            echo "Usage: $0 [-o ORGANISM] [-s STEPS] [-abz] [-dfimxh]" \
                "unmapped_reads output_dir" >&2
            exit 0
            ;;
        --)
            # Mandatory arguments.
            UNMAPPED_READS="$2"
            OUTPUT_DIR="$3"
            if [ "$UNMAPPED_READS" = '' ] || [ "$OUTPUT_DIR" = '' ]; then
                echo 'Error: Insufficient arguments.'
                exit 1
            fi
            shift 3
            break
            ;;
        *)
            echo "Error parsing options." >&2
            exit 1
            ;;
    esac
done

# Add all steps if selected.
if [ "$STEPS" = 'all' ]; then
    STEPS='lowq rdna reference repeats circrna immune microbiome'
fi

# Convert to absolute paths.
UNMAPPED_READS=`readlink -m "$UNMAPPED_READS"`
OUTPUT_DIR=`readlink -m "$OUTPUT_DIR"`

# Check if UNMAPPED_READS exists.
if [ ! -e "$UNMAPPED_READS" ]; then
    echo "Error: $UNMAPPED_READS doesn't exist." >&2
    exit 1
fi

# Check if OUTPUT_DIR exists, then make it.
if [ -d "$OUTPUT_DIR" ]; then
    if [ $FORCE = true ]; then
        rm -fr "$OUTPUT_DIR"
    else
        echo "Error: The directory $OUTPUT_DIR exists. Please choose a" \
            'different directory in which to save results of the analysis, or' \
            'use the -f option to overwrite the directory.' >&2
        exit 1
    fi
fi
mkdir -p "$OUTPUT_DIR"

# ------------------------------------------------------------------------------
# CONSTANTS
# ------------------------------------------------------------------------------

# Sample name and database location.
SAMPLE=`basename "$UNMAPPED_READS" | sed 's \([^\.]*\)\..* \1 '`
DB="$DIR/db_$ORGANISM"

# Perform lazy native installation if needed.
if [ ! -h "$DB" ] && [ ! -d "$DB" ]; then
    echo 'Performing a lazy native installation. This might take some time.' >&2
    cd "$DIR"
    ./install.sh -no $ORGANISM
fi

# Duplicate stdout and stderr to the log file. Print commands if selected.
touch "$OUTPUT_DIR/$SAMPLE--general.log"
exec &> >(tee -i "$OUTPUT_DIR/$SAMPLE--general.log")
if [ $COMMANDS = true ]; then
    set -x
fi
echo "Input file: $UNMAPPED_READS"

# Declare output directories.
declare -A DIRS=(
    ['01_lowq']="$OUTPUT_DIR/01_lowq"
    ['02_rdna']="$OUTPUT_DIR/02_rdna"
    ['03_reference']="$OUTPUT_DIR/03_reference"
    ['04_repeats']="$OUTPUT_DIR/04_repeats"
    ['05_circrna']="$OUTPUT_DIR/05_circrna"
    ['06_immune']="$OUTPUT_DIR/06_immune"
    ['07a_metaphlan']="$OUTPUT_DIR/07a_metaphlan"
    ['07b_bacteria']="$OUTPUT_DIR/07b_bacteria"
    ['07c_viral']="$OUTPUT_DIR/07c_viral"
    ['07d_fungi']="$OUTPUT_DIR/07d_fungi"
    ['07e_protozoa']="$OUTPUT_DIR/07e_protozoa"
)

# Make output directories if necessary.
for val in "${DIRS[@]}"; do
    mkdir -p "$val"
done

# Declare intermediate files.
declare -A INTFNS=(
    # Step 1 (lowq).
    ['01_lowq_post']="${DIRS['01_lowq']}/$SAMPLE--01_lowq_post.fasta"

    # Step 2 (rdna).
    ['02_rdna_output']="${DIRS['02_rdna']}/$SAMPLE--02_rdna_output.bam"
    ['02_rdna_reads']="${DIRS['02_rdna']}/$SAMPLE--02_rdna_reads.txt"
    ['02_rdna_post']="${DIRS['02_rdna']}/$SAMPLE--02_rdna_post.fasta"

    # Step 3 (reference).
    ['03_reference_genomeoutput']="${DIRS['03_reference']}/$SAMPLE--03_reference_genomeoutput.bam"
    ['03_reference_transcriptomeoutput']="${DIRS['03_reference']}\
/$SAMPLE--03_reference_transcriptomeoutput.bam"
    ['03_reference_reads']="${DIRS['03_reference']}/$SAMPLE--03_reference_reads.txt"
    ['03_reference_post']="${DIRS['03_reference']}/$SAMPLE--03_reference_post.fasta"

    # Step 4 (repeats).
    ['04_repeats_output']="${DIRS['04_repeats']}/$SAMPLE--04_repeats_output.tsv"
    ['04_repeats_reads']="${DIRS['04_repeats']}/$SAMPLE--04_repeats_reads.txt"
    ['04_repeats_post']="${DIRS['04_repeats']}/$SAMPLE--04_repeats_post.fasta"

    # Step 5 (circrna).
    ['05_circrna_reads']="${DIRS['05_circrna']}/$SAMPLE--05_circrna_reads.txt"
    ['05_circrna_post']="${DIRS['05_circrna']}/$SAMPLE--05_circrna_post.fasta"

    # Step 6 (immune).
    ['06_immune_output']="${DIRS['06_immune']}/$SAMPLE--06_immune_output.cdr3"
    ['06_immune_clonality']="${DIRS['06_immune']}/$SAMPLE--06_immune_clonality"
    ['06_immune_reads']="${DIRS['06_immune']}/$SAMPLE--06_immune_reads.txt"
    ['06_immune_post']="${DIRS['06_immune']}/$SAMPLE--06_immune_post.fasta"

    # Step 7a (metaphlan).
    # No post file (don't reduce unmapped reads using MetaPhlAn results).
    ['07a_metaphlan_map']="${DIRS['07a_metaphlan']}/$SAMPLE--07a_metaphlan_map.map"
    ['07a_metaphlan_bowtie2out']="${DIRS['07a_metaphlan']}/$SAMPLE--07a_metaphlan_bowtie2out.txt"
    ['07a_metaphlan_output']="${DIRS['07a_metaphlan']}/$SAMPLE--07a_metaphlan_output.tsv"

    # Step 7b (bacteria).
    ['07b_bacteria_output']="${DIRS['07b_bacteria']}/$SAMPLE--07b_bacteria_output.bam"
    ['07b_bacteria_reads']="${DIRS['07b_bacteria']}/$SAMPLE--07b_bacteria_reads.txt"
    ['07b_bacteria_post']="${DIRS['07b_bacteria']}/$SAMPLE--07b_bacteria_post.fasta"

    # Step 7c (viral).
    ['07c_viral_output']="${DIRS['07c_viral']}/$SAMPLE--07c_viral_output.bam"
    ['07c_viral_viproutput']="${DIRS['07c_viral']}/$SAMPLE--07c_viral_viproutput.bam"
    ['07c_viral_reads']="${DIRS['07c_viral']}/$SAMPLE--07c_viral_reads.txt"
    ['07c_viral_post']="${DIRS['07c_viral']}/$SAMPLE--07c_viral_post.fasta"

    # Step 7d (fungi).
    ['07d_fungi_output']="${DIRS['07d_fungi']}/$SAMPLE--07d_fungi_output.bam"
    ['07d_fungi_reads']="${DIRS['07d_fungi']}/$SAMPLE--07d_fungi_reads.txt"
    ['07d_fungi_post']="${DIRS['07d_fungi']}/$SAMPLE--07d_fungi_post.fasta"

    # Step 6e (protozoa).
    ['07e_protozoa_output']="${DIRS['07e_protozoa']}/$SAMPLE--07e_protozoa_output.bam"
    ['07e_protozoa_reads']="${DIRS['07e_protozoa']}/$SAMPLE--07e_protozoa_reads.txt"
    ['07e_protozoa_post']="${DIRS['07e_protozoa']}/$SAMPLE--07e_protozoa_post.fasta"

    ['unaccounted']="$OUTPUT_DIR/$SAMPLE--unaccounted.fasta"
)

# Declare log files.
declare -A LOGFNS=(
    #['01_lowq']="${DIRS['01_lowq']}/$SAMPLE--01_lowq.log"
    ['02_rdna']="${DIRS['02_rdna']}/$SAMPLE--02_rdna.log"
    ['03_reference']="${DIRS['03_reference']}/$SAMPLE--03_reference.log"
    ['04_repeats']="${DIRS['04_repeats']}/$SAMPLE--04_repeats.log"
    ['05_circrna']="${DIRS['05_circrna']}/$SAMPLE--05_circrna.log"
    ['06_immune']="${DIRS['06_immune']}/$SAMPLE--06_immune.log"
    ['07a_metaphlan']="${DIRS['07a_metaphlan']}/$SAMPLE--07a_metaphlan.log"
    ['07b_bacteria']="${DIRS['07b_bacteria']}/$SAMPLE--07b_bacteria.log"
    ['07c_viral']="${DIRS['07c_viral']}/$SAMPLE--07c_viral.log"
    ['07d_fungi']="${DIRS['07d_fungi']}/$SAMPLE--07d_fungi.log"
    ['07e_protozoa']="${DIRS['07e_protozoa']}/$SAMPLE--07e_protozoa.log"
    ['counts']="$OUTPUT_DIR/$SAMPLE--counts.csv"
)

# ------------------------------------------------------------------------------
# UTILITY
# ------------------------------------------------------------------------------

reads_present () {
    if [ `wc -l <"$1"` -le 1 ]; then
        echo 'No more reads!'
        return 1  # false
    else
        return 0  # true
    fi
}

clean () {
    if [ $DEV = false ]; then
        rm "$1"
    fi
}

# ------------------------------------------------------------------------------
# PREPROCESS INPUT FILE
# ------------------------------------------------------------------------------

# Do preprocessing in OUTPUT_DIR.
cd "$OUTPUT_DIR"
cp "$UNMAPPED_READS" .
current=`basename "$UNMAPPED_READS"`

# Unpack the unmapped reads if -z and/or -b are selected.
if [ $GZIP == true ]; then
    echo 'Unpacking gzip...'
    post=`basename "$current" .gz`
    if [ "$post" == "$current" ]; then
        echo 'Error: input file missing .gz extension' >&2
        exit 1
    fi
    gunzip "$current"
    current="$post"
fi
if [ $BAM == true ]; then
    echo 'Unpacking bam...'
    post="$(basename "$current" .bam).fastq"
    if [ "$post" == "$current.fastq" ]; then
        echo 'Error: input file missing .bam extension' >&2
        exit 1
    fi
    samtools bam2fq "$current" >"$post"
    clean "$current"
    current="$post"
fi

# Inspect the input file, then restore current to a full path.
if [ $FASTA == true ]; then
    if [ $IGNORE_EXTENSIONS = false ] && \
        [ `basename $current .fasta` == "$current" ] && \
        [ `basename $current .fa` == "$current" ]; then
        echo 'Error: input file missing .fasta/.fa extension' >&2
        exit 1
    fi
    N=`grep -c '^>' "$current"`
    READ_LENGTH=$(($(sed -n '2 p' <"$current" | wc -m) - 1))
else
    if [ $IGNORE_EXTENSIONS = false ] && \
        [ `basename $current .fastq` == "$current" ] && \
        [ `basename $current .fq` == "$current" ]; then
        echo 'Error: input file missing .fastq/.fq extension' >&2
        exit 1
    fi
    line_count=`wc -l <"$current"`
    N=`bc <<<"$line_count/4"`
    READ_LENGTH=$(($(sed -n '2 p' <"$current" | wc -m) - 1))
fi
echo "Processing $N unmapped reads. The first unmapped read has length $READ_LENGTH."
current=`readlink -e "$current"`

# Record the number of reads accounted for in each step.
declare -A n_reads=(
    ['01_lowq']=0
    ['02_rdna']=0
    ['03_reference']=0
    ['04_repeats']=0
    ['05_circrna']=0
    ['05_immune']=0
    ['07b_bacteria']=0
    ['07c_viral']=0
    ['07d_fungi']=0
    ['07e_protozoa']=0
)

# ------------------------------------------------------------------------------
# 1. LOW QUALITY READ MARKING
# ------------------------------------------------------------------------------

echo "1. Low quality read marking (-s lowq)..."
cd "${DIRS['01_lowq']}"
post="${INTFNS['01_lowq_post']}"

if ! grep -q 'lowq' <<<"$STEPS" || [ $FASTA = true ] || \
    ! reads_present "$current"; then
    echo '--> Skipped low quality read marking.'
    
    # Must convert to fasta to continue.
    if [ $FASTA = false ]; then
        fastq_to_fasta -n <"$current" >"$post"
        clean "$current"
        current="$post"
    fi
else
    n_reads['01_lowq']=`python "$DIR/helper.py" lowq $MAX $PE \
        --pre "$current" --post "$post"`
    echo "--> Marked lowq in the names of ${n_reads['01_lowq']} low quality" \
        'reads.'
    echo '    These reads are not filtered.'
    clean "$current"
    current="$post"
fi

# ------------------------------------------------------------------------------
# 2. rDNA PROFILING
# ------------------------------------------------------------------------------

echo "2. rDNA profiling (-s rdna)..."
cd "${DIRS['02_rdna']}"
post="${INTFNS['02_rdna_post']}"

if ! grep -q 'rdna' <<<"$STEPS" || ! reads_present "$current"; then
    echo '--> Skipped rDNA profiling.'
else
    bowtie2 -f -x "$DB/ribosomal.DNA/ribosomal.DNA" --end-to-end -D 15 -R 2 \
        -L 22 -i S,1,1.15 "$current" 2>"${LOGFNS['02_rdna']}" \
        | samtools sort - >${INTFNS['02_rdna_output']}
    samtools index "${INTFNS['02_rdna_output']}"
    n_reads['02_rdna']=`python "$DIR/helper.py" rdna $MAX $PE \
        -i "${INTFNS['02_rdna_output']}" \
        -o ${INTFNS['02_rdna_reads']} \
        --pre "$current" --post "$post"`
    echo "--> Filtered ${n_reads['02_rdna']} reads from ribosomal DNA."
    clean "$current"
    current="$post"
fi

# ------------------------------------------------------------------------------
# 3. REMAPPING TO REFERENCE
# ------------------------------------------------------------------------------

echo '3. Remapping to reference (-s reference)...'
cd "${DIRS['03_reference']}"
post="${INTFNS['03_reference_post']}"

if ! grep -q 'reference' <<<"$STEPS" || ! reads_present "$current"; then
    echo '--> Skipped remapping to reference.'
else
    bwa mem "$DB/BWA.index/genome.fa" "$current" \
        2>>"${LOGFNS['03_reference']}" \
        | samtools sort - >"${INTFNS['03_reference_genomeoutput']}"
    bwa mem "$DB/BWA.index/isoforms_GRCh38_Ensembl.fasta" "$current" \
        2>>"${LOGFNS['03_reference']}" \
        | samtools sort - >"${INTFNS['03_reference_transcriptomeoutput']}"
    samtools index "${INTFNS['03_reference_genomeoutput']}"
    samtools index "${INTFNS['03_reference_transcriptomeoutput']}"
    n_reads['03_reference']=`python "$DIR/helper.py" reference $MAX $PE \
        -i "${INTFNS['03_reference_genomeoutput']},${INTFNS['03_reference_transcriptomeoutput']}" \
        -o "${INTFNS['03_reference_reads']}" \
        --pre "$current" --post "$post"`
    echo "--> Filtered ${n_reads['03_reference']} reads from reference genome" \
        'or transcriptome.'
    clean "$current"
    current="$post"
fi

# ------------------------------------------------------------------------------
# 4. REPEAT PROFILING
# ------------------------------------------------------------------------------

echo '4. Repeat profiling (-s repeats)...'
cd "${DIRS['04_repeats']}"
post="${INTFNS['04_repeats_post']}"

if ! grep -q 'repeats' <<<"$STEPS" || ! reads_present "$current"; then
    echo '--> Skipped repeat profiling.'
else
    blastn -task megablast -index_name "$DB/repeats/repbase.fa" \
        -use_index true -query "$current" -db "$DB/repeats/repbase.fa" \
        -outfmt 6 -evalue 1e-05 >"${INTFNS['04_repeats_output']}" \
        2>"${LOGFNS['04_repeats']}"
    n_reads['04_repeats']=`python "$DIR/helper.py" repeats $MAX $PE \
        -i "${INTFNS['04_repeats_output']}" \
        -o "${INTFNS['04_repeats_reads']}" \
        --pre "$current" --post "$post"`
    echo "--> Filtered ${n_reads['04_repeats']} reads from repeat sequences."
    clean "$current"
    current="$post"
fi

# ------------------------------------------------------------------------------
# 5. CIRCULAR RNA PROFILING
# ------------------------------------------------------------------------------

echo '5. Circular RNA profiling (-s circrna)...'
cd "${DIRS['05_circrna']}"
post="${INTFNS['05_circrna_post']}"

if ! grep -q 'circrna' <<<"$STEPS" || ! reads_present "$current" \
    || ! grep -q 'override' <<<"$STEPS"; then  # Disabled (no database).
    echo '--> Skipped circular RNA profiling.'
else  # WARNING: This branch is untested!
    tophat2 -o . --fusion-search --keep-fasta-order --no-coverage-search \
        "$DB/Bowtie2Index/genome" "$current" 2>"${LOGFNS['05_circrna']}"
    samtools bam2fq 'accepted_hits.bam' >'accepted_hits.fastq'
    n_reads['05_circrna']=`python "$DIR/helper.py" circrna $MAX $PE \
        -i 'accepted_hits.fastq' \
        -o "${INTFNS['05_circrna_reads']}" \
        --pre "$current" --post "$post"`
    echo "--> Filtered ${n_reads['05_circrna']} reads from circular RNA."
    clean "$current"
    current="$post"
fi

# ------------------------------------------------------------------------------
# 6. IMMUNE PROFILING
# ------------------------------------------------------------------------------

echo '6. Immune profiling (-s immune)...'
cd "${DIRS['06_immune']}"
post="${INTFNS['06_immune_post']}"

if ! grep -q 'immune' <<<"$STEPS" || ! reads_present "$current"; then
    echo '--> Skipped immune profiling.'
else
    python "$DIR/tools/imrep/imrep.py" -f -1 --extendedOutput "$current" \
        "${INTFNS['06_immune_output']}" &>"${LOGFNS['06_immune']}"
    python "$DIR/tools/imrep/clonality.py" \
        "${INTFNS['06_immune_output']}" \
        "${INTFNS['06_immune_clonality']}" &>>"${LOGFNS['06_immune']}"
    n_reads['06_immune']=`python "$DIR/helper.py" immune $MAX $PE \
        -i $(ls full_cdr3_*) \
        -o "${INTFNS['06_immune_reads']}" \
        --pre "$current" --post "$post"`
    echo "--> Filtered ${n_reads['06_immune']} reads from T and B cell"\
        'repetoires.'
    clean "$current"
    current="$post"
fi

# ------------------------------------------------------------------------------
# 7. MICROBIOME PROFILING
# ------------------------------------------------------------------------------

echo '7a. MetaPhlAn profiling (-s metaphlan)...'
cd "${DIRS['07a_metaphlan']}"
# No post file (don't reduce unmapped reads using MetaPhlAn results).

if ! grep -qE 'metaphlan|microbiome' <<<"$STEPS" || ! reads_present "$current";
then
    echo '--> Skipped MetaPhlAn profiling.'
else
    python "$DIR/tools/metaphlan2/metaphlan2.py" "$current" \
        --input_type multifasta --nproc 8 \
        --bowtie2out "${INTFNS['07a_metaphlan_bowtie2out']}" \
        >"${INTFNS['07a_metaphlan_output']}" 2>"${LOGFNS['07a_metaphlan']}"
    n_reads_07a_metaphlan=`wc -l <"${INTFNS['07a_metaphlan_output']}"`
    echo "--> Identified $n_reads_07a_metaphlan reads using MetaPhlAn."
    echo '    These reads are neither filtered nor included in the total.'
    # Don't clean or change $current.
fi

echo '7b. Bacterial profiling (-s bacteria)...'
cd "${DIRS['07b_bacteria']}"
post="${INTFNS['07b_bacteria_post']}"

if ! grep -qE 'bacteria|microbiome' <<<"$STEPS" || ! reads_present "$current" \
    || ! grep -q 'override' <<<"$STEPS"; then  # Disabled (no database).
    echo '--> Skipped bacterial profiling.'
else
    bwa mem "$DB/bacteria/bacteria.ncbi.february.3.2018.fasta" "$current" \
        | samtools sort - >"${INTFNS['07b_bacteria_output']}"
    n_reads['07b_bacteria']=`python "$DIR"/helper.py microbiome $MAX $PE \
        -i "${INTFNS['07b_bacteria_output']}" \
        -o "${INTFNS['07b_bacteria_reads']}" \
        --pre "$current" --post "$post"`
    echo "--> Filtered ${n_reads['07b_bacteria']} reads from bacterial genomes."
    clean "$current"
    current="$post"
fi

echo '7c. Viral profiling (-s viral)...'
cd "${DIRS['07c_viral']}"
post="${INTFNS['07c_viral_post']}"

if ! grep -qE 'viral|microbiome' <<<"$STEPS" || ! reads_present "$current"; then
    echo '--> Skipped viral profiling.'
else
    bwa mem -a "$DB/viral/viral.ncbi.february.3.2018.fasta" "$current" \
        2>"${LOGFNS['07c_viral']}" \
        | samtools sort - >"${INTFNS['07c_viral_output']}"
    bwa mem -a "$DB/viral.vipr/NONFLU_All.fastq" "$current" \
        2>>"${LOGFNS['07c_viral']}" \
        | samtools sort - >"${INTFNS['07c_viral_viproutput']}"
    samtools index "${INTFNS['07c_viral_output']}"
    samtools index "${INTFNS['07c_viral_viproutput']}"
    n_reads['07c_viral']=`python "$DIR/helper.py" microbiome $MAX $PE \
        -i "${INTFNS['07c_viral_output']},${INTFNS['07c_viral_viproutput']}" \
        -o "${INTFNS['07c_viral_reads']}" \
        --pre "$current" --post "$post"`
    echo "--> Filtered ${n_reads['07c_viral']} reads from viral genomes."
    clean "$current"
    current="$post"
fi

echo '7d. Fungal profiling (-s fungi)...'
cd "${DIRS['07d_fungi']}"
post="${INTFNS['07d_fungi_post']}"

if ! grep -qE 'fungi|microbiome' <<<"$STEPS" || ! reads_present "$current"; then
    echo '--> Skipped fungal profiling.'
else
    bwa mem -a "$DB/fungi/fungi.ncbi.february.3.2018.fasta" "$current" \
        2>"${LOGFNS['07d_fungi']}" \
        | samtools sort - >"${INTFNS['07d_fungi_output']}"
    samtools index "${INTFNS['07d_fungi_output']}"
    n_reads['07d_fungi']=`python "$DIR/helper.py" microbiome $MAX $PE \
        -i "${INTFNS['07d_fungi_output']}" \
        -o "${INTFNS['07d_fungi_reads']}" \
        --pre "$current" --post "$post"`
    echo "--> Filtered ${n_reads['07d_fungi']} reads from fungal genomes."
    clean "$current"
    current="$post"
fi

echo '7e. Protozoan profiling (-s protozoa)...'
cd "${DIRS['07e_protozoa']}"
post="${INTFNS['07e_protozoa_post']}"

if ! grep -qE 'protozoa|microbiome' <<<"$STEPS" || ! reads_present "$current"; then
    echo '--> Skipped protozoan profiling.'
else
    bwa mem -a "$DB/protozoa/protozoa.ncbi.february.3.2018.fasta" "$current" \
        2>"${LOGFNS['07e_protozoa']}" \
        | samtools sort - >"${INTFNS['07e_protozoa_output']}"
    samtools index "${INTFNS['07e_protozoa_output']}"
    n_reads['07e_protozoa']=`python "$DIR/helper.py" microbiome $MAX $PE \
        -i "${INTFNS['07e_protozoa_output']}" \
        -o "${INTFNS['07e_protozoa_reads']}" \
        --pre "$current" --post "$post"`
    echo "--> Filtered ${n_reads['07e_protozoa']} reads from protozoan genomes."
    clean "$current"
    current="$post"
fi

# ------------------------------------------------------------------------------
# CLEANUP
# ------------------------------------------------------------------------------

# Revise low quality read count.
if grep -q 'lowq' <<<"$STEPS"; then
    n_reads['01_lowq']=`grep -c '^>lowq_' "$current"`
fi

# Sum accounted reads and write to file.
steps=''
counts=''
sum=0
for key in "${!n_reads[@]}"; do
    steps+="$key,"
    counts+="${n_reads[$key]},"
    ((sum += ${n_reads[$key]})) || true
done
steps=`sed 's .$  ' <<<"$steps"`
counts=`sed 's .$  ' <<<"$counts"`
pct=`bc -l <<<"scale=2; 100*$sum/$N"`
echo "$steps" >"${LOGFNS['counts']}"
echo "$counts" >>"${LOGFNS['counts']}"

# Unaccounted reads.
cp "$current" "${INTFNS['unaccounted']}"
clean "$current"

# Final message.
echo "Summary: ROP has accounted for $sum reads ($pct% of unmapped reads)."
if grep -q 'lowq' <<<"$STEPS"; then
    echo "Of those, ${n_reads['01_lowq']} were considered accounted due to low quality."
fi
back to top