https://github.com/Gregor-Mendel-Institute/RKP2021-CMT3
Raw File
Tip revision: 89d7e2ea78af1969bb161640baed09296ed2485f authored by Papareddy on 23 April 2021, 11:18:44 UTC
Update README.md
Tip revision: 89d7e2e
main.nf
#!/usr/bin/env nextflow
/*
========================================================================================
                         nf-core/chipseq
========================================================================================
 nf-core/chipseq Analysis Pipeline.
 #### Homepage / Documentation
 https://github.com/nf-core/chipseq
----------------------------------------------------------------------------------------
*/

def helpMessage() {
    log.info nfcoreHeader()
    log.info"""
    Usage:

    The typical command for running the pipeline is as follows:

      nextflow run nf-core/chipseq --input design.csv --genome GRCh37 -profile docker

    Mandatory arguments:
      --input [file]                  Comma-separated file containing information about the samples in the experiment (see docs/usage.md) (Default: './design.csv')
      --fasta [file]                  Path to Fasta reference. Not mandatory when using reference in iGenomes config via --genome (Default: false)
      --gtf [file]                    Path to GTF file. Not mandatory when using reference in iGenomes config via --genome (Default: false)
      -profile [str]                  Configuration profile to use. Can use multiple (comma separated)
                                      Available: conda, docker, singularity, awsbatch, test

    Generic
      --single_end [bool]             Specifies that the input is single-end reads (Default: false)
      --seq_center [str]              Sequencing center information to be added to read group of BAM files (Default: false)
      --fragment_size [int]           Estimated fragment size used to extend single-end reads (Default: 200)


    References                        If not specified in the configuration file or you wish to overwrite any of the references
      --genome [str]                  Name of iGenomes reference (Default: false)
      --bwa_index [file]              Full path to directory containing BWA index including base name i.e. /path/to/index/genome.fa (Default: false)
      --gene_bed [file]               Path to BED file containing gene intervals (Default: false)
      --macs_gsize [str]              Effective genome size parameter required by MACS2. If using iGenomes config, values have only been provided when --genome is set as GRCh37, GRCm38, hg19, mm10, BDGP6 and WBcel235 (Default: false)
      --blacklist [file]              Path to blacklist regions (.BED format), used for filtering alignments (Default: false)
      --save_reference [bool]         If generated by the pipeline save the BWA index in the results directory (Default: false)

    Trimming
      --clip_r1 [int]                 Instructs Trim Galore to remove bp from the 5' end of read 1 (or single-end reads) (Default: 0)
      --clip_r2 [int]                 Instructs Trim Galore to remove bp from the 5' end of read 2 (paired-end reads only) (Default: 0)
      --three_prime_clip_r1 [int]     Instructs Trim Galore to remove bp from the 3' end of read 1 AFTER adapter/quality trimming has been performed (Default: 0)
      --three_prime_clip_r2 [int]     Instructs Trim Galore to re move bp from the 3' end of read 2 AFTER adapter/quality trimming has been performed (Default: 0)
      --trim_nextseq [int]            Instructs Trim Galore to apply the --nextseq=X option, to trim based on quality after removing poly-G tails (Default: 0)
      --skip_trimming [bool]          Skip the adapter trimming step (Default: false)
      --save_trimmed [bool]           Save the trimmed FastQ files in the results directory (Default: false)

    Alignments
      --bwa_min_score [int]           Don’t output BWA MEM alignments with score lower than this parameter (Default: false)
      --keep_dups [bool]              Duplicate reads are not filtered from alignments (Default: false)
      --keep_multi_map [bool]         Reads mapping to multiple locations are not filtered from alignments (Default: false)
      --save_align_intermeds [bool]   Save the intermediate BAM files from the alignment step - not done by default (Default: false)

    Peaks
      --narrow_peak [bool]            Run MACS2 in narrowPeak mode (Default: false)
      --broad_cutoff [float]          Specifies broad cutoff value for MACS2. Only used when --narrow_peak isnt specified (Default: 0.1)
      --macs_fdr [float]              Minimum FDR (q-value) cutoff for peak detection, --macs_fdr and --macs_pvalue are mutually exclusive (Default: false)
      --macs_pvalue [float]           p-value cutoff for peak detection, --macs_fdr and --macs_pvalue are mutually exclusive (Default: false)
      --min_reps_consensus [int]      Number of biological replicates required from a given condition for a peak to contribute to a consensus peak (Default: 1)
      --save_macs_pileup [bool]       Instruct MACS2 to create bedGraph files normalised to signal per million reads (Default: false)
      --skip_peak_qc [bool]           Skip MACS2 peak QC plot generation (Default: false)
      --skip_peak_annotation [bool]   Skip annotation of MACS2 and consensus peaks with HOMER (Default: false)
      --skip_consensus_peaks [bool]   Skip consensus peak generation (Default: false)

    Differential analysis
      --deseq2_vst [bool]             Use vst transformation instead of rlog with DESeq2 (Default: false)
      --skip_diff_analysis [bool]     Skip differential accessibility analysis (Default: false)

    QC
      --skip_fastqc [bool]            Skip FastQC (Default: false)
      --skip_picard_metrics [bool]    Skip Picard CollectMultipleMetrics (Default: false)
      --skip_preseq [bool]            Skip Preseq (Default: false)
      --skip_plot_profile [bool]      Skip deepTools plotProfile (Default: false)

      --skip_spp [bool]               Skip Phantompeakqualtools (Default: false)
      --skip_igv [bool]               Skip IGV (Default: false)
      --skip_multiqc [bool]           Skip MultiQC (Default: false)

    Other
      --outdir [file]                 The output directory where the results will be saved (Default: './results')
      --publish_dir_mode [str]        Mode for publishing results in the output directory. Available: symlink, rellink, link, copy, copyNoFollow, move (Default: copy)
      --email [email]                 Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits (Default: false)
      --email_on_fail [email]         Same as --email, except only send mail if the workflow is not successful (Default: false)
      --max_multiqc_email_size [str]  Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB)
      -name [str]                     Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic (Default: false)

    AWSBatch options:
      --awsqueue [str]                The AWSBatch JobQueue that needs to be set when running on AWSBatch
      --awsregion [str]               The AWS Region for your AWS Batch job to run on
      --awscli [str]                  Path to the AWS CLI tool
    """.stripIndent()
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                SET UP CONFIGURATION VARIABLES                       -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

// Show help message
if (params.help) {
    helpMessage()
    exit 0
}

// Has the run name been specified by the user?
// this has the bonus effect of catching both -name and --name
custom_runName = params.name
if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) {
    custom_runName = workflow.runName
}

////////////////////////////////////////////////////
/* --         DEFAULT PARAMETER VALUES         -- */
////////////////////////////////////////////////////

// Check if genome exists in the config file
if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) {
    exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}"
}

// Configurable variables
params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false
params.bwa_index = params.genome ? params.genomes[ params.genome ].bwa ?: false : false
params.gtf = params.genome ? params.genomes[ params.genome ].gtf ?: false : false
params.gene_bed = params.genome ? params.genomes[ params.genome ].bed12 ?: false : false

params.blacklist = params.genome ? params.genomes[ params.genome ].blacklist ?: false : false
params.anno_readme = params.genome ? params.genomes[ params.genome ].readme ?: false : false

// Global variables


////////////////////////////////////////////////////
/* --          CONFIG FILES                    -- */
////////////////////////////////////////////////////

// Pipeline config
ch_multiqc_config = file("$baseDir/assets/multiqc_config.yaml", checkIfExists: true)
ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty()
ch_output_docs = file("$baseDir/docs/output.md", checkIfExists: true)
ch_output_docs_images = file("$baseDir/docs/images/", checkIfExists: true)

// JSON files required by BAMTools for alignment filtering
if (params.single_end) {
    ch_bamtools_filter_config = file(params.bamtools_filter_se_config, checkIfExists: true)
} else {
    ch_bamtools_filter_config = file(params.bamtools_filter_pe_config, checkIfExists: true)
}

// Header files for MultiQC
ch_peak_count_header = file("$baseDir/assets/multiqc/peak_count_header.txt", checkIfExists: true)
ch_frip_score_header = file("$baseDir/assets/multiqc/frip_score_header.txt", checkIfExists: true)
ch_peak_annotation_header = file("$baseDir/assets/multiqc/peak_annotation_header.txt", checkIfExists: true)
ch_deseq2_pca_header = file("$baseDir/assets/multiqc/deseq2_pca_header.txt", checkIfExists: true)
ch_deseq2_clustering_header = file("$baseDir/assets/multiqc/deseq2_clustering_header.txt", checkIfExists: true)
ch_spp_correlation_header = file("$baseDir/assets/multiqc/spp_correlation_header.txt", checkIfExists: true)
ch_spp_nsc_header = file("$baseDir/assets/multiqc/spp_nsc_header.txt", checkIfExists: true)
ch_spp_rsc_header = file("$baseDir/assets/multiqc/spp_rsc_header.txt", checkIfExists: true)

////////////////////////////////////////////////////
/* --          VALIDATE INPUTS                 -- */
////////////////////////////////////////////////////

if (params.input)     { ch_input = file(params.input, checkIfExists: true) } else { exit 1, 'Samples design file not specified!' }
if (params.gtf)       { ch_gtf = file(params.gtf, checkIfExists: true) } else { exit 1, 'GTF annotation file not specified!' }
if (params.gene_bed)  { ch_gene_bed = file(params.gene_bed, checkIfExists: true) }


if (params.fasta) {
    lastPath = params.fasta.lastIndexOf(File.separator)
    bwa_base = params.fasta.substring(lastPath+1)
    ch_fasta = file(params.fasta, checkIfExists: true)
} else {
    exit 1, 'Fasta file not specified!'
}

if (params.bwa_index) {
    lastPath = params.bwa_index.lastIndexOf(File.separator)
    bwa_dir  = params.bwa_index.substring(0,lastPath+1)
    bwa_base = params.bwa_index.substring(lastPath+1)
    Channel
        .fromPath(bwa_dir, checkIfExists: true)
        .set { ch_bwa_index }
}

// Save AWS IGenomes file containing annotation version
if (params.anno_readme && file(params.anno_readme).exists()) {
    file("${params.outdir}/genome/").mkdirs()
    file(params.anno_readme).copyTo("${params.outdir}/genome/")
}

////////////////////////////////////////////////////
/* --                   AWS                    -- */
////////////////////////////////////////////////////

if (workflow.profile.contains('awsbatch')) {
    // AWSBatch sanity checking
    if (!params.awsqueue || !params.awsregion) exit 1, 'Specify correct --awsqueue and --awsregion parameters on AWSBatch!'
    // Check outdir paths to be S3 buckets if running on AWSBatch
    // related: https://github.com/nextflow-io/nextflow/issues/813
    if (!params.outdir.startsWith('s3:')) exit 1, 'Outdir not on S3 - specify S3 Bucket to run on AWSBatch!'
    // Prevent trace files to be stored on S3 since S3 does not support rolling files.
    if (params.tracedir.startsWith('s3:')) exit 1, 'Specify a local tracedir or run without trace! S3 cannot be used for tracefiles.'
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                       HEADER LOG INFO                               -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

// Header log info

def summary = [:]
summary['Run Name']               = custom_runName ?: workflow.runName
summary['Data Type']              = params.single_end ? 'Single-End' : 'Paired-End'
summary['Design File']            = params.input
summary['Genome']                 = params.genome ?: 'Not supplied'
summary['Fasta File']             = params.fasta
summary['GTF File']               = params.gtf
if (params.gene_bed)              summary['Gene BED File'] = params.gene_bed
if (params.bwa_index)             summary['BWA Index'] = params.bwa_index

if (params.bwa_min_score)         summary['BWA Min Score'] = params.bwa_min_score

if (params.skip_trimming) {
    summary['Trimming Step']      = 'Skipped'
} else {
    summary['Trim R1']            = "$params.clip_r1 bp"
    summary['Trim R2']            = "$params.clip_r2 bp"
    summary["Trim 3' R1"]         = "$params.three_prime_clip_r1 bp"
    summary["Trim 3' R2"]         = "$params.three_prime_clip_r2 bp"
    summary['NextSeq Trim']       = "$params.trim_nextseq bp"
}
if (params.seq_center)            summary['Sequencing Center'] = params.seq_center
if (params.single_end)            summary['Fragment Size'] = "$params.fragment_size bp"

if (params.keep_dups)             summary['Keep Duplicates'] = 'Yes'
if (params.keep_multi_map)        summary['Keep Multi-mapped'] = 'Yes'
summary['Save Genome Index']      = params.save_reference ? 'Yes' : 'No'
if (params.save_trimmed)          summary['Save Trimmed'] = 'Yes'
if (params.save_align_intermeds)  summary['Save Intermeds'] =  'Yes'

if (params.skip_fastqc)           summary['Skip FastQC'] = 'Yes'
if (params.skip_picard_metrics)   summary['Skip Picard Metrics'] = 'Yes'
if (params.skip_preseq)           summary['Skip Preseq'] = 'Yes'
if (params.skip_plot_profile)     summary['Skip plotProfile'] = 'Yes'


if (params.skip_multiqc)          summary['Skip MultiQC'] = 'Yes'
summary['Max Resources']          = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job"
if (workflow.containerEngine)     summary['Container'] = "$workflow.containerEngine - $workflow.container"
summary['Output Dir']             = params.outdir
summary['Launch Dir']             = workflow.launchDir
summary['Working Dir']            = workflow.workDir
summary['Script Dir']             = workflow.projectDir
summary['User']                   = workflow.userName
if (workflow.profile.contains('awsbatch')) {
    summary['AWS Region']         = params.awsregion
    summary['AWS Queue']          = params.awsqueue
    summary['AWS CLI']            = params.awscli
}
summary['Config Profile']         = workflow.profile
if (params.config_profile_description) summary['Config Description'] = params.config_profile_description
if (params.config_profile_contact)     summary['Config Contact']     = params.config_profile_contact
if (params.config_profile_url)         summary['Config URL']         = params.config_profile_url
if (params.email || params.email_on_fail) {
    summary['E-mail Address']     = params.email
    summary['E-mail on failure']  = params.email_on_fail
    summary['MultiQC Max Size']   = params.max_multiqc_email_size
}
log.info summary.collect { k,v -> "${k.padRight(20)}: $v" }.join('\n')
log.info "-\033[2m--------------------------------------------------\033[0m-"

// Check the hostnames against configured profiles




///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                     PARSE DESIGN FILE                               -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

/*
 * PREPROCESSING: Reformat design file, check validitiy and create IP vs control mappings
 */
process CHECK_DESIGN {
    tag "$design"
    publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode

    input:
    path design from ch_input

    output:
    path 'design_reads.csv' into ch_design_reads_csv
    path 'design_controls.csv' into ch_design_controls_csv

    script:  // This script is bundled with the pipeline, in nf-core/chipseq/bin/
    """
    check_design.py $design design_reads.csv design_controls.csv
    """
}

/*
 * Create channels for input fastq files
 */
if (params.single_end) {
    ch_design_reads_csv
        .splitCsv(header:true, sep:',')
        .map { row -> [ row.sample_id, [ file(row.fastq_1, checkIfExists: true) ] ] }
        .into { ch_raw_reads_fastqc;
                ch_raw_reads_trimgalore }
} else {
    ch_design_reads_csv
        .splitCsv(header:true, sep:',')
        .map { row -> [ row.sample_id, [ file(row.fastq_1, checkIfExists: true), file(row.fastq_2, checkIfExists: true) ] ] }
        .into { ch_raw_reads_fastqc;
                ch_raw_reads_trimgalore }
}

/*
 * Create a channel with [sample_id, control id, antibody, replicatesExist, multipleGroups]
 */
ch_design_controls_csv
    .splitCsv(header:true, sep:',')
    .map { row -> [ row.sample_id, row.control_id, row.antibody, row.replicatesExist.toBoolean(), row.multipleGroups.toBoolean() ] }
    .set { ch_design_controls_csv }

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                     PREPARE ANNOTATION FILES                        -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

/*
 * PREPROCESSING: Build BWA index
 */
if (!params.bwa_index) {
    process BWA_INDEX {
        tag "$fasta"
        label 'process_high'
        publishDir path: { params.save_reference ? "${params.outdir}/genome" : params.outdir },
            saveAs: { params.save_reference ? it : null }, mode: params.publish_dir_mode

        input:
        path fasta from ch_fasta

        output:
        path 'BWAIndex' into ch_bwa_index

        script:
        """
        bwa index -a bwtsw $fasta
        mkdir BWAIndex && mv ${fasta}* BWAIndex
        """
    }
}

/*
 * PREPROCESSING: Generate gene BED file
 */
// If --gtf is supplied along with --genome
// Make gene bed from supplied --gtf instead of using iGenomes one automatically
def MAKE_BED = false
if (!params.gene_bed) {
    MAKE_BED = true
} else if (params.genome && params.gtf) {
    if (params.genomes[ params.genome ].gtf != params.gtf) {
        MAKE_BED = true
    }
}
if (MAKE_BED) {
    process MAKE_GENE_BED {
        tag "$gtf"
        label 'process_low'
        publishDir "${params.outdir}/genome", mode: params.publish_dir_mode

        input:
        path gtf from ch_gtf

        output:
        path '*.bed' into ch_gene_bed

        script: // This script is bundled with the pipeline, in nf-core/chipseq/bin/
        """
        gtf2bed $gtf > ${gtf.baseName}.bed
        """
    }
}

/*
 * PREPROCESSING: Prepare genome intervals for filtering
 */
process MAKE_GENOME_FILTER {
    tag "$fasta"
    publishDir "${params.outdir}/genome", mode: params.publish_dir_mode

    input:
    path fasta from ch_fasta


    output:
    path "$fasta"                                      // FASTA FILE FOR IGV
    path '*.fai'                                       // FAI INDEX FOR REFERENCE GENOME
    path '*.bed' into ch_genome_filter_regions         // BED FILE WITHOUT BLACKLIST REGIONS
    path '*.sizes' into ch_genome_sizes_bigwig         // CHROMOSOME SIZES FILE FOR BEDTOOLS

    script:
    blacklist_filter = params.blacklist ? "sortBed -i $blacklist -g ${fasta}.sizes | complementBed -i stdin -g ${fasta}.sizes" : "awk '{print \$1, '0' , \$2}' OFS='\t' ${fasta}.sizes"
    """
    samtools faidx $fasta
    cut -f 1,2 ${fasta}.fai > ${fasta}.sizes
    $blacklist_filter > ${fasta}.include_regions.bed
    """
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                        FASTQ QC                                     -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

/*
 * STEP 1: FastQC
 */
process FASTQC {
    tag "$name"
    label 'process_medium'
    publishDir "${params.outdir}/fastqc", mode: params.publish_dir_mode,
        saveAs: { filename ->
                      filename.endsWith('.zip') ? "zips/$filename" : filename
                }

    when:
    !params.skip_fastqc

    input:
    tuple val(name), path(reads) from ch_raw_reads_fastqc

    output:
    path '*.{zip,html}' into ch_fastqc_reports_mqc

    script:
    // Added soft-links to original fastqs for consistent naming in MultiQC
    if (params.single_end) {
        """
        [ ! -f  ${name}.fastq.gz ] && ln -s $reads ${name}.fastq.gz
        fastqc -q -t $task.cpus ${name}.fastq.gz
        """
    } else {
        """
        [ ! -f  ${name}_1.fastq.gz ] && ln -s ${reads[0]} ${name}_1.fastq.gz
        [ ! -f  ${name}_2.fastq.gz ] && ln -s ${reads[1]} ${name}_2.fastq.gz
        fastqc -q -t $task.cpus ${name}_1.fastq.gz
        fastqc -q -t $task.cpus ${name}_2.fastq.gz
        """
    }
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                        ADAPTER TRIMMING                             -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

/*
 * STEP 2: Trim Galore!
 */
if (params.skip_trimming) {
    ch_trimmed_reads = ch_raw_reads_trimgalore
    ch_trimgalore_results_mqc = Channel.empty()
    ch_trimgalore_fastqc_reports_mqc = Channel.empty()
} else {
    process TRIMGALORE {
        tag "$name"
        label 'process_high'
        publishDir "${params.outdir}/trim_galore", mode: params.publish_dir_mode,
            saveAs: { filename ->
                          if (filename.endsWith('.html')) "fastqc/$filename"
                          else if (filename.endsWith('.zip')) "fastqc/zips/$filename"
                          else if (filename.endsWith('trimming_report.txt')) "logs/$filename"
                          else params.save_trimmed ? filename : null
                    }

        input:
        tuple val(name), path(reads) from ch_raw_reads_trimgalore

        output:
        tuple val(name), path('*.fq.gz') into ch_trimmed_reads
        path '*.txt' into ch_trimgalore_results_mqc
        path '*.{zip,html}' into ch_trimgalore_fastqc_reports_mqc

        script:
        // Calculate number of --cores for TrimGalore based on value of task.cpus
        // See: https://github.com/FelixKrueger/TrimGalore/blob/master/Changelog.md#version-060-release-on-1-mar-2019
        // See: https://github.com/nf-core/atacseq/pull/65
        def cores = 1
        if (task.cpus) {
            cores = (task.cpus as int) - 4
            if (params.single_end) cores = (task.cpus as int) - 3
            if (cores < 1) cores = 1
            if (cores > 4) cores = 4
        }

        c_r1 = params.clip_r1 > 0 ? "--clip_r1 ${params.clip_r1}" : ''
        c_r2 = params.clip_r2 > 0 ? "--clip_r2 ${params.clip_r2}" : ''
        tpc_r1 = params.three_prime_clip_r1 > 0 ? "--three_prime_clip_r1 ${params.three_prime_clip_r1}" : ''
        tpc_r2 = params.three_prime_clip_r2 > 0 ? "--three_prime_clip_r2 ${params.three_prime_clip_r2}" : ''
        nextseq = params.trim_nextseq > 0 ? "--nextseq ${params.trim_nextseq}" : ''

        // Added soft-links to original fastqs for consistent naming in MultiQC
        if (params.single_end) {
            """
            [ ! -f  ${name}.fastq.gz ] && ln -s $reads ${name}.fastq.gz
            trim_galore --cores $cores --fastqc --gzip $c_r1 $tpc_r1 $nextseq ${name}.fastq.gz
            """
        } else {
            """
            [ ! -f  ${name}_1.fastq.gz ] && ln -s ${reads[0]} ${name}_1.fastq.gz
            [ ! -f  ${name}_2.fastq.gz ] && ln -s ${reads[1]} ${name}_2.fastq.gz
            trim_galore --cores $cores --paired --fastqc --gzip $c_r1 $c_r2 $tpc_r1 $tpc_r2 $nextseq ${name}_1.fastq.gz ${name}_2.fastq.gz
            """
        }
    }
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                        ALIGN                                        -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

/*
 * STEP 3.1: Map read(s) with bwa mem
 */
process BWA_MEM {
    tag "$name"
    label 'process_high'

    input:
    tuple val(name), path(reads) from ch_trimmed_reads
    path index from ch_bwa_index.collect()

    output:
    tuple val(name), path('*.bam') into ch_bwa_bam

    script:
    prefix = "${name}.Lb"
    rg = "\'@RG\\tID:${name}\\tSM:${name.split('_')[0..-2].join('_')}\\tPL:ILLUMINA\\tLB:${name}\\tPU:1\'"
    if (params.seq_center) {
        rg = "\'@RG\\tID:${name}\\tSM:${name.split('_')[0..-2].join('_')}\\tPL:ILLUMINA\\tLB:${name}\\tPU:1\\tCN:${params.seq_center}\'"
    }
    score = params.bwa_min_score ? "-T ${params.bwa_min_score}" : ''
    """
    bwa mem \\
        -t $task.cpus \\
        -M \\
        -R $rg \\
        $score \\
        ${index}/${bwa_base} \\
        $reads \\
        | samtools view -@ $task.cpus -b -h -F 0x0100 -O BAM -o ${prefix}.bam -
    """
}

/*
 * STEP 3.2: Convert BAM to coordinate sorted BAM
 */
process SORT_BAM {
    tag "$name"
    label 'process_medium'
    if (params.save_align_intermeds) {
        publishDir path: "${params.outdir}/bwa/library", mode: params.publish_dir_mode,
            saveAs: { filename ->
                          if (filename.endsWith('.flagstat')) "samtools_stats/$filename"
                          else if (filename.endsWith('.idxstats')) "samtools_stats/$filename"
                          else if (filename.endsWith('.stats')) "samtools_stats/$filename"
                          else filename
                    }
    }

    input:
    tuple val(name), path(bam) from ch_bwa_bam

    output:
    tuple val(name), path('*.sorted.{bam,bam.bai}') into ch_sort_bam_merge
    path '*.{flagstat,idxstats,stats}' into ch_sort_bam_flagstat_mqc

    script:
    prefix = "${name}.Lb"
    """
    samtools sort -@ $task.cpus -o ${prefix}.sorted.bam -T $name $bam
    samtools index ${prefix}.sorted.bam
    samtools flagstat ${prefix}.sorted.bam > ${prefix}.sorted.bam.flagstat
    samtools idxstats ${prefix}.sorted.bam > ${prefix}.sorted.bam.idxstats
    samtools stats ${prefix}.sorted.bam > ${prefix}.sorted.bam.stats
    """
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                    MERGE LIBRARY BAM                                -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

/*
 * STEP 4.1: Merge BAM files for all libraries from same sample replicate
 */
ch_sort_bam_merge
    .map { it -> [ it[0].split('_')[0..-2].join('_'), it[1] ] }
    .groupTuple(by: [0])
    .map { it ->  [ it[0], it[1].flatten() ] }
    .set { ch_sort_bam_merge }

process MERGED_BAM {
    tag "$name"
    label 'process_medium'
    publishDir "${params.outdir}/bwa/mergedLibrary", mode: params.publish_dir_mode,
        saveAs: { filename ->
                      if (filename.endsWith('.flagstat')) "samtools_stats/$filename"
                      else if (filename.endsWith('.idxstats')) "samtools_stats/$filename"
                      else if (filename.endsWith('.stats')) "samtools_stats/$filename"
                      else if (filename.endsWith('.metrics.txt')) "picard_metrics/$filename"
                      else params.save_align_intermeds ? filename : null
                }

    input:
    tuple val(name), path(bams) from ch_sort_bam_merge

    output:
    tuple val(name), path("*${prefix}.sorted.{bam,bam.bai}") into ch_merge_bam_filter,
                                                                  ch_merge_bam_preseq
    path '*.{flagstat,idxstats,stats}' into ch_merge_bam_stats_mqc
    path '*.txt' into ch_merge_bam_metrics_mqc

    script:
    prefix = "${name}.mLb.mkD"
    bam_files = bams.findAll { it.toString().endsWith('.bam') }.sort()
    def avail_mem = 3
    if (!task.memory) {
        log.info '[Picard MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'
    } else {
        avail_mem = task.memory.toGiga()
    }
    if (bam_files.size() > 1) {
        """
        picard -Xmx${avail_mem}g MergeSamFiles \\
            ${'INPUT='+bam_files.join(' INPUT=')} \\
            OUTPUT=${name}.sorted.bam \\
            SORT_ORDER=coordinate \\
            VALIDATION_STRINGENCY=LENIENT \\
            TMP_DIR=tmp
        samtools index ${name}.sorted.bam

        picard -Xmx${avail_mem}g MarkDuplicates \\
            INPUT=${name}.sorted.bam \\
            OUTPUT=${prefix}.sorted.bam \\
            ASSUME_SORTED=true \\
            REMOVE_DUPLICATES=false \\
            METRICS_FILE=${prefix}.MarkDuplicates.metrics.txt \\
            VALIDATION_STRINGENCY=LENIENT \\
            TMP_DIR=tmp

        samtools index ${prefix}.sorted.bam
        samtools idxstats ${prefix}.sorted.bam > ${prefix}.sorted.bam.idxstats
        samtools flagstat ${prefix}.sorted.bam > ${prefix}.sorted.bam.flagstat
        samtools stats ${prefix}.sorted.bam > ${prefix}.sorted.bam.stats
        """
    } else {
      """
      picard -Xmx${avail_mem}g MarkDuplicates \\
          INPUT=${bam_files[0]} \\
          OUTPUT=${prefix}.sorted.bam \\
          ASSUME_SORTED=true \\
          REMOVE_DUPLICATES=false \\
          METRICS_FILE=${prefix}.MarkDuplicates.metrics.txt \\
          VALIDATION_STRINGENCY=LENIENT \\
          TMP_DIR=tmp

      samtools index ${prefix}.sorted.bam
      samtools idxstats ${prefix}.sorted.bam > ${prefix}.sorted.bam.idxstats
      samtools flagstat ${prefix}.sorted.bam > ${prefix}.sorted.bam.flagstat
      samtools stats ${prefix}.sorted.bam > ${prefix}.sorted.bam.stats
      """
    }
}

/*
 * STEP 4.2: Filter BAM file at merged library-level
 */
process MERGED_BAM_FILTER {
    tag "$name"
    label 'process_medium'
    publishDir path: "${params.outdir}/bwa/mergedLibrary", mode: params.publish_dir_mode,
        saveAs: { filename ->
                      if (params.single_end || params.save_align_intermeds) {
                          if (filename.endsWith('.flagstat')) "samtools_stats/$filename"
                          else if (filename.endsWith('.idxstats')) "samtools_stats/$filename"
                          else if (filename.endsWith('.stats')) "samtools_stats/$filename"
                          else if (filename.endsWith('.sorted.bam')) filename
                          else if (filename.endsWith('.sorted.bam.bai')) filename
                          else null
                      }
                }

    input:
    tuple val(name), path(bam) from ch_merge_bam_filter
    path bed from ch_genome_filter_regions.collect()
    path bamtools_filter_config from ch_bamtools_filter_config

    output:
    tuple val(name), path('*.{bam,bam.bai}') into ch_filter_bam
    tuple val(name), path('*.flagstat') into ch_filter_bam_flagstat
    path '*.{idxstats,stats}' into ch_filter_bam_stats_mqc

    script:
    prefix = params.single_end ? "${name}.mLb.clN" : "${name}.mLb.flT"
    filter_params = params.single_end ? '-F 0x004' : '-F 0x004 -F 0x0008 -f 0x001'
    dup_params = params.keep_dups ? '' : '-F 0x0400'
    multimap_params = params.keep_multi_map ? '' : '-q 1'
    blacklist_params = params.blacklist ? "-L $bed" : ''
    name_sort_bam = params.single_end ? '' : "samtools sort -n -@ $task.cpus -o ${prefix}.bam -T $prefix ${prefix}.sorted.bam"
    """
    samtools view \\
        $filter_params \\
        $dup_params \\
        $multimap_params \\
        $blacklist_params \\
        -b ${bam[0]} \\
        | bamtools filter \\
            -out ${prefix}.sorted.bam \\
            -script $bamtools_filter_config

    samtools index ${prefix}.sorted.bam
    samtools flagstat ${prefix}.sorted.bam > ${prefix}.sorted.bam.flagstat
    samtools idxstats ${prefix}.sorted.bam > ${prefix}.sorted.bam.idxstats
    samtools stats ${prefix}.sorted.bam > ${prefix}.sorted.bam.stats

    $name_sort_bam
    """
}

/*
 * STEP 4.3: Remove orphan reads from paired-end BAM file
 */
if (params.single_end) {
    ch_filter_bam
        .into { ch_rm_orphan_bam_metrics;
                ch_rm_orphan_bam_bigwig;
                ch_rm_orphan_bam_macs_1;
                ch_rm_orphan_bam_macs_2;
                ch_rm_orphan_bam_phantompeakqualtools;
                ch_rm_orphan_name_bam_counts }

    ch_filter_bam_flagstat
        .into { ch_rm_orphan_flagstat_bigwig;
                ch_rm_orphan_flagstat_macs;
                ch_rm_orphan_flagstat_mqc }

    ch_filter_bam_stats_mqc
        .set { ch_rm_orphan_stats_mqc }
} else {
    process MERGED_BAM_REMOVE_ORPHAN {
        tag "$name"
        label 'process_medium'
        publishDir path: "${params.outdir}/bwa/mergedLibrary", mode: params.publish_dir_mode,
            saveAs: { filename ->
                          if (filename.endsWith('.flagstat')) "samtools_stats/$filename"
                          else if (filename.endsWith('.idxstats')) "samtools_stats/$filename"
                          else if (filename.endsWith('.stats')) "samtools_stats/$filename"
                          else if (filename.endsWith('.sorted.bam')) filename
                          else if (filename.endsWith('.sorted.bam.bai')) filename
                          else null
                    }

        input:
        tuple val(name), path(bam) from ch_filter_bam

        output:
        tuple val(name), path('*.sorted.{bam,bam.bai}') into ch_rm_orphan_bam_metrics,
                                                             ch_rm_orphan_bam_bigwig,
                                                             ch_rm_orphan_bam_macs_1,
                                                             ch_rm_orphan_bam_macs_2,
                                                             ch_rm_orphan_bam_phantompeakqualtools
        tuple val(name), path("${prefix}.bam") into ch_rm_orphan_name_bam_counts
        tuple val(name), path('*.flagstat') into ch_rm_orphan_flagstat_bigwig,
                                                 ch_rm_orphan_flagstat_macs,
                                                 ch_rm_orphan_flagstat_mqc
        path '*.{idxstats,stats}' into ch_rm_orphan_stats_mqc

        script: // This script is bundled with the pipeline, in nf-core/chipseq/bin/
        prefix = "${name}.mLb.clN"
        """
        bampe_rm_orphan.py ${bam[0]} ${prefix}.bam --only_fr_pairs

        samtools sort -@ $task.cpus -o ${prefix}.sorted.bam -T $prefix ${prefix}.bam
        samtools index ${prefix}.sorted.bam
        samtools flagstat ${prefix}.sorted.bam > ${prefix}.sorted.bam.flagstat
        samtools idxstats ${prefix}.sorted.bam > ${prefix}.sorted.bam.idxstats
        samtools stats ${prefix}.sorted.bam > ${prefix}.sorted.bam.stats
        """
    }
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                 MERGE LIBRARY BAM POST-ANALYSIS                     -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

/*
 * STEP 5.1: Preseq analysis after merging libraries and before filtering
 */
process PRESEQ {
    tag "$name"
    label 'process_medium'
    label 'error_ignore'
    publishDir "${params.outdir}/bwa/mergedLibrary/preseq", mode: params.publish_dir_mode

    when:
    !params.skip_preseq

    input:
    tuple val(name), path(bam) from ch_merge_bam_preseq

    output:
    path '*.ccurve.txt' into ch_preseq_mqc
    path '*.log'

    script:
    pe = params.single_end ? '' : '-pe'
    """
    preseq lc_extrap \\
        -output ${name}.ccurve.txt \\
        -verbose \\
        -bam \\
        $pe \\
        -seed 1 \\
        ${bam[0]}
    cp .command.err ${name}.command.log
    """
}

/*
 * STEP 5.2: Picard CollectMultipleMetrics after merging libraries and filtering
 */
process PICARD_METRICS {
    tag "$name"
    label 'process_medium'
    publishDir path: "${params.outdir}/bwa/mergedLibrary", mode: params.publish_dir_mode,
        saveAs: { filename ->
                      if (filename.endsWith('_metrics')) "picard_metrics/$filename"
                      else if (filename.endsWith('.pdf')) "picard_metrics/pdf/$filename"
                      else null
                }

    when:
    !params.skip_picard_metrics

    input:
    tuple val(name), path(bam) from ch_rm_orphan_bam_metrics
    path fasta from ch_fasta

    output:
    path '*_metrics' into ch_collectmetrics_mqc
    path '*.pdf'

    script:
    prefix = "${name}.mLb.clN"
    def avail_mem = 3
    if (!task.memory) {
        log.info '[Picard CollectMultipleMetrics] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'
    } else {
        avail_mem = task.memory.toGiga()
    }
    """
    picard -Xmx${avail_mem}g CollectMultipleMetrics \\
        INPUT=${bam[0]} \\
        OUTPUT=${prefix}.CollectMultipleMetrics \\
        REFERENCE_SEQUENCE=$fasta \\
        VALIDATION_STRINGENCY=LENIENT \\
        TMP_DIR=tmp
    """
}

/*
 * STEP 5.3: Read depth normalised bigWig
 */
process BIGWIG {
    tag "$name"
    label 'process_medium'
    publishDir "${params.outdir}/bwa/mergedLibrary/bigwig", mode: params.publish_dir_mode,
        saveAs: { filename ->
                      if (filename.endsWith('scale_factor.txt')) "scale/$filename"
                      else if (filename.endsWith('.bigWig')) filename
                      else null
                }

    input:
    tuple val(name), path(bam), path(flagstat) from ch_rm_orphan_bam_bigwig.join(ch_rm_orphan_flagstat_bigwig, by: [0])
    path sizes from ch_genome_sizes_bigwig.collect()

    output:
    tuple val(name), path('*.bigWig') into ch_bigwig_plotprofile
    path '*igv.txt' into ch_bigwig_igv
    path '*scale_factor.txt'

    script:
    pe_fragment = params.single_end ? '' : '-pc'
    extend = (params.single_end && params.fragment_size > 0) ? "-fs ${params.fragment_size}" : ''
    """
    SCALE_FACTOR=\$(grep 'mapped (' $flagstat | awk '{print 1000000/\$1}')
    echo \$SCALE_FACTOR > ${name}.scale_factor.txt
    genomeCoverageBed -ibam ${bam[0]} -bg -scale \$SCALE_FACTOR $pe_fragment $extend | sort -T '.' -k1,1 -k2,2n >  ${name}.bedGraph

    bedGraphToBigWig ${name}.bedGraph $sizes ${name}.bigWig

    find * -type f -name "*.bigWig" -exec echo -e "bwa/mergedLibrary/bigwig/"{}"\\t0,0,178" \\; > ${name}.bigWig.igv.txt
    """
}

/*
 * STEP 5.4: Generate gene body coverage plot with deepTools plotProfile and plotHeatmap
 */
process PLOTPROFILE {
    tag "$name"
    label 'process_high'
    publishDir "${params.outdir}/bwa/mergedLibrary/deepTools/plotProfile", mode: params.publish_dir_mode

    when:
    !params.skip_plot_profile

    input:
    tuple val(name), path(bigwig) from ch_bigwig_plotprofile
    path bed from ch_gene_bed

    output:
    path '*.plotProfile.tab' into ch_plotprofile_mqc
    path '*.{gz,pdf,mat.tab}'

    script:
    """
    computeMatrix scale-regions \\
        --regionsFileName $bed \\
        --scoreFileName $bigwig \\
        --outFileName ${name}.computeMatrix.mat.gz \\
        --outFileNameMatrix ${name}.computeMatrix.vals.mat.tab \\
        --regionBodyLength 1000 \\
        --beforeRegionStartLength 3000 \\
        --afterRegionStartLength 3000 \\
        --skipZeros \\
        --smartLabels \\
        --numberOfProcessors $task.cpus

    plotProfile --matrixFile ${name}.computeMatrix.mat.gz \\
        --outFileName ${name}.plotProfile.pdf \\
        --outFileNameData ${name}.plotProfile.tab

    plotHeatmap --matrixFile ${name}.computeMatrix.mat.gz \\
        --outFileName ${name}.plotHeatmap.pdf \\
        --outFileNameMatrix ${name}.plotHeatmap.mat.tab
    """
}

/*
 * STEP 5.5: Phantompeakqualtools
 */
process PHANTOMPEAKQUALTOOLS {
    tag "$name"
    label 'process_medium'
    publishDir "${params.outdir}/bwa/mergedLibrary/phantompeakqualtools", mode: params.publish_dir_mode

    when:
    !params.skip_spp

    input:
    tuple val(name), path(bam) from ch_rm_orphan_bam_phantompeakqualtools
    path spp_correlation_header from ch_spp_correlation_header
    path spp_nsc_header from ch_spp_nsc_header
    path spp_rsc_header from ch_spp_rsc_header

    output:
    path '*.spp.out' into ch_spp_out_mqc
    path '*_mqc.tsv' into ch_spp_csv_mqc
    path '*.pdf'

    script:
    """
    RUN_SPP=`which run_spp.R`
    Rscript -e "library(caTools); source(\\"\$RUN_SPP\\")" -c="${bam[0]}" -savp="${name}.spp.pdf" -savd="${name}.spp.Rdata" -out="${name}.spp.out" -p=$task.cpus
    cp $spp_correlation_header ${name}_spp_correlation_mqc.tsv
    Rscript -e "load('${name}.spp.Rdata'); write.table(crosscorr\\\$cross.correlation, file=\\"${name}_spp_correlation_mqc.tsv\\", sep=",", quote=FALSE, row.names=FALSE, col.names=FALSE,append=TRUE)"

    awk -v OFS='\t' '{print "${name}", \$9}' ${name}.spp.out | cat $spp_nsc_header - > ${name}_spp_nsc_mqc.tsv
    awk -v OFS='\t' '{print "${name}", \$10}' ${name}.spp.out | cat $spp_rsc_header - > ${name}_spp_rsc_mqc.tsv
    """
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                 END OF PIPELINE (peak calling etc not performed)                      -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////


back to top