https://github.com/Gregor-Mendel-Institute/RKP2021-CMT3
Tip revision: 89d7e2ea78af1969bb161640baed09296ed2485f authored by Papareddy on 23 April 2021, 11:18:44 UTC
main.nf
#!/usr/bin/env nextflow
/*
========================================================================================
nf-core/chipseq
========================================================================================
nf-core/chipseq Analysis Pipeline.
#### Homepage / Documentation
https://github.com/nf-core/chipseq
----------------------------------------------------------------------------------------
*/
def helpMessage() {
log.info nfcoreHeader()
log.info"""
Usage:
The typical command for running the pipeline is as follows:
nextflow run nf-core/chipseq --input design.csv --genome GRCh37 -profile docker
Mandatory arguments:
--input [file] Comma-separated file containing information about the samples in the experiment (see docs/usage.md) (Default: './design.csv')
--fasta [file] Path to Fasta reference. Not mandatory when using reference in iGenomes config via --genome (Default: false)
--gtf [file] Path to GTF file. Not mandatory when using reference in iGenomes config via --genome (Default: false)
-profile [str] Configuration profile to use. Can use multiple (comma separated)
Available: conda, docker, singularity, awsbatch, test
Generic
--single_end [bool] Specifies that the input is single-end reads (Default: false)
--seq_center [str] Sequencing center information to be added to read group of BAM files (Default: false)
--fragment_size [int] Estimated fragment size used to extend single-end reads (Default: 200)
References If not specified in the configuration file or you wish to overwrite any of the references
--genome [str] Name of iGenomes reference (Default: false)
--bwa_index [file] Full path to directory containing BWA index including base name i.e. /path/to/index/genome.fa (Default: false)
--gene_bed [file] Path to BED file containing gene intervals (Default: false)
--macs_gsize [str] Effective genome size parameter required by MACS2. If using iGenomes config, values have only been provided when --genome is set as GRCh37, GRCm38, hg19, mm10, BDGP6 and WBcel235 (Default: false)
--blacklist [file] Path to blacklist regions (.BED format), used for filtering alignments (Default: false)
--save_reference [bool] If generated by the pipeline save the BWA index in the results directory (Default: false)
Trimming
--clip_r1 [int] Instructs Trim Galore to remove bp from the 5' end of read 1 (or single-end reads) (Default: 0)
--clip_r2 [int] Instructs Trim Galore to remove bp from the 5' end of read 2 (paired-end reads only) (Default: 0)
--three_prime_clip_r1 [int] Instructs Trim Galore to remove bp from the 3' end of read 1 AFTER adapter/quality trimming has been performed (Default: 0)
--three_prime_clip_r2 [int] Instructs Trim Galore to re move bp from the 3' end of read 2 AFTER adapter/quality trimming has been performed (Default: 0)
--trim_nextseq [int] Instructs Trim Galore to apply the --nextseq=X option, to trim based on quality after removing poly-G tails (Default: 0)
--skip_trimming [bool] Skip the adapter trimming step (Default: false)
--save_trimmed [bool] Save the trimmed FastQ files in the results directory (Default: false)
Alignments
--bwa_min_score [int] Don’t output BWA MEM alignments with score lower than this parameter (Default: false)
--keep_dups [bool] Duplicate reads are not filtered from alignments (Default: false)
--keep_multi_map [bool] Reads mapping to multiple locations are not filtered from alignments (Default: false)
--save_align_intermeds [bool] Save the intermediate BAM files from the alignment step - not done by default (Default: false)
Peaks
--narrow_peak [bool] Run MACS2 in narrowPeak mode (Default: false)
--broad_cutoff [float] Specifies broad cutoff value for MACS2. Only used when --narrow_peak isnt specified (Default: 0.1)
--macs_fdr [float] Minimum FDR (q-value) cutoff for peak detection, --macs_fdr and --macs_pvalue are mutually exclusive (Default: false)
--macs_pvalue [float] p-value cutoff for peak detection, --macs_fdr and --macs_pvalue are mutually exclusive (Default: false)
--min_reps_consensus [int] Number of biological replicates required from a given condition for a peak to contribute to a consensus peak (Default: 1)
--save_macs_pileup [bool] Instruct MACS2 to create bedGraph files normalised to signal per million reads (Default: false)
--skip_peak_qc [bool] Skip MACS2 peak QC plot generation (Default: false)
--skip_peak_annotation [bool] Skip annotation of MACS2 and consensus peaks with HOMER (Default: false)
--skip_consensus_peaks [bool] Skip consensus peak generation (Default: false)
Differential analysis
--deseq2_vst [bool] Use vst transformation instead of rlog with DESeq2 (Default: false)
--skip_diff_analysis [bool] Skip differential accessibility analysis (Default: false)
QC
--skip_fastqc [bool] Skip FastQC (Default: false)
--skip_picard_metrics [bool] Skip Picard CollectMultipleMetrics (Default: false)
--skip_preseq [bool] Skip Preseq (Default: false)
--skip_plot_profile [bool] Skip deepTools plotProfile (Default: false)
--skip_spp [bool] Skip Phantompeakqualtools (Default: false)
--skip_igv [bool] Skip IGV (Default: false)
--skip_multiqc [bool] Skip MultiQC (Default: false)
Other
--outdir [file] The output directory where the results will be saved (Default: './results')
--publish_dir_mode [str] Mode for publishing results in the output directory. Available: symlink, rellink, link, copy, copyNoFollow, move (Default: copy)
--email [email] Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits (Default: false)
--email_on_fail [email] Same as --email, except only send mail if the workflow is not successful (Default: false)
--max_multiqc_email_size [str] Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB)
-name [str] Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic (Default: false)
AWSBatch options:
--awsqueue [str] The AWSBatch JobQueue that needs to be set when running on AWSBatch
--awsregion [str] The AWS Region for your AWS Batch job to run on
--awscli [str] Path to the AWS CLI tool
""".stripIndent()
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* -- -- */
/* -- SET UP CONFIGURATION VARIABLES -- */
/* -- -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Show help message
if (params.help) {
helpMessage()
exit 0
}
// Has the run name been specified by the user?
// this has the bonus effect of catching both -name and --name
custom_runName = params.name
if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) {
custom_runName = workflow.runName
}
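// Nextflow's auto-generated run names are two lowercase words joined by an underscore
// (e.g. "agitated_mendel"), so a runName that does not match that pattern was supplied by the user.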
////////////////////////////////////////////////////
/* -- DEFAULT PARAMETER VALUES -- */
////////////////////////////////////////////////////
// Check if genome exists in the config file
if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) {
exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}"
}
// Configurable variables
params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false
params.bwa_index = params.genome ? params.genomes[ params.genome ].bwa ?: false : false
params.gtf = params.genome ? params.genomes[ params.genome ].gtf ?: false : false
params.gene_bed = params.genome ? params.genomes[ params.genome ].bed12 ?: false : false
params.blacklist = params.genome ? params.genomes[ params.genome ].blacklist ?: false : false
params.anno_readme = params.genome ? params.genomes[ params.genome ].readme ?: false : false
// Global variables
////////////////////////////////////////////////////
/* -- CONFIG FILES -- */
////////////////////////////////////////////////////
// Pipeline config
ch_multiqc_config = file("$baseDir/assets/multiqc_config.yaml", checkIfExists: true)
ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty()
ch_output_docs = file("$baseDir/docs/output.md", checkIfExists: true)
ch_output_docs_images = file("$baseDir/docs/images/", checkIfExists: true)
// JSON files required by BAMTools for alignment filtering
if (params.single_end) {
ch_bamtools_filter_config = file(params.bamtools_filter_se_config, checkIfExists: true)
} else {
ch_bamtools_filter_config = file(params.bamtools_filter_pe_config, checkIfExists: true)
}
// Header files for MultiQC
ch_peak_count_header = file("$baseDir/assets/multiqc/peak_count_header.txt", checkIfExists: true)
ch_frip_score_header = file("$baseDir/assets/multiqc/frip_score_header.txt", checkIfExists: true)
ch_peak_annotation_header = file("$baseDir/assets/multiqc/peak_annotation_header.txt", checkIfExists: true)
ch_deseq2_pca_header = file("$baseDir/assets/multiqc/deseq2_pca_header.txt", checkIfExists: true)
ch_deseq2_clustering_header = file("$baseDir/assets/multiqc/deseq2_clustering_header.txt", checkIfExists: true)
ch_spp_correlation_header = file("$baseDir/assets/multiqc/spp_correlation_header.txt", checkIfExists: true)
ch_spp_nsc_header = file("$baseDir/assets/multiqc/spp_nsc_header.txt", checkIfExists: true)
ch_spp_rsc_header = file("$baseDir/assets/multiqc/spp_rsc_header.txt", checkIfExists: true)
////////////////////////////////////////////////////
/* -- VALIDATE INPUTS -- */
////////////////////////////////////////////////////
if (params.input) { ch_input = file(params.input, checkIfExists: true) } else { exit 1, 'Samples design file not specified!' }
if (params.gtf) { ch_gtf = file(params.gtf, checkIfExists: true) } else { exit 1, 'GTF annotation file not specified!' }
if (params.gene_bed) { ch_gene_bed = file(params.gene_bed, checkIfExists: true) }
if (params.blacklist) { ch_blacklist = Channel.fromPath(params.blacklist, checkIfExists: true) } else { ch_blacklist = Channel.empty() }
if (params.fasta) {
lastPath = params.fasta.lastIndexOf(File.separator)
bwa_base = params.fasta.substring(lastPath+1)
ch_fasta = file(params.fasta, checkIfExists: true)
} else {
exit 1, 'Fasta file not specified!'
}
if (params.bwa_index) {
lastPath = params.bwa_index.lastIndexOf(File.separator)
bwa_dir = params.bwa_index.substring(0,lastPath+1)
bwa_base = params.bwa_index.substring(lastPath+1)
Channel
.fromPath(bwa_dir, checkIfExists: true)
.set { ch_bwa_index }
}
// Save AWS IGenomes file containing annotation version
if (params.anno_readme && file(params.anno_readme).exists()) {
file("${params.outdir}/genome/").mkdirs()
file(params.anno_readme).copyTo("${params.outdir}/genome/")
}
////////////////////////////////////////////////////
/* -- AWS -- */
////////////////////////////////////////////////////
if (workflow.profile.contains('awsbatch')) {
// AWSBatch sanity checking
if (!params.awsqueue || !params.awsregion) exit 1, 'Specify correct --awsqueue and --awsregion parameters on AWSBatch!'
// Check outdir paths to be S3 buckets if running on AWSBatch
// related: https://github.com/nextflow-io/nextflow/issues/813
if (!params.outdir.startsWith('s3:')) exit 1, 'Outdir not on S3 - specify S3 Bucket to run on AWSBatch!'
// Prevent trace files from being stored on S3, since S3 does not support rolling files.
if (params.tracedir.startsWith('s3:')) exit 1, 'Specify a local tracedir or run without trace! S3 cannot be used for tracefiles.'
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* -- -- */
/* -- HEADER LOG INFO -- */
/* -- -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Header log info
def summary = [:]
summary['Run Name'] = custom_runName ?: workflow.runName
summary['Data Type'] = params.single_end ? 'Single-End' : 'Paired-End'
summary['Design File'] = params.input
summary['Genome'] = params.genome ?: 'Not supplied'
summary['Fasta File'] = params.fasta
summary['GTF File'] = params.gtf
if (params.gene_bed) summary['Gene BED File'] = params.gene_bed
if (params.bwa_index) summary['BWA Index'] = params.bwa_index
if (params.bwa_min_score) summary['BWA Min Score'] = params.bwa_min_score
if (params.skip_trimming) {
summary['Trimming Step'] = 'Skipped'
} else {
summary['Trim R1'] = "$params.clip_r1 bp"
summary['Trim R2'] = "$params.clip_r2 bp"
summary["Trim 3' R1"] = "$params.three_prime_clip_r1 bp"
summary["Trim 3' R2"] = "$params.three_prime_clip_r2 bp"
summary['NextSeq Trim'] = "$params.trim_nextseq bp"
}
if (params.seq_center) summary['Sequencing Center'] = params.seq_center
if (params.single_end) summary['Fragment Size'] = "$params.fragment_size bp"
if (params.keep_dups) summary['Keep Duplicates'] = 'Yes'
if (params.keep_multi_map) summary['Keep Multi-mapped'] = 'Yes'
summary['Save Genome Index'] = params.save_reference ? 'Yes' : 'No'
if (params.save_trimmed) summary['Save Trimmed'] = 'Yes'
if (params.save_align_intermeds) summary['Save Intermeds'] = 'Yes'
if (params.skip_fastqc) summary['Skip FastQC'] = 'Yes'
if (params.skip_picard_metrics) summary['Skip Picard Metrics'] = 'Yes'
if (params.skip_preseq) summary['Skip Preseq'] = 'Yes'
if (params.skip_plot_profile) summary['Skip plotProfile'] = 'Yes'
if (params.skip_multiqc) summary['Skip MultiQC'] = 'Yes'
summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job"
if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container"
summary['Output Dir'] = params.outdir
summary['Launch Dir'] = workflow.launchDir
summary['Working Dir'] = workflow.workDir
summary['Script Dir'] = workflow.projectDir
summary['User'] = workflow.userName
if (workflow.profile.contains('awsbatch')) {
summary['AWS Region'] = params.awsregion
summary['AWS Queue'] = params.awsqueue
summary['AWS CLI'] = params.awscli
}
summary['Config Profile'] = workflow.profile
if (params.config_profile_description) summary['Config Description'] = params.config_profile_description
if (params.config_profile_contact) summary['Config Contact'] = params.config_profile_contact
if (params.config_profile_url) summary['Config URL'] = params.config_profile_url
if (params.email || params.email_on_fail) {
summary['E-mail Address'] = params.email
summary['E-mail on failure'] = params.email_on_fail
summary['MultiQC Max Size'] = params.max_multiqc_email_size
}
log.info summary.collect { k,v -> "${k.padRight(20)}: $v" }.join('\n')
log.info "-\033[2m--------------------------------------------------\033[0m-"
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* -- -- */
/* -- PARSE DESIGN FILE -- */
/* -- -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/*
* PREPROCESSING: Reformat design file, check validity and create IP vs control mappings
*/
process CHECK_DESIGN {
tag "$design"
publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode
input:
path design from ch_input
output:
path 'design_reads.csv' into ch_design_reads_csv
path 'design_controls.csv' into ch_design_controls_csv
script: // This script is bundled with the pipeline, in nf-core/chipseq/bin/
"""
check_design.py $design design_reads.csv design_controls.csv
"""
}
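// The two reformatted CSVs written above are consumed below: design_reads.csv holds one row
// per FastQ set (sample_id,fastq_1[,fastq_2]) and design_controls.csv one row per IP/control
// pairing (sample_id,control_id,antibody,replicatesExist,multipleGroups).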
/*
* Create channels for input fastq files
*/
if (params.single_end) {
ch_design_reads_csv
.splitCsv(header:true, sep:',')
.map { row -> [ row.sample_id, [ file(row.fastq_1, checkIfExists: true) ] ] }
.into { ch_raw_reads_fastqc;
ch_raw_reads_trimgalore }
} else {
ch_design_reads_csv
.splitCsv(header:true, sep:',')
.map { row -> [ row.sample_id, [ file(row.fastq_1, checkIfExists: true), file(row.fastq_2, checkIfExists: true) ] ] }
.into { ch_raw_reads_fastqc;
ch_raw_reads_trimgalore }
}
/*
* Create a channel with [sample_id, control id, antibody, replicatesExist, multipleGroups]
*/
ch_design_controls_csv
.splitCsv(header:true, sep:',')
.map { row -> [ row.sample_id, row.control_id, row.antibody, row.replicatesExist.toBoolean(), row.multipleGroups.toBoolean() ] }
.set { ch_design_controls_csv }
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* -- -- */
/* -- PREPARE ANNOTATION FILES -- */
/* -- -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/*
* PREPROCESSING: Build BWA index
*/
if (!params.bwa_index) {
process BWA_INDEX {
tag "$fasta"
label 'process_high'
publishDir path: { params.save_reference ? "${params.outdir}/genome" : params.outdir },
saveAs: { params.save_reference ? it : null }, mode: params.publish_dir_mode
input:
path fasta from ch_fasta
output:
path 'BWAIndex' into ch_bwa_index
script:
"""
bwa index -a bwtsw $fasta
mkdir BWAIndex && mv ${fasta}* BWAIndex
"""
}
}
/*
* PREPROCESSING: Generate gene BED file
*/
// If --gtf is supplied along with --genome
// Make gene bed from supplied --gtf instead of using iGenomes one automatically
def MAKE_BED = false
if (!params.gene_bed) {
MAKE_BED = true
} else if (params.genome && params.gtf) {
if (params.genomes[ params.genome ].gtf != params.gtf) {
MAKE_BED = true
}
}
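// i.e. the gene BED is regenerated whenever no --gene_bed is supplied, or when a user-supplied
// --gtf differs from the iGenomes GTF configured for the selected --genome.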
if (MAKE_BED) {
process MAKE_GENE_BED {
tag "$gtf"
label 'process_low'
publishDir "${params.outdir}/genome", mode: params.publish_dir_mode
input:
path gtf from ch_gtf
output:
path '*.bed' into ch_gene_bed
script: // This script is bundled with the pipeline, in nf-core/chipseq/bin/
"""
gtf2bed $gtf > ${gtf.baseName}.bed
"""
}
}
/*
* PREPROCESSING: Prepare genome intervals for filtering
*/
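// With --blacklist, the included regions are the complement of the (sorted) blacklist against
// the chromosome sizes; without it, every chromosome becomes one full-length BED interval.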
process MAKE_GENOME_FILTER {
tag "$fasta"
publishDir "${params.outdir}/genome", mode: params.publish_dir_mode
input:
path fasta from ch_fasta
path blacklist from ch_blacklist.ifEmpty([])
output:
path "$fasta" // FASTA FILE FOR IGV
path '*.fai' // FAI INDEX FOR REFERENCE GENOME
path '*.bed' into ch_genome_filter_regions // BED FILE WITHOUT BLACKLIST REGIONS
path '*.sizes' into ch_genome_sizes_bigwig // CHROMOSOME SIZES FILE FOR BEDTOOLS
script:
blacklist_filter = params.blacklist ? "sortBed -i $blacklist -g ${fasta}.sizes | complementBed -i stdin -g ${fasta}.sizes" : "awk '{print \$1, '0' , \$2}' OFS='\t' ${fasta}.sizes"
"""
samtools faidx $fasta
cut -f 1,2 ${fasta}.fai > ${fasta}.sizes
$blacklist_filter > ${fasta}.include_regions.bed
"""
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* -- -- */
/* -- FASTQ QC -- */
/* -- -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/*
* STEP 1: FastQC
*/
process FASTQC {
tag "$name"
label 'process_medium'
publishDir "${params.outdir}/fastqc", mode: params.publish_dir_mode,
saveAs: { filename ->
filename.endsWith('.zip') ? "zips/$filename" : filename
}
when:
!params.skip_fastqc
input:
tuple val(name), path(reads) from ch_raw_reads_fastqc
output:
path '*.{zip,html}' into ch_fastqc_reports_mqc
script:
// Added soft-links to original fastqs for consistent naming in MultiQC
if (params.single_end) {
"""
[ ! -f ${name}.fastq.gz ] && ln -s $reads ${name}.fastq.gz
fastqc -q -t $task.cpus ${name}.fastq.gz
"""
} else {
"""
[ ! -f ${name}_1.fastq.gz ] && ln -s ${reads[0]} ${name}_1.fastq.gz
[ ! -f ${name}_2.fastq.gz ] && ln -s ${reads[1]} ${name}_2.fastq.gz
fastqc -q -t $task.cpus ${name}_1.fastq.gz
fastqc -q -t $task.cpus ${name}_2.fastq.gz
"""
}
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* -- -- */
/* -- ADAPTER TRIMMING -- */
/* -- -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/*
* STEP 2: Trim Galore!
*/
if (params.skip_trimming) {
ch_trimmed_reads = ch_raw_reads_trimgalore
ch_trimgalore_results_mqc = Channel.empty()
ch_trimgalore_fastqc_reports_mqc = Channel.empty()
} else {
process TRIMGALORE {
tag "$name"
label 'process_high'
publishDir "${params.outdir}/trim_galore", mode: params.publish_dir_mode,
saveAs: { filename ->
if (filename.endsWith('.html')) "fastqc/$filename"
else if (filename.endsWith('.zip')) "fastqc/zips/$filename"
else if (filename.endsWith('trimming_report.txt')) "logs/$filename"
else params.save_trimmed ? filename : null
}
input:
tuple val(name), path(reads) from ch_raw_reads_trimgalore
output:
tuple val(name), path('*.fq.gz') into ch_trimmed_reads
path '*.txt' into ch_trimgalore_results_mqc
path '*.{zip,html}' into ch_trimgalore_fastqc_reports_mqc
script:
// Calculate number of --cores for TrimGalore based on value of task.cpus
// See: https://github.com/FelixKrueger/TrimGalore/blob/master/Changelog.md#version-060-release-on-1-mar-2019
// See: https://github.com/nf-core/atacseq/pull/65
def cores = 1
if (task.cpus) {
cores = (task.cpus as int) - 4
if (params.single_end) cores = (task.cpus as int) - 3
if (cores < 1) cores = 1
if (cores > 4) cores = 4
}
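// e.g. task.cpus = 8 gives --cores 4 for both paired-end (8 - 4) and single-end (8 - 3, capped at 4) runs;
// low-cpu jobs fall back to the minimum of 1.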
c_r1 = params.clip_r1 > 0 ? "--clip_r1 ${params.clip_r1}" : ''
c_r2 = params.clip_r2 > 0 ? "--clip_r2 ${params.clip_r2}" : ''
tpc_r1 = params.three_prime_clip_r1 > 0 ? "--three_prime_clip_r1 ${params.three_prime_clip_r1}" : ''
tpc_r2 = params.three_prime_clip_r2 > 0 ? "--three_prime_clip_r2 ${params.three_prime_clip_r2}" : ''
nextseq = params.trim_nextseq > 0 ? "--nextseq ${params.trim_nextseq}" : ''
// Added soft-links to original fastqs for consistent naming in MultiQC
if (params.single_end) {
"""
[ ! -f ${name}.fastq.gz ] && ln -s $reads ${name}.fastq.gz
trim_galore --cores $cores --fastqc --gzip $c_r1 $tpc_r1 $nextseq ${name}.fastq.gz
"""
} else {
"""
[ ! -f ${name}_1.fastq.gz ] && ln -s ${reads[0]} ${name}_1.fastq.gz
[ ! -f ${name}_2.fastq.gz ] && ln -s ${reads[1]} ${name}_2.fastq.gz
trim_galore --cores $cores --paired --fastqc --gzip $c_r1 $c_r2 $tpc_r1 $tpc_r2 $nextseq ${name}_1.fastq.gz ${name}_2.fastq.gz
"""
}
}
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* -- -- */
/* -- ALIGN -- */
/* -- -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/*
* STEP 3.1: Map read(s) with bwa mem
*/
process BWA_MEM {
tag "$name"
label 'process_high'
input:
tuple val(name), path(reads) from ch_trimmed_reads
path index from ch_bwa_index.collect()
output:
tuple val(name), path('*.bam') into ch_bwa_bam
script:
prefix = "${name}.Lb"
rg = "\'@RG\\tID:${name}\\tSM:${name.split('_')[0..-2].join('_')}\\tPL:ILLUMINA\\tLB:${name}\\tPU:1\'"
if (params.seq_center) {
rg = "\'@RG\\tID:${name}\\tSM:${name.split('_')[0..-2].join('_')}\\tPL:ILLUMINA\\tLB:${name}\\tPU:1\\tCN:${params.seq_center}\'"
}
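// The SM tag is the sample name minus its final '_'-delimited token, so hypothetical libraries
// 'WT_H3K9me2_R1_T1' and 'WT_H3K9me2_R1_T2' would both carry SM:WT_H3K9me2_R1.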
score = params.bwa_min_score ? "-T ${params.bwa_min_score}" : ''
"""
bwa mem \\
-t $task.cpus \\
-M \\
-R $rg \\
$score \\
${index}/${bwa_base} \\
$reads \\
| samtools view -@ $task.cpus -b -h -F 0x0100 -O BAM -o ${prefix}.bam -
"""
}
/*
* STEP 3.2: Convert BAM to coordinate sorted BAM
*/
process SORT_BAM {
tag "$name"
label 'process_medium'
if (params.save_align_intermeds) {
publishDir path: "${params.outdir}/bwa/library", mode: params.publish_dir_mode,
saveAs: { filename ->
if (filename.endsWith('.flagstat')) "samtools_stats/$filename"
else if (filename.endsWith('.idxstats')) "samtools_stats/$filename"
else if (filename.endsWith('.stats')) "samtools_stats/$filename"
else filename
}
}
input:
tuple val(name), path(bam) from ch_bwa_bam
output:
tuple val(name), path('*.sorted.{bam,bam.bai}') into ch_sort_bam_merge
path '*.{flagstat,idxstats,stats}' into ch_sort_bam_flagstat_mqc
script:
prefix = "${name}.Lb"
"""
samtools sort -@ $task.cpus -o ${prefix}.sorted.bam -T $name $bam
samtools index ${prefix}.sorted.bam
samtools flagstat ${prefix}.sorted.bam > ${prefix}.sorted.bam.flagstat
samtools idxstats ${prefix}.sorted.bam > ${prefix}.sorted.bam.idxstats
samtools stats ${prefix}.sorted.bam > ${prefix}.sorted.bam.stats
"""
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* -- -- */
/* -- MERGE LIBRARY BAM -- */
/* -- -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/*
* STEP 4.1: Merge BAM files for all libraries from same sample replicate
*/
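// Library-level BAMs are grouped by the sample name minus its trailing '_'-delimited token,
// e.g. hypothetical libraries 'WT_R1_T1' and 'WT_R1_T2' are merged under 'WT_R1'.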
ch_sort_bam_merge
.map { it -> [ it[0].split('_')[0..-2].join('_'), it[1] ] }
.groupTuple(by: [0])
.map { it -> [ it[0], it[1].flatten() ] }
.set { ch_sort_bam_merge }
process MERGED_BAM {
tag "$name"
label 'process_medium'
publishDir "${params.outdir}/bwa/mergedLibrary", mode: params.publish_dir_mode,
saveAs: { filename ->
if (filename.endsWith('.flagstat')) "samtools_stats/$filename"
else if (filename.endsWith('.idxstats')) "samtools_stats/$filename"
else if (filename.endsWith('.stats')) "samtools_stats/$filename"
else if (filename.endsWith('.metrics.txt')) "picard_metrics/$filename"
else params.save_align_intermeds ? filename : null
}
input:
tuple val(name), path(bams) from ch_sort_bam_merge
output:
tuple val(name), path("*${prefix}.sorted.{bam,bam.bai}") into ch_merge_bam_filter,
ch_merge_bam_preseq
path '*.{flagstat,idxstats,stats}' into ch_merge_bam_stats_mqc
path '*.txt' into ch_merge_bam_metrics_mqc
script:
prefix = "${name}.mLb.mkD"
bam_files = bams.findAll { it.toString().endsWith('.bam') }.sort()
def avail_mem = 3
if (!task.memory) {
log.info '[Picard MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'
} else {
avail_mem = task.memory.toGiga()
}
if (bam_files.size() > 1) {
"""
picard -Xmx${avail_mem}g MergeSamFiles \\
${'INPUT='+bam_files.join(' INPUT=')} \\
OUTPUT=${name}.sorted.bam \\
SORT_ORDER=coordinate \\
VALIDATION_STRINGENCY=LENIENT \\
TMP_DIR=tmp
samtools index ${name}.sorted.bam
picard -Xmx${avail_mem}g MarkDuplicates \\
INPUT=${name}.sorted.bam \\
OUTPUT=${prefix}.sorted.bam \\
ASSUME_SORTED=true \\
REMOVE_DUPLICATES=false \\
METRICS_FILE=${prefix}.MarkDuplicates.metrics.txt \\
VALIDATION_STRINGENCY=LENIENT \\
TMP_DIR=tmp
samtools index ${prefix}.sorted.bam
samtools idxstats ${prefix}.sorted.bam > ${prefix}.sorted.bam.idxstats
samtools flagstat ${prefix}.sorted.bam > ${prefix}.sorted.bam.flagstat
samtools stats ${prefix}.sorted.bam > ${prefix}.sorted.bam.stats
"""
} else {
"""
picard -Xmx${avail_mem}g MarkDuplicates \\
INPUT=${bam_files[0]} \\
OUTPUT=${prefix}.sorted.bam \\
ASSUME_SORTED=true \\
REMOVE_DUPLICATES=false \\
METRICS_FILE=${prefix}.MarkDuplicates.metrics.txt \\
VALIDATION_STRINGENCY=LENIENT \\
TMP_DIR=tmp
samtools index ${prefix}.sorted.bam
samtools idxstats ${prefix}.sorted.bam > ${prefix}.sorted.bam.idxstats
samtools flagstat ${prefix}.sorted.bam > ${prefix}.sorted.bam.flagstat
samtools stats ${prefix}.sorted.bam > ${prefix}.sorted.bam.stats
"""
}
}
/*
* STEP 4.2: Filter BAM file at merged library-level
*/
process MERGED_BAM_FILTER {
tag "$name"
label 'process_medium'
publishDir path: "${params.outdir}/bwa/mergedLibrary", mode: params.publish_dir_mode,
saveAs: { filename ->
if (params.single_end || params.save_align_intermeds) {
if (filename.endsWith('.flagstat')) "samtools_stats/$filename"
else if (filename.endsWith('.idxstats')) "samtools_stats/$filename"
else if (filename.endsWith('.stats')) "samtools_stats/$filename"
else if (filename.endsWith('.sorted.bam')) filename
else if (filename.endsWith('.sorted.bam.bai')) filename
else null
}
}
input:
tuple val(name), path(bam) from ch_merge_bam_filter
path bed from ch_genome_filter_regions.collect()
path bamtools_filter_config from ch_bamtools_filter_config
output:
tuple val(name), path('*.{bam,bam.bai}') into ch_filter_bam
tuple val(name), path('*.flagstat') into ch_filter_bam_flagstat
path '*.{idxstats,stats}' into ch_filter_bam_stats_mqc
script:
prefix = params.single_end ? "${name}.mLb.clN" : "${name}.mLb.flT"
filter_params = params.single_end ? '-F 0x004' : '-F 0x004 -F 0x0008 -f 0x001'
dup_params = params.keep_dups ? '' : '-F 0x0400'
multimap_params = params.keep_multi_map ? '' : '-q 1'
blacklist_params = params.blacklist ? "-L $bed" : ''
name_sort_bam = params.single_end ? '' : "samtools sort -n -@ $task.cpus -o ${prefix}.bam -T $prefix ${prefix}.sorted.bam"
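// SAM flag reference for the filters above: 0x004 = read unmapped, 0x0008 = mate unmapped,
// 0x001 = read paired, 0x0400 = PCR/optical duplicate; '-q 1' drops MAPQ 0 reads,
// the value BWA assigns to multi-mapping alignments.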
"""
samtools view \\
$filter_params \\
$dup_params \\
$multimap_params \\
$blacklist_params \\
-b ${bam[0]} \\
| bamtools filter \\
-out ${prefix}.sorted.bam \\
-script $bamtools_filter_config
samtools index ${prefix}.sorted.bam
samtools flagstat ${prefix}.sorted.bam > ${prefix}.sorted.bam.flagstat
samtools idxstats ${prefix}.sorted.bam > ${prefix}.sorted.bam.idxstats
samtools stats ${prefix}.sorted.bam > ${prefix}.sorted.bam.stats
$name_sort_bam
"""
}
/*
* STEP 4.3: Remove orphan reads from paired-end BAM file
*/
if (params.single_end) {
ch_filter_bam
.into { ch_rm_orphan_bam_metrics;
ch_rm_orphan_bam_bigwig;
ch_rm_orphan_bam_macs_1;
ch_rm_orphan_bam_macs_2;
ch_rm_orphan_bam_phantompeakqualtools;
ch_rm_orphan_name_bam_counts }
ch_filter_bam_flagstat
.into { ch_rm_orphan_flagstat_bigwig;
ch_rm_orphan_flagstat_macs;
ch_rm_orphan_flagstat_mqc }
ch_filter_bam_stats_mqc
.set { ch_rm_orphan_stats_mqc }
} else {
process MERGED_BAM_REMOVE_ORPHAN {
tag "$name"
label 'process_medium'
publishDir path: "${params.outdir}/bwa/mergedLibrary", mode: params.publish_dir_mode,
saveAs: { filename ->
if (filename.endsWith('.flagstat')) "samtools_stats/$filename"
else if (filename.endsWith('.idxstats')) "samtools_stats/$filename"
else if (filename.endsWith('.stats')) "samtools_stats/$filename"
else if (filename.endsWith('.sorted.bam')) filename
else if (filename.endsWith('.sorted.bam.bai')) filename
else null
}
input:
tuple val(name), path(bam) from ch_filter_bam
output:
tuple val(name), path('*.sorted.{bam,bam.bai}') into ch_rm_orphan_bam_metrics,
ch_rm_orphan_bam_bigwig,
ch_rm_orphan_bam_macs_1,
ch_rm_orphan_bam_macs_2,
ch_rm_orphan_bam_phantompeakqualtools
tuple val(name), path("${prefix}.bam") into ch_rm_orphan_name_bam_counts
tuple val(name), path('*.flagstat') into ch_rm_orphan_flagstat_bigwig,
ch_rm_orphan_flagstat_macs,
ch_rm_orphan_flagstat_mqc
path '*.{idxstats,stats}' into ch_rm_orphan_stats_mqc
script: // This script is bundled with the pipeline, in nf-core/chipseq/bin/
prefix = "${name}.mLb.clN"
"""
bampe_rm_orphan.py ${bam[0]} ${prefix}.bam --only_fr_pairs
samtools sort -@ $task.cpus -o ${prefix}.sorted.bam -T $prefix ${prefix}.bam
samtools index ${prefix}.sorted.bam
samtools flagstat ${prefix}.sorted.bam > ${prefix}.sorted.bam.flagstat
samtools idxstats ${prefix}.sorted.bam > ${prefix}.sorted.bam.idxstats
samtools stats ${prefix}.sorted.bam > ${prefix}.sorted.bam.stats
"""
}
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* -- -- */
/* -- MERGE LIBRARY BAM POST-ANALYSIS -- */
/* -- -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/*
* STEP 5.1: Preseq analysis after merging libraries and before filtering
*/
process PRESEQ {
tag "$name"
label 'process_medium'
label 'error_ignore'
publishDir "${params.outdir}/bwa/mergedLibrary/preseq", mode: params.publish_dir_mode
when:
!params.skip_preseq
input:
tuple val(name), path(bam) from ch_merge_bam_preseq
output:
path '*.ccurve.txt' into ch_preseq_mqc
path '*.log'
script:
pe = params.single_end ? '' : '-pe'
"""
preseq lc_extrap \\
-output ${name}.ccurve.txt \\
-verbose \\
-bam \\
$pe \\
-seed 1 \\
${bam[0]}
cp .command.err ${name}.command.log
"""
}
/*
* STEP 5.2: Picard CollectMultipleMetrics after merging libraries and filtering
*/
process PICARD_METRICS {
tag "$name"
label 'process_medium'
publishDir path: "${params.outdir}/bwa/mergedLibrary", mode: params.publish_dir_mode,
saveAs: { filename ->
if (filename.endsWith('_metrics')) "picard_metrics/$filename"
else if (filename.endsWith('.pdf')) "picard_metrics/pdf/$filename"
else null
}
when:
!params.skip_picard_metrics
input:
tuple val(name), path(bam) from ch_rm_orphan_bam_metrics
path fasta from ch_fasta
output:
path '*_metrics' into ch_collectmetrics_mqc
path '*.pdf'
script:
prefix = "${name}.mLb.clN"
def avail_mem = 3
if (!task.memory) {
log.info '[Picard CollectMultipleMetrics] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'
} else {
avail_mem = task.memory.toGiga()
}
"""
picard -Xmx${avail_mem}g CollectMultipleMetrics \\
INPUT=${bam[0]} \\
OUTPUT=${prefix}.CollectMultipleMetrics \\
REFERENCE_SEQUENCE=$fasta \\
VALIDATION_STRINGENCY=LENIENT \\
TMP_DIR=tmp
"""
}
/*
* STEP 5.3: Read depth normalised bigWig
*/
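// Coverage is scaled to reads per million mapped: SCALE_FACTOR = 1e6 / mapped reads,
// e.g. 20,000,000 mapped reads give a scale factor of 0.05.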
process BIGWIG {
tag "$name"
label 'process_medium'
publishDir "${params.outdir}/bwa/mergedLibrary/bigwig", mode: params.publish_dir_mode,
saveAs: { filename ->
if (filename.endsWith('scale_factor.txt')) "scale/$filename"
else if (filename.endsWith('.bigWig')) filename
else null
}
input:
tuple val(name), path(bam), path(flagstat) from ch_rm_orphan_bam_bigwig.join(ch_rm_orphan_flagstat_bigwig, by: [0])
path sizes from ch_genome_sizes_bigwig.collect()
output:
tuple val(name), path('*.bigWig') into ch_bigwig_plotprofile
path '*igv.txt' into ch_bigwig_igv
path '*scale_factor.txt'
script:
pe_fragment = params.single_end ? '' : '-pc'
extend = (params.single_end && params.fragment_size > 0) ? "-fs ${params.fragment_size}" : ''
"""
SCALE_FACTOR=\$(grep 'mapped (' $flagstat | awk '{print 1000000/\$1}')
echo \$SCALE_FACTOR > ${name}.scale_factor.txt
genomeCoverageBed -ibam ${bam[0]} -bg -scale \$SCALE_FACTOR $pe_fragment $extend | sort -T '.' -k1,1 -k2,2n > ${name}.bedGraph
bedGraphToBigWig ${name}.bedGraph $sizes ${name}.bigWig
find * -type f -name "*.bigWig" -exec echo -e "bwa/mergedLibrary/bigwig/"{}"\\t0,0,178" \\; > ${name}.bigWig.igv.txt
"""
}
/*
* STEP 5.4: Generate gene body coverage plot with deepTools plotProfile and plotHeatmap
*/
process PLOTPROFILE {
tag "$name"
label 'process_high'
publishDir "${params.outdir}/bwa/mergedLibrary/deepTools/plotProfile", mode: params.publish_dir_mode
when:
!params.skip_plot_profile
input:
tuple val(name), path(bigwig) from ch_bigwig_plotprofile
path bed from ch_gene_bed
output:
path '*.plotProfile.tab' into ch_plotprofile_mqc
path '*.{gz,pdf,mat.tab}'
script:
"""
computeMatrix scale-regions \\
--regionsFileName $bed \\
--scoreFileName $bigwig \\
--outFileName ${name}.computeMatrix.mat.gz \\
--outFileNameMatrix ${name}.computeMatrix.vals.mat.tab \\
--regionBodyLength 1000 \\
--beforeRegionStartLength 3000 \\
--afterRegionStartLength 3000 \\
--skipZeros \\
--smartLabels \\
--numberOfProcessors $task.cpus
plotProfile --matrixFile ${name}.computeMatrix.mat.gz \\
--outFileName ${name}.plotProfile.pdf \\
--outFileNameData ${name}.plotProfile.tab
plotHeatmap --matrixFile ${name}.computeMatrix.mat.gz \\
--outFileName ${name}.plotHeatmap.pdf \\
--outFileNameMatrix ${name}.plotHeatmap.mat.tab
"""
}
/*
* STEP 5.5: Phantompeakqualtools
*/
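// Columns 9 and 10 of the *.spp.out table (the NSC and RSC cross-correlation coefficients)
// are extracted below and prepended with the MultiQC headers.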
process PHANTOMPEAKQUALTOOLS {
tag "$name"
label 'process_medium'
publishDir "${params.outdir}/bwa/mergedLibrary/phantompeakqualtools", mode: params.publish_dir_mode
when:
!params.skip_spp
input:
tuple val(name), path(bam) from ch_rm_orphan_bam_phantompeakqualtools
path spp_correlation_header from ch_spp_correlation_header
path spp_nsc_header from ch_spp_nsc_header
path spp_rsc_header from ch_spp_rsc_header
output:
path '*.spp.out' into ch_spp_out_mqc
path '*_mqc.tsv' into ch_spp_csv_mqc
path '*.pdf'
script:
"""
RUN_SPP=`which run_spp.R`
Rscript -e "library(caTools); source(\\"\$RUN_SPP\\")" -c="${bam[0]}" -savp="${name}.spp.pdf" -savd="${name}.spp.Rdata" -out="${name}.spp.out" -p=$task.cpus
cp $spp_correlation_header ${name}_spp_correlation_mqc.tsv
Rscript -e "load('${name}.spp.Rdata'); write.table(crosscorr\\\$cross.correlation, file=\\"${name}_spp_correlation_mqc.tsv\\", sep=",", quote=FALSE, row.names=FALSE, col.names=FALSE,append=TRUE)"
awk -v OFS='\t' '{print "${name}", \$9}' ${name}.spp.out | cat $spp_nsc_header - > ${name}_spp_nsc_mqc.tsv
awk -v OFS='\t' '{print "${name}", \$10}' ${name}.spp.out | cat $spp_rsc_header - > ${name}_spp_rsc_mqc.tsv
"""
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* -- -- */
/* -- END OF PIPELINE (peak calling etc. not performed) -- */
/* -- -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////