Content - f58fc6a4649e729e71f9b1e541dac96239ae1b6e - 31ee1e7/scripts/ChIP-TRA1-snakefile

visit type:
https://github.com/starostikm/SNPC-1.3

11 February 2021, 00:44:05 UTC
Tip revision: b23f652341d999150edf5ae9c8de72e9192b2843 authored by Margaret R. Starostik on 09 February 2021, 03:23:15 UTC
edited abstract
Tip revision: b23f652
ChIP-TRA1-snakefile
# vim: set ft=python:

shell.prefix("""
set -o pipefail
set -e
""")

"""
Author: Margaret R. Starostik
Aim: Snakemake workflow for TRA-1 ChIP-seq analysis
Date: February 18, 2020
"""

##-------------------------------------------------------------------------------------------------------------------------------------------------------------
## MODULES

import glob
from os.path import join, basename, dirname

BEDTOOLS_VERSION = "2.27.0"
BOWTIE2_VERSION = "2.3.4.2"
FASTQC_VERSION = "0.11.7"
JAVA_VERSION = "1.8.0_181"
MACS2_VERSION = "2.1.2"
R_VERSION = "3.5.1"
SAMTOOLS_VERSION = "1.9"
TRIMGALORE_VERSION = "0.5.0"

PYTHON = "/path/to/etc/profile.d/conda.sh"
ENV = "python27"

##-------------------------------------------------------------------------------------------------------------------------------------------------------------
## DIRECTORIES

"""
-BASE (/path/to/base)
    -PROJECT (TRA-1)
        --RawData
            ---FASTQ
        --ProcessedData
            ---00_RENAMED
            ---01_TRIMMED
            ---02_ALIGNED
            ---03_DEDUPED
                 ----NoModel250
                ----NoModel550
        --CleanData
        --QualityControl
            ---CrossCorrelation
            ---FastQC
                ----Unprocessed
                ----Processed
            ---FileCheck
            ---Fingerprint
            ---GenomeSize
            ---PairwiseCorrelations
            ---Pileup
        --logs   
"""

# base directory
BASE_DIR = "/path/to/base"
PROJECT = "TRA-1"
PROJECT_DIR = "{0}/{1}".format(BASE_DIR, PROJECT)

# project directories
FASTQ_DIR = PROJECT_DIR + "/ProcessedData/00_RENAMED"

UFASTQC_DIR = PROJECT_DIR + "/QualityControl/FastQC/Unprocessed"
PFASTQC_DIR = PROJECT_DIR + "/QualityControl/FastQC/Processed"

TRIM_DIR = PROJECT_DIR + "/ProcessedData/01_TRIMMED"
ALIGN_DIR = PROJECT_DIR + "/ProcessedData/02_ALIGNED"
FILTER_DIR = PROJECT_DIR + "/ProcessedData/03_FILTERED"

PEAKS_DIR = PROJECT_DIR + "/ProcessedData/04_PEAKS"
METHOD01_DIR = PEAKS_DIR + "/Method01"

CORR_DIR = PROJECT_DIR + "/QualityControl/Correlation"
FINGERPRINT_DIR = PROJECT_DIR + "/QualityControl/Fingerprint"
PILEUP_DIR = PROJECT_DIR + "/QualityControl/Pileup"

##-------------------------------------------------------------------------------------------------------------------------------------------------------------
## GLOBALS: DECLARE VARIABLES USED IN SUBSEQUENT STEPS

# base path to references
REF_BASEPATH = "/home-3/mstaros1@jhu.edu/references/ensembl97.269_WBcel235"

# full path to Bowtie2 indices
REF_SOURCE = "/WBcel235_ensembl97.269/BOWTIE2/WBcel235"

# full path to black list
BLACKLIST_NAME = "ce11-blacklist.v2"
BLACKLIST = "{0}/{1}.bed".format(REF_BASEPATH, BLACKLIST_NAME)

##-------------------------------------------------------------------------------------------------------------------------------------------------------------
## SAMPLES TO BE PROCESSED

# All samples
CONTROLS= ["ChIP2020-001-N2-72hGravid-WW", "ChIP2020-002-TRA13xFLAG-72hGravid-WW", "ChIP2020-003-TRA13xFLAG_1xTBS-72hGravid-WW", "ChIP2020-004-TRA13xFLAG_2xTBS-72hGravid-WW", "ChIP2020-005-TRA13xFLAG_3xTBS-72hGravid-WW", \
"ChIP2020-011-N2-72hGravid-WW", "ChIP2020-012-TRA13xFLAG-72hGravid-WW", "ChIP2020-013-TRA13xFLAG_1xTBS-72hGravid-WW", "ChIP2020-014-TRA13xFLAG_2xTBS-72hGravid-WW", "ChIP2020-015-TRA13xFLAG_3xTBS-72hGravid-WW"] 
CASES = ["ChIP2020-006-N2-72hGravid-WW", "ChIP2020-007-TRA13xFLAG-72hGravid-WW", "ChIP2020-008-TRA13xFLAG_1xTBS-72hGravid-WW", "ChIP2020-009-TRA13xFLAG_2xTBS-72hGravid-WW", "ChIP2020-010-TRA13xFLAG_3xTBS-72hGravid-WW", \
"ChIP2020-016-N2-72hGravid-WW", "ChIP2020-017-TRA13xFLAG-72hGravid-WW", "ChIP2020-018-TRA13xFLAG_1xTBS-72hGravid-WW", "ChIP2020-019-TRA13xFLAG_2xTBS-72hGravid-WW", "ChIP2020-020-TRA13xFLAG_3xTBS-72hGravid-WW"] 

SAMPLES = CONTROLS + CASES

IP_BAM = expand(FILTER_DIR + "/dedup_{cases}.bam", zip, cases = CASES, controls = CONTROLS)
INPUT_BAM = expand(FILTER_DIR + "/dedup_{controls}.bam", zip, cases = CASES, controls = CONTROLS)

# CROSS CORRELATION
CROSSCORR_SUMMARY = expand(CROSSCORR_DIR + "/{sample}_summary.txt", sample = SAMPLES)
CROSSCORR_PDF = expand(CROSSCORR_DIR + "/{sample}_CrossCorr.pdf", sample = SAMPLES)

# FINGERPRINT
FF_PLOT = expand(FINGERPRINT_DIR + "/{cases}_{contols}_FingerPrintPlot.pdf", zip, cases = CASES, contols = CONTROLS)

# PILEUP
PILEUP = expand(PILEUP_DIR + "/{cases}_vs_{contols}_CPMratio_Pileup.bw", zip, cases = CASES, contols = CONTROLS)
LOG2_PILEUP = expand(PILEUP_DIR + "/{cases}_vs_{contols}_CPMlog2ratio_Pileup.bw", zip, cases = CASES, contols = CONTROLS)
BIGWIG_SUMMARY = PILEUP_DIR + "/CPMratio_bigWigSummary.npz"


PEARSON_TXT = CORR_DIR + "/SampleCorrelation_CPMratio.txt"
PEARSON_PDF = CORR_DIR + "/SampleCorrelation_CPMratio.pdf" 

##-------------------------------------------------------------------------------------------------------------------------------------------------------------
## LOCAL RULES: RULES THAT CAN BE RUN LOCALLY

localrules: all

"""
RULE ALL: SNAKEMAKE ACCEPTS RULE NAMES AS TARGETS IF THE REFERRED RULE DOES NOT HAVE WILDCARDS; IF NO TARGET IS GIVEN AT COMMAND LINE AT THE TOP OF THE \
WORKFLOW, SNAKEMAKE WILL DEFINE THE FIRST FILE OF THE SNAKEFILE AS THE TARGET. THEREFORE, IT IS BEST PRACTICE TO HAVE A "RULE ALL" AT THE TOP OF THE \
WORKFLOW WHICH HAS ALL THE DESIRED TARGET FILES OF THE PIPELINE AS INPUT FILES.OBTAIN THE DESIRED TARGET FILES FROM "SAMPLES TO BE PROCESSED".
"""

rule all:
    input:
        FF_PLOT,
        PILEUP,
        LOG2_PILEUP,
        BIGWIG_SUMMARY,
        PEARSON_TXT,
        PEARSON_PDF

#-------------------------------------------------------------------------------------------------------------------------------------------------------------
rule crosscorr:
    """
    Cross-correlation analysis is done on filtered, but not de-duped, and sub-sampled BAM files. The read trimming requirements are 50bp.
    """
    input:
        dbam = DEDUP_DIR + "/deduped_{sample}.bam",
        idbam = DEDUP_DIR + "/deduped_{sample}.bam"
    output:
        crosscorr_summary = CROSSCORR_DIR + "/{sample}_summary.txt",
        crosscorr_pdf = CROSSCORR_DIR + "/{sample}_CrossCorr.pdf"
    threads: 8
    params:
        partition = "shared",
        mem = "5G",
        time = "10:00:00"
    shell:
        """
        module load R/{R_VERSION} || exit 1
        module load samtools/{SAMTOOLS_VERSION} || exit 1
        Rscript /home-3/mstaros1@jhu.edu/tools/run_spp.R \
        -c={input.dbam} \
        -out={output.crosscorr_summary} \
        -savp={output.crosscorr_pdf}
        """
##-------------------------------------------------------------------------------------------------------------------------------------------------------------
rule fingerprint:
    """
    Fingerprints.Takes long time.
    """
    input:
        ip_bam = FILTER_DIR + "/dedup_{cases}.bam",,
        input_bam = FILTER_DIR + "/dedup_{controls}.bam",
        blacklist = BLACKLIST
    output:
        ff_plot = FINGERPRINT_DIR + "/{cases}_{contols}_FingerPrintPlot.pdf"
    threads: 8
    params:
        partition = "shared",
        mem = "30G",
        time = "48:00:00"
    shell:
        """
        source {PYTHON}
        conda activate {ENV}
        plotFingerprint \
        --bamfiles {input.ip_bam} {input.input_bam} \
        --plotFile {output.ff_plot} \
        --labels IP Input \
        --blackListFileName {input.blacklist}
        """

##-------------------------------------------------------------------------------------------------------------------------------------------------------------
rule pileup:
    """
    A bigWig file is generated based on two BAM files that are compared to each other while normalized for sequencing depth. \
    Compares two BAM files based on the number of mapped reads by partitioning the genome into equal size bins. The number of reads falling into each bin is \
    counted for each file, and a summary value is reported as the ratio of the number of reads per bin. The number of reads in each BAM file is normalized \
    using CPM. The effective genome size was estimated from the table provided by deeptools (https://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html) \
    for read length 50 and reference genome WBcel235. 728500, the number of bases in the blacklist file, was subtracted from the listed value 95159452 to obtain \
    an effective genome size of 94430952.
    """ 
    input:
        ip_bam = FILTER_DIR + "/dedup_{cases}.bam",
        ip_bai = FILTER_DIR + "/dedup_{cases}.bam.bai",
        input_bam = FILTER_DIR + "/dedup_{contols}.bam",
        input_bai = FILTER_DIR + "/dedup_{contols}.bam.bai",
        blacklist = BLACKLIST
    output:
        pileup = PILEUP_DIR + "/{cases}_vs_{contols}_CPMratio_Pileup.bw"
    threads: 8
    params:
        partition = "shared",
        mem = "5G",
        time = "1:00:00"
    shell:
        """
        source {PYTHON}
        conda activate {ENV}
        bamCompare \
        --bamfile1 {input.ip_bam} \
        --bamfile2 {input.input_bam} \
        --outFileName {output.pileup} \
        --scaleFactorsMethod None \
        --effectiveGenomeSize 94430952 \
        --normalizeUsing CPM \
        --operation ratio \
        --pseudocount 1 \
        --binSize 50 \
        --blackListFileName {input.blacklist} \
        --ignoreForNormalization MtDNA \
        --extendReads 200
        """

##-------------------------------------------------------------------------------------------------------------------------------------------------------------
rule log2pileup:
    """
    A bigWig file is generated based on two BAM files that are compared to each other while normalized for sequencing depth. \
    Compares two BAM files based on the number of mapped reads by partitioning the genome into equal size bins. The number of reads falling into each bin is \
    counted for each file, and a summary value is reported as the ratio of the number of reads per bin. The number of reads in each BAM file is normalized \
    using CPM. The effective genome size was estimated from the table provided by deeptools (https://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html) \
    for read length 50 and reference genome WBcel235. 728500, the number of bases in the blacklist file, was subtracted from the listed value 95159452 to obtain \
    an effective genome size of 94430952.
    """ 
    input:
        ip_bam = FILTER_DIR + "/dedup_{cases}.bam",
        ip_bai = FILTER_DIR + "/dedup_{cases}.bam.bai",
        input_bam = FILTER_DIR + "/dedup_{contols}.bam",
        input_bai = FILTER_DIR + "/dedup_{contols}.bam.bai",
        blacklist = BLACKLIST
    output:
        log2_pileup = PILEUP_DIR + "/{cases}_vs_{contols}_CPMlog2ratio_Pileup.bw"
    threads: 8
    params:
        partition = "shared",
        mem = "5G",
        time = "1:00:00"
    shell:
        """
        source {PYTHON}
        conda activate {ENV}
        bamCompare \
        --bamfile1 {input.ip_bam} \
        --bamfile2 {input.input_bam} \
        --outFileName {output.log2_pileup} \
        --scaleFactorsMethod None \
        --effectiveGenomeSize 94430952 \
        --normalizeUsing CPM \
        --operation log2 \
        --pseudocount 1 \
        --binSize 50 \
        --blackListFileName {input.blacklist} \
        --ignoreForNormalization MtDNA \
        --extendReads 200
        """

##-------------------------------------------------------------------------------------------------------------------------------------------------------------Y
rule bigwig:
    """
    Compute average scrores for each of the files in every genomic region. The output is a compressed numpy array and can be used by other tools such as \
    plotCorrelation or plotPCA.
    """
    input:
        all_pileups = expand(PILEUP_DIR + "/{cases}_vs_{contols}_CPMratio_Pileup.bw", zip, cases = CASES, contols = CONTROLS),
        blacklist = BLACKLIST
    output:
        bigwig_summary = PILEUP_DIR + "/CPMratio_bigWigSummary.npz"
    threads: 8
    params:
        partition = "shared",
        mem = "10G",
        time = "2:00:00"
    shell:
        """
        source {PYTHON}
        conda activate {ENV}
        multiBigwigSummary \
        bins \
        --bwfiles {input.all_pileups} \
        --outFileName {output.bigwig_summary} \
        --chromosomesToSkip MtDNA \
        --blackListFileName {input.blacklist}
        """
        
##-------------------------------------------------------------------------------------------------------------------------------------------------------------
rule pearson_corr:
    """
    
    """
    input:
        bigwig_summary = PILEUP_DIR + "/CPMratio_bigWigSummary.npz"
    output:
        pearson_txt = CORR_DIR + "/SampleCorrelation_CPMratio.txt",
        pearson_pdf = CORR_DIR + "/SampleCorrelation_CPMratio.pdf"
    threads: 8
    params:
        partition = "shared",
        mem = "1G",
        time = "1:00:00"
    shell:
        """
        source {PYTHON}
        conda activate {ENV}
        plotCorrelation \
        --corData {input.bigwig_summary} \
        --corMethod pearson \
        --whatToPlot scatterplot \
        --plotFile {output.pearson_pdf} \
        --outFileCorMatrix {output.pearson_txt} \
        --labels N2
        """