https://github.com/arjunrajlaboratory/atac-seq_pipeline_paired-end
Tip revision: b165422254188ce4f89074374967f3390fd5d875 authored by emsanford on 25 February 2021, 23:34:09 UTC
Update concat_zipped_fastq_files_from_illumina.py
Update concat_zipped_fastq_files_from_illumina.py
Tip revision: b165422
atac_pipeline.py
import sys
import logging
import argparse
from subprocess import Popen, PIPE
import os
import gzip
import random
#note: the call to this script should be preprended with a 'set_atac_pmacs_env' command that sets the paths for the appropriate versions of each tool called in this pipeline.
#software references
refs_dir = '/project/arjunrajlab/refs'
bowtie2_index = refs_dir + '/bowtie2/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.bowtie_index'
gencode_tss_areas = refs_dir + '/atac_pipeline_refs/GENCODEv24_TSS_hg38_radius500.bed'
refseq_tss_areas = refs_dir + '/atac_pipeline_refs/UCSC_RefSeq_TSS_hg38_radius500.bed'
chrom_sizes_file = refs_dir + '/atac_pipeline_refs/hg38.chrom.sizes'
hg38_blacklist = refs_dir + '/atac_pipeline_refs/hg38.blacklist.sorted.bed'
class ATAC_Sample:
"""
Lightweight class to store filepaths for atac-seq sample processing
"""
def __init__(self, sample_name, input_data_dir, output_dir, macs2_window_size, macs2_summit_window_radius):
self.name = sample_name
self.input_data_dir = input_data_dir
self.output_data_dir = output_dir + '/' + sample_name
# input fastq files (paired end)
self.read1_path = '{0}/{1}'.format(input_data_dir, sample_name + '_R1.fastq.gz')
self.read2_path = '{0}/{1}'.format(input_data_dir, sample_name + '_R2.fastq.gz')
# bowtie2 alignment files
self.bowtie2_stderr_logfile = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'log_stderr_bowtie2.txt')
self.bowtie2_aligned_reads = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'bowtie2_aligned_reads.bam')
# alignment filtering files
self.tmp_early_filt_output = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'tmp.lightFilt.bam')
self.early_filt_fixmate_output = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'lightFilt.fixmate.bam')
self.filtered_bam_file = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'filt.bam')
self.filtered_bam_file_namesorted = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'filt.nameSorted.bam')
self.picard_metrics_file = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'picardMetrics.txt')
self.picard_output = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'picardMarkedDups.bam')
self.final_bam_file = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'final.bam')
self.final_bam_stats = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'final.stats.txt')
self.final_bam_file_namesorted = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'final.nameSorted.bam')
# text file representations of filtered aligned reads
self.early_tagalign_file = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'early.tagAlign.gz')
self.final_tagalign_file = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'final.tagAlign.gz')
self.final_Tn5shifted_tagAlign_file = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'final.tagAlign_tn5_shifted.gz')
self.final_bedpe_file = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'final.bedpe.gz')
self.macs2_unsorted_tagAlign_file = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'macs2_formatted.unsorted.tagAlign')
self.macs2_tagAlign_file = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'Tn5_insertion_points.tagAlign.gz')
# subsampled files
self.subsampled_tagalign_file = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'subsampled.tagAlign.gz')
# QC and summary report files
self.insert_size_histogram_plot = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'insert_size_histogram.pdf')
self.insert_size_histogram_data = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'insert_size_data.txt')
self.mitochondrial_read_report = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'mitochondrial_read_report.txt')
self.refseq_tss_report = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'refseq_tss_report.txt')
self.gencode_tss_report = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'gencode_tss_report.txt')
self.pbc_qc_file = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'final.pbc.qc.txt')
# macs2 peak calling files
macs2_folder_name = 'macs2_output_windowSize{0}'.format(macs2_window_size)
self.macs2_output_dir = '{0}/{1}'.format(self.output_data_dir, macs2_folder_name)
self.macs2_cvg_bedgraph_output = '{0}/{2}/{1}_treat_pileup.bdg'.format(self.output_data_dir, sample_name, macs2_folder_name)
self.macs2_tmp_sorted_bedgraph_file = '{0}/{2}/{1}.macs2.tmp.sorted.bedGraph'.format(self.output_data_dir, sample_name, macs2_folder_name)
self.macs2_tmp_clipped_bedgraphFile = '{0}/{2}/{1}.macs2.tmp.sorted.clipped.bedGraph'.format(self.output_data_dir, sample_name, macs2_folder_name)
self.macs2_smooth_bigWig_file = '{0}/{2}/{1}.smoothed_Tn5insertionPoints.bigWig'.format(self.output_data_dir, sample_name, macs2_folder_name)
#TODO
self.macs2_narrowPeak_file = '{0}/{1}/{2}_peaks.narrowPeak'.format(self.output_data_dir, macs2_folder_name, sample_name)
self.macs2_narrowPeak_fullChrsOnly = '{0}/{1}/{2}_fullChrsOnly.narrowPeak'.format(self.output_data_dir, macs2_folder_name, sample_name)
self.macs2_narrowPeak_blacklistFiltered = '{0}/{1}/{2}_blacklistFiltered.narrowPeak'.format(self.output_data_dir, macs2_folder_name, sample_name)
self.macs2_summits_file = '{0}/{1}/{2}_summits.bed'.format(self.output_data_dir, macs2_folder_name, sample_name)
#TODO
self.macs2_summits_file_fullChrsOnly = '{0}/{1}/{2}_summits.fullChrsOnly.bed'.format(self.output_data_dir, macs2_folder_name, sample_name)
self.macs2_summits_blacklistFilteredfile = '{0}/{1}/{2}_summits.blacklistFiltered.bed'.format(self.output_data_dir, macs2_folder_name, sample_name)
#TODO
self.macs2_summits_blFilt_sorted = '{0}/{1}/{2}_summits.blacklistFiltered.sorted.bed'.format(self.output_data_dir, macs2_folder_name, sample_name)
self.macs2_summits_top300k = '{0}/{1}/{2}_summits.top300k.bed'.format(self.output_data_dir, macs2_folder_name, sample_name)
self.macs2_summits_top300k_posnSorted = '{0}/{1}/{2}_summits.top300k.posnSorted.bed'.format(self.output_data_dir, macs2_folder_name, sample_name)
self.macs2_summits_top300k_windows = '{0}/{1}/{2}_summits.top300k.windowedRadius{3}.bed'.format(self.output_data_dir, macs2_folder_name, sample_name, macs2_summit_window_radius)
#files to be deleted after pipeline runs
self.intermediate_file_list = [self.tmp_early_filt_output, self.early_filt_fixmate_output, self.early_tagalign_file, self.filtered_bam_file_namesorted,
self.filtered_bam_file, self.picard_output, self.final_bam_file_namesorted, self.macs2_unsorted_tagAlign_file,
self.macs2_cvg_bedgraph_output, self.macs2_tmp_sorted_bedgraph_file, self.macs2_tmp_clipped_bedgraphFile, self.macs2_summits_file_fullChrsOnly,
self.macs2_narrowPeak_fullChrsOnly, self.macs2_summits_blFilt_sorted, self.macs2_summits_top300k]
def run_command(cmd):
logger = logging.getLogger()
logger.debug(cmd)
os.system(cmd)
def setup_output_filepaths(s_obj):
if not os.path.exists(s_obj.output_data_dir):
os.makedirs(s_obj.output_data_dir)
def run_fastQC(s_obj):
run_command(' '.join(['fastqc', s_obj.read1_path, '--outdir', s_obj.output_data_dir]))
run_command(' '.join(['fastqc', s_obj.read2_path, '--outdir', s_obj.output_data_dir]))
def bowtie2_align_reads(s_obj, bt2_max_frag_len, num_bowtie2_threads):
cmd = ' '.join(['bowtie2', '--local',
'--maxins', str(bt2_max_frag_len), '-x', bowtie2_index,
'--threads', str(num_bowtie2_threads),
'-1', s_obj.read1_path, '-2', s_obj.read2_path,
'2>', s_obj.bowtie2_stderr_logfile,
'|', 'samtools', 'view', '-u', '/dev/stdin',
'|', 'samtools', 'sort', '/dev/stdin', s_obj.bowtie2_aligned_reads[:-4]]) # [:-4] is necessary because samtools adds a bam suffix
run_command(cmd)
def filter_alignments(s_obj):
logger = logging.getLogger()
logger.info('Removing unmapped reads, reads with unmapped mates, or reads not passing vendor QC, then sorting by QNAME: {0}'.format(s_obj.name))
run_command(' '.join(['samtools', 'view', '-F', '524', '-f', '2', '-u', s_obj.bowtie2_aligned_reads,
'|', 'samtools', 'sort', '-n', '/dev/stdin', s_obj.tmp_early_filt_output[:-4]]))
# samtools view:
# -F 524 (-F is "blacklist" for certain flag bits)
# bit 512: "not passing filters, such as platform/vendor quality controls"
# bit 8: "next segment in the template unmapped"
# bit 4: "segment unmapped"
# -f 2 (-f is "whitelist" for certain flag bits)
# bit 2: "each segment properly aligned according to the aligner"
logger.info('Adding header, filtering out secondary alignments, and fixing mates: {0}'.format(s_obj.name))
run_command(' '.join(['samtools', 'view', '-h', s_obj.tmp_early_filt_output,
'|', 'samtools', 'fixmate', '-r', '/dev/stdin', s_obj.early_filt_fixmate_output]))
logger.info('Removing low map-quality alignments, optical duplicates, and sorting by position: {0}'.format(s_obj.name))
run_command(' '.join(['samtools', 'view', '-q', '20', '-F', '1804', '-f', '2', '-u', s_obj.early_filt_fixmate_output,
'|', 'samtools', 'sort', '/dev/stdin', s_obj.filtered_bam_file[:-4]]))
# samtools view:
# -F 1804 (-F is "blacklist" for certain flag bits)
# bit 1024: pcr or optical duplicate
# bit 512: "not passing filters, such as platform/vendor quality controls"
# bit 256: secondary alignment. "It is typically used to flag alternative mappings when multiple mappings are presented in a SAM"
# bit 8: "next segment in the template unmapped"
# bit 4: "segment unmapped"
# -f 2 (-f is "whitelist" for certain flag bits)
# bit 2: "each segment properly aligned according to the aligner"
logger.info('Marking duplicates with Picard for sample: {0}'.format(s_obj.name))
run_command(' '.join(['MarkDuplicates',
'INPUT={0}'.format(s_obj.filtered_bam_file), 'OUTPUT={0}'.format(s_obj.picard_output),
'METRICS_FILE={0}'.format(s_obj.picard_metrics_file), 'VALIDATION_STRINGENCY=LENIENT',
'ASSUME_SORTED=true', 'REMOVE_DUPLICATES=false', 'VERBOSITY=WARNING']))
logger.info('Writing final BAM file for sample (with PCR duplicates removed): {0}'.format(s_obj.name))
run_command(' '.join(['samtools', 'view', '-F', '1804', '-f', '2', '-b', s_obj.picard_output, '>', s_obj.final_bam_file]))
logger.info('Indexing final BAM file for sample: {0}'.format(s_obj.name))
run_command(' '.join(['samtools', 'index', s_obj.final_bam_file]))
def generate_summary_reports(s_obj):
logger = logging.getLogger()
logger.info('Computing samtools stats for sample: {0}'.format(s_obj.name))
run_command(' '.join(['samtools', 'flagstat', s_obj.final_bam_file, '>', s_obj.final_bam_stats]))
logger.info('Creating PBC QC file for BAM with PCR duplicates for sample: {0}'.format(s_obj.name))
# PBC File output format:
# TotalReadPairs [tab] DistinctReadPairs [tab] OneReadPair [tab] TwoReadPairs [tab] NRF=Distinct/Total [tab] PBC1=OnePair/Distinct [tab] PBC2=OnePair/TwoPair
logger.info('Creating temporary name-sorted filtered BAM file: {0}'.format(s_obj.name))
run_command(' '.join(['samtools', 'sort', '-n', s_obj.filtered_bam_file, s_obj.filtered_bam_file_namesorted[:-4]]))
logger.info('Running PBC computation and writing report: {0}'.format(s_obj.name))
cmd = "bamToBed -bedpe -i {0}".format(s_obj.filtered_bam_file_namesorted) + \
" | awk 'BEGIN{OFS=\"\\t\"}{print $1,$2,$3,$6,$9,$10}'" + \
" | grep -v 'chrM' | sort | uniq -c" + \
" | awk 'BEGIN{mt=0;m0=0;m1=0;m2=0} ($1==1){m1=m1+1} ($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} END{printf \"%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n\",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}'" + \
" > {0}".format(s_obj.pbc_qc_file)
run_command(cmd)
logger.info('Plotting insert size histogram for sample: {0}'.format(s_obj.name))
run_command(' '.join(['CollectInsertSizeMetrics', 'INPUT={0}'.format(s_obj.final_bam_file),
'OUTPUT={0}'.format(s_obj.insert_size_histogram_data), 'H={0}'.format(s_obj.insert_size_histogram_plot)]))
logger.info('Calculating percent mitochondrial reads for sample: {0}'.format(s_obj.name))
logger.info('Creating tagAlign formatted file for lightly-filtered aligned reads: {0}'.format(s_obj.name))
cmd = "bamToBed -i {0}".format(s_obj.early_filt_fixmate_output) + \
r"""| awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}' """ + \
"| gzip -nc > {0}".format(s_obj.early_tagalign_file)
run_command(cmd)
# calculate total read pairs (before MQ filtering)
p1 = Popen(['zcat', s_obj.early_tagalign_file], stdout=PIPE)
p2 = Popen(['wc', '-l'], stdin=p1.stdout, stdout=PIPE, stderr=PIPE)
p1.stdout.close()
output, err = p2.communicate()
return_code = p2.wait()
n_total_read_pairs = int(output) / 2
# calculate number of mitochondrial read pairs (before MQ filtering)
p1 = Popen(['zcat', s_obj.early_tagalign_file], stdout=PIPE)
p2 = Popen(['grep', 'chrM'], stdin=p1.stdout, stdout=PIPE)
p3 = Popen(['wc', '-l'], stdin=p2.stdout, stdout=PIPE, stderr=PIPE)
p1.stdout.close()
p2.stdout.close()
output, err = p3.communicate()
return_code = p3.wait()
n_mitochondrial_read_pairs = int(output) / 2
with open(s_obj.mitochondrial_read_report, 'w') as f:
f.write('\t'.join(['TotalReadPairs', 'MitochondrialReadPairs', 'FracMitochondrialReads'])+'\n')
f.write('\t'.join(map(str, [n_total_read_pairs, n_mitochondrial_read_pairs,
float(n_mitochondrial_read_pairs)/float(n_total_read_pairs)]))+'\n')
def generate_text_alignment_formats(s_obj):
logger = logging.getLogger()
# Create "virtual single-ended file"
logger.info('Creating tagAlign formatted file: {0}'.format(s_obj.name))
cmd = "bamToBed -i {0}".format(s_obj.final_bam_file) + \
r"""| awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}' """ + \
"| gzip -nc > {0}".format(s_obj.final_tagalign_file)
run_command(cmd)
# Shift tagAlign file due to Tn5 insertion (original ATAC seq paper authors did this)
# "For peak-calling and footprinting, we adjusted the read start sites to represent the center of the transposon binding event.
# Previous descriptions of the Tn5 transposase show that the transposon binds as a dimer and inserts two adaptors separated by 9 bp (ref. 11).
# Therefore, all reads aligning to the + strand were offset by +4 bp, and all reads aligning to the - strand were offset -5 bp."
logger.info('Creating Tn5-shifted final tagAlign file: {0}'.format(s_obj.name))
cmd = "zcat {0} ".format(s_obj.final_tagalign_file) + \
r"""| awk -F $'\t' 'BEGIN {OFS = FS}{ if ($6 == "+") {$2 = $2 + 4; $3 = $3 + 4} else if ($6 == "-") {$2 = $2 - 5; $3 = $3 - 5} print $0}' """ + \
"| gzip -nc > {0}".format(s_obj.final_Tn5shifted_tagAlign_file)
run_command(cmd)
logger.info('Creating temporary name-sorted final BAM: {0}'.format(s_obj.name))
run_command('samtools sort -n {0} {1}'.format(s_obj.final_bam_file, s_obj.final_bam_file_namesorted[:-4]))
logger.info('Creating BEDPE file from name-sorted final BAM, excluding mitochondrial reads: {0}'.format(s_obj.name))
run_command("bamToBed -bedpe -mate1 -i {0} | grep -v 'chrM' | gzip -nc > {1}".format(s_obj.final_bam_file_namesorted, s_obj.final_bedpe_file))
logger.info('Generating tagAlign file for MACS2 peak calling: {0}'.format(s_obj.name))
fout = open(s_obj.macs2_unsorted_tagAlign_file, 'w')
random.seed(0)
with gzip.open(s_obj.final_bedpe_file, 'rt') as f:
for line in f:
fields = line.strip().split('\t')
r1_chr = fields[0]
r1_pos1 = int(fields[1])
r1_pos2 = int(fields[2])
r1_strand = fields[8]
r2_chr = fields[3]
r2_pos1 = int(fields[4])
r2_pos2 = int(fields[5])
r2_strand = fields[9]
assert(r1_chr == r2_chr)
if(r1_strand == r2_strand):
logger.warn('Final BEDPE contains read with unexpected R1-R2 orientation!: ' + line)
#Tn5 shifting done by original ATAC seq paper authors
if r1_strand == '+':
r1_pos1 += 4
r1_pos2 += 4
r2_pos1 -= 5
r2_pos2 -= 5
elif r1_strand == '-':
r1_pos1 -= 5
r1_pos2 -= 5
r2_pos1 += 4
r2_pos2 += 4
else:
logger.warn('Final BEDPE final contains improperly-formatted line!: ' + line)
posn_list = [r1_pos1, r1_pos2, r2_pos1, r2_pos2]
min_posn = min(posn_list)
max_posn = max(posn_list)
if min_posn in [r1_pos1, r1_pos2]:
min_strand = r1_strand
max_strand = r2_strand
elif min_posn in [r2_pos1, r2_pos2]:
min_strand = r2_strand
max_strand = r1_strand
else:
raise()
# Pick only one of the Tn5 insertion points (ENCODE pipeline only keeps one in paired-end datasets, perhaps due to possibly breaking MACS2 statistical assumptions when including both?)
if random.randint(0,1):
fout.write('\t'.join(map(str, [r1_chr, min_posn, min_posn, 'N', 1000, min_strand])) + '\n')
else:
fout.write('\t'.join(map(str, [r1_chr, max_posn, max_posn, 'N', 1000, max_strand])) + '\n')
fout.flush()
fout.close()
run_command(' '.join(['sort', '-k1,1', '-k2,2n', s_obj.macs2_unsorted_tagAlign_file, '>', s_obj.macs2_tagAlign_file[:-3]]))
run_command('gzip -f {0}'.format(s_obj.macs2_tagAlign_file[:-3]))
def calc_and_write_frac_reads_close_to_TSS(final_tagalign_file, reference_tss_areas, output_file):
p1 = Popen(['bedtools', 'intersect', '-u', '-a', final_tagalign_file, '-b', reference_tss_areas], stdout=PIPE)
p2 = Popen(['grep', '-v', 'chrM'], stdin=p1.stdout, stdout=PIPE)
p3 = Popen(['wc', '-l'], stdin=p2.stdout, stdout=PIPE, stderr=PIPE)
p1.stdout.close()
p2.stdout.close()
output, err = p3.communicate()
return_code = p3.wait()
n_tss_reads = int(output)
p1 = Popen(['zcat', final_tagalign_file], stdout=PIPE)
p2 = Popen(['wc', '-l'], stdin=p1.stdout, stdout=PIPE, stderr=PIPE)
p1.stdout.close()
output, err = p2.communicate()
return_code = p2.wait()
n_total_reads = int(output)
with open(output_file, 'w') as f:
f.write('\t'.join(['NumReads(R1+R2)', 'NumReadsCloseToTSS(R1+R2)', 'FracTSS_Reads']) + '\n')
f.write('\t'.join(map(str, [n_total_reads, n_tss_reads, float(n_tss_reads)/n_total_reads])) + '\n')
def macs2_call_peaks(s_obj, shiftsize, ext_size, macs2_cap_num_summits, macs2_summit_window_radius):
logger = logging.getLogger()
run_command(' '.join(['macs2', 'callpeak', '--nomodel', '--nolambda', '--keep-dup', 'all',
'--call-summits', '-B', '--SPMR', '--format', 'BED',
'-q', '0.05', '--shift', str(shiftsize), '--extsize', str(ext_size),
'-t', s_obj.macs2_tagAlign_file, '--outdir', s_obj.macs2_output_dir,
'--name', s_obj.name]))
logger.info('Sorting temporary bedgraph file from MACS2: {0}'.format(s_obj.name))
run_command(' '.join(['sort', '-k1,1', '-k2,2n', s_obj.macs2_cvg_bedgraph_output, '>', s_obj.macs2_tmp_sorted_bedgraph_file]))
logger.info('Clipping peak segments larger than chromosome sizes: {0}'.format(s_obj.name))
def make_chr_size_dict(chrom_sizes_file):
chr_size_dict = {}
with open(chrom_sizes_file) as f:
for line in f:
fields = line.strip().split('\t')
chr_name = fields[0]
chr_size = int(fields[1])
chr_size_dict[chr_name] = chr_size
return chr_size_dict
chr_size_dict = make_chr_size_dict(chrom_sizes_file)
fout = open(s_obj.macs2_tmp_clipped_bedgraphFile, 'w')
with open(s_obj.macs2_tmp_sorted_bedgraph_file) as f:
for line in f:
fields = line.strip().split('\t')
chr_field = fields[0]
coord1 = int(fields[1])
coord2 = int(fields[2])
chr_size = chr_size_dict[chr_field]
if coord1 >= chr_size and coord2 >= chr_size:
continue
elif coord2 >= chr_size:
coord2 = chr_size - 1
fout.write('\t'.join(map(str, [chr_field, coord1, coord2, fields[3]])) + '\n')
fout.flush()
fout.close()
fullChrOnly_grep_command = r'grep -P "^(chr1|chr2|chr3|chr4|chr5|chr6|chr7|chr8|chr9|chr10|chr11|chr12|chr13|chr14|chr15|chr16|chr17|chr18|chr19|chr20|chr21|chr22|chrX|chrY)\t"'
logger.info('Filtering non-full-chr contigs out of MACS2 narrowPeak file: {0}'.format(s_obj.name))
run_command(' '.join([fullChrOnly_grep_command, s_obj.macs2_narrowPeak_file, ">", s_obj.macs2_narrowPeak_fullChrsOnly]))
logger.info('Filtering blacklisted regions out of MACS2 narrowPeak file: {0}'.format(s_obj.name))
run_command(' '.join(['bedtools', 'intersect', '-v', '-sorted',
'-a', s_obj.macs2_narrowPeak_fullChrsOnly, '-b', hg38_blacklist,
'>', s_obj.macs2_narrowPeak_blacklistFiltered]))
logger.info('Filtering non-full-chr contigs out of MACS2 summits: {0}'.format(s_obj.name))
run_command(' '.join([fullChrOnly_grep_command, s_obj.macs2_summits_file, ">", s_obj.macs2_summits_file_fullChrsOnly]))
logger.info('Filtering blacklisted regions out of MACS2 summits: {0}'.format(s_obj.name))
run_command(' '.join(['bedtools', 'intersect', '-v', '-sorted',
'-a', s_obj.macs2_summits_file_fullChrsOnly, '-b', hg38_blacklist,
'>', s_obj.macs2_summits_blacklistFilteredfile]))
#optional todo: rename peaks by rank?
logger.info('Selecting top {1} summits: {0}'.format(s_obj.name, macs2_cap_num_summits))
run_command(' '.join(['sort', '-k', '5gr,5gr', s_obj.macs2_summits_blacklistFilteredfile,
'>', s_obj.macs2_summits_blFilt_sorted]))
run_command(' '.join(['head', '-n', str(macs2_cap_num_summits), s_obj.macs2_summits_blFilt_sorted,
'>', s_obj.macs2_summits_top300k]))
logger.info('Sorting top MACS2 summits by genomic position: {0}'.format(s_obj.name))
run_command(' '.join(['sort', '-k1,1', '-k2,2n', s_obj.macs2_summits_top300k,
'>', s_obj.macs2_summits_top300k_posnSorted]))
logger.info('Converting summits to small genomic windows: {0}'.format(s_obj.name))
run_command(' '.join(['cat', s_obj.macs2_summits_top300k_posnSorted, '|', 'awk',
r"""'BEGIN{OFS="\t"}{""" + '$2 = $2 - {0}; $3 = $3 + {0}; print $0'.format(macs2_summit_window_radius) + r"}'",
'>', s_obj.macs2_summits_top300k_windows]))
logger.info('Converting bedgraph to bigWig file (for viewing in IGV): {0}'.format(s_obj.name))
run_command(' '.join(['bedGraphToBigWig', s_obj.macs2_tmp_clipped_bedgraphFile, chrom_sizes_file, s_obj.macs2_smooth_bigWig_file]))
def _arg_parser():
script_usage = script_usage = '''%(prog)s [options] sample_name input_data_directory output_data_directory'''
parser = argparse.ArgumentParser(usage=script_usage)
g = parser.add_argument_group("Required Arguments")
g.add_argument('sample_name', type=str, help='The sample name')
g.add_argument('input_data_directory', type=str,
help='Folder containing input fastq files that correspond to the name of the sample')
g.add_argument('output_data_directory', type=str,
help='Folder in which to store pipeline outputs')
g = parser.add_argument_group("Parameters")
g.add_argument('--num_bowtie2_threads', type=int, default=1,
help='Number of threads for running the bowtie2 aligner in parallel')
g.add_argument('--bowtie2_max_fragment_length', type=int, default=2000,
help="Maximum fragment size to consider for aligning paired end reads")
# note: in encode atac specification, for "ENCODE3" the window size is 73 and the shift size is -37. However 150 is the default in Kundaje lab pipeline.
g.add_argument('--macs2_smoothing_window_size', type=int, default=150,
help="The width of the window for smoothing inferred Tn5 insertion points, used by MACS2 peak caller for calling peaks and creating bigWig files")
g.add_argument('--macs2_summit_window_radius', type=int, default=50,
help="Hard-coded peak size for summit-based analysis")
g.add_argument('--macs2_cap_num_summits', type=int, default=300000,
help="Upper bound for the number of MACS2 summits used in downstream analysis")
return parser
def main(args):
logging.basicConfig(format='%(levelname)s: [%(asctime)s] %(message)s', datefmt='%Y-%m-%d %H:%M:%S %Z')
logger=logging.getLogger()
logger.setLevel(logging.DEBUG)
sample_name = args.pop('sample_name')
input_data_dir = args.pop('input_data_directory')
output_dir = args.pop('output_data_directory')
num_bowtie2_threads = args.pop('num_bowtie2_threads')
bt2_max_frag_len = args.pop('bowtie2_max_fragment_length')
macs2_smoothing_window_size = args.pop('macs2_smoothing_window_size')
macs2_shift_size = int(round(macs2_smoothing_window_size/2) * -1) #shift size should be int, -1/2 * corresponding smoothing window sizes
macs2_cap_num_summits = args.pop('macs2_cap_num_summits')
macs2_summit_window_radius = args.pop('macs2_summit_window_radius')
s_obj = ATAC_Sample(sample_name, input_data_dir, output_dir, macs2_smoothing_window_size, macs2_summit_window_radius)
setup_output_filepaths(s_obj)
logger.info('Running FastQC for sample: {0}'.format(s_obj.name))
run_fastQC(s_obj)
logger.info('Aligning reads for sample: {0}'.format(s_obj.name))
bowtie2_align_reads(s_obj, bt2_max_frag_len, num_bowtie2_threads)
logger.info('Filtering alignments for sample: {0}'.format(s_obj.name))
filter_alignments(s_obj)
logger.info('Generating summary and QC reports for sample: {0}'.format(s_obj.name))
generate_summary_reports(s_obj)
logger.info('Generating text-based alignment file formats: {0}'.format(s_obj.name))
generate_text_alignment_formats(s_obj)
logger.info('Calculating fraction of reads overlapping RefSeq TSS regions: {0}'.format(s_obj.name))
calc_and_write_frac_reads_close_to_TSS(s_obj.final_Tn5shifted_tagAlign_file, refseq_tss_areas, s_obj.refseq_tss_report)
logger.info('Calculating fraction of reads overlapping Gencode TSS regions: {0}'.format(s_obj.name))
calc_and_write_frac_reads_close_to_TSS(s_obj.final_Tn5shifted_tagAlign_file, gencode_tss_areas, s_obj.gencode_tss_report)
logger.info('Calling peaks with MACS2: {0}'.format(s_obj.name))
## note: peak calling parameter set from Omni-ATAC paper is: "macs2 callpeak --nomodel --nolambda --keep-dup all --call-summits"
macs2_call_peaks(s_obj, macs2_shift_size, macs2_smoothing_window_size, macs2_cap_num_summits, macs2_summit_window_radius)
logger.info('Deleting intermediate files: {0}'.format(s_obj.name))
for f in s_obj.intermediate_file_list:
run_command(' '.join(['rm', f]))
if __name__ == '__main__':
args = vars(_arg_parser().parse_args(sys.argv[1:]))
main(args)
#OLD CODE SNIPPETS
# #TODO: find purpose for subsampled files... do pseudoreplicate peak calling and IDR! (use half of reads)
# n_subsampled_reads = 10000000
# logger.debug('Subsampling tagAlign file to {1} reads, excluding chrM: {0}'.format(sample_name, n_subsampled_reads))
# cmd = 'zcat {0} '.format(final_bedpe_file) + \
# '| shuf -n {0} --random-source={1} '.format(n_subsampled_reads, final_bedpe_file) + \
# r"""| awk 'BEGIN{OFS="\t"}{print $1,$2,$3,"N","1000",$9}' """ + \
# '| gzip -nc > {0}'.format(subsampled_tagalign_file)
# os.system(cmd)
# logger.info('Creating coverage bigWig tracks from final BAM file: {0}'.format(sample_name))
# os.system(' '.join(['bedtools', 'genomecov', '-bg', '-ibam', final_bam_file, '>', final_tmp_bedgraph_file]))
# logger.debug('Sorting temporary bedgraph file: {0}'.format(sample_name))
# os.system(' '.join(['sort', '-k1,1', '-k2,2n', final_tmp_bedgraph_file, '>', final_tmp_sorted_bedgraph_file]))
# logger.debug('Converting bedgraph to bigWig file: {0}'.format(sample_name))
# os.system(' '.join(['bedGraphToBigWig', final_tmp_sorted_bedgraph_file, chrom_sizes_file, final_bigWig_file]))
# logger.debug('Deleting temporary files used to make final coverage bigWig file: {0}'.format(sample_name))
# os.system(' '.join(['rm', final_tmp_bedgraph_file]))
# os.system(' '.join(['rm', final_tmp_sorted_bedgraph_file]))