https://github.com/arjunrajlaboratory/atac-seq_pipeline_paired-end
Raw File
Tip revision: b165422254188ce4f89074374967f3390fd5d875 authored by emsanford on 25 February 2021, 23:34:09 UTC
Update concat_zipped_fastq_files_from_illumina.py
Tip revision: b165422
atac_pipeline.py
import sys
import logging
import argparse
from subprocess import Popen, PIPE
import os
import gzip
import random

#note: the call to this script should be preprended with a 'set_atac_pmacs_env' command that sets the paths for the appropriate versions of each tool called in this pipeline. 

#software references
refs_dir          = '/project/arjunrajlab/refs'
bowtie2_index     = refs_dir + '/bowtie2/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.bowtie_index'
gencode_tss_areas = refs_dir + '/atac_pipeline_refs/GENCODEv24_TSS_hg38_radius500.bed'
refseq_tss_areas  = refs_dir + '/atac_pipeline_refs/UCSC_RefSeq_TSS_hg38_radius500.bed'
chrom_sizes_file  = refs_dir + '/atac_pipeline_refs/hg38.chrom.sizes'
hg38_blacklist    = refs_dir + '/atac_pipeline_refs/hg38.blacklist.sorted.bed'


class ATAC_Sample:
	"""
	Lightweight class to store filepaths for atac-seq sample processing
	"""
	def __init__(self, sample_name, input_data_dir, output_dir, macs2_window_size, macs2_summit_window_radius):
		self.name            = sample_name
		self.input_data_dir  = input_data_dir
		self.output_data_dir = output_dir + '/' + sample_name
		# input fastq files (paired end)
		self.read1_path     = '{0}/{1}'.format(input_data_dir, sample_name + '_R1.fastq.gz')
		self.read2_path     = '{0}/{1}'.format(input_data_dir, sample_name + '_R2.fastq.gz')
		# bowtie2 alignment files
		self.bowtie2_stderr_logfile = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'log_stderr_bowtie2.txt')
		self.bowtie2_aligned_reads  = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'bowtie2_aligned_reads.bam')
		# alignment filtering files
		self.tmp_early_filt_output        = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'tmp.lightFilt.bam')
		self.early_filt_fixmate_output    = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'lightFilt.fixmate.bam')
		self.filtered_bam_file            = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'filt.bam')
		self.filtered_bam_file_namesorted = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'filt.nameSorted.bam')
		self.picard_metrics_file          = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'picardMetrics.txt')
		self.picard_output                = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'picardMarkedDups.bam')
		self.final_bam_file               = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'final.bam')
		self.final_bam_stats              = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'final.stats.txt')
		self.final_bam_file_namesorted    = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'final.nameSorted.bam')
		# text file representations of filtered aligned reads
		self.early_tagalign_file            = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'early.tagAlign.gz')
		self.final_tagalign_file            = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'final.tagAlign.gz')
		self.final_Tn5shifted_tagAlign_file = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'final.tagAlign_tn5_shifted.gz')
		self.final_bedpe_file               = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'final.bedpe.gz')
		self.macs2_unsorted_tagAlign_file   = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'macs2_formatted.unsorted.tagAlign')
		self.macs2_tagAlign_file            = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'Tn5_insertion_points.tagAlign.gz')
		# subsampled files
		self.subsampled_tagalign_file = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'subsampled.tagAlign.gz')
		# QC and summary report files
		self.insert_size_histogram_plot = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'insert_size_histogram.pdf')
		self.insert_size_histogram_data = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'insert_size_data.txt')
		self.mitochondrial_read_report  = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'mitochondrial_read_report.txt')
		self.refseq_tss_report          = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'refseq_tss_report.txt')
		self.gencode_tss_report         = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'gencode_tss_report.txt')
		self.pbc_qc_file                = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'final.pbc.qc.txt')
		# macs2 peak calling files
		macs2_folder_name = 'macs2_output_windowSize{0}'.format(macs2_window_size)
		self.macs2_output_dir = '{0}/{1}'.format(self.output_data_dir, macs2_folder_name)
		self.macs2_cvg_bedgraph_output      = '{0}/{2}/{1}_treat_pileup.bdg'.format(self.output_data_dir, sample_name, macs2_folder_name)
		self.macs2_tmp_sorted_bedgraph_file = '{0}/{2}/{1}.macs2.tmp.sorted.bedGraph'.format(self.output_data_dir, sample_name, macs2_folder_name)
		self.macs2_tmp_clipped_bedgraphFile = '{0}/{2}/{1}.macs2.tmp.sorted.clipped.bedGraph'.format(self.output_data_dir, sample_name, macs2_folder_name)
		self.macs2_smooth_bigWig_file       = '{0}/{2}/{1}.smoothed_Tn5insertionPoints.bigWig'.format(self.output_data_dir, sample_name, macs2_folder_name)
		#TODO
		self.macs2_narrowPeak_file               = '{0}/{1}/{2}_peaks.narrowPeak'.format(self.output_data_dir, macs2_folder_name, sample_name)
		self.macs2_narrowPeak_fullChrsOnly       = '{0}/{1}/{2}_fullChrsOnly.narrowPeak'.format(self.output_data_dir, macs2_folder_name, sample_name)
		self.macs2_narrowPeak_blacklistFiltered  = '{0}/{1}/{2}_blacklistFiltered.narrowPeak'.format(self.output_data_dir, macs2_folder_name, sample_name)
		self.macs2_summits_file                  = '{0}/{1}/{2}_summits.bed'.format(self.output_data_dir, macs2_folder_name, sample_name)
		#TODO
		self.macs2_summits_file_fullChrsOnly     = '{0}/{1}/{2}_summits.fullChrsOnly.bed'.format(self.output_data_dir, macs2_folder_name, sample_name)
		self.macs2_summits_blacklistFilteredfile = '{0}/{1}/{2}_summits.blacklistFiltered.bed'.format(self.output_data_dir, macs2_folder_name, sample_name)
		#TODO
		self.macs2_summits_blFilt_sorted         = '{0}/{1}/{2}_summits.blacklistFiltered.sorted.bed'.format(self.output_data_dir, macs2_folder_name, sample_name)
		self.macs2_summits_top300k               = '{0}/{1}/{2}_summits.top300k.bed'.format(self.output_data_dir, macs2_folder_name, sample_name)
		self.macs2_summits_top300k_posnSorted    = '{0}/{1}/{2}_summits.top300k.posnSorted.bed'.format(self.output_data_dir, macs2_folder_name, sample_name)
		self.macs2_summits_top300k_windows       = '{0}/{1}/{2}_summits.top300k.windowedRadius{3}.bed'.format(self.output_data_dir, macs2_folder_name, sample_name, macs2_summit_window_radius)

		#files to be deleted after pipeline runs
		self.intermediate_file_list = [self.tmp_early_filt_output, self.early_filt_fixmate_output, self.early_tagalign_file, self.filtered_bam_file_namesorted, 
								       self.filtered_bam_file, self.picard_output, self.final_bam_file_namesorted, self.macs2_unsorted_tagAlign_file,
								       self.macs2_cvg_bedgraph_output, self.macs2_tmp_sorted_bedgraph_file, self.macs2_tmp_clipped_bedgraphFile, self.macs2_summits_file_fullChrsOnly,
								       self.macs2_narrowPeak_fullChrsOnly, self.macs2_summits_blFilt_sorted, self.macs2_summits_top300k]


def run_command(cmd):
	logger = logging.getLogger()
	logger.debug(cmd)
	os.system(cmd)


def setup_output_filepaths(s_obj):
	if not os.path.exists(s_obj.output_data_dir):
		os.makedirs(s_obj.output_data_dir)


def run_fastQC(s_obj):
	run_command(' '.join(['fastqc', s_obj.read1_path, '--outdir', s_obj.output_data_dir]))
	run_command(' '.join(['fastqc', s_obj.read2_path, '--outdir', s_obj.output_data_dir]))


def bowtie2_align_reads(s_obj, bt2_max_frag_len, num_bowtie2_threads):
	cmd = ' '.join(['bowtie2', '--local', 
		           '--maxins', str(bt2_max_frag_len), '-x', bowtie2_index,
		           '--threads', str(num_bowtie2_threads), 
		           '-1', s_obj.read1_path, '-2', s_obj.read2_path, 
		           '2>', s_obj.bowtie2_stderr_logfile, 
		      '|', 'samtools', 'view', '-u', '/dev/stdin',	
		      '|', 'samtools', 'sort', '/dev/stdin', s_obj.bowtie2_aligned_reads[:-4]]) # [:-4] is necessary because samtools adds a bam suffix
	run_command(cmd)	


def filter_alignments(s_obj):
	logger = logging.getLogger()
	logger.info('Removing unmapped reads, reads with unmapped mates, or reads not passing vendor QC, then sorting by QNAME: {0}'.format(s_obj.name))
	run_command(' '.join(['samtools', 'view', '-F', '524', '-f', '2', '-u', s_obj.bowtie2_aligned_reads,
						'|', 'samtools', 'sort', '-n', '/dev/stdin', s_obj.tmp_early_filt_output[:-4]]))
	# samtools view: 
	#               -F 524   (-F is "blacklist" for certain flag bits)
	#					bit 512: "not passing filters, such as platform/vendor quality controls"
	#					bit 8: "next segment in the template unmapped"
	#					bit 4: "segment unmapped"
	#               -f 2     (-f is "whitelist" for certain flag bits) 
	#					bit 2: "each segment properly aligned according to the aligner"

	logger.info('Adding header, filtering out secondary alignments, and fixing mates: {0}'.format(s_obj.name))
	run_command(' '.join(['samtools', 'view', '-h', s_obj.tmp_early_filt_output, 
				   '|', 'samtools', 'fixmate', '-r', '/dev/stdin', s_obj.early_filt_fixmate_output]))
	logger.info('Removing low map-quality alignments, optical duplicates, and sorting by position: {0}'.format(s_obj.name))
	run_command(' '.join(['samtools', 'view', '-q', '20', '-F', '1804', '-f', '2', '-u', s_obj.early_filt_fixmate_output,
						'|', 'samtools', 'sort', '/dev/stdin', s_obj.filtered_bam_file[:-4]]))
	# samtools view: 
	#               -F 1804  (-F is "blacklist" for certain flag bits)
	#					bit 1024: pcr or optical duplicate 
	#					bit 512: "not passing filters, such as platform/vendor quality controls"
	#					bit 256: secondary alignment. "It is typically used to flag alternative mappings when multiple mappings are presented in a SAM"
	#					bit 8: "next segment in the template unmapped"
	#					bit 4: "segment unmapped"
	#               -f 2     (-f is "whitelist" for certain flag bits) 
	#					bit 2: "each segment properly aligned according to the aligner"
	logger.info('Marking duplicates with Picard for sample: {0}'.format(s_obj.name))
	run_command(' '.join(['MarkDuplicates', 
						'INPUT={0}'.format(s_obj.filtered_bam_file), 'OUTPUT={0}'.format(s_obj.picard_output),
						 'METRICS_FILE={0}'.format(s_obj.picard_metrics_file), 'VALIDATION_STRINGENCY=LENIENT',
						 'ASSUME_SORTED=true', 'REMOVE_DUPLICATES=false', 'VERBOSITY=WARNING']))
	logger.info('Writing final BAM file for sample (with PCR duplicates removed): {0}'.format(s_obj.name))
	run_command(' '.join(['samtools', 'view', '-F', '1804', '-f', '2', '-b', s_obj.picard_output, '>', s_obj.final_bam_file]))
	logger.info('Indexing final BAM file for sample: {0}'.format(s_obj.name))
	run_command(' '.join(['samtools', 'index', s_obj.final_bam_file]))


def generate_summary_reports(s_obj):
	logger = logging.getLogger()

	logger.info('Computing samtools stats for sample: {0}'.format(s_obj.name))
	run_command(' '.join(['samtools', 'flagstat', s_obj.final_bam_file, '>', s_obj.final_bam_stats]))

	logger.info('Creating PBC QC file for BAM with PCR duplicates for sample: {0}'.format(s_obj.name))
	# PBC File output format:
		# TotalReadPairs [tab] DistinctReadPairs [tab] OneReadPair [tab] TwoReadPairs [tab] NRF=Distinct/Total [tab] PBC1=OnePair/Distinct [tab] PBC2=OnePair/TwoPair
	logger.info('Creating temporary name-sorted filtered BAM file: {0}'.format(s_obj.name))
	run_command(' '.join(['samtools', 'sort', '-n', s_obj.filtered_bam_file, s_obj.filtered_bam_file_namesorted[:-4]]))

	logger.info('Running PBC computation and writing report: {0}'.format(s_obj.name))
	cmd = "bamToBed -bedpe -i {0}".format(s_obj.filtered_bam_file_namesorted) + \
		   " | awk 'BEGIN{OFS=\"\\t\"}{print $1,$2,$3,$6,$9,$10}'" + \
		   " | grep -v 'chrM' | sort | uniq -c" + \
		   " | awk 'BEGIN{mt=0;m0=0;m1=0;m2=0} ($1==1){m1=m1+1} ($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} END{printf \"%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n\",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}'" + \
		   " > {0}".format(s_obj.pbc_qc_file)
	run_command(cmd)

	logger.info('Plotting insert size histogram for sample: {0}'.format(s_obj.name))
	run_command(' '.join(['CollectInsertSizeMetrics', 'INPUT={0}'.format(s_obj.final_bam_file),
						'OUTPUT={0}'.format(s_obj.insert_size_histogram_data), 'H={0}'.format(s_obj.insert_size_histogram_plot)]))

	logger.info('Calculating percent mitochondrial reads for sample: {0}'.format(s_obj.name))

	logger.info('Creating tagAlign formatted file for lightly-filtered aligned reads: {0}'.format(s_obj.name))
	cmd = "bamToBed -i {0}".format(s_obj.early_filt_fixmate_output) + \
	 r"""| awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}' """ + \
		"| gzip -nc > {0}".format(s_obj.early_tagalign_file)
	run_command(cmd)

	# calculate total read pairs (before MQ filtering)
	p1 = Popen(['zcat', s_obj.early_tagalign_file], stdout=PIPE)
	p2 = Popen(['wc', '-l'], stdin=p1.stdout, stdout=PIPE, stderr=PIPE)
	p1.stdout.close()
	output, err = p2.communicate()
	return_code = p2.wait()
	n_total_read_pairs = int(output) / 2
	# calculate number of mitochondrial read pairs (before MQ filtering)
	p1 = Popen(['zcat', s_obj.early_tagalign_file], stdout=PIPE)
	p2 = Popen(['grep', 'chrM'], stdin=p1.stdout, stdout=PIPE)
	p3 = Popen(['wc', '-l'], stdin=p2.stdout, stdout=PIPE, stderr=PIPE)
	p1.stdout.close()
	p2.stdout.close()
	output, err = p3.communicate()
	return_code = p3.wait()
	n_mitochondrial_read_pairs = int(output) / 2
	with open(s_obj.mitochondrial_read_report, 'w') as f:
		f.write('\t'.join(['TotalReadPairs', 'MitochondrialReadPairs', 'FracMitochondrialReads'])+'\n')
		f.write('\t'.join(map(str, [n_total_read_pairs, n_mitochondrial_read_pairs, 
						   float(n_mitochondrial_read_pairs)/float(n_total_read_pairs)]))+'\n')


def generate_text_alignment_formats(s_obj):
	logger = logging.getLogger()
	# Create "virtual single-ended file" 
	logger.info('Creating tagAlign formatted file: {0}'.format(s_obj.name))
	cmd = "bamToBed -i {0}".format(s_obj.final_bam_file) + \
	 r"""| awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}' """ + \
		"| gzip -nc > {0}".format(s_obj.final_tagalign_file)
	run_command(cmd)
	# Shift tagAlign file due to Tn5 insertion (original ATAC seq paper authors did this)
	#     "For peak-calling and footprinting, we adjusted the read start sites to represent the center of the transposon binding event. 
	#      Previous descriptions of the Tn5 transposase show that the transposon binds as a dimer and inserts two adaptors separated by 9 bp (ref. 11). 
	#      Therefore, all reads aligning to the + strand were offset by +4 bp, and all reads aligning to the - strand were offset -5 bp." 
	logger.info('Creating Tn5-shifted final tagAlign file: {0}'.format(s_obj.name))
	cmd = "zcat {0} ".format(s_obj.final_tagalign_file) + \
	 r"""| awk -F $'\t' 'BEGIN {OFS = FS}{ if ($6 == "+") {$2 = $2 + 4; $3 = $3 + 4} else if ($6 == "-") {$2 = $2 - 5; $3 = $3 - 5} print $0}' """ + \
		"| gzip -nc > {0}".format(s_obj.final_Tn5shifted_tagAlign_file)
	run_command(cmd)
	logger.info('Creating temporary name-sorted final BAM: {0}'.format(s_obj.name))
	run_command('samtools sort -n {0} {1}'.format(s_obj.final_bam_file, s_obj.final_bam_file_namesorted[:-4]))
	logger.info('Creating BEDPE file from name-sorted final BAM, excluding mitochondrial reads: {0}'.format(s_obj.name))
	run_command("bamToBed -bedpe -mate1 -i {0} | grep -v 'chrM' | gzip -nc > {1}".format(s_obj.final_bam_file_namesorted, s_obj.final_bedpe_file))

	logger.info('Generating tagAlign file for MACS2 peak calling: {0}'.format(s_obj.name))
	fout = open(s_obj.macs2_unsorted_tagAlign_file, 'w')
	random.seed(0)
	with gzip.open(s_obj.final_bedpe_file, 'rt') as f:
		for line in f:
			fields    = line.strip().split('\t')
			r1_chr    = fields[0]
			r1_pos1   = int(fields[1])
			r1_pos2   = int(fields[2])
			r1_strand = fields[8]
			r2_chr    = fields[3]
			r2_pos1   = int(fields[4])
			r2_pos2   = int(fields[5])
			r2_strand = fields[9]
			assert(r1_chr == r2_chr)
			if(r1_strand == r2_strand):
				logger.warn('Final BEDPE contains read with unexpected R1-R2 orientation!: ' + line)
			#Tn5 shifting done by original ATAC seq paper authors
			if r1_strand == '+':
				r1_pos1 += 4
				r1_pos2 += 4
				r2_pos1 -= 5
				r2_pos2 -= 5
			elif r1_strand == '-':
				r1_pos1 -= 5
				r1_pos2 -= 5
				r2_pos1 += 4
				r2_pos2 += 4
			else:
				logger.warn('Final BEDPE final contains improperly-formatted line!: ' + line)
			posn_list = [r1_pos1, r1_pos2, r2_pos1, r2_pos2]
			min_posn = min(posn_list)
			max_posn = max(posn_list)
			if min_posn in [r1_pos1, r1_pos2]:
				min_strand = r1_strand
				max_strand = r2_strand
			elif min_posn in [r2_pos1, r2_pos2]:
				min_strand = r2_strand
				max_strand = r1_strand
			else:
				raise()
			# Pick only one of the Tn5 insertion points (ENCODE pipeline only keeps one in paired-end datasets, perhaps due to possibly breaking MACS2 statistical assumptions when including both?)
			if random.randint(0,1):
				fout.write('\t'.join(map(str, [r1_chr, min_posn, min_posn, 'N', 1000, min_strand])) + '\n')
			else:
				fout.write('\t'.join(map(str, [r1_chr, max_posn, max_posn, 'N', 1000, max_strand])) + '\n')
	fout.flush()
	fout.close()
	run_command(' '.join(['sort', '-k1,1', '-k2,2n', s_obj.macs2_unsorted_tagAlign_file, '>', s_obj.macs2_tagAlign_file[:-3]]))
	run_command('gzip -f {0}'.format(s_obj.macs2_tagAlign_file[:-3]))


def calc_and_write_frac_reads_close_to_TSS(final_tagalign_file, reference_tss_areas, output_file):
		p1 = Popen(['bedtools', 'intersect', '-u', '-a', final_tagalign_file, '-b', reference_tss_areas], stdout=PIPE)
		p2 = Popen(['grep', '-v', 'chrM'], stdin=p1.stdout, stdout=PIPE)
		p3 = Popen(['wc', '-l'], stdin=p2.stdout, stdout=PIPE, stderr=PIPE)
		p1.stdout.close()
		p2.stdout.close()
		output, err = p3.communicate()
		return_code = p3.wait()
		n_tss_reads = int(output)

		p1 = Popen(['zcat', final_tagalign_file], stdout=PIPE)
		p2 = Popen(['wc', '-l'], stdin=p1.stdout, stdout=PIPE, stderr=PIPE)
		p1.stdout.close()
		output, err = p2.communicate()
		return_code = p2.wait()
		n_total_reads = int(output)
		with open(output_file, 'w') as f:
			f.write('\t'.join(['NumReads(R1+R2)', 'NumReadsCloseToTSS(R1+R2)', 'FracTSS_Reads']) + '\n')
			f.write('\t'.join(map(str, [n_total_reads, n_tss_reads, float(n_tss_reads)/n_total_reads])) + '\n')


def macs2_call_peaks(s_obj, shiftsize, ext_size, macs2_cap_num_summits, macs2_summit_window_radius):
	logger = logging.getLogger()

	run_command(' '.join(['macs2', 'callpeak', '--nomodel', '--nolambda', '--keep-dup', 'all',
						'--call-summits', '-B', '--SPMR', '--format', 'BED',
						'-q', '0.05', '--shift', str(shiftsize), '--extsize', str(ext_size),
						'-t', s_obj.macs2_tagAlign_file, '--outdir', s_obj.macs2_output_dir, 
						'--name', s_obj.name]))

	logger.info('Sorting temporary bedgraph file from MACS2: {0}'.format(s_obj.name))
	run_command(' '.join(['sort', '-k1,1', '-k2,2n', s_obj.macs2_cvg_bedgraph_output, '>', s_obj.macs2_tmp_sorted_bedgraph_file]))

	logger.info('Clipping peak segments larger than chromosome sizes: {0}'.format(s_obj.name))
	def make_chr_size_dict(chrom_sizes_file):
		chr_size_dict = {}
		with open(chrom_sizes_file) as f:
			for line in f:
				fields = line.strip().split('\t')
				chr_name = fields[0]
				chr_size = int(fields[1])
				chr_size_dict[chr_name] = chr_size
		return chr_size_dict
	chr_size_dict = make_chr_size_dict(chrom_sizes_file)

	fout = open(s_obj.macs2_tmp_clipped_bedgraphFile, 'w')
	with open(s_obj.macs2_tmp_sorted_bedgraph_file) as f:
		for line in f:
			fields    = line.strip().split('\t')
			chr_field = fields[0]
			coord1    = int(fields[1])
			coord2    = int(fields[2])
			chr_size  = chr_size_dict[chr_field]
			if coord1 >= chr_size and coord2 >= chr_size:
				continue
			elif coord2 >= chr_size:
				coord2 = chr_size - 1
			fout.write('\t'.join(map(str, [chr_field, coord1, coord2, fields[3]])) + '\n')
	fout.flush()
	fout.close()

	fullChrOnly_grep_command = r'grep -P "^(chr1|chr2|chr3|chr4|chr5|chr6|chr7|chr8|chr9|chr10|chr11|chr12|chr13|chr14|chr15|chr16|chr17|chr18|chr19|chr20|chr21|chr22|chrX|chrY)\t"'
	logger.info('Filtering non-full-chr contigs out of MACS2 narrowPeak file: {0}'.format(s_obj.name))
	run_command(' '.join([fullChrOnly_grep_command, s_obj.macs2_narrowPeak_file, ">", s_obj.macs2_narrowPeak_fullChrsOnly]))
	logger.info('Filtering blacklisted regions out of MACS2 narrowPeak file: {0}'.format(s_obj.name))
	run_command(' '.join(['bedtools', 'intersect', '-v', '-sorted',
		                '-a', s_obj.macs2_narrowPeak_fullChrsOnly, '-b', hg38_blacklist,
		                '>', s_obj.macs2_narrowPeak_blacklistFiltered]))
	logger.info('Filtering non-full-chr contigs out of MACS2 summits: {0}'.format(s_obj.name))
	run_command(' '.join([fullChrOnly_grep_command, s_obj.macs2_summits_file, ">", s_obj.macs2_summits_file_fullChrsOnly]))
	logger.info('Filtering blacklisted regions out of MACS2 summits: {0}'.format(s_obj.name))
	run_command(' '.join(['bedtools', 'intersect', '-v', '-sorted',
		                '-a', s_obj.macs2_summits_file_fullChrsOnly, '-b', hg38_blacklist,
		                '>', s_obj.macs2_summits_blacklistFilteredfile]))
	#optional todo: rename peaks by rank?
	logger.info('Selecting top {1} summits: {0}'.format(s_obj.name, macs2_cap_num_summits))
	run_command(' '.join(['sort', '-k', '5gr,5gr', s_obj.macs2_summits_blacklistFilteredfile,
						'>', s_obj.macs2_summits_blFilt_sorted]))
	run_command(' '.join(['head', '-n', str(macs2_cap_num_summits), s_obj.macs2_summits_blFilt_sorted,
						'>', s_obj.macs2_summits_top300k]))
	logger.info('Sorting top MACS2 summits by genomic position: {0}'.format(s_obj.name))
	run_command(' '.join(['sort', '-k1,1', '-k2,2n', s_obj.macs2_summits_top300k,
						'>', s_obj.macs2_summits_top300k_posnSorted]))
	logger.info('Converting summits to small genomic windows: {0}'.format(s_obj.name))
	run_command(' '.join(['cat', s_obj.macs2_summits_top300k_posnSorted, '|', 'awk', 
					r"""'BEGIN{OFS="\t"}{""" + '$2 = $2 - {0}; $3 = $3 + {0}; print $0'.format(macs2_summit_window_radius) + r"}'",
					    '>', s_obj.macs2_summits_top300k_windows]))
	logger.info('Converting bedgraph to bigWig file (for viewing in IGV): {0}'.format(s_obj.name))
	run_command(' '.join(['bedGraphToBigWig', s_obj.macs2_tmp_clipped_bedgraphFile, chrom_sizes_file, s_obj.macs2_smooth_bigWig_file]))


def _arg_parser():
	script_usage = script_usage = '''%(prog)s [options] sample_name input_data_directory output_data_directory'''
	parser = argparse.ArgumentParser(usage=script_usage)

	g = parser.add_argument_group("Required Arguments")
	g.add_argument('sample_name', type=str, help='The sample name')
	g.add_argument('input_data_directory', type=str,
	                help='Folder containing input fastq files that correspond to the name of the sample')
	g.add_argument('output_data_directory', type=str,
                    help='Folder in which to store pipeline outputs')

	g = parser.add_argument_group("Parameters")
	g.add_argument('--num_bowtie2_threads', type=int, default=1,
    	           help='Number of threads for running the bowtie2 aligner in parallel')
	g.add_argument('--bowtie2_max_fragment_length', type=int, default=2000,
				   help="Maximum fragment size to consider for aligning paired end reads")
	# note: in encode atac specification, for "ENCODE3" the window size is 73 and the shift size is -37. However 150 is the default in Kundaje lab pipeline.
	g.add_argument('--macs2_smoothing_window_size', type=int, default=150,
			       help="The width of the window for smoothing inferred Tn5 insertion points, used by MACS2 peak caller for calling peaks and creating bigWig files")
	g.add_argument('--macs2_summit_window_radius', type=int, default=50,
				   help="Hard-coded peak size for summit-based analysis")
	g.add_argument('--macs2_cap_num_summits', type=int, default=300000,
    			   help="Upper bound for the number of MACS2 summits used in downstream analysis")
	return parser


def main(args):
	logging.basicConfig(format='%(levelname)s: [%(asctime)s] %(message)s', datefmt='%Y-%m-%d %H:%M:%S %Z')
	logger=logging.getLogger()
	logger.setLevel(logging.DEBUG)

	sample_name         = args.pop('sample_name')
	input_data_dir      = args.pop('input_data_directory')
	output_dir          = args.pop('output_data_directory')
	num_bowtie2_threads = args.pop('num_bowtie2_threads')
	bt2_max_frag_len    = args.pop('bowtie2_max_fragment_length')
	macs2_smoothing_window_size = args.pop('macs2_smoothing_window_size')
	macs2_shift_size            = int(round(macs2_smoothing_window_size/2) * -1) #shift size should be int, -1/2 * corresponding smoothing window sizes
	macs2_cap_num_summits       = args.pop('macs2_cap_num_summits')
	macs2_summit_window_radius  = args.pop('macs2_summit_window_radius')

	s_obj = ATAC_Sample(sample_name, input_data_dir, output_dir, macs2_smoothing_window_size, macs2_summit_window_radius)
	setup_output_filepaths(s_obj)

	logger.info('Running FastQC for sample: {0}'.format(s_obj.name))
	run_fastQC(s_obj)
	logger.info('Aligning reads for sample: {0}'.format(s_obj.name))
	bowtie2_align_reads(s_obj, bt2_max_frag_len, num_bowtie2_threads)
	logger.info('Filtering alignments for sample: {0}'.format(s_obj.name))
	filter_alignments(s_obj)
	logger.info('Generating summary and QC reports for sample: {0}'.format(s_obj.name))
	generate_summary_reports(s_obj)
	logger.info('Generating text-based alignment file formats: {0}'.format(s_obj.name))
	generate_text_alignment_formats(s_obj)
	logger.info('Calculating fraction of reads overlapping RefSeq TSS regions: {0}'.format(s_obj.name))
	calc_and_write_frac_reads_close_to_TSS(s_obj.final_Tn5shifted_tagAlign_file, refseq_tss_areas, s_obj.refseq_tss_report)
	logger.info('Calculating fraction of reads overlapping Gencode TSS regions: {0}'.format(s_obj.name))
	calc_and_write_frac_reads_close_to_TSS(s_obj.final_Tn5shifted_tagAlign_file, gencode_tss_areas, s_obj.gencode_tss_report)
	logger.info('Calling peaks with MACS2: {0}'.format(s_obj.name))
	## note: peak calling parameter set from Omni-ATAC paper is: "macs2 callpeak --nomodel --nolambda --keep-dup all --call-summits"
	macs2_call_peaks(s_obj, macs2_shift_size, macs2_smoothing_window_size, macs2_cap_num_summits, macs2_summit_window_radius)
	logger.info('Deleting intermediate files: {0}'.format(s_obj.name))
	for f in s_obj.intermediate_file_list:
		run_command(' '.join(['rm', f]))


if __name__ == '__main__':
    args = vars(_arg_parser().parse_args(sys.argv[1:]))
    main(args)


#OLD CODE SNIPPETS

# #TODO: find purpose for subsampled files... do pseudoreplicate peak calling and IDR! (use half of reads)
# n_subsampled_reads  = 10000000
# logger.debug('Subsampling tagAlign file to {1} reads, excluding chrM: {0}'.format(sample_name, n_subsampled_reads))
# cmd = 'zcat {0} '.format(final_bedpe_file) + \
# '| shuf -n {0} --random-source={1} '.format(n_subsampled_reads, final_bedpe_file) + \
# r"""| awk 'BEGIN{OFS="\t"}{print $1,$2,$3,"N","1000",$9}' """ + \
# '| gzip -nc > {0}'.format(subsampled_tagalign_file)
# os.system(cmd)

	# logger.info('Creating coverage bigWig tracks from final BAM file: {0}'.format(sample_name))
	# os.system(' '.join(['bedtools', 'genomecov', '-bg', '-ibam', final_bam_file, '>', final_tmp_bedgraph_file]))
	# logger.debug('Sorting temporary bedgraph file: {0}'.format(sample_name))
	# os.system(' '.join(['sort', '-k1,1', '-k2,2n', final_tmp_bedgraph_file, '>', final_tmp_sorted_bedgraph_file]))
	# logger.debug('Converting bedgraph to bigWig file: {0}'.format(sample_name))
	# os.system(' '.join(['bedGraphToBigWig', final_tmp_sorted_bedgraph_file, chrom_sizes_file, final_bigWig_file]))
	# logger.debug('Deleting temporary files used to make final coverage bigWig file: {0}'.format(sample_name))
	# os.system(' '.join(['rm', final_tmp_bedgraph_file]))
	# os.system(' '.join(['rm', final_tmp_sorted_bedgraph_file]))
back to top