Skip to main content
  • Home
  • Development
  • Documentation
  • Donate
  • Operational login
  • Browse the archive

swh logo
SoftwareHeritage
Software
Heritage
Archive
Features
  • Search

  • Downloads

  • Save code now

  • Add forge now

  • Help

  • c3be876
  • /
  • atac_pipeline.py
Raw File Download

To reference or cite the objects present in the Software Heritage archive, permalinks based on SoftWare Hash IDentifiers (SWHIDs) must be used.
Select below a type of object currently browsed in order to display its associated SWHID and permalink.

  • content
  • directory
content badge
swh:1:cnt:2196b9e2f90d295af91872675fa85374a3023614
directory badge
swh:1:dir:c3be8763ed0c43d17cb09b29b894cfb899d1fc14

This interface makes it possible to generate software citations, provided that the root directory of browsed objects contains a citation.cff or codemeta.json file.
Select below a type of object currently browsed in order to generate citations for them.

  • content
  • directory
Generate software citation in BibTex format (requires biblatex-software package)
Generating citation ...
Generate software citation in BibTex format (requires biblatex-software package)
Generating citation ...
atac_pipeline.py
import sys
import logging
import argparse
from subprocess import Popen, PIPE
import os
import gzip
import random

# NOTE: the call to this script should be prepended with a 'set_atac_pmacs_env'
# command that sets the paths for the appropriate versions of each tool called
# in this pipeline.

# software references (hard-coded paths on the arjunrajlab cluster filesystem)
refs_dir          = '/project/arjunrajlab/refs'
# bowtie2 index built from the GRCh38 no-alt analysis set
bowtie2_index     = refs_dir + '/bowtie2/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.bowtie_index'
# BED files of +/-500bp windows around TSSs (GENCODE v24 and UCSC RefSeq annotations)
gencode_tss_areas = refs_dir + '/atac_pipeline_refs/GENCODEv24_TSS_hg38_radius500.bed'
refseq_tss_areas  = refs_dir + '/atac_pipeline_refs/UCSC_RefSeq_TSS_hg38_radius500.bed'
# chromosome sizes for hg38 (used for bedGraph clipping and bigWig creation)
chrom_sizes_file  = refs_dir + '/atac_pipeline_refs/hg38.chrom.sizes'
# sorted ENCODE blacklist regions for hg38 (peaks overlapping these are discarded)
hg38_blacklist    = refs_dir + '/atac_pipeline_refs/hg38.blacklist.sorted.bed'


class ATAC_Sample:
	"""
	Lightweight class to store filepaths for atac-seq sample processing.

	Purely a path-bookkeeping object: nothing is created on disk here
	(see setup_output_filepaths for directory creation).  Every output
	path is derived from the sample name plus the chosen output directory;
	the MACS2 paths additionally encode the smoothing window size, so runs
	with different window sizes land in distinct macs2 output folders.
	Input fastq files are expected at <input_data_dir>/<sample>_R1.fastq.gz
	and <input_data_dir>/<sample>_R2.fastq.gz.
	"""
	def __init__(self, sample_name, input_data_dir, output_dir, macs2_window_size, macs2_summit_window_radius):
		self.name            = sample_name
		self.input_data_dir  = input_data_dir
		self.output_data_dir = output_dir + '/' + sample_name
		# input fastq files (paired end)
		self.read1_path     = '{0}/{1}'.format(input_data_dir, sample_name + '_R1.fastq.gz')
		self.read2_path     = '{0}/{1}'.format(input_data_dir, sample_name + '_R2.fastq.gz')
		# bowtie2 alignment files
		self.bowtie2_stderr_logfile = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'log_stderr_bowtie2.txt')
		self.bowtie2_aligned_reads  = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'bowtie2_aligned_reads.bam')
		# alignment filtering files
		self.tmp_early_filt_output        = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'tmp.lightFilt.bam')
		self.early_filt_fixmate_output    = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'lightFilt.fixmate.bam')
		self.filtered_bam_file            = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'filt.bam')
		self.filtered_bam_file_namesorted = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'filt.nameSorted.bam')
		self.picard_metrics_file          = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'picardMetrics.txt')
		self.picard_output                = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'picardMarkedDups.bam')
		self.final_bam_file               = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'final.bam')
		self.final_bam_stats              = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'final.stats.txt')
		self.final_bam_file_namesorted    = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'final.nameSorted.bam')
		# text file representations of filtered aligned reads
		self.early_tagalign_file            = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'early.tagAlign.gz')
		self.final_tagalign_file            = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'final.tagAlign.gz')
		self.final_Tn5shifted_tagAlign_file = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'final.tagAlign_tn5_shifted.gz')
		self.final_bedpe_file               = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'final.bedpe.gz')
		self.macs2_unsorted_tagAlign_file   = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'macs2_formatted.unsorted.tagAlign')
		self.macs2_tagAlign_file            = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'Tn5_insertion_points.tagAlign.gz')
		# subsampled files (currently unused; see "OLD CODE SNIPPETS" at end of file)
		self.subsampled_tagalign_file = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'subsampled.tagAlign.gz')
		# QC and summary report files
		self.insert_size_histogram_plot = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'insert_size_histogram.pdf')
		self.insert_size_histogram_data = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'insert_size_data.txt')
		self.mitochondrial_read_report  = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'mitochondrial_read_report.txt')
		self.refseq_tss_report          = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'refseq_tss_report.txt')
		self.gencode_tss_report         = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'gencode_tss_report.txt')
		self.pbc_qc_file                = '{0}/{1}.{2}'.format(self.output_data_dir, sample_name, 'final.pbc.qc.txt')
		# macs2 peak calling files (folder name encodes the smoothing window size)
		macs2_folder_name = 'macs2_output_windowSize{0}'.format(macs2_window_size)
		self.macs2_output_dir = '{0}/{1}'.format(self.output_data_dir, macs2_folder_name)
		self.macs2_cvg_bedgraph_output      = '{0}/{2}/{1}_treat_pileup.bdg'.format(self.output_data_dir, sample_name, macs2_folder_name)
		self.macs2_tmp_sorted_bedgraph_file = '{0}/{2}/{1}.macs2.tmp.sorted.bedGraph'.format(self.output_data_dir, sample_name, macs2_folder_name)
		self.macs2_tmp_clipped_bedgraphFile = '{0}/{2}/{1}.macs2.tmp.sorted.clipped.bedGraph'.format(self.output_data_dir, sample_name, macs2_folder_name)
		self.macs2_smooth_bigWig_file       = '{0}/{2}/{1}.smoothed_Tn5insertionPoints.bigWig'.format(self.output_data_dir, sample_name, macs2_folder_name)
		# note: names below ending in _peaks.narrowPeak / _summits.bed follow MACS2's own output naming ("--name <sample>")
		#TODO
		self.macs2_narrowPeak_file               = '{0}/{1}/{2}_peaks.narrowPeak'.format(self.output_data_dir, macs2_folder_name, sample_name)
		self.macs2_narrowPeak_fullChrsOnly       = '{0}/{1}/{2}_fullChrsOnly.narrowPeak'.format(self.output_data_dir, macs2_folder_name, sample_name)
		self.macs2_narrowPeak_blacklistFiltered  = '{0}/{1}/{2}_blacklistFiltered.narrowPeak'.format(self.output_data_dir, macs2_folder_name, sample_name)
		self.macs2_summits_file                  = '{0}/{1}/{2}_summits.bed'.format(self.output_data_dir, macs2_folder_name, sample_name)
		#TODO
		self.macs2_summits_file_fullChrsOnly     = '{0}/{1}/{2}_summits.fullChrsOnly.bed'.format(self.output_data_dir, macs2_folder_name, sample_name)
		self.macs2_summits_blacklistFilteredfile = '{0}/{1}/{2}_summits.blacklistFiltered.bed'.format(self.output_data_dir, macs2_folder_name, sample_name)
		#TODO
		self.macs2_summits_blFilt_sorted         = '{0}/{1}/{2}_summits.blacklistFiltered.sorted.bed'.format(self.output_data_dir, macs2_folder_name, sample_name)
		self.macs2_summits_top300k               = '{0}/{1}/{2}_summits.top300k.bed'.format(self.output_data_dir, macs2_folder_name, sample_name)
		self.macs2_summits_top300k_posnSorted    = '{0}/{1}/{2}_summits.top300k.posnSorted.bed'.format(self.output_data_dir, macs2_folder_name, sample_name)
		self.macs2_summits_top300k_windows       = '{0}/{1}/{2}_summits.top300k.windowedRadius{3}.bed'.format(self.output_data_dir, macs2_folder_name, sample_name, macs2_summit_window_radius)

		# files to be deleted after pipeline runs (see cleanup loop in main())
		self.intermediate_file_list = [self.tmp_early_filt_output, self.early_filt_fixmate_output, self.early_tagalign_file, self.filtered_bam_file_namesorted, 
								       self.filtered_bam_file, self.picard_output, self.final_bam_file_namesorted, self.macs2_unsorted_tagAlign_file,
								       self.macs2_cvg_bedgraph_output, self.macs2_tmp_sorted_bedgraph_file, self.macs2_tmp_clipped_bedgraphFile, self.macs2_summits_file_fullChrsOnly,
								       self.macs2_narrowPeak_fullChrsOnly, self.macs2_summits_blFilt_sorted, self.macs2_summits_top300k]


def run_command(cmd):
	"""Run *cmd* through the shell after logging it at DEBUG level.

	Returns the raw status from os.system() (0 on success) so callers may
	optionally check it.  A non-zero status is additionally logged as an
	error so external-tool failures are visible in the pipeline log instead
	of being silently discarded.
	"""
	logger = logging.getLogger()
	logger.debug(cmd)
	status = os.system(cmd)
	if status != 0:
		logger.error('Command exited with non-zero status {0}: {1}'.format(status, cmd))
	return status


def setup_output_filepaths(s_obj):
	"""Create the sample's output directory (with parents) if it does not yet exist."""
	if os.path.exists(s_obj.output_data_dir):
		return
	os.makedirs(s_obj.output_data_dir)


def run_fastQC(s_obj):
	"""Run FastQC on both paired-end fastq files, writing reports into the sample's output directory."""
	for fastq_path in (s_obj.read1_path, s_obj.read2_path):
		run_command(' '.join(['fastqc', fastq_path, '--outdir', s_obj.output_data_dir]))


def bowtie2_align_reads(s_obj, bt2_max_frag_len, num_bowtie2_threads):
	"""Align paired-end reads with bowtie2 (--local mode) and pipe the output
	through samtools into a coordinate-sorted BAM file.

	bowtie2's stderr (alignment statistics) is redirected to the sample's
	bowtie2 log file.
	"""
	align_tokens = ['bowtie2', '--local',
	                '--maxins', str(bt2_max_frag_len), '-x', bowtie2_index,
	                '--threads', str(num_bowtie2_threads),
	                '-1', s_obj.read1_path, '-2', s_obj.read2_path,
	                '2>', s_obj.bowtie2_stderr_logfile]
	to_bam_tokens = ['|', 'samtools', 'view', '-u', '/dev/stdin']
	# the legacy samtools sort interface appends a '.bam' suffix itself, hence the [:-4]
	sort_tokens = ['|', 'samtools', 'sort', '/dev/stdin', s_obj.bowtie2_aligned_reads[:-4]]
	run_command(' '.join(align_tokens + to_bam_tokens + sort_tokens))


def filter_alignments(s_obj):
	"""Filter the raw bowtie2 alignments down to the final analysis-ready BAM.

	Pipeline of external samtools / Picard commands:
	  1. drop unmapped / mate-unmapped / vendor-QC-failed reads, name-sort
	  2. samtools fixmate (-r also removes secondary alignments)
	  3. drop MAPQ<20 and flag-1804 reads, position-sort
	  4. Picard MarkDuplicates (marks, does not remove)
	  5. write final BAM with duplicates removed, then index it
	"""
	logger = logging.getLogger()
	logger.info('Removing unmapped reads, reads with unmapped mates, or reads not passing vendor QC, then sorting by QNAME: {0}'.format(s_obj.name))
	run_command(' '.join(['samtools', 'view', '-F', '524', '-f', '2', '-u', s_obj.bowtie2_aligned_reads,
						'|', 'samtools', 'sort', '-n', '/dev/stdin', s_obj.tmp_early_filt_output[:-4]]))
	# samtools view: 
	#               -F 524   (-F is "blacklist" for certain flag bits)
	#					bit 512: "not passing filters, such as platform/vendor quality controls"
	#					bit 8: "next segment in the template unmapped"
	#					bit 4: "segment unmapped"
	#               -f 2     (-f is "whitelist" for certain flag bits) 
	#					bit 2: "each segment properly aligned according to the aligner"

	logger.info('Adding header, filtering out secondary alignments, and fixing mates: {0}'.format(s_obj.name))
	run_command(' '.join(['samtools', 'view', '-h', s_obj.tmp_early_filt_output, 
				   '|', 'samtools', 'fixmate', '-r', '/dev/stdin', s_obj.early_filt_fixmate_output]))
	logger.info('Removing low map-quality alignments, optical duplicates, and sorting by position: {0}'.format(s_obj.name))
	run_command(' '.join(['samtools', 'view', '-q', '20', '-F', '1804', '-f', '2', '-u', s_obj.early_filt_fixmate_output,
						'|', 'samtools', 'sort', '/dev/stdin', s_obj.filtered_bam_file[:-4]]))
	# samtools view: 
	#               -F 1804  (-F is "blacklist" for certain flag bits)
	#					bit 1024: pcr or optical duplicate 
	#					bit 512: "not passing filters, such as platform/vendor quality controls"
	#					bit 256: secondary alignment. "It is typically used to flag alternative mappings when multiple mappings are presented in a SAM"
	#					bit 8: "next segment in the template unmapped"
	#					bit 4: "segment unmapped"
	#               -f 2     (-f is "whitelist" for certain flag bits) 
	#					bit 2: "each segment properly aligned according to the aligner"
	logger.info('Marking duplicates with Picard for sample: {0}'.format(s_obj.name))
	run_command(' '.join(['MarkDuplicates', 
						'INPUT={0}'.format(s_obj.filtered_bam_file), 'OUTPUT={0}'.format(s_obj.picard_output),
						 'METRICS_FILE={0}'.format(s_obj.picard_metrics_file), 'VALIDATION_STRINGENCY=LENIENT',
						 'ASSUME_SORTED=true', 'REMOVE_DUPLICATES=false', 'VERBOSITY=WARNING']))
	logger.info('Writing final BAM file for sample (with PCR duplicates removed): {0}'.format(s_obj.name))
	run_command(' '.join(['samtools', 'view', '-F', '1804', '-f', '2', '-b', s_obj.picard_output, '>', s_obj.final_bam_file]))
	logger.info('Indexing final BAM file for sample: {0}'.format(s_obj.name))
	run_command(' '.join(['samtools', 'index', s_obj.final_bam_file]))


def generate_summary_reports(s_obj):
	"""Generate QC/summary reports for a sample.

	Produces: samtools flagstat output, a PBC (PCR bottleneck coefficient)
	QC file, a Picard insert-size histogram, and a mitochondrial-read
	fraction report.  Relies on samtools, bedtools (bamToBed), zcat, and
	Picard's CollectInsertSizeMetrics being available on the PATH.
	"""
	logger = logging.getLogger()

	logger.info('Computing samtools stats for sample: {0}'.format(s_obj.name))
	run_command(' '.join(['samtools', 'flagstat', s_obj.final_bam_file, '>', s_obj.final_bam_stats]))

	logger.info('Creating PBC QC file for BAM with PCR duplicates for sample: {0}'.format(s_obj.name))
	# PBC File output format:
		# TotalReadPairs [tab] DistinctReadPairs [tab] OneReadPair [tab] TwoReadPairs [tab] NRF=Distinct/Total [tab] PBC1=OnePair/Distinct [tab] PBC2=OnePair/TwoPair
	logger.info('Creating temporary name-sorted filtered BAM file: {0}'.format(s_obj.name))
	run_command(' '.join(['samtools', 'sort', '-n', s_obj.filtered_bam_file, s_obj.filtered_bam_file_namesorted[:-4]]))

	logger.info('Running PBC computation and writing report: {0}'.format(s_obj.name))
	cmd = "bamToBed -bedpe -i {0}".format(s_obj.filtered_bam_file_namesorted) + \
		   " | awk 'BEGIN{OFS=\"\\t\"}{print $1,$2,$3,$6,$9,$10}'" + \
		   " | grep -v 'chrM' | sort | uniq -c" + \
		   " | awk 'BEGIN{mt=0;m0=0;m1=0;m2=0} ($1==1){m1=m1+1} ($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} END{printf \"%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n\",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}'" + \
		   " > {0}".format(s_obj.pbc_qc_file)
	run_command(cmd)

	logger.info('Plotting insert size histogram for sample: {0}'.format(s_obj.name))
	run_command(' '.join(['CollectInsertSizeMetrics', 'INPUT={0}'.format(s_obj.final_bam_file),
						'OUTPUT={0}'.format(s_obj.insert_size_histogram_data), 'H={0}'.format(s_obj.insert_size_histogram_plot)]))

	logger.info('Calculating percent mitochondrial reads for sample: {0}'.format(s_obj.name))

	logger.info('Creating tagAlign formatted file for lightly-filtered aligned reads: {0}'.format(s_obj.name))
	cmd = "bamToBed -i {0}".format(s_obj.early_filt_fixmate_output) + \
	 r"""| awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}' """ + \
		"| gzip -nc > {0}".format(s_obj.early_tagalign_file)
	run_command(cmd)

	# calculate total read pairs (before MQ filtering); the tagAlign file has one
	# line per read, so integer-divide the line count by 2 to get whole pairs
	p1 = Popen(['zcat', s_obj.early_tagalign_file], stdout=PIPE)
	p2 = Popen(['wc', '-l'], stdin=p1.stdout, stdout=PIPE, stderr=PIPE)
	p1.stdout.close()
	output, _ = p2.communicate()  # communicate() also waits for the process
	n_total_read_pairs = int(output) // 2
	# calculate number of mitochondrial read pairs (before MQ filtering)
	p1 = Popen(['zcat', s_obj.early_tagalign_file], stdout=PIPE)
	p2 = Popen(['grep', 'chrM'], stdin=p1.stdout, stdout=PIPE)
	p3 = Popen(['wc', '-l'], stdin=p2.stdout, stdout=PIPE, stderr=PIPE)
	p1.stdout.close()
	p2.stdout.close()
	output, _ = p3.communicate()
	n_mitochondrial_read_pairs = int(output) // 2
	# guard against an empty tagAlign file to avoid ZeroDivisionError
	if n_total_read_pairs > 0:
		frac_mitochondrial = float(n_mitochondrial_read_pairs) / float(n_total_read_pairs)
	else:
		frac_mitochondrial = 0.0
	with open(s_obj.mitochondrial_read_report, 'w') as f:
		f.write('\t'.join(['TotalReadPairs', 'MitochondrialReadPairs', 'FracMitochondrialReads'])+'\n')
		f.write('\t'.join(map(str, [n_total_read_pairs, n_mitochondrial_read_pairs, 
						   frac_mitochondrial]))+'\n')


def generate_text_alignment_formats(s_obj):
	"""Create text-based (tagAlign / BEDPE) representations of the final BAM.

	Produces:
	  - a "virtual single-ended" tagAlign file
	  - a Tn5-shifted tagAlign file (+4 bp on '+' strand, -5 bp on '-' strand)
	  - a name-sorted BAM and a BEDPE file (chrM excluded)
	  - a position-sorted, gzipped tagAlign of single Tn5 insertion points
	    for MACS2 peak calling (one randomly chosen insertion point per read
	    pair; the RNG is seeded so output is deterministic)
	"""
	logger = logging.getLogger()
	# Create "virtual single-ended file" 
	logger.info('Creating tagAlign formatted file: {0}'.format(s_obj.name))
	cmd = "bamToBed -i {0}".format(s_obj.final_bam_file) + \
	 r"""| awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}' """ + \
		"| gzip -nc > {0}".format(s_obj.final_tagalign_file)
	run_command(cmd)
	# Shift tagAlign file due to Tn5 insertion (original ATAC seq paper authors did this)
	#     "For peak-calling and footprinting, we adjusted the read start sites to represent the center of the transposon binding event. 
	#      Previous descriptions of the Tn5 transposase show that the transposon binds as a dimer and inserts two adaptors separated by 9 bp (ref. 11). 
	#      Therefore, all reads aligning to the + strand were offset by +4 bp, and all reads aligning to the - strand were offset -5 bp." 
	logger.info('Creating Tn5-shifted final tagAlign file: {0}'.format(s_obj.name))
	cmd = "zcat {0} ".format(s_obj.final_tagalign_file) + \
	 r"""| awk -F $'\t' 'BEGIN {OFS = FS}{ if ($6 == "+") {$2 = $2 + 4; $3 = $3 + 4} else if ($6 == "-") {$2 = $2 - 5; $3 = $3 - 5} print $0}' """ + \
		"| gzip -nc > {0}".format(s_obj.final_Tn5shifted_tagAlign_file)
	run_command(cmd)
	logger.info('Creating temporary name-sorted final BAM: {0}'.format(s_obj.name))
	run_command('samtools sort -n {0} {1}'.format(s_obj.final_bam_file, s_obj.final_bam_file_namesorted[:-4]))
	logger.info('Creating BEDPE file from name-sorted final BAM, excluding mitochondrial reads: {0}'.format(s_obj.name))
	run_command("bamToBed -bedpe -mate1 -i {0} | grep -v 'chrM' | gzip -nc > {1}".format(s_obj.final_bam_file_namesorted, s_obj.final_bedpe_file))

	logger.info('Generating tagAlign file for MACS2 peak calling: {0}'.format(s_obj.name))
	random.seed(0)  # deterministic insertion-point selection across reruns
	with open(s_obj.macs2_unsorted_tagAlign_file, 'w') as fout:
		with gzip.open(s_obj.final_bedpe_file, 'rt') as f:
			for line in f:
				fields    = line.strip().split('\t')
				r1_chr    = fields[0]
				r1_pos1   = int(fields[1])
				r1_pos2   = int(fields[2])
				r1_strand = fields[8]
				r2_chr    = fields[3]
				r2_pos1   = int(fields[4])
				r2_pos2   = int(fields[5])
				r2_strand = fields[9]
				assert(r1_chr == r2_chr)
				if r1_strand == r2_strand:
					logger.warning('Final BEDPE contains read with unexpected R1-R2 orientation!: ' + line)
				# Tn5 shifting done by original ATAC seq paper authors
				if r1_strand == '+':
					r1_pos1 += 4
					r1_pos2 += 4
					r2_pos1 -= 5
					r2_pos2 -= 5
				elif r1_strand == '-':
					r1_pos1 -= 5
					r1_pos2 -= 5
					r2_pos1 += 4
					r2_pos2 += 4
				else:
					logger.warning('Final BEDPE final contains improperly-formatted line!: ' + line)
				posn_list = [r1_pos1, r1_pos2, r2_pos1, r2_pos2]
				min_posn = min(posn_list)
				max_posn = max(posn_list)
				if min_posn in [r1_pos1, r1_pos2]:
					min_strand = r1_strand
					max_strand = r2_strand
				elif min_posn in [r2_pos1, r2_pos2]:
					min_strand = r2_strand
					max_strand = r1_strand
				else:
					# unreachable unless min() misbehaves; raise a real exception
					# (the original bare `raise()` produced an unhelpful TypeError)
					raise ValueError('Could not assign strand to minimum position for BEDPE line: ' + line)
				# Pick only one of the Tn5 insertion points (ENCODE pipeline only keeps one in paired-end datasets, perhaps due to possibly breaking MACS2 statistical assumptions when including both?)
				if random.randint(0,1):
					fout.write('\t'.join(map(str, [r1_chr, min_posn, min_posn, 'N', 1000, min_strand])) + '\n')
				else:
					fout.write('\t'.join(map(str, [r1_chr, max_posn, max_posn, 'N', 1000, max_strand])) + '\n')
	run_command(' '.join(['sort', '-k1,1', '-k2,2n', s_obj.macs2_unsorted_tagAlign_file, '>', s_obj.macs2_tagAlign_file[:-3]]))
	run_command('gzip -f {0}'.format(s_obj.macs2_tagAlign_file[:-3]))


def calc_and_write_frac_reads_close_to_TSS(final_tagalign_file, reference_tss_areas, output_file):
	"""Write a small TSV report with the fraction of reads near TSS regions.

	final_tagalign_file: gzipped tagAlign file of aligned reads.
	reference_tss_areas: BED file of windows around annotated TSSs.
	output_file:         path for the two-line report (header + values).

	Counts use bedtools intersect -u (each read counted at most once) and
	exclude mitochondrial (chrM) reads from the TSS-overlap count.
	"""
	# count reads overlapping TSS windows, excluding chrM
	p1 = Popen(['bedtools', 'intersect', '-u', '-a', final_tagalign_file, '-b', reference_tss_areas], stdout=PIPE)
	p2 = Popen(['grep', '-v', 'chrM'], stdin=p1.stdout, stdout=PIPE)
	p3 = Popen(['wc', '-l'], stdin=p2.stdout, stdout=PIPE, stderr=PIPE)
	p1.stdout.close()
	p2.stdout.close()
	output, _ = p3.communicate()  # communicate() also waits for the process
	n_tss_reads = int(output)

	# count total reads in the (gzipped) tagAlign file
	p1 = Popen(['zcat', final_tagalign_file], stdout=PIPE)
	p2 = Popen(['wc', '-l'], stdin=p1.stdout, stdout=PIPE, stderr=PIPE)
	p1.stdout.close()
	output, _ = p2.communicate()
	n_total_reads = int(output)

	# guard against an empty tagAlign file to avoid ZeroDivisionError
	if n_total_reads > 0:
		frac_tss_reads = float(n_tss_reads) / n_total_reads
	else:
		frac_tss_reads = 0.0
	with open(output_file, 'w') as f:
		f.write('\t'.join(['NumReads(R1+R2)', 'NumReadsCloseToTSS(R1+R2)', 'FracTSS_Reads']) + '\n')
		f.write('\t'.join(map(str, [n_total_reads, n_tss_reads, frac_tss_reads])) + '\n')


def macs2_call_peaks(s_obj, shiftsize, ext_size, macs2_cap_num_summits, macs2_summit_window_radius):
	"""Call peaks with MACS2 on the Tn5-insertion tagAlign file and post-process the outputs.

	Steps: macs2 callpeak (BED input, no model, summit calling, SPMR bedGraph) ->
	sort and clip the coverage bedGraph to chromosome sizes -> restrict
	narrowPeak/summit files to full chromosomes (chr1-22, X, Y) -> remove
	blacklisted regions -> keep the top macs2_cap_num_summits summits by score ->
	expand summits to windows of +/- macs2_summit_window_radius bp -> convert the
	clipped bedGraph to bigWig for IGV viewing.

	shiftsize is expected to be negative (-ext_size/2) so insertion points are
	smoothed symmetrically; see main() where it is computed.
	"""
	logger = logging.getLogger()

	run_command(' '.join(['macs2', 'callpeak', '--nomodel', '--nolambda', '--keep-dup', 'all',
						'--call-summits', '-B', '--SPMR', '--format', 'BED',
						'-q', '0.05', '--shift', str(shiftsize), '--extsize', str(ext_size),
						'-t', s_obj.macs2_tagAlign_file, '--outdir', s_obj.macs2_output_dir, 
						'--name', s_obj.name]))

	logger.info('Sorting temporary bedgraph file from MACS2: {0}'.format(s_obj.name))
	run_command(' '.join(['sort', '-k1,1', '-k2,2n', s_obj.macs2_cvg_bedgraph_output, '>', s_obj.macs2_tmp_sorted_bedgraph_file]))

	logger.info('Clipping peak segments larger than chromosome sizes: {0}'.format(s_obj.name))
	def make_chr_size_dict(chrom_sizes_file):
		# parse a UCSC-style chrom.sizes file (chrom<TAB>size) into a dict
		chr_size_dict = {}
		with open(chrom_sizes_file) as f:
			for line in f:
				fields = line.strip().split('\t')
				chr_name = fields[0]
				chr_size = int(fields[1])
				chr_size_dict[chr_name] = chr_size
		return chr_size_dict
	chr_size_dict = make_chr_size_dict(chrom_sizes_file)

	fout = open(s_obj.macs2_tmp_clipped_bedgraphFile, 'w')
	with open(s_obj.macs2_tmp_sorted_bedgraph_file) as f:
		for line in f:
			fields    = line.strip().split('\t')
			chr_field = fields[0]
			coord1    = int(fields[1])
			coord2    = int(fields[2])
			# NOTE(review): raises KeyError if the bedGraph names a contig absent
			# from the chrom.sizes file — presumably all contigs are present; confirm
			chr_size  = chr_size_dict[chr_field]
			if coord1 >= chr_size and coord2 >= chr_size:
				continue
			elif coord2 >= chr_size:
				# NOTE(review): clips end to chr_size - 1; bedGraphToBigWig allows
				# end == chr_size, so this drops one base — confirm intentional
				coord2 = chr_size - 1
			fout.write('\t'.join(map(str, [chr_field, coord1, coord2, fields[3]])) + '\n')
	fout.flush()
	fout.close()

	# keep only canonical chromosomes (chr1-22, chrX, chrY); excludes scaffolds/alt contigs
	fullChrOnly_grep_command = r'grep -P "^(chr1|chr2|chr3|chr4|chr5|chr6|chr7|chr8|chr9|chr10|chr11|chr12|chr13|chr14|chr15|chr16|chr17|chr18|chr19|chr20|chr21|chr22|chrX|chrY)\t"'
	logger.info('Filtering non-full-chr contigs out of MACS2 narrowPeak file: {0}'.format(s_obj.name))
	run_command(' '.join([fullChrOnly_grep_command, s_obj.macs2_narrowPeak_file, ">", s_obj.macs2_narrowPeak_fullChrsOnly]))
	logger.info('Filtering blacklisted regions out of MACS2 narrowPeak file: {0}'.format(s_obj.name))
	run_command(' '.join(['bedtools', 'intersect', '-v', '-sorted',
		                '-a', s_obj.macs2_narrowPeak_fullChrsOnly, '-b', hg38_blacklist,
		                '>', s_obj.macs2_narrowPeak_blacklistFiltered]))
	logger.info('Filtering non-full-chr contigs out of MACS2 summits: {0}'.format(s_obj.name))
	run_command(' '.join([fullChrOnly_grep_command, s_obj.macs2_summits_file, ">", s_obj.macs2_summits_file_fullChrsOnly]))
	logger.info('Filtering blacklisted regions out of MACS2 summits: {0}'.format(s_obj.name))
	run_command(' '.join(['bedtools', 'intersect', '-v', '-sorted',
		                '-a', s_obj.macs2_summits_file_fullChrsOnly, '-b', hg38_blacklist,
		                '>', s_obj.macs2_summits_blacklistFilteredfile]))
	#optional todo: rename peaks by rank?
	logger.info('Selecting top {1} summits: {0}'.format(s_obj.name, macs2_cap_num_summits))
	run_command(' '.join(['sort', '-k', '5gr,5gr', s_obj.macs2_summits_blacklistFilteredfile,
						'>', s_obj.macs2_summits_blFilt_sorted]))
	run_command(' '.join(['head', '-n', str(macs2_cap_num_summits), s_obj.macs2_summits_blFilt_sorted,
						'>', s_obj.macs2_summits_top300k]))
	logger.info('Sorting top MACS2 summits by genomic position: {0}'.format(s_obj.name))
	run_command(' '.join(['sort', '-k1,1', '-k2,2n', s_obj.macs2_summits_top300k,
						'>', s_obj.macs2_summits_top300k_posnSorted]))
	logger.info('Converting summits to small genomic windows: {0}'.format(s_obj.name))
	run_command(' '.join(['cat', s_obj.macs2_summits_top300k_posnSorted, '|', 'awk', 
					r"""'BEGIN{OFS="\t"}{""" + '$2 = $2 - {0}; $3 = $3 + {0}; print $0'.format(macs2_summit_window_radius) + r"}'",
					    '>', s_obj.macs2_summits_top300k_windows]))
	logger.info('Converting bedgraph to bigWig file (for viewing in IGV): {0}'.format(s_obj.name))
	run_command(' '.join(['bedGraphToBigWig', s_obj.macs2_tmp_clipped_bedgraphFile, chrom_sizes_file, s_obj.macs2_smooth_bigWig_file]))


def _arg_parser():
	script_usage = script_usage = '''%(prog)s [options] sample_name input_data_directory output_data_directory'''
	parser = argparse.ArgumentParser(usage=script_usage)

	g = parser.add_argument_group("Required Arguments")
	g.add_argument('sample_name', type=str, help='The sample name')
	g.add_argument('input_data_directory', type=str,
	                help='Folder containing input fastq files that correspond to the name of the sample')
	g.add_argument('output_data_directory', type=str,
                    help='Folder in which to store pipeline outputs')

	g = parser.add_argument_group("Parameters")
	g.add_argument('--num_bowtie2_threads', type=int, default=1,
    	           help='Number of threads for running the bowtie2 aligner in parallel')
	g.add_argument('--bowtie2_max_fragment_length', type=int, default=2000,
				   help="Maximum fragment size to consider for aligning paired end reads")
	# note: in encode atac specification, for "ENCODE3" the window size is 73 and the shift size is -37. However 150 is the default in Kundaje lab pipeline.
	g.add_argument('--macs2_smoothing_window_size', type=int, default=150,
			       help="The width of the window for smoothing inferred Tn5 insertion points, used by MACS2 peak caller for calling peaks and creating bigWig files")
	g.add_argument('--macs2_summit_window_radius', type=int, default=50,
				   help="Hard-coded peak size for summit-based analysis")
	g.add_argument('--macs2_cap_num_summits', type=int, default=300000,
    			   help="Upper bound for the number of MACS2 summits used in downstream analysis")
	return parser


def main(args):
	"""Run the full ATAC-seq pipeline for one sample, end to end.

	args: dict produced by vars(parser.parse_args()); entries are consumed
	destructively via pop().  Stage order: FastQC -> bowtie2 alignment ->
	alignment filtering -> QC/summary reports -> text alignment formats ->
	TSS-overlap reports (RefSeq and Gencode) -> MACS2 peak calling ->
	deletion of intermediate files.
	"""
	logging.basicConfig(format='%(levelname)s: [%(asctime)s] %(message)s', datefmt='%Y-%m-%d %H:%M:%S %Z')
	logger=logging.getLogger()
	logger.setLevel(logging.DEBUG)

	sample_name         = args.pop('sample_name')
	input_data_dir      = args.pop('input_data_directory')
	output_dir          = args.pop('output_data_directory')
	num_bowtie2_threads = args.pop('num_bowtie2_threads')
	bt2_max_frag_len    = args.pop('bowtie2_max_fragment_length')
	macs2_smoothing_window_size = args.pop('macs2_smoothing_window_size')
	macs2_shift_size            = int(round(macs2_smoothing_window_size/2) * -1) #shift size should be int, -1/2 * corresponding smoothing window sizes
	macs2_cap_num_summits       = args.pop('macs2_cap_num_summits')
	macs2_summit_window_radius  = args.pop('macs2_summit_window_radius')

	s_obj = ATAC_Sample(sample_name, input_data_dir, output_dir, macs2_smoothing_window_size, macs2_summit_window_radius)
	setup_output_filepaths(s_obj)

	logger.info('Running FastQC for sample: {0}'.format(s_obj.name))
	run_fastQC(s_obj)
	logger.info('Aligning reads for sample: {0}'.format(s_obj.name))
	bowtie2_align_reads(s_obj, bt2_max_frag_len, num_bowtie2_threads)
	logger.info('Filtering alignments for sample: {0}'.format(s_obj.name))
	filter_alignments(s_obj)
	logger.info('Generating summary and QC reports for sample: {0}'.format(s_obj.name))
	generate_summary_reports(s_obj)
	logger.info('Generating text-based alignment file formats: {0}'.format(s_obj.name))
	generate_text_alignment_formats(s_obj)
	logger.info('Calculating fraction of reads overlapping RefSeq TSS regions: {0}'.format(s_obj.name))
	calc_and_write_frac_reads_close_to_TSS(s_obj.final_Tn5shifted_tagAlign_file, refseq_tss_areas, s_obj.refseq_tss_report)
	logger.info('Calculating fraction of reads overlapping Gencode TSS regions: {0}'.format(s_obj.name))
	calc_and_write_frac_reads_close_to_TSS(s_obj.final_Tn5shifted_tagAlign_file, gencode_tss_areas, s_obj.gencode_tss_report)
	logger.info('Calling peaks with MACS2: {0}'.format(s_obj.name))
	## note: peak calling parameter set from Omni-ATAC paper is: "macs2 callpeak --nomodel --nolambda --keep-dup all --call-summits"
	macs2_call_peaks(s_obj, macs2_shift_size, macs2_smoothing_window_size, macs2_cap_num_summits, macs2_summit_window_radius)
	logger.info('Deleting intermediate files: {0}'.format(s_obj.name))
	for f in s_obj.intermediate_file_list:
		run_command(' '.join(['rm', f]))


if __name__ == '__main__':
    # Parse CLI arguments into a plain dict so main() can consume them with dict.pop().
    args = vars(_arg_parser().parse_args(sys.argv[1:]))
    main(args)


#OLD CODE SNIPPETS

# #TODO: find purpose for subsampled files... do pseudoreplicate peak calling and IDR! (use half of reads)
# n_subsampled_reads  = 10000000
# logger.debug('Subsampling tagAlign file to {1} reads, excluding chrM: {0}'.format(sample_name, n_subsampled_reads))
# cmd = 'zcat {0} '.format(final_bedpe_file) + \
# '| shuf -n {0} --random-source={1} '.format(n_subsampled_reads, final_bedpe_file) + \
# r"""| awk 'BEGIN{OFS="\t"}{print $1,$2,$3,"N","1000",$9}' """ + \
# '| gzip -nc > {0}'.format(subsampled_tagalign_file)
# os.system(cmd)

	# logger.info('Creating coverage bigWig tracks from final BAM file: {0}'.format(sample_name))
	# os.system(' '.join(['bedtools', 'genomecov', '-bg', '-ibam', final_bam_file, '>', final_tmp_bedgraph_file]))
	# logger.debug('Sorting temporary bedgraph file: {0}'.format(sample_name))
	# os.system(' '.join(['sort', '-k1,1', '-k2,2n', final_tmp_bedgraph_file, '>', final_tmp_sorted_bedgraph_file]))
	# logger.debug('Converting bedgraph to bigWig file: {0}'.format(sample_name))
	# os.system(' '.join(['bedGraphToBigWig', final_tmp_sorted_bedgraph_file, chrom_sizes_file, final_bigWig_file]))
	# logger.debug('Deleting temporary files used to make final coverage bigWig file: {0}'.format(sample_name))
	# os.system(' '.join(['rm', final_tmp_bedgraph_file]))
	# os.system(' '.join(['rm', final_tmp_sorted_bedgraph_file]))

back to top

Software Heritage — Copyright (C) 2015–2025, The Software Heritage developers. License: GNU AGPLv3+.
The source code of Software Heritage itself is available on our development forge.
The source code files archived by Software Heritage are available under their own copyright and licenses.
Terms of use: Archive access, API— Content policy— Contact— JavaScript license information— Web API