"""Submit one ``bsub`` job per sample found in an input directory.

Usage:
    python <this_script> <path_to_set_atac_pmacs_env> <path_to_atac_pipeline> \
        <input_data_dir> <output_dir>

Arguments:
    path_to_set_atac_pmacs_env  e.g. '/home/esanford/dev_atac_seq_pipeline/set_atac_pmacs_env'
    path_to_atac_pipeline       e.g. '/home/esanford/dev_atac_seq_pipeline/atac_pipeline.py'
    input_data_dir              directory holding '<sample>_R1.fastq.gz' /
                                '<sample>_R2.fastq.gz' files
    output_dir                  directory under which each job's stdout/stderr
                                paths ('<output_dir>/<sample>/...') are placed

Every file in ``input_data_dir`` is matched against ``SAMPLE_NAME_REGEX``;
each distinct sample name yields exactly one ``bsub`` submission.
"""
import glob
import os
import re
import subprocess
import sys

NUM_BOWTIE2_THREADS = 4
# Default on PMACS is 6GB. 6GB will cause most jobs to terminate if there are
# ~50-75M reads per sample.
MEMORY_PER_JOB_IN_MB = 12 * 1024

# Raw string with escaped dots and an end anchor, so that only names that
# really end in '_R1.fastq.gz' / '_R2.fastq.gz' are accepted (the unescaped,
# unanchored form also accepted e.g. 'x_R1.fastq.gz.md5').
SAMPLE_NAME_REGEX = r'(.*/)(.*)(_R[1-2])\.fastq\.gz$'


def strip_trailing_slashes(path):
    """Return *path* without trailing '/' characters.

    Directory arguments are joined with '/' later on, so a terminal slash
    would produce doubled separators. A path made only of slashes (the
    filesystem root) is returned unchanged.
    """
    return path.rstrip('/') or path


def parse_sample_name(sample_filepath):
    """Return the sample name encoded in *sample_filepath*, or None.

    The sample name is the part of the file name that precedes the
    '_R1.fastq.gz' / '_R2.fastq.gz' suffix. None is returned when the path
    does not match ``SAMPLE_NAME_REGEX``.
    """
    match = re.match(SAMPLE_NAME_REGEX, sample_filepath)
    if match is None:
        return None
    return match.group(2)


def find_sample_names(input_data_dir):
    """Return the sorted, de-duplicated sample names found in *input_data_dir*.

    A warning is printed for every directory entry that does not look like an
    input fastq file; such entries are otherwise ignored.
    """
    sample_names = set()  # a set, so R1/R2 mates collapse to one sample
    for sample_filepath in glob.glob(input_data_dir + '/*'):
        sample_name = parse_sample_name(sample_filepath)
        if sample_name is None:
            print('warning: {0} does not match expected filename for input fastq files! Expected filenames conform to this regular expression: {1}'.format(sample_filepath, SAMPLE_NAME_REGEX))
        else:
            sample_names.add(sample_name)
    # Sorted so that the submission order is reproducible between runs.
    return sorted(sample_names)


def build_bsub_command(sample_name, path_to_set_atac_pmacs_env,
                       path_to_atac_pipeline, input_data_dir, output_dir,
                       num_bowtie2_threads=NUM_BOWTIE2_THREADS,
                       memory_per_job_in_mb=MEMORY_PER_JOB_IN_MB):
    """Return the ``bsub`` invocation for one sample as an argument list.

    The list form is handed to ``subprocess`` without a shell, so paths or
    sample names containing spaces or shell metacharacters reach ``bsub``
    as single, uninterpreted arguments.
    """
    return [
        'bsub',
        '-J', 'pipeline_' + sample_name,
        '-n', str(num_bowtie2_threads),
        '-M', str(memory_per_job_in_mb),
        '-o', '{0}/{1}/{1}.pipeline_stdout.txt'.format(output_dir, sample_name),
        '-e', '{0}/{1}/{1}.pipeline_stderr.txt'.format(output_dir, sample_name),
        path_to_set_atac_pmacs_env,
        'python',
        path_to_atac_pipeline,
        sample_name,
        input_data_dir,
        output_dir,
        '--num_bowtie2_threads', str(num_bowtie2_threads),
    ]


def main(argv):
    """Parse *argv* (``sys.argv`` layout) and submit one job per sample.

    Exits with a usage message when fewer than four arguments are supplied.
    A failed submission is reported on stderr and does not stop the
    remaining submissions.
    """
    if len(argv) < 5:
        sys.exit('usage: python {0} <path_to_set_atac_pmacs_env> <path_to_atac_pipeline> <input_data_dir> <output_dir>'.format(argv[0] if argv else 'submit_jobs.py'))

    path_to_set_atac_pmacs_env = argv[1]
    path_to_atac_pipeline = argv[2]
    # Tolerate a terminal slash on the directory arguments.
    input_data_dir = strip_trailing_slashes(argv[3])
    output_dir = strip_trailing_slashes(argv[4])

    sample_name_list = find_sample_names(input_data_dir)
    print('about to submit jobs for these samples: {0}'.format(sample_name_list))

    for sample_name in sample_name_list:
        job_cmd = build_bsub_command(sample_name, path_to_set_atac_pmacs_env,
                                     path_to_atac_pipeline, input_data_dir,
                                     output_dir)
        print(' '.join(job_cmd))
        try:
            status = subprocess.call(job_cmd)
        except OSError as err:
            # e.g. 'bsub' is not installed or not on PATH
            sys.stderr.write('warning: could not run bsub for sample {0}: {1}\n'.format(sample_name, err))
            continue
        if status != 0:
            sys.stderr.write('warning: bsub exited with status {0} for sample {1}\n'.format(status, sample_name))


if __name__ == '__main__':
    main(sys.argv)