https://github.com/arjunrajlaboratory/atac-seq_pipeline_paired-end
Revision b165422254188ce4f89074374967f3390fd5d875 authored by emsanford on 25 February 2021, 23:34:09 UTC, committed by GitHub on 25 February 2021, 23:34:09 UTC
Do not include replicate number in the sample key by default
1 parent c4c819e
Tip revision: b165422254188ce4f89074374967f3390fd5d875 authored by emsanford on 25 February 2021, 23:34:09 UTC
Update concat_zipped_fastq_files_from_illumina.py
Update concat_zipped_fastq_files_from_illumina.py
Tip revision: b165422
concat_zipped_fastq_files_from_illumina.py
import glob
import os
import re
import subprocess
import sys
import time
# Matches FASTQ file names of the form "<sample name>_S<number>_L00<rest>".
# Group 1 is the sample name and group 3 is the sample number.
SAMPLE_NAME_REGEX = re.compile('(.*)(_S)([0-9]+)(_L00.*)')

# Number of files expected per sample and per read unless a third
# command-line argument overrides it.
DEFAULT_NUMBER_OF_LANES = 4

# Pause between writing the concatenated files and submitting the
# compression jobs. zcat has already exited by then; the delay is kept from
# the original script (presumably to let a shared filesystem settle before
# the bsub job runs on another host -- TODO confirm it is still needed).
DELAY_BEFORE_COMPRESSION_SECONDS = 5


def group_fastq_files_by_sample(fastq_files):
    """Group FASTQ paths by the sample name encoded in their file names.

    The sample key is the sample name alone; neither the sample number nor a
    replicate label is included in it.

    Args:
        fastq_files: iterable of paths whose base names match
            SAMPLE_NAME_REGEX.

    Returns:
        dict mapping each sample name to the list of its paths, in input
        order.

    Raises:
        ValueError: if a file name does not match SAMPLE_NAME_REGEX.
    """
    sample_dict = {}
    for path in fastq_files:
        file_name = os.path.basename(path)
        match = SAMPLE_NAME_REGEX.match(file_name)
        if match is None:
            raise ValueError(
                'cannot parse a sample name from FASTQ file name: {0}'.format(path))
        sample_name = match.group(1)
        sample_number = int(match.group(3))
        print("{0} : {1}".format(sample_number, sample_name))
        sample_dict.setdefault(sample_name, []).append(path)
    return sample_dict


def select_read_files(filepaths, read_tag, number_of_lanes, sample_label=''):
    """Return the paths for one read of a sample, sorted by file name.

    The tag is looked up in the file name only, so a directory whose name
    happens to contain it cannot cause a mis-assignment. Sorting by file name
    puts the R1 and R2 lists in the same lane order, which keeps mates in
    sync after concatenation (glob returns paths in arbitrary order).

    Args:
        filepaths: all paths belonging to one sample.
        read_tag: substring identifying the read, e.g. '_R1_' or '_R2_'.
        number_of_lanes: exact number of files that must be selected.
        sample_label: sample name used in the error message only.

    Returns:
        list of the selected paths, sorted by (file name, full path).

    Raises:
        ValueError: if the number of selected files is not number_of_lanes.
    """
    selected = sorted(
        (path for path in filepaths if read_tag in os.path.basename(path)),
        key=lambda path: (os.path.basename(path), path))
    if len(selected) != number_of_lanes:
        raise ValueError(
            'sample {0!r}: expected {1} file(s) containing {2!r} but found {3}: {4}'
            .format(sample_label, number_of_lanes, read_tag, len(selected), selected))
    return selected


def concatenate_gzipped_files(input_files, output_file):
    """Decompress input_files with zcat and write the result to output_file.

    Raises:
        subprocess.CalledProcessError: if zcat exits with a non-zero status.
    """
    print('zcat {0} > {1}'.format(' '.join(input_files), output_file))
    # Arguments are passed as a list and the redirect is done in Python, so
    # paths containing spaces or shell metacharacters are handled safely.
    with open(output_file, 'wb') as output_handle:
        subprocess.check_call(['zcat'] + list(input_files), stdout=output_handle)


def submit_gzip_job(path):
    """Submit 'bsub gzip <path>'; warn on stderr instead of aborting on failure.

    Returns:
        True if bsub exited with status 0, False otherwise.
    """
    try:
        return_code = subprocess.call(['bsub', 'gzip', path])
    except OSError as error:
        sys.stderr.write(
            'warning: could not run bsub for {0}: {1}\n'.format(path, error))
        return False
    if return_code != 0:
        sys.stderr.write(
            'warning: bsub exited with status {0} for {1}\n'.format(return_code, path))
        return False
    return True


def main(argv=None):
    """Concatenate per-lane gzipped FASTQ files into one R1/R2 pair per sample.

    Args:
        argv: argument vector laid out like sys.argv (defaults to sys.argv):
            argv[1]  base directory; files are found with '<base>/*/*.fastq*'
                     e.g. '/home/esanford/data/HD3_ATAC-seq/data_from_illumina/FASTQ_Generation_2018-10-31_08_01_22Z-134203162'
            argv[2]  output directory
                     e.g. '/home/esanford/data/HD3_ATAC-seq/concatenated_data'
            argv[3]  optional number of lanes per sample (default 4)
    """
    if argv is None:
        argv = sys.argv
    if len(argv) not in (3, 4):
        sys.exit('usage: {0} <base_directory> <output_directory> [number_of_lanes]'
                 .format(argv[0] if argv else 'concat_zipped_fastq_files_from_illumina.py'))

    base_directory = argv[1]
    output_directory = argv[2]
    number_of_lanes = int(argv[3]) if len(argv) == 4 else DEFAULT_NUMBER_OF_LANES

    fastq_files = glob.glob(base_directory + '/*/*.fastq*')
    print(fastq_files)
    sample_dict = group_fastq_files_by_sample(fastq_files)

    # Validate every sample before running any command so that a lane-count
    # problem is reported before partial output has been written.
    jobs = []
    for sample in sorted(sample_dict):
        filepaths = sample_dict[sample]
        read1_files = select_read_files(filepaths, '_R1_', number_of_lanes, sample)
        read2_files = select_read_files(filepaths, '_R2_', number_of_lanes, sample)
        print(read1_files)
        print(read2_files)
        jobs.append((sample, read1_files, read2_files))

    for sample, read1_files, read2_files in jobs:
        r1_output_file = output_directory + '/' + sample + '_R1.fastq'
        concatenate_gzipped_files(read1_files, r1_output_file)

        r2_output_file = output_directory + '/' + sample + '_R2.fastq'
        concatenate_gzipped_files(read2_files, r2_output_file)

        time.sleep(DELAY_BEFORE_COMPRESSION_SECONDS)

        # compress files
        submit_gzip_job(r1_output_file)
        submit_gzip_job(r2_output_file)


if __name__ == '__main__':
    main()
Computing file changes ...