# https://github.com/jvivian/one_off_scripts
# Raw File
# Tip revision: 3ad04be99cd01e6a047c1b530cc8a1de82bd862e authored by John Vivian on 02 February 2017, 01:31:18 UTC
# Refactor SRA pipeline to use faster method than fastq-dump
# Tip revision: 3ad04be
# transfer_beatAML_to_s3.py
import os
import tarfile

from toil_lib.urls import s3am_upload
from tqdm import tqdm

# Bundle the files of each sample found in `sample_dir` into one gzipped
# tarball, hand the tarball to `s3am_upload`, then delete the local copy.
# A manifest of all sample IDs is written to `home_dir` first.

# Working directory: receives the sample manifest and the temporary tarballs.
home_dir = '/pod/home/jvivian/beatAML-transfer/'
# Directory whose entries are listed and bundled per sample.
sample_dir = '/pod/pstore/projects/BeatAML/fastq-12_07_2016/'
# Destination passed as the second argument to s3am_upload.
s3_dir = 's3://cgl-beataml-data/'
# Key file path passed to s3am_upload as `s3_key_path`.
s3_key = '/pod/home/jvivian/master.key'

samples = os.listdir(sample_dir)

# Group file names by sample ID, i.e. everything before the first '_L00'.
# Grouping on the exact ID (rather than testing `startswith(sample_id)`)
# keeps a sample such as 'S1' from also collecting the files of 'S10'.
files_by_sample = {}
for file_name in samples:
    files_by_sample.setdefault(file_name.split('_L00')[0], []).append(file_name)

sample_ids = set(files_by_sample)

# Sorted so the manifest and the processing order are reproducible.
ordered_ids = sorted(sample_ids)

with open(os.path.join(home_dir, 'beatAML-samples'), 'w') as f:
    f.write('\n'.join(ordered_ids))

for sample_id in tqdm(ordered_ids):
    subset = [os.path.join(sample_dir, x) for x in files_by_sample[sample_id]]

    out_tar = os.path.join(home_dir, sample_id + '.tar.gz')
    try:
        with tarfile.open(out_tar, 'w:gz') as tar:
            for sample in subset:
                # Store members flat (by base name) instead of by absolute path.
                tar.add(sample, arcname=os.path.basename(sample))

        s3am_upload(out_tar, s3_dir, num_cores=6, s3_key_path=s3_key)
    finally:
        # Always drop the temporary tarball, even when archiving or the
        # upload fails, so failed runs do not fill up `home_dir`.
        if os.path.exists(out_tar):
            os.remove(out_tar)
# back to top