https://github.com/jvivian/one_off_scripts
Raw File
Tip revision: 3ad04be99cd01e6a047c1b530cc8a1de82bd862e authored by John Vivian on 02 February 2017, 01:31:18 UTC
Refactor SRA pipeline to use faster method than fastq-dump
Tip revision: 3ad04be
upload_to_ceph.py
#!/usr/bin/env python2.7
import boto
import boto.s3.connection
from boto.s3 import connect_to_region
import math
import os
import sys


def upload_file(s3, bucketname, keyname, file_path):

        b = s3.get_bucket(bucketname)

        k = b.new_key(keyname)
        if not k.exists():

            mp = b.initiate_multipart_upload(keyname)

            source_size = os.stat(file_path).st_size
            bytes_per_chunk = 5000*1024*1024
            chunks_count = int(math.ceil(source_size / float(bytes_per_chunk)))

            for i in range(chunks_count):
                offset = i * bytes_per_chunk
                remaining_bytes = source_size - offset
                bytes = min([bytes_per_chunk, remaining_bytes])
                part_num = i + 1

                print "uploading part " + str(part_num) + " of " + str(chunks_count)

                with open(file_path, 'r') as fp:
                        fp.seek(offset)
                        mp.upload_part_from_file(fp=fp, part_num=part_num, size=bytes)

            if len(mp.get_all_parts()) == chunks_count:
                    mp.complete_upload()
                    print "upload_file done"
            else:
                    mp.cancel_upload()
                    print "upload_file failed"



host = 'ceph-gw-01.pod'
s3 = boto.connect_s3(host=host,
                     is_secure=False,
                     calling_format=boto.s3.connection.OrdinaryCallingFormat(),)


directory = sys.argv[1]
# bucket = conn.get_bucket('cgl-driver-projects')
files = [os.path.join(directory, f) for f in os.listdir(directory)]
failures = []
for f in files:
    retry = 3
    # k = bucket.new_key('ispy/' + os.path.basename(f))
    # if not k.exists():
    for i in xrange(retry):
        try:
            print 'uploading: {}\tTry: {}'.format(f, i+1)
            upload_file(s3, 'cgl-pipeline-inputs', 'rnaseq-cgl/' + os.path.basename(f), f)
            break
        except Exception as e:
            if i == 2:
                print '{}: Permamently failed to upload'.format(f)
                print e.message, e.args
                failures.append(f)
            pass

if failures:
    print '\n=====Failures====='
    print '\n'.join(failures)
back to top