# -*- coding: utf-8 -*-
"""Converting .npy image files and metadata into HDF5

This script converts the baobab data into the HDF5 format.

Example
-------
To run this script, pass in the baobab out_dir path as the first argument and
the framework format as the second, e.g.::

    $ to_hdf5 out_data/tdlmc_train_EmpiricalBNNPrior_seed1113 --format 'tf'

The output file will be named `tdlmc_train_EmpiricalBNNPrior_seed1113.h5` and
can be found inside the directory provided as the first argument.

See the demo notebook `demo/Read_hdf5_file.ipynb` for instructions on how to
access the datasets in this file.

"""

import argparse
import os
import sys

import h5py
import numpy as np
import pandas as pd
from addict import Dict
from tqdm import tqdm


def parse_args():
    """Parse the command-line arguments.

    Returns
    -------
    argparse.Namespace
        Namespace with attributes `npy_dir` (directory holding the .npy files
        and `metadata.csv`) and `format` (either 'tf' or 'theano').

    """
    parser = argparse.ArgumentParser()
    parser.add_argument('npy_dir',
                        help='directory containing .npy files and metadata (path of out_dir in the baobab config)')
    parser.add_argument('--format',
                        default='tf',
                        dest='format',
                        type=str,
                        choices=['tf', 'theano'],
                        help='format of image. Default: tf.')
    # parse_args() reads sys.argv[1:] itself and either returns a Namespace or
    # exits, so no manual sys.argv rerouting is needed for the entry point.
    return parser.parse_args()


def _get_img_shape(img_format, n_x, n_y):
    """Return the single-channel image shape for the requested framework format."""
    if img_format == 'tf':
        return (n_x, n_y, 1)  # tf data shape (channel last)
    if img_format == 'theano':
        return (1, n_x, n_y)  # theano data shape (channel first)
    raise NotImplementedError("Unsupported image format: {!r}".format(img_format))


def main():
    """Convert a directory of .npy images and its metadata into one HDF5 file.

    Reads `metadata.csv` from `args.npy_dir`, stores each image listed in its
    `img_filename` column as a float32 dataset named `image_<idx>`, stores the
    per-pixel mean and standard deviation over all images as `pixels_mean` and
    `pixels_std`, and finally appends the metadata table under the key
    `metadata`. The output is written to `<npy_dir>/<basename of npy_dir>.h5`.

    Raises
    ------
    ValueError
        If `metadata.csv` lists no images.
    NotImplementedError
        If the requested image format is neither 'tf' nor 'theano'.

    """
    args = parse_args()
    baobab_out_dir = os.path.basename(os.path.normpath(args.npy_dir))
    save_path = os.path.join(args.npy_dir, '{:s}.h5'.format(baobab_out_dir))
    print("Destination path: {:s}".format(save_path))

    metadata_path = os.path.join(args.npy_dir, 'metadata.csv')
    metadata_df = pd.read_csv(metadata_path, index_col=None)
    img_path_list = metadata_df['img_filename'].values
    n_data = metadata_df.shape[0]
    if n_data == 0:
        # Without at least one image there is no shape to infer and the
        # statistics below would divide by zero.
        raise ValueError("No images listed in {:s}".format(metadata_path))

    # Infer the image dimensions from the first image
    first_img_filepath = os.path.join(args.npy_dir, img_path_list[0])
    first_img_shape = np.load(first_img_filepath).shape
    n_x, n_y = first_img_shape[-2], first_img_shape[-1]
    # Resolve the target shape before creating the output file so that an
    # unsupported format cannot leave a partially written file behind.
    img_shape = _get_img_shape(args.format, n_x, n_y)

    # Quantities required to compute the mean and std of images online
    mean = np.zeros(img_shape, np.float32)
    sum_sq = np.zeros(img_shape, np.float32)
    ddof = 0  # degree of freedom

    # The context manager closes the file even if loading/writing fails.
    with h5py.File(save_path, mode='w', driver=None) as hdf_file:
        hdf_file.create_dataset('pixels_mean', img_shape, np.float32)
        hdf_file.create_dataset('pixels_std', img_shape, np.float32)

        print("Saving images...")
        for idx, img_filename in enumerate(tqdm(img_path_list, total=n_data)):
            img_path = os.path.join(args.npy_dir, img_filename)
            # Reshaping to img_shape already yields the requested layout since
            # there is a single channel. The former `format=='theano'` check
            # compared the builtin `format` and never ran; applying rollaxis
            # on the reshaped image would have scrambled its axes.
            img = np.load(img_path).reshape(img_shape)

            # Populate images dataset
            dataset_name = 'image_{:d}'.format(idx)
            dataset = hdf_file.create_dataset(dataset_name, img_shape, np.float32)
            dataset[...] = img

            # Update running mean and sum of squared deviations
            # (Welford's algorithm)
            n_seen = idx + 1
            delta = img - mean
            mean += delta / n_seen
            sum_sq += delta * (img - mean)

        # Populate mean, std datasets
        hdf_file['pixels_mean'][...] = mean
        hdf_file['pixels_std'][...] = np.sqrt(sum_sq / (n_data - ddof))

    # Create dataset for metadata df. The h5py handle is closed before pandas
    # reopens the same path in append mode.
    metadata_df.to_hdf(save_path, key='metadata', mode='a', format='table')
    # TODO: serialize or subgroup each row so the whole dataframe isn't read into memory


if __name__ == '__main__':
    main()