{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os, sys\n",
    "import h5py\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Reading in HDF5 files\n",
    "__Author:__ Ji Won Park\n",
    "    \n",
    "__Created:__ 9/30/19\n",
    "    \n",
    "__Last run:__ 9/30/19\n",
    "\n",
    "__Goals:__\n",
    "Read in the contents of the HDF5 file written with the Baobab script, `to_hdf5.py`\n",
    "\n",
    "__Before running this notebook:__\n",
    "1. Generate some data. At the root of the `baobab` repo, run:\n",
    "```\n",
    "generate baobab/configs/tdlmc_diagonal_config.py --n_data 10\n",
    "```\n",
    "This generates 10 images with the `.npy` extension at the location this notebook expects.\n",
    "\n",
    "2. Save the `.npy` image files and the `metadata.csv` into an HDF5 file. At the root of the `baobab` repo, run:\n",
    "```\n",
    "to_hdf5 tdlmc_train_DiagonalBNNPrior_seed1113 --format 'tf'\n",
    "```\n",
    "This creates an HDF file at the path `tdlmc_train_DiagonalBNNPrior_seed1113/tdlmc_train_DiagonalBNNPrior_seed1113.h5` in the Tensorflow (`tf`) format, which places channels at the last dimension."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "hdf5_filepath = os.path.join('..',\n",
    "                             'tdlmc_train_DiagonalBNNPrior_seed1113', \n",
    "                             'tdlmc_train_DiagonalBNNPrior_seed1113.h5')\n",
    "hdf5_file = h5py.File(hdf5_filepath, 'r')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here are the datasets stored in this file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted(hdf5_file.keys())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The images can be accessed as follows. Note that the shape of the image is such that the channels go at the end, in the `tf` (TensorFlow) format as specified by the `--format` command-line argument."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "hdf5_file['image_0'].shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The pixel-wise mean and std across all the images can be accessed as follows."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "hdf5_file['pixels_mean'].shape, hdf5_file['pixels_std'].shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The metadata dataframe can be read in using the Pandas command `read_hdf5`, with the `start` and `stop` arguments specifying the rows and `columns` argument specifying the columns."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.read_hdf(hdf5_filepath, key='metadata', mode='r', \n",
    "            start=3, \n",
    "            stop=5, \n",
    "            columns=['lens_mass_theta_E', 'lens_mass_gamma'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python (baobab)",
   "language": "python",
   "name": "baobab"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}