Content - e86e649c28b3aa14938663550dc5a069b00677c3 - add11de/bin/pycbc_hdf5_splitbank

pycbc_hdf5_splitbank
#!/usr/bin/env python

# Copyright (C) 2016  Soumi De
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
# Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

"""
The code reads in a compressed template bank and splits it up into
smaller banks where the number of smaller banks is a user input
"""

import argparse
import numpy
import h5py
import logging
import pycbc, pycbc.version
from pycbc.waveform import bank
from numpy import random

__author__  = "Soumi De <soumi.de@ligo.org>"

parser = argparse.ArgumentParser(description=__doc__[1:])
parser.add_argument("--version", action="version",
                  version=pycbc.version.git_verbose_msg)
parser.add_argument("--bank-file", type=str,
                    help="Bank hdf file to load.")
outbanks = parser.add_mutually_exclusive_group(required=True)
outbanks.add_argument("--templates-per-bank", type=int,
                      help="Number of templates in each output sub-banks. "
                      "Either specify this or --number-of-banks, not both.")
outbanks.add_argument("--number-of-banks", type=int,
                      help="Number of output sub-banks. Either specify this "
                      "or --templates-per-bank, not both.")
outbanks.add_argument("--output-filenames", nargs='*', default=None,
                      action="store",
                      help="Directly specify the names of the output files. "
                      "The bank will be split equally between files.")
parser.add_argument("--output-prefix", default=None,
                    help="Prefix to add to the output template bank names, "
                    "for example 'H1L1-BANK'. Output file names would then be "
                    "'H1L1-BANK{x}.hdf' where {x} is 1,2,...")
sortopt = parser.add_mutually_exclusive_group()
sortopt.add_argument("--mchirp-sort", action="store_true", default=False,
                    help="Sort templates by chirp mass before splitting")
sortopt.add_argument("--random-sort", action="store_true", default=False,
                    help="Sort templates randomly before splitting")
parser.add_argument("--random-seed", type=int,
                    help="Random seed for --random-sort")
parser.add_argument("--force", action="store_true", default=False,
                    help="Overwrite the given hdf file if it exists. "
                         "Otherwise, an error is raised.")
parser.add_argument("--verbose", action="store_true", default=False)


args = parser.parse_args()

# input checks
if args.mchirp_sort and (args.random_seed is not None):
    raise RuntimeError("Can't supply a random seed if not sorting randomly!")

if args.output_filenames is None and args.output_prefix is None:
    raise RuntimeError("Must specify either output filenames or a prefix!")

if args.output_filenames and args.output_prefix:
    raise RuntimeError("Can't specify both output filenames and a prefix")

pycbc.init_logging(args.verbose)
logging.info("Loading bank")

tmplt_bank = bank.TemplateBank(args.bank_file)

templates = tmplt_bank.table

if args.random_sort:
    if args.random_seed is not None:
        random.seed(args.random_seed)
    idx = numpy.arange(templates.size)
    numpy.random.shuffle(idx)
    templates = templates[idx]
    tmplt_bank.table = templates

if args.mchirp_sort:
    from pycbc import pnutils
    mcsort = numpy.argsort(templates.mchirp)
    templates = templates[mcsort]
    tmplt_bank.table = templates

# Split the templates in the bank taken as input into the smaller banks

# If an array of filenames
if args.output_filenames:
    args.number_of_banks = len(args.output_filenames)

# If the number of output banks is taken as input calculate the number
# of templates to be stored per bank
if args.number_of_banks:
    num_files = args.number_of_banks
    num_per_file = int(templates[:].size/num_files)

# If the number of templates per bank is taken as input calculate the
# number of output banks
elif args.templates_per_bank:
    num_per_file = args.templates_per_bank
    num_files = int(templates[:].size / num_per_file)

# Generate sub-banks
logging.info("Generating the output sub-banks")
for ii in range(num_files):
    start_idx = ii * num_per_file
    # The output banks are assigned a fixed length equal to the number
    # of templates per bank requested by the user or calculated earlier
    # in the code except for the last bank in which the remaining
    # templates, if any, are put.
    if ( ii == (num_files-1)):
        end_idx = templates[:].size
    else:
        end_idx = (ii + 1) * num_per_file

    # Assign a name to the h5py output file to store the ii'th smaller bank
    if args.output_filenames:
        outname = args.output_filenames[ii]
    elif args.output_prefix:
        outname = args.output_prefix + str(ii) + '.hdf'
    else:
        raise RuntimeError("I shouldn't be able to reach this point. One out "
                           "of --output-filenames and --output-prefix must "
                           "have been supplied!")

    # Generate the hdf5 output file for the ii'th sub-bank, which would
    # be a slice of the input template bank having a start index and
    # end index as calculated above
    output = tmplt_bank.write_to_hdf(outname, start_idx, end_idx,
                                     force=args.force)
    output.close()

logging.info("finished")