Content - 2f634ce1745662fb159312ddb39b77fb875ba0f9 - 122a099/dataholders.py

dataholders.py
# Copyright 2016 James Hensman, Mark van der Wilk,
#                Valentine Svensson, alexggmatthews,
#                PabloLeon, fujiisoup
# Copyright 2017 Artem Artemev @awav
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import tensorflow as tf

from .. import misc
from ..core.errors import GPflowError
from ..core.compilable import Build
from .parameter import Parameter


class DataHolder(Parameter):
    """
    DataHolder is similar to the Parameter with only difference that
    default values for `fix_shape` and `trainable` options are opposite
    to the Parameter's and it does not have prior and transform options.

    By default shape of data holders is in floating mode and data holder
    does not provide a trainable option at all.

    :param value: Data input value. It can be a float, an integer,
        a float or integer like list, numpy array or TensorFlow variable.
    :param dtype: Type of new data holder.
    :param fix_shape: Default value is `False` and indicates that shape
        of internal tensor does not have specific shape, in other words,
        it is None.
    :param name: Name of the parameter.

    :raises: ValueError exception if value is not valid.
    """

    def __init__(self, value, dtype=None, fix_shape=False, name=None):
        self._dataholder_tensor = None
        super().__init__(value=value, name=name, dtype=dtype, fix_shape=fix_shape)

    @property
    def trainable(self):
        return False

    @property
    def parameter_tensor(self):
        return self._dataholder_tensor

    def set_trainable(self, value, graph=None):
        raise NotImplementedError('Data holder cannot be fixed.')

    def is_built(self, graph):
        if graph is None:
            raise ValueError('Graph is not specified.')
        if self.graph is not None:
            if self.graph is not graph:
                return Build.NOT_COMPATIBLE_GRAPH
            return Build.YES
        return Build.NO

    def as_pandas_table(self):
        column_names = ['class', 'shape', 'fixed_shape', 'value']
        column_values = [self.__class__.__name__, self.shape, self.fixed_shape, self.value]
        column_values = [[value] for value in column_values]
        df = misc.pretty_pandas_table([self.pathname], column_names, column_values)
        return df

    def _parameter_name(self):
        return misc.tensor_name(self.tf_pathname, 'dataholder')

    def _clear(self):
        self.reset_name()
        self._initial_value_tensor = None
        self._dataholder_tensor = None
        self._is_initialized_tensor = None

    def _build(self):
        tensor = self._build_parameter()
        self._dataholder_tensor = tensor
        self._is_initialized_tensor = tf.is_variable_initialized(tensor)

    def _init_parameter_defaults(self):
        self._initial_value_tensor = None
        self._dataholder_tensor = None
        self._is_initialized_tensor = None

    def _init_parameter_attributes(self, _prior, _transform, _trainable):
        pass

    def _set_parameter_attribute(self, attr, value):
        raise NotImplementedError('Data holder does not have parameter attributes.')

    def _read_parameter_tensor(self, session):
        return session.run(self._dataholder_tensor)

    def _apply_transform(self, value):
        return value

    def _set_parameter_tensor(self, tensor):
        self._dataholder_tensor = tensor

    def __setattr__(self, name, value):
        object.__setattr__(self, name, value)


class Minibatch(DataHolder):
    """
    Minibatch is a special case of data holders. As the name implies the minibatch
    object provides shuffling and endless batching mechanism for input data.
    Minibatch formes batches along zero axe of the input array.

    Minibatch is shape agnostic at zero axe. Once you created a minibatch you can
    vary size of the dataset, but feature shapes must be fixed.

    CAVEAT: Minibatch is not auto-initializable. It means that whenever you switch
    to another session, autoflow methods and optimizers will not be able to
    intialize TensorFlow dataset iterator. You have to call `intialize` method
    for Minibatch explicitly. Simple cases are not affected though.

    ```
    with tf.Session() as session1:
        mini = gpflow.Minibatch(data)

    with tf.Session() as session2:
        # mini.read_value(session=session2) # <<< fails.
        mini.initialize(session=session2)
        mini.read_value(session=session2) # <<< works fine.
    ```

    :param value: Numpy array.
    :param batch_size: Size of the batches.
    :param shuffle: If `True` then input data will be shuffled before batching.
    :param seed: Seed value for TensorFlow random generator.
    :param dtype: Type of new minibatch.
    :param name: Minibatch name.

    :raises: ValueError exception if input value is not a numpy array or a list.
    """

    def __init__(self, value, batch_size=1, shuffle=True,
                 seed=None, dtype=None, name=None):
        if not misc.is_valid_param_value(value) or misc.is_tensor(value):
            raise ValueError('The value must be either an array or a scalar.')

        super().__init__(value, name=name, dtype=dtype)

        self._batch_size = batch_size
        self._shuffle = shuffle
        self._seed = seed

    @property
    def batch_size(self):
        return self._batch_size

    @batch_size.setter
    def batch_size(self, value):
        return self.set_batch_size(value)

    @property
    def initializables(self):
        return [self._iterator_tensor]

    @property
    def initializable_feeds(self):
        if self._dataholder_tensor is None:
            return None
        return {self._cache_tensor: self._value,
                self._batch_size_tensor: self._batch_size}

    @property
    def seed(self):
        return self._seed

    @seed.setter
    def seed(self, seed):
        if self.graph is not None and self.is_built_coherence():
            raise GPflowError('Minibatch seed cannot be changed when it is built.')
        self._seed = seed

    def set_batch_size(self, size, session=None):
        self._batch_size = size
        session = self.enquire_session(session)
        if session is not None:
            self.initialize(session=session, force=True)

    def _clear(self):
        self.reset_name()
        self._cache_tensor = None
        self._batch_size_tensor = None
        self._dataholder_tensor = None
        self._iterator_tensor = None

    def _build(self):
        initial_tensor = self._build_placeholder_cache()
        self._cache_tensor = initial_tensor
        self._dataholder_tensor = self._build_dataholder(initial_tensor)

    def _build_placeholder_cache(self):
        value = self._value
        return tf.placeholder(dtype=value.dtype, shape=None, name='minibatch_init')

    def _build_dataholder(self, initial_tensor):
        if initial_tensor is None:
            raise GPflowError("Minibatch state corrupted.")
        data = tf.data.Dataset.from_tensor_slices(initial_tensor)
        data = data.repeat()
        if self._shuffle:
            shape = self._value.shape
            data = data.shuffle(buffer_size=shape[0], seed=self._seed)
        self._batch_size_tensor = tf.placeholder(tf.int64, shape=())
        data = data.batch(batch_size=self._batch_size_tensor)
        self._iterator_tensor = data.make_initializable_iterator()
        name = self._parameter_name()
        return self._iterator_tensor.get_next(name=name)

    def _init_parameter_defaults(self):
        self._cache_tensor = None
        self._batch_size_tensor = None
        self._dataholder_tensor = None
        self._iterator_tensor = None
        self._shuffle = True
        self._batch_size = 1
        self._seed = None

    def _parameter_name(self):
        name = 'minibatch'
        if self.parent is self:
            return misc.tensor_name(self.tf_pathname, name)
        return name