# Copyright 2016 James Hensman, Mark van der Wilk, Valentine Svensson, alexggmatthews, fujiisoup
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import abc
from typing import Optional, Tuple

import tensorflow as tf

from ..base import Module
from ..kernels import Kernel
from ..likelihoods import Likelihood
from ..mean_functions import MeanFunction, Zero
from ..config import default_float, default_jitter

MeanAndVariance = Tuple[tf.Tensor, tf.Tensor]


class BayesianModel(Module):
    """Bayesian model."""

    def neg_log_marginal_likelihood(self, *args, **kwargs) -> tf.Tensor:
        return -(self.log_likelihood(*args, **kwargs) + self.log_prior())

    def log_prior(self) -> tf.Tensor:
        # Guard against calling tf.add_n on an empty list when there are no
        # trainable parameters to contribute a prior term.
        if len(self.trainable_parameters) == 0:
            return tf.convert_to_tensor(0., dtype=default_float())
        return tf.add_n([p.log_prior() for p in self.trainable_parameters])

    @abc.abstractmethod
    def log_likelihood(self, *args, **kwargs) -> tf.Tensor:
        pass


class GPModel(BayesianModel):
    r"""
    A stateless base class for Gaussian process models, that is, those of the form

    .. math::
       :nowrap:

       \begin{align}
           \theta        & \sim p(\theta) \\
           f             & \sim \mathcal{GP}(m(x), k(x, x'; \theta)) \\
           f_i           & = f(x_i) \\
           y_i \,|\, f_i & \sim p(y_i | f_i)
       \end{align}

    This class mostly adds functionality for prediction. To use it, inheriting
    classes must define a predict_f method, which computes the means and
    variances of the latent function; its role is analogous to that of
    log_likelihood in the BayesianModel class.

    These predictions are then pushed through the likelihood to obtain means
    and variances of held-out data, self.predict_y.

    The predictions can also be used to compute the (log) density of held-out
    data via self.predict_log_density.
    """

    def __init__(self,
                 kernel: Kernel,
                 likelihood: Likelihood,
                 mean_function: Optional[MeanFunction] = None,
                 num_latent: int = 1):
        super().__init__()
        self.num_latent = num_latent
        # TODO(@awav): Why is this here when MeanFunction does not have a __len__ method
        if mean_function is None:
            mean_function = Zero()
        self.mean_function = mean_function
        self.kernel = kernel
        self.likelihood = likelihood

    @abc.abstractmethod
    def predict_f(self, X: tf.Tensor, full_cov=False, full_output_cov=False) -> MeanAndVariance:
        pass

    def predict_f_samples(self, X, num_samples):
        """
        Produce samples from the posterior latent function(s) at the points X.
        """
        mu, var = self.predict_f(X, full_cov=True)  # var: [P, N, N]
        jitter = tf.eye(tf.shape(mu)[0], dtype=default_float()) * default_jitter()
        samples = [None] * self.num_latent
        for i in range(self.num_latent):
            L = tf.linalg.cholesky(var[i, ...] + jitter)
            # Use the dynamic shape so this also works inside tf.function.
            shape = tf.stack([tf.shape(L)[0], num_samples])
            V = tf.random.normal(shape, dtype=L.dtype)
            samples[i] = mu[:, i:(i + 1)] + L @ V
        return tf.transpose(tf.stack(samples))  # [num_samples, N, P]

    def predict_y(self, X):
        """
        Compute the mean and variance of held-out data at the points X.
        """
        f_mean, f_var = self.predict_f(X)
        return self.likelihood.predict_mean_and_var(f_mean, f_var)

    def predict_log_density(self, X, Y):
        """
        Compute the (log) density of the data Y at the points X.

        Note that this computes the log density of the data points
        individually, ignoring correlations between them. The result is a
        matrix the same shape as Y containing the log densities.
        """
        f_mean, f_var = self.predict_f(X)
        return self.likelihood.predict_density(f_mean, f_var, Y)


class GPModelOLD(BayesianModel):
    r"""
    A base class for Gaussian process models, that is, those of the form

    .. math::
       :nowrap:

       \begin{align}
           \theta        & \sim p(\theta) \\
           f             & \sim \mathcal{GP}(m(x), k(x, x'; \theta)) \\
           f_i           & = f(x_i) \\
           y_i \,|\, f_i & \sim p(y_i | f_i)
       \end{align}

    This class mostly adds functionality to compile predictions. To use it,
    inheriting classes must define a build_predict function, which computes
    the means and variances of the latent function. This gets compiled
    similarly to build_likelihood in the Model class.

    These predictions are then pushed through the likelihood to obtain means
    and variances of held-out data, self.predict_y.

    The predictions can also be used to compute the (log) density of held-out
    data via self.predict_log_density.

    To handle other data (Xnew, Ynew), assign the new values to self.X and
    self.Y:

    >>> m.X = Xnew
    >>> m.Y = Ynew
    """

    def __init__(self,
                 X: tf.Tensor,
                 Y: tf.Tensor,
                 kernel: Kernel,
                 likelihood: Likelihood,
                 mean_function: Optional[MeanFunction] = None,
                 num_latent: Optional[int] = None,
                 seed: Optional[int] = None):
        super().__init__()
        self.X = X
        self.Y = Y
        # Default to one latent function per output column when not given.
        self.num_latent = num_latent or Y.shape[1]
        # TODO(@awav): Why is this here when MeanFunction does not have a __len__ method
        if mean_function is None:
            mean_function = Zero()
        self.mean_function = mean_function
        self.kernel = kernel
        self.likelihood = likelihood

    @abc.abstractmethod
    def predict_f(self, X: tf.Tensor, full=False, full_output_cov=False) -> MeanAndVariance:
        pass

    def predict_f_samples(self, X, num_samples):
        """
        Produce samples from the posterior latent function(s) at the points X.
        """
        mu, var = self.predict_f(X, full=True)  # var: [P, N, N]
        jitter = tf.eye(tf.shape(mu)[0], dtype=default_float()) * default_jitter()
        samples = [None] * self.num_latent
        for i in range(self.num_latent):
            L = tf.linalg.cholesky(var[i, ...] + jitter)
            shape = tf.stack([tf.shape(L)[0], num_samples])
            V = tf.random.normal(shape, dtype=L.dtype)
            samples[i] = mu[:, i:(i + 1)] + L @ V
        return tf.transpose(tf.stack(samples))  # [num_samples, N, P]

    def predict_y(self, X):
        """
        Compute the mean and variance of held-out data at the points X.
        """
        f_mean, f_var = self.predict_f(X)
        return self.likelihood.predict_mean_and_var(f_mean, f_var)

    def predict_log_density(self, X, Y):
        """
        Compute the (log) density of the data Y at the points X.

        Note that this computes the log density of the data points
        individually, ignoring correlations between them. The result is a
        matrix the same shape as Y containing the log densities.
        """
        f_mean, f_var = self.predict_f(X)
        return self.likelihood.predict_density(f_mean, f_var, Y)
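
# ----------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of this module's API). It assumes
# a concrete subclass instance `model` that implements `log_likelihood` and
# `predict_f`, plus held-out data `Xnew`/`Ynew`; all of these names are
# hypothetical. Given such a model, the base-class helpers compose as follows:
#
#     f_mean, f_var = model.predict_f(Xnew)             # latent mean and variance
#     y_mean, y_var = model.predict_y(Xnew)             # pushed through the likelihood
#     samples = model.predict_f_samples(Xnew, 10)       # [10, N, P] posterior draws
#     log_dens = model.predict_log_density(Xnew, Ynew)  # pointwise log densities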