#LyX 2.2 created this file. For more info see http://www.lyx.org/
\lyxformat 508
\begin_document
\begin_header
\save_transient_properties true
\origin unavailable
\textclass article
\use_default_options true
\maintain_unincluded_children false
\language english
\language_package default
\inputencoding auto
\fontencoding global
\font_roman "default" "default"
\font_sans "default" "default"
\font_typewriter "default" "default"
\font_math "auto" "auto"
\font_default_family default
\use_non_tex_fonts false
\font_sc false
\font_osf false
\font_sf_scale 100 100
\font_tt_scale 100 100
\graphics default
\default_output_format default
\output_sync 0
\bibtex_command default
\index_command default
\paperfontsize default
\use_hyperref false
\papersize default
\use_geometry false
\use_package amsmath 1
\use_package amssymb 1
\use_package cancel 1
\use_package esint 1
\use_package mathdots 1
\use_package mathtools 1
\use_package mhchem 1
\use_package stackrel 1
\use_package stmaryrd 1
\use_package undertilde 1
\cite_engine basic
\cite_engine_type default
\biblio_style plain
\use_bibtopic false
\use_indices false
\paperorientation portrait
\suppress_date false
\justification true
\use_refstyle 1
\index Index
\shortcut idx
\color #008000
\end_index
\secnumdepth 3
\tocdepth 3
\paragraph_separation indent
\paragraph_indentation default
\quotes_language english
\papercolumns 1
\papersides 1
\paperpagestyle default
\tracking_changes false
\output_changes false
\html_math_output 0
\html_css_as_file 0
\html_be_strict false
\end_header

\begin_body

\begin_layout Standard
\begin_inset FormulaMacro
\newcommand{\mbe}{\mathbb{E}}
{\mathbb{E}}
\end_inset


\end_layout

\begin_layout Standard
\begin_inset FormulaMacro
\newcommand{\cov}{\mathrm{Cov}}
{\mathrm{Cov}}
\end_inset


\end_layout

\begin_layout Standard
Suppose we have posterior 
\begin_inset Formula $p\left(\theta\vert x\right)$
\end_inset

 based on a tractable likelihood 
\begin_inset Formula $p\left(x\vert\theta\right)$
\end_inset

 and prior 
\begin_inset Formula $p\left(\theta\right)$
\end_inset

 with
\begin_inset Formula 
\begin{align*}
\log p\left(x\vert\theta\right) & =\sum_{n}\log p\left(x_{n}\vert\theta\right)
\end{align*}

\end_inset


\end_layout

\begin_layout Standard
Define weights 
\begin_inset Formula $w$
\end_inset

 such that
\begin_inset Formula 
\begin{align*}
\log p\left(x\vert\theta,w\right) & =\sum_{n}w_{n}\log p\left(x_{n}\vert\theta\right)
\end{align*}

\end_inset


\end_layout

\begin_layout Standard
For simplicity, assume WLOG that 
\begin_inset Formula $\mbe\left[\theta\right]=0$
\end_inset

.
 The sensitivity of 
\begin_inset Formula $\mbe\left[\theta\right]$
\end_inset

 to inclusion of datapoint 
\begin_inset Formula $n$
\end_inset

 is
\begin_inset Formula 
\begin{align*}
S_{n}=\frac{d\mbe_{p\left(\theta\vert x\right)}\left[\theta\right]}{dw_{n}} & =\cov_{p\left(\theta\vert x\right)}\left(\theta,\log p\left(x_{n}\vert\theta\right)\right)=\mbe_{p\left(\theta\vert x\right)}\left[\left(\theta-\mbe_{p\left(\theta\vert x\right)}\left[\theta\right]\right)\log p\left(x_{n}\vert\theta\right)\right]
\end{align*}

\end_inset


\end_layout
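
\begin_layout Standard
As a minimal Monte Carlo sketch (the names theta_draws and log_lik_n are
 hypothetical), this covariance can be estimated from posterior draws:
\end_layout

\begin_layout Standard
\begin_inset listings
lstparams "language=Python"
inline false
status open

\begin_layout Plain Layout

import numpy as np
\end_layout

\begin_layout Plain Layout

def sensitivity(theta_draws, log_lik_n):
\end_layout

\begin_layout Plain Layout

    # theta_draws: (B, D) posterior draws; log_lik_n: (B,) values of
\end_layout

\begin_layout Plain Layout

    # log p(x_n | theta_b).  Estimates S_n = Cov(theta, log p(x_n | theta)).
\end_layout

\begin_layout Plain Layout

    theta_c = theta_draws - theta_draws.mean(axis=0)
\end_layout

\begin_layout Plain Layout

    ll_c = log_lik_n - log_lik_n.mean()
\end_layout

\begin_layout Plain Layout

    return theta_c.T @ ll_c / (len(ll_c) - 1)
\end_layout

\end_inset


\end_layout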

\begin_layout Standard
Draw 
\begin_inset Formula $\theta_{b}\sim p\left(\theta\vert x\right)$
\end_inset

.
 Suppose we draw a new sample, 
\begin_inset Formula $x_{b}\sim p\left(x\vert\theta_{b}\right)$
\end_inset

.
 Then the sensitivity of the new dataset is
\begin_inset Formula 
\begin{align*}
S_{bn} & =\mbe_{p\left(\theta\vert x_{b}\right)}\left[\left(\theta-\mbe_{p\left(\theta\vert x_{b}\right)}\left[\theta\right]\right)\log p\left(x_{bn}\vert\theta\right)\right]
\end{align*}

\end_inset


\end_layout

\begin_layout Standard
We can write all this out as a function of 
\begin_inset Formula $w$
\end_inset

.
\begin_inset Formula 
\begin{align*}
S\left(w\right)=\frac{d\mbe_{p\left(\theta\vert x,w\right)}\left[\theta\right]}{dw} & =\cov_{p\left(\theta\vert x,w\right)}\left(\theta,\frac{\partial}{\partial w}\log p\left(x\vert\theta,w\right)\right)\\
 & =\mbe_{p\left(\theta\vert x,w\right)}\left[\left(\theta-\mbe_{p\left(\theta\vert x,w\right)}\left[\theta\right]\right)\frac{\partial}{\partial w^{T}}\log p\left(x\vert\theta,w\right)\right]\\
 & =\mbe_{p\left(\theta\vert x,w\right)}\left[\theta\frac{\partial}{\partial w^{T}}\log p\left(x\vert\theta,w\right)\right]-\mbe_{p\left(\theta\vert x,w\right)}\left[\theta\right]\mbe_{p\left(\theta\vert x,w\right)}\left[\frac{\partial}{\partial w^{T}}\log p\left(x\vert\theta,w\right)\right]
\end{align*}

\end_inset


\end_layout

\begin_layout Standard
For weights other than 
\begin_inset Formula $w=1$
\end_inset

, it may be difficult to evaluate 
\begin_inset Formula $p\left(\theta\vert x,w\right)$
\end_inset

.
 So we evaluate the second derivative:
\begin_inset Formula 
\begin{align*}
\frac{d}{dw^{T}}\mbe_{p\left(\theta\vert x,w\right)}\left[\frac{\partial}{\partial w}\log p\left(x\vert\theta,w\right)\right] & =\cov_{p\left(\theta\vert x,w\right)}\left(\frac{\partial}{\partial w}\log p\left(x\vert\theta,w\right),\frac{\partial}{\partial w}\log p\left(x\vert\theta,w\right)\right)\\
\frac{d}{dw^{T}}\mbe_{p\left(\theta\vert x,w\right)}\left[\theta\frac{\partial}{\partial w^{T}}\log p\left(x\vert\theta,w\right)\right] & =\cov_{p\left(\theta\vert x,w\right)}\left(\theta\frac{\partial}{\partial w^{T}}\log p\left(x\vert\theta,w\right),\frac{\partial}{\partial w}\log p\left(x\vert\theta,w\right)\right)+\\
 & \quad\mbe_{p\left(\theta\vert x,w\right)}\left[\theta\frac{\partial^{2}}{\partial w\partial w^{T}}\log p\left(x\vert\theta,w\right)\right]\\
 & =\cov_{p\left(\theta\vert x,w\right)}\left(\theta\frac{\partial}{\partial w^{T}}\log p\left(x\vert\theta,w\right),\frac{\partial}{\partial w}\log p\left(x\vert\theta,w\right)\right)
\end{align*}

\end_inset


\end_layout

\begin_layout Standard
The second-derivative term vanishes because the weighted log likelihood
 is linear in 
\begin_inset Formula $w$
\end_inset

.
 It may be convenient at this point to denote
\begin_inset Formula 
\begin{align*}
\ell_{w} & :=\frac{\partial}{\partial w}\log p\left(x\vert\theta,w\right),\qquad\ell_{w,n}=\log p\left(x_{n}\vert\theta\right)
\end{align*}

\end_inset


\end_layout

\begin_layout Standard
Then
\begin_inset Formula 
\begin{align*}
\frac{dS\left(w\right)}{dw} & =\cov\left(\theta\ell_{w},\ell_{w}\right)-\mbe\left[\theta\right]\cov\left(\ell_{w},\ell_{w}\right)-\mbe\left[\ell_{w}\right]\cov\left(\theta,\ell_{w}\right)
\end{align*}

\end_inset


\end_layout

\begin_layout Standard
It follows that the sensitivity at another weight is roughly
\begin_inset Formula 
\begin{align*}
S\left(w\right) & \approx S\left(1\right)+\left.\frac{dS\left(w\right)}{dw^{T}}\right|_{w=1}\left(w-1\right)
\end{align*}

\end_inset


\end_layout
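
\begin_layout Standard
Both the derivative and the linearization above can be assembled from posterior
 draws.
 A sketch for scalar 
\begin_inset Formula $\theta$
\end_inset

, where ell is the B-by-N matrix of 
\begin_inset Formula $\log p\left(x_{n}\vert\theta_{b}\right)$
\end_inset

 values (all names hypothetical):
\end_layout

\begin_layout Standard
\begin_inset listings
lstparams "language=Python"
inline false
status open

\begin_layout Plain Layout

import numpy as np
\end_layout

\begin_layout Plain Layout

def dS_dw(theta, ell):
\end_layout

\begin_layout Plain Layout

    # theta: (B,) draws; ell: (B, N) with ell[b, n] = log p(x_n | theta_b).
\end_layout

\begin_layout Plain Layout

    # Entry (i, j) estimates Cov(theta ell_i, ell_j)
\end_layout

\begin_layout Plain Layout

    #   - E[theta] Cov(ell_i, ell_j) - Cov(theta, ell_j) E[ell_i].
\end_layout

\begin_layout Plain Layout

    B = len(theta)
\end_layout

\begin_layout Plain Layout

    def cov(a, b):  # sample cross-covariance between columns of a and b
\end_layout

\begin_layout Plain Layout

        return (a - a.mean(0)).T @ (b - b.mean(0)) / (B - 1)
\end_layout

\begin_layout Plain Layout

    t_ell = theta[:, None] * ell
\end_layout

\begin_layout Plain Layout

    cov_theta_ell = cov(theta[:, None], ell).ravel()
\end_layout

\begin_layout Plain Layout

    return (cov(t_ell, ell) - theta.mean() * cov(ell, ell)
\end_layout

\begin_layout Plain Layout

            - np.outer(ell.mean(0), cov_theta_ell))
\end_layout

\end_inset


\end_layout

\begin_layout Standard
The linearization is then S1 + dS_dw(theta, ell) @ (w - 1), where S1 is
 the sensitivity at 
\begin_inset Formula $w=1$
\end_inset

 estimated as in the earlier covariance sketch.
\end_layout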

\begin_layout Standard
Note that you can construct the weights for a posterior predictive with
 importance sampling.
 For 
\begin_inset Formula $\theta_{b}\sim p\left(\theta\vert x,w=1\right)$
\end_inset

, let 
\begin_inset Formula 
\begin{align*}
w_{n} & =\exp\left(\log p\left(x_{n}\vert\theta_{b}\right)\right)\\
\exp\left(z\right) & =1+z+\frac{1}{2!}z^{2}+...\Rightarrow\\
w_{n}-1 & \approx\log p\left(x_{n}\vert\theta_{b}\right)=\ell_{w,n}
\end{align*}

\end_inset


\end_layout

\begin_layout Standard
suggesting
\begin_inset Formula 
\begin{align*}
S\left(w_{b}\right)-S\left(1\right) & \approx\cov\left(\theta\ell_{w},\ell_{w}\right)\ell_{w}-\mbe\left[\theta\right]\cov\left(\ell_{w},\ell_{w}\right)\ell_{w}-\mbe\left[\ell_{w}\right]\cov\left(\theta,\ell_{w}\right)\ell_{w}
\end{align*}

\end_inset


\end_layout

\begin_layout Standard
or in expectation given 
\begin_inset Formula $x$
\end_inset

,
\begin_inset Formula 
\begin{align*}
\mbe\left[S\left(w_{b}\right)-S\left(1\right)\vert x\right] & =\cov\left(\theta\ell_{w},\ell_{w}\right)\mbe\left[\ell_{w}\right]-\mbe\left[\theta\right]\cov\left(\ell_{w},\ell_{w}\right)\mbe\left[\ell_{w}\right]-\mbe\left[\ell_{w}\right]\cov\left(\theta,\ell_{w}\right)\mbe\left[\ell_{w}\right]
\end{align*}

\end_inset


\end_layout

\begin_layout Section
Weights
\end_layout

\begin_layout Standard
It occurs to me to ask whether the posterior on a weighted dataset is
 the same as the posterior on a dataset drawn with the corresponding weights.
 At first it seems not: they are simply different things.
 But if a 
\begin_inset Formula $w$
\end_inset

-weighted version of 
\begin_inset Formula $\tilde{x}$
\end_inset

 approximates 
\begin_inset Formula $x$
\end_inset

, then
\begin_inset Formula 
\begin{align*}
\mbe\left[g\left(\theta\right)\vert x\right] & =\frac{\int g\left(\theta\right)p\left(x\vert\theta\right)p\left(\theta\right)d\theta}{\int p\left(x\vert\theta\right)p\left(\theta\right)d\theta}\\
 & =\frac{\int g\left(\theta\right)\prod_{n}p\left(x_{n}\vert\theta\right)p\left(\theta\right)d\theta}{\int\prod_{n}p\left(x_{n}\vert\theta\right)p\left(\theta\right)d\theta}\\
 & \stackrel{?}{\approx}\frac{\int g\left(\theta\right)\prod_{n}p\left(\tilde{x}_{n}\vert\theta\right)^{w_{n}}p\left(\theta\right)d\theta}{\int\prod_{n}p\left(\tilde{x}_{n}\vert\theta\right)^{w_{n}}p\left(\theta\right)d\theta}
\end{align*}

\end_inset


\end_layout

\begin_layout Standard
And since
\begin_inset Formula 
\begin{align*}
\int g\left(\theta\right)\prod_{n}p\left(\tilde{x}_{n}\vert\theta\right)^{w_{n}}p\left(\theta\right)d\theta & =\int g\left(\theta\right)\exp\left(\sum_{n}w_{n}\log p\left(\tilde{x}_{n}\vert\theta\right)\right)p\left(\theta\right)d\theta\\
 & \approx\int g\left(\theta\right)\exp\left(\sum_{n}\log p\left(x_{n}\vert\theta\right)\right)p\left(\theta\right)d\theta\\
 & =\int g\left(\theta\right)p\left(x\vert\theta\right)p\left(\theta\right)d\theta
\end{align*}

\end_inset


\end_layout

\begin_layout Standard
I think it's ok.
 In other words, every posterior is a functional of an expectation over
 the data 
\begin_inset Formula $x$
\end_inset

.
\end_layout
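
\begin_layout Standard
A small numeric illustration (a sketch with made-up data, not an example
 from above): for 
\begin_inset Formula $x_{n}\sim N\left(\theta,1\right)$
\end_inset

 with a flat prior, the posterior depends on the weighted data only through
 
\begin_inset Formula $\sum_{n}w_{n}$
\end_inset

 and 
\begin_inset Formula $\sum_{n}w_{n}\tilde{x}_{n}$
\end_inset

, so a weighted subsample matching those sums reproduces the full-data
 posterior.
\end_layout

\begin_layout Standard
\begin_inset listings
lstparams "language=Python"
inline false
status open

\begin_layout Plain Layout

import numpy as np
\end_layout

\begin_layout Plain Layout

# x_n ~ N(theta, 1), flat prior: the posterior is
\end_layout

\begin_layout Plain Layout

# N(sum(w * x) / sum(w), 1 / sum(w)).
\end_layout

\begin_layout Plain Layout

rng = np.random.default_rng(0)
\end_layout

\begin_layout Plain Layout

x = rng.normal(1.0, 1.0, size=1000)
\end_layout

\begin_layout Plain Layout

x_sub = x[:100]               # a subsample standing in for x-tilde
\end_layout

\begin_layout Plain Layout

w = np.full(100, 10.0)        # crude weights with sum(w) = len(x)
\end_layout

\begin_layout Plain Layout

print(x.mean())                     # full-data posterior mean
\end_layout

\begin_layout Plain Layout

print((w * x_sub).sum() / w.sum())  # close; equal only if the sums match
\end_layout

\end_inset


\end_layout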

\begin_layout Section
Posterior predictive
\end_layout

\begin_layout Standard
Draw a posterior predictive check from a VB approximation.
 It will fail.
 Do the leverage scores tell you something useful?
\end_layout

\begin_layout Section
Predicting data
\end_layout

\begin_layout Standard
You can ask what the expected predicted values of sufficient statistics
 are.
 For example,
\begin_inset Formula 
\begin{align*}
\mbe\left[x_{b}\vert x\right] & =\iint x_{b}\,p\left(x_{b}\vert\theta\right)p\left(\theta\vert x\right)dx_{b}\,d\theta\\
 & \approx\frac{1}{N}\sum_{n}\mbe\left[w_{nb}\vert x\right]x_{n}\\
 & \approx\frac{1}{NB}\sum_{n}\sum_{b}w_{nb}x_{n}
\end{align*}

\end_inset


\end_layout
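
\begin_layout Standard
A sketch of the last line (hypothetical names; log_lik[b, n] holds 
\begin_inset Formula $\log p\left(x_{n}\vert\theta_{b}\right)$
\end_inset

 for posterior draws 
\begin_inset Formula $\theta_{b}$
\end_inset

):
\end_layout

\begin_layout Standard
\begin_inset listings
lstparams "language=Python"
inline false
status open

\begin_layout Plain Layout

import numpy as np
\end_layout

\begin_layout Plain Layout

def predict_mean(x, log_lik):
\end_layout

\begin_layout Plain Layout

    # Implements (1 / (N B)) sum_n sum_b w_nb x_n with
\end_layout

\begin_layout Plain Layout

    # w_nb = exp(log p(x_n | theta_b)).
\end_layout

\begin_layout Plain Layout

    w = np.exp(log_lik)              # (B, N) weights
\end_layout

\begin_layout Plain Layout

    return (w * x[None, :]).sum() / w.size
\end_layout

\end_inset


\end_layout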

\begin_layout Standard
What should it be in exponential families?
\end_layout

\begin_layout Section
Influence function with data
\end_layout

\begin_layout Standard
Suppose we add a data point at 
\begin_inset Formula $x_{0}$
\end_inset

.
 Then
\begin_inset Formula 
\begin{align*}
\frac{d\mbe_{p\left(\theta\vert x,w,x_{0}\right)}\left[\theta\right]}{dw_{0}} & =\cov_{p\left(\theta\vert x\right)}\left(\theta,\log p\left(x_{0}\vert\theta\right)\right)\\
 & =\int p\left(\theta\vert x\right)\left(\theta-\mbe\left[\theta\right]\right)\log p\left(x_{0}\vert\theta\right)d\theta
\end{align*}

\end_inset


\end_layout
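
\begin_layout Standard
A sketch sweeping the location of the added point in a unit-variance normal
 model (the stand-in posterior draws and locations are made up):
\end_layout

\begin_layout Standard
\begin_inset listings
lstparams "language=Python"
inline false
status open

\begin_layout Plain Layout

import numpy as np
\end_layout

\begin_layout Plain Layout

from scipy.stats import norm
\end_layout

\begin_layout Plain Layout

# For x ~ N(theta, 1), estimate Cov(theta, log p(x0 | theta)) from
\end_layout

\begin_layout Plain Layout

# posterior draws theta_b as the added location x0 varies.
\end_layout

\begin_layout Plain Layout

rng = np.random.default_rng(0)
\end_layout

\begin_layout Plain Layout

theta_draws = rng.normal(0.0, 0.1, size=5000)  # stand-in posterior draws
\end_layout

\begin_layout Plain Layout

for x0 in [-2.0, 0.0, 2.0]:
\end_layout

\begin_layout Plain Layout

    log_lik_x0 = norm.logpdf(x0, loc=theta_draws, scale=1.0)
\end_layout

\begin_layout Plain Layout

    print(x0, np.cov(theta_draws, log_lik_x0)[0, 1])
\end_layout

\end_inset


\end_layout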

\begin_layout Standard
Suppose you add a vector of datapoints at locations 
\begin_inset Formula $x_{0}$
\end_inset

 with weights 
\begin_inset Formula $\alpha w$
\end_inset

.
 Then
\begin_inset Formula 
\begin{align*}
\frac{d\mbe_{p\left(\theta\vert x,w,x_{0}\right)}\left[\theta\right]}{d\alpha} & =\sum_{n}\int w_{n}p\left(\theta\vert x\right)\left(\theta-\mbe\left[\theta\right]\right)\log p\left(x_{0n}\vert\theta\right)d\theta
\end{align*}

\end_inset


\end_layout

\begin_layout Standard
It follows that the influence function for adding data with distribution
 
\begin_inset Formula $g\left(x_{0}\right)$
\end_inset

 is
\begin_inset Formula 
\begin{align*}
I\left(x_{0}\right) & =\int p\left(\theta\vert x\right)\left(\theta-\mbe\left[\theta\right]\right)\log p\left(x_{0}\vert\theta\right)d\theta
\end{align*}

\end_inset


\end_layout

\begin_layout Standard
The only dependence on 
\begin_inset Formula $x_{0}$
\end_inset

 is through the log probability? This must be wrong.
You probably need to mix the existing data with the new data:
\begin_inset Formula 
\begin{align*}
p\left(x\vert\alpha\right) & =\left(1-\alpha\right)\frac{1}{N}\sum_{n}\delta\left(x-x_{n}\right)+\alpha g\left(x\right)
\end{align*}

\end_inset

with the interpretation that
\begin_inset Formula 
\begin{align*}
\log p\left(x\vert\theta\right) & =\sum_{n}\log p\left(x_{n}\vert\theta\right)\\
 & =N\,\mbe_{x\sim\frac{1}{N}\sum_{n}\delta\left(x-x_{n}\right)}\left[\log p\left(x\vert\theta\right)\right]
\end{align*}

\end_inset


\end_layout

\begin_layout Standard
This is weird and maybe doesn't make sense.
\end_layout

\begin_layout Section
Exponential families
\end_layout

\begin_layout Standard
With conjugate prior 
\begin_inset Formula $p\left(\theta\right)\propto\exp\left(\eta^{T}\theta-A_{x}\left(\theta\right)\right)$
\end_inset

,
\begin_inset Formula
\begin{align*}
p\left(x\vert\theta\right) & =\exp\left(\theta^{T}x-nA_{x}\left(\theta\right)\right)\\
p\left(\theta\vert x\right) & =\exp\left(\left(x+\eta\right)^{T}\theta-\left(n+1\right)A_{x}\left(\theta\right)-A_{\theta}\left(x,n\right)\right)\\
x & =\sum_{i=1}^{n}x_{i}
\end{align*}

\end_inset


\begin_inset Formula 
\begin{align*}
\frac{d\mbe_{p\left(\theta\vert x,w\right)}\left[\theta\right]}{dw_{i}} & =\cov_{p\left(\theta\vert x,w\right)}\left(\theta,\theta^{T}x_{i}\right)=\cov\left(\theta\right)_{p\left(\theta\vert x,w\right)}x_{i}
\end{align*}

\end_inset


\end_layout
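
\begin_layout Standard
So the per-datapoint sensitivity is just the posterior covariance applied
 to the sufficient statistic.
 A minimal sketch (hypothetical names):
\end_layout

\begin_layout Standard
\begin_inset listings
lstparams "language=Python"
inline false
status open

\begin_layout Plain Layout

import numpy as np
\end_layout

\begin_layout Plain Layout

def expfam_sensitivity(theta_draws, x_i):
\end_layout

\begin_layout Plain Layout

    # theta_draws: (B, D) posterior draws of the natural parameter;
\end_layout

\begin_layout Plain Layout

    # x_i: (D,) sufficient statistic of datapoint i.  Estimates
\end_layout

\begin_layout Plain Layout

    # d E[theta] / d w_i = Cov(theta) x_i.
\end_layout

\begin_layout Plain Layout

    cov_theta = np.atleast_2d(np.cov(theta_draws, rowvar=False))
\end_layout

\begin_layout Plain Layout

    return cov_theta @ np.atleast_1d(x_i)
\end_layout

\end_inset


\end_layout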

\begin_layout Section
Expected weights
\end_layout

\begin_layout Standard
Note that
\begin_inset Formula 
\begin{align*}
w_{n} & =\exp\left(\log p\left(x_{n}\vert\theta\right)\right)
\end{align*}

\end_inset


\end_layout

\begin_layout Standard
What is the expected value of this over 
\begin_inset Formula $\theta\sim p\left(\theta\vert x\right)$
\end_inset

?
\begin_inset Formula 
\begin{align*}
\mbe\left[w_{n}\right] & =\int p\left(x_{n}\vert\theta\right)\frac{\prod_{n'}p\left(x_{n'}\vert\theta\right)p\left(\theta\right)}{p\left(x\right)}d\theta\\
 & =\int\frac{p\left(x_{n}\vert\theta\right)\prod_{n'}p\left(x_{n'}\vert\theta\right)p\left(\theta\right)}{p\left(x,x_{n}\right)}\frac{p\left(x,x_{n}\right)}{p\left(x\right)}d\theta\\
 & =\frac{p\left(x,x_{n}\right)}{p\left(x\right)}
\end{align*}

\end_inset


\end_layout

\begin_layout Standard
It is the ratio of normalizing constants.
\end_layout
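
\begin_layout Standard
A Monte Carlo sanity check of this identity in a conjugate normal model
 (a sketch; the model and seed are made up):
\end_layout

\begin_layout Standard
\begin_inset listings
lstparams "language=Python"
inline false
status open

\begin_layout Plain Layout

import numpy as np
\end_layout

\begin_layout Plain Layout

from scipy.stats import norm
\end_layout

\begin_layout Plain Layout

rng = np.random.default_rng(0)
\end_layout

\begin_layout Plain Layout

x = rng.normal(0.0, 1.0, size=50)
\end_layout

\begin_layout Plain Layout

# With x_i ~ N(theta, 1) and a flat prior, the posterior is
\end_layout

\begin_layout Plain Layout

# N(mean(x), 1 / len(x)).
\end_layout

\begin_layout Plain Layout

theta_draws = rng.normal(x.mean(), 1.0 / np.sqrt(len(x)), size=20000)
\end_layout

\begin_layout Plain Layout

# E[w_1]: average p(x_1 | theta_b), estimating p(x, x_1) / p(x).
\end_layout

\begin_layout Plain Layout

print(norm.pdf(x[0], loc=theta_draws, scale=1.0).mean())
\end_layout

\end_inset


\end_layout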

\begin_layout Section
Normal example
\end_layout

\begin_layout Standard
\begin_inset Formula 
\begin{align*}
\log p\left(x\vert\theta\right) & =-\frac{1}{2}\left(\begin{array}{c}
x_{1}-\theta_{1}\\
x_{2}-\theta_{2}
\end{array}\right)^{T}\left(\begin{array}{cc}
1 & \rho\\
\rho & 1
\end{array}\right)\left(\begin{array}{c}
x_{1}-\theta_{1}\\
x_{2}-\theta_{2}
\end{array}\right)+\frac{1}{2}\log\left|\Lambda\right|\\
 & =-\frac{1}{2}\mathrm{trace}\left(\Lambda\theta\theta^{T}\right)+x^{T}\Lambda\theta-\frac{1}{2}x^{T}\Lambda x+\frac{1}{2}\log\left|\Lambda\right|
\end{align*}

\end_inset


\end_layout

\begin_layout Standard
With a flat prior and a single observation, the posterior moments are
 
\begin_inset Formula $\mbe\left[\theta\right]=x$
\end_inset

 and 
\begin_inset Formula $\mbe\left[\theta\theta^{T}\right]=\Lambda^{-1}+xx^{T}$
\end_inset

, so
\begin_inset Formula 
\begin{align*}
\mbe\left[\log p\left(x\vert\theta\right)\right] & =-\frac{1}{2}\mathrm{trace}\left(\Lambda\mbe\left[\theta\theta^{T}\right]\right)+x^{T}\Lambda\mbe\left[\theta\right]-\frac{1}{2}x^{T}\Lambda x+\frac{1}{2}\log\left|\Lambda\right|\\
 & =-\frac{1}{2}\mathrm{trace}\left(\Lambda\left(\Lambda^{-1}+xx^{T}\right)\right)+\frac{1}{2}x^{T}\Lambda x+\frac{1}{2}\log\left|\Lambda\right|\\
 & =-\frac{1}{2}\mathrm{trace}\left(I\right)+\frac{1}{2}\log\left|\Lambda\right|
\end{align*}

\end_inset


\end_layout

\begin_layout Standard
You are missing some things that also depend on 
\begin_inset Formula $w$
\end_inset

!
\begin_inset Formula 
\begin{align*}
\log p\left(x\vert\theta\right) & =-\frac{1}{2}\left(\begin{array}{c}
x_{1}-\theta_{1}\\
x_{2}-\theta_{2}
\end{array}\right)^{T}\left(\begin{array}{cc}
1 & \rho\\
\rho & 1
\end{array}\right)\left(\begin{array}{c}
x_{1}-\theta_{1}\\
x_{2}-\theta_{2}
\end{array}\right)+\frac{1}{2}\log\left|\Lambda\right|\\
 & =-\frac{1}{2}\mathrm{trace}\left(\Lambda\theta\theta^{T}\right)+x^{T}\Lambda\theta-\frac{1}{2}x^{T}\Lambda x+\frac{1}{2}\log\left|\Lambda\right|\\
\log p\left(\theta\vert x,w\right) & =w\left(-\frac{1}{2}\mathrm{trace}\left(\Lambda\theta\theta^{T}\right)+x^{T}\Lambda\theta-\frac{1}{2}x^{T}\Lambda x\right)+\frac{1}{2}\log\left|w\Lambda\right|+\mathrm{const}\\
 & =w\left(-\frac{1}{2}\mathrm{trace}\left(\Lambda\theta\theta^{T}\right)+x^{T}\Lambda\theta-\frac{1}{2}x^{T}\Lambda x\right)+\frac{p}{2}\log w+\frac{1}{2}\log\left|\Lambda\right|+\mathrm{const}\textrm{ (here }p=2\textrm{)}\Rightarrow\\
\mbe\left[\frac{\partial\log p\left(\theta\vert x,w\right)}{\partial w}\right] & =-\frac{1}{2w}\mathrm{trace}\left(I\right)+\frac{p}{2w}=0
\end{align*}

\end_inset


\end_layout

\begin_layout Standard
Note that you need to include the normalizing constant for 
\begin_inset Formula $\theta$
\end_inset

; so it's not true that 
\begin_inset Formula $\mbe_{p\left(\theta\vert x,w\right)}\left[\frac{\partial}{\partial w}\log p\left(x\vert\theta,w\right)\right]=0$
\end_inset

, only that 
\begin_inset Formula $\mbe_{p\left(\theta\vert x,w\right)}\left[\frac{\partial}{\partial w}\log p\left(\theta\vert x,w\right)\right]=0$
\end_inset

.
\end_layout
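
\begin_layout Standard
The zero-expectation identity can be checked numerically for the bivariate
 example; a sketch using the centered form of the posterior log density
 (the values of 
\begin_inset Formula $x$
\end_inset

, 
\begin_inset Formula $\rho$
\end_inset

, and 
\begin_inset Formula $w$
\end_inset

 are made up):
\end_layout

\begin_layout Standard
\begin_inset listings
lstparams "language=Python"
inline false
status open

\begin_layout Plain Layout

import numpy as np
\end_layout

\begin_layout Plain Layout

# Check E[d log p(theta | x, w) / dw] = 0 with theta ~ N(x, inv(w Lam)):
\end_layout

\begin_layout Plain Layout

# d/dw of -(w/2)(theta - x)' Lam (theta - x) + (p/2) log w.
\end_layout

\begin_layout Plain Layout

rng = np.random.default_rng(0)
\end_layout

\begin_layout Plain Layout

rho, w, p = 0.5, 2.0, 2
\end_layout

\begin_layout Plain Layout

Lam = np.array([[1.0, rho], [rho, 1.0]])
\end_layout

\begin_layout Plain Layout

x = np.array([0.3, -1.2])
\end_layout

\begin_layout Plain Layout

theta = rng.multivariate_normal(x, np.linalg.inv(w * Lam), size=200000)
\end_layout

\begin_layout Plain Layout

resid = theta - x
\end_layout

\begin_layout Plain Layout

quad = np.einsum('bi,ij,bj->b', resid, Lam, resid)
\end_layout

\begin_layout Plain Layout

print((-0.5 * quad + p / (2 * w)).mean())  # approximately 0
\end_layout

\end_inset


\end_layout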

\end_body
\end_document