##### swh:1:snp:6f5fd6c965e5bc7b0fe0766a8ce0095c450de857

Tip revision:

**3d3fa9b4e0a96ea840009b62107d47e464e08c11**authored by**KLRhodes**on**13 December 2021, 23:23:06 UTC****Build site.** Tip revision:

**3d3fa9b** functions_for_fit_comparison.Rmd

```
# This is used to draw the
# plots showing improvement in the fits over time; the plots are
# arranged using plot_grid from the cowplot package. To avoid showing
# the legend in every plot, a separate plot is created showing the
# legend only.
#
# Input arguments "dat" and "fits" should be generated by
# compile_poisson_nmf_fits.R. It is assumed that Poisson NMF models
# were fit for k = 2-12, using three different algorithms (EM, CCD,
# SCD), and with and without extrapolation.
```{r}
create_progress_plots <- function (dat, fits, y = c("loglik","res"),
iterations = 1:1000) {
# I'm assuming k = 2-12.
k <- 12
# These are the colours that will be used to plot the results from
# running the three methods: EM, CCD and SCD.
clrs <- c("deepskyblue","darkorange","darkmagenta")
# Process the input arguments.
y <- match.arg(y)
# This list will be used to store the plots for each choice of k,
# for k=2-12. The first plot will be used to draw the legend.
plots <- vector("list",k)
names(plots) <- paste0("k",1:k)
# Create the plot showing the legend only.
plots$k1 <-
suppressMessages(create_progress_plot(dat,fits,2,y,iterations,clrs) +
scale_x_continuous(breaks = NULL,limits = c(0,0)) +
scale_y_continuous(breaks = NULL) +
theme(axis.line = element_blank()) +
labs(x = "",y = ""))
# Repeat for each choice of k, the number of topics. The legend is
# removed from all of these plots.
for (i in 2:k)
plots[[i]] <- create_progress_plot(dat,fits,i,y,iterations,clrs) +
guides(color = "none",fill = "none",shape = "none",linetype = "none",
size = "none") +
ggtitle(paste("k =",i))
# Arrange the 12 plots (including the plot that shows the legend
# only) in a 3 x 4 grid.
return(plot_grid(plots$k2,plots$k3,plots$k4,plots$k5,
plots$k6,plots$k7,plots$k8,plots$k9,
plots$k10,plots$k11,plots$k12,plots$k1,
nrow = 3,ncol = 4))
}
# This is used by create_progress_plots to create one of the plots.
create_progress_plot <- function (dat, fits, i, y, iterations, clrs) {
# Select all fits with k topics, and adjust the labels assigned to
# these fits so that k is not shown.
pdat <- lapply(fits[with(dat,k == i)],"[[","fit")
names(pdat) <- str_remove(names(pdat),paste0("-k=",i))
# Show only the selected iterations, and convert the runtimes from
# seconds to hours.
pdat <- lapply(pdat,function (x) {
x$progress <- transform(x$progress[iterations,],
timing = timing/360)
return(x)
})
# Create the plot showing improvement in the fit over time.
return(plot_progress_poisson_nmf(pdat,y = y,add.point.every = 100,
colors = rep(clrs,2),shapes = 21,fills = c(clrs,rep("white",3)),
theme = function() theme_cowplot(font_size = 10)) +
theme(plot.title = element_text(size = 10,face = "plain")) +
xlab("runtime (h)"))
}
# Create a plot showing the improvement in the log-likelihood as the
# rank, or the number of topics, k, increases.
plot_loglik_vs_rank <- function (fits) {
pdat <- compare_poisson_nmf_fits(lapply(fits,"[[","fit"))
pdat <- by(pdat,pdat$k,function (x) x[which.max(x$loglik),])
pdat <- as.data.frame(do.call(rbind,pdat))
pdat <- transform(pdat,loglik = loglik - min(loglik) + 0.01)
k <- max(pdat$k)
return(ggplot(pdat,aes_string(x = "k",y = "loglik")) +
geom_line() +
geom_point() +
scale_x_continuous(breaks = 2:k) +
labs(x = "number of topics (k)",y = "relative loglik") +
theme_cowplot(font_size = 16))
}
```
```