https://github.com/Blei-Lab/ctr
Tip revision: a21d0a81e451b9c1ccf05143f6915bb11bf66545 authored by Jaan Altosaar on 19 August 2015, 10:51:32 UTC
Update README.md
ctr.cpp
#include "ctr.h"
#include "opt.h"
extern gsl_rng * RANDOM_NUMBER;
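// min_iter: minimum number of iterations before the convergence test is allowed to stop training.
// beta_smooth: additive smoothing applied to the topic-word matrix beta at initialization.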
int min_iter = 15;
double beta_smooth = 0.01;
c_ctr::c_ctr() {
m_beta = NULL;
m_theta = NULL;
m_U = NULL;
m_V = NULL;
m_num_factors = 0; // m_num_topics
m_num_items = 0; // m_num_docs
m_num_users = 0; // num of users
}
c_ctr::~c_ctr() {
// free memory
if (m_beta != NULL) gsl_matrix_free(m_beta);
if (m_theta != NULL) gsl_matrix_free(m_theta);
if (m_U != NULL) gsl_matrix_free(m_U);
if (m_V != NULL) gsl_matrix_free(m_V);
}
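// Load the LDA initialization: theta (document-topic proportions) and beta (topic-word
// distributions). Each theta row is smoothed with alpha_smooth and renormalized; beta is
// exponentiated if it was saved in log space, otherwise smoothed with beta_smooth and renormalized.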
void c_ctr::read_init_information(const char* theta_init_path,
const char* beta_init_path,
const c_corpus* c,
double alpha_smooth) {
int num_topics = m_num_factors;
m_theta = gsl_matrix_alloc(c->m_num_docs, num_topics);
printf("\nreading theta initialization from %s\n", theta_init_path);
FILE * f = fopen(theta_init_path, "r");
mtx_fscanf(f, m_theta);
fclose(f);
// smooth theta with alpha_smooth
gsl_matrix_add_constant(m_theta, alpha_smooth);
// normalize each row of m_theta, in case it is not already normalized
for (size_t j = 0; j < m_theta->size1; j ++) {
gsl_vector_view theta_v = gsl_matrix_row(m_theta, j);
vnormalize(&theta_v.vector);
}
m_beta = gsl_matrix_alloc(num_topics, c->m_size_vocab);
printf("reading beta initialization from %s\n", beta_init_path);
f = fopen(beta_init_path, "r");
mtx_fscanf(f, m_beta);
fclose(f);
// beta may be stored in log space; exponentiate if so, otherwise smooth and renormalize
if (mget(m_beta, 0, 0) < 0) {
mtx_exp(m_beta);
}
else {
gsl_matrix_add_constant(m_beta, beta_smooth);
for (size_t j = 0; j < m_beta->size1; j ++) {
gsl_vector_view beta_v = gsl_matrix_row(m_beta, j);
vnormalize(&beta_v.vector);
}
}
}
void c_ctr::set_model_parameters(int num_factors,
int num_users,
int num_items) {
m_num_factors = num_factors;
m_num_users = num_users;
m_num_items = num_items;
}
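// Initialize the latent factor matrices. For a CTR run, the item factors V start at the
// LDA topic proportions theta; for plain matrix factorization, theta is set to zero and
// V is initialized uniformly at random.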
void c_ctr::init_model(int ctr_run) {
m_U = gsl_matrix_calloc(m_num_users, m_num_factors);
m_V = gsl_matrix_calloc(m_num_items, m_num_factors);
if (ctr_run) {
gsl_matrix_memcpy(m_V, m_theta);
}
else {
// allocate a zero theta for convenience, so that the update equations keep the same form
m_theta = gsl_matrix_calloc(m_num_items, m_num_factors);
for (size_t i = 0; i < m_V->size1; i ++)
for (size_t j = 0; j < m_V->size2; j ++)
mset(m_V, i, j, runiform());
}
}
void c_ctr::stochastic_learn_map_estimate(const c_data* users, const c_data* items,
const c_corpus* c, const ctr_hyperparameter* param,
const char* directory) {
// init model parameters
printf("\nrunning stochastic learning ...\n");
printf("initializing the model ...\n");
init_model(param->ctr_run);
// filename
char name[500];
// start time
time_t start, current;
time(&start);
int elapsed = 0;
int iter = 0;
double likelihood = -exp(50), likelihood_old;
double converge = 1.0;
double learning_rate = param->learning_rate;
/// create the state log file
sprintf(name, "%s/state.log", directory);
FILE* file = fopen(name, "w");
fprintf(file, "iter time likelihood converge\n");
/* alloc auxiliary variables */
gsl_vector* x = gsl_vector_alloc(m_num_factors);
gsl_matrix* phi = NULL;
gsl_matrix* word_ss = NULL;
gsl_matrix* log_beta = NULL;
gsl_vector* gamma = NULL;
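// Variational variables used only when theta is re-optimized (theta_opt): phi holds per-word
// topic responsibilities, word_ss the topic-word sufficient statistics, log_beta the log
// topic-word probabilities, and gamma the (unnormalized) topic proportions for a document.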
if (param->ctr_run && param->theta_opt) {
int max_len = c->max_corpus_length();
phi = gsl_matrix_calloc(max_len, m_num_factors);
word_ss = gsl_matrix_calloc(m_num_factors, c->m_size_vocab);
log_beta = gsl_matrix_calloc(m_num_factors, c->m_size_vocab);
gsl_matrix_memcpy(log_beta, m_beta);
mtx_log(log_beta);
gamma = gsl_vector_alloc(m_num_factors);
}
/* tmp variables for indexes */
int i, j, m, n, l, k, ll, jj;
int* item_ids;
bool positive = true;
double result, inner;
int active_num_items = 0;
for (j = 0; j < m_num_items; ++j) {
if (items->m_vec_len[j] > 0)
active_num_items++;
}
int* idx_base = new int[active_num_items];
l = 0;
for (j = 0; j < m_num_items; ++j) {
if (items->m_vec_len[j] > 0) {
idx_base[l] = j;
++l;
}
}
int* sel = new int[active_num_items];
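// Stochastic training loop: in each pass, for every user we sample as many negative
// (unrated) candidate items as the user has positive items, then sweep the merged,
// sorted positive and negative lists and take a gradient-style step on (u, v) for
// each example.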
while (iter < param->max_iter) {
likelihood_old = likelihood;
likelihood = 0.0;
for (i = 0; i < m_num_users; ++i) {
item_ids = users->m_vec_data[i];
n = users->m_vec_len[i];
if (n > 0) {
double lambda_u = param->lambda_u / (2*n);
gsl_vector_view u = gsl_matrix_row(m_U, i);
// this user has rated some articles
// randomly sample n candidate negative items (overlaps with positives are handled below)
sample_k_from_n(n, active_num_items, sel, idx_base);
qsort(sel, n, sizeof(int), compare);
l = 0; ll = 0;
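// Merge-walk the sorted positive item ids (index l) and the sampled negatives (index ll);
// duplicates within the negative sample are skipped, and an item that appears in both
// lists is treated as positive.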
while (true) {
if (l < n) {
j = item_ids[l]; // positive
} else {
j = -1;
}
if (ll < n) {
jj = sel[ll]; //negative
while (ll < n-1 && jj == sel[ll+1]) ++ll; // skip same values
} else {
jj = -1;
}
if (j == -1) {
if (jj == -1) break;
else {
positive = false; // jj is a negative example
++ll;
}
} else {
if (j < jj) {
positive = true; // j is a positive example
++l;
} else if (j == jj) {
positive = true; // j is a positive example
++l;
++ll;
} else { // j > jj
if (jj == -1) {
positive = true; // j is a positive example
++l;
} else {
positive = false;
++ll; // jj is a negative example
}
}
}
gsl_vector_view v;
gsl_vector_view theta_v;
double lambda_v = 0.0;
if (positive) {
// j is a positive example
lambda_v = param->lambda_v / (2 * items->m_vec_len[j]);
v = gsl_matrix_row(m_V, j);
theta_v = gsl_matrix_row(m_theta, j);
// second-order
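// Damped second-order style updates for the positive case (target rating 1), roughly:
//   u <- (1 - eta) * u + eta * v / (lambda_u + v'v)
//   v <- (1 - eta) * v + eta * theta + eta * (1 - u'theta) * u / (lambda_v + u'u)
// where eta is the current learning rate and lambda_u, lambda_v are the per-example
// regularizers computed above.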
// u
gsl_vector_scale(&u.vector, 1 - learning_rate);
gsl_blas_ddot(&v.vector, &v.vector, &inner);
gsl_blas_daxpy(learning_rate / (lambda_u + inner), &v.vector, &u.vector);
// v
if (!param->lda_regression) {
gsl_vector_scale(&v.vector, 1 - learning_rate);
gsl_blas_daxpy(learning_rate, &theta_v.vector, &v.vector);
gsl_blas_ddot(&u.vector, &u.vector, &inner);
gsl_blas_ddot(&u.vector, &theta_v.vector, &result);
gsl_blas_daxpy(learning_rate * (1.0 - result) / (lambda_v + inner), &u.vector, &v.vector);
}
gsl_blas_ddot(&u.vector, &v.vector, &result);
likelihood += -0.5 * (1 - result) * (1 - result);
// gsl_blas_ddot(&u.vector, &v.vector, &result);
// result -= 1.0;
} else {
// jj is a negative example
lambda_v = param->lambda_v / (2 * items->m_vec_len[jj]);
v = gsl_matrix_row(m_V, jj);
theta_v = gsl_matrix_row(m_theta, jj);
// second order
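// Negative case (target rating 0): u is only shrunk, u <- (1 - eta) * u, and
//   v <- (1 - eta) * v + eta * theta - eta * (u'theta) * u / (lambda_v + u'u)
// with the per-example lambda_v computed above.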
// u
gsl_vector_scale(&u.vector, 1 - learning_rate);
// v
if (!param->lda_regression) {
gsl_vector_scale(&v.vector, 1 - learning_rate);
gsl_blas_daxpy(learning_rate, &theta_v.vector, &v.vector);
gsl_blas_ddot(&u.vector, &u.vector, &inner);
gsl_blas_ddot(&u.vector, &theta_v.vector, &result);
gsl_blas_daxpy(-learning_rate * result / (lambda_v + inner), &u.vector, &v.vector);
}
gsl_blas_ddot(&u.vector, &v.vector, &result);
likelihood += -0.5 * result * result;
// gsl_blas_ddot(&u.vector, &v.vector, &result);
}
// update u
// first-order
// gsl_vector_scale(&u.vector, 1 - param->learning_rate * lambda_u);
// gsl_blas_daxpy(-result * param->learning_rate, &v.vector, &u.vector);
// second order
// update v
// gsl_vector_scale(&v.vector, 1 - param->learning_rate * lambda_v);
// gsl_blas_daxpy(-result * param->learning_rate, &u.vector, &v.vector);
// gsl_blas_daxpy(param->learning_rate * lambda_v, &theta_v.vector, &v.vector);
}
assert(n == l && n == ll);
//printf("n=%d, l=%d, ll=%d, j=%d, jj=%d\n", n, l, ll, j, jj);
// update the likelihood
gsl_blas_ddot(&u.vector, &u.vector, &result);
likelihood += -0.5 * param->lambda_u * result;
}
}
for (j = 0; j < m_num_items; ++j) {
gsl_vector_view v = gsl_matrix_row(m_V, j);
gsl_vector_view theta_v = gsl_matrix_row(m_theta, j);
gsl_vector_memcpy(x, &v.vector);
gsl_vector_sub(x, &theta_v.vector);
gsl_blas_ddot(x, x, &result);
likelihood += -0.5 * param->lambda_v * result;
}
// update theta
if (param->ctr_run && param->theta_opt) {
gsl_matrix_set_zero(word_ss);
for (j = 0; j < m_num_items; j ++) {
gsl_vector_view v = gsl_matrix_row(m_V, j);
gsl_vector_view theta_v = gsl_matrix_row(m_theta, j);
m = items->m_vec_len[j];
if (m>0) {
// m > 0, some users have rated this article
const c_document* doc = c->m_docs[j];
likelihood += doc_inference(doc, &theta_v.vector, log_beta, phi, gamma, word_ss, true);
optimize_simplex(gamma, &v.vector, param->lambda_v, &theta_v.vector);
}
else {
// m=0, this article has never been rated
const c_document* doc = c->m_docs[j];
doc_inference(doc, &theta_v.vector, log_beta, phi, gamma, word_ss, false);
vnormalize(gamma);
gsl_vector_memcpy(&theta_v.vector, gamma);
}
}
gsl_matrix_memcpy(m_beta, word_ss);
for (k = 0; k < m_num_factors; k ++) {
gsl_vector_view row = gsl_matrix_row(m_beta, k);
vnormalize(&row.vector);
}
gsl_matrix_memcpy(log_beta, m_beta);
mtx_log(log_beta);
}
time(&current);
elapsed = (int)difftime(current, start);
iter++;
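// after 50 iterations, keep halving the learning rate until it drops to about 0.001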
if (iter > 50 && learning_rate > 0.001) learning_rate /= 2.0;
converge = fabs((likelihood-likelihood_old)/likelihood_old);
fprintf(file, "%04d %06d %10.5f %.10f\n", iter, elapsed, likelihood, converge);
fflush(file);
printf("iter=%04d, time=%06d, likelihood=%.5f, converge=%.10f\n", iter, elapsed, likelihood, converge);
// save intermediate results
if (iter % param->save_lag == 0) {
sprintf(name, "%s/%04d-U.dat", directory, iter);
FILE * file_U = fopen(name, "w");
gsl_matrix_fwrite(file_U, m_U);
fclose(file_U);
sprintf(name, "%s/%04d-V.dat", directory, iter);
FILE * file_V = fopen(name, "w");
gsl_matrix_fwrite(file_V, m_V);
fclose(file_V);
if (param->ctr_run && param->theta_opt) {
sprintf(name, "%s/%04d-theta.dat", directory, iter);
FILE * file_theta = fopen(name, "w");
gsl_matrix_fwrite(file_theta, m_theta);
fclose(file_theta);
sprintf(name, "%s/%04d-beta.dat", directory, iter);
FILE * file_beta = fopen(name, "w");
gsl_matrix_fwrite(file_beta, m_beta);
fclose(file_beta);
}
}
}
// save final results
sprintf(name, "%s/final-U.dat", directory);
FILE * file_U = fopen(name, "w");
gsl_matrix_fwrite(file_U, m_U);
fclose(file_U);
sprintf(name, "%s/final-V.dat", directory);
FILE * file_V = fopen(name, "w");
gsl_matrix_fwrite(file_V, m_V);
fclose(file_V);
if (param->ctr_run && param->theta_opt) {
sprintf(name, "%s/final-theta.dat", directory);
FILE * file_theta = fopen(name, "w");
gsl_matrix_fwrite(file_theta, m_theta);
fclose(file_theta);
sprintf(name, "%s/final-beta.dat", directory);
FILE * file_beta = fopen(name, "w");
gsl_matrix_fwrite(file_beta, m_beta);
fclose(file_beta);
}
// free memory
gsl_vector_free(x);
delete [] idx_base;
delete [] sel;
if (param->ctr_run && param->theta_opt) {
gsl_matrix_free(phi);
gsl_matrix_free(log_beta);
gsl_matrix_free(word_ss);
gsl_vector_free(gamma);
}
}
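// Batch MAP estimation by alternating coordinate updates (as in collaborative topic
// regression): alternately solve the regularized least-squares systems for the user
// factors U and the item factors V, and optionally re-optimize the topic proportions theta.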
void c_ctr::learn_map_estimate(const c_data* users, const c_data* items,
const c_corpus* c, const ctr_hyperparameter* param,
const char* directory) {
// init model parameters
printf("\ninitializing the model ...\n");
init_model(param->ctr_run);
// filename
char name[500];
// start time
time_t start, current;
time(&start);
int elapsed = 0;
int iter = 0;
double likelihood = -exp(50), likelihood_old;
double converge = 1.0;
/// create the state log file
sprintf(name, "%s/state.log", directory);
FILE* file = fopen(name, "w");
fprintf(file, "iter time likelihood converge\n");
/* alloc auxiliary variables */
gsl_matrix* XX = gsl_matrix_alloc(m_num_factors, m_num_factors);
gsl_matrix* A = gsl_matrix_alloc(m_num_factors, m_num_factors);
gsl_matrix* B = gsl_matrix_alloc(m_num_factors, m_num_factors);
gsl_vector* x = gsl_vector_alloc(m_num_factors);
gsl_matrix* phi = NULL;
gsl_matrix* word_ss = NULL;
gsl_matrix* log_beta = NULL;
gsl_vector* gamma = NULL;
if (param->ctr_run && param->theta_opt) {
int max_len = c->max_corpus_length();
phi = gsl_matrix_calloc(max_len, m_num_factors);
word_ss = gsl_matrix_calloc(m_num_factors, c->m_size_vocab);
log_beta = gsl_matrix_calloc(m_num_factors, c->m_size_vocab);
gsl_matrix_memcpy(log_beta, m_beta);
mtx_log(log_beta);
gamma = gsl_vector_alloc(m_num_factors);
}
/* tmp variables for indexes */
int i, j, m, n, l, k;
int* item_ids;
int* user_ids;
double result;
/// confidence parameters
double a_minus_b = param->a - param->b;
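// a is the confidence weight for observed (rated) pairs and b for unobserved ones, with
// a > b; precomputing (a - b) lets each per-user / per-item system be formed as a low-rank
// correction of the shared b-weighted Gram matrix XX.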
while ((iter < param->max_iter && converge > 1e-4) || iter < min_iter) {
likelihood_old = likelihood;
likelihood = 0.0;
// update U
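// For each user i with ratings, solve
//   (lambda_u * I + b * V'V + (a - b) * sum_{j in rated(i)} v_j v_j') u_i = a * sum_{j in rated(i)} v_j
// where b * V'V (over the active items) is precomputed once in XX.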
gsl_matrix_set_zero(XX);
for (j = 0; j < m_num_items; j ++) {
m = items->m_vec_len[j];
if (m>0) {
gsl_vector_const_view v = gsl_matrix_const_row(m_V, j);
gsl_blas_dger(1.0, &v.vector, &v.vector, XX);
}
}
gsl_matrix_scale(XX, param->b);
// this is only for U
gsl_matrix_add_diagonal(XX, param->lambda_u);
for (i = 0; i < m_num_users; i ++) {
item_ids = users->m_vec_data[i];
n = users->m_vec_len[i];
if (n > 0) {
// this user has rated some articles
gsl_matrix_memcpy(A, XX);
gsl_vector_set_zero(x);
for (l=0; l < n; l ++) {
j = item_ids[l];
gsl_vector_const_view v = gsl_matrix_const_row(m_V, j);
gsl_blas_dger(a_minus_b, &v.vector, &v.vector, A);
gsl_blas_daxpy(param->a, &v.vector, x);
}
gsl_vector_view u = gsl_matrix_row(m_U, i);
matrix_vector_solve(A, x, &(u.vector));
// update the likelihood
gsl_blas_ddot(&u.vector, &u.vector, &result);
likelihood += -0.5 * param->lambda_u * result;
}
}
if (param->lda_regression) break; // one iteration is enough for lda-regression
// update V
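// For each item j with ratings, solve
//   (lambda_v * I + b * U'U + (a - b) * sum_{i in raters(j)} u_i u_i') v_j
//     = a * sum_{i in raters(j)} u_i + lambda_v * theta_j
// where b * U'U (over the active users) is precomputed in XX; items with no ratings keep
// v_j and only update theta_j (when theta_opt is on).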
if (param->ctr_run && param->theta_opt) gsl_matrix_set_zero(word_ss);
gsl_matrix_set_zero(XX);
for (i = 0; i < m_num_users; i ++) {
n = users->m_vec_len[i];
if (n>0) {
gsl_vector_const_view u = gsl_matrix_const_row(m_U, i);
gsl_blas_dger(1.0, &u.vector, &u.vector, XX);
}
}
gsl_matrix_scale(XX, param->b);
for (j = 0; j < m_num_items; j ++) {
gsl_vector_view v = gsl_matrix_row(m_V, j);
gsl_vector_view theta_v = gsl_matrix_row(m_theta, j);
user_ids = items->m_vec_data[j];
m = items->m_vec_len[j];
if (m>0) {
// m > 0, some users have rated this article
gsl_matrix_memcpy(A, XX);
gsl_vector_set_zero(x);
for (l = 0; l < m; l ++) {
i = user_ids[l];
gsl_vector_const_view u = gsl_matrix_const_row(m_U, i);
gsl_blas_dger(a_minus_b, &u.vector, &u.vector, A);
gsl_blas_daxpy(param->a, &u.vector, x);
}
// add the topic-prior term lambda_v * theta_j to the right-hand side
// (when ctr_run=0, m_theta is all zeros, so this term vanishes)
gsl_blas_daxpy(param->lambda_v, &theta_v.vector, x);
gsl_matrix_memcpy(B, A); // save for computing likelihood
// here different from U update
gsl_matrix_add_diagonal(A, param->lambda_v);
matrix_vector_solve(A, x, &v.vector);
// update the likelihood for the relevant part
likelihood += -0.5 * m * param->a;
for (l = 0; l < m; l ++) {
i = user_ids[l];
gsl_vector_const_view u = gsl_matrix_const_row(m_U, i);
gsl_blas_ddot(&u.vector, &v.vector, &result);
likelihood += param->a * result;
}
likelihood += -0.5 * mahalanobis_prod(B, &v.vector, &v.vector);
// likelihood contribution of the (v - theta) penalty; theta = 0 is a special case
gsl_vector_memcpy(x, &v.vector);
gsl_vector_sub(x, &theta_v.vector);
gsl_blas_ddot(x, x, &result);
likelihood += -0.5 * param->lambda_v * result;
if (param->ctr_run && param->theta_opt) {
const c_document* doc = c->m_docs[j];
likelihood += doc_inference(doc, &theta_v.vector, log_beta, phi, gamma, word_ss, true);
optimize_simplex(gamma, &v.vector, param->lambda_v, &theta_v.vector);
}
}
else {
// m=0, this article has never been rated
if (param->ctr_run && param->theta_opt) {
const c_document* doc = c->m_docs[j];
doc_inference(doc, &theta_v.vector, log_beta, phi, gamma, word_ss, false);
vnormalize(gamma);
gsl_vector_memcpy(&theta_v.vector, gamma);
}
}
}
// update beta if needed
if (param->ctr_run && param->theta_opt) {
gsl_matrix_memcpy(m_beta, word_ss);
for (k = 0; k < m_num_factors; k ++) {
gsl_vector_view row = gsl_matrix_row(m_beta, k);
vnormalize(&row.vector);
}
gsl_matrix_memcpy(log_beta, m_beta);
mtx_log(log_beta);
}
time(&current);
elapsed = (int)difftime(current, start);
iter++;
converge = fabs((likelihood-likelihood_old)/likelihood_old);
if (likelihood < likelihood_old) printf("likelihood is decreasing!\n");
fprintf(file, "%04d %06d %10.5f %.10f\n", iter, elapsed, likelihood, converge);
fflush(file);
printf("iter=%04d, time=%06d, likelihood=%.5f, converge=%.10f\n", iter, elapsed, likelihood, converge);
// save intermediate results
if (iter % param->save_lag == 0) {
sprintf(name, "%s/%04d-U.dat", directory, iter);
FILE * file_U = fopen(name, "w");
mtx_fprintf(file_U, m_U);
fclose(file_U);
sprintf(name, "%s/%04d-V.dat", directory, iter);
FILE * file_V = fopen(name, "w");
mtx_fprintf(file_V, m_V);
fclose(file_V);
if (param->ctr_run) {
sprintf(name, "%s/%04d-theta.dat", directory, iter);
FILE * file_theta = fopen(name, "w");
mtx_fprintf(file_theta, m_theta);
fclose(file_theta);
sprintf(name, "%s/%04d-beta.dat", directory, iter);
FILE * file_beta = fopen(name, "w");
mtx_fprintf(file_beta, m_beta);
fclose(file_beta);
}
}
}
// save final results
sprintf(name, "%s/final-U.dat", directory);
FILE * file_U = fopen(name, "w");
mtx_fprintf(file_U, m_U);
fclose(file_U);
sprintf(name, "%s/final-V.dat", directory);
FILE * file_V = fopen(name, "w");
mtx_fprintf(file_V, m_V);
fclose(file_V);
if (param->ctr_run) {
sprintf(name, "%s/final-theta.dat", directory);
FILE * file_theta = fopen(name, "w");
mtx_fprintf(file_theta, m_theta);
fclose(file_theta);
sprintf(name, "%s/final-beta.dat", directory);
FILE * file_beta = fopen(name, "w");
mtx_fprintf(file_beta, m_beta);
fclose(file_beta);
}
// free memory
gsl_matrix_free(XX);
gsl_matrix_free(A);
gsl_matrix_free(B);
gsl_vector_free(x);
if (param->ctr_run && param->theta_opt) {
gsl_matrix_free(phi);
gsl_matrix_free(log_beta);
gsl_matrix_free(word_ss);
gsl_vector_free(gamma);
}
}
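// One variational step for a single document: compute word-topic responsibilities
//   phi_{nk} proportional to theta_k * beta_{k, w_n},
// accumulate gamma_k = pseudo_count + sum_n count_n * phi_{nk} (and, if requested, the
// topic-word sufficient statistics in word_ss), and return this document's contribution
// to the variational bound.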
double c_ctr::doc_inference(const c_document* doc, const gsl_vector* theta_v,
const gsl_matrix* log_beta, gsl_matrix* phi,
gsl_vector* gamma, gsl_matrix* word_ss,
bool update_word_ss) {
double pseudo_count = 1.0;
double likelihood = 0;
gsl_vector* log_theta_v = gsl_vector_alloc(theta_v->size);
gsl_vector_memcpy(log_theta_v, theta_v);
vct_log(log_theta_v);
int n, k, w;
double x;
for (n = 0; n < doc->m_length; n ++) {
w = doc->m_words[n];
for (k = 0; k < m_num_factors; k ++)
mset(phi, n, k, vget(theta_v, k) * mget(m_beta, k, w));
gsl_vector_view row = gsl_matrix_row(phi, n);
vnormalize(&row.vector);
for (k = 0; k < m_num_factors; k ++) {
x = mget(phi, n, k);
if (x > 0)
likelihood += x*(vget(log_theta_v, k) + mget(log_beta, k, w) - log(x));
}
}
if (pseudo_count > 0) {
likelihood += pseudo_count * vsum(log_theta_v);
}
gsl_vector_set_all(gamma, pseudo_count); // initialize gamma with the pseudo-count prior
for (n = 0; n < doc->m_length; n ++) {
for (k = 0; k < m_num_factors; k ++) {
x = doc->m_counts[n] * mget(phi, n, k);
vinc(gamma, k, x);
if (update_word_ss) minc(word_ss, k, doc->m_words[n], x);
}
}
gsl_vector_free(log_theta_v);
return likelihood;
}