https://gitlab.inria.fr/line/aide-group/macrovsa
Tip revision: 31a87d848f8ab28a06ccf77d0b359fc966974138 authored by vthierry on 15 December 2025, 21:31:50 UTC
sync from makefile
sync from makefile
Tip revision: 31a87d8
kjvdemo.C
#include "macrovsa.hpp"
#include "regex.hpp"
#include "file.hpp"
#include "time.hpp"
#include "stats.hpp"
#include <set>
using namespace macrovsa;
// Generates a LaTeX tabular from an array of named scalar symbols.
// - data: the symbols to report, one table row per symbol, in the given order.
// - returns: the LaTeX source of a three-column table (word, tau, sigma), where
//   tau is printed with 2 decimals and sigma in scientific notation.
// The symbol name is appended as plain text and never goes through the
// printf-style format, so a name containing '%' cannot corrupt the formatting.
// NOTE(review): LaTeX special characters in names (e.g. '_' or '&') are not escaped.
static std::string symbolvectortolatex(const std::vector < const Symbol * > &data)
{
  std::string result = "\\begin{tabular}{|c|c|c|} \\hline {\\bf word} & ${\\bf \\tau}$ & ${\\bf \\sigma}$ \\\\\n";
  for(const Symbol *symbol : data) {
    // Reads the belief once per row instead of once per column
    const auto& belief = symbol->getBelief();
    result += "\\hline {\\tt " + symbol->getName() + "} & ";
    // Only the numeric part is formatted, with a constant format string
    result += aidesys::echo("$%.2f$ & $%.2e$ \\\\\n", belief.tau, belief.sigma);
  }
  return result + "\\hline\\end{tabular}\n";
}
// Experiment corresponding to the (Mercier & Viéville 2025) draft
int main()
{
// Results output
wjson::Value results;
// Defines what to compute
const bool first_experiment = true;
const bool second_experiment = true;
const bool with_mesoscopic_calculations = true;
// Sets mesoscopic dimension
Symbol::setDimension(1024);
// Computation time measure initialization
aidesys::now(false, true);
// Loads the KJV data and count
wjson::Value data, chapter_names, chapter_indexes, word_names, word_indexes;
{
// Loads the data
{
std::string datafile = "../public/kjvdemo/kjv.data.json";
wjson::Value alldata(aidesys::load (datafile), true);
results["data"]["file-load-time-msec"] = (int) aidesys::now(false, true);
data = alldata.at("chapters");
}
// Builds chapter and word tables
{
std::set < std::string > words;
unsigned int text_lengths = 0;
// Loops on chapters to build the chapter names and indexes and the word set
{
unsigned int i = 0;
for(auto it = data.getNames().cbegin(); it != data.getNames().cend(); it++, i++) {
chapter_names.add(*it);
chapter_indexes[*it] = i;
JSON text = data.at(*it).at("sequence");
for(unsigned int i = 0; i < text.length(); i++) {
words.insert(text.get(i, ""));
}
text_lengths += text.length();
}
}
// Loops on the word set to build the word names and indexes
{
unsigned int i = 0;
for(auto it = words.cbegin(); it != words.cend(); it++, i++) {
word_names.add(*it);
word_indexes[*it] = i;
}
}
results["data"]["word-count-time-msec"] = (int) aidesys::now(false, true);
results["data"]["text-lengths"] = text_lengths;
}
results["data"]["chapter-count"] = chapter_names.length();
results["data"]["word-count"] = word_names.length();
}
// First experiment on word neighborhood
if(first_experiment) {
// Builds the chapters and words bundlings
Bundling *chapters = new Bundling[results.at("data").get("chapter-count", 0)], *words = new Bundling[results.at("data").get("word-count", 0)];
unsigned int building_count = 0;
{
for(auto it = data.getNames().cbegin(); it != data.getNames().cend(); ++it) {
String chapter = *it;
JSON text = data.at(*it).at("sequence");
for(unsigned int i = 0; i < text.length(); i++) {
String word = text.get(i, "");
chapters[chapter_indexes.get(chapter, 0)].add(word);
words[word_indexes.get(word, 0)].add(chapter);
building_count += 2;
}
}
results["first-experiment"]["bundling-build-time-msec"] = (int) aidesys::now(false, true);
double build_count = results.at("data").get("word-count", 0) * results.at("data").get("word-count", 0);
results["first-experiment"]["bundling-build-unary-time-usec"] = 1000.0 * results.at("first-experiment").get("bundling-build-time-msec", 0.0) / build_count;
if(with_mesoscopic_calculations) {
for(int i = 0; i < results.at("data").get("word-count", 0); i++) {
words[i].getVector();
}
for(int i = 0; i < results.at("data").get("chapter-count", 0); i++) {
chapters[i].getVector();
}
results["first-experiment"]["bundling-mesoscopic-build-time-msec"] = (int) aidesys::now(false, true);
results["first-experiment"]["bundling-mesoscopic-build-unary-time-usec"] = 1000.0 * results.at("first-experiment").get("bundling-mesoscopic-build-time-msec", 0.0) / build_count;
}
}
// Study two words neighborhood
{
// Compute self similarity to normalize further similarities `g_word`
const unsigned int neighborhood_size = 10;
double *similarity_gain = new double[results.at("data").get("word-count", 0)];
{
for(int i = 0; i < results.at("data").get("word-count", 0); i++) {
double s = algo::sim(words[i], words[i]).tau;
aidesys::alert(s <= 0, "illegal-state", "in kjvdemo the '%s' word self similarity = %f <= 0 !", words[i].asString().c_str(), s);
similarity_gain[i] = 1 / sqrt(s);
}
results["first-experiment"]["similarity-gain-build-time-msec"] = (int) aidesys::now(false, true);
results["first-experiment"]["similarity-gain-build-unary-time-usec"] = 1000.0 * results.at("first-experiment").get("similarity-gain-build-time-msec", 0.0) / results.at("data").get("word-count", 0);
if(with_mesoscopic_calculations) {
for(int i = 0; i < results.at("data").get("word-count", 0); i++) {
algo::msim(words[i], words[i]);
}
results["first-experiment"]["similarity-gain-mesoscopic-build-time-msec"] = (int) aidesys::now(false, true);
results["first-experiment"]["similarity-gain-mesoscopic-build-unary-time-usec"] = 1000.0 * results.at("first-experiment").get("similarity-gain-mesoscopic-build-time-msec", 0.0) / results.at("data").get("word-count", 0);
}
}
// Creates the "fire" neighborhood
{
// Computes word similarities w.r.t. the word `fire`
Belief *fire_similarities = new Belief[results.at("data").get("word-count", 0)];
{
const unsigned int i0 = word_indexes.get("fire", 0);
const Bundling& fire = words[i0];
for(int i = 0; i < results.at("data").get("word-count", 0); i++) {
const Belief& b = algo::sim(fire, words[i]);
double s = similarity_gain[i0] * similarity_gain[i];
fire_similarities[i].tau = s * b.tau;
fire_similarities[i].sigma = s * b.sigma;
}
results["first-experiment"]["fire"]["similarities-build-time-msec"] = (int) aidesys::now(false, true);
}
// Computes the neighorhood vector `v_fire = >_words g_fire g_word (s_fire^T s_word) s_word`
Bundling fire_neighborhood_vector;
{
for(int i = 0; i < results.at("data").get("word-count", 0); i++) {
Symbol symbol(word_names.get(i, ""), fire_similarities[i]);
fire_neighborhood_vector.add(symbol);
}
results["first-experiment"]["fire"]["neighborhood-vector-time-msec"] = (int) aidesys::now(false, true);
results["first-experiment"]["fire"]["fire-neighborhood-vector-size"] = fire_neighborhood_vector.get().size();
}
delete[] fire_similarities;
// Sorts and reports the neighborhood
results["first-experiment"]["fire"]["neighborhood"] = symbolvectortolatex(fire_neighborhood_vector.getSorted(neighborhood_size));
}
// Creates the "water" neighborhood
{
// Computes word similarities w.r.t. the word `fire`
Belief *water_similarities = new Belief[results.at("data").get("word-count", 0)];
{
const unsigned int i0 = word_indexes.get("water", 0);
const Bundling& water = words[i0];
for(int i = 0; i < results.at("data").get("word-count", 0); i++) {
const Belief& b = algo::sim(water, words[i]);
double s = similarity_gain[i0] * similarity_gain[i];
water_similarities[i].tau = s * b.tau;
water_similarities[i].sigma = s * b.sigma;
}
results["first-experiment"]["water"]["similarities-time-msec"] = (int) aidesys::now(false, true);
}
// Computes the neighorhood vector `v_water = >_words g_water g_word (s_water^T s_word) s_word`
Bundling water_neighborhood_vector;
{
for(int i = 0; i < results.at("data").get("word-count", 0); i++) {
Symbol symbol(word_names.get(i, ""), water_similarities[i]);
water_neighborhood_vector.add(symbol);
}
results["first-experiment"]["water"]["neighborhood-vector-time-msec"] = (int) aidesys::now(false, true);
results["first-experiment"]["water"]["neighborhood-vector-size"] = water_neighborhood_vector.get().size();
}
delete[] water_similarities;
results["first-experiment"]["water"]["neighborhood"] = symbolvectortolatex(water_neighborhood_vector.getSorted(neighborhood_size));
}
delete[] similarity_gain;
}
delete[] chapters, delete[] words;
}
// Second experiment on sort text sequences
if(second_experiment) {
// Generates short text sequences
for(unsigned int prefix_length = 2; prefix_length < 4; prefix_length++) {
wjson::Value& results_ = results["second-experiment"][aidesys::echo("prefix_length-%d", prefix_length)];
AssociativeMap sequences;
const bool using_string = true;
// Loops on chapter texts to build the text prefix words and postfix word
{
for(auto it = data.getNames().cbegin(); it != data.getNames().cend(); it++) {
JSON text = data.at(*it).at("sequence");
Symbol **words = new Symbol *[text.length()];
for(unsigned int i = 0; i < text.length(); i++) {
words[i] = new Symbol(text.get(i, ""));
if(i > prefix_length) {
if(using_string) {
std::string key = "";
for(unsigned int j = i - prefix_length; j < i; j++) {
key += text.get(j, "") + (j == i - 1 ? "" : " ");
}
Symbol prefix(key);
sequences.add(prefix, *words[i]);
} else {
Array prefix;
for(unsigned int j = i - prefix_length; j < i; j++) {
prefix.add(*words[j]);
}
sequences.add(prefix, *words[i]);
}
}
}
// Clean-up
{
for(unsigned int i = 0; i < text.length(); i++) {
delete words[i];
}
delete[] words;
}
}
results_["sequences-build-time-msec"] = (int) aidesys::now(false, true);
results_["sequences-build-unary-time-usec"] = 1000.0 * results_.get("sequences-build-time-msec", 0.0) / sequences.getSize();
if(with_mesoscopic_calculations) {
sequences.getVector();
results_["sequences-mesoscopic-build-time-msec"] = (int) aidesys::now(false, true);
results_["sequences-mesoscopic-build-unary-time-usec"] = 1000.0 * results_.get("sequences-mesoscopic-build-time-msec", 0.0) / sequences.getSize();
}
results_["prefix-count"] = sequences.get().size();
results_["word-count"] = sequences.getSize();
}
// Prefix tail statistics
{
// Collects data and the main prefix-tail pairs
std::vector < double > data;
std::multimap < double, std::pair < std::string, std::string >> prefixtails;
{
double min_tau = 10, c0 = 0, count[10];
for(unsigned int c = 0; c < min_tau; count[c++] = 0) {}
for(auto it = sequences.get().cbegin(); it != sequences.get().cend(); it++) {
String prefix = it->second.first->asString();
for(auto jt = it->second.second.cbegin(); jt != it->second.second.cend(); jt++) {
double tau = jt->second->getBelief().tau;
data.push_back(tau);
c0++;
if(tau > min_tau) {
String tail = jt->second->getName();
prefixtails.insert(std::pair < double, std::pair < std::string, std::string >> (tau, std::pair < std::string, std::string > (prefix, tail)));
} else {
for(unsigned int c = 0; c < min_tau; c++) {
if(tau <= c) {
count[c]++;
}
}
}
}
}
for(unsigned int c = 1; c < min_tau; c++) {
results_["postfix-tau-count-values"]["counts"][aidesys::echo("tau <= %d in %%", c)] = aidesys::echo("%.0f", 100 * count[c] / c0);
}
}
// Reports prefix-tail results
std::string result = "\\begin{tabular}{|c|c|c|} \\hline {\\bf prefix} & {\\bf tail} & ${\\bf \\tau}^2$ \\\\\n";
{
unsigned int count = 10, c = 0;
for(auto it = prefixtails.crbegin(); it != prefixtails.crend() && c < count; it++, c++) {
result += aidesys::echo("\\hline {\\tt " + it->second.first + "} & " + it->second.second + " & $%.0f$ \\\\\n", it->first);
}
result += "\\hline\\end{tabular}\n";
}
String stat = aidesys::getStat(data, NULL, 0, 0x3);
results_["postfix-tau-main-values"] = result;
results_["postfix-tau-statistics"] = stat;
results_["postfix-statistics-build-time-msec"] = (int) aidesys::now(false, true);
}
}
}
// Reports results
{
printf("%s\n", results.asString(true).c_str());
}
}
