# table_relative_gradient_error.py
# From https://github.com/msmathcomp/hyperbolic-tsne
"""
This script iterates over the results created by the `samples per data set` script. For each final exact embedding,
it extracts the exact cost function value, finds the corresponding accelerated embeddings, and extracts their respective
cost function values. From these cost function values, a relative error is computed and the statistics across this
relative errors are appended to the original data, which is then stored.
"""
###########
# IMPORTS #
###########
import ctypes
import json
import math
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

from hyperbolicTSNE import Datasets, load_data
from hyperbolicTSNE.hd_mat_ import hd_matrix
from hyperbolicTSNE.hyperbolic_barnes_hut.tsne import distance_py, gradient
######################
# HELPER FUNCTION(S) #
######################
def squared_hyperbolic_distance(x, y):
    """Squared hyperbolic distance between two embedding points, via the Cython helper `distance_py`."""
    return math.pow(distance_py(x, y), 2)
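
# A minimal pure-NumPy sketch of the hyperbolic (Poincaré-disk) distance that `distance_py`
# is assumed to implement; it is kept here only as an illustration of the metric used below
# and is not called by the experiment itself.
def poincare_distance_sketch(u, v):
    u = np.asarray(u, dtype=float)
    v = np.asarray(v, dtype=float)
    sq_norm_diff = np.sum((u - v) ** 2)  # squared Euclidean distance between the points
    denom = (1.0 - np.sum(u ** 2)) * (1.0 - np.sum(v ** 2))  # both points must lie inside the unit disk
    return np.arccosh(1.0 + 2.0 * sq_norm_diff / denom)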
#################################
# GENERAL EXPERIMENT PARAMETERS #
#################################
results_path = Path("../results/samples_per_data_set/")
# Load the overview of the experiments
df = pd.read_csv(results_path.joinpath("overview.csv"))
# Build one row per data set; the per-data-set gradient error statistics are appended to it below
averaged_results = df[["dataset"]].groupby(["dataset"]).first().reset_index()
# Constants
THETA = 0.5 # Theta value to be used in the approximate version of t-SNE
EMBEDDING_DIMENSIONS = 2 # Embed into two dimensions
VERBOSE = 0 # No additional debugging information when computing the gradient
NUM_THREADS = 4 # Number of threads to be used in gradient computation
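# Note on THETA: in the Barnes-Hut approximation, a distant quadtree cell is summarized by a
# single representative point whenever its extent divided by its distance falls below THETA.
# THETA = 0.0 recovers the exact quadratic computation; larger values trade accuracy for
# speed, with 0.5 a common default.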
###################
# EXPERIMENT LOOP #
###################
for i, dataset_record in enumerate(averaged_results.to_records()):
    dataset = dataset_record.dataset
    dataX = load_data(Datasets[dataset], to_return="X")
    # Iterate over all records for this data set
    for record in df[(df.dataset == dataset)].to_records():
        gradient_errors_relative = []
        # Read the data from the specific experiment; only the leading "." of the stored
        # relative path is swapped for the results directory
        record_path = record.run_directory.replace(".", str(results_path), 1)
        subset_idx = np.load(f"{record_path}/subset_idx.npy")
        with open(f"{record_path}/params.json", "r") as f:
            data = json.load(f)
        hd_params = {"perplexity": data["perplexity"]}
        # D holds the pairwise high-dimensional distances (unused below); V the sparse affinities
        D, V = hd_matrix(X=dataX[subset_idx], hd_params=hd_params)
        val_V = V.data
        neighbors = V.indices.astype(np.int64, copy=False)
        indptr = V.indptr.astype(np.int64, copy=False)
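        # `val_V`, `neighbors`, and `indptr` are the raw CSR buffers of V: the nonzero
        # affinity values, their column indices, and the per-row offsets into both arrays,
        # which is the layout the Cython gradient routine consumes below.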
        # Iterate over all embeddings created for this specific data set
        for path in tqdm(list(Path(f"{record_path}/embeddings/").rglob("*.csv"))):
            if path.name.startswith("._"):  # skip macOS metadata files
                continue
            # Load the embedding snapshot, one 2D point per row
            Y = np.loadtxt(path, delimiter=",")
            Y = Y.astype(ctypes.c_double, copy=False)
            # Compute the Barnes-Hut-approximated gradient
            grad_approx = np.zeros(Y.shape, dtype=ctypes.c_double)
            timings = np.zeros(4, dtype=ctypes.c_float)
            gradient(
                timings,
                val_V, Y, neighbors, indptr, grad_approx,
                theta=THETA,
                n_dimensions=EMBEDDING_DIMENSIONS,
                verbose=VERBOSE,
                compute_error=True,
                num_threads=NUM_THREADS,
                exact=False
            )
            # Compute the exact gradient
            grad_exact = np.zeros(Y.shape, dtype=ctypes.c_double)
            timings_exact = np.zeros(4, dtype=ctypes.c_float)
            gradient(
                timings_exact,
                val_V, Y, neighbors, indptr, grad_exact,
                theta=THETA,
                n_dimensions=EMBEDDING_DIMENSIONS,
                verbose=VERBOSE,
                compute_error=True,
                num_threads=NUM_THREADS,
                exact=True,
            )
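            # The relative error below treats each gradient as a list of 2D points and measures,
            # in the hyperbolic metric,
            #     rel_err = sqrt(sum_i d(g~_i, g_i)^2) / sqrt(sum_i d(g_i, 0)^2),
            # where d is the hyperbolic distance and g~_i, g_i are the approximate and exact
            # per-point gradient rows.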
            # Compute the error between the two gradient representations
            norm_difference = math.sqrt(sum(
                [squared_hyperbolic_distance(a, b) for a, b in zip(grad_approx, grad_exact)]
            ))
            norm_exact = math.sqrt(sum([squared_hyperbolic_distance(a, np.array([0., 0.])) for a in grad_exact]))
            gradient_errors_relative.append(norm_difference / norm_exact)
        # Compute descriptive statistics on the relative gradient errors
        avg_gradient_errors_relative = pd.DataFrame(gradient_errors_relative).describe().T.add_prefix("gradient_")
        for name, values in avg_gradient_errors_relative.items():
            averaged_results.at[i, name] = values.item()
    # Store results
    averaged_results.to_csv(results_path.joinpath(f"5-3_output_gradient-chkpt-{dataset}.csv"), index=False)
    print(f"Finished processing {dataset}")