https://github.com/msmathcomp/hyperbolic-tsne
Tip revision: bba9d0f089659fb170c7270aa90c796f91bfb2b1 authored by Martin Skrodzki on 02 May 2024, 12:34:19 UTC
Update README.md
data_generation_full_size_one_run.py
"""
This experiment runs the hyperbolic tsne code changing several parameters:
- dataset: [LUKK, MYELOID8000, PLANARIA, MNIST, C_ELEGANS, WORDNET]
- tsne_type: [accelerated, exact]
- splitting strategy of the polar quad tree: [equal_area, equal_length]
For each data set and each configuration combination, it saves the embedding coordinates, a plot of the embedding, and
timing data for the iterations.
If a run does not finish, the results are not saved.
The code only computes the runs that do not have a folder.
"""
###########
# IMPORTS #
###########
import csv
import json
import traceback
from itertools import product
from pathlib import Path
import numpy as np
from scipy.sparse import issparse, save_npz
from matplotlib import pyplot as plt
from hyperbolicTSNE import Datasets, load_data, initialization, hd_matrix, SequentialOptimizer, HyperbolicTSNE
from hyperbolicTSNE.util import find_last_embedding
from hyperbolicTSNE.visualization import plot_poincare
#################################
# GENERAL EXPERIMENT PARAMETERS #
#################################
BASE_DIR = "../results/full_size_one_run" # directory where results will be saved
DATASETS_DIR = "../datasets" # directory to read the data from
# Constants
SEED = 42 # seed to initialize random processes
PERP = 30 # perplexity value to be used throughout the experiments
KNN_METHOD = "hnswlib" # use hnswlib for determining nearest neighbors in high-dimensional space; note that this is
# an approximation, switch to "sklearn" for an exact method
VANILLA = False # if vanilla is set to true, regular gradient descent without any modifications is performed; for
# vanilla set to false, the optimization makes use of momentum and gains
EXAG = 12 # the factor by which the attractive forces are amplified during early exaggeration
hd_params = {"perplexity": PERP}
# Variables
datasets = [
    Datasets.LUKK,
    Datasets.MYELOID8000,
    Datasets.PLANARIA,
    Datasets.MNIST,
    Datasets.C_ELEGANS,
    Datasets.WORDNET
]
tsne_types = ["accelerated", "exact"] # the type "accelerated" uses the polar quad tree for acceleration, "exact"
# uses no acceleration and runs in quadratic time per iteration
splitting_strategies = ["equal_length", "equal_area"] # the polar quad tree comes in two flavors: Splitting by equal
# area and by equal length in the embedding space. The "equal_length" splitting shows better performance in our
# experiments.
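# Note: with 6 datasets and 2 x 2 = 4 configurations (tsne_type x splitting_strategy) each, the grid below
# amounts to 24 runs in total.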
###################
# EXPERIMENT LOOP #
###################
for dataset in datasets:  # Iterate over the data sets
    rng = np.random.default_rng(seed=SEED)  # random number generator
    dataX, dataLabels = load_data(  # Load the data
        dataset,
        data_home=DATASETS_DIR,
        to_return="X_labels",  # Return the high-dimensional data and its labels
        hd_params=hd_params,
        knn_method=KNN_METHOD
    )
    n_samples = dataX.shape[0]
    sample_sizes = np.array([n_samples, ]).astype(int)  # only run the full-size sample, don't create sub-samples
    X_embedded = initialization(  # create an initial embedding of the data into 2-dimensional space via PCA
        n_samples=n_samples,
        n_components=2,
        X=dataX,
        random_state=rng.integers(0, 1000000),
        method="pca"
    )
    for config_id, config in enumerate(product(sample_sizes, tsne_types, splitting_strategies)):
        for run_n in [0, ]:  # do not repeat the run
            sample_size, tsne_type, splitting_strategy = config
            print(f"[experiment_grid] Processing {dataset}, run_id {run_n}, config_id ({config_id}): {config}")
            # Generate random sample
            idx = rng.choice(np.arange(n_samples), sample_size, replace=False)
            idx = np.sort(idx)
            dataX_sample = dataX[idx]
            dataLabels_sample = dataLabels[idx]
            X_embedded_sample = X_embedded[idx]
            D, V = hd_matrix(X=dataX_sample, hd_params=hd_params, knn_method=KNN_METHOD)  # Compute the NN matrix
            LR = dataX_sample.shape[0] / (EXAG * 1000)  # Compute the learning rate
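            # The heuristic n_samples / (EXAG * 1000) ties the learning rate to the dataset size, similar in
            # spirit to the common t-SNE "auto" learning-rate rule of n / early_exaggeration; the additional
            # factor of 1000 appears to be the authors' choice for the hyperbolic setting.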
            opt_params = SequentialOptimizer.sequence_poincare(
                learning_rate_ex=LR,  # learning rate for the early exaggeration phase
                learning_rate_main=LR,  # learning rate for the non-exaggerated gradient descent
                exaggeration=EXAG,
                vanilla=VANILLA,
                momentum_ex=0.5,  # momentum to be used during early exaggeration
                momentum=0.8,  # momentum to be used during non-exaggerated gradient descent
                exact=(tsne_type == "exact"),
                area_split=(splitting_strategy == "equal_area"),
                n_iter_check=10,  # needed for the early stopping criterion
                size_tol=0.999  # size of the embedding to be used as the early stopping criterion
            )
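            # Presumably, the optimizer checks every n_iter_check iterations whether the embedding has grown to
            # size_tol (99.9%) of the Poincaré disk and stops early once it has; see SequentialOptimizer for the
            # exact semantics.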
            run_dir = Path(f"{BASE_DIR}/{dataset.name}/size_{sample_size}/configuration_{config_id}/run_{run_n}/")
            if run_dir.exists():
                # Skip already computed embeddings
                print(f"[experiment_grid] - Exists so not computing it: {run_dir}")
            else:
                run_dir.mkdir(parents=True, exist_ok=True)
                params = {
                    "lr": LR,
                    "perplexity": PERP,
                    "seed": SEED,
                    "sample_size": int(sample_size),
                    "tsne_type": tsne_type,
                    "splitting_strategy": splitting_strategy
                }
                print(f"[experiment_grid] - Starting configuration {config_id} with dataset {dataset.name}: {params}")
                opt_params["logging_dict"] = {
                    "log_path": str(run_dir.joinpath("embeddings"))
                }
                # Save the run parameters, the sample indices, and the high-dimensional matrices for later use
                with open(run_dir.joinpath("params.json"), "w") as params_file:
                    json.dump(params, params_file)
                np.save(run_dir.joinpath("subset_idx.npy"), idx)
                if issparse(D):
                    save_npz(run_dir.joinpath("D.npz"), D)
                else:
                    np.save(run_dir.joinpath("D.npy"), D)
                if issparse(V):
                    save_npz(run_dir.joinpath("P.npz"), V)
                else:
                    np.save(run_dir.joinpath("P.npy"), V)
                hdeo_hyper = HyperbolicTSNE(  # Initialize an embedding object
                    init=X_embedded_sample,
                    n_components=X_embedded_sample.shape[1],
                    metric="precomputed",
                    verbose=2,
                    opt_method=SequentialOptimizer,
                    opt_params=opt_params
                )
                error_title = ""
                try:
                    res_hdeo_hyper = hdeo_hyper.fit_transform((D, V))  # Compute the hyperbolic embedding
                except ValueError:
                    error_title = "_error"
                    res_hdeo_hyper = find_last_embedding(opt_params["logging_dict"]["log_path"])
                    with open(run_dir.joinpath("traceback.txt"), "w") as tb_file:
                        traceback.print_exc(file=tb_file)
                    print("[experiment_grid] - Run failed ...")
                else:  # we only save the run data if there were no errors
                    print("[experiment_grid] - Finished running, saving run data directory ...")
                    # Save the final embedding coordinates
                    np.save(run_dir.joinpath("final_embedding.npy"), res_hdeo_hyper)
                    # Save a plot of the final embedding
                    fig = plot_poincare(res_hdeo_hyper, labels=dataLabels_sample)
                    fig.savefig(run_dir.joinpath(f"final_embedding{error_title}.png"))
                    plt.close(fig)
                    np.save(run_dir.joinpath("logging_dict.npy"), opt_params["logging_dict"])
                    # Write out the timings csv
                    timings = np.array(hdeo_hyper.optimizer.cf.results)
                    with open(run_dir.joinpath("timings.csv"), "w", newline="") as timings_file:
                        timings_writer = csv.writer(timings_file)
                        timings_writer.writerow(["it_n", "time_type", "total_time"])
                        for n, row in enumerate(timings):
                            timings_writer.writerow([n, "tree_building", row[0]])
                            timings_writer.writerow([n, "tot_gradient", row[1]])
                            timings_writer.writerow([n, "neg_force", row[2]])
                            timings_writer.writerow([n, "pos_force", row[3]])
                # Create a per-run overview csv file after every run (one file per run directory)
                with open(run_dir.joinpath("overview_part.csv"), "w", newline="") as overview_file:
                    overview_writer = csv.writer(overview_file)
                    overview_writer.writerow(["dataset", *params, "run", "run_directory", "error"])
                    overview_writer.writerow(
                        [dataset.name, *params.values(), run_n, str(run_dir).replace(BASE_DIR, "."),
                         error_title != ""])
                print()