# Source repository: https://github.com/msmathcomp/hyperbolic-tsne
# table_relative_cost_function_error.py
"""
This script iterates over the results created by the `samples per data set` script. For each final exact embedding,
it extracts the exact cost function value, finds the corresponding accelerated embeddings, and extracts their respective
cost function values. From these cost function values, a relative error is computed and the statistics across this
relative errors are appended to the original data, which is then stored.
"""
###########
# IMPORTS #
###########
from pathlib import Path
import numpy as np
import pandas as pd
######################
# HELPER FUNCTION(S) #
######################
def max_iteration_cost_function(df_record):
    """
    Return the highest iteration number found in the record's `embeddings`
    directory, together with the cost function value at that iteration.
    """
    # Resolve the run directory against the results path; replace only the
    # leading "." so that dots elsewhere in the path are left untouched.
    record_path = df_record.run_directory.replace(".", str(results_path), 1)
    max_it = 0
    max_it_cf = 0.0
    for path in Path(f"{record_path}/embeddings/").rglob("*.csv"):
        # Skip macOS metadata files
        if path.name.startswith("._"):
            continue
        # The file stem encodes "<iteration>,<cost function value>"
        iteration_number, cost_function_value = str(path.stem).split(',')
        if int(iteration_number) > max_it:
            max_it = int(iteration_number)
            max_it_cf = float(cost_function_value)
    return max_it, max_it_cf
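# For illustration (hypothetical values): an embeddings file with the stem
# "750,0.1234" parses into iteration 750 and cost function value 0.1234, so a
# directory containing "250,0.5.csv" and "750,0.1234.csv" yields (750, 0.1234).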
####################
# READING THE DATA #
####################
# Use the data generated from the various runs on the data sets with several sample sizes
results_path = Path("../results/samples_per_data_set/")
# Load the overview of the experiments
df = pd.read_csv(results_path.joinpath("overview.csv"))
# For each data set, find the maximum sample size, i.e., the size of the full data set
maxes = df.groupby(["dataset"]).sample_size.transform("max")
# Filter the data frame to only include these
df = df[df.sample_size == maxes]
# Filter to only include one instance of the exact solutions (as they all have the same results)
exact_results = df[(df.tsne_type == "exact")].groupby(["dataset"]).first().reset_index()
accelerated_results = df[(df.tsne_type == "accelerated") & (df.splitting_strategy == "equal_length")]
# Iterate over the exact solutions and compare to the various approximated ones
for i, record in enumerate(exact_results.to_records()):
    # Get the maximum iteration number and corresponding cost function value for this exact record
    max_iteration, max_iteration_cf = max_iteration_cost_function(record)
    print(f"{record.dataset} {record.tsne_type} {max_iteration} {max_iteration_cf}")
    # List to store the relative cost function errors of the accelerated embeddings
    cf_errors_relative = []
    # Iterate over all accelerated results for the current data set
    for accelerated_record in accelerated_results[accelerated_results.dataset == record.dataset].to_records():
        # Get the maximum iteration number and corresponding cost function value for the accelerated record
        max_iteration_accelerated, max_iteration_cf_accelerated = max_iteration_cost_function(accelerated_record)
        # Ensure that this record is for the correct data set and ran for the same number of iterations
        if accelerated_record.dataset != record.dataset or max_iteration_accelerated != max_iteration:
            raise ValueError(
                f"When processing {record.dataset}, could not find an accelerated result with {max_iteration} "
                f"iterations, found one with {max_iteration_accelerated} iterations."
            )
        print(
            f"{accelerated_record.dataset} {accelerated_record.tsne_type} {max_iteration_accelerated} "
            f"{max_iteration_cf_accelerated}"
        )
        # Compute the relative error of the cost function: |C_exact - C_accelerated| / C_exact
        cf_error_relative = np.abs(max_iteration_cf - max_iteration_cf_accelerated) / max_iteration_cf
        cf_errors_relative.append(cf_error_relative)
    # Compute descriptive statistics (count, mean, std, min, quartiles, max) on the cost function errors
    avg_cf_errors_relative = pd.DataFrame(cf_errors_relative).describe().T.add_prefix("cf_")
    for name, values in avg_cf_errors_relative.items():
        exact_results.at[i, name] = values.item()
    # Store the results; the output file is rewritten after each processed data set
    exact_results.to_csv(results_path.joinpath("relative_cost_function_errors.csv"), index=False)
    print(f"Finished processing {record.dataset}.")