"""
This script creates a table with the one nearest neighbor erroras obtained in embeddings with exact hyperbolic t-SNE vs.
embeddings that use the polar quad tree as acceleration data structure. The latter values are averaged over different
values of theta. Here, the one-nearest neighbor error is defined to be:
1 - (number of points whose nearest embedding neighbor has the same label as them / number of all points).
"""
###########
# IMPORTS #
###########
from pathlib import Path
import numpy as np
from hyperbolicTSNE.data_loaders import Datasets, load_data
from hyperbolicTSNE.hd_mat_ import _distance_matrix
from hyperbolicTSNE.hyperbolic_barnes_hut.tsne import distance_py
#################################
# GENERAL EXPERIMENT PARAMETERS #
#################################
DATASETS_DIR = "../datasets" # directory to read the data from
results_path = Path("../results/timings_per_theta")  # directory holding the embeddings to evaluate
K_HYPERBOLIC_NEIGHBOR_APPROXIMATION = 15 # Number of nearest neighbors to consider in a Euclidean approximation to find
# the actual nearest hyperbolic neighbor
datasets = [
Datasets.LUKK,
Datasets.MYELOID8000,
Datasets.PLANARIA,
Datasets.MNIST,
Datasets.C_ELEGANS,
Datasets.WORDNET
]
##################
# HELPER METHODS #
##################
def sort_dataframe(frame_to_be_sorted):
    """Sorts a results data frame into a fixed per-dataset order for presentation."""
frame_to_be_sorted.loc[frame_to_be_sorted.dataset == "LUKK", "order"] = 1
frame_to_be_sorted.loc[frame_to_be_sorted.dataset == "MYELOID8000", "order"] = 2
frame_to_be_sorted.loc[frame_to_be_sorted.dataset == "PLANARIA", "order"] = 3
frame_to_be_sorted.loc[frame_to_be_sorted.dataset == "MNIST", "order"] = 4
frame_to_be_sorted.loc[frame_to_be_sorted.dataset == "WORDNET", "order"] = 5
frame_to_be_sorted.loc[frame_to_be_sorted.dataset == "C_ELEGANS", "order"] = 6
frame_to_be_sorted = frame_to_be_sorted.sort_values(by="order", ascending=True)
return frame_to_be_sorted
def one_nearest_neighbor_error(embedding_path, labels):
    """Computes the one-nearest-neighbor error of the embedding stored at embedding_path, given the point labels."""
    # Load the embedding and compute an approximation of its distance matrix
Y = np.load(embedding_path.joinpath("final_embedding.npy"))
D_Y = _distance_matrix(Y, method="sklearn", n_neighbors=K_HYPERBOLIC_NEIGHBOR_APPROXIMATION)
num_points = D_Y.shape[0]
num_available_nn_ld = D_Y[0, :].nnz
# Computation of ordered neighbourhoods
nz_D_Y = D_Y.nonzero()
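    # The reshapes below assume that every row of D_Y holds the same number of
    # non-zero entries (num_available_nn_ld, read off row 0), i.e., that D_Y
    # stores a fixed-size nearest-neighbor graph.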
nz_rows_D_Y = nz_D_Y[0].reshape(-1, num_available_nn_ld) # row coordinate of nz elements from D_Y
nz_cols_D_Y = nz_D_Y[1].reshape(-1, num_available_nn_ld) # col coordinate of nz elements from D_Y
nz_dists_D_Y = np.asarray(D_Y[nz_rows_D_Y, nz_cols_D_Y].todense())
sorting_ids_nz_dists_D_Y = np.argsort(nz_dists_D_Y, axis=1)
sorted_nz_cols_D_Y = nz_cols_D_Y[nz_rows_D_Y, sorting_ids_nz_dists_D_Y] # sorted cols of nz_D_Y
sorted_nz_cols_D_Y = sorted_nz_cols_D_Y[:, 0:K_HYPERBOLIC_NEIGHBOR_APPROXIMATION] # only get NNs that will be used
# The above approximates the neighborhood by Euclidean distances, replace these by hyperbolic ones and resort
arr = np.zeros(sorted_nz_cols_D_Y.shape)
for (i, j), v in np.ndenumerate(sorted_nz_cols_D_Y):
arr[i, j] = distance_py(Y[i], Y[v])
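    # This pairwise loop issues O(num_points * K_HYPERBOLIC_NEIGHBOR_APPROXIMATION)
    # Python-level calls to distance_py and is likely the most expensive step of
    # this function for large embeddings.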
    sorting_ids_arr = np.argsort(arr, axis=1)
    # Per row, re-order the candidate columns by increasing hyperbolic distance;
    # the candidate matrix now has only K_HYPERBOLIC_NEIGHBOR_APPROXIMATION
    # columns, so the sorting indices are applied along axis 1 via take_along_axis
    sorted_nz_cols_D_Y = np.take_along_axis(sorted_nz_cols_D_Y, sorting_ids_arr, axis=1)
    sorted_nz_cols_D_Y = sorted_nz_cols_D_Y[:, 0]  # only keep the single nearest neighbor
# Compute the nearest neighbor error based on the labels of the points
num_correct_one_nearest = 0
for point_id in range(num_points):
# If the nearest point to a point carries the same label, increase the number of correct points
if labels[point_id] == labels[sorted_nz_cols_D_Y[point_id]]:
num_correct_one_nearest += 1
return 1 - (num_correct_one_nearest / num_points)
####################
# READING THE DATA #
####################
data = []  # collects one table row per dataset
# Iterate over the datasets
for dataset in datasets:
print(f"[Table One Nearest Neighbor Error] Processing data {dataset.name}, nearest neighbor errors:")
    labels = load_data(dataset, to_return='labels', data_home=DATASETS_DIR)
# For each dataset, find the embedding quality of the exact embedding
exact_path = Path(f"{results_path}/{dataset.name}/theta_0.0/")
    exact_one_nearest_neighbor_error = one_nearest_neighbor_error(exact_path, labels)
    print(f"Theta 0.0 (exact): {exact_one_nearest_neighbor_error}")
theta_errors = []
    # For each dataset, iterate over the theta values for which accelerated embeddings are available
    for theta in [0.5]:
theta_path = Path(f"{results_path}/{dataset.name}/theta_{theta}/")
        theta_one_nearest_neighbor_error = one_nearest_neighbor_error(theta_path, labels)
print(f"Theta {theta}: {theta_one_nearest_neighbor_error}")
theta_errors.append(theta_one_nearest_neighbor_error)
print(f"Average theta error: {sum(theta_errors) / len(theta_errors)}")
print()
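
################################
# ASSEMBLING THE RESULTS TABLE #
################################
# A minimal sketch that turns the rows collected in `data` above into the final
# table; it assumes pandas is available, and the column names and the output
# file name are illustrative choices, not fixed elsewhere in the code.
import pandas as pd  # imported here to keep the sketch self-contained

results_frame = pd.DataFrame(data, columns=["dataset", "exact_error", "average_accelerated_error"])
results_frame = sort_dataframe(results_frame).drop(columns="order")  # fixed presentation order
print(results_frame.to_string(index=False))
results_frame.to_csv(results_path.joinpath("one_nearest_neighbor_error.csv"), index=False)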