table_one_nearest_neighbor_error.py
"""
This script creates a table with the one nearest neighbor erroras obtained in embeddings with exact hyperbolic t-SNE vs.
embeddings that use the polar quad tree as acceleration data structure. The latter values are averaged over different
values of theta. Here, the one-nearest neighbor error is defined to be:
    1 - (number of points whose nearest embedding neighbor has the same label as them / number of all points).
"""

###########
# IMPORTS #
###########

from pathlib import Path
import numpy as np
from hyperbolicTSNE.data_loaders import Datasets, load_data
from hyperbolicTSNE.hd_mat_ import _distance_matrix
from hyperbolicTSNE.hyperbolic_barnes_hut.tsne import distance_py
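

# Illustrative sketch (not used by the experiment below): the one-nearest-neighbor error from the
# docstring, computed on a tiny hypothetical embedding with plain Euclidean distances. The points
# and labels are made up; the function only serves to make the definition concrete.
def _toy_one_nn_error_example():
    toy_Y = np.array([[0.0, 0.0], [0.1, 0.0], [1.0, 1.0], [1.1, 1.0]])  # four embedded points
    toy_labels = np.array([0, 0, 1, 0])  # hypothetical class labels
    num_wrong = 0
    for i in range(toy_Y.shape[0]):
        dists = np.linalg.norm(toy_Y - toy_Y[i], axis=1)
        dists[i] = np.inf  # a point is not its own neighbor
        nearest = int(np.argmin(dists))
        if toy_labels[nearest] != toy_labels[i]:
            num_wrong += 1
    # Equals 1 - (correctly matched points / all points), i.e., the fraction of mismatched points
    return num_wrong / toy_Y.shape[0]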

#################################
# GENERAL EXPERIMENT PARAMETERS #
#################################

results_path = Path("../results/timings_per_theta/")
K_HYPERBOLIC_NEIGHBOR_APPROXIMATION = 15  # Number of nearest neighbors to consider in a Euclidean approximation
# to find the actual nearest hyperbolic neighbor of each point
datasets = [
    Datasets.LUKK,
    Datasets.MYELOID8000,
    Datasets.PLANARIA,
    Datasets.MNIST,
    Datasets.C_ELEGANS,
    Datasets.WORDNET
]
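
# Hedged sketch: the loop at the bottom of this script assumes a results layout of the form
# <results_path>/<DATASET_NAME>/theta_<value>/final_embedding.npy. The helper below only reports
# which embeddings are present; it is illustrative and not called by the experiment.
def _report_available_embeddings(base_path=results_path, dataset_names=None):
    dataset_names = dataset_names or [d.name for d in datasets]
    for name in dataset_names:
        found = sorted((base_path / name).glob("theta_*/final_embedding.npy"))
        print(f"[Table One Nearest Neighbor Error] {name}: {len(found)} embeddings found")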


##################
# Helper Methods #
##################
def sort_dataframe(frame_to_be_sorted):
    """Sort a pandas DataFrame with a "dataset" column into the fixed dataset order used for the result tables."""
    frame_to_be_sorted.loc[frame_to_be_sorted.dataset == "LUKK", "order"] = 1
    frame_to_be_sorted.loc[frame_to_be_sorted.dataset == "MYELOID8000", "order"] = 2
    frame_to_be_sorted.loc[frame_to_be_sorted.dataset == "PLANARIA", "order"] = 3
    frame_to_be_sorted.loc[frame_to_be_sorted.dataset == "MNIST", "order"] = 4
    frame_to_be_sorted.loc[frame_to_be_sorted.dataset == "WORDNET", "order"] = 5
    frame_to_be_sorted.loc[frame_to_be_sorted.dataset == "C_ELEGANS", "order"] = 6
    frame_to_be_sorted = frame_to_be_sorted.sort_values(by="order", ascending=True)
    return frame_to_be_sorted
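

# Usage sketch for sort_dataframe (assumes pandas is installed; the frame contents below are
# hypothetical, this script itself never builds such a DataFrame). Not called by the experiment.
def _sort_dataframe_usage_sketch():
    import pandas as pd
    df = pd.DataFrame({"dataset": ["MNIST", "LUKK"], "error": [0.05, 0.02]})
    return sort_dataframe(df)  # rows now follow the fixed dataset order of the tables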


def one_nearest_neighbor_error(embedding_path):
    """Compute the one-nearest-neighbor error of the embedding stored at embedding_path.

    Relies on the global `labels` array of the dataset currently being processed.
    """
    # Load the embedding and compute an approximation of its distance matrix
    Y = np.load(embedding_path.joinpath("final_embedding.npy"))
    D_Y = _distance_matrix(Y, method="sklearn", n_neighbors=K_HYPERBOLIC_NEIGHBOR_APPROXIMATION)

    num_points = D_Y.shape[0]
    num_available_nn_ld = D_Y[0, :].nnz

    # Computation of ordered neighbourhoods
    nz_D_Y = D_Y.nonzero()
    nz_rows_D_Y = nz_D_Y[0].reshape(-1, num_available_nn_ld)  # row coordinate of nz elements from D_Y
    nz_cols_D_Y = nz_D_Y[1].reshape(-1, num_available_nn_ld)  # col coordinate of nz elements from D_Y
    nz_dists_D_Y = np.asarray(D_Y[nz_rows_D_Y, nz_cols_D_Y].todense())
    sorting_ids_nz_dists_D_Y = np.argsort(nz_dists_D_Y, axis=1)
    sorted_nz_cols_D_Y = nz_cols_D_Y[nz_rows_D_Y, sorting_ids_nz_dists_D_Y]  # sorted cols of nz_D_Y
    sorted_nz_cols_D_Y = sorted_nz_cols_D_Y[:, 0:K_HYPERBOLIC_NEIGHBOR_APPROXIMATION]  # only get NNs that will be used

    # The above approximates the neighborhood by Euclidean distances; replace these by hyperbolic ones and re-sort
    arr = np.zeros(sorted_nz_cols_D_Y.shape)
    for (i, j), v in np.ndenumerate(sorted_nz_cols_D_Y):
        arr[i, j] = distance_py(Y[i], Y[v])  # exact hyperbolic distance between embedded points i and v
    sorting_ids_arr = np.argsort(arr, axis=1)
    sorted_nz_cols_D_Y = np.take_along_axis(sorted_nz_cols_D_Y, sorting_ids_arr, axis=1)  # re-order each row by hyperbolic distance
    sorted_nz_cols_D_Y = sorted_nz_cols_D_Y[:, 0:1]  # only keep the single nearest neighbor

    # Compute the nearest neighbor error based on the labels of the points
    num_correct_one_nearest = 0
    for point_id in range(num_points):
        # If the nearest embedding neighbor of a point carries the same label, count the point as correct
        if labels[point_id] == labels[sorted_nz_cols_D_Y[point_id, 0]]:
            num_correct_one_nearest += 1

    return 1 - (num_correct_one_nearest / num_points)
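

# Reference sketch (assumption): distance_py from hyperbolicTSNE is used above as the exact hyperbolic
# distance between embedded points. Assuming the embedding lives in the Poincare disk, that distance
# corresponds to the formula below. The experiment keeps using distance_py; this is only for readability.
def _poincare_distance_sketch(u, v, eps=1e-12):
    # d(u, v) = arccosh(1 + 2 * ||u - v||^2 / ((1 - ||u||^2) * (1 - ||v||^2)))
    u, v = np.asarray(u), np.asarray(v)
    squared_diff = np.sum((u - v) ** 2)
    denom = max((1.0 - np.sum(u ** 2)) * (1.0 - np.sum(v ** 2)), eps)
    return np.arccosh(1.0 + 2.0 * squared_diff / denom)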


####################
# READING THE DATA #
####################
data = []  # placeholder for collected results; never filled in this script (see the sketch at the end)

# Iterate over the datasets
for dataset in datasets:

    print(f"[Table One Nearest Neighbor Error] Processing data {dataset.name}, nearest neighbor errors:")
    labels = load_data(dataset, to_return='labels')

    # For each dataset, find the embedding quality of the exact embedding
    exact_path = Path(f"{results_path}/{dataset.name}/theta_0.0/")
    exact_one_nearest_neighbor_error = one_nearest_neighbor_error(exact_path)
    print(f"Theta 0.0 (exact): {exact_one_nearest_neighbor_error}")

    theta_errors = []
    # For each dataset, iterate over the values of theta
    for theta in [n / 20 for n in range(20, -1, -1)]:
        theta_path = Path(f"{results_path}/{dataset.name}/theta_{theta}/")
        theta_one_nearest_neighbor_error = one_nearest_neighbor_error(theta_path)
        print(f"Theta {theta}: {theta_one_nearest_neighbor_error}")
        theta_errors.append(theta_one_nearest_neighbor_error)
    print(f"Average theta error: {sum(theta_errors) / len(theta_errors)}")
    print()
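
# Sketch of turning the printed numbers into an actual table (assumes pandas; the `data` list defined
# above is never filled by this script, so the appended rows below are hypothetical):
#
#   import pandas as pd
#   # inside the dataset loop, one could collect:
#   #   data.append({"dataset": dataset.name,
#   #                "exact_error": exact_one_nearest_neighbor_error,
#   #                "avg_theta_error": sum(theta_errors) / len(theta_errors)})
#   table = sort_dataframe(pd.DataFrame(data))
#   print(table.to_latex(index=False))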