https://github.com/wavestoweather/PCViewer.git
Raw File
Tip revision: e4a2a107c7790a4948783b0f8515ddf84afafc15 authored by Josef Stumpfegger on 24 November 2022, 12:56:34 UTC
Added links to the replicability video and dataset.
Tip revision: e4a2a10
DataClusterer.hpp
#pragma once

#include<iostream>
#include<vector>
#include<future>
#include<random>
#include<utility>
#include<Eigen/Dense>
#include<numeric>
#include "Data.hpp"
#include "DBScan.hpp"


//Single Class for data clustering. In the constructor different Clustering methods can be inserted to change the clustering
class DataClusterer{
public:
    enum class Method: int{
        KMeans,
        DBScan,
        Hirarchical
    };

    enum class DistanceMetric: int{
        Norm,
        SquaredNorm,
        L1Norm
    };

    enum class InitMethod: int{
        Forgy,
        UniformRandom,
        NormalRandom,
        PlusPlus
    };

    enum class KMethod: int{
        Mean,
        Median,
        Mediod
    };

    enum class HClusteringLinkage: int{
        Single,
        Complete,
        Weighted,
        Median,
        Average,
        Ward,
        Centroid
    };

    //contains settings for all clustering methods, the correct settings are read out
    struct ClusterSettings{
        //general settings
        DistanceMetric distanceMetric;

        //KMeans
        int kmeansClusters;
        KMethod kmeansMethod;
        InitMethod kmeansInitMethod;
        int maxIterations;

        //DBScan
        int dbscanMinPoints;
        float dbscanEpsilon;

        //Hirarchical
        HClusteringLinkage hclusteringLinkage;
        int hclusteringClusters;
    };

    DataClusterer(const Eigen::MatrixXf& points, Method method, const ClusterSettings& settings):settings(settings), points(points), method(method){
        future = std::async(runAsync, this);
    }

    static void runAsync(DataClusterer* c){
        c->run();
    }

    void run(){
        switch(method){
            case Method::KMeans:{
                execKMeans();
            }break;
            case Method::DBScan:{
                execDBScan();
            }break;
            case Method::Hirarchical:{
                execHirarchical();
            }break;
        }
        progress = 1;
        clustered = true;
    }

    std::vector<std::vector<uint32_t>> clusters;
    std::vector<Eigen::VectorXf> clusterMeans;
    std::future<void> future;
    float progress = 0;
    bool clustered = false;

protected:
    ClusterSettings settings;
    const Eigen::MatrixXf& points;
    bool clusterUpdated;
    Method method;
    std::vector<uint32_t> clusterAssignments;

    float distance(const Eigen::VectorXf& a,const Eigen::VectorXf& b){
        switch(settings.distanceMetric){
            case DistanceMetric::Norm: return (a - b).norm();
            case DistanceMetric::SquaredNorm: return (a - b).squaredNorm();
            case DistanceMetric::L1Norm: return (a - b).lpNorm<1>();
        }
        return 0;
    }

    void updateStepKMeans(){
        for(auto& vec: clusterMeans) vec.setZero();
        int c = 0;
        for(auto& cluster: clusters){
            for(uint32_t p: cluster){
                clusterMeans[c] += points.row(p);
            }
            clusterMeans[c] /= static_cast<float>(cluster.size());
            ++c;
        }
    }

    void assignmentStepKMeans(){
        clusters = std::vector<std::vector<uint32_t>>(clusters.size()); //erasing all assignments
        for(int i = 0; i < points.rows(); ++i){
            int clusterID = 0;
            float minDist = distance(points.row(i), clusterMeans[0]);
            for(int j = 1; j < settings.kmeansClusters; ++j){
                float dist = distance(points.row(i), clusterMeans[j]);
                if(dist < minDist){
                    clusterID = j;
                    minDist = dist;
                }
            }
            clusters[clusterID].push_back(i);
            clusterUpdated |= clusterAssignments[i] != clusterID;
            clusterAssignments[i] = clusterID; 
        }
    }

    void execKMeans(){
        //initialization
        clusterAssignments = std::vector<uint32_t>(points.rows(), 0);
        std::mt19937 engine;
        engine.seed(static_cast<unsigned long>(time(nullptr)));
        std::uniform_int_distribution<uint32_t> intDistCluster(0, settings.kmeansClusters - 1);
        std::uniform_int_distribution<uint32_t> intDist(0, points.rows());
        std::uniform_real_distribution<float> realDist(0, 1);
        clusterMeans = std::vector<Eigen::VectorXf>(settings.kmeansClusters);
        clusters.resize(settings.kmeansClusters);
        
        switch(settings.kmeansInitMethod){
            case InitMethod::Forgy:{
                for(auto& clusterMean: clusterMeans) clusterMean = points.row(intDist(engine));
                break;
            }
            case InitMethod::UniformRandom:{
                for(int i = 0; i < points.rows(); ++i) clusters[intDistCluster(engine)].push_back(i);
                for(auto& mean: clusterMeans) mean = Eigen::VectorXf::Zero(points.cols());
                updateStepKMeans();
                break;
            }
            case InitMethod::NormalRandom:{
                Eigen::RowVectorXf means = points.colwise().mean();
                Eigen::MatrixXf centered = points.rowwise() - means;
                Eigen::VectorXf stdDevs = (centered.colwise().squaredNorm() / static_cast<float>(points.rows()));
                stdDevs = stdDevs.array().sqrt();
                std::vector<float> randData(points.cols(), 0);
                std::vector<std::normal_distribution<float>> normalDists(points.cols());
                int i = 0;
                for(auto& dist: normalDists) dist = std::normal_distribution<float>(means[i++], stdDevs[i]);

                for(auto& mean: clusterMeans){
                    mean = Eigen::VectorXf::Zero(points.cols());
                    for(i = 0; i < points.cols(); ++i) mean[i] = normalDists[i](engine);
                }
                break;
            }
            case InitMethod::PlusPlus:{
                clusterMeans[0] = points.row(intDist(engine));
                Eigen::VectorXf probabilities(points.rows());
                std::fill_n(probabilities.data(), points.rows(), std::numeric_limits<float>::infinity());
                for(int i = 1; i < settings.kmeansClusters; ++i){
                    //updating all probabilities with the new found cluster
                    for(int j = 0; j < points.rows(); ++j){
                        float dist = distance(points.row(j), clusterMeans[i - 1]);
                        probabilities(j) = std::min(probabilities(j), dist);
                    }
                    float summedProbability = probabilities.sum();
                    if(summedProbability != 0){
                        float rand = realDist(engine) * summedProbability;
                        int j = 0;
                        for(; j < points.rows(); ++j){
                            rand -= probabilities(j);
                            if(rand < 0) break;
                        }
                        clusterMeans[i] = points.row(j);
                    }
                }
                break;
            }
        }
        progress = .1f;

        //k means iteration
        clusterUpdated = true;
        for(int counter = 0; counter < settings.maxIterations && clusterUpdated; ++counter){
            clusterUpdated = false;
            progress = counter / float(settings.maxIterations) * .9 + .1;

            assignmentStepKMeans();
            updateStepKMeans();
        }
        progress = 1;
        clustered = true;
    }
    void execDBScan(){
        DBScan dbscan(settings.dbscanMinPoints, settings.dbscanEpsilon, points, clusters, &progress);
        progress = 1;
        clustered = true;
    }

    // Hirarchical clustering-------------------------------------------------------------------
    struct Node{
        size_t left, right;
        float distance;
        size_t size;
    };
    struct LanceWilliamsCoeffs
    {
        double ai;
        double aj;
        double beta;
        double gamma;
    };

    std::vector<float> h_distances;
    std::vector<std::pair<uint32_t, uint32_t>> h_indices;
    std::vector<Node> h_nodes;
    std::vector<size_t> h_nodeIDs;
    
    inline uint32_t idx(uint32_t x, uint32_t y){
        uint32_t n = points.rows();
        return x * (2 * n - 1 - x) / 2 + (y - x - 1);
    }
    
    void computeDistanceMatrix(){
        size_t p = 0;
        for(uint32_t y = 0; y < points.rows(); ++y){
            for(uint32_t x = y + 1; x < points.rows(); ++x){
                const uint32_t index = idx(y, x);
                progress = .2f * (double(p++) / (double(points.rows()) * points.rows() / 2.0));
                const Eigen::VectorXf vecX = points.row(x);
                const Eigen::VectorXf vecY = points.row(y);

                double measure = distance(vecX, vecY);
                h_distances[index] = measure;
                if(index < 100)
                    bool breakHard = true;
                h_indices[index] = std::make_pair(y,x);
            }
        }
    }

    LanceWilliamsCoeffs computeCoefficients(size_t i, size_t j, size_t k){
        size_t nI, nJ, nK;
        size_t nodeID = 0;
        switch(settings.hclusteringLinkage){
            case HClusteringLinkage::Single:
                return {.5, .5, 0, -.5f};
            case HClusteringLinkage::Complete:
                return {.5, .5, 0, .5};
            case HClusteringLinkage::Weighted:
                return {.5, .5, 0, 0};
            case HClusteringLinkage::Median:
                return {.5, .5, -.25, 0};
            case HClusteringLinkage::Average:{
                nodeID = h_nodeIDs[i] - points.rows();
                nI = (nodeID >= 0) ? h_nodes[nodeID].size : 1;
                nodeID = h_nodeIDs[j] - points.rows();
                nJ = (nodeID >= 0) ? h_nodes[nodeID].size : 1;

                const double sumN = static_cast<double>(nI + nJ);
                const double ai = nI / sumN;
                const double aj = nJ / sumN;

                return { ai, aj, 0.0, 0.0 };
            }
            case HClusteringLinkage::Ward:{
                nodeID = h_nodeIDs[i] - points.rows();
                nI = (nodeID >= 0) ? h_nodes[nodeID].size : 1;
                nodeID = h_nodeIDs[j] - points.rows();
                nJ = (nodeID >= 0) ? h_nodes[nodeID].size : 1;
                nodeID = h_nodeIDs[k] - points.rows();
                nK = (nodeID >= 0) ? h_nodes[nodeID].size : 1;

                const double sumN = static_cast<double>(nI + nJ + nK);
                const double ai = (nI + nK) / sumN;
                const double aj = (nJ + nK) / sumN;
                const double beta = nK / sumN * (-1);

                return { ai, aj, beta, 0.0 };
            }
            case HClusteringLinkage::Centroid:{
                nodeID = h_nodeIDs[i] - points.rows();
                nI = (nodeID >= 0) ? h_nodes[nodeID].size : 1;
                nodeID = h_nodeIDs[j] - points.rows();
                nJ = (nodeID >= 0) ? h_nodes[nodeID].size : 1;

                const double sumN = static_cast<double>(nI + nJ);
                const double ai = nI / sumN;
                const double aj = nJ / sumN;
                const double beta = (-nI * nJ) / (sumN * sumN);

                return { ai, aj, beta, 0.0 };
            }
        }
        return {};
    }

    // Lance-Williams algorithm
    float computeNewDistance(float dKI, float dKJ, size_t i, size_t j, size_t k){
        const LanceWilliamsCoeffs c = computeCoefficients(i, j, k);
        
        double betaTerm = 0, gammaTerm = 0;

        if(c.beta > 0){
            const double valIJ = h_distances[idx(i, j)];
            betaTerm = c.beta * valIJ;
        }
        if(c.gamma > 0){
            const double absD = std::abs(dKI - dKJ);
            gammaTerm = c.gamma * absD;
        }

        return c.ai * dKI + c.aj * dKJ + betaTerm + gammaTerm;
    }

    void execHirarchical(){
        h_distances.resize(points.rows() * (points.rows() - 1) / 2);
        h_indices.resize(h_distances.size());
        h_nodes.resize(points.rows());
        h_nodeIDs.resize(points.rows());
        computeDistanceMatrix();
        progress = .2f;

        const size_t numRuns = points.rows() - 1;

        const float maxValue = std::numeric_limits<float>::max();
        std::iota(h_nodeIDs.begin(), h_nodeIDs.end(), 0);   //filling with 0 to i

        for(size_t counter = 0; counter < numRuns; ++counter){
            progress = .2f + .4f * counter / numRuns;
            auto minIt = std::min_element(h_distances.begin(), h_distances.end());
            float minDist = *minIt;
            size_t i = minIt - h_distances.begin();

            const auto& indices = h_indices[i];
            const size_t& indexI = indices.first, & indexJ = indices.second;

            for(size_t ii = 0; ii < indexI; ++ii){
                const float dKI = h_distances[idx(ii, indexI)];
                const float dKJ = h_distances[idx(ii, indexJ)];

                h_distances[idx(ii, indexI)] = computeNewDistance(dKI, dKJ, indexI, indexJ, ii);
                h_distances[idx(ii, indexJ)] = maxValue;
            }

            for(size_t ii = indexI + 1; ii < indexJ; ++ii){
                const float dKI = h_distances[idx(indexI, ii)];
                const float dKJ = h_distances[idx(ii, indexJ)];

                h_distances[idx(indexI, ii)] = computeNewDistance(dKI, dKJ, indexI, indexJ, ii);
                h_distances[idx(ii, indexJ)] = maxValue;
            }

            for(size_t ii = indexJ + 1; ii < points.rows(); ++ii){
                const float dKI = h_distances[idx(indexI, ii)];
                const float dKJ = h_distances[idx(indexJ, ii)];

                h_distances[idx(indexI, ii)] = computeNewDistance(dKI, dKJ, indexI, indexJ, ii);
                h_distances[idx(indexJ, ii)] = maxValue;
            }

            h_distances[idx(indexI, indexJ)] = maxValue;

            auto& node = h_nodes[counter];

            node.distance = minDist;
            node.left = size_t(h_nodeIDs[indexI]);
            node.right = size_t(h_nodeIDs[indexJ]);
            node.size = (node.left >= points.rows()) ? h_nodes[node.left - points.rows()].size : 1;
            node.size += (node.right >= points.rows()) ? h_nodes[node.right - points.rows()].size : 1;

            h_nodeIDs[indexI] = counter + points.rows();
        }

        //cutting the node trees
        clusters.resize(settings.hclusteringClusters);
        uint32_t cluster = 0;
        std::fill(h_nodeIDs.begin(), h_nodeIDs.end(), 0);
        for(int64_t i = points.rows() - 2; i >= 0; --i){
            progress = .6f + .4f - (.4f * i / points.rows());
            const Node& node = h_nodes[i];
            size_t& nodeID = h_nodeIDs[i];

            if(cluster < settings.hclusteringClusters - 1){
                if(node.left < points.rows()) clusters[nodeID].push_back(node.left);
                else h_nodeIDs[node.left - points.rows()] = nodeID;

                cluster++;

                if (node.right < points.rows()) clusters[cluster].push_back(node.right);
                else h_nodeIDs[node.right - points.rows()] = cluster;
                continue;
            }

            const size_t leftID = node.left;
            if (leftID >= points.rows()) h_nodeIDs[leftID - points.rows()] = nodeID;
            else clusters[nodeID].push_back(leftID);

            const size_t rightID = node.right;
            if (rightID >= points.rows()) h_nodeIDs[rightID - points.rows()] = nodeID;
            else clusters[nodeID].push_back(rightID);
        }
    }
};
back to top