# https://github.com/ExpelliarmusSuperComp/Expelliarmus
# Raw File
# Tip revision: 83a8b7d8fc3d3b7dd4ac7ef5df4c02bd648bc526 authored by ExpelliarmusSuperComp on 07 August 2023, 14:18:01 UTC
# LICENSE
# Tip revision: 83a8b7d
# VMISimilarity.py
import sys
from collections import defaultdict

from StaticInfo import StaticInfo
from GuestFSHelper import GuestFSHelper
from VMIDescription import VMIDescriptor

class SimilarityCalculator:
    """
    Collection of static helpers that compare the package sets of VMI descriptors.

    A descriptor is expected to offer ``getNodeData()`` (package name -> package data),
    ``getNodeDataFromMainServicesSubtrees()``, ``checkIfNodeExists(name)`` and the
    attributes ``pathToVMI``, ``vmiName``, ``mainServices``, ``distribution``,
    ``distributionVersion`` and ``architecture`` -- these are the only members used here.

    All output uses ``print(<single string>)``, which prints the same text under the
    Python 2 print statement and the Python 3 print function.
    """

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _packagesMatch(pkg1Data, pkg2Data):
        """
        Tell whether two package data records describe the same package build.

        :param dict pkg1Data: package data taken from the first descriptor
        :param dict pkg2Data: package data taken from the second descriptor
        :return: True if the versions are equal and the architectures are equal
                 or at least one of them is "all"
        """
        if pkg1Data[StaticInfo.dictKeyVersion] != pkg2Data[StaticInfo.dictKeyVersion]:
            return False
        arch1 = pkg1Data[StaticInfo.dictKeyArchitecture]
        arch2 = pkg2Data[StaticInfo.dictKeyArchitecture]
        return arch1 == arch2 or arch1 == "all" or arch2 == "all"

    @staticmethod
    def _comparedPackageNames(vmi1, vmi2, g1NodesDict, g2NodesDict, onlyOnMainServices):
        """
        Build the set of package names that forms the denominator of a comparison.

        :param VMIDescriptor vmi1:
        :param VMIDescriptor vmi2:
        :param dict g1NodesDict: result of vmi1.getNodeData()
        :param dict g2NodesDict: result of vmi2.getNodeData()
        :param Boolean onlyOnMainServices: if set, use the union of both main service
               subtrees, otherwise the union of all packages of both descriptors
        :return: set of package names
        """
        if onlyOnMainServices:
            return set(vmi1.getNodeDataFromMainServicesSubtrees().keys()).union(
                set(vmi2.getNodeDataFromMainServicesSubtrees().keys()))
        return set(g1NodesDict.keys()).union(set(g2NodesDict.keys()))

    @staticmethod
    def _createDescriptor(pathToVMI, vmiName, mainServices):
        """
        Open a guest handle for the image, build its VMIDescriptor and shut the handle down.

        The handle is shut down in a ``finally`` clause so that it is released even if
        building the descriptor raises.

        :param pathToVMI: path handed to GuestFSHelper.getHandle and VMIDescriptor
        :param vmiName: name handed to VMIDescriptor
        :param mainServices: main services handed to VMIDescriptor
        :return: the created VMIDescriptor
        """
        (guest, root) = GuestFSHelper.getHandle(pathToVMI, rootRequired=True)
        try:
            return VMIDescriptor(pathToVMI, vmiName, mainServices, guest, root)
        finally:
            GuestFSHelper.shutdownHandle(guest)

    # --------------------------------------------------------------- public API

    @staticmethod
    def checkMainServicesExistence(vmiDescriptor1,mainServices):
        """
        Abort the program if one of the main services is not a node of the descriptor.

        :param VMIDescriptor vmiDescriptor1: descriptor that is queried via checkIfNodeExists
        :param mainServices: iterable of package names
        :return: None; calls sys.exit (raises SystemExit) with an error message for the
                 first main service that does not exist
        """
        for pkgName in mainServices:
            if not vmiDescriptor1.checkIfNodeExists(pkgName):
                sys.exit("Error: Main Service \"" + pkgName + "\" does not exist in " + vmiDescriptor1.pathToVMI)

    @staticmethod
    def checkCompatibility(vmiDescriptor1, vmiDescriptor2):
        """
        Check that two descriptors agree in distribution, distribution version and architecture.

        The attributes are checked in that order; the first difference is printed and
        ends the check.

        :param VMIDescriptor vmiDescriptor1:
        :param VMIDescriptor vmiDescriptor2:
        :return: True if all three attributes are equal, False otherwise
        """
        checkedAttributes = (
            ("distributions", "distribution"),
            ("distribution versions", "distributionVersion"),
            ("architectures", "architecture"),
        )
        for label, attributeName in checkedAttributes:
            value1 = getattr(vmiDescriptor1, attributeName)
            value2 = getattr(vmiDescriptor2, attributeName)
            if value1 != value2:
                # Report the values of the attribute that actually differs (the architecture
                # message used to show the distribution versions).
                print("Mapping: Check Compatibility failed: %s differ! (%s vs. %s)" % (label, value1, value2))
                return False
        return True

    @staticmethod
    def computeSimilarityBetweenVMIDescriptorsSimple(vmi1, vmi2, onlyOnMainServices):
        """
        Compute the unweighted similarity of two descriptors and print a summary.

        similarity = |compared packages that match in name, version and architecture|
                     / |compared packages|

        :param VMIDescriptor vmi1:
        :param VMIDescriptor vmi2:
        :param Boolean onlyOnMainServices: compare only packages of the main service
               subtrees of both descriptors instead of all packages
        :return: similarity as float; 0.0 if there is no package to compare
        """
        g1NodesDict = vmi1.getNodeData()
        g2NodesDict = vmi2.getNodeData()
        numG1Nodes = len(g1NodesDict)
        numG2Nodes = len(g2NodesDict)

        comparedNames = SimilarityCalculator._comparedPackageNames(
            vmi1, vmi2, g1NodesDict, g2NodesDict, onlyOnMainServices)
        numAllNodes = len(comparedNames)

        # Only names that occur in both descriptors can match at all
        numMatches = 0
        for pkgName in comparedNames:
            if pkgName in g1NodesDict and pkgName in g2NodesDict \
                    and SimilarityCalculator._packagesMatch(g1NodesDict[pkgName], g2NodesDict[pkgName]):
                numMatches = numMatches + 1

        # Guard against an empty comparison set (would be a division by zero)
        if numAllNodes:
            similarity = float(numMatches) / float(numAllNodes)
        else:
            similarity = 0.0

        if onlyOnMainServices:
            print("\nComparison of two VMIs (Only on main services!):\n"
                  "\tVMI 1: %i packages\n"
                  "\tVMI 2: %i packages\n"
                  "\t\t %i main service related packages match in name, version and architecture\n"
                  "\t\t %i compared packages in total (union of main services from both VMIs)\n"
                  "\t\t similarity = %i/%i = %.3f"
                  % (numG1Nodes, numG2Nodes, numMatches, numAllNodes, numMatches, numAllNodes, similarity))
        else:
            print("\nComparison of two VMIs:\n"
                  "\tVMI 1: %i packages\n"
                  "\tVMI 2: %i packages\n"
                  "\t\t %i packages match in name, version and architecture\n"
                  "\t\t similarity = %i/%i = %.3f"
                  % (numG1Nodes, numG2Nodes, numMatches, numMatches, numAllNodes, similarity))
        return similarity

    @staticmethod
    def computeWeightedSimilarityBetweenVMIDescriptors(vmi1, vmi2, onlyOnMainServices, verbose=True):
        """
        Compute the install-size weighted similarity of two descriptors.

        The weight of a package is its install size (the larger one if both descriptors
        contain the package), normalized by the largest install size of all compared
        packages.

        similarity = sum of weights of compared packages that match in name, version
                     and architecture
                     / sum of weights of all compared packages

        :param VMIDescriptor vmi1:
        :param VMIDescriptor vmi2:
        :param Boolean onlyOnMainServices: compare only packages of the main service
               subtrees of both descriptors instead of all packages
        :param Boolean verbose: print a summary of the comparison
        :return: similarity as float; 0.0 if there is nothing to compare or the total
                 weight is zero
        """
        g1NodesDict = vmi1.getNodeData()
        g2NodesDict = vmi2.getNodeData()
        numG1Nodes = len(g1NodesDict)
        numG2Nodes = len(g2NodesDict)

        comparedNames = SimilarityCalculator._comparedPackageNames(
            vmi1, vmi2, g1NodesDict, g2NodesDict, onlyOnMainServices)
        numAllNodes = len(comparedNames)

        # Raw weight per package: the largest install size known for that name.
        # Names contained in neither descriptor carry no weight and are skipped.
        weights = {}
        for pkgName in comparedNames:
            sizes = [float(nodes[pkgName][StaticInfo.dictKeyInstallSize])
                     for nodes in (g1NodesDict, g2NodesDict) if pkgName in nodes]
            if sizes:
                weights[pkgName] = max(sizes)

        # Largest install size, used to normalize the weights
        maxInstallSize = max(weights.values()) if weights else 0.0

        numMatches = 0
        sumNormSizeAll = 0.0
        sumNormSizeMatches = 0.0
        for pkgName, weight in weights.items():
            # With maxInstallSize == 0 every weight is 0, avoid dividing by zero
            normalizedWeight = weight / maxInstallSize if maxInstallSize != 0 else 0.0
            sumNormSizeAll = sumNormSizeAll + normalizedWeight
            if pkgName in g1NodesDict and pkgName in g2NodesDict \
                    and SimilarityCalculator._packagesMatch(g1NodesDict[pkgName], g2NodesDict[pkgName]):
                numMatches = numMatches + 1
                sumNormSizeMatches = sumNormSizeMatches + normalizedWeight

        if sumNormSizeAll != 0:
            similarity = float(sumNormSizeMatches) / float(sumNormSizeAll)
        else:
            similarity = 0.0

        if verbose:
            if onlyOnMainServices:
                print("\nWeighted Comparison of two VMIs (Only on main services!):\n"
                      "\tGraph 1: %i packages\n"
                      "\tGraph 2: %i packages\n"
                      "\t\t %i main service related packages match in name, version and architecture\n"
                      "\t\t %i compared packages in total (union of main services from both VMIs)\n"
                      "\t\t similarity = %.3f/%.3f = %.3f"
                      % (numG1Nodes, numG2Nodes, numMatches, numAllNodes,
                         sumNormSizeMatches, sumNormSizeAll, similarity))
            else:
                # Show the weighted sums that actually form the similarity (this line used
                # to print a truncated weight over the plain package count).
                print("\nWeighted Comparison of two VMIs:\n"
                      "\tVMI 1: %i packages\n"
                      "\tVMI 2: %i packages\n"
                      "\t\t %i packages match in name, version and architecture\n"
                      "\t\t similarity = %.3f/%.3f = %.3f"
                      % (numG1Nodes, numG2Nodes, numMatches, sumNormSizeMatches, sumNormSizeAll, similarity))
        return similarity

    @staticmethod
    def computeSimilarityOneToOne(pathToVMI1, mainServices1, pathToVMI2, mainServices2, onlyOnMainServices):
        """
        Create descriptors for two images and compute their weighted similarity.

        :param pathToVMI1: path to the first image
        :param mainServices1: main services of the first image
        :param pathToVMI2: path to the second image
        :param mainServices2: main services of the second image
        :param Boolean onlyOnMainServices: see computeWeightedSimilarityBetweenVMIDescriptors
        :return: weighted similarity as float; exits the program if a main service
                 does not exist in its image
        """
        # Create Descriptors/Graphs for each VMI
        print("\n=== Creating Descriptor for VMI \"%s\"" % (pathToVMI1))
        vmi1 = SimilarityCalculator._createDescriptor(pathToVMI1, "internal_vmi1", mainServices1)

        print("\n=== Creating Descriptor for VMI \"%s\"" % (pathToVMI2))
        vmi2 = SimilarityCalculator._createDescriptor(pathToVMI2, "internal_vmi2", mainServices2)

        # Check if Main Services exist
        SimilarityCalculator.checkMainServicesExistence(vmi1, mainServices1)
        SimilarityCalculator.checkMainServicesExistence(vmi2, mainServices2)

        # Compute Similarity
        return SimilarityCalculator.computeWeightedSimilarityBetweenVMIDescriptors(vmi1, vmi2, onlyOnMainServices)

    @staticmethod
    def computeSimilarityManyToMany(vmiData, onlyOnMainServices):
        """
        Compute the weighted similarity between every ordered pair of the given images.

        :param vmiData: sized sequence of (pathToVMI, vmiFileName, mainServices) tuples
        :param Boolean onlyOnMainServices: see computeWeightedSimilarityBetweenVMIDescriptors
        :return: defaultdict(dict) with similarities[vmiName1][vmiName2] = similarity;
                 the entry is None where both descriptors have the same pathToVMI
        """
        if onlyOnMainServices:
            print("=====Calculating similarities with respect to main services between each of %i VMIs"
                  % len(vmiData))
        else:
            print("=====Calculating similarities between each of %i VMIs" % len(vmiData))

        descriptors = []
        for count, (pathToVMI, vmiFileName, mainServices) in enumerate(vmiData, 1):
            print("Creating Descriptor for vmi \"%s\" (%i/%i)..." % (vmiFileName, count, len(vmiData)))
            descriptors.append(SimilarityCalculator._createDescriptor(pathToVMI, vmiFileName, mainServices))

        similarities = defaultdict(dict)
        for vmi1 in descriptors:
            print("Similarities for VMI \"%s\":" % vmi1.vmiName)
            for vmi2 in descriptors:
                if vmi1.pathToVMI == vmi2.pathToVMI:
                    # An image is not compared with itself
                    similarities[vmi1.vmiName][vmi2.vmiName] = None
                    continue

                # Check if Main Services exist
                SimilarityCalculator.checkMainServicesExistence(vmi1, vmi1.mainServices)
                SimilarityCalculator.checkMainServicesExistence(vmi2, vmi2.mainServices)

                sim = SimilarityCalculator.computeWeightedSimilarityBetweenVMIDescriptors(
                    vmi1, vmi2, onlyOnMainServices, verbose=False)
                similarities[vmi1.vmiName][vmi2.vmiName] = sim
                print("\t%0.2f similarity to VMI \"%s\"" % (sim, vmi2.vmiName))
        return similarities

    @staticmethod
    def computeSimilarityManyToManyOLD(vmisAndMS, onlyOnMainServices):
        """
        Older entry point of computeSimilarityManyToMany that takes file names only.

        Each image path is built as StaticInfo.relPathLocalVMIFolder + "/" + vmiFileName;
        everything else is delegated to computeSimilarityManyToMany.

        :param vmisAndMS: sequence of (vmiFileName, mainServices) tuples
        :param Boolean onlyOnMainServices: see computeWeightedSimilarityBetweenVMIDescriptors
        :return: same structure as computeSimilarityManyToMany
        """
        vmiData = [(StaticInfo.relPathLocalVMIFolder + "/" + vmiFileName, vmiFileName, mainServices)
                   for (vmiFileName, mainServices) in vmisAndMS]
        return SimilarityCalculator.computeSimilarityManyToMany(vmiData, onlyOnMainServices)



























# back to top