https://github.com/ExpelliarmusSuperComp/Expelliarmus
Tip revision: 83a8b7d8fc3d3b7dd4ac7ef5df4c02bd648bc526 authored by ExpelliarmusSuperComp on 07 August 2023, 14:18:01 UTC
LICENSE
LICENSE
Tip revision: 83a8b7d
VMISimilarity.py
import sys
from collections import defaultdict
from StaticInfo import StaticInfo
from GuestFSHelper import GuestFSHelper
from VMIDescription import VMIDescriptor
class SimilarityCalculator:
    """Package-level similarity measures between VMI descriptors.

    All methods are static. A descriptor is expected to offer the members
    used below: ``getNodeData()``, ``getNodeDataFromMainServicesSubtrees()``,
    ``checkIfNodeExists()``, ``pathToVMI``, ``vmiName``, ``mainServices``,
    ``distribution``, ``distributionVersion`` and ``architecture``.

    Output is written with single-argument ``print(...)`` calls so that the
    code behaves the same as a Python 2 print statement and as the Python 3
    print function.
    """

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _safeRatio(numerator, denominator):
        """Return ``numerator / denominator`` as float, or 0.0 if the denominator is zero."""
        if not denominator:
            # Nothing to compare against: report "no similarity" instead of
            # crashing with a ZeroDivisionError.
            return 0.0
        return float(numerator) / float(denominator)

    @staticmethod
    def _packagesMatch(pkg1Data, pkg2Data):
        """Tell whether two same-named packages agree in version and architecture.

        The architecture also counts as matching if either side says "all".
        """
        if pkg1Data[StaticInfo.dictKeyVersion] != pkg2Data[StaticInfo.dictKeyVersion]:
            return False
        arch1 = pkg1Data[StaticInfo.dictKeyArchitecture]
        arch2 = pkg2Data[StaticInfo.dictKeyArchitecture]
        return arch1 == arch2 or arch1 == "all" or arch2 == "all"

    @staticmethod
    def _selectNodeNames(vmi1, vmi2, g1NodesDict, g2NodesDict, onlyOnMainServices):
        """Return the set of package names the comparison is based on.

        With ``onlyOnMainServices`` this is the union of both main-service
        subtrees, otherwise the union of all packages of both VMIs.
        """
        if onlyOnMainServices:
            names1 = set(vmi1.getNodeDataFromMainServicesSubtrees().keys())
            names2 = set(vmi2.getNodeDataFromMainServicesSubtrees().keys())
            return names1.union(names2)
        return set(g1NodesDict.keys()).union(set(g2NodesDict.keys()))

    @staticmethod
    def _createDescriptor(pathToVMI, vmiName, mainServices):
        """Open the image, build its VMIDescriptor and always release the handle."""
        (guest, root) = GuestFSHelper.getHandle(pathToVMI, rootRequired=True)
        try:
            return VMIDescriptor(pathToVMI, vmiName, mainServices, guest, root)
        finally:
            # Shut the handle down even if building the descriptor fails.
            GuestFSHelper.shutdownHandle(guest)

    @staticmethod
    def _printManyToManyHeader(numVMIs, onlyOnMainServices):
        """Print the banner line of a many-to-many comparison."""
        if onlyOnMainServices:
            print("=====Calculating similarities with respect to main services between each of %i VMIs" % numVMIs)
        else:
            print("=====Calculating similarities between each of %i VMIs" % numVMIs)

    @staticmethod
    def _computePairwiseSimilarities(descriptors, onlyOnMainServices):
        """Compare every descriptor with every other one.

        :return: ``similarities[name1][name2]`` holding the weighted
                 similarity, or None where both entries share the same path
        """
        similarities = defaultdict(dict)
        for vmi1 in descriptors:
            print("Similarities for VMI \"%s\":" % vmi1.vmiName)
            for vmi2 in descriptors:
                if vmi1.pathToVMI == vmi2.pathToVMI:
                    similarities[vmi1.vmiName][vmi2.vmiName] = None
                    continue
                # Check if Main Services exist
                SimilarityCalculator.checkMainServicesExistence(vmi1, vmi1.mainServices)
                SimilarityCalculator.checkMainServicesExistence(vmi2, vmi2.mainServices)
                sim = SimilarityCalculator.computeWeightedSimilarityBetweenVMIDescriptors(
                    vmi1, vmi2, onlyOnMainServices, verbose=False)
                similarities[vmi1.vmiName][vmi2.vmiName] = sim
                print("\t%0.2f similarity to VMI \"%s\"" % (sim, vmi2.vmiName))
        return similarities

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------
    @staticmethod
    def checkMainServicesExistence(vmiDescriptor1, mainServices):
        """Terminate the program if a main service is unknown to the descriptor.

        :param vmiDescriptor1: descriptor queried via ``checkIfNodeExists``
        :param mainServices: iterable of package names to look up
        :raises SystemExit: for the first name that does not exist
        """
        for pkgName in mainServices:
            if not vmiDescriptor1.checkIfNodeExists(pkgName):
                sys.exit("Error: Main Service \"" + pkgName + "\" does not exist in " + vmiDescriptor1.pathToVMI)

    @staticmethod
    def checkCompatibility(vmiDescriptor1, vmiDescriptor2):
        """Check that two VMIs share distribution, distribution version and architecture.

        The first differing property is reported on stdout.

        :param vmiDescriptor1: first descriptor
        :param vmiDescriptor2: second descriptor
        :return: True if all three properties are equal, otherwise False
        """
        properties = (
            ("distributions", "distribution"),
            ("distribution versions", "distributionVersion"),
            ("architectures", "architecture"),
        )
        for label, attribute in properties:
            value1 = getattr(vmiDescriptor1, attribute)
            value2 = getattr(vmiDescriptor2, attribute)
            if value1 != value2:
                # Report the values of the property that actually differs.
                print("Mapping: Check Compatibility failed: %s differ! (%s vs. %s)" % (
                    label, value1, value2))
                return False
        return True

    @staticmethod
    def computeSimilarityBetweenVMIDescriptorsSimple(vmi1, vmi2, onlyOnMainServices):
        """Compute the unweighted similarity of two VMIs and print a report.

        similarity = |matching packages| / |compared packages|

        A package matches if it occurs in both VMIs with the same version and
        a compatible architecture.

        :param VMIDescriptor vmi1:
        :param VMIDescriptor vmi2:
        :param Boolean onlyOnMainServices: restrict the compared packages to
            the union of both main-service subtrees
        :return: similarity as float; 0.0 if there is nothing to compare
        """
        g1NodesDict = vmi1.getNodeData()
        g2NodesDict = vmi2.getNodeData()
        numG1Nodes = len(g1NodesDict)
        numG2Nodes = len(g2NodesDict)

        allNames = SimilarityCalculator._selectNodeNames(
            vmi1, vmi2, g1NodesDict, g2NodesDict, onlyOnMainServices)
        numAllNodes = len(allNames)

        # Only names occurring in both graphs can match at all.
        commonNames = allNames.intersection(set(g1NodesDict.keys()))
        commonNames = commonNames.intersection(set(g2NodesDict.keys()))

        numMatches = 0
        for pkgName in commonNames:
            if SimilarityCalculator._packagesMatch(g1NodesDict[pkgName], g2NodesDict[pkgName]):
                numMatches = numMatches + 1

        similarity = SimilarityCalculator._safeRatio(numMatches, numAllNodes)

        if onlyOnMainServices:
            print("\nComparison of two VMIs (Only on main services!):\n"
                  "\tVMI 1: %i packages\n"
                  "\tVMI 2: %i packages\n"
                  "\t\t %i main service related packages match in name, version and architecture\n"
                  "\t\t %i compared packages in total (union of main services from both VMIs)\n"
                  "\t\t similarity = %i/%i = %.3f"
                  % (numG1Nodes, numG2Nodes, numMatches, numAllNodes, numMatches, numAllNodes, similarity))
        else:
            print("\nComparison of two VMIs:\n"
                  "\tVMI 1: %i packages\n"
                  "\tVMI 2: %i packages\n"
                  "\t\t %i packages match in name, version and architecture\n"
                  "\t\t similarity = %i/%i = %.3f"
                  % (numG1Nodes, numG2Nodes, numMatches, numMatches, numAllNodes, similarity))
        return similarity

    @staticmethod
    def computeWeightedSimilarityBetweenVMIDescriptors(vmi1, vmi2, onlyOnMainServices, verbose=True):
        """Compute the install-size weighted similarity of two VMIs.

        similarity = sum(weight of each matching package)
                     / sum(weight of each compared package)

        The weight of a package is its install size (the larger one if both
        VMIs contain it) divided by the largest install size among all
        compared packages.

        :param VMIDescriptor vmi1:
        :param VMIDescriptor vmi2:
        :param Boolean onlyOnMainServices: restrict the compared packages to
            the union of both main-service subtrees
        :param Boolean verbose: print a report of the comparison
        :return: similarity as float; 0.0 if the total weight is zero
        """
        g1NodesDict = vmi1.getNodeData()
        g2NodesDict = vmi2.getNodeData()
        numG1Nodes = len(g1NodesDict)
        numG2Nodes = len(g2NodesDict)

        nodesToCheck = SimilarityCalculator._selectNodeNames(
            vmi1, vmi2, g1NodesDict, g2NodesDict, onlyOnMainServices)
        numAllNodes = len(nodesToCheck)

        # Raw weight per package: its install size, taking the larger value
        # when the package is present in both VMIs. Names found in neither
        # node dict carry no weight.
        rawWeights = {}
        for pkg in nodesToCheck:
            sizes = [float(nodes[pkg][StaticInfo.dictKeyInstallSize])
                     for nodes in (g1NodesDict, g2NodesDict) if pkg in nodes]
            if sizes:
                rawWeights[pkg] = max(sizes)

        # Normalise by the maximum install size (never below zero).
        maxInstallSize = max([0.0] + list(rawWeights.values()))
        if maxInstallSize > 0:
            normWeights = {pkg: size / maxInstallSize for pkg, size in rawWeights.items()}
        else:
            # All sizes are zero: avoid dividing by zero, every weight is 0.
            normWeights = dict.fromkeys(rawWeights, 0.0)
        sumNormSizeAll = sum(normWeights.values(), 0.0)

        # Only names occurring in both graphs can match at all.
        commonNames = nodesToCheck.intersection(set(g1NodesDict.keys()))
        commonNames = commonNames.intersection(set(g2NodesDict.keys()))

        numMatches = 0
        sumNormSizeMatches = 0.0
        for pkgName in commonNames:
            if SimilarityCalculator._packagesMatch(g1NodesDict[pkgName], g2NodesDict[pkgName]):
                numMatches = numMatches + 1
                sumNormSizeMatches = sumNormSizeMatches + normWeights[pkgName]

        similarity = SimilarityCalculator._safeRatio(sumNormSizeMatches, sumNormSizeAll)

        if verbose:
            if onlyOnMainServices:
                print("\nWeighted Comparison of two VMIs (Only on main services!):\n"
                      "\tGraph 1: %i packages\n"
                      "\tGraph 2: %i packages\n"
                      "\t\t %i main service related packages match in name, version and architecture\n"
                      "\t\t %i compared packages in total (union of main services from both VMIs)\n"
                      "\t\t similarity = %.3f/%.3f = %.3f"
                      % (numG1Nodes, numG2Nodes, numMatches, numAllNodes,
                         sumNormSizeMatches, sumNormSizeAll, similarity))
            else:
                # Show the same numerator/denominator the similarity was
                # computed from (weighted sums, not package counts).
                print("\nWeighted Comparison of two VMIs:\n"
                      "\tVMI 1: %i packages\n"
                      "\tVMI 2: %i packages\n"
                      "\t\t %i packages match in name, version and architecture\n"
                      "\t\t similarity = %.3f/%.3f = %.3f"
                      % (numG1Nodes, numG2Nodes, numMatches,
                         sumNormSizeMatches, sumNormSizeAll, similarity))
        return similarity

    @staticmethod
    def computeSimilarityOneToOne(pathToVMI1, mainServices1, pathToVMI2, mainServices2, onlyOnMainServices):
        """Build descriptors for two image files and return their weighted similarity.

        :param pathToVMI1: path of the first image
        :param mainServices1: main service package names of the first image
        :param pathToVMI2: path of the second image
        :param mainServices2: main service package names of the second image
        :param Boolean onlyOnMainServices: compare main-service subtrees only
        :return: weighted similarity as float
        :raises SystemExit: if a main service does not exist in its image
        """
        # Create Descriptors/Graphs for each VMI
        print("\n=== Creating Descriptor for VMI \"%s\"" % (pathToVMI1))
        vmi1 = SimilarityCalculator._createDescriptor(pathToVMI1, "internal_vmi1", mainServices1)

        print("\n=== Creating Descriptor for VMI \"%s\"" % (pathToVMI2))
        vmi2 = SimilarityCalculator._createDescriptor(pathToVMI2, "internal_vmi2", mainServices2)

        # Check if Main Services exist
        SimilarityCalculator.checkMainServicesExistence(vmi1, mainServices1)
        SimilarityCalculator.checkMainServicesExistence(vmi2, mainServices2)

        # Compute Similarity
        return SimilarityCalculator.computeWeightedSimilarityBetweenVMIDescriptors(
            vmi1, vmi2, onlyOnMainServices)

    @staticmethod
    def computeSimilarityManyToMany(vmiData, onlyOnMainServices):
        """Compute the weighted similarity between each pair of the given VMIs.

        :param vmiData: sequence of ``(pathToVMI, vmiFileName, mainServices)``
        :param Boolean onlyOnMainServices: compare main-service subtrees only
        :return: ``similarities[name1][name2]``; None for entries with equal path
        :raises SystemExit: if a main service does not exist in its image
        """
        numVMIs = len(vmiData)
        SimilarityCalculator._printManyToManyHeader(numVMIs, onlyOnMainServices)

        descriptors = []
        for count, (pathToVMI, vmiFileName, mainServices) in enumerate(vmiData, 1):
            print("Creating Descriptor for vmi \"%s\" (%i/%i)..." % (vmiFileName, count, numVMIs))
            descriptors.append(
                SimilarityCalculator._createDescriptor(pathToVMI, vmiFileName, mainServices))

        return SimilarityCalculator._computePairwiseSimilarities(descriptors, onlyOnMainServices)

    @staticmethod
    def computeSimilarityManyToManyOLD(vmisAndMS, onlyOnMainServices):
        """Older variant of computeSimilarityManyToMany taking file names only.

        Image paths are built from ``StaticInfo.relPathLocalVMIFolder``.

        :param vmisAndMS: sequence of ``(vmiFileName, mainServices)``
        :param Boolean onlyOnMainServices: compare main-service subtrees only
        :return: ``similarities[name1][name2]``; None for entries with equal path
        :raises SystemExit: if a main service does not exist in its image
        """
        numVMIs = len(vmisAndMS)
        SimilarityCalculator._printManyToManyHeader(numVMIs, onlyOnMainServices)

        descriptors = []
        for count, (vmiFileName, mainServices) in enumerate(vmisAndMS, 1):
            print("Creating Descriptor for vmi \"%s\" (%i/%i)..." % (vmiFileName, count, numVMIs))
            pathToVMI = StaticInfo.relPathLocalVMIFolder + "/" + vmiFileName
            descriptors.append(
                SimilarityCalculator._createDescriptor(pathToVMI, vmiFileName, mainServices))

        return SimilarityCalculator._computePairwiseSimilarities(descriptors, onlyOnMainServices)