Revision bdbb926e7d3102c32a3fc54d4cfa769bfac822f4 authored by Alexander van der Grinten on 18 December 2020, 13:49:57 UTC, committed by GitHub on 18 December 2020, 13:49:57 UTC
Version 8.0: Readme + version update
Benchmark.py
"""
The NetworKit benchmark
This module implements a comprehensive benchmark of NetworKit's analytics algorithms
"""
import sys
import warnings
import math
import os
import numpy
from time import gmtime, strftime
import signal
import networkit
from util import *
from networkit.support import MissingDependencyError
import nk
try:
import pandas
except ImportError:
have_pandas = False
else:
have_pandas = True
try:
import matplotlib.pyplot as plt
except ImportError:
have_plt = False
else:
have_plt = True
try:
import seaborn
except ImportError:
have_seaborn = False
else:
have_seaborn = True
try:
import networkx
import nx
except ImportError:
have_nx = False
else:
have_nx = True
try:
import igraph
import ig
except ImportError:
have_ig = False
else:
have_ig = True
try:
import graph_tool
import gt
except ImportError:
have_gt = False
else:
have_gt = True
# helper function
def averageRuns(df, groupby=["graph"]):
    """
    Average the numeric columns (e.g. running time, edges per second) over multiple runs.

    :param df: pandas DataFrame with one row per run; must contain the
        grouping column(s) and a column "m"
    :param groupby: column name(s) identifying the runs that belong together
        (the default list is only read, never modified)
    :return: a new DataFrame with one row per group, sorted ascending by "m"
    """
    # numeric_only: columns holding arbitrary objects (such as "result")
    # cannot be averaged and are left out instead of raising an error
    averaged = df.groupby(groupby, as_index=False).mean(numeric_only=True)
    # DataFrame.sort was removed from pandas; sort_values is its replacement
    return averaged.sort_values("m", ascending=True)  # sort by graph size
def graphMeta(graphNames, graphDir, fileEnding=".gml.graph", graphFormat=networkit.Format.GML):
    """
    Collect node and edge counts of a set of graph files.

    Every graph is read from graphDir (file name: graph name followed by
    fileEnding) with networkit.readGraph and its size is recorded.

    :return: pandas DataFrame with the columns "name", "n" and "m"
    :raises MissingDependencyError: if pandas is not installed
    """
    if not have_pandas:
        raise MissingDependencyError("pandas")

    def describe(name):
        # load one graph and reduce it to its meta data row
        info("loading {name}".format(name=name))
        fileName = "{0}{1}".format(name, fileEnding)
        graph = networkit.readGraph(os.path.join(graphDir, fileName), graphFormat)
        nodes, edges = graph.size()
        return {"name" : name, "n" : nodes, "m" : edges}

    rows = [describe(name) for name in graphNames]
    info("done")
    return pandas.DataFrame(rows, columns=["name", "n", "m"])
def saveData(df, name):
    """
    Write df as a tab-separated file called name + ".csv" into bench.dataPath.

    NOTE(review): `bench` is a global that is not defined in the visible part
    of this module - presumably provided by the caller's session; confirm.
    """
    target = os.path.join(bench.dataPath, "{0}.csv".format(name))
    df.to_csv(target, sep="\t")
# timeout
class Timeout(Exception):
    """ Signals that a benchmark run exceeded its time limit (raised by timeoutHandler) """
def timeoutHandler(signum, frame):
    """
    Signal handler: report the timeout via error() and abort the interrupted
    code by raising Timeout. Both handler arguments are ignored.
    """
    error("timeout")
    raise Timeout
class bFail:
    """ Benchmark stand-in whose run() always raises """

    name = "Fail"

    def run(self, G):
        """ Raise unconditionally; the graph G is never touched """
        raise Exception("FAIL!")
# Plots
## plot settings
# the settings below are applied at import time, so importing this module
# fails with MissingDependencyError when seaborn is not installed
if not have_seaborn:
    raise MissingDependencyError("seaborn")
seaborn.set_style("whitegrid")

### Colors
# looked up by name in seaborn's xkcd colour table
lightred, darkred, green, orange = (
    seaborn.xkcd_rgb[colorName]
    for colorName in ("red", "crimson", "teal", "bright orange")
)
# plot functions
def timePlot(data, size=(6,3)):
    """
    Draw a horizontal bar chart of data["time"] with one bar per row,
    labelled by data["graph"], on a symlog x axis.

    :param data: table with the columns "graph" and "time"
    :param size: figure size handed to plt.figure
    :raises MissingDependencyError: if matplotlib is not installed
    """
    if not have_plt:
        raise MissingDependencyError("matplotlib")
    barCenters = .5 + numpy.arange(len(data))  # the bar centers on the y axis
    plt.figure(figsize=size)
    plt.xscale("symlog")
    barStyle = dict(align='center', height=0.25, color=lightred)
    plt.barh(barCenters, data["time"], **barStyle)
    plt.yticks(barCenters, list(data["graph"]))
    minorTicks = plt.LogLocator(subs=list(range(11)))
    plt.gca().xaxis.set_minor_locator(minorTicks)
    plt.xlabel("time [s]")
    plt.grid(True)
def epsPlot(data, size=(6,3)):
    """
    Draw a horizontal bar chart of data["edges/s"] with one bar per row,
    labelled by data["graph"], on a logarithmic x axis.

    :param data: table with the columns "graph" and "edges/s"
    :param size: figure size handed to plt.figure
    :raises MissingDependencyError: if matplotlib is not installed
    """
    if not have_plt:
        raise MissingDependencyError("matplotlib")
    rowCount = len(data)
    yPositions = numpy.arange(rowCount) + .5  # the bar centers on the y axis
    graphLabels = list(data["graph"])
    plt.figure(figsize=size)
    plt.xscale("log")
    plt.barh(
        yPositions,
        data["edges/s"],
        align='center',
        height=0.25,  # bar thickness
        color=green,
    )
    plt.yticks(yPositions, graphLabels)
    xAxis = plt.gca().xaxis
    xAxis.set_minor_locator(plt.LogLocator(subs=[0,1,2,3,4,5,6,7,8,9,10]))
    plt.xlabel("edges / s")
    plt.grid(True)
def graphPlot(data, size=None):
    """
    Draw a horizontal bar chart of the edge counts data["m"] with one bar per
    row, labelled by data["graph"], on a logarithmic axis.

    :param data: table with the columns "graph" and "m"
    :param size: figure size handed to plt.figure; None selects (5, 3.3)
    :raises MissingDependencyError: if matplotlib is not installed
    """
    if not have_plt:
        raise MissingDependencyError("matplotlib")
    # qualified call: a bare arange is not defined by this module's imports
    pos = numpy.arange(len(data))  # the bar centers on the y axis
    # honour the size argument instead of always using the built-in default
    plt.figure(figsize=(5, 3.3) if size is None else size)
    sizes = data["m"]
    labels = list(data["graph"])
    plt.barh(pos, sizes, log=True, align='center', height=0.4, color="lightblue", label="m")  # notice the 'height' argument
    plt.yticks(pos, labels)
    plt.xlabel("number of edges")
    plt.grid(True)
def barPlot(data, labels, x_label="", y_label="", size=None, color="b", transparency=1.0, scale="linear"):
    """
    Draw a generic horizontal bar chart.

    :param data: bar lengths, one per bar
    :param labels: y tick labels, one per bar
    :param x_label: x axis label
    :param y_label: y axis label
    :param size: figure size handed to plt.figure
    :param color: bar colour
    :param transparency: alpha value of the bars
    :param scale: x axis scale name handed to plt.xscale
    :raises MissingDependencyError: if matplotlib is not installed
    """
    if not have_plt:
        raise MissingDependencyError("matplotlib")
    centers = numpy.arange(len(data)) + .5  # the bar centers on the y axis
    plt.figure(figsize=size)
    plt.xscale(scale)
    barStyle = dict(align='center', height=0.25, color=color, alpha=transparency)
    plt.barh(centers, data, **barStyle)
    plt.yticks(centers, labels)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.grid(True)
class Bench:
    """
    A Bench object represents a benchmarking session. It is responsible
    for running the benchmarks, collecting and storing the resulting data.

    Every run adds one dictionary (row) to self.data; finalize() converts the
    collected rows into the pandas DataFrame self.dataFrame.
    """

    def __init__(self, graphDir, defaultGraphs, outDir, save=True, nRuns=5, cacheGraphs=False, timeout=None, fileEnding="gml.graph", graphFormat=networkit.Format.GML, indexEdges=False):
        """
        :param graphDir: directory the graph files are loaded from
        :param defaultGraphs: graph names used when a benchmark call names none
        :param outDir: directory of the log file and of the "data" and "plots" subdirectories
        :param save: store data frames and plots on disk if true
        :param nRuns: default number of runs for each algorithm
        :param cacheGraphs: keep loaded graphs in memory for later benchmarks
        :param timeout: default time limit per run in minutes, None for no limit
        :param fileEnding: file name suffix, joined to the graph name with a "."
        :param graphFormat: format argument handed to the loadGraph method of a benchmark
        :param indexEdges: call indexEdges() on every graph after loading
        """
        self.graphFormat = graphFormat
        self.indexEdges = indexEdges
        self.fileEnding = fileEnding
        self.defaultGraphs = defaultGraphs
        self.nRuns = nRuns  # default number of runs for each algo
        self.graphDir = graphDir
        # store result data of benchmarks
        self.data = []  # list of dictionaries (rows) with keys representing table columns
        self.loadTimes = []
        self.dataFrame = None  # built from self.data by finalize()
        self.outDir = outDir
        self.save = save  # store data frames on disk if true
        self.outDataDir = os.path.join(self.outDir, "data")
        self.plotDir = os.path.join(self.outDir, "plots")
        # the log file is written even when nothing else is saved, so the
        # output directory itself is always needed
        os.makedirs(self.outDir, exist_ok=True)
        if self.save:
            os.makedirs(self.outDataDir, exist_ok=True)
            os.makedirs(self.plotDir, exist_ok=True)
        # log file
        self.logPath = os.path.join(self.outDir, "log.txt")
        # graph cache
        self.cacheGraphs = cacheGraphs
        self.graphCache = {}
        # timeout
        self.timeout = timeout

    @staticmethod
    def _rate(count, elapsed):
        """ Return count per second, or None if no time was measured """
        return count / elapsed if elapsed else None

    def _runTimed(self, timeout, func, *args):
        """
        Call func(*args) inside a Timer and return (result, elapsed seconds).

        If timeout (in minutes) is set, a SIGALRM is scheduled so that
        timeoutHandler aborts the call by raising Timeout. The alarm is
        cancelled again whether or not the call succeeds.
        """
        if timeout:
            signal.signal(signal.SIGALRM, timeoutHandler)
            signal.alarm(int(timeout * 60))  # timeout in seconds
        try:
            with Timer() as t:
                result = func(*args)
        finally:
            if timeout:
                signal.alarm(0)  # an alarm of zero cancels the pending one
        return result, t.elapsed

    def _algoNames(self):
        """ Names of all benchmarked algorithms, in order of first appearance """
        return list(dict.fromkeys(row["algorithm"] for row in self.data if "algorithm" in row))

    def _runsFrame(self, algoName=None):
        """
        Return the plottable columns ("graph", "m", "time", "edges/s") of the
        collected rows, restricted to one algorithm if algoName is given.
        """
        if not have_pandas:
            raise MissingDependencyError("pandas")
        # reindex tolerates rows that lack some of the columns
        frame = pandas.DataFrame(self.data).reindex(columns=["algorithm", "graph", "m", "time", "edges/s"])
        if algoName is not None:
            frame = frame[frame["algorithm"] == algoName]
        return frame[["graph", "m", "time", "edges/s"]]

    def getGraph(self, graphName, algo):
        """
        Get the graph from disk or from in-memory cache.

        The cache key combines algo.framework and the graph name. Loading is
        done by algo.loadGraph and is recorded in self.loadTimes.
        """
        key = algo.framework + graphName
        if key in self.graphCache:
            self.info("getting {0} from cache".format(graphName))
            return self.graphCache[key]
        graphPath = os.path.join(self.graphDir, "{0}.{1}".format(graphName, self.fileEnding))
        self.info("loading {0} from {1}".format(graphName, graphPath))
        with Timer() as t:
            G = algo.loadGraph(graphPath, self.graphFormat)
            if self.indexEdges:
                G.indexEdges()
        elapsed = t.elapsed
        self.info("\t took {0} s".format(elapsed))
        eps = self._rate(self.getSize(G)[1], elapsed)
        self.loadTimes.append({"framework" : algo.framework, "graph" : graphName, "time": elapsed, "edges/s": eps})
        if self.cacheGraphs:
            self.graphCache[key] = G
        return G

    def getSize(self, G):
        """
        Return (number of nodes, number of edges) of a NetworKit, igraph or
        graph-tool graph.

        :raises NotImplementedError: if G is none of the supported types
        """
        if isinstance(G, networkit.Graph):
            return G.size()
        # the optional frameworks are only consulted when they are installed,
        # so a NetworKit graph can be measured without them
        if have_ig and isinstance(G, igraph.Graph):
            return (G.vcount(), G.ecount())
        if have_gt and isinstance(G, graph_tool.Graph):
            return (G.num_vertices(), G.num_edges())
        raise NotImplementedError("cannot get graph size - unknown object type")

    def clearCache(self):
        """ Delete all stored graphs to free memory """
        self.graphCache = {}

    def algoBenchmark(self, algo, graphs=None, nRuns=None, timeout=None):
        """
        Run a kernel represented by an algorithm benchmark object.

        One row per run is appended to self.data. A run that times out is
        recorded with time, edges/s and result set to None; any other
        exception ends the runs on the current graph.

        :param algo: benchmark object with name, framework, loadGraph and run
        :param graphs: graph names, defaults to self.defaultGraphs
        :param nRuns: runs per graph, defaults to self.nRuns
        :param timeout: time limit per run in minutes, defaults to self.timeout
        """
        # set the defaults
        if nRuns is None:
            nRuns = self.nRuns  # lets argument override the default nRuns
        if graphs is None:
            graphs = self.defaultGraphs
        if timeout is None:
            timeout = self.timeout
        self.info("benchmarking {0}".format(algo.name))
        for graphName in graphs:
            try:
                G = self.getGraph(graphName, algo)
                (n, m) = self.getSize(G)
            except Exception as ex:
                self.error("loading graph {0} failed with exception: {1}".format(graphName, ex))
                continue
            self.info("running {0} ({1}) {2}x on {3}".format(algo.name, algo.framework, nRuns, graphName))
            try:
                for _ in range(nRuns):
                    # benchmark data row; measurements stay None unless the run completes
                    row = {
                        "algorithm": algo.name,
                        "framework": algo.framework,
                        "graph": graphName,
                        "m": m,
                        "time": None,
                        "edges/s": None,
                        "result": None,
                    }
                    try:
                        result, elapsed = self._runTimed(timeout, algo.run, G)
                        self.info("\t took {0} s".format(elapsed))
                        # store data
                        row["time"] = elapsed
                        row["edges/s"] = self._rate(m, elapsed)  # calculate edges per second
                        row["result"] = result
                    except Timeout:
                        self.error("{0} timed out after {1} minutes".format(algo.name, timeout))
                    finally:
                        self.data.append(row)
            except Exception as ex:
                self.error("algorithm {0} failed with exception: {1}".format(algo.name, ex))
        self.log("Done")

    def log(self, message):
        """ Write a message to the logfile"""
        with open(self.logPath, "a") as logfile:
            logfile.write("{0}: {1}\n".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()), message))

    def info(self, message):
        """ Print a message and write it to the logfile """
        print(message)
        sys.stdout.flush()
        self.log(message)

    def error(self, message):
        """ Print an error message and write it to the logfile """
        print(message)
        sys.stdout.flush()
        self.log(message)

    def debug(self, message):
        """ Debug output is disabled """
        pass

    def timePlot(self, algoName):
        """ Plot the average running time per graph of one algorithm """
        if not have_plt:
            raise MissingDependencyError("matplotlib")
        # inside a method the bare name refers to the module-level function
        timePlot(averageRuns(self._runsFrame(algoName)))
        if self.save:
            plt.savefig(os.path.join(self.plotDir, "{0}-time.pdf".format(algoName)), bbox_inches="tight")

    def epsPlot(self, algoName):
        """ Plot the average edges per second per graph of one algorithm """
        if not have_plt:
            raise MissingDependencyError("matplotlib")
        epsPlot(averageRuns(self._runsFrame(algoName)))
        if self.save:
            plt.savefig(os.path.join(self.plotDir, "{0}-eps.pdf".format(algoName)), bbox_inches="tight")

    def graphPlot(self):
        """ Plot the number of edges of every benchmarked graph """
        if not have_plt:
            raise MissingDependencyError("matplotlib")
        graphPlot(averageRuns(self._runsFrame()))
        if self.save:
            plt.savefig(os.path.join(self.plotDir, "graphs.pdf"), bbox_inches="tight")

    def plot(self, algoName):
        """ Create the time plot and the edges per second plot of one algorithm """
        print(algoName)
        self.timePlot(algoName)
        self.epsPlot(algoName)

    def finalize(self):
        """ Convert the collected rows to self.dataFrame and save it as data.csv if saving is enabled """
        if not have_pandas:
            raise MissingDependencyError("pandas")
        self.dataFrame = pandas.DataFrame(self.data)
        if self.save:
            self.dataFrame.to_csv(os.path.join(self.outDataDir, "data.csv"))

    def plotSummary2(self, figsize=None, groupby="framework", palette="Greens_d"):
        """
        Plot a summary of algorithm performances: edges/s per algorithm as
        box plots, split by the groupby column.

        Uses self.dataFrame; finalize() is called first if it has not been built yet.
        """
        if not have_plt:
            raise MissingDependencyError("matplotlib")
        if not have_seaborn:
            raise MissingDependencyError("seaborn")
        if self.dataFrame is None:
            self.finalize()
        if figsize:
            plt.figure(figsize=figsize)
        plt.gca().xaxis.get_major_formatter().set_powerlimits((3, 3))
        plt.xscale("log")
        plt.xlabel("edges/s")
        ax = seaborn.boxplot(y="algorithm", x="edges/s", hue=groupby, data=self.dataFrame, linewidth=1, width=.5, palette=palette)
        if self.save:
            plt.savefig(os.path.join(self.plotDir, "epsSummary.pdf"), bbox_inches="tight")

    def plotSummary(self, algoNames=None, figsize=None):
        """
        Plot a summary of algorithm performances: one horizontal box per
        algorithm showing its edges/s over all runs.

        The underlying table (one column per algorithm) is kept in
        self.epsSummary.

        :param algoNames: algorithms to include, defaults to all benchmarked ones
        :param figsize: size of a new figure; None draws into the current one
        """
        if not have_plt:
            raise MissingDependencyError("matplotlib")
        if not have_pandas:
            raise MissingDependencyError("pandas")
        if not have_seaborn:
            raise MissingDependencyError("seaborn")
        known = self._algoNames()
        if algoNames is None:
            algoNames = known
        frame = pandas.DataFrame(self.data).reindex(columns=["algorithm", "edges/s"])
        columns = {}
        for algoName in known:
            if algoName in algoNames:
                rates = frame.loc[frame["algorithm"] == algoName, "edges/s"]
                # renumber from zero so that the columns align row by row
                columns[algoName] = rates.reset_index(drop=True)
        # data frame
        self.epsSummary = pandas.DataFrame(columns)
        self.epsSummary = self.epsSummary.reindex(columns=sorted(self.epsSummary.columns))
        if self.save:
            self.epsSummary.to_csv(os.path.join(self.outDataDir, "epsSummary.csv"))
        # plot
        if figsize:
            plt.figure(figsize=figsize)
        plt.gca().xaxis.get_major_formatter().set_powerlimits((3, 3))
        plt.xscale("log")
        plt.xlabel("edges/s")
        seaborn.boxplot(data=self.epsSummary, order=list(self.epsSummary.columns), linewidth=1.5, width=.25, color=green, orient="h")
        if self.save:
            plt.savefig(os.path.join(self.plotDir, "epsSummary.pdf"), bbox_inches="tight")

    def plotAll(self):
        """ Create the plots of every benchmarked algorithm """
        for algoName in self._algoNames():
            self.plot(algoName)

    def getLoadTimes(self):
        """ Get input times for graphs"""
        if not have_pandas:
            raise MissingDependencyError("pandas")
        return pandas.DataFrame(self.loadTimes)

    def generatorBenchmark(self, generator, argtuples, nRuns=None, timeout=None):
        """
        Benchmark a graph generator.

        For every parameter tuple a generator object is created with
        generator(*param) and its generate() method is timed nRuns times. One
        row per run is appended to self.data; the rows of this call are also
        written to a CSV file named after the generator if saving is enabled.

        :param generator: callable creating the generator object
        :param argtuples: iterable of argument tuples for generator
        :param nRuns: runs per parameter tuple, defaults to self.nRuns
        :param timeout: time limit per run in minutes, defaults to self.timeout
        """
        if not have_pandas:
            raise MissingDependencyError("pandas")
        # set the defaults
        if nRuns is None:
            nRuns = self.nRuns  # lets argument override the default nRuns
        if timeout is None:
            timeout = self.timeout
        genName = str(generator)
        self.info("benchmarking {0}".format(genName))
        table = []  # list of dictionaries, to be converted to a DataFrame
        for param in argtuples:
            try:
                gen = generator(*param)
            except Exception as ex:
                self.error("initializing generator {0} failed with exception: {1}".format(genName, ex))
                continue
            self.info("running {0} {1} times".format(genName, nRuns))
            try:
                for _ in range(nRuns):
                    # a fresh row per run, otherwise all table entries would
                    # refer to the same dictionary
                    row = {"graph": str(param), "algorithm": genName, "m": None, "time": None, "result": None}
                    try:
                        result, elapsed = self._runTimed(timeout, gen.generate)
                        self.debug("took {0} s".format(elapsed))
                        # store data; "m" is the edge count, as in algoBenchmark
                        row["m"] = result.numberOfEdges()
                        row["time"] = elapsed
                        row["result"] = result
                    except Timeout:
                        self.error("{0} timed out after {1} minutes".format(genName, timeout))
                    finally:
                        table.append(row)
            except Exception as ex:
                self.error("generator {0} failed with exception: {1}".format(genName, ex))
        df = pandas.DataFrame(table)
        # self.data is a list of rows, so the new rows are appended to it
        self.data.extend(table)
        # store data frame on disk
        if self.save:
            df.to_csv(os.path.join(self.outDataDir, "{0}.csv".format(genName)))
# - generators
#   - Erdős-Rényi (generators.ErdosRenyiGenerator)
#   - Barabási-Albert (generators.BarabasiAlbertGenerator)
#   - Chung-Lu (generators.ChungLuGenerator)
#   - R-MAT (generators.RmatGenerator)
![swh spinner](/static/img/swh-spinner.gif)
Computing file changes ...