Content - cec1414372952fee7b0d0572510a671f9dbd6954 - a14cf0d/cluster_traces.py

visit type:
https://github.com/magnusmorton/trace-analysis

06 April 2024, 01:14:20 UTC
Tip revision: 4645af99638edea16d00e811c922b0fb9d6b86d9 authored by Magnus Morton on 11 January 2016, 20:33:10 UTC
subplots and recording output
Tip revision: 4645af9
cluster_traces.py
#!/usr/bin/env python

# This program attempts to cluster traces
import sys
import os.path
import pdb
import re
import numpy as np

from matplotlib import pyplot
from scipy.cluster.vq import vq, kmeans, whiten


# Operation-name tables used to bucket histogram lines.  Each per-trace
# feature vector has six slots, filled by the classification code as:
#   0 -> object_ops    1 -> array_ops    2 -> num_ops
#   3 -> alloc_ops     4 -> guard        5 -> jump

# Slot 0.
object_ops = [
    "GETFIELD_GC_PURE_OP",
    "GETFIELD_RAW_PURE_OP",
    "GETINTERIORFIELD_GC_OP",
    "RAW_LOAD_OP",
    "GETFIELD_GC_OP",
    "GETFIELD_RAW_OP",
    "RAW_STORE_OP",
    "SETFIELD_GC_OP",
    "SETINTERIORFIELD_GC_OP",
    "SETINTERIORFIELD_RAW_OP",
    # only emitted by the rewrite, clears a pointer field
    # at a given constant offset, no descr
    "ZERO_PTR_FIELD_OP",
]

# Slot 1.
array_ops = [
    "ARRAYLEN_GC_OP",
    "GETARRAYITEM_GC_OP",
    "GETARRAYITEM_RAW_OP",
    "GETARRAYITEM_GC_PURE_OP",
    "GETARRAYITEM_RAW_PURE_OP",
    "SETARRAYITEM_GC_OP",
    "SETARRAYITEM_RAW_OP",
    "ZERO_ARRAY_OP",
]

# Slot 2.
num_ops = [
    "INCREMENT_DEBUG_COUNTER_OP",
    "INT_LT_OP",
    "INT_LE_OP",
    "INT_EQ_OP",
    "INT_NE_OP",
    "INT_GT_OP",
    "INT_GE_OP",
    "UINT_LT_OP",
    "UINT_LE_OP",
    "UINT_GT_OP",
    "UINT_GE_OP",
    "INT_ADD_OP",
    "INT_SUB_OP",
    "INT_MUL_OP",
    "INT_FLOORDIV_OP",
    "UINT_FLOORDIV_OP",
    "INT_MOD_OP",
    "INT_AND_OP",
    "INT_OR_OP",
    "INT_XOR_OP",
    "INT_RSHIFT_OP",
    "INT_LSHIFT_OP",
    "UINT_RSHIFT_OP",
    "INT_SIGNEXT_OP",
    "INT_IS_ZERO_OP",
    "INT_IS_TRUE_OP",
    "INT_NEG_OP",
    "INT_INVERT_OP",
    "INT_FORCE_GE_ZERO_OP",
    "INT_ADD_OVF_OP",
    "INT_SUB_OVF_OP",
    "INT_MUL_OVF_OP",
    "FLOAT_ADD_OP",
    "FLOAT_SUB_OP",
    "FLOAT_MUL_OP",
    "FLOAT_TRUEDIV_OP",
    "FLOAT_NEG_OP",
    "FLOAT_ABS_OP",
]

# Slot 3.
alloc_ops = [
    "NEW_OP",              # -> GcStruct, gcptrs inside are zeroed (not the rest)
    "NEW_WITH_VTABLE_OP",  # -> GcStruct with vtable, gcptrs inside are zeroed
    "NEW_ARRAY_OP",        # -> GcArray, not zeroed. only for arrays of primitives
    "NEW_ARRAY_CLEAR_OP",  # -> GcArray, fully zeroed
    "NEWSTR_OP",           # -> STR, the hash field is zeroed
    "NEWUNICODE_OP",       # -> UNICODE, the hash field is zeroed
]

# NOTE(review): this table is defined but the visible classification code
# never consults it, so string ops are currently not counted in any slot.
string_ops = [
    "UNICODELEN_OP",
    "UNICODEGETITEM_OP",
    "STRLEN_OP",
    "COPYSTRCONTENT_OP",
    "COPYUNICODECONTENT_OP",
    "STRGETITEM_OP",
]

# First-field markers compared for exact equality: slot 4 and slot 5.
guard = "GUARD:"
jump = "JUMP_OP"

# Header line that opens a trace: "BEGIN TRACE: <name> from <origin>".
# Group 1 (the name) is used as the key into prog_vecs.
begin_re = re.compile("BEGIN TRACE: (.*) from (.*)\n")

# Trace name -> normalised 6-slot feature vector (see _category_index).
# NOTE(review): two traces with the same name overwrite each other here.
prog_vecs = {}
# Number of "BEGIN TRACE" headers seen.
traces = 0


def _category_index(op_name):
    """Return the feature-vector slot for ``op_name``.

    Slots are 0 (object ops), 1 (array ops), 2 (numeric ops), 3 (allocation
    ops), 4 (the guard marker) and 5 (the jump op).  ``None`` is returned for
    any name that is not counted.
    """
    if op_name in object_ops:
        return 0
    if op_name in array_ops:
        return 1
    if op_name in num_ops:
        return 2
    if op_name in alloc_ops:
        return 3
    if op_name == guard:
        return 4
    if op_name == jump:
        return 5
    return None


def _normalise(vec):
    """Return ``vec`` scaled so that its entries sum to 1.

    A vector whose entries sum to zero is returned unchanged, because dividing
    by zero would either raise or fill the vector with NaNs.
    """
    total = np.sum(vec)
    if total == 0:
        return vec
    return vec / float(total)


print("READING FILES...")
with open("histograms.dat", "r") as f:
    prog_vec = None
    current_name = None
    for line in f:
        match_begin = begin_re.match(line)
        if match_begin:
            traces += 1
            # A new header closes the previous trace: store it first.
            if prog_vec is not None:
                prog_vecs[current_name] = _normalise(prog_vec)
            prog_vec = np.zeros(6)
            current_name = match_begin.group(1)
            continue

        split = line.split()
        # Skip blank or one-field lines, and any line that appears before the
        # first trace header (there is no vector to add it to yet).
        if len(split) < 2 or prog_vec is None:
            continue

        index = _category_index(split[0])
        if index is None:
            continue
        # Accumulate: several different ops share one slot, so assigning
        # would keep only the count of the last op seen in that category.
        prog_vec[index] += int(split[1])

    # The last trace has no following header to trigger its storage.
    if prog_vec is not None:
        prog_vecs[current_name] = _normalise(prog_vec)


# Cluster the per-trace feature vectors with k-means and report the result.

# Number of centroids requested from k-means.
NUM_CLUSTERS = 3
# Number of k-means runs; scipy keeps the codebook with the lowest distortion.
KMEANS_RUNS = 100

if not prog_vecs:
    # Fail with a clear message instead of an obscure error inside scipy.
    raise SystemExit("no traces were read; nothing to cluster")

# One row per trace.  list() keeps this working when .values() returns a
# view rather than a list.
features = np.array(list(prog_vecs.values()))

whitened = whiten(features)

# Per-column scale used to map centroids back to the original units.
# whiten() leaves zero-variance columns unscaled, so use 1 for those columns
# here as well; multiplying by a std of 0 would wipe out their value.
std = np.std(features, 0)
std[std == 0] = 1.0

print("PERFORMING Kmeans")

# Exploratory distortion-vs-k plot (disabled):
# initial = [kmeans(features,i) for i in range(1,40)]
# pyplot.plot([var for (cent,var) in initial])
# pyplot.show()
centroids, _ = kmeans(whitened, NUM_CLUSTERS, KMEANS_RUNS)

print("Centroids:")
unwhitened = centroids * std
for centroid in unwhitened:
    print(centroid)

# assignment[i] is the centroid index for row i of `whitened`;
# cdist[i] is the distance from that row to its centroid.
assignment, cdist = vq(whitened, centroids)

# Cluster index -> number of traces assigned to it.
counts = {}
for label in assignment:
    label = int(label)  # plain int keys print cleanly
    counts[label] = counts.get(label, 0) + 1

print("CLUSTER COUNTS")
print(counts)

# NOTE(review): the script ends by dropping into the debugger, presumably so
# the module-level results can be inspected interactively -- confirm.
pdb.set_trace()