#!/usr/bin/env python

# This program attempts to cluster traces
import sys
import os.path
import pdb
import re
import numpy as np

from matplotlib import pyplot
from scipy.cluster.vq import vq, kmeans, whiten

# Operation categories. The parsing loop maps each category to one slot of
# the per-trace feature vector, in this order: object_ops -> 0,
# array_ops -> 1, num_ops -> 2, alloc_ops -> 3, guard -> 4, jump -> 5.
#
# NOTE(review): object_ops, array_ops and string_ops were unterminated list
# literals (a SyntaxError), which suggests entries were lost. Only the names
# that were still present are kept below -- restore the missing ones from
# the original operation list before trusting the clustering output.
object_ops = [
    'ZERO_PTR_FIELD_OP',  # only emitted by the rewrite, clears a pointer field
                          # at a given constant offset, no descr
]

array_ops = ['ARRAYLEN_GC_OP']

# NOTE(review): num_ops is consulted by the parsing loop but had no
# definition anywhere in the file. This empty placeholder only avoids the
# NameError -- TODO fill in the numeric operation names.
num_ops = []

alloc_ops = ['NEW_OP',              #-> GcStruct, gcptrs inside are zeroed (not the rest)
             'NEW_WITH_VTABLE_OP',  #-> GcStruct with vtable, gcptrs inside are zeroed
             'NEW_ARRAY_OP',        #-> GcArray, not zeroed. only for arrays of primitives
             'NEW_ARRAY_CLEAR_OP',  #-> GcArray, fully zeroed
             'NEWSTR_OP',           #-> STR, the hash field is zeroed
             'NEWUNICODE_OP']       #-> UNICODE, the hash field is zeroed

# NOTE(review): the string_ops literal was opened but its entries are
# missing -- TODO restore them.
string_ops = []

# First token of a guard line and of a jump line in the histogram file.
guard = "GUARD:"
jump = "JUMP_OP"

# Matches the header line that opens a trace and captures the two fields on
# either side of " from ".
begin_re = re.compile("BEGIN TRACE: (.*) from (.*)\n")

# Number of histogram lines seen per category across the whole file. Only
# slots 0-5 are written by the loop below; the array keeps its length of 8.
counts = np.zeros(8)
# Normalised 6-element feature vector of every trace, keyed by trace header.
prog_vecs = {}
# Number of BEGIN TRACE headers encountered.
traces = 0


def _store_trace(name, vec):
    """Record *vec* under *name* in prog_vecs, scaled so its entries sum to 1.

    A vector whose entries sum to zero is stored unchanged: dividing by
    zero would yield NaNs that poison the later whitening and k-means steps.
    """
    total = np.sum(vec)
    if total:
        prog_vecs[name] = vec / float(total)
    else:
        prog_vecs[name] = vec


print("READING FILES...")
with open("histograms.dat", "r") as f:
    prog_vec = None
    current_name = None
    for line in f:
        match_begin = begin_re.match(line)
        if match_begin:
            traces += 1
            # A new header closes the previous trace, if there was one.
            if prog_vec is not None:
                _store_trace(current_name, prog_vec)
            prog_vec = np.zeros(6)
            # NOTE(review): the original right-hand side was missing. Both
            # captured header fields are used as the key so that distinct
            # headers never overwrite each other -- confirm against intent.
            current_name = match_begin.groups()
            # Header lines carry no operation count.
            continue

        split = line.split()
        # Skip blank or malformed lines and anything before the first header.
        if prog_vec is None or len(split) < 2:
            continue

        op = split[0]
        if op in object_ops:
            index = 0
        elif op in array_ops:
            index = 1
        elif op in num_ops:
            index = 2
        elif op in alloc_ops:
            index = 3
        elif op == guard:
            index = 4
        elif op == jump:
            index = 5
        else:
            # Unrecognised operation: it has no slot in either array.
            continue

        counts[index] += 1
        # Accumulate rather than assign: several operation names share one
        # category slot, so each line adds to the category total.
        prog_vec[index] += int(split[1])

    # The final trace has no following header to flush it.
    if prog_vec is not None:
        _store_trace(current_name, prog_vec)

# One row per stored trace vector.
features = np.array(list(prog_vecs.values()))

# Per-column standard deviation, kept so centroids can be scaled back after
# clustering in whitened space.
std = np.std(features, 0)
whitened = whiten(features)

print("PERFORMING Kmeans")

# Exploratory plot of distortion against the number of clusters (disabled):
# initial = [kmeans(features,i) for i in range(1,40)]
# pyplot.plot([var for (cent,var) in initial])
centroids = kmeans(whitened, 3, 100)[0]

# for x in np.nditer(centroids):
#     print x

print("Centroids:")
# Multiply by the column deviations to express centroids in feature units.
unwhitened = centroids * std
for row in unwhitened:
    print(row)

# Nearest centroid index for every trace, plus its distance to that centroid.
assignment, cdist = vq(whitened, centroids)

# Number of traces assigned to each cluster index. This rebinds `counts`,
# which previously held the per-category line tally.
counts = {}

for cluster in assignment:
    # Start at zero for a cluster seen for the first time, then add one for
    # every trace. The previous code incremented only inside the
    # "not yet seen" branch, so every cluster was reported as 2.
    counts[cluster] = counts.get(cluster, 0) + 1

print(counts)
#print assignment
