Content - 18d77643303cb796e0fdcc9e309a0a0c7197e590 - e8c8578/Utilities.py

visit type:
Tip revision: 8e64d9628355fde264b6481f9955478fed995fe5 authored by Avinash Bukkittu on 04 October 2017, 22:20:26 UTC
Merge branch 'master' of https://github.com/ab4377/dream-project
Tip revision: 8e64d96
Utilities.py
import Constants
import StepStrideFrequencyModuleConstants
import numpy as np
import pandas as pd
import os
from scipy.fftpack import fft
from collections import defaultdict
from scipy.interpolate import InterpolatedUnivariateSpline
import pickle
import math
import matplotlib.pyplot as plt
import peakutils
import sys

'''Finds peaks in y between start_index to end_index'''
def get_peaks_between(y, start_index, end_index, thres, min_dist):
    assert start_index < end_index
    indices = peakutils.indexes(y[start_index:end_index],thres=thres,min_dist=min_dist)
    indices = [start_index + index for index in indices]
    return indices

def fourier_transform(x):
    N = len(x)
    x = np.array(x)
    #x = x - np.mean(x)
    y = fft(x)
    a = np.abs(y[0:N//2])
    f = np.linspace(0,Constants.sampling_frequency/2,N//2)
    return (f,a)

def squared_signal(x):
    return x**2

def remove_noise(x,l):
    x_new = np.array([i-l if i > l else 0 for i in x])
    return x_new

def interpolate_data(f,a):
    x = np.linspace(0,Constants.sampling_frequency/2,StepStrideFrequencyModuleConstants.sample_points)
    ius = InterpolatedUnivariateSpline(f,a)
    return (x,ius(x))

def default_dict():
    return np.zeros(shape=(StepStrideFrequencyModuleConstants.sample_points,1),dtype=float)

def get_meta_data():
    return pd.read_csv(Constants.meta_data_location)

'''This method reads the meta-data file and retrieves maximum possible equal sized datapoints for
class label. If the data is pickled in X.pkl and Y.pkl, it will directly load from that file'''
def load_data():
    print "Loading the data..."
    X = None
    class_label = None
    if os.path.isfile('X.pkl'):
        with open('X.pkl','r') as f:
            X = pickle.load(f)
    if os.path.isfile('Y.pkl'):
        with open('Y.pkl','r') as f:
            class_label = pickle.load(f)

    if X is not None and class_label is not None:
        return (X,class_label)
    meta_data = pd.read_csv(Constants.meta_data_location)
    medTimepointToJsonFiles = defaultdict(list)
    labels = meta_data["medTimepoint"]
    file_location = meta_data["recordId"]
    for label, file_location in zip(labels, file_location):
        if type(label) is float:
            continue
        else:
            medTimepointToJsonFiles[label].append(file_location)

    (size, label) = min([(len(medTimepointToJsonFiles[key]), key) for key in medTimepointToJsonFiles.keys()])
    X = []
    class_label = []
    for key in medTimepointToJsonFiles.keys():
        for file_location in medTimepointToJsonFiles[key][:size]:
            #if file_location.endswith(".tmp"):
            if os.path.isfile(Constants.walking_outbound_path + "/" + file_location + "/deviceMotion_walking_outbound.json.items.csv"):
              x= pd.read_csv(Constants.walking_outbound_path + "/" + file_location + "/deviceMotion_walking_outbound.json.items.csv")
              X.append(x)
              class_label.append(key)
            '''else:
                print file_location'''

    '''meta_data = pd.read_csv(Constants.meta_data_location)
    class_label_map = {}
    for index,row in meta_data.iterrows():
        class_label_map[row["outbound_walk_json_file"]] = row["medTimepoint"]
    file_names = os.listdir(Constants.walking_outbound_path)
    for file_name in file_names:
        if file_name.endswith(".tmp"):
            x = pd.read_json(Constants.walking_outbound_path + "/" + file_name)
            X.append(x)
            class_label.append(class_label_map[file_name])'''
    with open('X.pkl','w') as f:
        pickle.dump(X,f)
    with open('Y.pkl','w') as f:
        pickle.dump(class_label,f)
    return X,class_label

'''This method reads the meta-data file and retrieves ALL the datapoints i.e . 
If the data is pickled in X_full.pkl, it will directly load from that file'''
def load_full_data():
    print "Loading the data..."
    X = None
    recordIds = None
    if os.path.isfile('X_full.pkl'):
        with open('X_full.pkl', 'r') as f:
            X = pickle.load(f)

    if os.path.isfile('recordIds_full.pkl'):
        with open('recordIds_full.pkl','r') as f:
            recordIds = pickle.load(f)
    if X is not None and recordIds is not None:
        return X, recordIds

    meta_data = pd.read_csv(Constants.meta_data_location)
    if X is None:
        X = []
        for file_location in meta_data["recordId"]:
            if os.path.isfile(Constants.walking_outbound_path + "/" + file_location + "/deviceMotion_walking_outbound.json.items.csv"):
                x = pd.read_csv(Constants.walking_outbound_path + "/" + file_location + "/deviceMotion_walking_outbound.json.items.csv")
                X.append(x)
        with open('X_full.pkl', 'w') as f:
            pickle.dump(X, f)

    if recordIds is None:
        recordIds = []
        for file_location in meta_data["recordId"]:
            if os.path.isfile(Constants.walking_outbound_path + "/" + file_location + "/deviceMotion_walking_outbound.json.items.csv"):
                recordIds.append(file_location)
        with open('recordIds_full.pkl', 'w') as f:
            pickle.dump(recordIds, f)

    return X, recordIds


'''@:param idx is the index for which to find half-energy
    prev_idx index of the previous peak
    next_idx index of the next peak'''
def get_half_energy(p_spectrum,f,idx,prev_idx,next_idx):
    a_spectrum = np.sqrt(p_spectrum)
    p_value = a_spectrum[idx]
    q_value = p_value/2
    ius = InterpolatedUnivariateSpline(f,a_spectrum-q_value)
    p_ius = InterpolatedUnivariateSpline(f,p_spectrum)
    roots = ius.roots()
    if len(roots) == 0:
        l_limit = find_minima(p_spectrum,f,prev_idx,idx,idx)
        r_limit = find_minima(p_spectrum,f,idx,next_idx,idx)
    elif len(roots) == 1:
        #check the root if it lies on the left side or the right side
        if roots[0] < f[prev_idx]:
            l_limit = find_minima(p_spectrum,f,prev_idx,idx,idx)
            r_limit = find_minima(p_spectrum,f,idx,next_idx,idx)
        elif roots[0] >= f[prev_idx] and roots[0] < f[idx]:
            l_limit = roots[0]
            #find minina on the right side
            r_limit = find_minima(p_spectrum,f,idx,next_idx,idx)
        elif roots[0] >= f[idx] and roots[0] <= f[next_idx]:
            r_limit = roots[0]
            l_limit = find_minima(p_spectrum,f,prev_idx,idx,idx)
        else:
            l_limit = find_minima(p_spectrum, f, prev_idx, idx, idx)
            r_limit = find_minima(p_spectrum, f, idx, next_idx, idx)
    elif len(roots) >= 2:
        #find left-closest root
        t = [(f[idx]-root,root) for root in roots if root < f[idx]]
        if len(t) == 0:
            l_limit = find_minima(p_spectrum, f, prev_idx, idx, idx)
        else:
            (d,l_limit) = min(t)
            if l_limit <= f[prev_idx]:
                l_limit = find_minima(p_spectrum,f,prev_idx,idx,idx)

        #find right-closest root
        t = [(root-f[idx],root) for root in roots if root > f[idx]]
        if len(t) == 0:
            r_limit = find_minima(p_spectrum, f, idx, next_idx, idx)
        else:
            d,r_limit = min(t)
            if r_limit >= f[next_idx]:
                r_limit = find_minima(p_spectrum,f,idx,next_idx,idx)
        '''diff = np.array([abs(root - f[idx]) for root in roots])
        sorted_indices = np.argsort(diff)
        i1,i2 = sorted_indices[0],sorted_indices[1]
        if f[i1] >= f[i2]:
            r_limit = roots[i1]
            l_limit = roots[i2]
        else:
            r_limit = roots[i2]
            l_limit = roots[i1]'''

    assert  r_limit > l_limit
    if r_limit is None:
        print "r_limit is None"
        sys.exit()
    if l_limit is None:
        print "l_limit is None"
        sys.exit()

    num = int((r_limit - l_limit)*StepStrideFrequencyModuleConstants.sample_points/(Constants.sampling_frequency/2))
    x = np.linspace(l_limit,r_limit,num=num)
    return np.trapz(p_ius(x),x)

#finds minima in y between a and b closest to point f[p]
def find_minima(y,f,a,b,p):
    assert a < b
    #find minima i.e multiply by -1 and find maxima
    indices = peakutils.indexes(-1*y[a:b+1])
    if len(indices) == 0:
        idx = np.argmin(y[a:b+1])
        return f[a+idx]
    elif len(indices) == 1:
        return f[a+indices[0]]
    else:
        (d,i) = min([(abs(f[a+idx]-f[p]),idx) for idx in indices])
        return f[a+i]

'''This method calculates the global peaks for the four labels and pickles the output'''
def get_global_peaks():
    (X,Y) = load_data()
    print "Computing Fourier transform and power spectrum..."
    X1 = []
    Y1 = []
    Z1 = []
    for x in X:
        (fx,x1) = fourier_transform(x["x"])
        (fy,y1) = fourier_transform(x["y"])
        (fz,z1) = fourier_transform(x["z"])
        x1 = remove_noise(x1,0.5)
        y1 = remove_noise(y1,0.5)
        z1 = remove_noise(z1,0.5)

        #interpolate
        (fx,x1) = interpolate_data(fx,x1)
        (fy,y1) = interpolate_data(fy,y1)
        (fz,z1) = interpolate_data(fz,z1)
        x1 = squared_signal(x1)
        y1 = squared_signal(y1)
        z1 = squared_signal(z1)
        X1.append(x1)
        Y1.append(y1)
        Z1.append(z1)

    print "Averaging the power spectrum..."
    power_spectrums_by_class_label_x = defaultdict(list)
    power_spectrums_by_class_label_y = defaultdict(list)
    power_spectrums_by_class_label_z = defaultdict(list)
    for idx,y in enumerate(Y):
        power_spectrums_by_class_label_x[y].append(X1[idx])
        power_spectrums_by_class_label_y[y].append(Y1[idx])
        power_spectrums_by_class_label_z[y].append(Z1[idx])

    avg_power_spectrum_x = {}
    avg_power_spectrum_y = {}
    avg_power_spectrum_z = {}
    for key in power_spectrums_by_class_label_x.keys():
        power_spectrums_by_class_label_x[key] = np.array(power_spectrums_by_class_label_x[key])
        avg_power_spectrum_x[key] = np.mean(power_spectrums_by_class_label_x[key],axis=0)
        avg_power_spectrum_y[key] = np.mean(power_spectrums_by_class_label_y[key],axis=0)
        avg_power_spectrum_z[key] = np.mean(power_spectrums_by_class_label_z[key],axis=0)

    global_peaks_grouped_by_key_x = {}
    global_peaks_grouped_by_key_y = {}
    global_peaks_grouped_by_key_z = {}
    for key in avg_power_spectrum_x.keys():
        p = avg_power_spectrum_x[key]
        f = StepStrideFrequencyModuleConstants.F
        # find the start-index
        start_index = next(x[0] for x in enumerate(f) if x[1] >= 0.5)
        # find the end-index
        end_index = next(x[0] for x in enumerate(f) if x[1] >= 2.5)
        indices = []
        t = 0.3
        while len(indices) < 2:
            # indices = peakutils.indexes(p_spectrum, thres=t, min_dist=1)
            indices = get_peaks_between(p, start_index=start_index, end_index=end_index, thres=t,
                                                  min_dist=1)
            t = t / 10

        print f[indices]

        left_idx = 0
        right_idx = indices[1]
        p1 = np.array([f[indices[0]],math.sqrt(p[indices[0]]),get_half_energy(p,f,indices[0],left_idx,right_idx)])

        left_idx = indices[0]
        right_idx = indices[2] if len(indices)>2 else Constants.sampling_frequency
        p2 = np.array([f[indices[1]],math.sqrt(p[indices[1]]),get_half_energy(p,f,indices[1],left_idx,right_idx)])

        global_peaks_grouped_by_key_x[key] = {"p1":p1,
                                    "p2":p2} #p is already a power spectrum
    print "=========================="

    for key in avg_power_spectrum_y.keys():
        p = avg_power_spectrum_y[key]
        f = StepStrideFrequencyModuleConstants.F
        # find the start-index
        start_index = next(x[0] for x in enumerate(f) if x[1] >= 0.5)
        # find the end-index
        end_index = next(x[0] for x in enumerate(f) if x[1] >= 2.5)
        indices = []
        t = 0.3
        while len(indices) < 2:
            indices = get_peaks_between(p, start_index=start_index, end_index=end_index, thres=t,
                                                  min_dist=1)
            t = t / 10
        print f[indices]
        left_idx = 0
        right_idx = indices[1]
        p1 = np.array([f[indices[0]],math.sqrt(p[indices[0]]),get_half_energy(p,f,indices[0],left_idx,right_idx)])

        left_idx = indices[0]
        right_idx = indices[2] if len(indices)>2 else Constants.sampling_frequency
        p2 = np.array([f[indices[1]],math.sqrt(p[indices[1]]),get_half_energy(p,f,indices[1],left_idx,right_idx)])

        global_peaks_grouped_by_key_y[key] = {"p1":p1,
                                    "p2":p2} #p is already a power spectrum
    print "=========================="

    for key in avg_power_spectrum_z.keys():
        p = avg_power_spectrum_z[key]
        f = StepStrideFrequencyModuleConstants.F
        # find the start-index
        start_index = next(x[0] for x in enumerate(f) if x[1] >= 0.5)
        # find the end-index
        end_index = next(x[0] for x in enumerate(f) if x[1] >= 2.5)
        indices = []
        t = 0.3
        while len(indices) < 2:
            indices = get_peaks_between(p, start_index=start_index, end_index=end_index, thres=t,
                                                  min_dist=1)
            t = t / 10
        print f[indices]
        left_idx = 0
        right_idx = indices[1]
        p1 = np.array([f[indices[0]],math.sqrt(p[indices[0]]),get_half_energy(p,f,indices[0],left_idx,right_idx)])

        left_idx = indices[0]
        right_idx = indices[2] if len(indices)>2 else Constants.sampling_frequency
        p2 = np.array([f[indices[1]],math.sqrt(p[indices[1]]),get_half_energy(p,f,indices[1],left_idx,right_idx)])

        global_peaks_grouped_by_key_z[key] = {"p1":p1,
                                    "p2":p2} #p is already a power spectrum

    print global_peaks_grouped_by_key_x
    print "=========================="
    print global_peaks_grouped_by_key_y
    print "=========================="
    print global_peaks_grouped_by_key_z

    with open("global_peaks_x.pkl","w") as f:
        pickle.dump(global_peaks_grouped_by_key_x,f)

    with open("global_peaks_y.pkl","w") as f:
        pickle.dump(global_peaks_grouped_by_key_y,f)

    with open("global_peaks_z.pkl","w") as f:
        pickle.dump(global_peaks_grouped_by_key_z,f)

    sys.exit()
    #create validation set
    for i in xrange(3):
        random_indices = random.sample(range(len(rows)), 500)
        df = pd.DataFrame(columns=["healthCode_medTimepoint","all_peaks","selected_peaks"])
        for idx in random_indices:
            result = get_step_stride_frequency(global_peaks_grouped_by_key_x,avg_pspectrum_by_healthcode_and_class_label,rows[idx])
            df = df.append([result])
        df.to_csv("validation-set" + str(i+1)+ ".csv",index=False)

'''This method is used to find step and stride frequency for each p_spectrum'''
def get_step_stride_frequency(global_peaks, p_spectrum, group_type, mu, sd):
    alpha = 1.0
    #peaks will be stored in this array
    indices = []
    t = 0.1
    f = StepStrideFrequencyModuleConstants.F

    #find the start-index
    start_index = next(x[0] for x in enumerate(f) if x[1] >= 0.5)
    #find the end-index
    end_index = next(x[0] for x in enumerate(f) if x[1] >= 2.5)
    while len(indices) < 2:
        indices = get_peaks_between(p_spectrum, start_index=start_index, end_index=end_index,thres=t,min_dist=1)
        t = t / 10

    #print f[indices]
    score = float("inf")
    (a, b) = (None, None)
    for i in range(len(indices)):
        f1 = f[indices[i]]
        a1 = p_spectrum[indices[i]]
        left_idx = indices[i-1] if i-1>=0 else 0
        right_idx = indices[i+1] if i+1<len(indices) else len(f)-1
        p1 = np.array([f1, math.sqrt(a1), get_half_energy(p_spectrum, f, indices[i],left_idx,right_idx)])
        p1 = (p1 - mu)/sd
        p1_prime = global_peaks[group_type]["p1"]
        p1_prime = (p1_prime - mu)/sd
        for j in range(i + 1, len(indices)):
            f2 = f[indices[j]]
            a2 = p_spectrum[indices[j]]
            left_idx = indices[j-1] if j-1>=0 else 0
            right_idx = indices[j+1] if j+1<len(indices) else len(f)-1
            p2 = np.array([f2, math.sqrt(a2), get_half_energy(p_spectrum, f, indices[j],left_idx,right_idx)])
            p2 = (p2 - mu)/sd
            p2_prime = global_peaks[group_type]["p2"]
            p2_prime = (p2_prime - mu)/sd

            # minimize this function to find p1 and p2
            new_score = np.linalg.norm(p1 - p1_prime) + np.linalg.norm(p2 - p2_prime) + alpha * np.linalg.norm(2 * p1[0] - p2[0])  # + p1[0]**2 + p2[0]**2
            if new_score < score:
                score = new_score
                (a, b) = (i, j)
    return {"group_type":group_type, "all_peaks": f[indices], "selected_peaks":f[[indices[a], indices[b]]]}

'''This method is used to load the data with given filename'''
def get_data(filename):
    return pd.read_json(Constants.walking_outbound_path + "/" + filename)

if __name__ == '__main__':
    load_data()