Content - 0f6d2857a43ee2881461dafc2909367f728a0ce1 - e8c8578/RunOnData.py

visit type:
Tip revision: 8e64d9628355fde264b6481f9955478fed995fe5 authored by Avinash Bukkittu on 04 October 2017, 22:20:26 UTC
Merge branch 'master' of https://github.com/ab4377/dream-project
Tip revision: 8e64d96
RunOnData.py
import os
import pickle
import pandas as pd
import sys
from collections import Counter
import numpy as np
from collections import defaultdict
from tqdm import tqdm
import argparse

'''This method is used to fetch features from the data'''
def fetch_features(mdl,datum,**kwargs):
    viterbi_sequence = mdl.predict(datum, 0)[1]
    #collapse the sequence
    collapsed_sequence = []
    prev_state = viterbi_sequence[0]
    count = 1
    for idx, state in enumerate(viterbi_sequence[1:]):
        if state != prev_state:
            collapsed_sequence.append(count)
            prev_state = state
            count = 1
        else:
            count = count + 1
    collapsed_sequence.append(count)
    # get the time spent in each collapsed sequence
    total_time = kwargs["total_time"]
    assert len(viterbi_sequence) > 0
    time_in_collapsed_sequence = np.array([total_time * seq / len(viterbi_sequence) for seq in collapsed_sequence])
    fraction_time_in_states = np.zeros(shape=(25,), dtype=float)
    counts = Counter(viterbi_sequence)
    for key in counts.keys():
        fraction_time_in_states[key] = total_time * counts[key] / len(viterbi_sequence)
    return len(time_in_collapsed_sequence), time_in_collapsed_sequence, fraction_time_in_states

'''This method is used to bring all the rows to the same size for a given (activity,category) 
i.e ({outbound,return,both},{0,1,2,3})'''
def normalize_features(feature_set):
    max_length = max([t[0] for t in feature_set.values()])
    new_feature_set = []
    feature_values = []
    for idx,key in enumerate(feature_set.keys()):
        collapsed_seq = feature_set[key][1]
        if len(collapsed_seq) < max_length :
            curr_len = len(collapsed_seq)
            collapsed_seq = np.append(collapsed_seq, np.zeros(shape=(max_length-curr_len,), dtype=float), axis=0)
        assert len(collapsed_seq) == max_length
        fraction_time = feature_set[key][2]
        feature_values.append(np.append(collapsed_seq,fraction_time,axis=0))
        feature = [key]
        feature.extend(np.append(collapsed_seq,fraction_time,axis=0).tolist())
        new_feature_set.append(feature)
    #now calculate the mean of the feature values which is used for missing data
    feature_values = np.array(feature_values)
    return new_feature_set, np.mean(feature_values,axis=0)

#constants to be used throughout the program
model_location = "/ifs/home/c2b2/ip_lab/shares/DATA/dataset/models/"
training_data_location = "/ifs/home/c2b2/ip_lab/shares/DATA/fwd_bwd_data/converted_fb_accel_data/"
additional_training_data_location = "/ifs/home/c2b2/ip_lab/shares/DATA/dataset/supp_fwd_bwd_data/"
test_data_location = "/ifs/home/c2b2/ip_lab/shares/DATA/dataset/test_fwd_bwd_data/"
meta_data_location = "/ifs/home/c2b2/ip_lab/shares/DATA/dataset/meta-data.csv"
meta_testdata_location = "/ifs/home/c2b2/ip_lab/shares/DATA/dataset/meta-data-testing.csv"
meta_additional_data_location = "/ifs/home/c2b2/ip_lab/shares/DATA/dataset/meta-data-additional.csv"

#different types of models
both_mdls = {}
return_mdls = {}
outbound_mdls = {}

#different models based on the phones
#this data can be found in category.csv, but hardcoding it now, since it is a one-time thing
iphone_6 = ["category0.pyhsmm","category1.pyhsmm","category2.pyhsmm","category3.pyhsmm"]
iphone_6_plus = ["category4.pyhsmm","category5.pyhsmm","category6.pyhsmm","category7.pyhsmm"]
iphone_5s_gsm = ["category8.pyhsmm","category9.pyhsmm","category10.pyhsmm","category11.pyhsmm"]
others = ["category12.pyhsmm","category13.pyhsmm","category14.pyhsmm","category15.pyhsmm"]

def run_on_data(type,activity,category_index,start_index,end_index):

    data_location = None
    if type == "training":
        meta_data = pd.read_csv(meta_data_location)
        data_location = training_data_location
    elif type == "testing":
        meta_data = pd.read_csv(meta_testdata_location)
        data_location = test_data_location
    elif type == "additional":
        meta_data = pd.read_csv(meta_additional_data_location)
        data_location = additional_training_data_location
    else:
        print "Invalid dataset type {}".format(type)
        sys.exit()

    if activity == "both":
        print "Loading the outbound+return models..."
        for file in os.listdir(model_location + "both/"):
            full_path = model_location + "both/" + file
            with open(full_path,"r") as f:
                both_mdls[file] = pickle.load(f)
    elif activity == "outbound":
        print "Loading the outbound models..."
        for file in os.listdir(model_location + "outbound/"):
            full_path = model_location + "outbound/" + file
            with open(full_path, "r") as f:
                outbound_mdls[file] = pickle.load(f)
    elif activity == "return":
        print "Loading the return models..."
        for file in os.listdir(model_location + "return/"):
            full_path = model_location + "return/" + file
            with open(full_path, "r") as f:
                return_mdls[file] = pickle.load(f)
    else:
        print "Invalid activity {}".format(activity)
        sys.exit()

    print "Running on the data..."
    start_index = int(start_index)
    end_index = int(end_index)
    assert start_index < end_index
    meta_data = meta_data[start_index:end_index]
    feature_set = defaultdict(tuple)
    missing_records = []
    if activity == "outbound":
        for index,row in meta_data.iterrows():
            record_id = row["recordId"]
            phone_info = row["phoneInfo"]
            #load the data corresponding to this record_id
            record_id_location = data_location + record_id
            if os.path.isdir(record_id_location):
                outbound_path = record_id_location + "/" + "outbound.csv"
                if os.path.isfile(outbound_path):
                    outbound_df = pd.read_csv(outbound_path)
                    start_time = outbound_df.loc[0, "timestamp"]
                    end_time = outbound_df.loc[len(outbound_df) - 1, "timestamp"]
                    outbound_array = outbound_df[["x","y","z"]].as_matrix()
                    if phone_info == "iPhone 6":
                        model = iphone_6[category_index]
                    elif phone_info == "iPhone 6 Plus":
                        model = iphone_6_plus[category_index]
                    elif phone_info == "iPhone 5s (GSM)":
                        model = iphone_5s_gsm[category_index]
                    else:
                        model = others[category_index]
                    #for idx,model_name in enumerate(model):
                    if outbound_mdls.has_key(model):
                        mdl = outbound_mdls[model]
                        t = fetch_features(mdl, outbound_array, total_time=end_time - start_time)
                        feature_set[record_id] = t
                    else:
                        print "{} does not exist for {}".format(model,activity)

            else:
                missing_records.append(record_id)
                print "Outbound data for {} is missing".format(record_id)
    elif activity == "return":
        for index, row in meta_data.iterrows():
            record_id = row["recordId"]
            phone_info = row["phoneInfo"]
            # load the data corresponding to this record_id
            record_id_location = data_location + record_id
            if os.path.isdir(record_id_location):
                return_path = record_id_location + "/" + "return.csv"
                if os.path.isfile(return_path):
                    outbound_df = pd.read_csv(return_path)
                    start_time = outbound_df.loc[0, "timestamp"]
                    end_time = outbound_df.loc[len(outbound_df) - 1, "timestamp"]
                    return_array = outbound_df[["x", "y", "z"]].as_matrix()
                    if phone_info == "iPhone 6":
                        model = iphone_6[category_index]
                    elif phone_info == "iPhone 6 Plus":
                        model = iphone_6_plus[category_index]
                    elif phone_info == "iPhone 5s (GSM)":
                        model = iphone_5s_gsm[category_index]
                    else:
                        model = others[category_index]

                    if return_mdls.has_key(model):
                        mdl = return_mdls[model]
                        t = fetch_features(mdl, return_array, total_time=end_time - start_time)
                        feature_set[record_id] = t
                    else:
                        print "{} does not exist for {}".format(model, activity)
            else:
                missing_records.append(record_id)
                print "Return data for {} is missing".format(record_id)
    elif activity == "both":
        for index, row in meta_data.iterrows():
            record_id = row["recordId"]
            phone_info = row["phoneInfo"]
            # load the data corresponding to this record_id
            record_id_location = data_location + record_id
            if os.path.isdir(record_id_location):
                outbound_path = record_id_location + "/" + "outbound.csv"
                return_path = record_id_location + "/" + "return.csv"
                if os.path.isfile(outbound_path) and os.path.isfile(return_path):
                    outbound_df = pd.read_csv(outbound_path)
                    return_df = pd.read_csv(return_path)
                    outbound_time = outbound_df.loc[len(outbound_df) - 1, "timestamp"] - outbound_df.loc[0, "timestamp"]
                    return_time = return_df.loc[len(return_df) - 1, "timestamp"] - return_df.loc[0, "timestamp"]
                    total_time = outbound_time + return_time
                    both_df = outbound_df[["x", "y", "z"]]
                    both_array = both_df.append(return_df[["x", "y", "z"]]).as_matrix()
                    if phone_info == "iPhone 6":
                        model = iphone_6[category_index]
                    elif phone_info == "iPhone 6 Plus":
                        model = iphone_6_plus[category_index]
                    elif phone_info == "iPhone 5s (GSM)":
                        model = iphone_5s_gsm[category_index]
                    else:
                        model = others[category_index]
                    if both_mdls.has_key(model):
                        mdl = both_mdls[model]
                        t = fetch_features(mdl, both_array, total_time=total_time)
                        feature_set[record_id] = t
                    else:
                        print "{} does not exist for {}".format(model, activity)
            else:
                missing_records.append(record_id)
                print "Outbound+Return data for {} is missing".format(record_id)
    else:
        print "Invalid value {} for activity".format(activity)
        sys.exit()

    with open("{}/feature_set_{}_{}_category{}_start-index{}_end-index{}.pkl".format(type+"-new",type,activity,category_index,start_index,end_index),"w") as f:
        pickle.dump(feature_set,f)

    with open("{}/missing_records_{}_{}_category{}_start-index{}_end-index{}.pkl".format(type+"-new",type,activity,category_index,start_index,end_index),"w") as f:
        pickle.dump(missing_records,f)

    '''feature_set, mean_value = normalize_features(feature_set)
    for record in missing_records:
        feature_set.append([record, mean_value.tolist()])
    feature_set = pd.DataFrame(data=feature_set)
    feature_set.to_csv("feature_set_{}_{}.csv".format(activity,category_index), index=False)'''


if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("type",help="type of dataset")
    arg_parser.add_argument("activity", help="activity")
    arg_parser.add_argument("category_index", help="index")
    arg_parser.add_argument("start_index",help="start_index")
    arg_parser.add_argument("end_index",help="end_index")
    args = arg_parser.parse_args()
    run_on_data(args.type,args.activity,int(args.category_index),args.start_index,args.end_index)