https://github.com/bansallab/indoor_outdoor
Tip revision: 2a0e6c7e35bc07eb2fb901e202dca7e64fc9fe5c authored by Zachary Susswein on 02 April 2023, 16:11:23 UTC
Clean up paths and add required data
timeseries_clustering.py
#!/usr/bin/env python
# # Network-based timeseries clustering
# #### Approach: calculate correlations between county timeseries; represent the correlations as a county network, dropping low-weight edges; and run community structure detection on the network to identify groups of nodes that are more similar to each other in terms of dynamics
#
# #### Author: Shweta Bansal
# #### Start date: July 12, 2021
# #### Updated: Dec 23, 2022
import os
import networkx as nx
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import random as rnd
# import functions for the time-series clustering method
import epi_geo_functions as fegf
# run hierarchical clustering
import scipy.cluster.hierarchy as hac
from scipy.cluster.hierarchy import fcluster
###########################################
# CHANGE PARAMETERS HERE FOR REST OF ANALYSIS
year_of_analysis = [2018,2019] #baseline years only
years_label = "_".join([str(y) for y in year_of_analysis])
filename = years_label+'_Dec2022_final'
threshold = 4 # absolute threshold for mean-centered indoor activity measure (note: reassigned below to a correlation-percentile threshold before use)
corr_percentile = 90 # percentile for the minimum time series correlation between counties (90 = default)
num_bootstrap = 25 # number of bootstrap networks over which to do analysis
drop_small = 10 # drop communities with at most this many counties
contiguity_threshold = 2 # merge any island communities with neighboring cluster
sandbox = False # set to True to enable debugging/sandbox mode
rolling = True # if True, smooth each county time series with a rolling mean
num_weeks = 4 # number of weeks over which to take the rolling mean
###########################################
# LOAD DATA & CLEAN
# Load indoor/outdoor data for all years, take rolling mean
df_sm = pd.read_csv("indoor_activity_data/indoor_activity_2018_2020.csv")
# fix date format
df_sm['date'] = pd.to_datetime(df_sm.date, format='%Y-%m-%d')
# take rolling mean
# number of weeks to roll defined above
# smoothing doesn't affect clustering results, but makes time series plots cleaner
if rolling:
    ct_list = list(df_sm.county.unique())
    dfn = pd.DataFrame()
    for ct in ct_list:
        dfx = df_sm[df_sm.county==ct]
        # rolling average of time series
        dfx = dfx.sort_values(by='date')
        dfx = dfx[['date', 'indoor_activity']]
        dfx = dfx.set_index('date')
        dfx = dfx.rolling(num_weeks).mean()
        dfx = dfx.reset_index()
        dfx['county'] = ct
        dfn = pd.concat([dfn, dfx], ignore_index=True)
    df_sm = dfn.copy()
df_sm = df_sm[['date', 'county', 'indoor_activity']]
df_sm.tail()
###########################################
# PREPARE TIMESERIES DATA FOR CLUSTERING
df_time = df_sm.copy()
# only keep data for years of interest
df_time['year'] = df_time.date.dt.year
df_time = df_time[df_time.year.isin(year_of_analysis)]
df_time = df_time[['county', 'indoor_activity', 'date']]
# convert from long to wide format, where rows = weeks and columns = county fips
df_matrix = df_time.pivot(index = 'date', columns='county', values='indoor_activity').reset_index(drop=True)
# keep unnormalized copy of df_matrix
df_matrix_unnorm = df_matrix.copy()
# z-normalize time series
m = df_matrix.mean(axis=0) # take mean for each col (i.e. for each county)
s = df_matrix.std(axis=0) # take std for each col
df_matrix = df_matrix.sub(m, axis=1) # subtract county mean from county time series
df_matrix = df_matrix.div(s, axis=1) # divide county time series by county stdev
# clean up dataframe by making all Nans 0
df_matrix = df_matrix.fillna(0)
df_matrix.head()
###########################################
# CREATE CORRELATION MATRIX
# get correlation matrix between timeseries
corr_df = fegf.calc_timeseries_correlations(df_matrix)
corr_df.head()
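# fegf.calc_timeseries_correlations is a project helper defined in
# epi_geo_functions.py; the sketch below is only an illustration of what an
# equivalent computation might look like, assuming the helper returns pairwise
# Pearson correlations between the county columns of df_matrix. It is not
# called here.
def _calc_timeseries_correlations_sketch(df):
    """Illustrative only: pairwise Pearson correlation between county columns."""
    return df.corr(method='pearson')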
###########################################
# CREATE NETWORK
# create network from correlation matrix
threshold = np.percentile(corr_df.values.tolist()[0], corr_percentile)
network = fegf.create_network(corr_df, threshold, df_sm, False)
# create nearest neighbor network
G_nn = fegf.make_nn_network("./", False)
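# fegf.create_network and fegf.make_nn_network are project helpers; as an
# illustration only (assuming corr_df is a square county-by-county correlation
# matrix and that edges are kept when the correlation meets the percentile
# threshold), a comparable construction with networkx might look like the
# sketch below. It is not called here.
def _thresholded_corr_network_sketch(corr, thresh):
    """Illustrative only: county network with edges where correlation >= thresh."""
    G = nx.Graph()
    G.add_nodes_from(corr.columns)
    for i, u in enumerate(corr.columns):
        for v in corr.columns[i+1:]:
            if corr.loc[u, v] >= thresh:
                G.add_edge(u, v, weight=corr.loc[u, v])
    return G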
############################################
# PERFORM TIME SERIES CLUSTERING THROUGH LOUVAIN COMMUNITY STRUCTURE
# run community structure detection using Louvain algo
part, dftemp = fegf.comm_struct_robustness(network, G_nn, 'louvain_igraph', num_bootstrap)
part_df = fegf.reduce_num_communities(part, drop_small) # drop communities with at most drop_small counties
# save to file
part_df.to_csv("community_structure_"+filename+'.csv', index=False)
num_clusters = len(part_df.modularity_class.unique())
# output number of clusters and sizes of each cluster
print(part_df.modularity_class.value_counts())
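# fegf.comm_struct_robustness wraps community detection over num_bootstrap
# resampled networks and returns a consensus partition; the sketch below shows
# only what a single Louvain run might look like, using networkx's built-in
# implementation (networkx >= 2.8) instead of igraph and without the
# bootstrap/consensus step. It is not called here.
def _single_louvain_run_sketch(G, seed=0):
    """Illustrative only: one Louvain partition as a {node: community_id} dict."""
    communities = nx.community.louvain_communities(G, weight='weight', seed=seed)
    return {node: cid for cid, members in enumerate(communities) for node in members}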
#############################################
# POST PROCESS RESULTS
# make community structure feasible
part_nonan_df = fegf.fill_in_nans(G_nn, part_df, 0.15)
part_contig_df = fegf.increase_spatial_contiguity(G_nn, part_nonan_df, contiguity_threshold, 0.15)
part_nonan_df.to_csv("community_structure_nonan_"+filename+'.csv', index=False)
part_contig_df.to_csv("community_structure_feasible_"+filename+'.csv', index=False)
#############################################
# OUTPUT RESULTS
# plot time series (non-zscored data)
fegf.plot_timeseries(df_matrix_unnorm, part_df, filename+'_non_zscore', year_of_analysis, num_clusters, [0.25,2])
# plot time series (z-normalized)
#fegf.plot_timeseries(df_matrix, part_df, filename+'_zscore', year_of_analysis, num_clusters, [-3,3])
# make map
fegf.make_module_map(part_nonan_df,filename)
# make feasible map
fegf.make_module_map(part_contig_df,filename+'_feasible')
#############################################
# COMPARE RESULTS TO HIERARCHICAL CLUSTERING
import importlib
importlib.reload(fegf)
# prepare matrix
df_matrix_hc = df_matrix.fillna(0).transpose() # needs to be county x week
df_matrix_hc = df_matrix_hc.loc[~(df_matrix_hc==0).all(axis=1)] # remove rows with all 0s
list_nodes = list(df_matrix_hc.index)
part_hier_clust = {} # dictionary of dataframes
for num_clusters in range(2,5): # vary number of clusters in partition
    # set up the time series linkage matrix for clustering
    Z = hac.linkage(df_matrix_hc, method='ward', metric='euclidean')
    # do time series clustering
    results = fcluster(Z, t=num_clusters, criterion='maxclust')
    # the results just tell you which cluster each node (county) is in, so this attaches the node ids to the cluster ids
    partition = dict(zip(list_nodes, results))
    # reduce small communities
    part_hier_clust[num_clusters] = fegf.reduce_num_communities(partition, 5)
    # output module number and sizes
    print(part_hier_clust[num_clusters].modularity_class.value_counts())
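# Optional sanity check (not part of the original pipeline): inspect the Ward
# linkage dendrogram for the last linkage matrix Z to eyeball a reasonable
# cluster count. Guarded by the sandbox debug flag above so it does not run by
# default; the output filename is illustrative.
if sandbox:
    fig, ax = plt.subplots(figsize=(10, 4))
    hac.dendrogram(Z, no_labels=True, ax=ax)
    ax.set_ylabel('Ward linkage distance')
    fig.savefig('hierclust_dendrogram_' + filename + '.png')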
#############################################
# For num_clusters = 3, output results and quantify similarity to the network clustering partition
num_clusters = 3
# relabel partitions to match the ordering of the main results
part_hier_clust[num_clusters] = fegf.relabel_clusters(part_hier_clust[num_clusters])
# output partition
part_hier_clust[num_clusters].to_csv("community_structure_hierclust_"+filename+'.csv', index=False)
# plot time series (non-zscored data)
fegf.plot_timeseries(df_matrix_unnorm, part_hier_clust[num_clusters], filename+'_hierclust_'+str(num_clusters), year_of_analysis, num_clusters, [0.25,2])
# make map
fegf.make_module_map(part_hier_clust[num_clusters],filename+'_hierclust_'+str(num_clusters))
# calculate mutual information on two partitions
part_network = pd.read_csv('community_structure_[2018, 2019]_Dec2022_corr90.csv') # this is the 90th percentile correlation network's comm struct results
df3 = pd.merge(part_network, part_hier_clust[num_clusters], on='node', how='inner') # merge two partitions by node
nmi = fegf.normalized_mutual_info_score(df3["modularity_class_x"], df3["modularity_class_y"])
print(nmi)
num_counties = len(list(df3.modularity_class_x))
# fraction of counties assigned to the same cluster label in both partitions
match = sum([1 for a,b in zip(df3.modularity_class_x, df3.modularity_class_y) if a==b])/float(num_counties)
print(match)
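# fegf.normalized_mutual_info_score presumably wraps (or re-exports)
# scikit-learn's implementation; if so, the same comparison could be made
# directly, e.g.:
# from sklearn.metrics import normalized_mutual_info_score
# nmi_check = normalized_mutual_info_score(df3['modularity_class_x'], df3['modularity_class_y'])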