https://github.com/bansallab/indoor_outdoor
Raw File
Tip revision: d8a2ffc49f46a22c45814bd1dfcd1b054f2a4a27 authored by Shweta Bansal on 03 April 2023, 13:09:08 UTC
Merge pull request #1 from bansallab/update-path
Tip revision: d8a2ffc
epi_geo_functions.py
#!/usr/bin/env python3

########################
# CODE FOR Running Community Structure Detection
#
# Code written by Shweta Bansal
# LAST UPDATED: Dec 22, 2022 
########################

###########################################################
# IMPORT NECESSARY MODULES

import pandas as pd
import csv
import networkx as nx
import itertools 
import numpy as np
import math
import operator
from collections import Counter
#import importlib

# import graphing tools
import matplotlib.pyplot as plt
import plotly.offline as plto
plto.init_notebook_mode(connected=True)

# import functions for mapping
import plotly.express as px
from urllib.request import urlopen
import json
import kaleido # to save map files


# import stats/machine learning tools
import scipy as sy
from sklearn.metrics.cluster import normalized_mutual_info_score

# import community # this is the networkx implementation of the Louvain algorithm
# from https://github.com/taynaud/python-louvain
import community

# import an even faster implementation of Louvain
# SB tested this on Feb 17, 2019 and it gives identical results to community in a much shorter time
# https://louvain-igraph.readthedocs.io/en/latest/reference.html
import louvain
import igraph as ig

# import community adapted to spectral null from MacMahon et al, 2015
# Code base is from https://github.com/taynaud/python-louvain
#from community_with_spectralnull import community_louvain as community_spectralnull
#importlib.reload(community_spectralnull)

# stop warnings from printing out (comment out if debugging) 
import warnings as wn
wn.filterwarnings("ignore")

####################################################################
# PART 1:
# DATA CLEANING AND BASIC DATA ISOLATION FUNCTIONS
####################################################################

    
####################################################
def calc_timeseries_correlations(all_county_df):
# Makes a dataframe OF THE CORRELATIONS BETWEEN ALL TIMESERIES IN THE DATA
# inputs: the dataframe of county timeseries(rows: weeks, columns: counties)
# outputs: a dataframe of the correlations in the flu season county x county with 1 down diagonal
#          and the dataframe of county timeseries
   
    
    # calculate pearsons correlation between time series for each pair of time series
    # and make NaNs and negative correlations 0 (these 0s will later be ignored)
    correlation_df = all_county_df.corr(method='pearson')
    correlation_df = correlation_df.fillna(0)
    correlation_df = correlation_df.clip(lower=0) # make negative values 0
    
    return correlation_df
    
    
####################################################
def create_network(correlation_df, threshold, data, state=None):
# CREATES A NETWORK FROM THE CORRELATION DATAFRAME
# INPUTS: needs the correlation df, threshold for edgeweight to be included in the network 
# outputs: a graph of the correlation network 
    
    if state:
        list_counties = data.state.unique()
    else:
        list_counties = data.county.unique()

    matrix_1 = sy.sparse.lil_matrix(correlation_df) # convert correlation dataframe to matrix
    matrix_1.setdiag(0)                             # ignore self correlations (set the diagonals = 0)
    matrix_1[matrix_1 < threshold] = 0              # set anything below the threshold = 0 
    
    # Construct the network
    # Note: no edges exist between counties where correlation < threshold
    network = nx.from_scipy_sparse_matrix(matrix_1)
     
    # Relabel nodes with FIPS 
    nodelabel_mapping = dict(zip(range(0,len(list_counties)), list_counties))
    nx.relabel_nodes(network,nodelabel_mapping, copy=False)
	
	# Remove edges that have NaN weight 
    to_remove = [(i,j) for i,j,d in network.edges(data=True) if math.isnan(float(d['weight']))]
    network.remove_edges_from(to_remove)
    
    return network

##################################################
# Creates nearest neighbor graph for US counties (neighbor list is based on shared borders)
def make_nn_network(fullpath, state=None):
    
    # read neighbor lists
    if state:
        reader_data = csv.reader(open(fullpath+'county_neighbors_fips.txt'), delimiter=',')
        neighbor_list = [(int(int(line[0])/1000), int(int(line[1])/1000)) for line in reader_data]
    else:
        reader_data = csv.reader(open(fullpath+'county_neighbors_fips.txt'), delimiter=',')
        neighbor_list = [(int(line[0]), int(line[1])) for line in reader_data]
 
    G_nn = nx.Graph()
    for u,v in neighbor_list:
        G_nn.add_edge(u,v, weight=1)
              
    return G_nn


##################################################
# PART 2: RUN LOUVAIN COMMUNITY DETECTION ANALYSIS
##################################################

####################################################
def louvain_community_detection(network, G_nn, implementation, Cg = None, nodelabel_map=None):
# Runs Louvain community detection algorithm on weighted network
# If using 'louvain_spectralnull', need to provie Cg matrix

    if implementation == 'louvain_community':
        partition = community.best_partition(network, weight='weight')
        
    elif implementation == 'louvain_igraph': # fastest
        # convert to igraph
        edges = [(e1,e2,a['weight']) for e1,e2, a in network.edges(data=True)]
        graph_ig = ig.Graph.TupleList(edges, directed=False, weights=True, vertex_name_attr='name')
        ws = [e["weight"] for e in graph_ig.es()]
        part = louvain.find_partition(graph_ig, louvain.ModularityVertexPartition, weights = ws)
        # convert result to dictionary
        partition = {}
        for comm_id,comm_members in enumerate(part):
            comm_member_names = [graph_ig.vs[c]["name"] for c in comm_members]
            partition.update({c: comm_id for c in comm_member_names})
    
    #elif implementation == 'louvain_spectralnull':
    # code not complete, need to adapt method for induced graph (Dec 2022)
        #partition = community_spectralnull.best_partition(graph=network, Cg=Cg, nodelabel_map=nodelabel_map, weight='weight')


    # make sure all nodes are in partition (if any missing, add as NaNs)
    m1 = [n for n in list(network.nodes()) if n not in partition.keys()]   
    m2 = ([n for n in list(G_nn.nodes()) if n not in partition.keys()])
    missing_nodes = m1 + m2
    missing_nodes = list(set(missing_nodes))
    partition.update({m: float('NaN') for m in missing_nodes})

    return partition


####################################################
def reduce_num_communities(partition, thresh):
# REDUCES COMMUNITIES SMALLER THAN THRESH BY REMOVING THEM, and relabel modularity class to be consecutive
# Partition is dictionary with key: nodeid, value: community id
   
	# find the community sizes of each community
    uniq_comms = list(set(partition.values()))
    uniq_comms = [u for u in uniq_comms if float(u) != float('NaN')]
    comm_size = {i:len([node for node, part in partition.items() if part == i]) for i in uniq_comms} #dictionary
    
    # find all the communties that are too small (i.e smaller than or equal to thresh nodes)
    small_comm_id = [comm for comm,csize in comm_size.items() if (csize <= thresh)]
    
    # for communities identified as "small", make all nodes have NaN as modularity class
    for node in [key for key,val in partition.items() if val in small_comm_id]:
        partition[node] = float('NaN')
        
    # relabel modularity classes so that they are consecutive
    current_modclass = list(set(partition.values()))
    current_modclass = [int(m) for m in current_modclass if str(m) != 'nan']
    new_labels = list(range(0,len(current_modclass))) # create a new set of labels for the modularity classes
    label_dict = dict(zip(current_modclass, new_labels))
    
    # convert partition dictionary to dataframe; switch labels using map;
    part_df = pd.DataFrame.from_dict(partition, orient='index').reset_index()
    part_df = part_df.rename(columns = {'index': 'node', 0: 'modularity_class'})
    part_df['modularity_class'] = part_df['modularity_class'].map(label_dict) # relabel old labels with new labels
        
    return part_df
    

##################################################
def comm_struct_robustness(G, G_nn, implementation, num_bootstrap = 10, Cg=None, nodelabel_map=None):
# This function calculates a robust community structure 
# uses method of Donker/Wallinga/Slack/Grundmann "Hospital Networks and the Dispersal of Hospital-Acquired Pathogens by Patient Transfer"
# works by generating a bootstrap network with a different set of edge weights
# and re-computing the community structure on that graph
# finally the partition which has the most agreement to other partitions is chosen
# SB tested this method on Feb 17, 2019 and found that each bootstrap partition is highly similar to others

    part_alt = {}
    df = {}
    B = {i:0 for i in range(num_bootstrap)}
    
    for i in range(0,num_bootstrap):
        G_alt = bootstrap_network_edgeweight(G)
        part_alt[i] = louvain_community_detection(G_alt, G_nn, implementation, Cg, nodelabel_map)
        df[i] = pd.DataFrame.from_dict(part_alt[i], orient='index').reset_index()
        
    # Figure out how much consensus there is between each partition
    pairs = list(itertools.combinations(range(0,num_bootstrap), 2))
    for i,j in pairs:
        
        # merge pair of dataframes so that they line up
        df3 = pd.merge(df[i], df[j], on='index', how='inner')
        
        df3 = df3[~df3["0_x"].isna()]
        df3 = df3[~df3["0_y"].isna()]

        # calculate mutual information on two partitions
        nmi = normalized_mutual_info_score(df3["0_x"], df3["0_y"])
        
        # this value measures how much consensus partition[i] has with all the others       
        B[i] = B[i] + nmi
 
    #finding the key to the largest value in the dictionary
    if num_bootstrap == 1:
        i= 0
    else:
        i = max(B.items(), key=operator.itemgetter(1))[0]
       
    return part_alt[i], df[i]

##################################################
def bootstrap_network_edgeweight(G):
# returns a network where the edge-weights are redrawn from 
# a Poisson distribution with mean = eij (original edge weight)
	
	Galt = nx.Graph()
    
    #draw a bootstrap edge weight +/- Normal(0, 0.05)
	edges = [(u,v, d['weight']+np.random.normal(loc = 0,scale = 0.05)) for u,v,d in G.edges(data=True)]
    
	Galt.add_weighted_edges_from(edges)
	
	return Galt


##################################################
# PART 2b: SPATIAL COMMUNITY STRUCTURE ADJUSTMENT
##################################################


##################################################
def increase_spatial_contiguity(G_nn, partition_orig, size_thresh, Q_thresh):
# contiguity = all parts of a district be physically connected; no land islands
# increases spatial cohesiveness by looking at each community and combining nodes with 
#   nearest neighbor communities if not connected to rest of community
# keeps going till new Qrel value is still within thresh of original Qrel value
# INPUTS: G is network, G_nn is nearest neighbor network, partition is a dataframe with node and moularity
#  classes and Q_thresh is a tolerance for the reduction in modularity (Q_relative)
    
    partition = partition_orig.copy()
    
    # get a list of all the unique modularity classes (except 'nan')
    unique_modclass = list(partition['modularity_class'].dropna().unique())
    
    # identify geographical components of each community
    components = []                  
    for c in unique_modclass:
        comm_members = partition.loc[partition['modularity_class']== float(c)]['node'].tolist() # get all nodes that have modularity_class ==c

        H = G_nn.subgraph(comm_members)
        
        # find small components (size_thresh counties or less) that are islands
        comps = [comp for comp in nx.connected_components(H) if len(comp) <= size_thresh]
        components.extend(comps)

       
    # try to make graph more contiguous starting from the smallest components       
    for comp in components:
        
        # get all nearest neighbors of nodes in this component
        nn = [list(G_nn.neighbors(node)) for node in comp] # is list of lists
        nearest_neigh= [node for l in nn for node in l if node not in comp] # collapse list of lists and remove those nodes that are in the component
        nearest_neigh = list(set(nearest_neigh)) # get unique nearest neighbors
                
        nearest_neigh_comm = partition.loc[partition.node.isin(nearest_neigh)]['modularity_class'].dropna().tolist()
        
        if nearest_neigh_comm:
            comm_common = Counter(nearest_neigh_comm).most_common()[0][0]
            #print(nearest_neigh_comm, Counter(nearest_neigh_comm).most_common(), Counter(nearest_neigh_comm).most_common()[0][0])
    
            # make all nodes in this component match most popular neighboring community
            partition.loc[partition.node.isin(comp), 'modularity_class'] = comm_common

    nmi = calc_mutual_information(partition_orig, partition)
    print("      NMI: ", nmi)
					
    return partition

##################################################
def fill_in_nans(G_nn, partition_orig, Q_thresh):
# contiguity = all parts of a district be physically connected; no land islands
# increases spatial cohesiveness by looking at each community and combining nodes with 
#   nearest neighbor communities if not connected to rest of community
# keeps going till new Qrel value is still within thresh of original Qrel value
# INPUTS: G is network, G_nn is nearest neighbor network, partition is a dataframe with node and moularity
#  classes and Q_thresh is a tolerance for the reduction in modularity (Q_relative)

    
    partition = partition_orig.copy()
            
    # add all the components that are nan
    components = []
    p = partition['modularity_class'].isna()
    comm_members = partition.node[p].tolist() # get all nodes that have modularity_class ==c
    H = G_nn.subgraph(comm_members) 
    components.extend(list(nx.connected_components(H)))

    # try to make graph more contiguous starting from the smallest components       
    for comp in components:
        
        # get all nearest neighbors of nodes in this component
        nn = [list(G_nn.neighbors(node)) for node in comp] # is list of lists
        nearest_neigh= [node for l in nn for node in l if node not in comp] # collapse list of lists and remove those nodes that are in the component
        nearest_neigh = list(set(nearest_neigh)) # get unique nearest neighbors
                
        nearest_neigh_comm = partition.loc[partition.node.isin(nearest_neigh)]['modularity_class'].dropna().tolist()
        
        if nearest_neigh_comm:
            comm_common = Counter(nearest_neigh_comm).most_common()[0][0]
    
            # make all nodes in this component match most popular neighboring community
            partition.loc[partition.node.isin(comp), 'modularity_class'] = comm_common
                 
    nmi = calc_mutual_information(partition_orig, partition)
    print("      NMI: ", nmi)
					
    return partition

    	
##################################################
# PART 3: ANALYZE COMMUNITY STRUCTURE
##################################################


#################################################################
def calc_modularity(network, part):
# calculates the Newman modularity and the relative modularity of the network
# inputs: the robust network, the louvain robust partition dictionary
# outputs: the relative q value and the q value
        
    # Calculate Newman Modularity
    Q = community.modularity(part, network, weight='weight')
    
    # Calculate relative modularity (Sah et al, 2018, PNAS)
    L = network.number_of_edges() # number of edges 
    unique_mod_classes = set(part.values()) # list of modularity classes
    
    Qmax = 0
    for mod in unique_mod_classes:
        nodes = [n for (n,p) in part.items() if p == mod]
        H = network.subgraph(nodes)
        L_k = H.number_of_edges()
        x = (L_k/L)
        Qmax = Qmax + x*(1-x)

    Qrel = Q/Qmax
    
    return Q, Qrel


##########################################################
def calc_similarity_silhouette(node, matrix, part):
# get similarity between a given node and its community (needed for silhouette score)
#inputs: a given node, the distance matrix, partition
# outputs: the similarity of that node to the rest of the nodes in the cluster
        
    # find community of node
    nodecomm = part.loc[part['node']==node]['modularity_class'].tolist()[0]
    
    # find other nodes in same community (and remove original node)
    other_nodes_in_comm = part.loc[part['modularity_class']== nodecomm]['node'].tolist()
    other_nodes_in_comm = [n for n in other_nodes_in_comm if n != node]

    # calc dist to each node in same community
    dists_node = matrix[node]
    dists_comm =  dists_node[other_nodes_in_comm].tolist() # row of distances from node to all other nodes in same comm
    
    # calc avg distance between node and all nodes in same community
    sim = np.mean(dists_comm)
    
    return sim

###########################################################
def calc_dissimilarity_silhouette(node, matrix, part):
# get dissimilarity between a given node and its community (needed for silhouette score)
#inputs: a given node, the distance matrix, partition
#outputs: the dissimilarity of that node to the rest of the nodes
    
    # find community of node
    nodecomm = part.loc[part['node']==node]['modularity_class'].tolist()[0]
        
    # find all other community ids
    #first get a list of all mod classes in partition, then remove the mod class of node from that
    mod_list = list(part['modularity_class'].unique())
    cleanedlist = [x for x in mod_list if str(x) != 'nan']
    other_mods = sorted(i for i in cleanedlist if i != nodecomm) #remove mod in question from all mod list 
      
    # create list of average distances to all other communities
    avg_dist = []
    for other_comm in other_mods:
            
        # find nodes in other community
        nodes_in_other_comm = part.loc[part['modularity_class'] == other_comm]['node'].tolist()
    
        # calc dist from node to all nodes in this community
        dists_node = matrix[node]
        dists_comm =  dists_node[nodes_in_other_comm].tolist() # list of distances from node to all other nodes in other comm
       
        # avg dist from node to this community
        avg_dist.append(np.mean(dists_comm))
        
   # print('         ', avg_dist)
        
    # dissimilarity = minimum average distance to any other community
    if avg_dist:
        dissim = min(avg_dist)
    else:
        dissim = float('NaN')        
    
    return dissim


###############################################################
def calc_silhouette(all_county_timeseries, part):
# caculate silhouette score of community partition
# inputs: dist matrix, partition dataframe
# output: mean silhouette score
    
    # calculate euclidean distance between county timeseries
    corr_df = calc_timeseries_correlations(all_county_timeseries) # calculates pearson's correlations
    dist_matrix_df = 1 - corr_df # distance = 1-correlation
    
    #print(corr_df.head())
    
    #print(dist_matrix.head())
    
    # get list of counties and node ids           
    list_counties = list(part['node'].unique())
    #print(list_counties)
    
    # for each node (i.e. county fips), calculate silhouette score
    sil_dict = {}
    for node in list_counties:
    
        # calc similarity (a)
        a = calc_similarity_silhouette(node, dist_matrix_df, part)

        # calc dissimilarity (b)
        b = calc_dissimilarity_silhouette(node, dist_matrix_df, part)

        # calc silhouette ((b-a)/max(a,b))
        s = (b-a)/max(a, b)
        
        #print(node, a,b,s)
        
        sil_dict[node] = s
    
    # remove all Nans from silhouette score list
    sil_list = sil_dict.values()
    sil_list = [x for x in sil_list if str(x) != 'nan']
    
    #get the mean silhouette score
    mean_sil = np.mean(sil_list)
      
    return mean_sil, sil_list, sil_dict

##############################################   
def calc_mutual_information( df, df2):
# calculates mutual information between two partitions
# input: two filenames which contain partitions (fips, modularity_class)
    
    # need to make sure both partitions are ordered the same way
    #  (using inner to eliminate any fips missing in either partition)
    #  (The merge will create two columns modularity_class_x and modularity_class_y)
    df3 = pd.merge(df, df2, on='node', how='inner')
    
    # drop rows with NaNs; convert mods to ints
    df3 = df3.dropna(how='any')
    df3["modularity_class_x"] = df3["modularity_class_x"].astype(int)
    df3["modularity_class_y"] = df3["modularity_class_y"].astype(int)

    # calc match between paritionss
    numlist = len(list(df3.modularity_class_x))
    if numlist:
        match = sum([1 for a,b in zip(df3.modularity_class_x, df3.modularity_class_y) if a==b])/float(numlist) # print matches
    else:
        match= 0    
    nmi = normalized_mutual_info_score(df3.modularity_class_x, df3.modularity_class_y)
            
    return nmi, match


##################################################
# PART 5: OUTPUTTING RESULTS
##################################################

##################################################
def relabel_clusters(partition):
# RELABEL CLUSTERS SO THAT COLORS MATCH MAIN RESULTS
    
    part = partition
    
    # rename clusters into something else temporarily
    part.loc[part.modularity_class == 1, 'modularity_class'] = 'old_1'
    part.loc[part.modularity_class == 2, 'modularity_class'] = 'old_2'
    
    # rename clusters
    part.loc[part.modularity_class == 'old_1', 'modularity_class'] = 2
    part.loc[part.modularity_class == 'old_2', 'modularity_class'] = 1
        
    partition = part
    
    return partition


##################################################
# function to plot timeseries by community
def plot_timeseries(df, partition, filename,  year_of_analysis,  num_clusters, ylim):
    
    unique_modclass = list(partition['modularity_class'].unique())
    unique_modclass = [int(m) for m in unique_modclass if str(m) != 'nan']
    unique_modclass.sort()
       
    colors = ['#a6cee3','#1f78b4','#b2df8a','#33a02c', '#d95f02', '#7570b3', '#e7298a']
    
    # Plot timeseries
    hfont = {'fontname':'Helvetica'}
    fig, axes = plt.subplots(2, int(np.ceil(num_clusters/2)), figsize=(20,20))
    axes = axes.ravel()
    
    # for separate figure with all the averages
    #fig, ax2 = plt.subplots(figsize=(10,10))

    for mod in unique_modclass:
            
        # find nodes in ters community and get timeseries for just those nodes
        nodes_in_cluster = list(partition[partition.modularity_class == mod]['node'])
        flu_timeseries_mod = df[nodes_in_cluster]
               
        # plot all county timeseries (transparent)
        flu_timeseries_mod.plot(legend=False, color = colors[mod], alpha = 0.05, linewidth = 0.5, zorder = 8, ax=axes[mod])
        
        # plot smoothed average (opaque) for community
        flu_timeseries_mod_avg = flu_timeseries_mod.mean(axis=1)
        flu_timeseries_mod_avg.plot(legend=False, color = colors[mod], alpha = 1, linewidth = 4, zorder = 15, ax=axes[mod]);

        # put all the averages on one separate plot
        #flu_timeseries_mod_avg.plot(legend=False, color = colors[mod], alpha = 1, linewidth = 4, zorder = 15, ax=ax2);

        #axes[mod].set_title('Community '+str(mod),fontsize = 20)
        #axes[mod].set_xlabel("Week", fontsize = 20, **hfont)
        axes[mod].set_ylabel("Indoor Activity Seasonality", fontsize = 20, **hfont)
        axes[mod].tick_params(axis='x', labelsize= 16)
        axes[mod].tick_params(axis='y', labelsize= 16)
        axes[mod].set_ylim(ylim);

        if len(year_of_analysis) == 1:
            axes[mod].set_xticks([1, 5,9,14,18,23,27,31,36,40,44,49])
            axes[mod].set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
        elif len(year_of_analysis) == 2:
            axes[mod].set_xticks([1, 13, 26, 40, 53, 66, 79, 92])
            axes[mod].set_xticklabels(['Jan\n2018', 'Apr', 'Jul', 'Oct', 'Jan\n2019', 'Apr', 'Jul', 'Oct'])

        plt.savefig("timeseries_plot_"+str(filename)+'.png')

    return

##################################################
# function to plot county-level map with counties colored by community/module
def make_module_map(part_df, filename):
    with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
        counties = json.load(response)

    part_df2 = part_df.copy()
    part_df2['node'] = part_df2.node.astype(str).str.zfill(5)


    part_df2 = part_df2[~part_df2.modularity_class.isna()]
    part_df2['modularity_class'] = part_df2.modularity_class.astype(int)
    part_df2['modularity_class'] = part_df2.modularity_class.replace(0, 'A')
    part_df2['modularity_class'] = part_df2.modularity_class.replace(1, 'B')
    part_df2['modularity_class'] = part_df2.modularity_class.replace(2, 'C')
    part_df2['modularity_class'] = part_df2.modularity_class.replace(3, 'D')

    fig = px.choropleth(part_df2,                 # name of your dataframe
                        geojson=counties,
                        locations='node', # name of column in df that has the county fips
                        color='modularity_class',      # name of column in df that has the data you want to plot
                        color_discrete_map={"A":'#a6cee3',"B":'#1f78b4',"C":'#b2df8a',"D":'#33a02c'},#px.colors.qualitative.Safe,
                        scope='usa',
                       )

    fig.update_traces(marker_line_width=0.1,  # controls county border line width
                      marker_opacity=0.85,  # changes fill color opacity to let state borders through
                      marker_line_color='#262626',  # controls county border color; needs to be darker than "states"
                      )
    fig.update_layout(coloraxis_showscale=False, showlegend=False)

    fig.show()
    
    fig.write_image("modulemap_"+str(filename)+".png", scale=10, engine = 'kaleido')

    
    return
back to top