Revision ba29c4fb5f303958286f1a2f90fe8a8b7cb3ea84 authored by tkEzaki on 18 May 2021, 03:31:37 UTC, committed by GitHub on 18 May 2021, 03:31:37 UTC
1 parent 38251e0
cluster_distribution_analysis.py
``````import numpy as np
import random

def fill_surrounding(data):
# Adds a single row/column to the top,bottom,left, and right of the picture.
# The added sites have a value equal to the minimum value in the original data.

minimum = np.min(data)
data_filled = np.ones((data.shape[0]+2, data.shape[1]+2)) * minimum
data_filled[1:data.shape[0]+1,1:data.shape[1]+1] = np.copy(data)
return data_filled

def compute_cluster_map(data):
# Computes the cluster map.
# If the site [i][j] is a local cluster, the size of the cluster is
# stored to cluster_map[i][j], otherwise cluster_map[i][j] = 0.

minimum = np.min(data)
cluster_map = np.zeros(data.shape)
for i in range(len(data)-2):
for j in range(len(data[0])-2):
if data[i+1][j+1] > minimum:# minimum value represents the background.
flag = True
search_i = i+1
search_j = j+1
while flag:
max_index = np.argmax([data[search_i][search_j],data[search_i-1][search_j],data[search_i][search_j-1],data[search_i+1][search_j],data[search_i][search_j+1]])
if max_index == 0:#local max found
if (search_i - (i + 1)) ** 2 + (search_j - (j + 1)) ** 2 > 0:
cluster_map[search_i][search_j] += 1
flag = False
elif max_index == 1:
search_i -= 1
elif max_index == 2:
search_j -=1
elif max_index == 3:
search_i += 1
elif max_index == 4:
search_j +=1
return cluster_map

def cluster_distribution(cluster_data,cluster_max=60):
# Computes cluster_dist[i], the number of clusters having cluster size i.

cluster_dist = np.zeros(cluster_max)
for cluster_line in cluster_data:
for cluster in cluster_line:
if cluster >0:
cluster_dist[int(cluster)] += 1

return cluster_dist

def expected_value(prob):
e = 0
for i in range(len(prob)):
e += i * prob[i]
return e

def randomize_data(data):
minimum = np.min(data)
values = set(np.ravel(data.tolist()))
values.remove(minimum)
value_list = list(values)
random.shuffle(value_list)

surrogate_data = np.zeros(data.shape)
for i in range(len(data)):
for j in range(len(data[0])):
if data[i][j] == minimum:
surrogate_data[i][j] = minimum
else:
surrogate_data[i][j] = random.choice(value_list)
return surrogate_data

def average_list(dist_list):
case_num = len(dist_list)
bin_num = len(dist_list[0])
average_dist = np.zeros(bin_num)
for i in range(case_num):
for bin in range(bin_num):
average_dist[bin] += dist_list[i][bin]/case_num
return average_dist

def null_cluster_distribution(data, n=100):
dist_list = []
for i in range(n):
surrogate_data = randomize_data(data)
cluster_map = compute_cluster_map(surrogate_data)
dist_list.append(cluster_distribution(cluster_map))
return average_list(dist_list)

if __name__ == '__main__':
# Read data and compute the average cluster size
cluster_map = compute_cluster_map(data)
cluster_dist = cluster_distribution(cluster_map)
prob_dist = cluster_dist / np.sum(cluster_dist)
average_cluster_size = expected_value(prob_dist)

# Generate surrogate data for computing a null distribution
cluster_dist_surrogate = null_cluster_distribution(data,n=100)
prob_dist_surrogate = cluster_dist_surrogate / np.sum(cluster_dist_surrogate)
average_cluster_size_surrogate = expected_value(prob_dist_surrogate)

# Normalize the result
normalized_average_cluster_size = average_cluster_size / average_cluster_size_surrogate
print(normalized_average_cluster_size)``````

Computing file changes ...