Content - 8f2470ad3a82b921d3cc6b9d7e21d28684d7b5fe - 29cbae0/NetVotes/src/main.R

main.R
#############################################################################################
# Main script, launches the whole process:
# - Load the raw data
# - Preprocess/filter the resulting tables
# - Process the voting agreement index and other statistics
# - Extract the collection of networks
# - Process various network statistics
# - Apply the community detection algorithms
# - Compare the resulting partitions, also with the pILS results if available.
# - Incidentally generate plots related to certain of these steps
# 
# The parameters located at the beginning of the script (section "Init parameters", right below)
# allow to control it, and to restrict the focus to certain topics/years, or control certain 
# points of the network extraction.
# 
# 07/2015 Israel Mendonça (v1)
# 09/2015 Vincent Labatut (v2)
#
# setwd("D:/Eclipse/workspaces/Networks/NetVotes")
# setwd("~/eclipse/workspaces/Networks/NetVotes")
# source("src/main.R")
#############################################################################################
# libraries for parallel processing
library(foreach)
library(doParallel)

source("src/define-imports.R")



#############################################################################################
# Init parameters
#############################################################################################
##################### raw data
#dataset.name <- "VW"		# VoteWatch
dataset.name <- "IYP"		# It's your Parliament
#dataset.name <- "PT"		# Parltrack

##################### domains
domains <- c(DOMAIN.VALUES, DOMAIN.ALL)			# which domains to process individually
#domains <- DOMAIN.CULT
#domains <- c(DOMAIN.VW2SYMB[TEST.DOMAINS],DOMAIN.ALL)
##################### dates
dates <- c(DATE.T7.YEARS, DATE.T7.TERM)			# which time periods to process individually
#dates <- c(
#		DATE.T7.Y1
#		DATE.T7.Y2
#		DATE.T7.Y3
#		DATE.T7.Y4
#		DATE.T7.Y5
#		DATE.T7.TERM
#)
#dates <- TEST.YEARS
##################### everything at once
everything <- TRUE								# whether or not to process all data without distinction of country or date
#everything <- FALSE
##################### countries
countries <- COUNTRY.VALUES						# which country to process individually
#countries <- c(COUNTRY.HR)
#countries <- TEST.COUNTRIES
#countries <- c(
#		COUNTRY.AT,COUNTRY.BE,COUNTRY.BG,COUNTRY.HR,COUNTRY.CY,COUNTRY.CZ,COUNTRY.DK
#		COUNTRY.EE,COUNTRY.FI,COUNTRY.FR,COUNTRY.DE,COUNTRY.GR,COUNTRY.HU,COUNTRY.IE
#		COUNTRY.IT,COUNTRY.LV,COUNTRY.LT,COUNTRY.LU,COUNTRY.MT,COUNTRY.NL,COUNTRY.PL
#		COUNTRY.PT,COUNTRY.RO,COUNTRY.SK,COUNTRY.SI,COUNTRY.ES,COUNTRY.SE,COUNTRY.UK
#)
##################### groups
groups <- GROUP.VALUES							# which group to process individually
#groups <- c(GROUP.SD)
#groups <- GROUP.VW2SYMB[TEST.GROUPS]
#groups <- c(
#	GROUP.ALDE,GROUP.ECR,GROUP.EFD,GROUP.EPP
#	GROUP.GREENS,GROUP.GUENGL,GROUP.NI,GROUP.SD
#)

##################### score matrix used to process agreement
score.file <- "m3"					# see folder in/score
#thresh <- c(0,0)					# no thresholding at all
#thresh <- c(-0.34,+0.34)			# thresholds applied to agreement index values during network extraction (use c(0,0) for no filtering)
thresh <- NA						# both thresholds automatically estimated (through k-means)

##################### partitioning algorithms
### community detection algorithms
#comdet.algos <- COMDET.ALGO.VALUES
comdet.algos <- c(
#	COMDET.ALGO.EDGEBETW,
	COMDET.ALGO.INFOMAP,
	COMDET.ALGO.LABELPROP,
	COMDET.ALGO.LOUVAIN,
	COMDET.ALGO.WALKTRAP
)
#comdet.algos <- c()
### correlation clustering algorithms (see define-algos.R for the parameter description)
corclst.algos <- c()
#corclst.algos <- c(
#	get.grasp.code(rcc=FALSE, l=1, k=7, alpha=1.0, gain=0, perturbation=30)
#	get.grasp.code(rcc=TRUE, l=1, k=7, alpha=1.0, gain=0, perturbation=30),
#	get.ils.code(rcc=FALSE, l=1, k=7, alpha=1.0, gain=0, perturbation=30),
#	get.ils.code(rcc=TRUE, l=1, k=7, alpha=1.0, gain=0, perturbation=30)
#)
repetitions <- 5						# number of times each algorithm must be applied

##################### measures used to compare partitions
comp.measures <- c(
	"nmi",
	"rand",
	"adjusted.rand"
)

##################### formats of the generated plot (NA for screen -- mainly for debug)
plot.formats <- c(
	"PDF" 
#	"PNG"
#	NA
)

##################### configure parallel processing
#cn <- detectCores(all.tests=TRUE)
#if(!is.na(cn))
#	cl <- makeCluster(cn)		# automatically use all the available processors
#else
	cl <- makeCluster(4)		# manually set the number of processors to use
registerDoParallel(cl)


#############################################################################################
# Load raw data
#############################################################################################
if(dataset.name=="VW")
{	data <- load.votewatch.data()
}else if(dataset.name=="IYP")
{	data <- load.itsyourparliament.data()
}else if(dataset.name=="PT")
{	data <- load.parltrack.data()
}



#############################################################################################
# Process raw data stats (this might take a while)
#############################################################################################
process.stats(data$all.votes, data$behavior.values, data$doc.details, data$mep.details,
		domains, dates, everything, countries, groups, plot.formats)



#############################################################################################
# Process agreement and related stats (this might also take a while)
#############################################################################################
#process.agreement(data$all.votes, data$doc.details, data$mep.details, score.file,
#		domains, dates, everything, countries, groups, plot.formats)



#############################################################################################
# Extract all the networks (just a bit faster)
#############################################################################################
#extract.all.networks(data$mep.details, thresh, score.file,
#		domains, dates, everything, countries, groups, plot.formats)


#############################################################################################
# Detect communities for all the networks
#############################################################################################
#partition.all.graphs(data$mep.details, thresh, score.file,
#		domains, dates, everything, countries, groups, comdet.algos, corclst.algos, repetitions, plot.formats, force=FALSE)


#############################################################################################
# Evaluate the detected partitions, for all the networks
#############################################################################################
#evaluate.all.partitions(data$mep.details, thresh, score.file,
#		domains, dates, everything, countries, groups, comdet.algos, corclst.algos, repetitions, plot.formats)


#############################################################################################
# Compare the detected partitions, for all the networks
#############################################################################################
#compare.all.partitions(data$mep.details, thresh, score.file,
#		domains, dates, everything, countries, groups, comdet.algos, corclst.algos, comp.measures, repetitions)




##################### stop parallel processing
stopCluster(cl)


tlog("Done!")
#############################################################################################
#############################################################################################
#############################################################################################
#############################################################################################
# Notes
# - when dealing with filtered nodes (countries, groups): shouldn't we record only the concerned nodes (in the networks)?

# Problems
# - Pb with IYP: absence not explicitly represented >> peaks of zero agreement for yearly votes.
#	xx get official dates for each MEP and check with that
#	>> check that, when filtering, MEPs voting *only* NA (i.e. not absent) are not taken into account (this is general, not IYP-specific).
# - in IYP, there's also a duplicate problem, some MEPs are represented several times.
#	>> this could be handled when filtering the data? or at loading/preparation time?
# - agreement: for complete dataset, some nodes such as 599 have only 1s: possible, but improbable
#   note: might be due to small numbers of expressed votes (i.e. non-NA)
#	note2: version from a long time ago. check again now.

# Extraction de réseaux
# - on peut extraire des réseaux au niveau des partis politiques
# - peut étre aussi pour chaque vote ? mais les clusters seront triviaux (pr vs ctr) 

# Stats sur données brutes
# - on pourrait aussi voir le comportement individuel: nombre de votes de chaque type pour une personne.
#   ça peut être complet, agrégé par année, par législature... 


# nohup R --vanilla < src/main.R > terminal.output.txt &