1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 16 08:59:01 2023
@author: luis.pinos-ullauri
"""
import os.path
import pandas as pd
################## Reading Functions for the real data set ####################
### Function that returns the real data set
### Returns a pandas dataframe where each row belongs to a student in a particular stage
### The courses c1 to c11 describe the followed course identifiers up until that stage
def read_real_data():
    """Load the real student data set and strip the columns that are not used.

    The data is read from ``./real_data/full_dataset_coursesre.csv`` (decoded
    as latin1). Thirteen columns, identified only by their position in the
    raw file, are discarded before the frame is handed back.

    Returns:
        The trimmed pandas DataFrame, or an empty list when the CSV file
        does not exist.
    """
    csv_path = "./real_data/full_dataset_coursesre.csv"
    # Nothing to read: callers receive an empty list instead of a frame
    if not os.path.exists(csv_path):
        return []
    students = pd.read_csv(csv_path, encoding="latin1")
    # Positions (in the raw file) of the columns to discard
    unused_positions = [0, 3, 4, 5, 6, 7, 18, 19, 20, 21, 23, 24, 25]
    unused_labels = [students.columns[position] for position in unused_positions]
    return students.drop(columns=unused_labels)
### Function that returns the course effects
### Returns a 10x104 pandas dataframe by default, where each row relates to each skill (10 soft skills)
### and 104 is the number of courses in the dataset
def get_courses_effects():
    """Load the pooled model results and keep only the course-effect columns.

    The data is read from ``./real_data/arranged_pooled_results_all.csv``.
    The columns at positions 0, 2-6 and 111 of the raw file are discarded;
    every other column is returned untouched.

    Returns:
        The trimmed pandas DataFrame, or an empty list when the CSV file
        does not exist.
    """
    csv_path = "./real_data/arranged_pooled_results_all.csv"
    # Nothing to read: callers receive an empty list instead of a frame
    if not os.path.exists(csv_path):
        return []
    pooled_results = pd.read_csv(csv_path)
    # Positions (in the raw file) of the columns that are not course effects
    unused_positions = [0, 2, 3, 4, 5, 6, 111]
    unused_labels = [pooled_results.columns[position] for position in unused_positions]
    return pooled_results.drop(columns=unused_labels)
### Function that returns the student random intercept effects (list)
### Returns a list by default, where each index corresponds to each skill (10 soft skills) of student with student_id
### 884 is the number of students in the dataset
def get_student_random_effect(student_id):
    """Return the random-intercept values stored for one student.

    The values are read from ``./real_data/arranged_thetas_results_all.csv``,
    from the column labelled ``r_student[<student_id>]``.

    Args:
        student_id: Identifier inserted into the column label via ``str()``.

    Returns:
        The column's values as a list (one entry per row of the file), or an
        empty list when the CSV file does not exist.

    Raises:
        KeyError: If the file has no column for ``student_id``.
    """
    csv_path = "./real_data/arranged_thetas_results_all.csv"
    # Nothing to read: callers receive an empty list
    if not os.path.exists(csv_path):
        return []
    thetas = pd.read_csv(csv_path)
    column_label = f"r_student[{student_id!s}]"
    return thetas[column_label].tolist()
### Function that returns the thresholds for the ordinal logistic regression model
### The thresholds split the logit scale in 4 regions with K-1 values, where K is the number of possible responses
### Returns a 10x3 pandas dataframe by default, where each row relates to each skill (10 soft skills) and 3 is the number of thresholds
def get_thresholds():
    """Return the threshold columns of the pooled model results.

    The data is read from ``./real_data/arranged_pooled_results_all.csv``;
    only the columns at positions 2, 3 and 4 are kept.

    Returns:
        A pandas DataFrame holding those three columns for every row of the
        file, or an empty list when the CSV file does not exist.
    """
    csv_path = "./real_data/arranged_pooled_results_all.csv"
    # Nothing to read: callers receive an empty list instead of a frame
    if not os.path.exists(csv_path):
        return []
    pooled_results = pd.read_csv(csv_path)
    # Positions of the three threshold columns in the raw file
    threshold_positions = [2, 3, 4]
    return pooled_results.iloc[:, threshold_positions]
### Function that returns the desired soft skill proficiency
### Returns a list by default, where each index corresponds to each skill (10 soft skills)
def get_desired_outcome(domain_id):
    """Return the desired soft skill proficiency values for a domain.

    The values come from ``./real_data/mean_skills_stage.csv``. After the
    first column of the file is discarded, the row at position
    ``4 + domain_id`` is selected and the entries at the even positions
    2, 4, ..., 20 of that row are returned (ten values).

    Args:
        domain_id: Integer offset added to 4 to pick the row.

    Returns:
        A list with the ten selected values, or an empty list when the CSV
        file does not exist.
    """
    csv_path = "./real_data/mean_skills_stage.csv"
    # Nothing to read: callers receive an empty list
    if not os.path.exists(csv_path):
        return []
    stage_means = pd.read_csv(csv_path)
    stage_means = stage_means.drop(columns=stage_means.columns[0])
    # Every second column, starting at position 2 and ending at position 20
    skill_positions = list(range(2, 21, 2))
    return stage_means.iloc[4 + domain_id, skill_positions].tolist()
### Function that returns the courses by domain id
### Returns an unordered list of the courses eligible for domain id
def get_courses_domain(domain_id):
    """Return the course identifiers listed for a domain.

    The domain id selects a two-letter code (1 -> "ee", 2 -> "is",
    3 -> "mx", 4 -> "nu"; anything else -> empty code) that is inserted into
    the file name ``./real_data/courses_ids_names_<code>.csv``. The file is
    decoded as latin1, its first column is discarded and the
    ``variable_id`` column is returned.

    Args:
        domain_id: Domain identifier, compared with ``==`` against 1 to 4.

    Returns:
        A list with the values of the ``variable_id`` column, or an empty
        list when the CSV file does not exist.
    """
    domain_codes = ((1, "ee"), (2, "is"), (3, "mx"), (4, "nu"))
    # First matching code wins; unknown ids fall back to the empty code
    domain_str = next(
        (code for known_id, code in domain_codes if domain_id == known_id),
        "",
    )
    csv_path = "./real_data/courses_ids_names_" + domain_str + ".csv"
    if not os.path.exists(csv_path):
        return []
    catalogue = pd.read_csv(csv_path, encoding="latin1")
    catalogue = catalogue.drop(columns=catalogue.columns[0])
    return list(catalogue["variable_id"])
################## Reading Functions for the result files ####################
### Function that returns the appropriate directory address according to the parameters
### Returns a string to be used for the address of the result files
def toStringfilestructure(domain_id,compensatory,score_function=None,student_id=None,folder="./results/"):
    """Build the directory path of the result files for one configuration.

    The path is assembled by plain string concatenation as
    ``folder + domain + compensatory [+ score function [+ student id]]``,
    every component ending with ``/``.

    Args:
        domain_id: 1, 2, 3 or 4, mapped to ``domain_ee/``, ``domain_is/``,
            ``domain_mx/`` and ``domain_nu/`` respectively.
        compensatory: Only the exact value ``True`` selects
            ``compensatory/``; any other value selects
            ``partially_compensatory/``.
        score_function: ``None`` stops the path after the compensatory
            folder (``student_id`` is then ignored). Otherwise 1, 2 or 3,
            mapped to ``linear/``, ``logistic/`` and ``quadratic/``.
        student_id: Optional identifier appended as the last folder.
        folder: Root prefix, used as is (it should end with a separator).

    Returns:
        The directory path as a string.

    Raises:
        ValueError: If ``domain_id`` or a non-None ``score_function`` is not
            one of the supported values. Previously this surfaced as an
            UnboundLocalError with no hint about the offending argument.
    """
    domain_folders = {1: "domain_ee/", 2: "domain_is/", 3: "domain_mx/", 4: "domain_nu/"}
    score_function_folders = {1: "linear/", 2: "logistic/", 3: "quadratic/"}

    try:
        domain_as_string = domain_folders[domain_id]
    except (KeyError, TypeError):
        raise ValueError("unknown domain_id %r (expected 1, 2, 3 or 4)" % (domain_id,)) from None

    # Identity check kept on purpose: truthy non-bool values are NOT compensatory
    if compensatory is True:
        compensatory_as_string = "compensatory/"
    else:
        compensatory_as_string = "partially_compensatory/"

    path = folder + domain_as_string + compensatory_as_string
    if score_function is None:
        return path

    try:
        path += score_function_folders[score_function]
    except (KeyError, TypeError):
        raise ValueError("unknown score_function %r (expected 1, 2 or 3)" % (score_function,)) from None

    if student_id is not None:
        return path + str(student_id) + "/"
    return path
### Function that reads the bestsol file and returns the fitness of the solution of a single student under a specific seed
### If dimensions is True, it returns a list with the overall fitness followed by the soft skill scores; otherwise only the overall fitness
def read_solution(student_id,domain_id,score_function,compensatory,number_generations,crossover_probability,mutation_probability,seed,dimensions):
    """Read the fitness stored in the ``bestsol`` file of one run.

    The file is looked up in the directory given by
    ``toStringfilestructure(domain_id, compensatory, score_function,
    student_id)`` under the name
    ``bestsol_<student>_<generations>_<crossover%>_<mutation%>_<seed>.garcs``.
    Only the first line is used; it is split on single spaces and its last
    eleven tokens are read as the overall fitness followed by ten values
    (the soft skill scores, per the file's own comments).

    Args:
        student_id, domain_id, score_function, compensatory: Select the
            result directory (see ``toStringfilestructure``).
        number_generations, crossover_probability, mutation_probability,
            seed: Run parameters that appear in the file name.
        dimensions: When exactly ``False`` only the overall fitness is
            returned; otherwise the full list of eleven values.

    Returns:
        A float (``dimensions is False``), a list of floats, or ``None``
        when the file does not exist.
    """
    file_directory = toStringfilestructure(domain_id, compensatory, score_function, student_id)
    # int() truncates the percentage; kept as is so the name matches the files
    # already on disk (presumably written with the same expression - verify).
    file_title = (
        file_directory + "bestsol_" + str(student_id) + "_" + str(number_generations)
        + "_" + str(int(crossover_probability*100)) + "_" + str(int(mutation_probability*100))
        + "_" + str(seed) + ".garcs"
    )
    # Missing result file: signal it with None, as before
    if not os.path.exists(file_title):
        return None

    # The context manager closes the handle on every path (the list-returning
    # path used to leak it).
    with open(file_title, 'r') as file:
        line = file.readline()

    token_line = line.split(' ')
    # Position of the overall fitness: eleventh token counted from the end
    index_init = len(token_line) - 11

    if dimensions is False:
        # NOTE(review): the stored overall fitness is returned here even when
        # compensatory is False, whereas the list path below recomputes it -
        # confirm this asymmetry is intended.
        return float(token_line[index_init])

    fitness_values_float = [float(fitness_str) for fitness_str in token_line[index_init:]]
    if compensatory is False:
        # Replace the stored overall fitness by the product of the scores
        overall_fitness = 1
        for score in fitness_values_float[1:]:
            overall_fitness = overall_fitness * score
        fitness_values_float[0] = overall_fitness
    return fitness_values_float
### Function that returns the average fitness from the different seeds (runs) of the same student
### If dimensions is true, it will return the average of the overall fitness alongside the average of the scores
### If dimensions is false, it will only return the average of the overall fitness
def read_average_solution(student_id,domain_id,score_function,compensatory,number_generations,crossover_probability,mutation_probability,seed_init,seed_end,dimensions):
    """Average the ``read_solution`` results over a range of seeds.

    ``read_solution`` is called once per seed in ``seed_init..seed_end``
    (both inclusive) and the results are averaged over the number of seeds.

    Args:
        student_id, domain_id, score_function, compensatory,
            number_generations, crossover_probability, mutation_probability:
            Forwarded unchanged to ``read_solution``.
        seed_init: First seed of the range.
        seed_end: Last seed of the range (inclusive).
        dimensions: When exactly ``False`` a single averaged overall fitness
            is returned; otherwise an element-wise averaged list of eleven
            values (overall fitness followed by the scores).

    Returns:
        A float or a list of eleven floats, depending on ``dimensions``.

    Raises:
        ValueError: If the seed range is empty (``seed_end < seed_init``).
            Previously this ended in a ZeroDivisionError or a division by a
            negative count.
        FileNotFoundError: If ``read_solution`` reports a missing file for
            one of the seeds. Previously this ended in an opaque TypeError.
    """
    number_of_seeds = seed_end - seed_init + 1
    if number_of_seeds < 1:
        raise ValueError(
            "empty seed range: seed_init=%r, seed_end=%r" % (seed_init, seed_end)
        )

    # Running sum of the overall fitness (used when dimensions is False)
    average_fitness = 0
    # Running sums of the overall fitness and the ten scores
    average_dimensions = [0] * 11

    for seed in range(seed_init, seed_end + 1):
        current_fitness = read_solution(student_id,domain_id,score_function,compensatory,number_generations,crossover_probability,mutation_probability,seed,dimensions)
        if current_fitness is None:
            raise FileNotFoundError(
                "no bestsol file for student %r with seed %r" % (student_id, seed)
            )
        if dimensions is False:
            average_fitness = average_fitness + current_fitness
        else:
            average_dimensions = [
                total + value
                for total, value in zip(average_dimensions, current_fitness)
            ]

    if dimensions is False:
        return average_fitness / number_of_seeds
    return [total / number_of_seeds for total in average_dimensions]
### Function that reads the bestall file of a single student under a specific seed
### Returns a pandas dataframe with one row per line of the file (row number and the first two values of the line), or 0 if the file does not exist
def read_fitness_by_gen(student_id,domain_id,score_function,compensatory,number_generations,crossover_probability,mutation_probability,seed):
    """Read the ``bestall`` file of one run into a DataFrame.

    The file is looked up in the directory given by
    ``toStringfilestructure(domain_id, compensatory, score_function,
    student_id)`` under the name
    ``bestall_<student>_<generations>_<crossover%>_<mutation%>_<seed>.garcs``.
    Every line is split on single spaces and contributes one row made of the
    zero-based line number and the first two tokens of the line as floats.

    Args:
        student_id, domain_id, score_function, compensatory: Select the
            result directory (see ``toStringfilestructure``).
        number_generations, crossover_probability, mutation_probability,
            seed: Run parameters that appear in the file name.

    Returns:
        A float DataFrame with columns 0, 1 and 2 and one row per line of
        the file (empty if the file has no lines), or the integer 0 when the
        file does not exist.
    """
    file_directory = toStringfilestructure(domain_id, compensatory, score_function, student_id)
    # int() truncates the percentage; kept as is so the name matches the files
    # already on disk (presumably written with the same expression - verify).
    file_title = (
        file_directory + "bestall_" + str(student_id) + "_" + str(number_generations)
        + "_" + str(int(crossover_probability*100)) + "_" + str(int(mutation_probability*100))
        + "_" + str(seed) + ".garcs"
    )
    # Missing result file: signal it with 0, as before
    if not os.path.exists(file_title):
        return 0

    # The context manager closes the handle, which used to be leaked
    with open(file_title, 'r') as file:
        lines = file.readlines()

    rows = []
    for line_number, line in enumerate(lines):
        token_line = line.split(' ')
        rows.append([line_number, float(token_line[0]), float(token_line[1])])

    # Build the frame once instead of concatenating row by row (quadratic).
    # Every row keeps the index label 0 and all columns are float, exactly as
    # the former concat-based construction produced.
    return pd.DataFrame(
        rows,
        columns=pd.RangeIndex(3),
        index=[0] * len(rows),
        dtype=float,
    )
### Function that returns the per-generation values averaged over the different seeds (runs) of the same student
### Returns the pandas dataframe of the first seed with its second and third columns
### replaced by their averages over the seeds seed_init to seed_end (both included)
def read_average_fitness_by_gen(student_id,domain_id,score_function,compensatory,number_generations,crossover_probability,mutation_probability,seed_init,seed_end):
    """Average the ``read_fitness_by_gen`` frames over a range of seeds.

    ``read_fitness_by_gen`` is called once per seed in
    ``seed_init..seed_end`` (both inclusive). The frame of the first seed is
    returned with the columns at positions 1 and 2 replaced by their sums
    over all seeds divided by the number of seeds; column 0 is left as read
    from the first seed. Rows are matched by position.

    Args:
        student_id, domain_id, score_function, compensatory,
            number_generations, crossover_probability, mutation_probability:
            Forwarded unchanged to ``read_fitness_by_gen``.
        seed_init: First seed of the range.
        seed_end: Last seed of the range (inclusive).

    Returns:
        A pandas DataFrame with the averaged values.

    Raises:
        ValueError: If the seed range is empty (``seed_end < seed_init``).
            Previously this ended in an UnboundLocalError.
        FileNotFoundError: If ``read_fitness_by_gen`` reports a missing file
            for one of the seeds. Previously this ended in an opaque
            AttributeError.
        IndexError: If a later seed has more rows than the first one.
    """
    number_of_seeds = seed_end - seed_init + 1
    if number_of_seeds < 1:
        raise ValueError(
            "empty seed range: seed_init=%r, seed_end=%r" % (seed_init, seed_end)
        )

    average_fitness_by_gen = None
    totals = None
    for seed in range(seed_init, seed_end + 1):
        current_fitness_by_gen = read_fitness_by_gen(student_id, domain_id, score_function, compensatory, number_generations, crossover_probability, mutation_probability, seed)
        # read_fitness_by_gen returns 0 instead of a frame when the file is missing
        if not isinstance(current_fitness_by_gen, pd.DataFrame):
            raise FileNotFoundError(
                "no bestall file for student %r with seed %r" % (student_id, seed)
            )
        # Plain arrays: positional accumulation, no index alignment involved
        current_values = current_fitness_by_gen.iloc[:, 1:3].to_numpy(dtype=float)
        if totals is None:
            average_fitness_by_gen = current_fitness_by_gen
            totals = current_values.copy()
            continue
        if len(current_values) > len(totals):
            raise IndexError(
                "seed %r has %d rows but seed %r only has %d"
                % (seed, len(current_values), seed_init, len(totals))
            )
        # A shorter run only contributes to the rows it actually has
        totals[:len(current_values)] += current_values

    average_fitness_by_gen.iloc[:, 1:3] = totals / number_of_seeds
    return average_fitness_by_gen
|