https://github.com/F-FIDELO-19-008-FLEURY/course-recommender
Tip revision: b4af72cf48d8a032675a2e80ed9b480e78e7cdff authored by Alexis Lebis on 09 November 2023, 08:54:25 UTC
modified mut function to prevent rn to draw from an existing value and looping until ok.
modified mut function to prevent rn to draw from an existing value and looping until ok.
Tip revision: b4af72c
read_functions.py
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 16 08:59:01 2023
@author: luis.pinos-ullauri
"""
import os.path
import pandas as pd
################## Reading Functions for the real data set ####################
### Function that returns the real data set
### Returns a longitudinal dataframe where each row belongs to a student in a particular stage
### The courses c1 to c11 describe the followed course identifiers up until that stage
def read_real_data():
    """Load the real student data set from ``./real_data``.

    The CSV is decoded as latin1 and the columns at positions
    0, 3-7, 18-21 and 23-25 are removed before the frame is returned.
    According to the notes above this function, each remaining row is one
    student at one stage, with c1..c11 holding the courses followed so far.

    Returns:
        The trimmed ``pandas.DataFrame``, or an empty list when the CSV file
        does not exist.
    """
    csv_path = "./real_data/full_dataset_coursesRE.csv"
    if not os.path.exists(csv_path):
        return []
    frame = pd.read_csv(csv_path, encoding="latin1")
    # Positions (in the raw CSV) of the columns that callers never use.
    unused_positions = (0, 3, 4, 5, 6, 7, 18, 19, 20, 21, 23, 24, 25)
    unused_labels = [frame.columns[position] for position in unused_positions]
    return frame.drop(columns=unused_labels)
### Function that returns the average fitness from the different seeds of the same student
def read_average_solution(student_row, domain_id, score_function, compensatory,
                          number_generations, crossover_prob, mutation_prob,
                          seed_init, seed_end, dimensions):
    """Average the stored solutions of one student over a range of seeds.

    ``read_solution`` is called once per seed in ``seed_init..seed_end``
    (both inclusive); every other argument is forwarded to it unchanged.

    Returns:
        * ``dimensions is False``: the mean of the per-seed fitness values.
        * otherwise: a list holding the element-wise mean of the per-seed
          value lists (empty when no seed produced any values).

    In both modes the divisor is the full number of seeds, so a seed whose
    file is missing (``read_solution`` returns 0) counts as zero.
    """
    seed_count = seed_end - seed_init + 1
    seeds = range(seed_init, seed_end + 1)

    if dimensions is False:
        fitness_sum = 0
        for seed in seeds:
            fitness_sum += read_solution(
                student_row, domain_id, score_function, compensatory,
                number_generations, crossover_prob, mutation_prob, seed,
                dimensions)
        return fitness_sum / seed_count

    dimension_sums = []
    for seed in seeds:
        values = read_solution(
            student_row, domain_id, score_function, compensatory,
            number_generations, crossover_prob, mutation_prob, seed,
            dimensions)
        if not values:
            # Missing file (0) or empty result: contributes nothing to the sums.
            continue
        if not dimension_sums:
            # Seed the accumulator from the first result; zipping against an
            # empty list would silently discard every value.
            dimension_sums = list(values)
        else:
            # zip() stops at the shorter operand, so only positions present in
            # every seed's result are kept.
            dimension_sums = [acc + val for acc, val in zip(dimension_sums, values)]
    return [total / seed_count for total in dimension_sums]
### Function that returns the fitness of the solution of a single student
def read_solution(student_row, domain_id, score_function, compensatory,
                  number_generations, crossover_prob, mutation_prob, seed,
                  dimensions):
    """Read the stored best solution of a single student for a single seed.

    The file is looked up relative to the current working directory under the
    name ``bestsol_<student_row>_<domain_id>_<score_function>_<mode>_
    <number_generations>_<crossover>_<mutation>_<seed>``, where ``<mode>`` is
    ``comp`` when ``compensatory`` is truthy and ``parcomp`` otherwise, and the
    two probabilities are written as ``int(prob * 100)``.  Only the first line
    of the file is read; it is split on whitespace.

    Returns:
        * ``0`` when the file does not exist.
        * ``dimensions is False``: the last token of the line as a float.
        * otherwise: a list with every token of the line converted to float.

    Raises:
        ValueError: if the first line holds no tokens or a token is not a
            valid number.
    """
    mode = "comp" if compensatory else "parcomp"
    file_title = "_".join(str(part) for part in (
        "bestsol", student_row, domain_id, score_function, mode,
        number_generations, int(crossover_prob * 100),
        int(mutation_prob * 100), seed))

    if not os.path.exists(file_title):
        return 0

    # The context manager closes the file on every path, including errors.
    with open(file_title, 'r') as solution_file:
        tokens = solution_file.readline().split()

    if not tokens:
        raise ValueError("no values found on the first line of " + file_title)
    if dimensions is False:
        return float(tokens[-1])
    # NOTE(review): the previous code called float() on the whole token list,
    # which always raised TypeError. Element-wise conversion is assumed to be
    # the intent; confirm against the file writer whether the trailing fitness
    # token should be part of the dimensions.
    return [float(token) for token in tokens]
### Function that returns the course effects
### Returns a 10x104 dataframe by default, where each row relates to each skill (10 soft skills)
### and 104 is the number of courses in the dataset
def get_courses_effects():
    """Load the course effects from the pooled results CSV.

    Reads ``./real_data/arranged_pooled_results_all.csv`` and removes the
    columns at positions 0, 2-6 and 111.  The notes above this function
    describe the result as one row per soft skill with the course effects
    in the columns.

    Returns:
        The trimmed ``pandas.DataFrame``, or an empty list when the CSV file
        does not exist.
    """
    pooled_path = "./real_data/arranged_pooled_results_all.csv"
    if not os.path.exists(pooled_path):
        return []
    pooled = pd.read_csv(pooled_path)
    # Raw-CSV positions of the columns that are not course effects.
    discarded_positions = (0, 2, 3, 4, 5, 6, 111)
    discarded_labels = [pooled.columns[position] for position in discarded_positions]
    return pooled.drop(columns=discarded_labels)
### Function that returns the student random intercept effects
### Returns a 10x1 dataframe by default, where each row relates to each skill (10 soft skills) of student with student_id
### 884 is the number of students in the dataset
def get_student_random_effect(student_id):
    """Return the random-intercept column of one student.

    Reads ``./real_data/arranged_thetas_results_all.csv`` and selects the
    column named ``r_student[<student_id>]``.  The notes above this function
    describe it as holding one value per soft skill.

    Returns:
        The selected column, or an empty list when the CSV file does not
        exist.

    Raises:
        KeyError: if the file has no column for ``student_id``.
    """
    thetas_path = "./real_data/arranged_thetas_results_all.csv"
    if not os.path.exists(thetas_path):
        return []
    thetas = pd.read_csv(thetas_path)
    column_label = f"r_student[{student_id}]"
    column_position = thetas.columns.get_loc(column_label)
    return thetas.iloc[:, column_position]
### Function that returns the thresholds for the ordinal logistic regression
### Returns a 10x3 dataframe by default, where each row relates to each skill (10 soft skills) and 3 is the number of thresholds
def get_thresholds():
    """Return the threshold columns of the pooled results CSV.

    Reads ``./real_data/arranged_pooled_results_all.csv`` and keeps the
    columns at positions 2, 3 and 4.  The notes above this function describe
    these as the three ordinal-regression thresholds, one row per soft skill.

    Returns:
        A three-column ``pandas.DataFrame``, or an empty list when the CSV
        file does not exist.
    """
    pooled_path = "./real_data/arranged_pooled_results_all.csv"
    if not os.path.exists(pooled_path):
        return []
    pooled = pd.read_csv(pooled_path)
    threshold_positions = list(range(2, 5))
    return pooled.iloc[:, threshold_positions]
### Function that returns the thresholds for the minimum standard (for the moment mean sskill)
### Returns a 1x10 dataframe by default, where each column relates to each skill (10 soft skills) and 4 is the number of domains
def get_desired_standard(domain_id):
    """Return the minimum-standard values for one domain.

    Reads ``./real_data/mean_skills_stage.csv``, discards its first column,
    and from the remaining columns picks row ``4 + domain_id`` at the even
    positions 2, 4, ..., 20.  The notes above this function describe these
    ten values as one per soft skill.

    Returns:
        The selected row slice, or an empty list when the CSV file does not
        exist.
    """
    stage_path = "./real_data/mean_skills_stage.csv"
    if not os.path.exists(stage_path):
        return []
    stage_means = pd.read_csv(stage_path)
    first_label = stage_means.columns[0]
    stage_means = stage_means.drop(columns=[first_label])
    skill_positions = list(range(2, 21, 2))
    row_position = 4 + domain_id
    return stage_means.iloc[row_position, skill_positions]
### Function that returns the courses by domain id
### Returns an unordered list of the courses eligible for domain id
def get_courses_domain(domain_id):
    """List the course identifiers that belong to one domain.

    Reads ``./real_data/courses_ids_names_all.csv`` (decoded as latin1),
    discards its first column, and collects the ``variable_id`` of every row
    whose ``domain_id`` equals the argument, in file order.

    Returns:
        A list of ``variable_id`` values (empty when nothing matches or when
        the CSV file does not exist).
    """
    courses_path = "./real_data/courses_ids_names_all.csv"
    if not os.path.exists(courses_path):
        return []
    catalogue = pd.read_csv(courses_path, encoding="latin1")
    catalogue = catalogue.drop(columns=[catalogue.columns[0]])
    in_domain = catalogue["domain_id"] == domain_id
    matching_ids = catalogue.loc[in_domain]["variable_id"]
    return list(matching_ids)
#import math
#possible_courses=get_courses_domain(3)
#print(len(possible_courses))
#len_sol=11
#combs=math.comb(len(possible_courses),len_sol)
#print(combs)