https://github.com/voitijaner/Movie-RSs-Master-Thesis-Submission-Voit
statistic_calculations.py
"""The genre-, year- and production-country-specific statistics can be found in the corresponding files."""
import pickle

import pandas as pd
def loadMovieLenseMapping():
    """Returns the movielensmapping.csv dataset."""
    return pd.read_csv('data/movielense/movielense_mapping/movielensmapping.csv', sep='\t', encoding='utf-8',
                       usecols=[0, 1, 2], names=['id', 'name', 'dbpediaLink'])
def loadFinalMovieList():
    """Returns the final movie list, extracted from the language_mapping_list of the similarMovieExtraction file."""
    with open('data/lang_versions/mapping_to_other_languages/lang_mapping_list_2016.pkl', 'rb') as file:
        movie_mapping_list = pickle.load(file)
    return movie_mapping_list['english_version'].unique()
def loadMovieLenseRatings():
    """Returns the MovieLens ratings dataset."""
    # A multi-character separator such as '::' requires the python parser engine.
    return pd.read_csv('data/movielense/ratings.dat', sep='::', engine='python', encoding='utf-8',
                       usecols=[0, 1, 2], names=['user_id', 'movie_id', 'rating'])
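# Note: ratings.dat is assumed to follow the standard MovieLens '::'-delimited layout
# (user_id::movie_id::rating::timestamp); only the first three columns are read above.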
def calculateAndPrintSparsity():
    """Calculates and prints the sparsity of the rating matrix before and after each preprocessing step."""
    # Load data
    ratings = loadMovieLenseRatings()
    movielenseMapping = loadMovieLenseMapping()
    movies = loadFinalMovieList()
    # Sparsity of the initial data set
    sparsity = 1 - len(ratings) / (len(ratings['user_id'].unique()) * len(ratings['movie_id'].unique()))
    print("Initial")
    print("Users: " + str(len(ratings['user_id'].unique())))
    print("Movies: " + str(len(ratings['movie_id'].unique())))
    print("Sparsity: " + str(sparsity))
    # Sparsity after movie selection: keep only movies whose DBpedia link appears in the final movie list,
    # then keep only ratings for those movies.
    movielenseMapping = movielenseMapping[movielenseMapping.dbpediaLink.isin(movies)]
    ratings = ratings[ratings.movie_id.isin(movielenseMapping['id'])]
    sparsity = 1 - len(ratings) / (len(ratings['user_id'].unique()) * len(ratings['movie_id'].unique()))
    print("After movie selection")
    print("Users: " + str(len(ratings['user_id'].unique())))
    print("Movies: " + str(len(ratings['movie_id'].unique())))
    print("Sparsity: " + str(sparsity))
    # Sparsity after preprocessing: drop the top ~1% most frequently rated movies,
    # then keep only users with at least 50 remaining ratings.
    ratings['frequency'] = ratings['movie_id'].map(ratings['movie_id'].value_counts())
    ratings = ratings.sort_values(by=['frequency'], ascending=False)
    count = 0
    for item in ratings['movie_id'].unique():
        if count < len(ratings['movie_id'].unique()) / 100:
            ratings = ratings[ratings['movie_id'] != item]
            count += 1
    ratings = ratings.groupby('user_id').filter(lambda x: len(x) >= 50)
    ratings = ratings.drop(['frequency'], axis=1)
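    # Sketch of a roughly equivalent, vectorized way to drop the top 1% most-rated movies (kept as a comment
    # so the original loop above stays authoritative; n_top is a hypothetical helper name, not used elsewhere):
    #   n_top = int(ratings['movie_id'].nunique() / 100)
    #   ratings = ratings[~ratings['movie_id'].isin(ratings['movie_id'].value_counts().head(n_top).index)]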
    sparsity = 1 - len(ratings) / (len(ratings['user_id'].unique()) * len(ratings['movie_id'].unique()))
    print("After preprocessing")
    print("Users: " + str(len(ratings['user_id'].unique())))
    print("Movies: " + str(len(ratings['movie_id'].unique())))
    print("Sparsity: " + str(sparsity))
    movies_not_in_ratings = movielenseMapping[~movielenseMapping.id.isin(ratings['movie_id'])]
    print("Movies not in ratings")
    print(movies_not_in_ratings['id'])
if __name__ == '__main__':
    calculateAndPrintSparsity()