# Source: https://github.com/voitijaner/Movie-RSs-Master-Thesis-Submission-Voit
# Raw File
# Tip revision: dadcec2ae8e6965a5002afbaf7341d8ca19d0438 authored by voitijaner on 04 September 2020, 12:46:31 UTC
# Update README.md
# Tip revision: dadcec2
# statistic_calculations.py
import pickle

import pandas as pd


"""The genre-, year- and production-country-specific statistics can be found in the corresponding files."""

def loadMovieLenseMapping():
    """Reads the tab-separated movielensmapping.csv file.

    Only the first three columns are read; the returned DataFrame names them
    id, name and dbpediaLink.
    """
    mapping_path = 'data/movielense/movielense_mapping/movielensmapping.csv'
    column_names = ['id', 'name', 'dbpediaLink']
    mapping = pd.read_csv(
        mapping_path,
        sep='\t',
        encoding='utf-8',
        usecols=[0, 1, 2],
        names=column_names,
    )
    return mapping

def loadFinalMovieList():
    """Returns the final movie list.

    Loads the pickled language mapping list (per the file's own notes it is
    produced by the similarMovieExtraction file) and returns the distinct
    values of its 'english_version' column.
    """
    # NOTE: pickle.load executes arbitrary code embedded in the file, so this path
    # must only ever point at a trusted, project-generated artifact.
    # The context manager closes the handle even if unpickling raises.
    with open('data/lang_versions/mapping_to_other_languages/lang_mapping_list_2016.pkl', 'rb') as file:
        movie_mapping_list = pickle.load(file)
    return movie_mapping_list['english_version'].unique()

def loadMovieLenseRatings():
    """Returns the MovieLense ratings dataset.

    Reads the first three '::'-separated columns of data/movielense/ratings.dat
    into a DataFrame with the columns user_id, movie_id and rating.
    """
    # A separator longer than one character is interpreted as a regular expression,
    # which only the Python parsing engine supports. Naming the engine explicitly
    # avoids the ParserWarning pandas emits when it falls back from the C engine.
    return pd.read_csv(
        'data/movielense/ratings.dat',
        sep='::',
        usecols=[0, 1, 2],
        names=['user_id', 'movie_id', 'rating'],
        encoding="utf-8",
        engine='python',
    )

def _printSparsityReport(title, ratings):
    """Prints the title followed by the user count, movie count and sparsity of a ratings frame."""
    user_count = len(ratings['user_id'].unique())
    movie_count = len(ratings['movie_id'].unique())
    matrix_cells = user_count * movie_count
    # An empty selection has no user-movie matrix; report NaN instead of dividing by zero.
    sparsity = 1 - len(ratings) / matrix_cells if matrix_cells else float('nan')

    print(title)
    print("Users: " + str(user_count))
    print("Movies: " + str(movie_count))
    print("Sparsity: " + str(sparsity))


def calculateAndPrintSparsity():
    """Calculates and prints the rating-matrix sparsity at three stages.

    Sparsity is 1 - ratings / (distinct users * distinct movies). It is printed for
    1. the initial ratings data set,
    2. the ratings restricted to movies whose dbpediaLink is in the final movie list,
    3. the ratings after preprocessing: the most frequently rated movies are removed,
       then only users with at least 50 remaining ratings are kept.
    Finally the ids of the selected movies that no longer occur in the ratings are printed.
    """

    #Load Data
    ratings = loadMovieLenseRatings()
    movielenseMapping = loadMovieLenseMapping()
    movies = loadFinalMovieList()

    #Sparsity initial data set
    _printSparsityReport("Initial", ratings)

    #Sparsity after movie selection
    movielenseMapping = movielenseMapping[movielenseMapping.dbpediaLink.isin(movies)]
    ratings = ratings[ratings.movie_id.isin(movielenseMapping['id'])]
    _printSparsityReport("After movie selection", ratings)

    #Sparsity after preprocessing
    # Order the movie ids from most to least rated. assign() builds a new frame, so no
    # helper column is written onto the filtered slice held in `ratings`.
    rating_counts = ratings['movie_id'].value_counts()
    by_popularity = ratings.assign(frequency=ratings['movie_id'].map(rating_counts))
    by_popularity = by_popularity.sort_values(by=['frequency'], ascending=False)
    movie_ids = by_popularity['movie_id'].unique()

    # Number of most-rated movies to drop: the smallest k with k >= (n - k) / 100,
    # i.e. ceil(n / 101) -- roughly the top 1 % of the n movies.
    drop_count = -(-len(movie_ids) // 101)
    ratings = ratings[~ratings['movie_id'].isin(movie_ids[:drop_count])]

    # Keep only users that still have at least 50 ratings.
    ratings = ratings.groupby('user_id').filter(lambda x: len(x) >= 50)
    _printSparsityReport("After preprocessing", ratings)

    # Selected movies that lost all their ratings during preprocessing.
    movies_not_in_ratings = movielenseMapping[~movielenseMapping.id.isin(ratings['movie_id'])]
    print("Movies not in ratings")
    print(movies_not_in_ratings['id'])


# Module-level call: the sparsity report runs whenever this file is executed or imported.
calculateAndPrintSparsity()
# back to top