from __future__ import division
import graphlab as gl
import pandas as pd
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer scikit-learn versions
# load ratings file; the multi-character '::' separator requires the python parser engine
actions_df = pd.read_csv('data/movielense/ratings.dat', sep='::', engine='python', usecols=[0, 1, 2], names=['userId', 'movieId', 'rating'], encoding="utf-8")
# load and transform final movie list with genres
genre_movie_list = pd.read_csv('data/movielense/final_movie_genre_year_county_list.csv', usecols=['movieId', 'genres'])
genre_movie_list = genre_movie_list.drop_duplicates()
genre_movie_list = genre_movie_list.rename(columns={'movieId' : 'movie_id'})
# ----------- PREPROCESSING -----------
# Remove movies which are not in the final movie list
print("Initial ratings: " + str(len(actions_df)))
movie_list = genre_movie_list['movie_id'].tolist()
boolean_series = actions_df.movieId.isin(movie_list)
actions_df = actions_df[boolean_series]
actions_df['frequency'] = actions_df['movieId'].map(actions_df['movieId'].value_counts())
print("Ratings after movie selection: " + str(len(actions_df)))
actions_df = actions_df.sort_values(by=['frequency'], ascending=False)
# remove popularity bias: drop the top 1% most frequently rated movies
unique_movies = actions_df['movieId'].unique()  # ordered by descending frequency after the sort above
top_movies = unique_movies[:int(len(unique_movies) / 100)]
actions_df = actions_df[~actions_df['movieId'].isin(top_movies)]
# sparsity reduction: keep only users with at least 50 ratings
actions_df = actions_df.groupby('userId').filter(lambda x: len(x) >= 50)
actions_df = actions_df.drop(['frequency'], axis=1)
actions_df = actions_df.rename(columns={'userId':'user_id', 'movieId':'item_id'})
print("Final ratings after preprocessing: " + str(len(actions_df)))
# remove movies from genre_list which are no longer in ratings data set (after preprocessing)
boolean_series = genre_movie_list.movie_id.isin(actions_df['item_id'].unique())
genre_movie_list = genre_movie_list[boolean_series]
# create final genre list with a movie list per genre
final_genre_list = []
for genre in genre_movie_list['genres'].unique():
    genre_movies = genre_movie_list[genre_movie_list['genres'] == genre]['movie_id'].values
    # keep only genres that contain more than 100 movies
    if len(genre_movies) > 100:
        final_genre_list.append([genre, genre_movies])
# genre statistics
for row in final_genre_list:
    print(row[0])
    print("items: " + str(len(row[1])))
    temp = actions_df[actions_df['item_id'].isin(row[1])]
    temp = temp.groupby('user_id').filter(lambda x: len(x) >= 50)
    print("ratings: " + str(len(temp)))
    print("users: " + str(len(temp['user_id'].unique())))
# add a placeholder entry so models are also trained over the whole dataset
final_genre_list.append(['all', ['']])
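# Quick summary of what will be trained below (an added convenience; assumes
# final_genre_list keeps the [genre, movie_ids] layout built above).
print("Genre models to train (incl. 'all'): " + str(len(final_genre_list)))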
# ----------- CREATE RS MODELS -----------
def create_RS(k, training_data, nearest_items_sf_genre):
    """Create and return the item_similarity_recommender model.

    nearest_items_sf_genre: SFrame with the columns 'item_id', 'similar' and
    'score', holding the precomputed item similarities loaded below.
    """
    model = gl.item_similarity_recommender.create(training_data, similarity_type="cosine", user_id='user_id', item_id='item_id', target="rating", only_top_k=k, nearest_items=nearest_items_sf_genre)
    return model
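# Illustrative usage (a sketch with toy data; the real inputs are built in the loop below):
# toy_nearest = gl.SFrame({'item_id': [1], 'similar': [2], 'score': [0.9]})
# toy_model = create_RS(3, gl.SFrame(actions_df), toy_nearest)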
model_list = []
# for each genre, recommender models are created with the best k per language
for genre in final_genre_list:
    # if models are created over the whole dataset: all ratings are used
    if genre[0] == 'all':
        actions_df_genre = actions_df
    else:
        # extract genre ratings and remove users with less than 50 ratings
        actions_df_genre = actions_df[actions_df['item_id'].isin(genre[1])]
        actions_df_genre = actions_df_genre.groupby('user_id').filter(lambda x: len(x) >= 50)
    print("------------------------------------START GENRE " + genre[0] + "------------------------------------")
    for i in range(3):
        # split into train and test sets; each language version is trained on
        # the same splits with its best k (tuned beforehand over k = 1..10)
        train, test = train_test_split(actions_df_genre, test_size=0.15, stratify=actions_df_genre['user_id'])
        training_data = gl.SFrame(train)
        validation_data = gl.SFrame(test)
for lang in ["de", "en", "it", "ru", "fr"]:
# load similar items for all data or genre specific lists
if genre[0] == 'all':
nearest_items_df_genre = pd.read_csv('data/similar_movies/50_nearest_items_lang='+lang+'.csv', usecols=['movie_id','similar','score'])
else:
nearest_items_df_genre = pd.read_csv('data/similar_movies/genre/10_nearest_items_'+genre[0]+'_lang='+lang+'.csv', usecols=['movie_id','similar','score'])
            # movie_id and the similar-movie ids are parsed as float; convert to int
            nearest_items_df_genre['similar'] = nearest_items_df_genre['similar'].astype(int)
            nearest_items_df_genre['movie_id'] = nearest_items_df_genre['movie_id'].astype(int)
            nearest_items_df_genre = nearest_items_df_genre.rename(columns={"movie_id": "item_id"})
            nearest_items = gl.SFrame(nearest_items_df_genre)
            # set k to the best value per language
            best_k = {"de": 3, "fr": 3, "it": 2, "ru": 6, "en": 2}
            k = best_k[lang]
            model = create_RS(k, training_data, nearest_items)
            model_list.append([model, validation_data, lang, genre[0], k])
# convert model_list into DataFrame for later use
model_list_df = pd.DataFrame(model_list, columns = ["model", "validation_data", "lang", "genre", "k"])
print(model_list_df)
# ----------- EVALUATE RS MODELS -----------
# resulting df for precision / recall extraction
precision_recall_df = pd.DataFrame(columns=['precision', 'recall', 'genre', 'k', 'lang'])
count = 0
for i, row in model_list_df.iterrows():
    count += 1
    print(count)
    validation_data = row['validation_data']
    lang = row['lang']
    k = row['k']
    genre = row['genre']
    model = row['model']
    # extract relevant movies (ratings of 4 or 5) and evaluate the RS with them
    validation_data_by_genre_and_relevant_items = validation_data.filter_by([4,5], 'rating')
    predictions_by_genre = model.evaluate(validation_data_by_genre_and_relevant_items, verbose=False)
    precision_recall_by_genre = predictions_by_genre['precision_recall_overall'].filter_by([10], 'cutoff')
    # add to the precision / recall results list
    result_row = {'precision':precision_recall_by_genre['precision'][0], 'recall':precision_recall_by_genre['recall'][0], 'genre':genre, 'k':k, 'lang':lang}
    precision_recall_df = precision_recall_df.append(result_row, ignore_index=True)
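# Optional preview (added for convenience): one precision/recall row per
# trained model, i.e. 3 splits x 5 languages for every genre.
print(precision_recall_df.head())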
# DataFrame which contains the averaged precision / recall and F1 scores for each model-language version
model_results_df = pd.DataFrame(columns=['mean_precision', 'mean_recall', 'F1', 'k', 'lang', 'genre'])
for lang in precision_recall_df["lang"].unique():
    for genre in precision_recall_df["genre"].unique():
        # extract the result rows for each language and genre pair
        prec_rows = precision_recall_df.loc[(precision_recall_df["lang"] == lang) & (precision_recall_df["genre"] == genre)]
        precision_per_model = 0
        recall_per_model = 0
        k = 0
        # calculate avg. precision, recall and F1 score
        for i, row in prec_rows.iterrows():
            k = row['k']
            precision_per_model = precision_per_model + row['precision']
            recall_per_model = recall_per_model + row['recall']
            if genre == "all":
                print(lang)
                print("Precision: " + str(row['precision']) + " , Recall: " + str(row['recall']) + " , F1: " + str(2*row['precision']*row['recall']/(row['precision']+row['recall'])))
        avg_precision_per_model = precision_per_model / len(prec_rows)
        avg_recall_per_model = recall_per_model / len(prec_rows)
        f1 = 2*avg_precision_per_model*avg_recall_per_model/(avg_precision_per_model + avg_recall_per_model)
        # add to the model results list
        result_row = {'mean_precision':avg_precision_per_model, 'mean_recall':avg_recall_per_model, 'F1':f1, 'k':k, 'lang':lang, 'genre':genre}
        model_results_df = model_results_df.append(result_row, ignore_index=True)
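# Optional convenience view (not part of the original evaluation): the
# best-performing language per genre by F1.
best_per_genre = model_results_df.loc[model_results_df.groupby('genre')['F1'].idxmax()]
print(best_per_genre[['genre', 'lang', 'F1']])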
# ----------- OUTPUT RS MODEL RESULTS -----------
print(model_results_df.sort_values(by=['genre', 'F1'], ascending=False))
# calculate the difference between the overall ('all') performance and the model results for each genre
results_with_diff = model_results_df.copy()
#results_with_diff = model_results_df[model_results_df['k']==4]
for i, row in results_with_diff.iterrows():
    # reference values: the 'all'-genre model for the same language
    all_row = results_with_diff.loc[(results_with_diff['genre'] == 'all') & (results_with_diff['lang'] == row['lang'])]
    precision_all = all_row['mean_precision'].values[0]
    recall_all = all_row['mean_recall'].values[0]
    F1_all = all_row['F1'].values[0]
    # store the relative difference in percent
    results_with_diff.at[i, 'precision_diff'] = (row['mean_precision'] - precision_all) / precision_all * 100
    results_with_diff.at[i, 'recall_diff'] = (row['mean_recall'] - recall_all) / recall_all * 100
    results_with_diff.at[i, 'F1_diff'] = (row['F1'] - F1_all) / F1_all * 100
print(results_with_diff.sort_values(by=['genre', 'F1'], ascending=False))
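# Optional: persist the aggregated results for later analysis; the output
# path here is an assumption, not part of the original pipeline.
results_with_diff.to_csv('data/results/model_results_with_diff.csv', index=False)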