https://github.com/voitijaner/Movie-RSs-Master-Thesis-Submission-Voit
Raw File
Tip revision: dadcec2ae8e6965a5002afbaf7341d8ca19d0438 authored by voitijaner on 04 September 2020, 12:46:31 UTC
Update README.md
Tip revision: dadcec2
similar_movie_extractions.py
import os
import pandas as pd
import _pickle as pickle
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

from rdf2vec.converters import rdflib_to_kg
from rdf2vec.converters import create_kg
from rdf2vec.walkers import RandomWalker
from rdf2vec import RDF2VecTransformer





def loadInstanceTypes(lang):
    """Load the transitive instance-types dump of one DBpedia language version.

    Reads ``data/lang_versions/instance_types_transitive_<lang>.ttl`` as a
    whitespace-separated table, keeps the first three fields of every line as
    ``subject``, ``predicate`` and ``object``, drops the last row and strips
    the angle brackets with ``remove_brackets``.

    Args:
        lang: Language code that is inserted into the file name (e.g. "en").

    Returns:
        DataFrame with the columns ``subject``, ``predicate`` and ``object``.
    """
    path = 'data/lang_versions/instance_types_transitive_' + lang + '.ttl'
    # The separator is written as a raw string: '\s' inside a normal literal is
    # an invalid escape sequence (a warning today, an error in future Python
    # versions). The resulting pattern is exactly the same.
    # Regex separators are only supported by the python parser engine, so it is
    # requested explicitly instead of relying on the silent fallback.
    # NOTE(review): header=1 combined with names= appears to make pandas skip
    # the first two physical lines - confirm that no triple is lost.
    instance_types = pd.read_csv(
        path,
        sep=r'\s',
        usecols=[0, 1, 2],
        names=['subject', 'predicate', 'object'],
        header=1,
        encoding="utf-8",
        engine='python',
    )
    # The last line in the file contains no data.
    instance_types = instance_types[:-1]
    return remove_brackets(instance_types)

def loadMappingBasedObjects(lang):
    """Load the mapping-based objects dump of one DBpedia language version.

    Reads ``data/lang_versions/mappingbased_objects_<lang>.ttl`` as a
    whitespace-separated table, keeps the first three fields of every line as
    ``subject``, ``predicate`` and ``object``, drops the last row and strips
    the angle brackets with ``remove_brackets``.

    Args:
        lang: Language code that is inserted into the file name (e.g. "de").

    Returns:
        DataFrame with the columns ``subject``, ``predicate`` and ``object``.
    """
    path = 'data/lang_versions/mappingbased_objects_' + lang + '.ttl'
    # Raw string for the regex separator: '\s' in a normal literal is an
    # invalid escape sequence (warning today, error in future Python versions).
    # The python engine is requested explicitly because regex separators are
    # not supported by the default C parser.
    # NOTE(review): header=1 combined with names= appears to make pandas skip
    # the first two physical lines - confirm that no triple is lost.
    mappingbased_objects = pd.read_csv(
        path,
        sep=r'\s',
        usecols=[0, 1, 2],
        names=['subject', 'predicate', 'object'],
        header=1,
        encoding="utf-8",
        engine='python',
    )
    # The last line in the file contains no data.
    mappingbased_objects = mappingbased_objects[:-1]
    return remove_brackets(mappingbased_objects)

def loadEnglishToOtherLanguages():
    """Load the interlanguage_links_en.ttl file with removed brackets.

    The file is read as a whitespace-separated table; the first three fields
    of every line become the columns ``subject``, ``predicate`` and ``object``.

    Returns:
        DataFrame with the columns ``subject``, ``predicate`` and ``object``.
    """
    path = 'data/lang_versions/mapping_to_other_languages/interlanguage_links_en.ttl'
    # Raw string for the regex separator: '\s' in a normal literal is an
    # invalid escape sequence (warning today, error in future Python versions).
    # The python engine is requested explicitly because regex separators are
    # not supported by the default C parser.
    # NOTE(review): unlike the other loaders the last row is not dropped here;
    # verify whether this file also ends with a line that contains no data.
    interlanguage_links_en = pd.read_csv(
        path,
        sep=r'\s',
        usecols=[0, 1, 2],
        names=['subject', 'predicate', 'object'],
        header=1,
        encoding="utf-8",
        engine='python',
    )
    return remove_brackets(interlanguage_links_en)

def loadMovieLenseMapping():
    """Read the tab-separated movielensmapping.csv file.

    Returns:
        DataFrame whose first three fields per line are named ``id``,
        ``name`` and ``dbpediaLink``.
    """
    mapping_path = 'data/movielense/movielense_mapping/movielensmapping.csv'
    column_names = ['id', 'name', 'dbpediaLink']
    mapping = pd.read_csv(
        mapping_path,
        sep='\t',
        names=column_names,
        usecols=[0, 1, 2],
        encoding='utf-8',
    )
    return mapping

def remove_brackets(df):
    """Strip the '<' and '>' characters from the RDF columns of ``df``.

    The columns ``subject``, ``predicate`` and ``object`` are overwritten in
    place; the same DataFrame object is returned.
    """
    for column in ('subject', 'predicate', 'object'):
        without_opening = df[column].str.replace('<', '')
        df[column] = without_opening.str.replace('>', '')
    return df

def extractMovies(instance_types):
    """Return the subjects whose object is the schema.org Movie type.

    The result is the ``subject`` column of ``instance_types`` restricted to
    the rows with ``object == "http://schema.org/Movie"``.
    """
    is_movie = instance_types['object'] == "http://schema.org/Movie"
    return instance_types.loc[is_movie, 'subject']

def getMovieListFiltered():
    """Return the English DBpedia movie links that occur in the MovieLens mapping.

    The movie entities are taken from the English instance types and kept
    only if their link is listed in the ``dbpediaLink`` column of the mapping.
    """
    # Movie entities of the English DBpedia version.
    movie_links = extractMovies(loadInstanceTypes("en"))
    mapping = loadMovieLenseMapping()
    in_mapping = movie_links.isin(mapping['dbpediaLink'])
    return movie_links[in_mapping]

def getTransformer(depth, walks_perGraph, sg, vector_size):
    """Build the RDF2VecTransformer used to create the embeddings.

    A single RandomWalker, constructed from ``depth`` and ``walks_perGraph``,
    is handed to the transformer together with ``sg`` and ``vector_size``.
    """
    walkers = [RandomWalker(depth, walks_perGraph)]
    return RDF2VecTransformer(walkers=walkers, sg=sg, vector_size=vector_size)

def getEmbeddings(transformer, kg, entities):
    """Fit ``transformer`` on the knowledge graph and return the embeddings of ``entities``."""
    embeddings = transformer.fit_transform(kg, entities)
    return embeddings

def getCosineSimDf(embeddings, entities):
    """Compute the pairwise cosine similarities of ``embeddings``.

    Returns a DataFrame that holds the similarity matrix and uses
    ``entities`` as both row and column labels.
    """
    similarity_matrix = cosine_similarity(embeddings)
    similarity_df = pd.DataFrame(similarity_matrix, index=entities, columns=entities)
    return similarity_df

def getNHighestRated(n, movie, cosine_df):
    """Return the n movies that are most similar to ``movie``.

    Args:
        n: Number of similar movies to return.
        movie: Column label of the movie in ``cosine_df``.
        cosine_df: Cosine similarity DataFrame; the column ``movie`` holds the
            similarity of every row label to ``movie``.

    Returns:
        Series with at most ``n`` similarity scores in descending order,
        indexed by the similar movies. ``movie`` itself is never contained.

    Raises:
        KeyError: If ``movie`` is not a column of ``cosine_df``.
    """
    similarities = cosine_df[movie]
    # Exclude the movie itself by label instead of cutting off the first hit:
    # with tied scores (e.g. identical embeddings) or a self similarity below
    # the maximum, the movie is not guaranteed to be ranked first, and
    # slicing would drop a real neighbour and keep the movie itself.
    neighbours = similarities.drop(labels=movie, errors='ignore')
    return neighbours.nlargest(n)

def get_top_N_Items_with_Names(item_list, n, cosine_df, movielenseMapping):
    """Collect the top n similar movies for every item of ``item_list``.

    Not recently used in program.

    Args:
        item_list: DBpedia links of the movies to process.
        n: Number of similar movies per item.
        cosine_df: Cosine similarity DataFrame passed to ``getNHighestRated``.
        movielenseMapping: Mapping DataFrame with the columns ``id`` and
            ``dbpediaLink``; used to enrich every item with its movieId.

    Returns:
        DataFrame with the columns ``movieId``, ``title`` and
        ``similar_movies``. Titles and similar movies are the DBpedia links
        without the 'http://dbpedia.org/resource/' prefix.

    Raises:
        IndexError: If an item has no entry in ``movielenseMapping``.
    """
    resource_prefix = 'http://dbpedia.org/resource/'
    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []
    for item in item_list:
        similar_links = getNHighestRated(n, item, cosine_df).index.tolist()
        movie_id = movielenseMapping[movielenseMapping['dbpediaLink'] == item].iloc[0]['id']
        rows.append({
            'movieId': movie_id,
            'title': item.replace(resource_prefix, ''),
            'similar_movies': [link.replace(resource_prefix, '') for link in similar_links],
        })
    return pd.DataFrame(rows, columns=['movieId', 'title', 'similar_movies'])

def get_k_nearest_items_with_score(item_list, n, cosine_df):
    """Collect the top n similar movies with their cosine score for every item.

    Args:
        item_list: DBpedia links of the movies to process.
        n: Number of similar movies per item.
        cosine_df: Cosine similarity DataFrame passed to ``getNHighestRated``.

    Returns:
        DataFrame with one row per (movie, similar movie) pair and the columns
        ``movie_id``, ``similar``, ``score``, ``movie_name`` and
        ``similar_name``. The ids are taken from the MovieLens mapping.

    Raises:
        IndexError: If a movie has no entry in the MovieLens mapping.
    """
    movielenseMapping = loadMovieLenseMapping()
    columns = ['movie_id', 'similar', 'score', 'movie_name', 'similar_name']
    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []

    for item in item_list:
        n_items = getNHighestRated(n, item, cosine_df)
        # The id of the current item does not depend on the neighbour, so it
        # is looked up once per item instead of once per neighbour.
        movie_id_item = movielenseMapping[movielenseMapping['dbpediaLink'] == item].iloc[0]['id']
        # Iterate over the neighbours that were actually returned; indexing
        # with range(n) fails when fewer than n neighbours exist.
        for similar_name, score in zip(n_items.index.tolist(), n_items.values):
            movie_id_similar = movielenseMapping[movielenseMapping['dbpediaLink'] == similar_name].iloc[0]['id']
            rows.append({
                'movie_id': movie_id_item,
                'similar': movie_id_similar,
                'score': score,
                'movie_name': item,
                'similar_name': similar_name,
            })
    return pd.DataFrame(rows, columns=columns)


def create_and_store_k_nearest_genre_items_with_score(item_list, k, cosine_df, lang, min_genre_size=100):
    """Create and store the top k similar movies per genre.

    For every genre of 'data/movielense/final_movie_genre_year_county_list.csv'
    with at least ``min_genre_size`` movies, the cosine similarity matrix is
    restricted to the movies of that genre, the k most similar movies are
    extracted for each of them and the result is stored with
    ``store_k_nearest_items_genre_lang``.

    Args:
        item_list: Currently unused; kept for interface compatibility.
        k: Number of similar movies per movie.
        cosine_df: Cosine similarity DataFrame labelled with DBpedia links.
        lang: Language code used to identify the stored files.
        min_genre_size: Minimum number of movies a genre needs to be processed.

    Raises:
        IndexError: If a movie has no entry in the MovieLens mapping.
    """
    movielenseMapping = loadMovieLenseMapping()

    genre_movie_list = pd.read_csv('data/movielense/final_movie_genre_year_county_list.csv', usecols=['movie_name', 'genres'])
    genre_movie_list = genre_movie_list.drop_duplicates()
    columns = ['movie_id', 'similar', 'score', 'movie_name', 'similar_name']

    for genre in genre_movie_list['genres'].unique():
        genre_movies = genre_movie_list[genre_movie_list['genres'] == genre]['movie_name'].values
        if len(genre_movies) < min_genre_size:
            continue

        # Restrict both axes of the similarity matrix to the movies of the genre.
        cosine_df_filtered = cosine_df.filter(items=genre_movies).filter(items=genre_movies, axis=0)

        # The rows are collected per genre. Previously one result frame was
        # shared by all genres, so every stored genre file also contained the
        # rows of all genres processed before it.
        rows = []
        # Only movies that are present in the similarity matrix can be looked
        # up; iterating over the filtered columns skips the missing ones
        # instead of failing with a KeyError.
        for item in cosine_df_filtered.columns:
            n_items = getNHighestRated(k, item, cosine_df_filtered)
            movie_id_item = movielenseMapping[movielenseMapping['dbpediaLink'] == item].iloc[0]['id']
            for similar_name, score in zip(n_items.index.tolist(), n_items.values):
                movie_id_similar = movielenseMapping[movielenseMapping['dbpediaLink'] == similar_name].iloc[0]['id']
                rows.append({
                    'movie_id': movie_id_item,
                    'similar': movie_id_similar,
                    'score': score,
                    'movie_name': item,
                    'similar_name': similar_name,
                })
        # Built once per genre: DataFrame.append was removed in pandas 2.0.
        result = pd.DataFrame(rows, columns=columns)
        store_k_nearest_items_genre_lang(lang, k, result, genre)

def store_k_nearest_items_genre_lang(lang,k,k_nearest_list, genre):
    """Write the similar items of one genre as pickle and as CSV file.

    Both files are placed in data/similar_movies/genre/ and share a name that
    is built from k, the genre and the language.
    """
    base_name = "data/similar_movies/genre/" + str(k) + "_nearest_items_" + genre + "_lang=" + lang
    with open(base_name + ".pkl", 'wb') as pickle_file:
        pickle.dump(k_nearest_list, pickle_file)
    k_nearest_list.to_csv(base_name + ".csv")

def store_k_nearest_items_lang(lang,k,k_nearest_list):
    """Write the similar items of one language as pickle and as CSV file.

    Both files are placed in data/similar_movies/ and share a name that is
    built from k and the language.
    """
    base_name = "data/similar_movies/" + str(k) + "_nearest_items_lang=" + lang
    with open(base_name + ".pkl", 'wb') as pickle_file:
        pickle.dump(k_nearest_list, pickle_file)
    k_nearest_list.to_csv(base_name + ".csv")

def store_cosine_sim_df(lang,cosine_df):
    """Pickle ``cosine_df`` into data/cosine_similarities/ for later use.

    The language code is part of the file name and identifies the matrix.
    """
    target_path = "data/cosine_similarities/cosine_df_lang=" + lang + ".pkl"
    with open(target_path, 'wb') as pickle_file:
        pickle.dump(cosine_df, pickle_file)


def createCosineSim(lang_list, lang_mapping_list):
    """Create and store the similar-movie lists for every language version.

    For each language the knowledge graph is built from the instance types and
    the mapping based objects, RDF2Vec embeddings are computed for the mapped
    movies, and the cosine similarity matrix, the 50 most similar movies per
    movie and the 10 most similar movies per movie and genre are stored.

    Args:
        lang_list: Language codes to process (e.g. ["en", "de"]).
        lang_mapping_list: DataFrame with the columns ``english_version`` and
            ``other_version``; maps English DBpedia links to the links of the
            other DBpedia language versions.
    """
    for lang in lang_list:
        print("Start Language: " + lang)
        # The English DBpedia version has no "en" in its URL, therefore its
        # prefix is built differently than for the other languages.
        if lang == "en":
            url_prefix = "http://dbpedia.org/"
        else:
            url_prefix = "http://" + lang + ".dbpedia.org/"
        # regex=False: the prefix is a literal URL, its dots must not act as
        # regex wildcards.
        in_language = lang_mapping_list.other_version.str.contains(url_prefix, regex=False)
        entities_from_lang_list = lang_mapping_list[in_language]

        # Instance types and mapping based objects of the current language
        # version in one list.
        all_data = pd.concat([loadInstanceTypes(lang), loadMappingBasedObjects(lang)])
        print("Data Length" + str(len(all_data)))
        # Creates the KG; the triples are released right afterwards to keep
        # the memory footprint low.
        kg = create_kg(all_data.itertuples(index=False), label_predicates=[])
        del all_data

        # DBpedia links for the current language.
        # NOTE(review): the original code filtered this Series with
        # `x[x.isin(x)]`, which keeps every element. Presumably the entities
        # were meant to be restricted to those contained in the KG - confirm.
        # The no-op filter was removed; the selected rows are unchanged.
        entites_in_lang = entities_from_lang_list['other_version']
        # NOTE(review): the third argument is forwarded as `sg`; verify that 5
        # is the intended value for that option.
        transformer = getTransformer(4, 500, 5, 200)
        embeddings = getEmbeddings(transformer, kg, entites_in_lang)
        del kg
        del transformer

        # English DBpedia links, in the same row order as the embedded entities.
        entites_filtered_english = entities_from_lang_list['english_version']
        # Create and store the cosine similarity matrix for the current language.
        cosine_sim = getCosineSimDf(embeddings, entites_filtered_english)
        store_cosine_sim_df(lang, cosine_sim)
        # Extract and store the k most similar movies.
        k_nearest_items = get_k_nearest_items_with_score(entites_filtered_english, 50, cosine_sim)
        store_k_nearest_items_lang(lang, 50, k_nearest_items)
        create_and_store_k_nearest_genre_items_with_score(entites_filtered_english, 10, cosine_sim, lang)

def storeLangMappingList(list):
    """Pickle the DBpedia language mapping list to lang_mapping_list_2016.pkl."""
    target_path = 'data/lang_versions/mapping_to_other_languages/lang_mapping_list_2016.pkl'
    with open(target_path, 'wb') as pickle_file:
        pickle.dump(list, pickle_file)

def getMoviesForOtherLanguages(entities, dbpedia_versions, interlanguage_list_en):
    """Map English movie links to their links in other DBpedia versions.

    For every entity the English interlanguage links are searched for an
    object that contains the base URL of each requested DBpedia version. Only
    movies with exactly one match per requested version are kept. The result
    is also stored with ``storeLangMappingList``.

    Args:
        entities: English DBpedia links of the movies.
        dbpedia_versions: DataFrame whose ``links`` column holds the base URLs
            of the DBpedia versions to look for.
        interlanguage_list_en: Interlanguage links DataFrame with the columns
            ``subject`` and ``object``.

    Returns:
        DataFrame with the columns ``english_version`` and ``other_version``.
    """
    entity_list = list(entities)
    interlanguage_list_en_filtered = interlanguage_list_en[interlanguage_list_en.subject.isin(entity_list)]
    # The versions are taken from the parameter. Previously the module-level
    # variable `db_pedia_versions` was read here, which ignored the argument
    # and failed with a NameError when that global was not defined.
    version_links = list(dbpedia_versions['links'])

    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []
    for entitie in entity_list:
        # Entries of the current entity in the English interlanguage links.
        same_as_df = interlanguage_list_en_filtered.loc[interlanguage_list_en_filtered['subject'] == entitie]
        # For each DBpedia version, check whether a corresponding entry exists.
        for link in version_links:
            # regex=False: the base URL is a literal, its dots must not act
            # as regex wildcards.
            sameAs = same_as_df[same_as_df['object'].str.contains(link, regex=False)]
            if not sameAs.empty:
                rows.append({'english_version': entitie, 'other_version': sameAs['object'].values[0]})
    result = pd.DataFrame(rows, columns=['english_version', 'other_version'])

    # Only movies which are in all provided DBpedia language versions are stored.
    result = result.groupby('english_version').filter(lambda x: len(x) == len(dbpedia_versions))
    storeLangMappingList(result)
    return result


# Movies that are typed as movies in the English DBpedia version and are
# listed in the MovieLens mapping.
entities_filtered_en = getMovieListFiltered()
interlanguage_links = loadEnglishToOtherLanguages()

# Base URLs of the DBpedia versions every movie has to be available in.
db_pedia_versions = pd.DataFrame({
    'links': [
        "http://de.dbpedia.org/",
        "http://dbpedia.org/",
        "http://fr.dbpedia.org/",
        "http://it.dbpedia.org/",
        "http://ru.dbpedia.org/",
    ],
})
lang_mapping_list = getMoviesForOtherLanguages(
    entities_filtered_en,
    db_pedia_versions,
    interlanguage_links,
)

# Language versions for which the similar-movie lists are created.
lang_list = ["en", "de", "fr", "it", "ru"]

createCosineSim(lang_list, lang_mapping_list)
back to top