import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# Directory that holds the movie dataset, relative to the working directory.
path = '../datas/movies/'
# low_memory=False has pandas parse the file in a single pass instead of in
# chunks, which avoids mixed-type inference within a column.
data = pd.read_csv(
    f'{path}movies_metadata.csv',
    low_memory=False,
)
# Preview the first two rows.
data.head(2)
# Inspect the column names of the loaded table.
data.columns
Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
'imdb_id', 'original_language', 'original_title', 'overview',
'popularity', 'poster_path', 'production_companies',
'production_countries', 'release_date', 'revenue', 'runtime',
'spoken_languages', 'status', 'tagline', 'title', 'video',
'vote_average', 'vote_count'],
dtype='object')
# Preprocessing
# Remove every row whose overview is missing, then renumber the index from 0
# so the index labels are contiguous again.
data = data.dropna(subset=['overview']).reset_index(drop=True)
# (rows, columns) remaining after the filter.
data.shape
(44512, 24)
# Preview the first rows of the overview text column.
data['overview'].head()
0 Led by Woody, Andy's toys live happily in his ...
1 When siblings Judy and Peter discover an encha...
2 A family wedding reignites the ancient feud be...
3 Cheated on, mistreated and stepped on, the wom...
4 Just when George Banks has recovered from his ...
Name: overview, dtype: object
# Stop words: drop word tokens that carry little meaning (English list).
tfidf = TfidfVectorizer(stop_words='english')
# Run TF-IDF over the overview text: learns the vocabulary and IDF weights
# from the overviews and returns the document-term matrix.
tfidf_matrix = tfidf.fit_transform(data['overview'])
# (number of overviews, vocabulary size)
tfidf_matrix.shape
(44512, 75827)
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every pair of overview vectors. With the second
# argument omitted, the rows of tfidf_matrix are compared against themselves.
# NOTE(review): the result is a dense square array with one row and one column
# per movie, so memory grows quadratically with the number of rows.
cosine_matrix = cosine_similarity(tfidf_matrix)
# Display the matrix rounded to four decimals (cosine_matrix is not modified).
cosine_matrix.round(4)
array([[1. , 0.015 , 0. , ..., 0. , 0.0059, 0. ],
[0.015 , 1. , 0.0468, ..., 0. , 0.022 , 0.0092],
[0. , 0.0468, 1. , ..., 0. , 0.014 , 0. ],
...,
[0. , 0. , 0. , ..., 1. , 0. , 0. ],
[0.0059, 0.022 , 0.014 , ..., 0. , 1. , 0. ],
[0. , 0.0092, 0. , ..., 0. , 0. , 1. ]])
# Lookup tables between row positions and movie titles.
# NOTE: the variable names read the opposite way round from their contents;
# they are kept as-is because the cells below refer to them:
#   movie2id: row index  -> movie title
#   id2movie: movie title -> row index
movie2id = dict(enumerate(data['title']))
# Inverse mapping. A title that appears more than once keeps the index of its
# last occurrence, because later entries overwrite earlier ones.
id2movie = {title: row for row, title in movie2id.items()}
# Look up the row index of Toy Story.
idx = id2movie['Toy Story']  # Toy Story: index 0
# Pair every other movie's row index with its similarity to Toy Story,
# leaving out Toy Story itself.
sim_scores = [
    (other, score)
    for other, score in enumerate(cosine_matrix[idx])
    if other != idx
]
# Order from most similar to least similar.
sim_scores.sort(key=lambda pair: pair[1], reverse=True)
# Show the ten best (row index, similarity) pairs.
sim_scores[:10]
[(15282, 0.5321733978946077),
(2979, 0.4721455937067049),
(10271, 0.2749625162608231),
(24316, 0.27322653023092314),
(23646, 0.2354394695808281),
(28893, 0.22397858775140161),
(42572, 0.21761842522811858),
(37778, 0.21593677709089282),
(41893, 0.20190977282766226),
(8303, 0.1986849443943904)]
# Replace each row index in the top ten with its movie title.
sim_scores = [
    (movie2id[row], similarity)
    for row, similarity in sim_scores[:10]
]
sim_scores
[('Toy Story 3', 0.5321733978946077),
('Toy Story 2', 0.4721455937067049),
('The 40 Year Old Virgin', 0.2749625162608231),
('Small Fry', 0.27322653023092314),
("Andy Hardy's Blonde Trouble", 0.2354394695808281),
('Hot Splash', 0.22397858775140161),
('Andy Kaufman Plays Carnegie Hall', 0.21761842522811858),
('Superstar: The Life and Times of Andy Warhol', 0.21593677709089282),
('Andy Peters: Exclamation Mark Question Point', 0.20190977282766226),
('The Champ', 0.1986849443943904)]