Upload mlflow.py
mlflow.py
ADDED
import mlflow
from mlflow import log_metric, log_param, log_artifacts

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer

import warnings
warnings.simplefilter('ignore')

"""# Read datasets"""
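
# A minimal sketch of optional tracking configuration (not part of the
# original script): by default mlflow writes runs to ./mlruns, but a
# tracking server and experiment name can be set explicitly. The URI and
# experiment name below are placeholders, not real endpoints.
# mlflow.set_tracking_uri('http://localhost:5000')
# mlflow.set_experiment('imdb-recommender')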

# Data files are read from IMDB/ (the author's original copy lived at
# /Applications/Education/7th Semester/Machine Learning/Me/Project/IMDB/credits.csv)
def read_data_set():
    md = pd.read_csv('IMDB/movies_metadata.csv')
    credits = pd.read_csv('IMDB/credits.csv')
    keywords = pd.read_csv('IMDB/keywords.csv')
    links_small = pd.read_csv('IMDB/links_small.csv')
    df_rating = pd.read_csv('IMDB/ratings_small.csv')
    return md, credits, keywords, links_small, df_rating

md, credits, keywords, links_small, df_rating = read_data_set()
df_rating.head(5)

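# Sketch (assumed usage of the log_param imported above): dataset sizes
# make useful run metadata, e.g.
# with mlflow.start_run(run_name="data-load"):
#     log_param('n_movies', len(md))
#     log_param('n_ratings', len(df_rating))
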
"""# Data preprocessing"""

# Preprocessing runs at module scope because the names built here
# (links_small, smd, cosine_sim, titles, indices, weighted_rating)
# are used by the recommenders below.
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()   # mean rating across the whole corpus

m = vote_counts.quantile(0.95)   # minimum votes required: the 95th percentile

md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if pd.notnull(x) else np.nan)
qualified = md[(md['vote_count'] >= m) & (md['vote_count'].notnull()) & (md['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

def weighted_rating(x, m=m, C=C):
    # IMDB-style weighted rating: WR = v/(v+m)*R + m/(v+m)*C, which
    # shrinks a movie's average R toward the corpus mean C in proportion
    # to how few votes v it has.
    v = x['vote_count']
    R = x['vote_average']
    return (v / (v + m) * R) + (m / (m + v) * C)
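
# Worked example with illustrative numbers (not computed from the data):
# with m = 434 and C = 5.24, a film with v = 1000 votes averaging R = 8.0
# gets WR = 1000/1434 * 8.0 + 434/1434 * 5.24, roughly 7.16, i.e. the raw
# average is pulled toward C because 1000 votes is modest relative to m.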

# Explode the genre lists so each (movie, genre) pair is one row.
s = md.apply(lambda x: pd.Series(x['genres']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_md = md.drop('genres', axis=1).join(s)
md = md.drop([19730, 29503, 35587])   # rows whose 'id' field is malformed in movies_metadata.csv
md.head(5)

md['id'] = md['id'].astype('int')
smd = md[md['id'].isin(links_small)]   # restrict to the small MovieLens link set
smd.shape

smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

# TF-IDF over the combined overview+tagline text, unigrams and bigrams.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])
tfidf_matrix.shape

# linear_kernel on L2-normalised TF-IDF rows equals cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

"""# First content-based recommendation"""

def get_recommendations(title):
    # Rank all movies by TF-IDF cosine similarity to `title` and return
    # the 30 nearest neighbours, skipping position 0 (the movie itself).
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

# get_recommendations('The Godfather').head(10)

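# Caveat and sketch (not in the original script): titles are not unique in
# the dataset, so indices[title] can return a Series; a defensive lookup
# would take the first match:
# def safe_idx(title):
#     idx = indices[title]
#     return idx.iloc[0] if hasattr(idx, 'iloc') else idx
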
"""# Some processing of the data for better recommendations"""

keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')
md.shape

# Attach cast/crew and keyword lists, then re-slice the small subset.
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')
smd = md[md['id'].isin(links_small)]
smd.shape

# The merged columns are stringified Python literals; parse them.
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

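# Each parsed cast entry is a dict; illustrative shape (abridged, not
# printed from the data):
# {'cast_id': 14, 'character': 'Woody (voice)', 'name': 'Tom Hanks', 'order': 0}
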
def get_director(x):
    # Pull the director's name out of the parsed crew list.
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

smd['director'] = smd['crew'].apply(get_director)
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >= 3 else x)   # keep the top-billed three

smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

# Collapse names to lowercase single tokens so the vectorizer treats
# "Tom Hanks" and "Tom Cruise" as unrelated, and repeat the director
# three times to weight them more heavily in the soup.
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x, x, x])

# Count keyword frequencies across the subset and keep only keywords
# that occur more than once.
s = smd.apply(lambda x: pd.Series(x['keywords']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'
s = s.value_counts()
s[:5]

s = s[s > 1]
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')   # sanity check: 'dog'

def filter_keywords(x):
    # Keep only keywords that survived the frequency filter above.
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

# The "soup": one bag of tokens per movie mixing keywords, cast,
# director (tripled) and genres.
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

smd.head(5)

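# For intuition, a soup entry is just space-joined tokens; Toy Story's row
# would look roughly like (illustrative, not printed from the data):
# 'jealousi toy boy tomhanks timallen donrickles johnlasseter johnlasseter johnlasseter Animation Comedy Family'
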
count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])
cosine_sim = cosine_similarity(count_matrix, count_matrix)   # replaces the TF-IDF similarity

smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

df_rating = pd.read_csv('IMDB/ratings_small.csv')   # fresh copy of the ratings
df_rating.head(5)

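# Design note: CountVectorizer is used here rather than TF-IDF so that the
# deliberate repetition of the director token actually increases its
# weight instead of being discounted as a frequent term.
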
"""# Data preprocessing and analyzing"""
rating_copy = df_rating.copy()
rating_copy['rating'] = rating_copy['rating'].apply(np.floor)   # bucket half-star ratings
gp_by_rating = rating_copy.groupby('rating')['rating'].agg(['count'])

movie_count = df_rating['movieId'].nunique()
cust_count = df_rating['userId'].nunique()

# Horizontal bar chart of the rating distribution, annotated with the
# share of each (floored) rating bucket.
ax = gp_by_rating.plot(kind='barh', legend=False, figsize=(8, 8))
plt.title('{:,} Movies, {:,} customers'.format(movie_count, cust_count), fontsize=14)
plt.axis('off')

for i in range(0, 6):
    ax.text(gp_by_rating.iloc[i][0] / 4, i,
            'Rating {}: {:.0f}%'.format(i, gp_by_rating.iloc[i][0] * 100 / gp_by_rating.sum()[0]),
            color='black')
plt.show()

agg_function = ['count', 'mean']

gp_by_movie = df_rating.groupby('movieId')['rating'].agg(agg_function)

# Note: this joins MovieLens movieId against the TMDB id column; ids that
# do not line up numerically simply drop out of the pivot.
df_rating = pd.merge(df_rating, smd, how='right', left_on='movieId', right_on='id')
df_rating = df_rating[['movieId', 'userId', 'rating']]
pivot_rating = pd.pivot_table(df_rating, values='rating', index='userId', columns='movieId')
pivot_rating

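# The user x movie pivot is mostly NaN; a quick density check (a sketch,
# not in the original script):
# filled = pivot_rating.notna().sum().sum()
# print('density: {:.2%}'.format(filled / pivot_rating.size))
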
"""# Improved recommendation"""

def improved_recommendations(title):
    # Take the 25 nearest soup-similarity neighbours, then re-rank the
    # ones with enough votes by the weighted rating.
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    # Mean cosine similarity of the neighbourhood, logged below as a
    # rough quality signal for the run.
    eval_cosine = sum(score for _, score in sim_scores) / len(sim_scores)
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())].copy()
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1, args=(m, C))
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified, eval_cosine

with mlflow.start_run(run_name="run") as run:
    get_recommendations('The Dark Knight').head(10)
    q, eval_cosine = improved_recommendations('The Dark Knight')
    mlflow.log_metric('cosine_sim', eval_cosine)
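
# Sketch of richer run logging (assumed usage of the log_param imported
# above): the recommender's knobs could be recorded alongside the metric:
# with mlflow.start_run(run_name="improved-reco"):
#     log_param('neighbourhood_size', 25)
#     log_param('vote_quantile', 0.60)
#     q, eval_cosine = improved_recommendations('The Dark Knight')
#     mlflow.log_metric('cosine_sim', eval_cosine)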

"""# Collaborative Filtering"""

"""# PearsonR recommendation"""

df_movie_title = smd[['id', 'title']]
df_movie_title.shape

def corr_recommend(movie_title, min_count):
    # Correlate every movie's rating column with the target movie's
    # ratings and return the 10 best-correlated titles that have more
    # than min_count ratings.
    i = int(df_movie_title.loc[df_movie_title['title'] == movie_title, 'id'].iloc[0])
    target = pivot_rating[i]
    similar_to_target = pivot_rating.corrwith(target)
    corr_target = pd.DataFrame(similar_to_target, columns=['PearsonR'])
    corr_target.dropna(inplace=True)
    corr_target = corr_target.sort_values('PearsonR', ascending=False)
    corr_target.index = corr_target.index.map(int)
    corr_target = corr_target.join(df_movie_title.set_index('id')).join(gp_by_movie)[['PearsonR', 'title', 'count', 'mean']]
    return corr_target[corr_target['count'] > min_count][:10]

corr_recommend('The Dark Knight', 0)

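# min_count=0 keeps movies that share only one rater with the target,
# where Pearson correlation is trivially +/-1; a floor such as
# corr_recommend('The Dark Knight', 10) (illustrative threshold) trades
# coverage for stability.
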
def hybrid_recommendation(movie_name):
    # Chain the three recommenders: soup-based re-ranking picks a seed,
    # the Pearson recommender picks its best collaborative match, and the
    # TF-IDF recommender expands that into a final list.
    soup_based, _ = improved_recommendations(movie_name)
    corr = corr_recommend(soup_based.iloc[0]['title'], 0)
    return get_recommendations(corr.iloc[0]['title'])


print(hybrid_recommendation('Toy Story').head(10))