Spaces:

RMakushkin
/

FindMyShow

Runtime error

App Files Files Community

RMakushkin committed on Dec 15, 2023

Commit

d713893

1 Parent(s): c8c7f92

Upload 4 files

Browse files

Files changed (5) hide show

.gitattributes +1 -0
app.py +87 -0
embeddings_main.npy +3 -0
faiss_index_main.index +3 -0
func.py +53 -0

.gitattributes CHANGED Viewed

@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 embs.txt filter=lfs diff=lfs merge=lfs -text
 dataset.csv filter=lfs diff=lfs merge=lfs -text

 *tfevents* filter=lfs diff=lfs merge=lfs -text
 embs.txt filter=lfs diff=lfs merge=lfs -text
 dataset.csv filter=lfs diff=lfs merge=lfs -text
+faiss_index_main.index filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,87 @@

+import streamlit as st
+import pandas as pd
+import numpy as np
+import ast
+import faiss
+from func import filter_by_ganre, embed_user
+"""
+# Умный поиск сериалов
+"""
+df = pd.read_csv('dataset.csv')
+embeddings = np.load('embeddings_main.npy')
+index = faiss.read_index('faiss_index_main.index')
+df['ganres'] = df['ganres'].apply(lambda x: ast.literal_eval(x))
+st.write(f'<p style="font-family: Arial, sans-serif; font-size: 24px; ">Количество сериалов, \
+         предоставляемых сервисом {len(df)}</p>', unsafe_allow_html=True)
+ganres_lst = sorted(['драма', 'документальный', 'биография', 'комедия', 'фэнтези', 'приключения', 'для детей', 'мультсериалы',
+              'мелодрама', 'боевик', 'детектив', 'фантастика', 'триллер', 'семейный', 'криминал', 'исторический', 'музыкальные',
+              'мистика', 'аниме', 'ужасы', 'спорт', 'скетч-шоу', 'военный', 'для взрослых', 'вестерн'])
+st.sidebar.header('Панель инструментов :gear:')
+choice_g = st.sidebar.multiselect("Выберите жанры", options=ganres_lst)
+n = st.sidebar.selectbox("Количество отображаемых элементов на странице", options=[5, 10, 15])
+# col3, col4 = st.columns([5,2])
+# with col3:
+text = st.text_input('Введите описание для рекомендации')
+# with col4:
+button = st.button('Отправить запрос', type="primary")
+if text and button:
+    if len(choice_g) == 0:
+        choice_g = ganres_lst
+    filt_ind = filter_by_ganre(df, choice_g)
+    user_emb = embed_user(filt_ind, embeddings, text, n)
+    _, sorted_indices = index.search(user_emb.reshape(1, -1), n)
+    st.write(f'<p style="font-family: Arial, sans-serif; font-size: 18px; text-align: center;"><strong>Всего подобранных \
+         рекомендаций {len(sorted_indices[0])}</strong></p>', unsafe_allow_html=True)
+    st.write('\n')
+    # Отображение изображений и названий
+    # for ind, sim in top_dict.items():
+    #     col1, col2 = st.columns([3, 4])
+    #     with col1:
+    #         st.image(df['poster'][ind], width=300)
+    #     with col2:
+    #         st.write(f"***Название:*** {df['title'][ind]}")
+    #         st.write(f"***Жанр:*** {', '.join(df['ganres'][ind])}")
+    #         st.write(f"***Описание:*** {df['description'][ind]}")
+    #         similarity = round(sim, 4)
+    #         st.write(f"***Cosine Similarity : {similarity}***")
+    #         st.write(f"***Ссылка на фильм : {df['url'][ind]}***")
+    #     st.markdown(
+    #     "<hr style='border: 2px solid #000; margin-top: 10px; margin-bottom: 10px;'>",
+    #     unsafe_allow_html=True
+    # )
+    for ind in sorted_indices[0]:
+        col1, col2 = st.columns([3, 4])
+        with col1:
+            st.image(df['poster'][ind], width=300)
+        with col2:
+            st.write(f"***Название:*** {df['title'][ind]}")
+            st.write(f"***Жанр:*** {', '.join(df['ganres'][ind])}")
+            st.write(f"***Описание:*** {df['description'][ind]}")
+            # similarity = round(sim, 4)
+            # st.write(f"***Cosine Similarity : {similarity}***")
+            st.write(f"***Ссылка на фильм : {df['url'][ind]}***")
+        st.markdown(
+        "<hr style='border: 2px solid #000; margin-top: 10px; margin-bottom: 10px;'>",
+        unsafe_allow_html=True
+    )
+# streamlit run app.py

embeddings_main.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b33d9e4726eff511c3f0f74dd9d1f22f863828aa0c03ff060c2983be3dce0115
+size 45892736

faiss_index_main.index ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a5fbaa50af8354c8a54372b1c763337f98792c351fa2e3aa266f448ec8266da2
+size 45892653

func.py ADDED Viewed

	@@ -0,0 +1,53 @@

+import pandas as pd
+import numpy as np
+import torch
+from transformers import BertModel, BertTokenizer
+from sklearn.metrics.pairwise import cosine_similarity
+tokenizer = BertTokenizer.from_pretrained("DeepPavlov/rubert-base-cased-sentence")
+model = BertModel.from_pretrained("DeepPavlov/rubert-base-cased-sentence")
+device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+def filter_by_ganre(df: pd.DataFrame, ganre_list: list):
+    filtered_df = df[df['ganres'].apply(lambda x: any(g in ganre_list for g in(x)))]
+    filt_ind = filtered_df.index.to_list()
+    return filt_ind
+# def mean_pooling(model_output, attention_mask):
+#     token_embeddings = model_output['last_hidden_state']
+#     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+#     sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
+#     sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+#     return sum_embeddings / sum_mask
+# def recommendation(filt_ind: list, embeddings: np.array, user_text: str, n=10):
+#     token_user_text = tokenizer(user_text, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
+#     user_embeddings = torch.Tensor().to(device)
+#     model.to(device)
+#     model.eval()
+#     with torch.no_grad():
+#         batch = {k: v.to(device) for k, v in token_user_text.items()}
+#         outputs = model(**batch)
+#         user_embeddings = torch.cat([user_embeddings, mean_pooling(outputs, batch['attention_mask'])])
+#         user_embeddings = user_embeddings.cpu().numpy()
+#     cosine_similarities = cosine_similarity(embeddings[filt_ind], user_embeddings.reshape(1, -1))
+#     df_res = pd.DataFrame(cosine_similarities.ravel(), columns=['cos_sim']).sort_values('cos_sim', ascending=False)
+#     dict_topn = df_res.iloc[:n, :].cos_sim.to_dict()
+#     return dict_topn
+def embed_user(filt_ind: list, embeddings:np.array, user_text: str, n=10):
+    tokens = tokenizer(user_text, return_tensors="pt", padding=True, truncation=True).to(device)
+    model.to(device)
+    model.eval()
+    with torch.no_grad():
+        outputs = model(**tokens)
+        user_embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy().reshape(1, -1)
+    return user_embedding
+    # cosine_similarities = cosine_similarity(embeddings[filt_ind], user_embedding.reshape(1, -1))
+    # df_res = pd.DataFrame(cosine_similarities.ravel(), columns=['cos_sim']).sort_values('cos_sim', ascending=False)
+    # dict_topn = df_res.iloc[:n, :].cos_sim.to_dict()
+    # return dict_topn