RMakushkin committed on
Commit
b16a5c8
·
1 Parent(s): 6e98257

Upload 5 files

Browse files
Files changed (6) hide show
  1. .gitattributes +2 -0
  2. dataset.csv +3 -0
  3. embeddings.txt +3 -0
  4. func.py +37 -0
  5. requirements.txt +77 -0
  6. shows.py +67 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ dataset.csv filter=lfs diff=lfs merge=lfs -text
37
+ embeddings.txt filter=lfs diff=lfs merge=lfs -text
dataset.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6c10dbf7a899fbf0553bf6cab5fd11abf35cf224e4e6e4f7843fdd19144c550
3
+ size 19266108
embeddings.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b5b8bd90c7567e8b983efa49fe56a6f4d94406196bf6d89184d1cb46902624d
3
+ size 292747747
func.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import torch
4
+ from transformers import BertModel, BertTokenizer
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+
7
+
8
# Checkpoint shared by the tokenizer and the encoder; keeping it in one place
# guarantees both are always loaded from the same pretrained model.
_MODEL_NAME = "DeepPavlov/rubert-base-cased-sentence"

# Module-level singletons: loaded once at import time and reused by every call
# to `recommendation`.
tokenizer = BertTokenizer.from_pretrained(_MODEL_NAME)
model = BertModel.from_pretrained(_MODEL_NAME, output_hidden_states=True)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
11
+
12
+
13
def filter_by_ganre(df: pd.DataFrame, ganre_list: list):
    """Return the rows of ``df`` that have at least one of the requested genres.

    Args:
        df: Frame with a ``'ganres'`` column whose cells are iterables of
            genre names (the column name is spelled this way in the dataset).
        ganre_list: Genres to keep; a row matches when any of its genres is
            in this collection. Items must be hashable.

    Returns:
        The matching subset of ``df``, keeping the original index labels and
        all columns.
    """
    # Build the lookup set once instead of scanning the list for every genre
    # of every row.
    wanted = set(ganre_list)
    matches = df['ganres'].apply(lambda genres: not wanted.isdisjoint(genres))
    # Force a boolean dtype: on an empty frame `apply` yields an object-dtype
    # Series, which would not be treated as a row mask.
    return df[matches.astype(bool)]
16
+
17
def mean_pooling(model_output, attention_mask):
    """Collapse per-token embeddings into one vector per sequence.

    Token embeddings are weighted by the attention mask and summed along
    dimension 1, then divided by the summed mask; with a 0/1 mask this is the
    mean over the unmasked tokens.

    Args:
        model_output: Mapping exposing the encoder output under the
            ``'last_hidden_state'`` key.
            Assumed to be shaped (batch, tokens, hidden) -- TODO confirm.
        attention_mask: Tensor with one entry per token; it is broadcast over
            the last dimension of the embeddings.

    Returns:
        Tensor of pooled embeddings with the token dimension reduced away.
    """
    hidden = model_output['last_hidden_state']
    # Stretch the mask over the embedding dimension so it can weight tokens.
    weights = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    masked_total = (hidden * weights).sum(dim=1)
    # Clamp so a fully masked sequence divides by a tiny number, not by zero.
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_counts
23
+
24
def recommendation(df: pd.DataFrame, embeddings: np.ndarray, user_text: str, n=10):
    """Rank the shows in ``df`` by cosine similarity to a free-text query.

    The query is embedded with the module-level ``tokenizer`` / ``model``
    (mean-pooled last hidden state) and compared against the precomputed
    embeddings of the candidate rows.

    Args:
        df: Candidate shows, typically the genre-filtered frame. Only its
            index is used: each label selects the matching row of
            ``embeddings``.
            NOTE(review): this assumes the index labels are integer row
            positions into ``embeddings`` (true for a ``read_csv`` frame that
            was filtered without ``reset_index``) -- confirm for other callers.
            Passing ``None`` ranks against every row of ``embeddings``.
        embeddings: 2-D array with one precomputed embedding per dataset row.
        user_text: Query text to embed.
        n: Maximum number of results to return.

    Returns:
        Dict mapping row label -> cosine similarity for the ``n`` most similar
        candidates, in descending similarity order. Empty when there are no
        candidates.
    """
    # Rank only the rows present in `df`; previously `df` was ignored, so a
    # genre filter had no effect and keys could point outside the filtered set.
    if df is None:
        row_ids = np.arange(len(embeddings))
    else:
        row_ids = df.index.to_numpy()
    if len(row_ids) == 0:
        # cosine_similarity rejects an empty sample matrix; nothing to rank.
        return {}
    candidate_embeddings = embeddings[row_ids]

    # Padding/truncation to 512 tokens is kept as-is; presumably it matches how
    # the stored embeddings were produced -- TODO confirm.
    token_user_text = tokenizer(user_text, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
    model.to(device)
    model.eval()
    with torch.no_grad():
        batch = {k: v.to(device) for k, v in token_user_text.items()}
        outputs = model(**batch)
        user_embedding = mean_pooling(outputs, batch['attention_mask'])
    user_embedding = user_embedding.cpu().numpy().reshape(1, -1)

    cosine_similarities = cosine_similarity(candidate_embeddings, user_embedding).ravel()
    # Index by the original row labels so callers can look the rows up in df.
    ranked = pd.Series(cosine_similarities, index=row_ids).sort_values(ascending=False)
    return ranked.iloc[:n].to_dict()
requirements.txt ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair==5.2.0
2
+ attrs==23.1.0
3
+ blinker==1.7.0
4
+ cachetools==5.3.2
5
+ certifi==2023.11.17
6
+ charset-normalizer==3.3.2
7
+ click==8.1.7
8
+ filelock==3.13.1
9
+ fsspec==2023.12.1
10
+ gitdb==4.0.11
11
+ GitPython==3.1.40
12
+ huggingface-hub==0.19.4
13
+ idna==3.6
14
+ imageio==2.33.0
15
+ importlib-metadata==6.11.0
16
+ Jinja2==3.1.2
17
+ joblib==1.3.2
18
+ jsonschema==4.20.0
19
+ jsonschema-specifications==2023.11.2
20
+ markdown-it-py==3.0.0
21
+ MarkupSafe==2.1.3
22
+ mdurl==0.1.2
23
+ mpmath==1.3.0
24
+ networkx==3.2.1
25
+ numpy==1.26.2
26
+ nvidia-cublas-cu12==12.1.3.1
27
+ nvidia-cuda-cupti-cu12==12.1.105
28
+ nvidia-cuda-nvrtc-cu12==12.1.105
29
+ nvidia-cuda-runtime-cu12==12.1.105
30
+ nvidia-cudnn-cu12==8.9.2.26
31
+ nvidia-cufft-cu12==11.0.2.54
32
+ nvidia-curand-cu12==10.3.2.106
33
+ nvidia-cusolver-cu12==11.4.5.107
34
+ nvidia-cusparse-cu12==12.1.0.106
35
+ nvidia-nccl-cu12==2.18.1
36
+ nvidia-nvjitlink-cu12==12.3.101
37
+ nvidia-nvtx-cu12==12.1.105
38
+ packaging==23.2
39
+ pandas==2.1.3
40
+ Pillow==10.1.0
41
+ protobuf==4.25.1
42
+ pyarrow==14.0.1
43
+ pydeck==0.8.1b0
44
+ Pygments==2.17.2
45
+ python-dateutil==2.8.2
46
+ pytz==2023.3.post1
47
+ PyYAML==6.0.1
48
+ referencing==0.32.0
49
+ regex==2023.10.3
50
+ requests==2.31.0
51
+ rich==13.7.0
52
+ rpds-py==0.13.2
53
+ safetensors==0.4.1
54
+ scikit-learn==1.3.2
55
+ scipy==1.11.4
56
+ sentencepiece==0.1.99
57
+ six==1.16.0
58
+ smmap==5.0.1
59
+ streamlit==1.29.0
60
+ sympy==1.12
61
+ tenacity==8.2.3
62
+ threadpoolctl==3.2.0
63
+ tokenizers==0.15.0
64
+ toml==0.10.2
65
+ toolz==0.12.0
66
+ torch==2.1.1
67
+ tornado==6.4
68
+ tqdm==4.66.1
69
+ transformers==4.35.2
70
+ triton==2.1.0
71
+ typing_extensions==4.8.0
72
+ tzdata==2023.3
73
+ tzlocal==5.2
74
+ urllib3==2.1.0
75
+ validators==0.22.0
76
+ watchdog==3.0.0
77
+ zipp==3.17.0
shows.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import ast
5
+
6
+ from func import filter_by_ganre, recommendation
7
+
8
+
9
+
10
"""
# Умный поиск сериалов
"""
# The bare string above is not a comment: Streamlit's "magic" renders top-level
# literals, so it is displayed as the page's Markdown heading.


@st.cache_data
def _load_catalog():
    """Load the show catalogue and its precomputed embeddings.

    Cached because Streamlit re-executes the whole script on every widget
    interaction and parsing the large text embedding matrix each time is slow.

    Returns:
        Tuple ``(df, embeddings)``: the dataset with the ``'ganres'`` column
        parsed from its string form into Python lists, and the embedding
        matrix read from ``embeddings.txt``.
    """
    catalog = pd.read_csv('dataset.csv')
    vectors = np.loadtxt('embeddings.txt')
    # Genres are stored as the repr of a Python list; literal_eval parses them
    # without executing arbitrary code.
    catalog['ganres'] = catalog['ganres'].apply(ast.literal_eval)
    return catalog, vectors


df, embeddings = _load_catalog()

st.write(f'<p style="font-family: Arial, sans-serif; font-size: 24px; ">Количество сериалов, \
предоставляемых сервисом {len(df)}</p>', unsafe_allow_html=True)

ganres_lst = sorted(['драма', 'документальный', 'биография', 'комедия', 'фэнтези', 'приключения', 'для детей', 'мультсериалы',
                     'мелодрама', 'боевик', 'детектив', 'фантастика', 'триллер', 'семейный', 'криминал', 'исторический', 'музыкальные',
                     'мистика', 'аниме', 'ужасы', 'спорт', 'скетч-шоу', 'военный', 'для взрослых', 'вестерн'])

st.sidebar.header('Панель инструментов :gear:')
choice_g = st.sidebar.multiselect("Выберите жанры", options=ganres_lst)
n = st.sidebar.selectbox("Количество отображаемых элементов на странице", options=[5, 10, 15])

text = st.text_input('Введите описание для рекомендации')

button = st.button('Отправить запрос', type="primary")

if text and button:
    # No genre selected means "search across every known genre".
    if not choice_g:
        choice_g = ganres_lst
    filtered_df = filter_by_ganre(df, choice_g)
    # Pass the sidebar value through; it used to be ignored, so the page
    # always showed the default number of results.
    top_dict = recommendation(filtered_df, embeddings, text, n)
    st.write(f'<p style="font-family: Arial, sans-serif; font-size: 18px; text-align: center;"><strong>Всего подобранных \
рекомендаций {len(top_dict)}</strong></p>', unsafe_allow_html=True)
    st.write('\n')

    # Render the poster and the details of every recommended show.
    # top_dict maps row label -> similarity, so iterate items(): enumerate()
    # yielded a running counter plus the key, which displayed the wrong rows
    # and printed a row index as the similarity score.
    for ind, sim in top_dict.items():
        row = df.loc[ind]
        col1, col2 = st.columns([3, 4])
        with col1:
            st.image(row['poster'], width=300)
        with col2:
            st.write(f"***Название:*** {row['title']}")
            st.write(f"***Жанр:*** {', '.join(row['ganres'])}")
            st.write(f"***Описание:*** {row['description']}")
            similarity = round(sim, 4)
            st.write(f"***Cosine Similarity : {similarity}***")
            st.write(f"***Ссылка на фильм : {row['url']}***")

        # Horizontal rule separating consecutive results.
        st.markdown(
            "<hr style='border: 2px solid #000; margin-top: 10px; margin-bottom: 10px;'>",
            unsafe_allow_html=True
        )
65
+
66
+
67
+ # streamlit run shows.py