waseoke commited on
Commit
bb06e40
·
verified ·
1 Parent(s): d917b06

Update calculate_cosine_similarity.py

Browse files
Files changed (1) hide show
  1. calculate_cosine_similarity.py +118 -22
calculate_cosine_similarity.py CHANGED
@@ -1,14 +1,58 @@
 
 
1
  from pymongo import MongoClient
2
  import numpy as np
3
  from sklearn.metrics.pairwise import cosine_similarity
4
 
5
  # MongoDB Atlas 연결
6
- client = MongoClient("mongodb+srv://waseoke:[email protected]/test?retryWrites=true&w=majority")
 
 
7
  db = client["two_tower_model"]
8
  user_embedding_collection = db["user_embeddings"]
9
  product_embedding_collection = db["product_embeddings"]
10
  train_dataset = db["train_dataset"]
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  # 유사도 계산 함수
13
  def calculate_similarity(input_embedding, target_embeddings):
14
  """
@@ -17,67 +61,119 @@ def calculate_similarity(input_embedding, target_embeddings):
17
  similarities = cosine_similarity(input_embedding, target_embeddings).flatten()
18
  return similarities
19
 
20
- def find_most_similar_anchor(user_id):
 
21
  """
22
  사용자 임베딩을 기준으로 가장 유사한 anchor 상품을 반환.
23
  """
 
 
 
 
24
  # 사용자 임베딩 가져오기
25
  user_data = user_embedding_collection.find_one({"user_id": user_id})
 
26
  if not user_data:
27
  raise ValueError(f"No embedding found for user_id: {user_id}")
28
-
29
- user_embedding = np.array(user_data["embedding"]).reshape(1, -1)
 
 
 
 
30
 
31
- # Anchor 데이터 가져오기
 
 
 
 
32
  anchors, anchor_embeddings = [], []
33
- train_data = list(train_dataset.find())
34
- for entry in train_data:
35
- anchors.append(entry["anchor"])
36
- anchor_embeddings.append(entry["anchor_embedding"])
 
 
 
37
 
38
  anchor_embeddings = np.array(anchor_embeddings)
39
 
 
 
 
40
  # Cosine Similarity 계산
41
- similarities = calculate_similarity(user_embedding, anchor_embeddings)
 
 
42
  most_similar_index = np.argmax(similarities)
43
 
44
  return anchors[most_similar_index], anchor_embeddings[most_similar_index]
45
 
46
- def find_most_similar_product(anchor_embedding):
 
47
  """
48
- Anchor 임베딩과 학습된 임베딩 중 가장 유사한 상품을 반환.
49
  """
50
- # Train 데이터의 positive/negative 임베딩과 비교
51
- train_data = list(train_dataset.find())
52
  train_embeddings, products = [], []
53
- for entry in train_data:
54
- products.extend([entry["positive"], entry["negative"]])
55
- train_embeddings.extend([entry["positive_embedding"], entry["negative_embedding"]])
 
 
 
 
 
56
 
57
  train_embeddings = np.array(train_embeddings)
58
 
 
 
 
59
  # Cosine Similarity 계산
60
- similarities = calculate_similarity(anchor_embedding.reshape(1, -1), train_embeddings)
 
 
61
  most_similar_index = np.argmax(similarities)
62
 
63
  return products[most_similar_index], train_embeddings[most_similar_index]
64
 
 
65
  def recommend_shop_product(similar_product_embedding):
66
  """
67
- 유사한 학습된 상품 임베딩과 쇼핑몰 상품 임베딩을 비교하여 추천.
68
  """
69
- # 쇼핑몰 상품 임베딩 데이터 가져오기
70
  all_products = list(product_embedding_collection.find())
71
  shop_product_embeddings, shop_product_ids = [], []
 
72
  for product in all_products:
73
  shop_product_ids.append(product["product_id"])
74
  shop_product_embeddings.append(product["embedding"])
75
 
76
  shop_product_embeddings = np.array(shop_product_embeddings)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
  # Cosine Similarity 계산
79
- similarities = calculate_similarity(similar_product_embedding.reshape(1, -1), shop_product_embeddings)
 
 
80
  most_similar_index = np.argmax(similarities)
81
 
82
  return shop_product_ids[most_similar_index]
83
-
 
1
+ import torch
2
+ import torch.nn as nn
3
  from pymongo import MongoClient
4
  import numpy as np
5
  from sklearn.metrics.pairwise import cosine_similarity
6
 
7
  # MongoDB Atlas 연결
8
+ client = MongoClient(
9
+ "mongodb+srv://waseoke:[email protected]/test?retryWrites=true&w=majority"
10
+ )
11
  db = client["two_tower_model"]
12
  user_embedding_collection = db["user_embeddings"]
13
  product_embedding_collection = db["product_embeddings"]
14
  train_dataset = db["train_dataset"]
15
 
16
+
17
+ # Autoencoder 모델 정의 (512차원 -> 128차원)
18
+ class Autoencoder(nn.Module):
19
+ def __init__(self):
20
+ super(Autoencoder, self).__init__()
21
+ self.encoder = nn.Sequential(
22
+ nn.Linear(512, 256), # 512 -> 256
23
+ nn.ReLU(),
24
+ nn.Linear(256, 128), # 256 -> 128
25
+ )
26
+ self.decoder = nn.Sequential(
27
+ nn.Linear(128, 256), # 128 -> 256
28
+ nn.ReLU(),
29
+ nn.Linear(256, 512), # 256 -> 512
30
+ )
31
+
32
+ def forward(self, x):
33
+ return self.encoder(x)
34
+
35
+
36
+ # Autoencoder를 초기화하고 학습된 모델을 로드
37
+ autoencoder = Autoencoder()
38
+ autoencoder.eval() # 학습된 모델 사용 시
39
+
40
+
41
+ # 학습된 모델 로드
42
+ def load_trained_model(model_path="product_model.pth"):
43
+ """
44
+ 학습된 모델을 로드.
45
+ """
46
+ model = torch.nn.Sequential(
47
+ torch.nn.Linear(768, 256), # 768: KoBERT 임베딩 차원
48
+ torch.nn.ReLU(),
49
+ torch.nn.Linear(256, 128),
50
+ )
51
+ model.load_state_dict(torch.load(model_path))
52
+ model.eval() # 평가 모드
53
+ return model
54
+
55
+
56
  # 유사도 계산 함수
57
  def calculate_similarity(input_embedding, target_embeddings):
58
  """
 
61
  similarities = cosine_similarity(input_embedding, target_embeddings).flatten()
62
  return similarities
63
 
64
+
65
+ def find_most_similar_anchor(user_id, model):
66
  """
67
  사용자 임베딩을 기준으로 가장 유사한 anchor 상품을 반환.
68
  """
69
+ # user_id의 데이터 타입 확인 및 변환
70
+ if isinstance(user_id, str) and user_id.isdigit():
71
+ user_id = int(user_id)
72
+
73
  # 사용자 임베딩 가져오기
74
  user_data = user_embedding_collection.find_one({"user_id": user_id})
75
+
76
  if not user_data:
77
  raise ValueError(f"No embedding found for user_id: {user_id}")
78
+ user_embedding = torch.tensor(
79
+ user_data["embedding"][0], dtype=torch.float32
80
+ ).unsqueeze(0)
81
+
82
+ padding = torch.zeros((1, 768 - 512))
83
+ user_embedding = torch.cat((user_embedding, padding), dim=1)
84
 
85
+ # 사용자 임베딩 차원 축소 (768 -> 128)
86
+ user_embedding = model[0](user_embedding) # 첫 번째 레이어만 사용하여 차원 축소
87
+ user_embedding = model[2](user_embedding) # 마지막 레이어 적용 (128 차원)
88
+
89
+ # Anchor 데이터 생성
90
  anchors, anchor_embeddings = [], []
91
+
92
+ # Anchor 데이터를 product_model.pth에서 추출
93
+ for _ in range(100): # Anchor 데이터가 100개라고 가정
94
+ random_input = torch.rand((1, 768)) # KoBERT 차원에 맞는 랜덤 데이터
95
+ anchor_embedding = model(random_input).detach().numpy().flatten()
96
+ anchors.append(f"Product_{len(anchors) + 1}") # Anchor 상품 이름
97
+ anchor_embeddings.append(anchor_embedding)
98
 
99
  anchor_embeddings = np.array(anchor_embeddings)
100
 
101
+ print(f"User embedding dimension: {user_embedding.shape}")
102
+ print(f"Anchor embedding dimension: {anchor_embeddings.shape}")
103
+
104
  # Cosine Similarity 계산
105
+ similarities = calculate_similarity(
106
+ user_embedding.detach().numpy().reshape(1, -1), anchor_embeddings
107
+ )
108
  most_similar_index = np.argmax(similarities)
109
 
110
  return anchors[most_similar_index], anchor_embeddings[most_similar_index]
111
 
112
+
113
+ def find_most_similar_product(anchor_embedding, model):
114
  """
115
+ Anchor 임베딩을 기반으로 학습된 positive/negative 상품 중 가장 유사한 상품을 반환.
116
  """
 
 
117
  train_embeddings, products = [], []
118
+ # Anchor 데이터와 유사한 상품 임베딩을 생성
119
+ for _ in range(100): # 예시로 100개의 상품 임베딩을 계산한다고 가정
120
+ random_input = torch.rand((1, 768)) # KoBERT 차원에 맞는 랜덤 데이터
121
+ train_embedding = (
122
+ model(random_input).detach().numpy().flatten()
123
+ ) # 모델을 통해 임베딩 계산
124
+ products.append(f"Product_{len(products) + 1}") # 상품 이름
125
+ train_embeddings.append(train_embedding)
126
 
127
  train_embeddings = np.array(train_embeddings)
128
 
129
+ print(f"Anchor embedding dimension: {anchor_embedding.shape}")
130
+ print(f"Train embedding dimension: {train_embeddings.shape}")
131
+
132
  # Cosine Similarity 계산
133
+ similarities = calculate_similarity(
134
+ anchor_embedding.reshape(1, -1), train_embeddings
135
+ )
136
  most_similar_index = np.argmax(similarities)
137
 
138
  return products[most_similar_index], train_embeddings[most_similar_index]
139
 
140
+
141
  def recommend_shop_product(similar_product_embedding):
142
  """
143
+ 학습된 상품과 쇼핑몰 상품 임베딩을 비교하여 최종 추천 상품 반환.
144
  """
 
145
  all_products = list(product_embedding_collection.find())
146
  shop_product_embeddings, shop_product_ids = [], []
147
+
148
  for product in all_products:
149
  shop_product_ids.append(product["product_id"])
150
  shop_product_embeddings.append(product["embedding"])
151
 
152
  shop_product_embeddings = np.array(shop_product_embeddings)
153
+ shop_product_embeddings = shop_product_embeddings.reshape(
154
+ shop_product_embeddings.shape[0], -1
155
+ )
156
+
157
+ # Shop 제품 임베딩을 NumPy 배열로 변환
158
+ shop_product_embeddings = np.array(shop_product_embeddings)
159
+
160
+ # Autoencoder로 차원 축소 (512 -> 128)
161
+ shop_product_embeddings_reduced = (
162
+ autoencoder.encoder(torch.tensor(shop_product_embeddings).float())
163
+ .detach()
164
+ .numpy()
165
+ )
166
+
167
+ # similar_product_embedding을 (1, 128)로 변환
168
+ similar_product_embedding = similar_product_embedding.reshape(1, -1)
169
+
170
+ print(f"Similar product embedding dimension: {similar_product_embedding.shape}")
171
+ print(f"Shop product embedding dimension: {shop_product_embeddings_reduced.shape}")
172
 
173
  # Cosine Similarity 계산
174
+ similarities = calculate_similarity(
175
+ similar_product_embedding, shop_product_embeddings_reduced
176
+ )
177
  most_similar_index = np.argmax(similarities)
178
 
179
  return shop_product_ids[most_similar_index]