# Commit 58706b9 by Trent — "Replace multicore TSNE"
import gzip
import json
from collections import Counter
import pandas as pd
import numpy as np
import jax.numpy as jnp
import tqdm
from sentence_transformers import util
from typing import List, Union
import torch
from backend.utils import load_model, filter_questions, load_embeddings
from sklearn.manifold import TSNE
def cos_sim(a, b):
    """Row-wise cosine similarity between two batches of embeddings.

    Args:
        a: array of shape (m, d) — m embedding vectors.
        b: array of shape (n, d) — n embedding vectors.

    Returns:
        (m, n) array whose (i, j) entry is the cosine similarity
        between a[i] and b[j].
    """
    # Normalize every row independently. The previous version divided by
    # jnp.linalg.norm(b) — the Frobenius norm of the whole matrix — which
    # gives wrong similarities whenever a or b contains more than one row.
    a_unit = a / jnp.linalg.norm(a, axis=-1, keepdims=True)
    b_unit = b / jnp.linalg.norm(b, axis=-1, keepdims=True)
    return jnp.matmul(a_unit, jnp.transpose(b_unit))
# We get similarity between embeddings.
def text_similarity(anchor: str, inputs: List[str], model_name: str, model_dict: dict):
    """Score each input sentence against the anchor by cosine similarity.

    Args:
        anchor: the reference sentence.
        inputs: sentences to compare against the anchor.
        model_name: key used to look up the encoder via ``load_model``.
        model_dict: registry passed through to ``load_model``.

    Returns:
        DataFrame with columns ``inputs`` (the sentences) and ``score``
        (cosine similarity to the anchor, rounded to 3 decimals).
    """
    print(model_name)
    model = load_model(model_name, model_dict)
    # Creating embeddings. Some registry entries are a single bi-encoder,
    # others a (query encoder, passage encoder) pair — presumably; confirm
    # against load_model.
    if hasattr(model, 'encode'):
        anchor_emb = model.encode(anchor)[None, :]
        inputs_emb = model.encode(inputs)
    else:
        assert len(model) == 2
        anchor_emb = model[0].encode(anchor)[None, :]
        inputs_emb = model[1].encode(inputs)
    # Squeeze only the anchor axis: a bare squeeze would collapse the result
    # to a 0-d scalar when len(inputs) == 1, and list() would then raise.
    similarity = list(jnp.squeeze(cos_sim(anchor_emb, inputs_emb), axis=0))
    # Convert to plain floats so the column holds rounded Python numbers,
    # not device-array scalars.
    scores = [round(float(s), 3) for s in similarity]
    # Returning a Pandas' dataframe
    d = {'inputs': inputs, 'score': scores}
    df = pd.DataFrame(d, columns=['inputs', 'score'])
    return df
# Search
def text_search(anchor: str, n_answers: int, model_name: str, model_dict: dict):
    """Semantic search for the query over the precomputed corpus embeddings.

    Args:
        anchor: free-text query.
        n_answers: number of top hits to return.
        model_name: must be "distilbert_qa"; looked up via ``load_model``.
        model_dict: registry passed through to ``load_model``.

    Returns:
        Three parallel lists: post titles, scores formatted to 3 decimals,
        and Stack Overflow question URLs.
    """
    # Proceeding with model
    print(model_name)
    assert model_name == "distilbert_qa"
    model = load_model(model_name, model_dict)
    # Embed the query and pull in the stored corpus embeddings.
    query_emb = model.encode(anchor, convert_to_tensor=True)[None, :]
    print("loading embeddings")
    corpus_emb = load_embeddings()
    # Top-k retrieval by dot-product score.
    hits = util.semantic_search(query_emb, corpus_emb, score_function=util.dot_score, top_k=n_answers)[0]
    filtered_posts = filter_questions("python")
    print(f"{len(filtered_posts)} posts found with tag: python")
    titles, scores, links = [], [], []
    for match in hits:
        post = filtered_posts[match['corpus_id']]
        titles.append(post['title'])
        scores.append(f"{match['score']:.3f}")
        links.append(f"https://stackoverflow.com/q/{post['id']}")
    return titles, scores, links
def text_cluster(anchor: str, n_answers: int, model_name: str, model_dict: dict):
    """Project the top search hits into 3-D t-SNE space and plot them.

    Retrieves the ``n_answers`` posts nearest to ``anchor``, reduces their
    embeddings (plus the query's) to 3 dimensions with t-SNE, colors each
    point by the most frequent tags among the hits, and returns a Plotly
    3-D scatter figure with the query point enlarged and labeled 'QUERY'.

    Args:
        anchor: free-text query.
        n_answers: number of top hits to cluster.
        model_name: must be "distilbert_qa"; looked up via ``load_model``.
        model_dict: registry passed through to ``load_model``.

    Returns:
        A ``plotly.graph_objects.Figure`` (3-D scatter).
    """
    import plotly.express as px  # local import: plotly is only used here

    # Proceeding with model
    print(model_name)
    assert model_name == "distilbert_qa"
    model = load_model(model_name, model_dict)
    # Embed the query and pull in the stored corpus embeddings.
    query_emb = model.encode(anchor, convert_to_tensor=True)[None, :]
    print("loading embeddings")
    corpus_emb = load_embeddings()
    # Top-k retrieval by dot-product score.
    hits = util.semantic_search(query_emb, corpus_emb, score_function=util.dot_score, top_k=n_answers)[0]
    filtered_posts = filter_questions("python")
    hits_dict = [filtered_posts[hit['corpus_id']] for hit in hits]
    # Append the query itself as a pseudo-post so it shows up in the plot.
    hits_dict.append(dict(id='1', title=anchor, tags=['']))
    hits_emb = torch.stack([corpus_emb[hit['corpus_id']] for hit in hits])
    hits_emb = torch.cat((hits_emb, query_emb))
    # Dimensionality reduction with t-SNE
    tsne = TSNE(n_components=3, verbose=1, perplexity=15, n_iter=1000)
    tsne_results = tsne.fit_transform(hits_emb.cpu())
    # DataFrame columns come from the post dict keys (incl. 'title', 'tags').
    df = pd.DataFrame(hits_dict)
    # Tally tag frequencies across all hits in one pass.
    counter = Counter()
    for tag_list in df['tags']:
        counter.update(tag_list)
    df_tags = pd.DataFrame(counter.most_common(), columns=['Tag', 'Mentions'])
    # Drop the single most common tag (presumably the filter tag 'python'
    # itself — confirm) and keep the next four for coloring.
    most_common_tags = list(df_tags['Tag'])[1:5]
    # Label each post with the first common tag it carries; the for/else
    # falls through to 'others' — including when most_common_tags is empty,
    # which the original if/elif chain mishandled (it appended nothing,
    # leaving `labels` shorter than the DataFrame).
    labels = []
    for tags_list in df['tags']:
        for common_tag in most_common_tags:
            if common_tag in tags_list:
                labels.append(common_tag)
                break
        else:
            labels.append('others')
    df['labels'] = labels
    df['tsne_x'] = tsne_results[:, 0]
    df['tsne_y'] = tsne_results[:, 1]
    df['tsne_z'] = tsne_results[:, 2]
    df['size'] = 2  # scalar broadcast; every point starts at size 2
    # Making the query bigger than the rest of the observations.
    # Use .loc: the original chained assignment df['size'][...] = ... raises
    # SettingWithCopyWarning and can silently fail to write under pandas
    # copy-on-write mode.
    df.loc[len(df) - 1, 'size'] = 10
    df.loc[len(df) - 1, 'labels'] = 'QUERY'
    fig = px.scatter_3d(df, x='tsne_x', y='tsne_y', z='tsne_z', color='labels', size='size',
                        color_discrete_sequence=px.colors.qualitative.D3, hover_data=[df.title])
    return fig