Spaces:
Runtime error
Runtime error
abhijeethp
committed on
Commit
·
0f35f83
1
Parent(s):
7f48873
added similarity score calc
Browse files- .gitignore +2 -1
- app.py +21 -2
- battleground_tab.py +51 -0
- embedding_models/open_ai.py +29 -0
- embedding_models/registry.py +15 -0
- requirements.txt +4 -0
- similarity_models/cosine_similarity.py +15 -0
- similarity_models/registry.py +15 -0
.gitignore
CHANGED
@@ -1 +1,2 @@
|
|
1 |
-
.idea
|
|
|
|
1 |
+
.idea
|
2 |
+
venv
|
app.py
CHANGED
@@ -1,4 +1,23 @@
|
|
1 |
import streamlit as st
|
|
|
2 |
|
3 |
-
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
+
from battleground_tab import BattlegroundTab
|
3 |
|
4 |
+
|
5 |
+
def main():
    """Render the Embedding Arena page: a title followed by three tabs.

    The first tab hosts the battleground UI; the other two currently show
    only a header and a "#TODO" placeholder.
    """
    st.title("Embedding Arena")
    battleground, leaderboard, about = st.tabs(["Battleground", "Leaderboard", "About"])

    with battleground:
        BattlegroundTab().ui()

    # The remaining tabs are placeholders and share the same layout.
    for placeholder_tab, heading in ((leaderboard, "Leaderboard"), (about, "About")):
        with placeholder_tab:
            st.header(heading)
            st.write("#TODO")


# Script entry point: build the page when this file is executed directly.
if __name__ == "__main__":
    main()
|
battleground_tab.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from embedding_models.registry import registry as embedding
|
3 |
+
from similarity_models.registry import registry as similarity
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
|
7 |
+
def calculate_similarity(text_1, text_2):
    """Score how similar two texts are under every registered embedding model.

    Each model in the embedding registry embeds both texts, and the "cosine"
    similarity model scores the resulting pair of embeddings.

    Returns:
        A list of ``(model_name, score)`` tuples, one per embedding model,
        in the iteration order of the embedding registry.
    """
    # TODO: pick any random similarity model
    scorer = similarity.models()["cosine"]

    # TODO: pick any N random embedding models
    return [
        (model_name, scorer.score(embedder.embed(text_1), embedder.embed(text_2)))
        for model_name, embedder in embedding.models().items()
    ]
|
19 |
+
|
20 |
+
|
21 |
+
class BattlegroundTab:
    """Streamlit tab that pits the registered embedding models against each other.

    The user enters two texts and the similarity they expect; every model's
    score is shown in a table and the model whose score lies closest to the
    expectation is marked as the winner.
    """

    def ui(self):
        """Render the tab's widgets and, after a button press, the result table."""
        st.header("Battleground")
        st.write("Battle embedding models with each other! May the best win!")

        col1, col2 = st.columns(2)
        with col1:
            text_1 = st.text_input("Enter first text here!")

        with col2:
            text_2 = st.text_input("Enter second text here!")

        # The slider moves in whole steps 1..10; dividing by 10 turns the
        # selection into an expected score between 0.1 and 1.0.
        expected_sc = st.slider(
            'How similar do you feel these words are?',
            min_value=1, max_value=10, step=1, value=5) / 10
        st.write('Expected Similarity Score = ', expected_sc)

        if not st.button("Calculate Similarity Score"):
            return

        # Do not send blank input to the embedding models; ask for text instead.
        if not text_1.strip() or not text_2.strip():
            st.warning("Please enter both texts before calculating a score.")
            return

        similarity_scores = calculate_similarity(text_1, text_2)
        if not similarity_scores:
            # idxmin() below would raise on an empty frame.
            st.warning("No embedding models are available to compare.")
            return

        df = pd.DataFrame(similarity_scores, columns=['Model', 'Score'])

        # The winner is the model whose score is closest to the expectation.
        loss = (df['Score'] - expected_sc).abs()
        winner_model = df.loc[loss.idxmin(), 'Model']
        df['Winner'] = ''
        df.loc[df['Model'] == winner_model, 'Winner'] = '👑'

        st.markdown(df.to_markdown(index=False))
|
embedding_models/open_ai.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from openai import OpenAI
|
2 |
+
|
3 |
+
class OpenAIAda002:
    """Embedding model backed by OpenAI's ``text-embedding-ada-002``."""

    # Created lazily on the first embed() call and reused afterwards, so
    # repeated calls on one instance do not build a new client every time.
    _client = None

    def name(self):
        """Return the display name that identifies this model."""
        return "OpenAI/text-embedding-ada-002"

    def embed(self, text):
        """Return the embedding vector for ``text``.

        ``text`` is sent as a one-item batch and the embedding of the first
        (only) item in the response is returned.
        """
        if self._client is None:
            # Deferred until first use so that merely constructing the model
            # does not require the client to be configured.
            self._client = OpenAI()
        resp = self._client.embeddings.create(
            input=[text],
            model="text-embedding-ada-002",
        )
        return resp.data[0].embedding
|
16 |
+
|
17 |
+
class OpenAI3Large:
    """Embedding model backed by OpenAI's ``text-embedding-3-large``."""

    # Created lazily on the first embed() call and reused afterwards, so
    # repeated calls on one instance do not build a new client every time.
    _client = None

    def name(self):
        """Return the display name that identifies this model."""
        return "OpenAI/text-embedding-3-large"

    def embed(self, text):
        """Return the embedding vector for ``text``.

        ``text`` is sent as a one-item batch and the embedding of the first
        (only) item in the response is returned.
        """
        if self._client is None:
            # Deferred until first use so that merely constructing the model
            # does not require the client to be configured.
            self._client = OpenAI()
        resp = self._client.embeddings.create(
            input=[text],
            model="text-embedding-3-large",
        )
        return resp.data[0].embedding
|
embedding_models/registry.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from embedding_models import open_ai
|
2 |
+
class _Registry:
    """Lookup of the available embedding models, keyed by model name."""

    def _models(self):
        """Return a fresh instance of every supported embedding model."""
        ada_002 = open_ai.OpenAIAda002()
        three_large = open_ai.OpenAI3Large()
        return [ada_002, three_large]

    def models(self):
        """Return a dict mapping each model's ``name()`` to the model."""
        by_name = {}
        for model in self._models():
            by_name[model.name()] = model
        return by_name


# Module-level instance that callers import.
registry = _Registry()
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
scikit-learn
|
3 |
+
openai
|
4 |
+
tabulate
|
similarity_models/cosine_similarity.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
3 |
+
|
4 |
+
class CosineSimilarity:
    """Similarity model that scores a pair of embeddings by cosine similarity."""

    def name(self):
        """Return the key under which this similarity model is looked up."""
        return "cosine"

    def score(self, embedding_1, embedding_2):
        """Return the cosine similarity between the two embeddings."""
        # Wrap each embedding in an outer list so it becomes a one-row array.
        row_a, row_b = (np.array([vector]) for vector in (embedding_1, embedding_2))
        pairwise = cosine_similarity(row_a, row_b)
        # One row against one row: the single entry is the score.
        return pairwise[0][0]
|
similarity_models/registry.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from similarity_models.cosine_similarity import CosineSimilarity
|
2 |
+
|
3 |
+
class _Registry:
    """Lookup of the available similarity models, keyed by model name."""

    def _models(self):
        """Return a fresh instance of every supported similarity model."""
        cosine = CosineSimilarity()
        return [cosine]

    def models(self):
        """Return a dict mapping each model's ``name()`` to the model."""
        by_name = {}
        for model in self._models():
            by_name[model.name()] = model
        return by_name


# Module-level instance that callers import.
registry = _Registry()
|