abhijeethp committed on
Commit
0f35f83
·
1 Parent(s): 7f48873

added similarity score calc

Browse files
.gitignore CHANGED
@@ -1 +1,2 @@
1
- .idea
 
 
1
+ .idea
2
+ venv
app.py CHANGED
@@ -1,4 +1,23 @@
1
  import streamlit as st
 
2
 
3
- x = st.slider('Select a value')
4
- st.write(x, 'squared is', x * x)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ from battleground_tab import BattlegroundTab
3
 
4
+
5
def main():
    """Render the Embedding Arena page: a title followed by three tabs."""
    st.title("Embedding Arena")
    battleground, leaderboard, about = st.tabs(["Battleground", "Leaderboard", "About"])

    with battleground:
        BattlegroundTab().ui()

    # The remaining tabs are placeholders: a header followed by a TODO marker.
    for tab, heading in ((leaderboard, "Leaderboard"), (about, "About")):
        with tab:
            st.header(heading)
            st.write("#TODO")


# Run the main function when the app is executed
if __name__ == "__main__":
    main()
battleground_tab.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from embedding_models.registry import registry as embedding
3
+ from similarity_models.registry import registry as similarity
4
+ import pandas as pd
5
+
6
+
7
def calculate_similarity(text_1, text_2):
    """Score two texts against each other under every registered embedding model.

    Each model returned by the embedding registry embeds both texts, and the
    similarity registry's "cosine" model scores the resulting pair.

    Returns:
        A list of ``(model_name, score)`` tuples, one per embedding model, in
        the iteration order of the embedding registry's mapping.
    """
    # TODO: pick any random similarity model
    scorer = similarity.models()["cosine"]

    # TODO: pick any N random embedding models
    return [
        (label, scorer.score(embedder.embed(text_1), embedder.embed(text_2)))
        for label, embedder in embedding.models().items()
    ]
19
+
20
+
21
class BattlegroundTab:
    """Streamlit tab that compares embedding models on a pair of user texts."""

    def __init__(self):
        pass

    def ui(self):
        """Draw the tab and, when the button is clicked, show the score table.

        The user enters two texts and picks an expected similarity on a 1-10
        slider (scaled to 0.1-1.0). On click, every model's score is listed
        and the model whose score is closest to the expectation is crowned.
        """
        st.header("Battleground")
        st.write("Battle embedding models with each other! May the best win!")

        col1, col2 = st.columns(2)
        with col1:
            text_1 = st.text_input("Enter first text here!")

        with col2:
            text_2 = st.text_input("Enter second text here!")

        expected_sc = st.slider(
            'How similar do you feel these words are',
            min_value=1, max_value=10, step=1, value=5) / 10
        st.write('Expected Similarity Score = ', expected_sc)

        if not st.button("Calculate Similarity Score"):
            return

        # Guard against blank input: embedding an empty string is rejected by
        # the embedding backend, which would surface as a raw traceback.
        if not text_1.strip() or not text_2.strip():
            st.warning("Please enter both texts before calculating the similarity score.")
            return

        similarity_scores = calculate_similarity(text_1, text_2)
        if not similarity_scores:
            # idxmin() raises on an empty frame, so bail out early.
            st.warning("No embedding models are available to compare.")
            return

        df = pd.DataFrame(similarity_scores, columns=['Model', 'Score'])
        # Loss is the absolute distance between a model's score and the
        # user's expectation; the smallest loss wins.
        df['Loss'] = abs(df['Score'] - expected_sc)
        winner_model = df.loc[df['Loss'].idxmin(), 'Model']
        df['Winner'] = ''
        df.loc[df['Model'] == winner_model, 'Winner'] = '👑'
        # Loss is only an intermediate column and is not shown to the user.
        df = df.drop(columns=['Loss'])
        markdown_table = df.to_markdown(index=False)
        st.markdown(markdown_table)
embedding_models/open_ai.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openai import OpenAI
2
+
3
class OpenAIAda002():
    """Embedding model that uses OpenAI's ``text-embedding-ada-002``."""

    def name(self):
        """Return this model's name string."""
        return "OpenAI/text-embedding-ada-002"

    def embed(self, text):
        """Request an embedding for ``text`` and return the first result's vector."""
        # A new client is constructed on every call.
        response = OpenAI().embeddings.create(
            model="text-embedding-ada-002",
            input=[text],
        )
        # One input was sent, so the embedding of interest is the first item.
        return response.data[0].embedding
16
+
17
class OpenAI3Large():
    """Embedding model that uses OpenAI's ``text-embedding-3-large``."""

    def name(self):
        """Return this model's name string."""
        return "OpenAI/text-embedding-3-large"

    def embed(self, text):
        """Request an embedding for ``text`` and return the first result's vector."""
        # A new client is constructed on every call.
        response = OpenAI().embeddings.create(
            model="text-embedding-3-large",
            input=[text],
        )
        # One input was sent, so the embedding of interest is the first item.
        return response.data[0].embedding
embedding_models/registry.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from embedding_models import open_ai
2
class _Registry:
    """Catalogue of the embedding models, looked up by name."""

    def __init__(self):
        pass

    def _models(self):
        """Return a list with a new instance of each embedding model."""
        return [open_ai.OpenAIAda002(), open_ai.OpenAI3Large()]

    def models(self):
        """Return a dict mapping each model's ``name()`` to the model instance."""
        by_name = {}
        for model in self._models():
            by_name[model.name()] = model
        return by_name


# Module-level instance that other modules import.
registry = _Registry()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ streamlit
2
+ scikit-learn
3
+ openai
4
+ tabulate
similarity_models/cosine_similarity.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from sklearn.metrics.pairwise import cosine_similarity
3
+
4
class CosineSimilarity:
    """Similarity model that scores a pair of embeddings by cosine similarity."""

    def __init__(self):
        pass

    def name(self):
        """Return this similarity model's name string."""
        return "cosine"

    def score(self, embedding_1, embedding_2):
        """Return the cosine similarity between ``embedding_1`` and ``embedding_2``."""
        # Each embedding is wrapped in an outer list so it becomes a
        # single-row array before being handed to cosine_similarity.
        row_1, row_2 = (np.array([vector]) for vector in (embedding_1, embedding_2))
        pairwise = cosine_similarity(row_1, row_2)
        # Only one pair was compared; pull out that single entry.
        return pairwise[0][0]
similarity_models/registry.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from similarity_models.cosine_similarity import CosineSimilarity
2
+
3
class _Registry:
    """Catalogue of the similarity models, looked up by name."""

    def __init__(self):
        pass

    def _models(self):
        """Return a list with a new instance of each similarity model."""
        return [CosineSimilarity()]

    def models(self):
        """Return a dict mapping each model's ``name()`` to the model instance."""
        by_name = {}
        for model in self._models():
            by_name[model.name()] = model
        return by_name


# Module-level instance that other modules import.
registry = _Registry()