davidberenstein1957 HF staff committed on
Commit
8636daf
·
verified ·
1 Parent(s): d593d0c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -31
app.py CHANGED
@@ -4,37 +4,38 @@ from sentence_transformers import SentenceTransformer
4
  import duckdb
5
  from huggingface_hub import get_token
6
 
7
- model = SentenceTransformer("Snowflake/snowflake-arctic-embed-m-v1.5")
8
-
9
- def similarity_search(
10
- query: str,
11
- k: int = 5,
12
- dataset_name: str = "smol-blueprint-project/hf-blogs-text-embeddings",
13
- embedding_column: str = "embedding",
14
- ):
15
- # Use same model as used for indexing
16
- query_vector = model.encode(query)
17
- embedding_dim = model.get_sentence_embedding_dimension()
18
-
19
- sql = f"""
20
- SELECT
21
- title,
22
- author,
23
- date,
24
- local,
25
- tags,
26
- URL,
27
- chunk,
28
- array_cosine_distance(
29
- {embedding_column}::float[{embedding_dim}],
30
- {query_vector.tolist()}::float[{embedding_dim}]
31
- ) as distance
32
- FROM 'hf://datasets/{dataset_name}/**/*.parquet'
33
- ORDER BY distance
34
- LIMIT {k}
 
 
35
  """
36
-
37
- return duckdb.sql(sql).to_df()
38
 
39
  with gr.Blocks() as demo:
40
  gr.Markdown("""# Vector Search Hub Datasets
@@ -43,7 +44,7 @@ with gr.Blocks() as demo:
43
  query = gr.Textbox(label="Query")
44
  k = gr.Slider(1, 10, value=5, label="Number of results")
45
  btn = gr.Button("Search")
46
- results = gr.Dataframe(headers=["title", "url", "content", "distance"])
47
  btn.click(fn=similarity_search, inputs=[query, k], outputs=[results])
48
 
49
 
 
4
  import duckdb
5
  from huggingface_hub import get_token
6
 
7
+ from sentence_transformers import SentenceTransformer
8
+ from sentence_transformers.models import StaticEmbedding
9
+ import duckdb
10
+
11
+ # Initialize a StaticEmbedding module
12
+ static_embedding = StaticEmbedding.from_model2vec("minishlab/potion-base-8M")
13
+ model = SentenceTransformer(modules=[static_embedding])
14
+
15
+ dataset_name = "smol-blueprint/fineweb-bbc-news-text-embeddings"
16
+ embedding_column = "embedding"
17
+
18
+ duckdb.sql(
19
+ query=f"""
20
+ INSTALL vss;
21
+ LOAD vss;
22
+ CREATE TABLE embeddings AS
23
+ SELECT *, {embedding_column}::float[{model.get_sentence_embedding_dimension()}] as embedding_float
24
+ FROM 'hf://datasets/{dataset_name}/**/*.parquet';
25
+ CREATE INDEX my_hnsw_index ON embeddings USING HNSW (embedding_float) WITH (metric = 'cosine');
26
+ """
27
+ )
28
+
29
+ def similarity_search(query: str, k: int = 5):
30
+ embedding = model.encode(query).tolist()
31
+ return duckdb.sql(
32
+ query=f"""
33
+ SELECT url, chunk, array_cosine_distance(embedding_float, {embedding}::FLOAT[{model.get_sentence_embedding_dimension()}]) as distance
34
+ FROM embeddings
35
+ ORDER BY distance
36
+ LIMIT {k};
37
  """
38
+ ).to_df()
 
39
 
40
  with gr.Blocks() as demo:
41
  gr.Markdown("""# Vector Search Hub Datasets
 
44
  query = gr.Textbox(label="Query")
45
  k = gr.Slider(1, 10, value=5, label="Number of results")
46
  btn = gr.Button("Search")
47
+ results = gr.Dataframe(headers=["url", "chunk", "distance"])
48
  btn.click(fn=similarity_search, inputs=[query, k], outputs=[results])
49
 
50