mylessss committed on
Commit
d86024f
·
1 Parent(s): 697c77c
Files changed (2) hide show
  1. README.md +2 -2
  2. app.py +13 -11
README.md CHANGED
@@ -1,12 +1,12 @@
1
  ---
2
- title: motherduck embedding explorer
3
  emoji: 🦆
4
  colorFrom: purple
5
  colorTo: indigo
6
  sdk: docker
7
  pinned: false
8
  license: mit
9
- short_description: motherduck embedding explorer
10
  ---
11
 
12
  Check out marimo at <https://github.com/marimo-team/marimo>
 
1
  ---
2
+ title: motherduck embeddings visualizer
3
  emoji: 🦆
4
  colorFrom: purple
5
  colorTo: indigo
6
  sdk: docker
7
  pinned: false
8
  license: mit
9
+ short_description: Visualize embeddings in a 2D space with marimo
10
  ---
11
 
12
  Check out marimo at <https://github.com/marimo-team/marimo>
app.py CHANGED
@@ -23,7 +23,6 @@ app = marimo.App(width="medium")
23
  @app.cell
24
  def __():
25
  import marimo as mo
26
-
27
  return (mo,)
28
 
29
 
@@ -35,7 +34,8 @@ def __(mo):
35
 
36
  > Text embeddings have become a crucial tool in AI/ML applications, allowing us to convert text into numerical vectors that capture semantic meaning. These vectors are often used for semantic search, but in this blog post, we'll explore how to visualize and explore text embeddings interactively using MotherDuck and marimo.
37
 
38
- [_Read the full blog here._](https://motherduck.com/blog/MotherDuck-Visualize-Embeddings-Marimo/)
 
39
  """
40
  )
41
  return
@@ -102,13 +102,15 @@ def __(demo_with_embeddings, mo, my_db):
102
 
103
 
104
  @app.cell
105
- def __(PCA, hdbscan, np, umap):
106
  def umap_reduce(np_array, metric="cosine"):
107
  """
108
  Reduce the dimensionality of the embeddings to 2D using
109
  UMAP algorithm. UMAP preserves both local and global structure
110
  of the high-dimensional data.
111
  """
 
 
112
  reducer = umap.UMAP(
113
  n_components=2, # Reduce to 2D for visualization
114
  metric=metric, # Default: cosine similarity for text embeddings
@@ -117,12 +119,16 @@ def __(PCA, hdbscan, np, umap):
117
  )
118
  return reducer.fit_transform(np_array)
119
 
 
120
  def cluster_points(np_array, min_cluster_size=4, max_cluster_size=50):
121
  """
122
  Cluster the embeddings using HDBSCAN algorithm.
123
  We first reduce dimensionality to 50D with PCA to speed up clustering,
124
  while still preserving most of the important information.
125
  """
 
 
 
126
  pca = PCA(n_components=50)
127
  np_array = pca.fit_transform(np_array)
128
 
@@ -135,7 +141,6 @@ def __(PCA, hdbscan, np, umap):
135
  return np.where(
136
  hdb.labels_ == -1, "outlier", "cluster_" + hdb.labels_.astype(str)
137
  )
138
-
139
  return cluster_points, umap_reduce
140
 
141
 
@@ -180,6 +185,8 @@ def __(
180
  umap_reduce,
181
  ):
182
  with mo.status.spinner("Clustering points...") as _s:
 
 
183
  embeddings_array = embeddings["text_embedding"].to_numpy()
184
  hdb_labels = cluster_points(
185
  embeddings_array,
@@ -189,7 +196,7 @@ def __(
189
  _s.update("Reducing dimensionality...")
190
  embeddings_2d = umap_reduce(embeddings_array, metric=metric_dropdown.value)
191
  mo.show_code()
192
- return embeddings_2d, embeddings_array, hdb_labels
193
 
194
 
195
  @app.cell
@@ -274,19 +281,14 @@ def __():
274
  # Data manipulation and database connections
275
  import polars as pl
276
  import duckdb
277
- import numba # <- FYI, this module takes a while to load, be patient
278
  import pyarrow
279
 
280
  # Visualization
281
  import altair as alt
282
 
283
  # ML tools for dimensionality reduction and clustering
284
- import umap # For reducing high-dimensional embeddings to 2D
285
- import hdbscan # For clustering similar embeddings
286
  import numpy as np
287
- from sklearn.decomposition import PCA
288
-
289
- return PCA, alt, duckdb, hdbscan, np, numba, pl, pyarrow, umap
290
 
291
 
292
  if __name__ == "__main__":
 
23
  @app.cell
24
  def __():
25
  import marimo as mo
 
26
  return (mo,)
27
 
28
 
 
34
 
35
  > Text embeddings have become a crucial tool in AI/ML applications, allowing us to convert text into numerical vectors that capture semantic meaning. These vectors are often used for semantic search, but in this blog post, we'll explore how to visualize and explore text embeddings interactively using MotherDuck and marimo.
36
 
37
+ !!! Info
38
+ **This marimo application is the result of [this blog](https://motherduck.com/blog/MotherDuck-Visualize-Embeddings-Marimo/).** It is recommended to go through the blog first.
39
  """
40
  )
41
  return
 
102
 
103
 
104
  @app.cell
105
+ def __(np):
106
  def umap_reduce(np_array, metric="cosine"):
107
  """
108
  Reduce the dimensionality of the embeddings to 2D using
109
  UMAP algorithm. UMAP preserves both local and global structure
110
  of the high-dimensional data.
111
  """
112
+ import umap
113
+
114
  reducer = umap.UMAP(
115
  n_components=2, # Reduce to 2D for visualization
116
  metric=metric, # Default: cosine similarity for text embeddings
 
119
  )
120
  return reducer.fit_transform(np_array)
121
 
122
+
123
  def cluster_points(np_array, min_cluster_size=4, max_cluster_size=50):
124
  """
125
  Cluster the embeddings using HDBSCAN algorithm.
126
  We first reduce dimensionality to 50D with PCA to speed up clustering,
127
  while still preserving most of the important information.
128
  """
129
+ import hdbscan
130
+ from sklearn.decomposition import PCA
131
+
132
  pca = PCA(n_components=50)
133
  np_array = pca.fit_transform(np_array)
134
 
 
141
  return np.where(
142
  hdb.labels_ == -1, "outlier", "cluster_" + hdb.labels_.astype(str)
143
  )
 
144
  return cluster_points, umap_reduce
145
 
146
 
 
185
  umap_reduce,
186
  ):
187
  with mo.status.spinner("Clustering points...") as _s:
188
+ import numba # <- FYI, this module takes a while to load, be patient
189
+
190
  embeddings_array = embeddings["text_embedding"].to_numpy()
191
  hdb_labels = cluster_points(
192
  embeddings_array,
 
196
  _s.update("Reducing dimensionality...")
197
  embeddings_2d = umap_reduce(embeddings_array, metric=metric_dropdown.value)
198
  mo.show_code()
199
+ return embeddings_2d, embeddings_array, hdb_labels, numba
200
 
201
 
202
  @app.cell
 
281
  # Data manipulation and database connections
282
  import polars as pl
283
  import duckdb
 
284
  import pyarrow
285
 
286
  # Visualization
287
  import altair as alt
288
 
289
  # ML tools for dimensionality reduction and clustering
 
 
290
  import numpy as np
291
+ return alt, duckdb, np, pl, pyarrow
 
 
292
 
293
 
294
  if __name__ == "__main__":