updates
Browse files
README.md
CHANGED
@@ -1,12 +1,12 @@
|
|
1 |
---
|
2 |
-
title: motherduck
|
3 |
emoji: 🦆
|
4 |
colorFrom: purple
|
5 |
colorTo: indigo
|
6 |
sdk: docker
|
7 |
pinned: false
|
8 |
license: mit
|
9 |
-
short_description:
|
10 |
---
|
11 |
|
12 |
Check out marimo at <https://github.com/marimo-team/marimo>
|
|
|
1 |
---
|
2 |
+
title: motherduck embeddings visualizer
|
3 |
emoji: 🦆
|
4 |
colorFrom: purple
|
5 |
colorTo: indigo
|
6 |
sdk: docker
|
7 |
pinned: false
|
8 |
license: mit
|
9 |
+
short_description: Visualize embeddings in a 2D space with marimo
|
10 |
---
|
11 |
|
12 |
Check out marimo at <https://github.com/marimo-team/marimo>
|
app.py
CHANGED
@@ -23,7 +23,6 @@ app = marimo.App(width="medium")
|
|
23 |
@app.cell
|
24 |
def __():
|
25 |
import marimo as mo
|
26 |
-
|
27 |
return (mo,)
|
28 |
|
29 |
|
@@ -35,7 +34,8 @@ def __(mo):
|
|
35 |
|
36 |
> Text embeddings have become a crucial tool in AI/ML applications, allowing us to convert text into numerical vectors that capture semantic meaning. These vectors are often used for semantic search, but in this blog post, we'll explore how to visualize and explore text embeddings interactively using MotherDuck and marimo.
|
37 |
|
38 |
-
|
|
|
39 |
"""
|
40 |
)
|
41 |
return
|
@@ -102,13 +102,15 @@ def __(demo_with_embeddings, mo, my_db):
|
|
102 |
|
103 |
|
104 |
@app.cell
|
105 |
-
def __(
|
106 |
def umap_reduce(np_array, metric="cosine"):
|
107 |
"""
|
108 |
Reduce the dimensionality of the embeddings to 2D using
|
109 |
UMAP algorithm. UMAP preserves both local and global structure
|
110 |
of the high-dimensional data.
|
111 |
"""
|
|
|
|
|
112 |
reducer = umap.UMAP(
|
113 |
n_components=2, # Reduce to 2D for visualization
|
114 |
metric=metric, # Default: cosine similarity for text embeddings
|
@@ -117,12 +119,16 @@ def __(PCA, hdbscan, np, umap):
|
|
117 |
)
|
118 |
return reducer.fit_transform(np_array)
|
119 |
|
|
|
120 |
def cluster_points(np_array, min_cluster_size=4, max_cluster_size=50):
|
121 |
"""
|
122 |
Cluster the embeddings using HDBSCAN algorithm.
|
123 |
We first reduce dimensionality to 50D with PCA to speed up clustering,
|
124 |
while still preserving most of the important information.
|
125 |
"""
|
|
|
|
|
|
|
126 |
pca = PCA(n_components=50)
|
127 |
np_array = pca.fit_transform(np_array)
|
128 |
|
@@ -135,7 +141,6 @@ def __(PCA, hdbscan, np, umap):
|
|
135 |
return np.where(
|
136 |
hdb.labels_ == -1, "outlier", "cluster_" + hdb.labels_.astype(str)
|
137 |
)
|
138 |
-
|
139 |
return cluster_points, umap_reduce
|
140 |
|
141 |
|
@@ -180,6 +185,8 @@ def __(
|
|
180 |
umap_reduce,
|
181 |
):
|
182 |
with mo.status.spinner("Clustering points...") as _s:
|
|
|
|
|
183 |
embeddings_array = embeddings["text_embedding"].to_numpy()
|
184 |
hdb_labels = cluster_points(
|
185 |
embeddings_array,
|
@@ -189,7 +196,7 @@ def __(
|
|
189 |
_s.update("Reducing dimensionality...")
|
190 |
embeddings_2d = umap_reduce(embeddings_array, metric=metric_dropdown.value)
|
191 |
mo.show_code()
|
192 |
-
return embeddings_2d, embeddings_array, hdb_labels
|
193 |
|
194 |
|
195 |
@app.cell
|
@@ -274,19 +281,14 @@ def __():
|
|
274 |
# Data manipulation and database connections
|
275 |
import polars as pl
|
276 |
import duckdb
|
277 |
-
import numba # <- FYI, this module takes a while to load, be patient
|
278 |
import pyarrow
|
279 |
|
280 |
# Visualization
|
281 |
import altair as alt
|
282 |
|
283 |
# ML tools for dimensionality reduction and clustering
|
284 |
-
import umap # For reducing high-dimensional embeddings to 2D
|
285 |
-
import hdbscan # For clustering similar embeddings
|
286 |
import numpy as np
|
287 |
-
|
288 |
-
|
289 |
-
return PCA, alt, duckdb, hdbscan, np, numba, pl, pyarrow, umap
|
290 |
|
291 |
|
292 |
if __name__ == "__main__":
|
|
|
23 |
@app.cell
|
24 |
def __():
|
25 |
import marimo as mo
|
|
|
26 |
return (mo,)
|
27 |
|
28 |
|
|
|
34 |
|
35 |
> Text embeddings have become a crucial tool in AI/ML applications, allowing us to convert text into numerical vectors that capture semantic meaning. These vectors are often used for semantic search, but in this blog post, we'll explore how to visualize and explore text embeddings interactively using MotherDuck and marimo.
|
36 |
|
37 |
+
!!! Info
|
38 |
+
**This marimo application is the result of [this blog](https://motherduck.com/blog/MotherDuck-Visualize-Embeddings-Marimo/).** It is recommended to go through the blog first.
|
39 |
"""
|
40 |
)
|
41 |
return
|
|
|
102 |
|
103 |
|
104 |
@app.cell
|
105 |
+
def __(np):
|
106 |
def umap_reduce(np_array, metric="cosine"):
|
107 |
"""
|
108 |
Reduce the dimensionality of the embeddings to 2D using
|
109 |
UMAP algorithm. UMAP preserves both local and global structure
|
110 |
of the high-dimensional data.
|
111 |
"""
|
112 |
+
import umap
|
113 |
+
|
114 |
reducer = umap.UMAP(
|
115 |
n_components=2, # Reduce to 2D for visualization
|
116 |
metric=metric, # Default: cosine similarity for text embeddings
|
|
|
119 |
)
|
120 |
return reducer.fit_transform(np_array)
|
121 |
|
122 |
+
|
123 |
def cluster_points(np_array, min_cluster_size=4, max_cluster_size=50):
|
124 |
"""
|
125 |
Cluster the embeddings using HDBSCAN algorithm.
|
126 |
We first reduce dimensionality to 50D with PCA to speed up clustering,
|
127 |
while still preserving most of the important information.
|
128 |
"""
|
129 |
+
import hdbscan
|
130 |
+
from sklearn.decomposition import PCA
|
131 |
+
|
132 |
pca = PCA(n_components=50)
|
133 |
np_array = pca.fit_transform(np_array)
|
134 |
|
|
|
141 |
return np.where(
|
142 |
hdb.labels_ == -1, "outlier", "cluster_" + hdb.labels_.astype(str)
|
143 |
)
|
|
|
144 |
return cluster_points, umap_reduce
|
145 |
|
146 |
|
|
|
185 |
umap_reduce,
|
186 |
):
|
187 |
with mo.status.spinner("Clustering points...") as _s:
|
188 |
+
import numba # <- FYI, this module takes a while to load, be patient
|
189 |
+
|
190 |
embeddings_array = embeddings["text_embedding"].to_numpy()
|
191 |
hdb_labels = cluster_points(
|
192 |
embeddings_array,
|
|
|
196 |
_s.update("Reducing dimensionality...")
|
197 |
embeddings_2d = umap_reduce(embeddings_array, metric=metric_dropdown.value)
|
198 |
mo.show_code()
|
199 |
+
return embeddings_2d, embeddings_array, hdb_labels, numba
|
200 |
|
201 |
|
202 |
@app.cell
|
|
|
281 |
# Data manipulation and database connections
|
282 |
import polars as pl
|
283 |
import duckdb
|
|
|
284 |
import pyarrow
|
285 |
|
286 |
# Visualization
|
287 |
import altair as alt
|
288 |
|
289 |
# ML tools for dimensionality reduction and clustering
|
|
|
|
|
290 |
import numpy as np
|
291 |
+
return alt, duckdb, np, pl, pyarrow
|
|
|
|
|
292 |
|
293 |
|
294 |
if __name__ == "__main__":
|