Spaces:

marimo-team
/

motherduck-embeddings-visualizer

Running

App Files Files Community

mylessss committed on Dec 10, 2024

Commit

d86024f

1 Parent(s): 697c77c

updates

Browse files

Files changed (2) hide show

README.md +2 -2
app.py +13 -11

README.md CHANGED Viewed

@@ -1,12 +1,12 @@
 ---
-title: motherduck embedding explorer
 emoji: 🦆
 colorFrom: purple
 colorTo: indigo
 sdk: docker
 pinned: false
 license: mit
-short_description: motherduck embedding explorer
 ---
 Check out marimo at <https://github.com/marimo-team/marimo>

 ---
+title: motherduck embeddings visualizer
 emoji: 🦆
 colorFrom: purple
 colorTo: indigo
 sdk: docker
 pinned: false
 license: mit
+short_description: Visualize embeddings in a 2D space with marimo
 ---
 Check out marimo at <https://github.com/marimo-team/marimo>

app.py CHANGED Viewed

@@ -23,7 +23,6 @@ app = marimo.App(width="medium")
 @app.cell
 def __():
     import marimo as mo
     return (mo,)
@@ -35,7 +34,8 @@ def __(mo):
         > Text embeddings have become a crucial tool in AI/ML applications, allowing us to convert text into numerical vectors that capture semantic meaning. These vectors are often used for semantic search, but in this blog post, we'll explore how to visualize and explore text embeddings interactively using MotherDuck and marimo.
-        [_Read the full blog here._](https://motherduck.com/blog/MotherDuck-Visualize-Embeddings-Marimo/)
         """
     )
     return
@@ -102,13 +102,15 @@ def __(demo_with_embeddings, mo, my_db):
 @app.cell
-def __(PCA, hdbscan, np, umap):
     def umap_reduce(np_array, metric="cosine"):
         """
         Reduce the dimensionality of the embeddings to 2D using
         UMAP algorithm. UMAP preserves both local and global structure
         of the high-dimensional data.
         """
         reducer = umap.UMAP(
             n_components=2,  # Reduce to 2D for visualization
             metric=metric,  # Default: cosine similarity for text embeddings
@@ -117,12 +119,16 @@ def __(PCA, hdbscan, np, umap):
         )
         return reducer.fit_transform(np_array)
     def cluster_points(np_array, min_cluster_size=4, max_cluster_size=50):
         """
         Cluster the embeddings using HDBSCAN algorithm.
         We first reduce dimensionality to 50D with PCA to speed up clustering,
         while still preserving most of the important information.
         """
         pca = PCA(n_components=50)
         np_array = pca.fit_transform(np_array)
@@ -135,7 +141,6 @@ def __(PCA, hdbscan, np, umap):
         return np.where(
             hdb.labels_ == -1, "outlier", "cluster_" + hdb.labels_.astype(str)
         )
     return cluster_points, umap_reduce
@@ -180,6 +185,8 @@ def __(
     umap_reduce,
 ):
     with mo.status.spinner("Clustering points...") as _s:
         embeddings_array = embeddings["text_embedding"].to_numpy()
         hdb_labels = cluster_points(
             embeddings_array,
@@ -189,7 +196,7 @@ def __(
         _s.update("Reducing dimensionality...")
         embeddings_2d = umap_reduce(embeddings_array, metric=metric_dropdown.value)
     mo.show_code()
-    return embeddings_2d, embeddings_array, hdb_labels
 @app.cell
@@ -274,19 +281,14 @@ def __():
     # Data manipulation and database connections
     import polars as pl
     import duckdb
-    import numba  # <- FYI, this module takes a while to load, be patient
     import pyarrow
     # Visualization
     import altair as alt
     # ML tools for dimensionality reduction and clustering
-    import umap  # For reducing high-dimensional embeddings to 2D
-    import hdbscan  # For clustering similar embeddings
     import numpy as np
-    from sklearn.decomposition import PCA
-    return PCA, alt, duckdb, hdbscan, np, numba, pl, pyarrow, umap
 if __name__ == "__main__":

 @app.cell
 def __():
     import marimo as mo
     return (mo,)
         > Text embeddings have become a crucial tool in AI/ML applications, allowing us to convert text into numerical vectors that capture semantic meaning. These vectors are often used for semantic search, but in this blog post, we'll explore how to visualize and explore text embeddings interactively using MotherDuck and marimo.
+        !!! Info
+            **This marimo application is the result of [this blog](https://motherduck.com/blog/MotherDuck-Visualize-Embeddings-Marimo/).** It is recommended to go through the blog first.
         """
     )
     return
 @app.cell
+def __(np):
     def umap_reduce(np_array, metric="cosine"):
         """
         Reduce the dimensionality of the embeddings to 2D using
         UMAP algorithm. UMAP preserves both local and global structure
         of the high-dimensional data.
         """
+        import umap
         reducer = umap.UMAP(
             n_components=2,  # Reduce to 2D for visualization
             metric=metric,  # Default: cosine similarity for text embeddings
         )
         return reducer.fit_transform(np_array)
     def cluster_points(np_array, min_cluster_size=4, max_cluster_size=50):
         """
         Cluster the embeddings using HDBSCAN algorithm.
         We first reduce dimensionality to 50D with PCA to speed up clustering,
         while still preserving most of the important information.
         """
+        import hdbscan
+        from sklearn.decomposition import PCA
         pca = PCA(n_components=50)
         np_array = pca.fit_transform(np_array)
         return np.where(
             hdb.labels_ == -1, "outlier", "cluster_" + hdb.labels_.astype(str)
         )
     return cluster_points, umap_reduce
     umap_reduce,
 ):
     with mo.status.spinner("Clustering points...") as _s:
+        import numba  # <- FYI, this module takes a while to load, be patient
         embeddings_array = embeddings["text_embedding"].to_numpy()
         hdb_labels = cluster_points(
             embeddings_array,
         _s.update("Reducing dimensionality...")
         embeddings_2d = umap_reduce(embeddings_array, metric=metric_dropdown.value)
     mo.show_code()
+    return embeddings_2d, embeddings_array, hdb_labels, numba
 @app.cell
     # Data manipulation and database connections
     import polars as pl
     import duckdb
     import pyarrow
     # Visualization
     import altair as alt
     # ML tools for dimensionality reduction and clustering
     import numpy as np
+    return alt, duckdb, np, pl, pyarrow
 if __name__ == "__main__":