jsulz (HF staff) committed
Commit c8790f8 · 1 Parent(s): 001bb95

initial commit

Files changed (6)
  1. .gitignore +1 -0
  2. README.md +5 -7
  3. app.py +240 -0
  4. poetry.lock +0 -0
  5. pyproject.toml +19 -0
  6. requirements.txt +54 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__/
README.md CHANGED
@@ -1,13 +1,11 @@
  ---
- title: Cas Analysis
- emoji: 🐒
- colorFrom: gray
- colorTo: pink
+ title: CAS PoPs Analysis
+ emoji: 📉
+ colorFrom: pink
+ colorTo: red
  sdk: gradio
- sdk_version: 5.6.0
+ sdk_version: 5.3.0
  app_file: app.py
  pinned: false
  short_description: Visualize a day of global upload traffic on the Hub.
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,240 @@
+ # pylint: disable=no-member
+ import pandas as pd
+ import gradio as gr
+ import plotly.express as px
+ import plotly.graph_objects as go
+ import numpy as np
+
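+ # Load the pre-aggregated S3 upload logs and AWS region metadata from the
+ # xet-team dataset; hf:// paths are read through huggingface_hub's fsspec
+ # integration, so no manual download step is needed.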
+ s3_aggregation_df = pd.read_parquet(
+     "hf://datasets/xet-team/cas-pops-analysis-data/aggregated_s3_logs.parquet"
+ )
+ aws_regions = pd.read_parquet(
+     "hf://datasets/xet-team/cas-pops-analysis-data/regions.parquet"
+ )
+
+
+ sum_request_count = s3_aggregation_df["request_count"].sum()
+ sum_object_size = s3_aggregation_df["object_size"].sum()
+ n_unique_countries = s3_aggregation_df["country_code"].nunique()
+
+ unique_regions = list(s3_aggregation_df["region"].unique())
+ unique_countries = list(s3_aggregation_df["country_name"].unique())
+ all_regions_countries = unique_regions + unique_countries
+
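+ # Aggregate upload volume and request counts by AWS region, then express each
+ # region's share of the daily totals as formatted percentages.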
+ agg_by_region = (
+     s3_aggregation_df.groupby(["region"])[["object_size", "request_count"]]
+     .sum()
+     .reset_index()
+ )
+ agg_by_region["object_size_pct"] = (
+     agg_by_region["object_size"] / agg_by_region["object_size"].sum()
+ )
+ agg_by_region["request_count_pct"] = (
+     agg_by_region["request_count"] / agg_by_region["request_count"].sum()
+ )
+ agg_by_region["object_size_pct_fmt"] = agg_by_region["object_size_pct"].apply(
+     lambda x: f"{100*x:.2f}"
+ )
+ agg_by_region["request_pct_fmt"] = agg_by_region["request_count_pct"].apply(
+     lambda x: f"{100*x:.2f}"
+ )
+
+
+ def remap_radio_value(value):
+     """Map a radio-button label to its dataframe column name."""
+     return "object_size" if value == "Upload size" else "request_count"
+
+
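+ # Pareto chart: bars rank countries by the selected metric; a secondary-axis
+ # line tracks the cumulative percentage of the total, and a dotted reference
+ # line marks the 80% threshold.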
+ def pareto_chart(sort_by, global_filter="All"):
+     sort_by = remap_radio_value(sort_by)
+     title = sort_by.replace("_", " ").title()
+     _df = (
+         s3_aggregation_df.groupby(["country_code", "country_name", "region"])[sort_by]
+         .sum()
+         .reset_index()
+     )
+     if global_filter != "All":
+         if global_filter in unique_regions:
+             _df = _df[_df["region"] == global_filter]
+
+     _df = _df.sort_values(by=sort_by, ascending=False)
+     _df["cumulative_percentage"] = _df[sort_by].cumsum() / _df[sort_by].sum() * 100
+
+     _df = _df.head(20)
+     if global_filter != "All":
+         _df = _df.head(10)
+
+     fig = go.Figure()
+     fig.add_trace(
+         go.Bar(
+             x=_df["country_code"],
+             y=_df[sort_by],
+             name=title,
+             hovertext=_df["country_name"],
+         )
+     )
+     fig.add_trace(
+         go.Scatter(
+             x=_df["country_code"],
+             y=_df["cumulative_percentage"],
+             yaxis="y2",
+             name="Cumulative Percentage",
+             mode="lines+markers",
+         )
+     )
+
+     region = global_filter + " region" if global_filter != "All" else "All Regions"
+     # Update layout
+     if title == "Object Size":
+         title = "Uploaded Data (TB)"
+     else:
+         title = "Requests"
+     fig.update_layout(
+         title=f"Top {_df.shape[0]} Countries by Total {title} in {region}",
+         xaxis_title="Country ISO Code",
+         yaxis_title=title,
+         yaxis2=dict(title="Cumulative Percentage", overlaying="y", side="right"),
+         xaxis=dict(range=[-0.5, len(_df["country_code"]) - 0.5]),
+         legend=dict(orientation="h"),
+     )
+     fig.add_hline(
+         y=80,
+         line_dash="dot",
+         annotation_text="",
+         annotation_position="top right",
+         yref="y2",
+     )
+     return fig
+
+
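+ # Choropleth for a single hour of traffic. The color range is pinned to the
+ # min/max across the full day (within the active region filter) so colors stay
+ # comparable as the hour slider "animates" the map.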
+ def manually_animated_choropleth_filter(hour, df_column, global_filter):
+     df_column = remap_radio_value(df_column)
+     # The slider runs 1-24; the log data indexes hours 0-23.
+     hour = hour - 1
+     if global_filter != "All":
+         min_range = s3_aggregation_df[s3_aggregation_df["region"] == global_filter][
+             df_column
+         ].min()
+         max_range = s3_aggregation_df[s3_aggregation_df["region"] == global_filter][
+             df_column
+         ].max()
+     else:
+         min_range = s3_aggregation_df[df_column].min()
+         max_range = s3_aggregation_df[df_column].max()
+
+     _df = s3_aggregation_df[s3_aggregation_df["hour"] == hour]
+     if global_filter != "All":
+         if global_filter in unique_regions:
+             _df = _df[_df["region"] == global_filter]
+
+     title = df_column.replace("_", " ").title()
+     fig = px.choropleth(
+         data_frame=_df,
+         locations="country_code",
+         color=df_column,
+         color_continuous_scale=px.colors.sequential.Plasma,
+         projection="natural earth",
+         height=800,
+         hover_name="country_name",
+         hover_data=[df_column],
+         range_color=[min_range, max_range],
+     )
+     if title == "Object Size":
+         title = "Global Distribution of Uploaded Data (TB)"
+     else:
+         title = "Global Distribution of Requests"
+     fig.update_layout(
+         title_text=title,
+         geo=dict(showframe=False, showcoastlines=False),
+         margin=dict(l=0, r=0, t=0, b=0),
+     )
+     return fig
+
+
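+ # Build the Gradio UI: an hour slider, a metric radio, and a region dropdown
+ # drive both the choropleth map and the Pareto chart below.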
+ with gr.Blocks(theme="citrus", fill_width=False) as demo:
+
+     gr.Markdown(
+         """
+         # A Global Analysis of Hub Uploads
+         """
+     )
+
+     gr.HTML(
+         f"<div id='global' style='font-size:16px;color:var(--body-text-color)'><span style='background-color:#f59e0b;color:black;padding:2px'>{n_unique_countries}</span> countries developing, sending <span style='background-color:#f59e0b;color:black;padding:2px'>{sum_request_count:,}</span> upload requests, and pushing over <span style='background-color:#f59e0b;color:black;padding:2px'>{sum_object_size / 1e+12:.2f}TB</span> to the Hub in 24 hours.</div>"
+     )
+
+     gr.Markdown(
+         "Use the slider below to view the data by hour. Select `Upload size` to see the total uploaded size in bytes, or `Requests` to show the cumulative number of requests from each country."
+     )
+
+     gr.Markdown(
+         "Xet-backed storage uses a [content-addressable store (CAS)](https://en.wikipedia.org/wiki/Content-addressable_storage) as an integral part of its architecture. This enables efficient deduplication and optimized data storage, making it ideal for our needs. As we re-architect uploads and downloads on the Hub, we are inserting a CAS as the first stop for content distribution. To see how uploads are routed to each CAS cluster in our architecture, use the drop-down menu to filter by AWS region. For more details, check out our accompanying blog post."
+     )
+
+     with gr.Row():
+         with gr.Group():
+             with gr.Column(scale=1):
+                 hour = gr.Slider(minimum=1, step=1, maximum=24, label="Hour")
+                 with gr.Row():
+                     aggregate_by = gr.Radio(
+                         choices=["Upload size", "Requests"],
+                         value="Upload size",
+                         label="View by total upload size in bytes or cumulative requests from a country",
+                     )
+                     countries = gr.Dropdown(
+                         choices=["All"] + unique_regions,
+                         label="Filter by CAS AWS region",
+                         multiselect=False,
+                         value="All",
+                     )
+     choropleth_map = gr.Plot()
+
+     # Draw the map on load, and redraw it whenever the hour slider, the
+     # metric radio, or the region filter changes.
+     demo.load(
+         manually_animated_choropleth_filter,
+         inputs=[hour, aggregate_by, countries],
+         outputs=choropleth_map,
+     )
+     hour.change(
+         manually_animated_choropleth_filter,
+         inputs=[hour, aggregate_by, countries],
+         outputs=choropleth_map,
+         show_progress=False,
+     )
+     aggregate_by.change(
+         manually_animated_choropleth_filter,
+         inputs=[hour, aggregate_by, countries],
+         outputs=choropleth_map,
+         show_progress=False,
+     )
+     countries.change(
+         manually_animated_choropleth_filter,
+         inputs=[hour, aggregate_by, countries],
+         outputs=choropleth_map,
+         show_progress=False,
+     )
+
+     gr.Markdown(
+         "The Pareto chart below shows the top countries by upload size or request count, with a cumulative line indicating the percentage of total upload volume or requests represented by these countries. Like the map above, the values change as you filter by AWS region."
+     )
+
+     bar_chart = gr.Plot()
+     demo.load(
+         pareto_chart,
+         inputs=[aggregate_by, countries],
+         outputs=bar_chart,
+     )
+     aggregate_by.change(
+         pareto_chart,
+         inputs=[aggregate_by, countries],
+         outputs=bar_chart,
+         show_progress=False,
+     )
+     countries.change(
+         pareto_chart,
+         inputs=[aggregate_by, countries],
+         outputs=bar_chart,
+         show_progress=False,
+     )
+
+ demo.launch()
+
+ # TODO - add bandwidth slowdown
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,19 @@
+ [tool.poetry]
+ name = "cas-pops-analysis"
+ version = "0.1.0"
+ description = ""
+ authors = ["jsulz <[email protected]>"]
+ readme = "README.md"
+
+ [tool.poetry.dependencies]
+ python = "^3.12"
+ gradio = "^5.3.0"
+ pandas = "^2.2.3"
+ plotly = "^5.24.1"
+ pyarrow = "^17.0.0"
+ numpy = "^2.1.2"
+
+
+ [build-system]
+ requires = ["poetry-core"]
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,54 @@
+ aiofiles==23.2.1
+ annotated-types==0.7.0
+ anyio==4.6.2.post1
+ certifi==2024.8.30
+ charset-normalizer==3.4.0
+ click==8.1.7
+ colorama==0.4.6
+ fastapi==0.115.3
+ ffmpy==0.4.0
+ filelock==3.16.1
+ fsspec==2024.10.0
+ gradio-client==1.4.2
+ gradio==5.3.0
+ h11==0.14.0
+ httpcore==1.0.6
+ httpx==0.27.2
+ huggingface-hub==0.26.1
+ idna==3.10
+ jinja2==3.1.4
+ markdown-it-py==3.0.0
+ markupsafe==2.1.5
+ mdurl==0.1.2
+ numpy==2.1.2
+ orjson==3.10.9
+ packaging==24.1
+ pandas==2.2.3
+ pillow==10.4.0
+ plotly==5.24.1
+ pyarrow==17.0.0
+ pydantic-core==2.23.4
+ pydantic==2.9.2
+ pydub==0.25.1
+ pygments==2.18.0
+ python-dateutil==2.9.0.post0
+ python-multipart==0.0.12
+ pytz==2024.2
+ pyyaml==6.0.2
+ requests==2.32.3
+ rich==13.9.2
+ ruff==0.7.0
+ semantic-version==2.10.0
+ shellingham==1.5.4
+ six==1.16.0
+ sniffio==1.3.1
+ starlette==0.41.0
+ tenacity==9.0.0
+ tomlkit==0.12.0
+ tqdm==4.66.5
+ typer==0.12.5
+ typing-extensions==4.12.2
+ tzdata==2024.2
+ urllib3==2.2.3
+ uvicorn==0.32.0
+ websockets==12.0