jsulz (HF staff) committed
Commit c8790f8 · 1 Parent(s): 001bb95

initial commit

Files changed (6)
  1. .gitignore +1 -0
  2. README.md +5 -7
  3. app.py +240 -0
  4. poetry.lock +0 -0
  5. pyproject.toml +19 -0
  6. requirements.txt +54 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__/
README.md CHANGED
@@ -1,13 +1,11 @@
  ---
- title: Cas Analysis
- emoji: 🐒
- colorFrom: gray
- colorTo: pink
+ title: CAS PoPs Analysis
+ emoji: 📉
+ colorFrom: pink
+ colorTo: red
  sdk: gradio
- sdk_version: 5.6.0
+ sdk_version: 5.3.0
  app_file: app.py
  pinned: false
  short_description: Visualize a day of global upload traffic on the Hub.
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,240 @@
+ # pylint: disable=no-member
+ import pandas as pd
+ import gradio as gr
+ import plotly.express as px
+ import plotly.graph_objects as go
+ import numpy as np
+
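+ # Load the pre-aggregated S3 upload logs and AWS region metadata from the
+ # xet-team dataset; hf:// paths are read through huggingface_hub's fsspec
+ # integration, so no manual download step is needed.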
+ s3_aggregation_df = pd.read_parquet(
+     "hf://datasets/xet-team/cas-pops-analysis-data/aggregated_s3_logs.parquet"
+ )
+ aws_regions = pd.read_parquet(
+     "hf://datasets/xet-team/cas-pops-analysis-data/regions.parquet"
+ )
+
+
+ sum_request_count = s3_aggregation_df["request_count"].sum()
+ sum_object_size = s3_aggregation_df["object_size"].sum()
+ n_unique_countries = s3_aggregation_df["country_code"].nunique()
+
+ unique_regions = list(s3_aggregation_df["region"].unique())
+ unique_countries = list(s3_aggregation_df["country_name"].unique())
+ all_regions_countries = unique_regions + unique_countries
+
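+ # Aggregate upload volume and request counts by AWS region, then express each
+ # region's share of the daily totals as formatted percentages.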
+ agg_by_region = (
+     s3_aggregation_df.groupby(["region"])[["object_size", "request_count"]]
+     .sum()
+     .reset_index()
+ )
+ agg_by_region["object_size_pct"] = (
+     agg_by_region["object_size"] / agg_by_region["object_size"].sum()
+ )
+ agg_by_region["request_count_pct"] = (
+     agg_by_region["request_count"] / agg_by_region["request_count"].sum()
+ )
+ agg_by_region["object_size_pct_fmt"] = agg_by_region["object_size_pct"].apply(
+     lambda x: f"{100*x:.2f}"
+ )
+ agg_by_region["request_pct_fmt"] = agg_by_region["request_count_pct"].apply(
+     lambda x: f"{100*x:.2f}"
+ )
+
+
+ def remap_radio_value(value):
+     """Map a radio-button label to its dataframe column name."""
+     return "object_size" if value == "Upload size" else "request_count"
+
+
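+ # Pareto chart: bars rank countries by the selected metric; a secondary-axis
+ # line tracks the cumulative percentage of the total, and a dotted reference
+ # line marks the 80% threshold.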
+ def pareto_chart(sort_by, global_filter="All"):
+     sort_by = remap_radio_value(sort_by)
+     title = sort_by.replace("_", " ").title()
+     _df = (
+         s3_aggregation_df.groupby(["country_code", "country_name", "region"])[sort_by]
+         .sum()
+         .reset_index()
+     )
+     if global_filter != "All":
+         if global_filter in unique_regions:
+             _df = _df[_df["region"] == global_filter]
+
+     _df = _df.sort_values(by=sort_by, ascending=False)
+     _df["cumulative_percentage"] = _df[sort_by].cumsum() / _df[sort_by].sum() * 100
+
+     _df = _df.head(20)
+     if global_filter != "All":
+         _df = _df.head(10)
+
+     fig = go.Figure()
+     fig.add_trace(
+         go.Bar(
+             x=_df["country_code"],
+             y=_df[sort_by],
+             name=title,
+             hovertext=_df["country_name"],
+         )
+     )
+     fig.add_trace(
+         go.Scatter(
+             x=_df["country_code"],
+             y=_df["cumulative_percentage"],
+             yaxis="y2",
+             name="Cumulative Percentage",
+             mode="lines+markers",
+         )
+     )
+
+     region = global_filter + " region" if global_filter != "All" else "All Regions"
+     # Update layout
+     if title == "Object Size":
+         title = "Uploaded Data (TB)"
+     else:
+         title = "Requests"
+     fig.update_layout(
+         title=f"Top {_df.shape[0]} Countries by Total {title} in {region}",
+         xaxis_title="Country ISO Code",
+         yaxis_title=title,
+         yaxis2=dict(title="Cumulative Percentage", overlaying="y", side="right"),
+         xaxis=dict(range=[-0.5, len(_df["country_code"]) - 0.5]),
+         legend=dict(orientation="h"),
+     )
+     fig.add_hline(
+         y=80,
+         line_dash="dot",
+         annotation_text="",
+         annotation_position="top right",
+         yref="y2",
+     )
+     return fig
+
+
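+ # Choropleth for a single hour of traffic. The color range is pinned to the
+ # min/max across the full day (within the active region filter) so colors stay
+ # comparable as the hour slider "animates" the map.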
+ def manually_animated_choropleth_filter(hour, df_column, global_filter):
+     df_column = remap_radio_value(df_column)
+     # The slider runs 1-24; the log data indexes hours 0-23.
+     hour = hour - 1
+     if global_filter != "All":
+         min_range = s3_aggregation_df[s3_aggregation_df["region"] == global_filter][
+             df_column
+         ].min()
+         max_range = s3_aggregation_df[s3_aggregation_df["region"] == global_filter][
+             df_column
+         ].max()
+     else:
+         min_range = s3_aggregation_df[df_column].min()
+         max_range = s3_aggregation_df[df_column].max()
+
+     _df = s3_aggregation_df[s3_aggregation_df["hour"] == hour]
+     if global_filter != "All":
+         if global_filter in unique_regions:
+             _df = _df[_df["region"] == global_filter]
+
+     title = df_column.replace("_", " ").title()
+     fig = px.choropleth(
+         data_frame=_df,
+         locations="country_code",
+         color=df_column,
+         color_continuous_scale=px.colors.sequential.Plasma,
+         projection="natural earth",
+         height=800,
+         hover_name="country_name",
+         hover_data=[df_column],
+         range_color=[min_range, max_range],
+     )
+     if title == "Object Size":
+         title = "Global Distribution of Uploaded Data (TB)"
+     else:
+         title = "Global Distribution of Requests"
+     fig.update_layout(
+         title_text=title,
+         geo=dict(showframe=False, showcoastlines=False),
+         margin=dict(l=0, r=0, t=0, b=0),
+     )
+     return fig
+
+
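+ # Build the Gradio UI: an hour slider, a metric radio, and a region dropdown
+ # drive both the choropleth map and the Pareto chart below.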
+ with gr.Blocks(theme="citrus", fill_width=False) as demo:
+
+     gr.Markdown(
+         """
+         # A Global Analysis of Hub Uploads
+         """
+     )
+
+     gr.HTML(
+         f"<div id='global' style='font-size:16px;color:var(--body-text-color)'><span style='background-color:#f59e0b;color:black;padding:2px'>{n_unique_countries}</span> countries developing, sending <span style='background-color:#f59e0b;color:black;padding:2px'>{sum_request_count:,}</span> upload requests, and pushing over <span style='background-color:#f59e0b;color:black;padding:2px'>{sum_object_size / 1e+12:.2f}TB</span> to the Hub in 24 hours.</div>"
+     )
+
+     gr.Markdown(
+         "Use the slider below to view the data by hour. Select `Upload size` to see the total uploaded size in bytes, or `Requests` to show the cumulative number of requests from each country."
+     )
+
+     gr.Markdown(
+         "Xet-backed storage uses a [content-addressable store (CAS)](https://en.wikipedia.org/wiki/Content-addressable_storage) as an integral part of its architecture. This enables efficient deduplication and optimized data storage, making it ideal for our needs. As we re-architect uploads and downloads on the Hub, we are inserting a CAS as the first stop for content distribution. To see how uploads are routed to each CAS cluster in our architecture, use the drop-down menu to filter by AWS region. For more details, check out our accompanying blog post."
+     )
+
+     with gr.Row():
+         with gr.Group():
+             with gr.Column(scale=1):
+                 hour = gr.Slider(minimum=1, step=1, maximum=24, label="Hour")
+                 with gr.Row():
+                     aggregate_by = gr.Radio(
+                         choices=["Upload size", "Requests"],
+                         value="Upload size",
+                         label="View by total upload size in bytes or cumulative requests from a country",
+                     )
+                     countries = gr.Dropdown(
+                         choices=["All"] + unique_regions,
+                         label="Filter by CAS AWS region",
+                         multiselect=False,
+                         value="All",
+                     )
+     choropleth_map = gr.Plot()
+
+     # Draw the map on load, and redraw it whenever the hour slider, the
+     # metric radio, or the region filter changes.
+     demo.load(
+         manually_animated_choropleth_filter,
+         inputs=[hour, aggregate_by, countries],
+         outputs=choropleth_map,
+     )
+     hour.change(
+         manually_animated_choropleth_filter,
+         inputs=[hour, aggregate_by, countries],
+         outputs=choropleth_map,
+         show_progress=False,
+     )
+     aggregate_by.change(
+         manually_animated_choropleth_filter,
+         inputs=[hour, aggregate_by, countries],
+         outputs=choropleth_map,
+         show_progress=False,
+     )
+     countries.change(
+         manually_animated_choropleth_filter,
+         inputs=[hour, aggregate_by, countries],
+         outputs=choropleth_map,
+         show_progress=False,
+     )
+
+     gr.Markdown(
+         "The Pareto chart below shows the top countries by upload size or request count, with a cumulative line indicating the percentage of total upload volume or requests represented by these countries. Like the map above, the values change as you filter by AWS region."
+     )
+
+     bar_chart = gr.Plot()
+     demo.load(
+         pareto_chart,
+         inputs=[aggregate_by, countries],
+         outputs=bar_chart,
+     )
+     aggregate_by.change(
+         pareto_chart,
+         inputs=[aggregate_by, countries],
+         outputs=bar_chart,
+         show_progress=False,
+     )
+     countries.change(
+         pareto_chart,
+         inputs=[aggregate_by, countries],
+         outputs=bar_chart,
+         show_progress=False,
+     )
+
+ demo.launch()
+
+ # TODO - add bandwidth slowdown
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,19 @@
+ [tool.poetry]
+ name = "cas-pops-analysis"
+ version = "0.1.0"
+ description = ""
+ authors = ["jsulz <[email protected]>"]
+ readme = "README.md"
+
+ [tool.poetry.dependencies]
+ python = "^3.12"
+ gradio = "^5.3.0"
+ pandas = "^2.2.3"
+ plotly = "^5.24.1"
+ pyarrow = "^17.0.0"
+ numpy = "^2.1.2"
+
+
+ [build-system]
+ requires = ["poetry-core"]
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,54 @@
+ aiofiles==23.2.1
+ annotated-types==0.7.0
+ anyio==4.6.2.post1
+ certifi==2024.8.30
+ charset-normalizer==3.4.0
+ click==8.1.7
+ colorama==0.4.6
+ fastapi==0.115.3
+ ffmpy==0.4.0
+ filelock==3.16.1
+ fsspec==2024.10.0
+ gradio-client==1.4.2
+ gradio==5.3.0
+ h11==0.14.0
+ httpcore==1.0.6
+ httpx==0.27.2
+ huggingface-hub==0.26.1
+ idna==3.10
+ jinja2==3.1.4
+ markdown-it-py==3.0.0
+ markupsafe==2.1.5
+ mdurl==0.1.2
+ numpy==2.1.2
+ orjson==3.10.9
+ packaging==24.1
+ pandas==2.2.3
+ pillow==10.4.0
+ plotly==5.24.1
+ pyarrow==17.0.0
+ pydantic-core==2.23.4
+ pydantic==2.9.2
+ pydub==0.25.1
+ pygments==2.18.0
+ python-dateutil==2.9.0.post0
+ python-multipart==0.0.12
+ pytz==2024.2
+ pyyaml==6.0.2
+ requests==2.32.3
+ rich==13.9.2
+ ruff==0.7.0
+ semantic-version==2.10.0
+ shellingham==1.5.4
+ six==1.16.0
+ sniffio==1.3.1
+ starlette==0.41.0
+ tenacity==9.0.0
+ tomlkit==0.12.0
+ tqdm==4.66.5
+ typer==0.12.5
+ typing-extensions==4.12.2
+ tzdata==2024.2
+ urllib3==2.2.3
+ uvicorn==0.32.0
+ websockets==12.0