Spaces:
Running
Running
Compressed -> Deduped column header
#4
by
erinys
HF staff
- opened
app.py
CHANGED
@@ -54,21 +54,18 @@ def process_dataset():
|
|
54 |
columns={
|
55 |
"type": "Repository Type",
|
56 |
"num_files": "Number of Files",
|
57 |
-
"total_size": "Total Size (
|
58 |
}
|
59 |
)
|
60 |
file_counts_and_sizes = file_counts_and_sizes.drop(columns=["Number of Files"])
|
61 |
|
62 |
# sort the dataframe by total size in descending order
|
63 |
file_counts_and_sizes = file_counts_and_sizes.sort_values(
|
64 |
-
by="Total Size (
|
65 |
)
|
66 |
|
67 |
# drop nas from the extension column
|
68 |
file_extensions = file_extensions.dropna(subset=["extension"])
|
69 |
-
file_extensions_by_month = file_extensions_by_month[
|
70 |
-
file_extensions_by_month["extension"] != ""
|
71 |
-
]
|
72 |
|
73 |
return (
|
74 |
repo_by_size_df,
|
@@ -91,19 +88,19 @@ def cumulative_growth_df(_df):
|
|
91 |
return cumulative_df
|
92 |
|
93 |
|
94 |
-
def compare_last_10_months(_cumulative_df,
|
95 |
last_10_months = _cumulative_df.tail(10).copy()
|
96 |
last_10_months["total"] = last_10_months.sum(axis=1)
|
97 |
last_10_months["total_change"] = last_10_months["total"].diff()
|
98 |
-
last_10_months["
|
99 |
-
|
100 |
)
|
101 |
last_10_months["savings"] = (
|
102 |
-
last_10_months["total_change"] - last_10_months["
|
103 |
)
|
104 |
|
105 |
last_10_months = format_dataframe_size_column(
|
106 |
-
last_10_months, ["total_change", "
|
107 |
)
|
108 |
|
109 |
last_10_months["date"] = _cumulative_df.tail(10).index
|
@@ -115,50 +112,50 @@ def compare_last_10_months(_cumulative_df, _cumulative_df_deduped):
|
|
115 |
last_10_months = last_10_months.drop(last_10_months.index[0])
|
116 |
# order the columns date, total, total_change
|
117 |
last_10_months = last_10_months[
|
118 |
-
["date", "total_change", "
|
119 |
]
|
120 |
# rename the columns
|
121 |
last_10_months = last_10_months.rename(
|
122 |
columns={
|
123 |
"date": "Date",
|
124 |
-
"total_change": "Month-to-Month Growth (
|
125 |
-
"
|
126 |
-
"savings": "Dedupe Savings (
|
127 |
}
|
128 |
)
|
129 |
return last_10_months
|
130 |
|
131 |
|
132 |
-
def tabular_analysis(repo_sizes, cumulative_df,
|
133 |
-
# create a new column in the repository sizes dataframe for "
|
134 |
-
repo_sizes["Deduped Size (
|
135 |
-
repo_sizes["Dedupe Savings (
|
136 |
|
137 |
for column in cumulative_df.columns:
|
138 |
cum_repo_size = cumulative_df[column].iloc[-1]
|
139 |
-
comp_repo_size =
|
140 |
repo_size_diff = cum_repo_size - comp_repo_size
|
141 |
repo_sizes.loc[
|
142 |
repo_sizes["Repository Type"] == column.capitalize(),
|
143 |
-
"Deduped Size (
|
144 |
] = comp_repo_size
|
145 |
repo_sizes.loc[
|
146 |
-
repo_sizes["Repository Type"] == column.capitalize(), "Dedupe Savings (
|
147 |
] = repo_size_diff
|
148 |
|
149 |
-
# add a row that sums the total size and
|
150 |
repo_sizes.loc["Total"] = repo_sizes.sum()
|
151 |
repo_sizes.loc["Total", "Repository Type"] = "Total"
|
152 |
return repo_sizes
|
153 |
|
154 |
|
155 |
-
def cumulative_growth_plot_analysis(cumulative_df,
|
156 |
"""
|
157 |
Calculates the cumulative growth of models, spaces, and datasets over time and generates a plot and dataframe from the analysis.
|
158 |
|
159 |
Args:
|
160 |
cumulative_df (DataFrame): The input dataframe containing the data.
|
161 |
-
|
162 |
|
163 |
Returns:
|
164 |
tuple: A tuple containing two elements:
|
@@ -192,11 +189,11 @@ def cumulative_growth_plot_analysis(cumulative_df, cumulative_df_deduped):
|
|
192 |
)
|
193 |
|
194 |
# Add a scatter trace for each type
|
195 |
-
for column in
|
196 |
fig.add_trace(
|
197 |
go.Scatter(
|
198 |
-
x=
|
199 |
-
y=
|
200 |
mode="lines",
|
201 |
name=column.capitalize() + " (File-Level Deduplication)",
|
202 |
line=dict(color=color_map.get(column, "black"), dash="dash"),
|
@@ -207,7 +204,7 @@ def cumulative_growth_plot_analysis(cumulative_df, cumulative_df_deduped):
|
|
207 |
fig.update_layout(
|
208 |
title="Cumulative Growth of Models, Spaces, and Datasets Over Time<br><sup>Dotted lines represent growth with file-level deduplication</sup>",
|
209 |
xaxis_title="Date",
|
210 |
-
yaxis_title="Cumulative Size (
|
211 |
legend_title="Type",
|
212 |
yaxis=dict(tickformat=".2f"), # Format y-axis labels to 2 decimal places
|
213 |
)
|
@@ -254,7 +251,7 @@ def cumulative_growth_single(_df):
|
|
254 |
fig.update_layout(
|
255 |
title="Cumulative Growth of Models, Spaces, and Datasets",
|
256 |
xaxis_title="Date",
|
257 |
-
yaxis_title="Size (
|
258 |
legend_title="Type",
|
259 |
yaxis=dict(tickformat=".2f"), # Format y-axis labels to 2 decimal places
|
260 |
)
|
@@ -280,9 +277,9 @@ def plot_total_sum(by_type_arr):
|
|
280 |
|
281 |
# Update layout
|
282 |
fig.update_layout(
|
283 |
-
title="Top 20 File Extensions by Total Size (in
|
284 |
xaxis_title="File Extension",
|
285 |
-
yaxis_title="Total Size (
|
286 |
yaxis=dict(tickformat=".2f"), # Format y-axis labels to 2 decimal places
|
287 |
colorway=px.colors.qualitative.Alphabet, # Use Plotly color palette
|
288 |
)
|
@@ -333,9 +330,9 @@ def filter_by_extension_month(_df, _extension):
|
|
333 |
|
334 |
# Update layout
|
335 |
fig.update_layout(
|
336 |
-
title="Monthly Additions of LFS Files by Extension (in
|
337 |
xaxis_title="Date",
|
338 |
-
yaxis_title="Size (
|
339 |
legend_title="Type",
|
340 |
yaxis=dict(tickformat=".2f"), # Format y-axis labels to 2 decimal places
|
341 |
)
|
@@ -350,11 +347,11 @@ def area_plot_by_extension_month(_df):
|
|
350 |
fig = px.area(_df, x="date", y="total_size", color="extension")
|
351 |
# Update layout
|
352 |
fig.update_layout(
|
353 |
-
title="File Extension Monthly Additions (in
|
354 |
xaxis_title="Date",
|
355 |
-
yaxis_title="Size (
|
356 |
legend_title="Type",
|
357 |
-
# format y-axis to be
|
358 |
yaxis=dict(tickformat=".2f"),
|
359 |
)
|
360 |
|
@@ -393,16 +390,16 @@ with gr.Blocks(theme="citrus") as demo:
|
|
393 |
|
394 |
# Convert year and month into a datetime column
|
395 |
df = month_year_to_date(df)
|
396 |
-
|
397 |
|
398 |
# Calculate the cumulative growth of models, spaces, and datasets over time
|
399 |
cumulative_df = cumulative_growth_df(df)
|
400 |
-
|
401 |
|
402 |
-
last_10_months = compare_last_10_months(cumulative_df,
|
403 |
|
404 |
by_repo_type_analysis = tabular_analysis(
|
405 |
-
by_repo_type, cumulative_df,
|
406 |
)
|
407 |
|
408 |
# Add top level heading and introduction text
|
@@ -431,15 +428,15 @@ with gr.Blocks(theme="citrus") as demo:
|
|
431 |
with gr.Column(scale=2):
|
432 |
gr.Markdown("### Current Storage Usage")
|
433 |
gr.Markdown(
|
434 |
-
"As of September 20, 2024, total files stored in Git LFS summed to almost 29 PB. To put this into perspective, the last [Common Crawl](https://commoncrawl.org/) download was [451
|
435 |
)
|
436 |
with gr.Column(scale=3):
|
437 |
# Convert the total size to petabytes and format to two decimal places
|
438 |
current_storage = format_dataframe_size_column(
|
439 |
by_repo_type_analysis,
|
440 |
-
["Total Size (
|
441 |
)
|
442 |
-
gr.Dataframe(current_storage[["Repository Type", "Total Size (
|
443 |
|
444 |
gr.HTML(div_px(25))
|
445 |
# File Extension analysis
|
@@ -448,23 +445,24 @@ with gr.Blocks(theme="citrus") as demo:
|
|
448 |
"What types of files are stored on the Hub? The Xet team's backend architecture allows for storage optimizations by file type, so seeing the breakdown of the most popular stored file types helps to prioritize our roadmap. The following sections filter the analysis to the top 20 file extensions stored (by bytes) using Git LFS. Taken together, these 20 file extensions account for 82% of the total bytes stored in LFS."
|
449 |
)
|
450 |
gr.Markdown(
|
451 |
-
"[Safetensors](https://huggingface.co/docs/safetensors/en/index) is quickly becoming the defacto standard on the Hub for storing tensor files, accounting for over
|
452 |
)
|
453 |
# Get the 22 largest file extensions by size
|
454 |
by_extension_size = by_extension.sort_values(by="size", ascending=False).head(22)
|
|
|
455 |
# make a bar chart of the by_extension_size dataframe
|
456 |
gr.Plot(plot_total_sum(by_extension_size[["extension", "size"]].values))
|
457 |
# drop the unnamed: 0 column
|
458 |
by_extension_size = by_extension_size.drop(columns=["Unnamed: 0"])
|
459 |
# average size
|
460 |
-
by_extension_size["Average File Size (
|
461 |
by_extension_size["size"].astype(float) / by_extension_size["count"]
|
462 |
)
|
463 |
-
by_extension_size["Average File Size (
|
464 |
-
by_extension_size["Average File Size (
|
465 |
)
|
466 |
-
by_extension_size["Average File Size (
|
467 |
-
"Average File Size (
|
468 |
].map("{:.2f}".format)
|
469 |
# format the size column
|
470 |
by_extension_size = format_dataframe_size_column(by_extension_size, ["size"])
|
@@ -473,7 +471,7 @@ with gr.Blocks(theme="citrus") as demo:
|
|
473 |
columns={
|
474 |
"extension": "File Extension",
|
475 |
"count": "Number of Files",
|
476 |
-
"size": "Total Size (
|
477 |
}
|
478 |
)
|
479 |
|
@@ -485,15 +483,15 @@ with gr.Blocks(theme="citrus") as demo:
|
|
485 |
by_extension_size[
|
486 |
[
|
487 |
"File Extension",
|
488 |
-
"Total Size (
|
489 |
"Number of Files",
|
490 |
-
"Average File Size (
|
491 |
]
|
492 |
]
|
493 |
)
|
494 |
|
495 |
gr.HTML(div_px(5))
|
496 |
-
gr.Markdown("### Storage Growth by File Extension (Monthly
|
497 |
gr.Markdown(
|
498 |
"The following area chart shows the number of bytes added to LFS storage each month, faceted by file extension."
|
499 |
)
|
@@ -501,7 +499,7 @@ with gr.Blocks(theme="citrus") as demo:
|
|
501 |
|
502 |
gr.HTML(div_px(5))
|
503 |
gr.Markdown(
|
504 |
-
"To dig deeper, use the dropdown to filter by file extension and see the bytes added (in
|
505 |
)
|
506 |
|
507 |
# get the unique values in the extension column and remove any empty strings
|
@@ -525,7 +523,9 @@ with gr.Blocks(theme="citrus") as demo:
|
|
525 |
gr.Markdown(
|
526 |
"The first improvement we can make to Hub storage is to add file-level deduplication. Since forking any Hub repository makes copies of the files, a scan of existing files unsurprisingly shows that some files match exactly. The following chart shows the storage growth chart from above with additional dashed lines showing the potential savings from deduplicating at the file level."
|
527 |
)
|
528 |
-
dedupe_fig = cumulative_growth_plot_analysis(
|
|
|
|
|
529 |
gr.Plot(dedupe_fig)
|
530 |
|
531 |
gr.HTML(div_px(5))
|
@@ -534,7 +534,7 @@ with gr.Blocks(theme="citrus") as demo:
|
|
534 |
with gr.Column(scale=1):
|
535 |
gr.Markdown("### Current Storage Usage + File-level Deduplication")
|
536 |
gr.Markdown(
|
537 |
-
"This simple change to the storage backend will save 3.24
|
538 |
)
|
539 |
with gr.Column(scale=3):
|
540 |
# Convert the total size to petabytes and format to two decimal places
|
@@ -545,7 +545,7 @@ with gr.Blocks(theme="citrus") as demo:
|
|
545 |
with gr.Column(scale=1):
|
546 |
gr.Markdown("### Month-to-Month Growth + File-level Deduplication")
|
547 |
gr.Markdown(
|
548 |
-
"This table shows month-to-month growth in model, dataset, and space storage. In 2024, the Hub has averaged nearly **2.3
|
549 |
)
|
550 |
with gr.Column(scale=3):
|
551 |
gr.Dataframe(last_10_months)
|
|
|
54 |
columns={
|
55 |
"type": "Repository Type",
|
56 |
"num_files": "Number of Files",
|
57 |
+
"total_size": "Total Size (PBs)",
|
58 |
}
|
59 |
)
|
60 |
file_counts_and_sizes = file_counts_and_sizes.drop(columns=["Number of Files"])
|
61 |
|
62 |
# sort the dataframe by total size in descending order
|
63 |
file_counts_and_sizes = file_counts_and_sizes.sort_values(
|
64 |
+
by="Total Size (PBs)", ascending=False
|
65 |
)
|
66 |
|
67 |
# drop nas from the extension column
|
68 |
file_extensions = file_extensions.dropna(subset=["extension"])
|
|
|
|
|
|
|
69 |
|
70 |
return (
|
71 |
repo_by_size_df,
|
|
|
88 |
return cumulative_df
|
89 |
|
90 |
|
91 |
+
def compare_last_10_months(_cumulative_df, _cumulative_df_compressed):
|
92 |
last_10_months = _cumulative_df.tail(10).copy()
|
93 |
last_10_months["total"] = last_10_months.sum(axis=1)
|
94 |
last_10_months["total_change"] = last_10_months["total"].diff()
|
95 |
+
last_10_months["compressed_change"] = (
|
96 |
+
_cumulative_df_compressed.tail(10).sum(axis=1).diff()
|
97 |
)
|
98 |
last_10_months["savings"] = (
|
99 |
+
last_10_months["total_change"] - last_10_months["compressed_change"]
|
100 |
)
|
101 |
|
102 |
last_10_months = format_dataframe_size_column(
|
103 |
+
last_10_months, ["total_change", "compressed_change", "savings"]
|
104 |
)
|
105 |
|
106 |
last_10_months["date"] = _cumulative_df.tail(10).index
|
|
|
112 |
last_10_months = last_10_months.drop(last_10_months.index[0])
|
113 |
# order the columns: date, total_change, compressed_change, savings
|
114 |
last_10_months = last_10_months[
|
115 |
+
["date", "total_change", "compressed_change", "savings"]
|
116 |
]
|
117 |
# rename the columns
|
118 |
last_10_months = last_10_months.rename(
|
119 |
columns={
|
120 |
"date": "Date",
|
121 |
+
"total_change": "Month-to-Month Growth (PBs)",
|
122 |
+
"compressed_change": "Growth with File-Level Deduplication (PBs)",
|
123 |
+
"savings": "Dedupe Savings (PBs)",
|
124 |
}
|
125 |
)
|
126 |
return last_10_months
|
127 |
|
128 |
|
129 |
+
def tabular_analysis(repo_sizes, cumulative_df, cumulative_df_compressed):
|
130 |
+
# create new columns in the repository sizes dataframe for the deduped size and the dedupe savings, and set them to empty at first
|
131 |
+
repo_sizes["Deduped Size (PBs)"] = ""
|
132 |
+
repo_sizes["Dedupe Savings (PBs)"] = ""
|
133 |
|
134 |
for column in cumulative_df.columns:
|
135 |
cum_repo_size = cumulative_df[column].iloc[-1]
|
136 |
+
comp_repo_size = cumulative_df_compressed[column].iloc[-1]
|
137 |
repo_size_diff = cum_repo_size - comp_repo_size
|
138 |
repo_sizes.loc[
|
139 |
repo_sizes["Repository Type"] == column.capitalize(),
|
140 |
+
"Deduped Size (PBs)",
|
141 |
] = comp_repo_size
|
142 |
repo_sizes.loc[
|
143 |
+
repo_sizes["Repository Type"] == column.capitalize(), "Dedupe Savings (PBs)"
|
144 |
] = repo_size_diff
|
145 |
|
146 |
+
# add a row that sums the total size and compressed size
|
147 |
repo_sizes.loc["Total"] = repo_sizes.sum()
|
148 |
repo_sizes.loc["Total", "Repository Type"] = "Total"
|
149 |
return repo_sizes
|
150 |
|
151 |
|
152 |
+
def cumulative_growth_plot_analysis(cumulative_df, cumulative_df_compressed):
|
153 |
"""
|
154 |
Calculates the cumulative growth of models, spaces, and datasets over time and generates a plot and dataframe from the analysis.
|
155 |
|
156 |
Args:
|
157 |
cumulative_df (DataFrame): The input dataframe containing the data.
|
158 |
+
cumulative_df_compressed (DataFrame): The input dataframe containing the compressed data.
|
159 |
|
160 |
Returns:
|
161 |
tuple: A tuple containing two elements:
|
|
|
189 |
)
|
190 |
|
191 |
# Add a scatter trace for each type
|
192 |
+
for column in cumulative_df_compressed.columns:
|
193 |
fig.add_trace(
|
194 |
go.Scatter(
|
195 |
+
x=cumulative_df_compressed.index,
|
196 |
+
y=cumulative_df_compressed[column] / 1e15, # Convert to petabytes
|
197 |
mode="lines",
|
198 |
name=column.capitalize() + " (File-Level Deduplication)",
|
199 |
line=dict(color=color_map.get(column, "black"), dash="dash"),
|
|
|
204 |
fig.update_layout(
|
205 |
title="Cumulative Growth of Models, Spaces, and Datasets Over Time<br><sup>Dotted lines represent growth with file-level deduplication</sup>",
|
206 |
xaxis_title="Date",
|
207 |
+
yaxis_title="Cumulative Size (PBs)",
|
208 |
legend_title="Type",
|
209 |
yaxis=dict(tickformat=".2f"), # Format y-axis labels to 2 decimal places
|
210 |
)
|
|
|
251 |
fig.update_layout(
|
252 |
title="Cumulative Growth of Models, Spaces, and Datasets",
|
253 |
xaxis_title="Date",
|
254 |
+
yaxis_title="Size (PBs)",
|
255 |
legend_title="Type",
|
256 |
yaxis=dict(tickformat=".2f"), # Format y-axis labels to 2 decimal places
|
257 |
)
|
|
|
277 |
|
278 |
# Update layout
|
279 |
fig.update_layout(
|
280 |
+
title="Top 20 File Extensions by Total Size (in PBs)",
|
281 |
xaxis_title="File Extension",
|
282 |
+
yaxis_title="Total Size (PBs)",
|
283 |
yaxis=dict(tickformat=".2f"), # Format y-axis labels to 2 decimal places
|
284 |
colorway=px.colors.qualitative.Alphabet, # Use Plotly color palette
|
285 |
)
|
|
|
330 |
|
331 |
# Update layout
|
332 |
fig.update_layout(
|
333 |
+
title="Monthly Additions of LFS Files by Extension (in TBs)",
|
334 |
xaxis_title="Date",
|
335 |
+
yaxis_title="Size (TBs)",
|
336 |
legend_title="Type",
|
337 |
yaxis=dict(tickformat=".2f"), # Format y-axis labels to 2 decimal places
|
338 |
)
|
|
|
347 |
fig = px.area(_df, x="date", y="total_size", color="extension")
|
348 |
# Update layout
|
349 |
fig.update_layout(
|
350 |
+
title="File Extension Monthly Additions (in PBs) Over Time",
|
351 |
xaxis_title="Date",
|
352 |
+
yaxis_title="Size (PBs)",
|
353 |
legend_title="Type",
|
354 |
+
# format y-axis to be PBs (currently bytes) with two decimal places
|
355 |
yaxis=dict(tickformat=".2f"),
|
356 |
)
|
357 |
|
|
|
390 |
|
391 |
# Convert year and month into a datetime column
|
392 |
df = month_year_to_date(df)
|
393 |
+
df_compressed = month_year_to_date(file_df)
|
394 |
|
395 |
# Calculate the cumulative growth of models, spaces, and datasets over time
|
396 |
cumulative_df = cumulative_growth_df(df)
|
397 |
+
cumulative_df_compressed = cumulative_growth_df(df_compressed)
|
398 |
|
399 |
+
last_10_months = compare_last_10_months(cumulative_df, cumulative_df_compressed)
|
400 |
|
401 |
by_repo_type_analysis = tabular_analysis(
|
402 |
+
by_repo_type, cumulative_df, cumulative_df_compressed
|
403 |
)
|
404 |
|
405 |
# Add top level heading and introduction text
|
|
|
428 |
with gr.Column(scale=2):
|
429 |
gr.Markdown("### Current Storage Usage")
|
430 |
gr.Markdown(
|
431 |
+
"As of September 20, 2024, total files stored in Git LFS summed to almost 29 PB. To put this into perspective, the last [Common Crawl](https://commoncrawl.org/) download was [451 TBs](https://github.com/commoncrawl/cc-crawl-statistics/blob/master/stats/crawler/CC-MAIN-2024-38.json#L31) - the Hub stores the equivalent of more than **64 Common Crawls** 🤯."
|
432 |
)
|
433 |
with gr.Column(scale=3):
|
434 |
# Convert the total size to petabytes and format to two decimal places
|
435 |
current_storage = format_dataframe_size_column(
|
436 |
by_repo_type_analysis,
|
437 |
+
["Total Size (PBs)", "Deduped Size (PBs)", "Dedupe Savings (PBs)"],
|
438 |
)
|
439 |
+
gr.Dataframe(current_storage[["Repository Type", "Total Size (PBs)"]])
|
440 |
|
441 |
gr.HTML(div_px(25))
|
442 |
# File Extension analysis
|
|
|
445 |
"What types of files are stored on the Hub? The Xet team's backend architecture allows for storage optimizations by file type, so seeing the breakdown of the most popular stored file types helps to prioritize our roadmap. The following sections filter the analysis to the top 20 file extensions stored (by bytes) using Git LFS. Taken together, these 20 file extensions account for 82% of the total bytes stored in LFS."
|
446 |
)
|
447 |
gr.Markdown(
|
448 |
+
"[Safetensors](https://huggingface.co/docs/safetensors/en/index) is quickly becoming the defacto standard on the Hub for storing tensor files, accounting for over 7PBs (25%) of LFS storage. [GGUF (GPT-Generated Unified Format)](https://huggingface.co/docs/hub/gguf), a format for storing tensor files with a different set of optimizations, is also on the rise, accounting for 3.2 PBs (11%) of LFS storage."
|
449 |
)
|
450 |
# Get the 22 largest file extensions by size
|
451 |
by_extension_size = by_extension.sort_values(by="size", ascending=False).head(22)
|
452 |
+
|
453 |
# make a bar chart of the by_extension_size dataframe
|
454 |
gr.Plot(plot_total_sum(by_extension_size[["extension", "size"]].values))
|
455 |
# drop the unnamed: 0 column
|
456 |
by_extension_size = by_extension_size.drop(columns=["Unnamed: 0"])
|
457 |
# average size
|
458 |
+
by_extension_size["Average File Size (MBs)"] = (
|
459 |
by_extension_size["size"].astype(float) / by_extension_size["count"]
|
460 |
)
|
461 |
+
by_extension_size["Average File Size (MBs)"] = (
|
462 |
+
by_extension_size["Average File Size (MBs)"] / 1e6
|
463 |
)
|
464 |
+
by_extension_size["Average File Size (MBs)"] = by_extension_size[
|
465 |
+
"Average File Size (MBs)"
|
466 |
].map("{:.2f}".format)
|
467 |
# format the size column
|
468 |
by_extension_size = format_dataframe_size_column(by_extension_size, ["size"])
|
|
|
471 |
columns={
|
472 |
"extension": "File Extension",
|
473 |
"count": "Number of Files",
|
474 |
+
"size": "Total Size (PBs)",
|
475 |
}
|
476 |
)
|
477 |
|
|
|
483 |
by_extension_size[
|
484 |
[
|
485 |
"File Extension",
|
486 |
+
"Total Size (PBs)",
|
487 |
"Number of Files",
|
488 |
+
"Average File Size (MBs)",
|
489 |
]
|
490 |
]
|
491 |
)
|
492 |
|
493 |
gr.HTML(div_px(5))
|
494 |
+
gr.Markdown("### Storage Growth by File Extension (Monthly PBs Added)")
|
495 |
gr.Markdown(
|
496 |
"The following area chart shows the number of bytes added to LFS storage each month, faceted by file extension."
|
497 |
)
|
|
|
499 |
|
500 |
gr.HTML(div_px(5))
|
501 |
gr.Markdown(
|
502 |
+
"To dig deeper, use the dropdown to filter by file extension and see the bytes added (in TBs) each month for specific file types."
|
503 |
)
|
504 |
|
505 |
# get the unique values in the extension column and remove any empty strings
|
|
|
523 |
gr.Markdown(
|
524 |
"The first improvement we can make to Hub storage is to add file-level deduplication. Since forking any Hub repository makes copies of the files, a scan of existing files unsurprisingly shows that some files match exactly. The following chart shows the storage growth chart from above with additional dashed lines showing the potential savings from deduplicating at the file level."
|
525 |
)
|
526 |
+
dedupe_fig = cumulative_growth_plot_analysis(
|
527 |
+
cumulative_df, cumulative_df_compressed
|
528 |
+
)
|
529 |
gr.Plot(dedupe_fig)
|
530 |
|
531 |
gr.HTML(div_px(5))
|
|
|
534 |
with gr.Column(scale=1):
|
535 |
gr.Markdown("### Current Storage Usage + File-level Deduplication")
|
536 |
gr.Markdown(
|
537 |
+
"This simple change to the storage backend will save 3.24 PBs (the equivalent of 7.2 Common Crawls)."
|
538 |
)
|
539 |
with gr.Column(scale=3):
|
540 |
# Convert the total size to petabytes and format to two decimal places
|
|
|
545 |
with gr.Column(scale=1):
|
546 |
gr.Markdown("### Month-to-Month Growth + File-level Deduplication")
|
547 |
gr.Markdown(
|
548 |
+
"This table shows month-to-month growth in model, dataset, and space storage. In 2024, the Hub has averaged nearly **2.3 PBs uploaded to Git LFS per month**. Deduplicating at the file level saves nearly 225 TB (half a Common Crawl) monthly."
|
549 |
)
|
550 |
with gr.Column(scale=3):
|
551 |
gr.Dataframe(last_10_months)
|