Compressed -> Deduped column header

#4
by erinys HF staff - opened
Files changed (1) hide show
  1. app.py +57 -57
app.py CHANGED
@@ -54,21 +54,18 @@ def process_dataset():
54
  columns={
55
  "type": "Repository Type",
56
  "num_files": "Number of Files",
57
- "total_size": "Total Size (PB)",
58
  }
59
  )
60
  file_counts_and_sizes = file_counts_and_sizes.drop(columns=["Number of Files"])
61
 
62
  # sort the dataframe by total size in descending order
63
  file_counts_and_sizes = file_counts_and_sizes.sort_values(
64
- by="Total Size (PB)", ascending=False
65
  )
66
 
67
  # drop nas from the extension column
68
  file_extensions = file_extensions.dropna(subset=["extension"])
69
- file_extensions_by_month = file_extensions_by_month[
70
- file_extensions_by_month["extension"] != ""
71
- ]
72
 
73
  return (
74
  repo_by_size_df,
@@ -91,19 +88,19 @@ def cumulative_growth_df(_df):
91
  return cumulative_df
92
 
93
 
94
- def compare_last_10_months(_cumulative_df, _cumulative_df_deduped):
95
  last_10_months = _cumulative_df.tail(10).copy()
96
  last_10_months["total"] = last_10_months.sum(axis=1)
97
  last_10_months["total_change"] = last_10_months["total"].diff()
98
- last_10_months["deduped_change"] = (
99
- _cumulative_df_deduped.tail(10).sum(axis=1).diff()
100
  )
101
  last_10_months["savings"] = (
102
- last_10_months["total_change"] - last_10_months["deduped_change"]
103
  )
104
 
105
  last_10_months = format_dataframe_size_column(
106
- last_10_months, ["total_change", "deduped_change", "savings"]
107
  )
108
 
109
  last_10_months["date"] = _cumulative_df.tail(10).index
@@ -115,50 +112,50 @@ def compare_last_10_months(_cumulative_df, _cumulative_df_deduped):
115
  last_10_months = last_10_months.drop(last_10_months.index[0])
116
  # order the columns date, total, total_change
117
  last_10_months = last_10_months[
118
- ["date", "total_change", "deduped_change", "savings"]
119
  ]
120
  # rename the columns
121
  last_10_months = last_10_months.rename(
122
  columns={
123
  "date": "Date",
124
- "total_change": "Month-to-Month Growth (PB)",
125
- "deduped_change": "Growth with File-Level Deduplication (PB)",
126
- "savings": "Dedupe Savings (PB)",
127
  }
128
  )
129
  return last_10_months
130
 
131
 
132
- def tabular_analysis(repo_sizes, cumulative_df, cumulative_df_deduped):
133
- # create a new column in the repository sizes dataframe for "deduped size" and set it to empty at first
134
- repo_sizes["Deduped Size (PB)"] = ""
135
- repo_sizes["Dedupe Savings (PB)"] = ""
136
 
137
  for column in cumulative_df.columns:
138
  cum_repo_size = cumulative_df[column].iloc[-1]
139
- comp_repo_size = cumulative_df_deduped[column].iloc[-1]
140
  repo_size_diff = cum_repo_size - comp_repo_size
141
  repo_sizes.loc[
142
  repo_sizes["Repository Type"] == column.capitalize(),
143
- "Deduped Size (PB)",
144
  ] = comp_repo_size
145
  repo_sizes.loc[
146
- repo_sizes["Repository Type"] == column.capitalize(), "Dedupe Savings (PB)"
147
  ] = repo_size_diff
148
 
149
- # add a row that sums the total size and deduped size
150
  repo_sizes.loc["Total"] = repo_sizes.sum()
151
  repo_sizes.loc["Total", "Repository Type"] = "Total"
152
  return repo_sizes
153
 
154
 
155
- def cumulative_growth_plot_analysis(cumulative_df, cumulative_df_deduped):
156
  """
157
  Calculates the cumulative growth of models, spaces, and datasets over time and generates a plot and dataframe from the analysis.
158
 
159
  Args:
160
  df (DataFrame): The input dataframe containing the data.
161
- df_deduped (DataFrame): The input dataframe containing the deduped data.
162
 
163
  Returns:
164
  tuple: A tuple containing two elements:
@@ -192,11 +189,11 @@ def cumulative_growth_plot_analysis(cumulative_df, cumulative_df_deduped):
192
  )
193
 
194
  # Add a scatter trace for each type
195
- for column in cumulative_df_deduped.columns:
196
  fig.add_trace(
197
  go.Scatter(
198
- x=cumulative_df_deduped.index,
199
- y=cumulative_df_deduped[column] / 1e15, # Convert to petabytes
200
  mode="lines",
201
  name=column.capitalize() + " (File-Level Deduplication)",
202
  line=dict(color=color_map.get(column, "black"), dash="dash"),
@@ -207,7 +204,7 @@ def cumulative_growth_plot_analysis(cumulative_df, cumulative_df_deduped):
207
  fig.update_layout(
208
  title="Cumulative Growth of Models, Spaces, and Datasets Over Time<br><sup>Dotted lines represent growth with file-level deduplication</sup>",
209
  xaxis_title="Date",
210
- yaxis_title="Cumulative Size (PB)",
211
  legend_title="Type",
212
  yaxis=dict(tickformat=".2f"), # Format y-axis labels to 2 decimal places
213
  )
@@ -254,7 +251,7 @@ def cumulative_growth_single(_df):
254
  fig.update_layout(
255
  title="Cumulative Growth of Models, Spaces, and Datasets",
256
  xaxis_title="Date",
257
- yaxis_title="Size (PB)",
258
  legend_title="Type",
259
  yaxis=dict(tickformat=".2f"), # Format y-axis labels to 2 decimal places
260
  )
@@ -280,9 +277,9 @@ def plot_total_sum(by_type_arr):
280
 
281
  # Update layout
282
  fig.update_layout(
283
- title="Top 20 File Extensions by Total Size (in PB)",
284
  xaxis_title="File Extension",
285
- yaxis_title="Total Size (PB)",
286
  yaxis=dict(tickformat=".2f"), # Format y-axis labels to 2 decimal places
287
  colorway=px.colors.qualitative.Alphabet, # Use Plotly color palette
288
  )
@@ -333,9 +330,9 @@ def filter_by_extension_month(_df, _extension):
333
 
334
  # Update layout
335
  fig.update_layout(
336
- title="Monthly Additions of LFS Files by Extension (in TB)",
337
  xaxis_title="Date",
338
- yaxis_title="Size (TB)",
339
  legend_title="Type",
340
  yaxis=dict(tickformat=".2f"), # Format y-axis labels to 2 decimal places
341
  )
@@ -350,11 +347,11 @@ def area_plot_by_extension_month(_df):
350
  fig = px.area(_df, x="date", y="total_size", color="extension")
351
  # Update layout
352
  fig.update_layout(
353
- title="File Extension Monthly Additions (in PB) Over Time",
354
  xaxis_title="Date",
355
- yaxis_title="Size (PB)",
356
  legend_title="Type",
357
- # format y-axis to be PB (currently bytes) with two decimal places
358
  yaxis=dict(tickformat=".2f"),
359
  )
360
 
@@ -393,16 +390,16 @@ with gr.Blocks(theme="citrus") as demo:
393
 
394
  # Convert year and month into a datetime column
395
  df = month_year_to_date(df)
396
- df_deduped = month_year_to_date(file_df)
397
 
398
  # Calculate the cumulative growth of models, spaces, and datasets over time
399
  cumulative_df = cumulative_growth_df(df)
400
- cumulative_df_deduped = cumulative_growth_df(df_deduped)
401
 
402
- last_10_months = compare_last_10_months(cumulative_df, cumulative_df_deduped)
403
 
404
  by_repo_type_analysis = tabular_analysis(
405
- by_repo_type, cumulative_df, cumulative_df_deduped
406
  )
407
 
408
  # Add top level heading and introduction text
@@ -431,15 +428,15 @@ with gr.Blocks(theme="citrus") as demo:
431
  with gr.Column(scale=2):
432
  gr.Markdown("### Current Storage Usage")
433
  gr.Markdown(
434
- "As of September 20, 2024, total files stored in Git LFS summed to almost 29 PB. To put this into perspective, the last [Common Crawl](https://commoncrawl.org/) download was [451 TB](https://github.com/commoncrawl/cc-crawl-statistics/blob/master/stats/crawler/CC-MAIN-2024-38.json#L31) - the Hub stores the equivalent of more than **64 Common Crawls** 🤯."
435
  )
436
  with gr.Column(scale=3):
437
  # Convert the total size to petabytes and format to two decimal places
438
  current_storage = format_dataframe_size_column(
439
  by_repo_type_analysis,
440
- ["Total Size (PB)", "Deduped Size (PB)", "Dedupe Savings (PB)"],
441
  )
442
- gr.Dataframe(current_storage[["Repository Type", "Total Size (PB)"]])
443
 
444
  gr.HTML(div_px(25))
445
  # File Extension analysis
@@ -448,23 +445,24 @@ with gr.Blocks(theme="citrus") as demo:
448
  "What types of files are stored on the Hub? The Xet team's backend architecture allows for storage optimizations by file type, so seeing the breakdown of the most popular stored file types helps to prioritize our roadmap. The following sections filter the analysis to the top 20 file extensions stored (by bytes) using Git LFS. Taken together, these 20 file extensions account for 82% of the total bytes stored in LFS."
449
  )
450
  gr.Markdown(
451
- "[Safetensors](https://huggingface.co/docs/safetensors/en/index) is quickly becoming the defacto standard on the Hub for storing tensor files, accounting for over 7PB (25%) of LFS storage. [GGUF (GPT-Generated Unified Format)](https://huggingface.co/docs/hub/gguf), a format for storing tensor files with a different set of optimizations, is also on the rise, accounting for 3.2 PB (11%) of LFS storage."
452
  )
453
  # Get the top 10 file extensions by size
454
  by_extension_size = by_extension.sort_values(by="size", ascending=False).head(22)
 
455
  # make a bar chart of the by_extension_size dataframe
456
  gr.Plot(plot_total_sum(by_extension_size[["extension", "size"]].values))
457
  # drop the unnamed: 0 column
458
  by_extension_size = by_extension_size.drop(columns=["Unnamed: 0"])
459
  # average size
460
- by_extension_size["Average File Size (MB)"] = (
461
  by_extension_size["size"].astype(float) / by_extension_size["count"]
462
  )
463
- by_extension_size["Average File Size (MB)"] = (
464
- by_extension_size["Average File Size (MB)"] / 1e6
465
  )
466
- by_extension_size["Average File Size (MB)"] = by_extension_size[
467
- "Average File Size (MB)"
468
  ].map("{:.2f}".format)
469
  # format the size column
470
  by_extension_size = format_dataframe_size_column(by_extension_size, ["size"])
@@ -473,7 +471,7 @@ with gr.Blocks(theme="citrus") as demo:
473
  columns={
474
  "extension": "File Extension",
475
  "count": "Number of Files",
476
- "size": "Total Size (PB)",
477
  }
478
  )
479
 
@@ -485,15 +483,15 @@ with gr.Blocks(theme="citrus") as demo:
485
  by_extension_size[
486
  [
487
  "File Extension",
488
- "Total Size (PB)",
489
  "Number of Files",
490
- "Average File Size (MB)",
491
  ]
492
  ]
493
  )
494
 
495
  gr.HTML(div_px(5))
496
- gr.Markdown("### Storage Growth by File Extension (Monthly PB Added)")
497
  gr.Markdown(
498
  "The following area chart shows the number of bytes added to LFS storage each month, faceted by file extension."
499
  )
@@ -501,7 +499,7 @@ with gr.Blocks(theme="citrus") as demo:
501
 
502
  gr.HTML(div_px(5))
503
  gr.Markdown(
504
- "To dig deeper, use the dropdown to filter by file extension and see the bytes added (in TB) each month for specific file types."
505
  )
506
 
507
  # get the unique values in the extension column and remove any empty strings
@@ -525,7 +523,9 @@ with gr.Blocks(theme="citrus") as demo:
525
  gr.Markdown(
526
  "The first improvement we can make to Hub storage is to add file-level deduplication. Since forking any Hub repository makes copies of the files, a scan of existing files unsurprisingly shows that some files match exactly. The following chart shows the storage growth chart from above with additional dashed lines showing the potential savings from deduplicating at the file level."
527
  )
528
- dedupe_fig = cumulative_growth_plot_analysis(cumulative_df, cumulative_df_deduped)
 
 
529
  gr.Plot(dedupe_fig)
530
 
531
  gr.HTML(div_px(5))
@@ -534,7 +534,7 @@ with gr.Blocks(theme="citrus") as demo:
534
  with gr.Column(scale=1):
535
  gr.Markdown("### Current Storage Usage + File-level Deduplication")
536
  gr.Markdown(
537
- "This simple change to the storage backend will save 3.24 PB (the equivalent of 7.2 Common Crawls)."
538
  )
539
  with gr.Column(scale=3):
540
  # Convert the total size to petabytes and format to two decimal places
@@ -545,7 +545,7 @@ with gr.Blocks(theme="citrus") as demo:
545
  with gr.Column(scale=1):
546
  gr.Markdown("### Month-to-Month Growth + File-level Deduplication")
547
  gr.Markdown(
548
- "This table shows month-to-month growth in model, dataset, and space storage. In 2024, the Hub has averaged nearly **2.3 PB uploaded to Git LFS per month**. Deduplicating at the file level saves nearly 225 TB (half a Common Crawl) monthly."
549
  )
550
  with gr.Column(scale=3):
551
  gr.Dataframe(last_10_months)
 
54
  columns={
55
  "type": "Repository Type",
56
  "num_files": "Number of Files",
57
+ "total_size": "Total Size (PBs)",
58
  }
59
  )
60
  file_counts_and_sizes = file_counts_and_sizes.drop(columns=["Number of Files"])
61
 
62
  # sort the dataframe by total size in descending order
63
  file_counts_and_sizes = file_counts_and_sizes.sort_values(
64
+ by="Total Size (PBs)", ascending=False
65
  )
66
 
67
  # drop nas from the extension column
68
  file_extensions = file_extensions.dropna(subset=["extension"])
 
 
 
69
 
70
  return (
71
  repo_by_size_df,
 
88
  return cumulative_df
89
 
90
 
91
+ def compare_last_10_months(_cumulative_df, _cumulative_df_compressed):
92
  last_10_months = _cumulative_df.tail(10).copy()
93
  last_10_months["total"] = last_10_months.sum(axis=1)
94
  last_10_months["total_change"] = last_10_months["total"].diff()
95
+ last_10_months["compressed_change"] = (
96
+ _cumulative_df_compressed.tail(10).sum(axis=1).diff()
97
  )
98
  last_10_months["savings"] = (
99
+ last_10_months["total_change"] - last_10_months["compressed_change"]
100
  )
101
 
102
  last_10_months = format_dataframe_size_column(
103
+ last_10_months, ["total_change", "compressed_change", "savings"]
104
  )
105
 
106
  last_10_months["date"] = _cumulative_df.tail(10).index
 
112
  last_10_months = last_10_months.drop(last_10_months.index[0])
113
  # order the columns date, total, total_change
114
  last_10_months = last_10_months[
115
+ ["date", "total_change", "compressed_change", "savings"]
116
  ]
117
  # rename the columns
118
  last_10_months = last_10_months.rename(
119
  columns={
120
  "date": "Date",
121
+ "total_change": "Month-to-Month Growth (PBs)",
122
+ "compressed_change": "Growth with File-Level Deduplication (PBs)",
123
+ "savings": "Dedupe Savings (PBs)",
124
  }
125
  )
126
  return last_10_months
127
 
128
 
129
+ def tabular_analysis(repo_sizes, cumulative_df, cumulative_df_compressed):
130
+ # create a new column in the repository sizes dataframe for "compressed size" and set it to empty at first
131
+ repo_sizes["Deduped Size (PBs)"] = ""
132
+ repo_sizes["Dedupe Savings (PBs)"] = ""
133
 
134
  for column in cumulative_df.columns:
135
  cum_repo_size = cumulative_df[column].iloc[-1]
136
+ comp_repo_size = cumulative_df_compressed[column].iloc[-1]
137
  repo_size_diff = cum_repo_size - comp_repo_size
138
  repo_sizes.loc[
139
  repo_sizes["Repository Type"] == column.capitalize(),
140
+ "Deduped Size (PBs)",
141
  ] = comp_repo_size
142
  repo_sizes.loc[
143
+ repo_sizes["Repository Type"] == column.capitalize(), "Dedupe Savings (PBs)"
144
  ] = repo_size_diff
145
 
146
+ # add a row that sums the total size and compressed size
147
  repo_sizes.loc["Total"] = repo_sizes.sum()
148
  repo_sizes.loc["Total", "Repository Type"] = "Total"
149
  return repo_sizes
150
 
151
 
152
+ def cumulative_growth_plot_analysis(cumulative_df, cumulative_df_compressed):
153
  """
154
  Calculates the cumulative growth of models, spaces, and datasets over time and generates a plot and dataframe from the analysis.
155
 
156
  Args:
157
  df (DataFrame): The input dataframe containing the data.
158
+ df_compressed (DataFrame): The input dataframe containing the compressed data.
159
 
160
  Returns:
161
  tuple: A tuple containing two elements:
 
189
  )
190
 
191
  # Add a scatter trace for each type
192
+ for column in cumulative_df_compressed.columns:
193
  fig.add_trace(
194
  go.Scatter(
195
+ x=cumulative_df_compressed.index,
196
+ y=cumulative_df_compressed[column] / 1e15, # Convert to petabytes
197
  mode="lines",
198
  name=column.capitalize() + " (File-Level Deduplication)",
199
  line=dict(color=color_map.get(column, "black"), dash="dash"),
 
204
  fig.update_layout(
205
  title="Cumulative Growth of Models, Spaces, and Datasets Over Time<br><sup>Dotted lines represent growth with file-level deduplication</sup>",
206
  xaxis_title="Date",
207
+ yaxis_title="Cumulative Size (PBs)",
208
  legend_title="Type",
209
  yaxis=dict(tickformat=".2f"), # Format y-axis labels to 2 decimal places
210
  )
 
251
  fig.update_layout(
252
  title="Cumulative Growth of Models, Spaces, and Datasets",
253
  xaxis_title="Date",
254
+ yaxis_title="Size (PBs)",
255
  legend_title="Type",
256
  yaxis=dict(tickformat=".2f"), # Format y-axis labels to 2 decimal places
257
  )
 
277
 
278
  # Update layout
279
  fig.update_layout(
280
+ title="Top 20 File Extensions by Total Size (in PBs)",
281
  xaxis_title="File Extension",
282
+ yaxis_title="Total Size (PBs)",
283
  yaxis=dict(tickformat=".2f"), # Format y-axis labels to 2 decimal places
284
  colorway=px.colors.qualitative.Alphabet, # Use Plotly color palette
285
  )
 
330
 
331
  # Update layout
332
  fig.update_layout(
333
+ title="Monthly Additions of LFS Files by Extension (in TBs)",
334
  xaxis_title="Date",
335
+ yaxis_title="Size (TBs)",
336
  legend_title="Type",
337
  yaxis=dict(tickformat=".2f"), # Format y-axis labels to 2 decimal places
338
  )
 
347
  fig = px.area(_df, x="date", y="total_size", color="extension")
348
  # Update layout
349
  fig.update_layout(
350
+ title="File Extension Monthly Additions (in PBs) Over Time",
351
  xaxis_title="Date",
352
+ yaxis_title="Size (PBs)",
353
  legend_title="Type",
354
+ # format y-axis to be PBs (currently bytes) with two decimal places
355
  yaxis=dict(tickformat=".2f"),
356
  )
357
 
 
390
 
391
  # Convert year and month into a datetime column
392
  df = month_year_to_date(df)
393
+ df_compressed = month_year_to_date(file_df)
394
 
395
  # Calculate the cumulative growth of models, spaces, and datasets over time
396
  cumulative_df = cumulative_growth_df(df)
397
+ cumulative_df_compressed = cumulative_growth_df(df_compressed)
398
 
399
+ last_10_months = compare_last_10_months(cumulative_df, cumulative_df_compressed)
400
 
401
  by_repo_type_analysis = tabular_analysis(
402
+ by_repo_type, cumulative_df, cumulative_df_compressed
403
  )
404
 
405
  # Add top level heading and introduction text
 
428
  with gr.Column(scale=2):
429
  gr.Markdown("### Current Storage Usage")
430
  gr.Markdown(
431
+ "As of September 20, 2024, total files stored in Git LFS summed to almost 29 PB. To put this into perspective, the last [Common Crawl](https://commoncrawl.org/) download was [451 TBs](https://github.com/commoncrawl/cc-crawl-statistics/blob/master/stats/crawler/CC-MAIN-2024-38.json#L31) - the Hub stores the equivalent of more than **64 Common Crawls** 🤯."
432
  )
433
  with gr.Column(scale=3):
434
  # Convert the total size to petabytes and format to two decimal places
435
  current_storage = format_dataframe_size_column(
436
  by_repo_type_analysis,
437
+ ["Total Size (PBs)", "Deduped Size (PBs)", "Dedupe Savings (PBs)"],
438
  )
439
+ gr.Dataframe(current_storage[["Repository Type", "Total Size (PBs)"]])
440
 
441
  gr.HTML(div_px(25))
442
  # File Extension analysis
 
445
  "What types of files are stored on the Hub? The Xet team's backend architecture allows for storage optimizations by file type, so seeing the breakdown of the most popular stored file types helps to prioritize our roadmap. The following sections filter the analysis to the top 20 file extensions stored (by bytes) using Git LFS. Taken together, these 20 file extensions account for 82% of the total bytes stored in LFS."
446
  )
447
  gr.Markdown(
448
+ "[Safetensors](https://huggingface.co/docs/safetensors/en/index) is quickly becoming the defacto standard on the Hub for storing tensor files, accounting for over 7PBs (25%) of LFS storage. [GGUF (GPT-Generated Unified Format)](https://huggingface.co/docs/hub/gguf), a format for storing tensor files with a different set of optimizations, is also on the rise, accounting for 3.2 PBs (11%) of LFS storage."
449
  )
450
  # Get the top 10 file extensions by size
451
  by_extension_size = by_extension.sort_values(by="size", ascending=False).head(22)
452
+
453
  # make a bar chart of the by_extension_size dataframe
454
  gr.Plot(plot_total_sum(by_extension_size[["extension", "size"]].values))
455
  # drop the unnamed: 0 column
456
  by_extension_size = by_extension_size.drop(columns=["Unnamed: 0"])
457
  # average size
458
+ by_extension_size["Average File Size (MBs)"] = (
459
  by_extension_size["size"].astype(float) / by_extension_size["count"]
460
  )
461
+ by_extension_size["Average File Size (MBs)"] = (
462
+ by_extension_size["Average File Size (MBs)"] / 1e6
463
  )
464
+ by_extension_size["Average File Size (MBs)"] = by_extension_size[
465
+ "Average File Size (MBs)"
466
  ].map("{:.2f}".format)
467
  # format the size column
468
  by_extension_size = format_dataframe_size_column(by_extension_size, ["size"])
 
471
  columns={
472
  "extension": "File Extension",
473
  "count": "Number of Files",
474
+ "size": "Total Size (PBs)",
475
  }
476
  )
477
 
 
483
  by_extension_size[
484
  [
485
  "File Extension",
486
+ "Total Size (PBs)",
487
  "Number of Files",
488
+ "Average File Size (MBs)",
489
  ]
490
  ]
491
  )
492
 
493
  gr.HTML(div_px(5))
494
+ gr.Markdown("### Storage Growth by File Extension (Monthly PBs Added)")
495
  gr.Markdown(
496
  "The following area chart shows the number of bytes added to LFS storage each month, faceted by file extension."
497
  )
 
499
 
500
  gr.HTML(div_px(5))
501
  gr.Markdown(
502
+ "To dig deeper, use the dropdown to filter by file extension and see the bytes added (in TBs) each month for specific file types."
503
  )
504
 
505
  # get the unique values in the extension column and remove any empty strings
 
523
  gr.Markdown(
524
  "The first improvement we can make to Hub storage is to add file-level deduplication. Since forking any Hub repository makes copies of the files, a scan of existing files unsurprisingly shows that some files match exactly. The following chart shows the storage growth chart from above with additional dashed lines showing the potential savings from deduplicating at the file level."
525
  )
526
+ dedupe_fig = cumulative_growth_plot_analysis(
527
+ cumulative_df, cumulative_df_compressed
528
+ )
529
  gr.Plot(dedupe_fig)
530
 
531
  gr.HTML(div_px(5))
 
534
  with gr.Column(scale=1):
535
  gr.Markdown("### Current Storage Usage + File-level Deduplication")
536
  gr.Markdown(
537
+ "This simple change to the storage backend will save 3.24 PBs (the equivalent of 7.2 Common Crawls)."
538
  )
539
  with gr.Column(scale=3):
540
  # Convert the total size to petabytes and format to two decimal places
 
545
  with gr.Column(scale=1):
546
  gr.Markdown("### Month-to-Month Growth + File-level Deduplication")
547
  gr.Markdown(
548
+ "This table shows month-to-month growth in model, dataset, and space storage. In 2024, the Hub has averaged nearly **2.3 PBs uploaded to Git LFS per month**. Deduplicating at the file level saves nearly 225 TB (half a Common Crawl) monthly."
549
  )
550
  with gr.Column(scale=3):
551
  gr.Dataframe(last_10_months)