jsulz HF staff committed on
Commit
0922060
·
1 Parent(s): 6c40bac

totally removing compressed from everything

Browse files
Files changed (1) hide show
  1. app.py +21 -23
app.py CHANGED
@@ -88,19 +88,19 @@ def cumulative_growth_df(_df):
88
  return cumulative_df
89
 
90
 
91
- def compare_last_10_months(_cumulative_df, _cumulative_df_compressed):
92
  last_10_months = _cumulative_df.tail(10).copy()
93
  last_10_months["total"] = last_10_months.sum(axis=1)
94
  last_10_months["total_change"] = last_10_months["total"].diff()
95
- last_10_months["compressed_change"] = (
96
- _cumulative_df_compressed.tail(10).sum(axis=1).diff()
97
  )
98
  last_10_months["savings"] = (
99
- last_10_months["total_change"] - last_10_months["compressed_change"]
100
  )
101
 
102
  last_10_months = format_dataframe_size_column(
103
- last_10_months, ["total_change", "compressed_change", "savings"]
104
  )
105
 
106
  last_10_months["date"] = _cumulative_df.tail(10).index
@@ -112,28 +112,28 @@ def compare_last_10_months(_cumulative_df, _cumulative_df_compressed):
112
  last_10_months = last_10_months.drop(last_10_months.index[0])
113
  # order the columns date, total, total_change
114
  last_10_months = last_10_months[
115
- ["date", "total_change", "compressed_change", "savings"]
116
  ]
117
  # rename the columns
118
  last_10_months = last_10_months.rename(
119
  columns={
120
  "date": "Date",
121
  "total_change": "Month-to-Month Growth (PBs)",
122
- "compressed_change": "Growth with File-Level Deduplication (PBs)",
123
  "savings": "Dedupe Savings (PBs)",
124
  }
125
  )
126
  return last_10_months
127
 
128
 
129
- def tabular_analysis(repo_sizes, cumulative_df, cumulative_df_compressed):
130
- # create a new column in the repository sizes dataframe for "compressed size" and set it to empty at first
131
  repo_sizes["Deduped Size (PBs)"] = ""
132
  repo_sizes["Dedupe Savings (PBs)"] = ""
133
 
134
  for column in cumulative_df.columns:
135
  cum_repo_size = cumulative_df[column].iloc[-1]
136
- comp_repo_size = cumulative_df_compressed[column].iloc[-1]
137
  repo_size_diff = cum_repo_size - comp_repo_size
138
  repo_sizes.loc[
139
  repo_sizes["Repository Type"] == column.capitalize(),
@@ -143,19 +143,19 @@ def tabular_analysis(repo_sizes, cumulative_df, cumulative_df_compressed):
143
  repo_sizes["Repository Type"] == column.capitalize(), "Dedupe Savings (PBs)"
144
  ] = repo_size_diff
145
 
146
- # add a row that sums the total size and compressed size
147
  repo_sizes.loc["Total"] = repo_sizes.sum()
148
  repo_sizes.loc["Total", "Repository Type"] = "Total"
149
  return repo_sizes
150
 
151
 
152
- def cumulative_growth_plot_analysis(cumulative_df, cumulative_df_compressed):
153
  """
154
  Calculates the cumulative growth of models, spaces, and datasets over time and generates a plot and dataframe from the analysis.
155
 
156
  Args:
157
  df (DataFrame): The input dataframe containing the data.
158
- df_compressed (DataFrame): The input dataframe containing the compressed data.
159
 
160
  Returns:
161
  tuple: A tuple containing two elements:
@@ -189,11 +189,11 @@ def cumulative_growth_plot_analysis(cumulative_df, cumulative_df_compressed):
189
  )
190
 
191
  # Add a scatter trace for each type
192
- for column in cumulative_df_compressed.columns:
193
  fig.add_trace(
194
  go.Scatter(
195
- x=cumulative_df_compressed.index,
196
- y=cumulative_df_compressed[column] / 1e15, # Convert to petabytes
197
  mode="lines",
198
  name=column.capitalize() + " (File-Level Deduplication)",
199
  line=dict(color=color_map.get(column, "black"), dash="dash"),
@@ -390,16 +390,16 @@ with gr.Blocks(theme="citrus") as demo:
390
 
391
  # Convert year and month into a datetime column
392
  df = month_year_to_date(df)
393
- df_compressed = month_year_to_date(file_df)
394
 
395
  # Calculate the cumulative growth of models, spaces, and datasets over time
396
  cumulative_df = cumulative_growth_df(df)
397
- cumulative_df_compressed = cumulative_growth_df(df_compressed)
398
 
399
- last_10_months = compare_last_10_months(cumulative_df, cumulative_df_compressed)
400
 
401
  by_repo_type_analysis = tabular_analysis(
402
- by_repo_type, cumulative_df, cumulative_df_compressed
403
  )
404
 
405
  # Add top level heading and introduction text
@@ -523,9 +523,7 @@ with gr.Blocks(theme="citrus") as demo:
523
  gr.Markdown(
524
  "The first improvement we can make to Hub storage is to add file-level deduplication. Since forking any Hub repository makes copies of the files, a scan of existing files unsurprisingly shows that some files match exactly. The following chart shows the storage growth chart from above with additional dashed lines showing the potential savings from deduplicating at the file level."
525
  )
526
- dedupe_fig = cumulative_growth_plot_analysis(
527
- cumulative_df, cumulative_df_compressed
528
- )
529
  gr.Plot(dedupe_fig)
530
 
531
  gr.HTML(div_px(5))
 
88
  return cumulative_df
89
 
90
 
91
+ def compare_last_10_months(_cumulative_df, _cumulative_df_deduped):
92
  last_10_months = _cumulative_df.tail(10).copy()
93
  last_10_months["total"] = last_10_months.sum(axis=1)
94
  last_10_months["total_change"] = last_10_months["total"].diff()
95
+ last_10_months["deduped_change"] = (
96
+ _cumulative_df_deduped.tail(10).sum(axis=1).diff()
97
  )
98
  last_10_months["savings"] = (
99
+ last_10_months["total_change"] - last_10_months["deduped_change"]
100
  )
101
 
102
  last_10_months = format_dataframe_size_column(
103
+ last_10_months, ["total_change", "deduped_change", "savings"]
104
  )
105
 
106
  last_10_months["date"] = _cumulative_df.tail(10).index
 
112
  last_10_months = last_10_months.drop(last_10_months.index[0])
113
  # order the columns date, total, total_change
114
  last_10_months = last_10_months[
115
+ ["date", "total_change", "deduped_change", "savings"]
116
  ]
117
  # rename the columns
118
  last_10_months = last_10_months.rename(
119
  columns={
120
  "date": "Date",
121
  "total_change": "Month-to-Month Growth (PBs)",
122
+ "deduped_change": "Growth with File-Level Deduplication (PBs)",
123
  "savings": "Dedupe Savings (PBs)",
124
  }
125
  )
126
  return last_10_months
127
 
128
 
129
+ def tabular_analysis(repo_sizes, cumulative_df, cumulative_df_deduped):
130
+ # create a new column in the repository sizes dataframe for "deduped size" and set it to empty at first
131
  repo_sizes["Deduped Size (PBs)"] = ""
132
  repo_sizes["Dedupe Savings (PBs)"] = ""
133
 
134
  for column in cumulative_df.columns:
135
  cum_repo_size = cumulative_df[column].iloc[-1]
136
+ comp_repo_size = cumulative_df_deduped[column].iloc[-1]
137
  repo_size_diff = cum_repo_size - comp_repo_size
138
  repo_sizes.loc[
139
  repo_sizes["Repository Type"] == column.capitalize(),
 
143
  repo_sizes["Repository Type"] == column.capitalize(), "Dedupe Savings (PBs)"
144
  ] = repo_size_diff
145
 
146
+ # add a row that sums the total size and deduped size
147
  repo_sizes.loc["Total"] = repo_sizes.sum()
148
  repo_sizes.loc["Total", "Repository Type"] = "Total"
149
  return repo_sizes
150
 
151
 
152
+ def cumulative_growth_plot_analysis(cumulative_df, cumulative_df_deduped):
153
  """
154
  Calculates the cumulative growth of models, spaces, and datasets over time and generates a plot and dataframe from the analysis.
155
 
156
  Args:
157
  df (DataFrame): The input dataframe containing the data.
158
+ df_deduped (DataFrame): The input dataframe containing the deduped data.
159
 
160
  Returns:
161
  tuple: A tuple containing two elements:
 
189
  )
190
 
191
  # Add a scatter trace for each type
192
+ for column in cumulative_df_deduped.columns:
193
  fig.add_trace(
194
  go.Scatter(
195
+ x=cumulative_df_deduped.index,
196
+ y=cumulative_df_deduped[column] / 1e15, # Convert to petabytes
197
  mode="lines",
198
  name=column.capitalize() + " (File-Level Deduplication)",
199
  line=dict(color=color_map.get(column, "black"), dash="dash"),
 
390
 
391
  # Convert year and month into a datetime column
392
  df = month_year_to_date(df)
393
+ df_deduped = month_year_to_date(file_df)
394
 
395
  # Calculate the cumulative growth of models, spaces, and datasets over time
396
  cumulative_df = cumulative_growth_df(df)
397
+ cumulative_df_deduped = cumulative_growth_df(df_deduped)
398
 
399
+ last_10_months = compare_last_10_months(cumulative_df, cumulative_df_deduped)
400
 
401
  by_repo_type_analysis = tabular_analysis(
402
+ by_repo_type, cumulative_df, cumulative_df_deduped
403
  )
404
 
405
  # Add top level heading and introduction text
 
523
  gr.Markdown(
524
  "The first improvement we can make to Hub storage is to add file-level deduplication. Since forking any Hub repository makes copies of the files, a scan of existing files unsurprisingly shows that some files match exactly. The following chart shows the storage growth chart from above with additional dashed lines showing the potential savings from deduplicating at the file level."
525
  )
526
+ dedupe_fig = cumulative_growth_plot_analysis(cumulative_df, cumulative_df_deduped)
 
 
527
  gr.Plot(dedupe_fig)
528
 
529
  gr.HTML(div_px(5))