Spaces:
Running
Running
totally removing compressed from everything
Browse files
app.py
CHANGED
@@ -88,19 +88,19 @@ def cumulative_growth_df(_df):
|
|
88 |
return cumulative_df
|
89 |
|
90 |
|
91 |
-
def compare_last_10_months(_cumulative_df,
|
92 |
last_10_months = _cumulative_df.tail(10).copy()
|
93 |
last_10_months["total"] = last_10_months.sum(axis=1)
|
94 |
last_10_months["total_change"] = last_10_months["total"].diff()
|
95 |
-
last_10_months["
|
96 |
-
|
97 |
)
|
98 |
last_10_months["savings"] = (
|
99 |
-
last_10_months["total_change"] - last_10_months["
|
100 |
)
|
101 |
|
102 |
last_10_months = format_dataframe_size_column(
|
103 |
-
last_10_months, ["total_change", "
|
104 |
)
|
105 |
|
106 |
last_10_months["date"] = _cumulative_df.tail(10).index
|
@@ -112,28 +112,28 @@ def compare_last_10_months(_cumulative_df, _cumulative_df_compressed):
|
|
112 |
last_10_months = last_10_months.drop(last_10_months.index[0])
|
113 |
# order the columns date, total, total_change
|
114 |
last_10_months = last_10_months[
|
115 |
-
["date", "total_change", "
|
116 |
]
|
117 |
# rename the columns
|
118 |
last_10_months = last_10_months.rename(
|
119 |
columns={
|
120 |
"date": "Date",
|
121 |
"total_change": "Month-to-Month Growth (PBs)",
|
122 |
-
"
|
123 |
"savings": "Dedupe Savings (PBs)",
|
124 |
}
|
125 |
)
|
126 |
return last_10_months
|
127 |
|
128 |
|
129 |
-
def tabular_analysis(repo_sizes, cumulative_df,
|
130 |
-
# create a new column in the repository sizes dataframe for "
|
131 |
repo_sizes["Deduped Size (PBs)"] = ""
|
132 |
repo_sizes["Dedupe Savings (PBs)"] = ""
|
133 |
|
134 |
for column in cumulative_df.columns:
|
135 |
cum_repo_size = cumulative_df[column].iloc[-1]
|
136 |
-
comp_repo_size =
|
137 |
repo_size_diff = cum_repo_size - comp_repo_size
|
138 |
repo_sizes.loc[
|
139 |
repo_sizes["Repository Type"] == column.capitalize(),
|
@@ -143,19 +143,19 @@ def tabular_analysis(repo_sizes, cumulative_df, cumulative_df_compressed):
|
|
143 |
repo_sizes["Repository Type"] == column.capitalize(), "Dedupe Savings (PBs)"
|
144 |
] = repo_size_diff
|
145 |
|
146 |
-
# add a row that sums the total size and
|
147 |
repo_sizes.loc["Total"] = repo_sizes.sum()
|
148 |
repo_sizes.loc["Total", "Repository Type"] = "Total"
|
149 |
return repo_sizes
|
150 |
|
151 |
|
152 |
-
def cumulative_growth_plot_analysis(cumulative_df,
|
153 |
"""
|
154 |
Calculates the cumulative growth of models, spaces, and datasets over time and generates a plot and dataframe from the analysis.
|
155 |
|
156 |
Args:
|
157 |
df (DataFrame): The input dataframe containing the data.
|
158 |
-
|
159 |
|
160 |
Returns:
|
161 |
tuple: A tuple containing two elements:
|
@@ -189,11 +189,11 @@ def cumulative_growth_plot_analysis(cumulative_df, cumulative_df_compressed):
|
|
189 |
)
|
190 |
|
191 |
# Add a scatter trace for each type
|
192 |
-
for column in
|
193 |
fig.add_trace(
|
194 |
go.Scatter(
|
195 |
-
x=
|
196 |
-
y=
|
197 |
mode="lines",
|
198 |
name=column.capitalize() + " (File-Level Deduplication)",
|
199 |
line=dict(color=color_map.get(column, "black"), dash="dash"),
|
@@ -390,16 +390,16 @@ with gr.Blocks(theme="citrus") as demo:
|
|
390 |
|
391 |
# Convert year and month into a datetime column
|
392 |
df = month_year_to_date(df)
|
393 |
-
|
394 |
|
395 |
# Calculate the cumulative growth of models, spaces, and datasets over time
|
396 |
cumulative_df = cumulative_growth_df(df)
|
397 |
-
|
398 |
|
399 |
-
last_10_months = compare_last_10_months(cumulative_df,
|
400 |
|
401 |
by_repo_type_analysis = tabular_analysis(
|
402 |
-
by_repo_type, cumulative_df,
|
403 |
)
|
404 |
|
405 |
# Add top level heading and introduction text
|
@@ -523,9 +523,7 @@ with gr.Blocks(theme="citrus") as demo:
|
|
523 |
gr.Markdown(
|
524 |
"The first improvement we can make to Hub storage is to add file-level deduplication. Since forking any Hub repository makes copies of the files, a scan of existing files unsurprisingly shows that some files match exactly. The following chart shows the storage growth chart from above with additional dashed lines showing the potential savings from deduplicating at the file level."
|
525 |
)
|
526 |
-
dedupe_fig = cumulative_growth_plot_analysis(
|
527 |
-
cumulative_df, cumulative_df_compressed
|
528 |
-
)
|
529 |
gr.Plot(dedupe_fig)
|
530 |
|
531 |
gr.HTML(div_px(5))
|
|
|
88 |
return cumulative_df
|
89 |
|
90 |
|
91 |
+
def compare_last_10_months(_cumulative_df, _cumulative_df_deduped):
|
92 |
last_10_months = _cumulative_df.tail(10).copy()
|
93 |
last_10_months["total"] = last_10_months.sum(axis=1)
|
94 |
last_10_months["total_change"] = last_10_months["total"].diff()
|
95 |
+
last_10_months["deduped_change"] = (
|
96 |
+
_cumulative_df_deduped.tail(10).sum(axis=1).diff()
|
97 |
)
|
98 |
last_10_months["savings"] = (
|
99 |
+
last_10_months["total_change"] - last_10_months["deduped_change"]
|
100 |
)
|
101 |
|
102 |
last_10_months = format_dataframe_size_column(
|
103 |
+
last_10_months, ["total_change", "deduped_change", "savings"]
|
104 |
)
|
105 |
|
106 |
last_10_months["date"] = _cumulative_df.tail(10).index
|
|
|
112 |
last_10_months = last_10_months.drop(last_10_months.index[0])
|
113 |
# order the columns: date, total_change, deduped_change, savings
|
114 |
last_10_months = last_10_months[
|
115 |
+
["date", "total_change", "deduped_change", "savings"]
|
116 |
]
|
117 |
# rename the columns
|
118 |
last_10_months = last_10_months.rename(
|
119 |
columns={
|
120 |
"date": "Date",
|
121 |
"total_change": "Month-to-Month Growth (PBs)",
|
122 |
+
"deduped_change": "Growth with File-Level Deduplication (PBs)",
|
123 |
"savings": "Dedupe Savings (PBs)",
|
124 |
}
|
125 |
)
|
126 |
return last_10_months
|
127 |
|
128 |
|
129 |
+
def tabular_analysis(repo_sizes, cumulative_df, cumulative_df_deduped):
|
130 |
+
# create a new column in the repository sizes dataframe for "deduped size" and set it to empty at first
|
131 |
repo_sizes["Deduped Size (PBs)"] = ""
|
132 |
repo_sizes["Dedupe Savings (PBs)"] = ""
|
133 |
|
134 |
for column in cumulative_df.columns:
|
135 |
cum_repo_size = cumulative_df[column].iloc[-1]
|
136 |
+
comp_repo_size = cumulative_df_deduped[column].iloc[-1]
|
137 |
repo_size_diff = cum_repo_size - comp_repo_size
|
138 |
repo_sizes.loc[
|
139 |
repo_sizes["Repository Type"] == column.capitalize(),
|
|
|
143 |
repo_sizes["Repository Type"] == column.capitalize(), "Dedupe Savings (PBs)"
|
144 |
] = repo_size_diff
|
145 |
|
146 |
+
# add a row that sums the total size and deduped size
|
147 |
repo_sizes.loc["Total"] = repo_sizes.sum()
|
148 |
repo_sizes.loc["Total", "Repository Type"] = "Total"
|
149 |
return repo_sizes
|
150 |
|
151 |
|
152 |
+
def cumulative_growth_plot_analysis(cumulative_df, cumulative_df_deduped):
|
153 |
"""
|
154 |
Calculates the cumulative growth of models, spaces, and datasets over time and generates a plot and dataframe from the analysis.
|
155 |
|
156 |
Args:
|
157 |
cumulative_df (DataFrame): The input dataframe containing the data.
|
158 |
+
cumulative_df_deduped (DataFrame): The input dataframe containing the deduped data.
|
159 |
|
160 |
Returns:
|
161 |
tuple: A tuple containing two elements:
|
|
|
189 |
)
|
190 |
|
191 |
# Add a scatter trace for each type
|
192 |
+
for column in cumulative_df_deduped.columns:
|
193 |
fig.add_trace(
|
194 |
go.Scatter(
|
195 |
+
x=cumulative_df_deduped.index,
|
196 |
+
y=cumulative_df_deduped[column] / 1e15, # Convert to petabytes
|
197 |
mode="lines",
|
198 |
name=column.capitalize() + " (File-Level Deduplication)",
|
199 |
line=dict(color=color_map.get(column, "black"), dash="dash"),
|
|
|
390 |
|
391 |
# Convert year and month into a datetime column
|
392 |
df = month_year_to_date(df)
|
393 |
+
df_deduped = month_year_to_date(file_df)
|
394 |
|
395 |
# Calculate the cumulative growth of models, spaces, and datasets over time
|
396 |
cumulative_df = cumulative_growth_df(df)
|
397 |
+
cumulative_df_deduped = cumulative_growth_df(df_deduped)
|
398 |
|
399 |
+
last_10_months = compare_last_10_months(cumulative_df, cumulative_df_deduped)
|
400 |
|
401 |
by_repo_type_analysis = tabular_analysis(
|
402 |
+
by_repo_type, cumulative_df, cumulative_df_deduped
|
403 |
)
|
404 |
|
405 |
# Add top level heading and introduction text
|
|
|
523 |
gr.Markdown(
|
524 |
"The first improvement we can make to Hub storage is to add file-level deduplication. Since forking any Hub repository makes copies of the files, a scan of existing files unsurprisingly shows that some files match exactly. The following chart shows the storage growth chart from above with additional dashed lines showing the potential savings from deduplicating at the file level."
|
525 |
)
|
526 |
+
dedupe_fig = cumulative_growth_plot_analysis(cumulative_df, cumulative_df_deduped)
|
|
|
|
|
527 |
gr.Plot(dedupe_fig)
|
528 |
|
529 |
gr.HTML(div_px(5))
|