Spaces:

xet-team
/

lfs-analysis

Running

App Files Files Community

jsulz HF staff committed on Oct 10, 2024

Commit

0922060

1 Parent(s): 6c40bac

totally removing compressed from everything

Browse files

Files changed (1) hide show

app.py +21 -23

app.py CHANGED Viewed

@@ -88,19 +88,19 @@ def cumulative_growth_df(_df):
     return cumulative_df
-def compare_last_10_months(_cumulative_df, _cumulative_df_compressed):
     last_10_months = _cumulative_df.tail(10).copy()
     last_10_months["total"] = last_10_months.sum(axis=1)
     last_10_months["total_change"] = last_10_months["total"].diff()
-    last_10_months["compressed_change"] = (
-        _cumulative_df_compressed.tail(10).sum(axis=1).diff()
     )
     last_10_months["savings"] = (
-        last_10_months["total_change"] - last_10_months["compressed_change"]
     )
     last_10_months = format_dataframe_size_column(
-        last_10_months, ["total_change", "compressed_change", "savings"]
     )
     last_10_months["date"] = _cumulative_df.tail(10).index
@@ -112,28 +112,28 @@ def compare_last_10_months(_cumulative_df, _cumulative_df_compressed):
     last_10_months = last_10_months.drop(last_10_months.index[0])
     # order the columns date, total, total_change
     last_10_months = last_10_months[
-        ["date", "total_change", "compressed_change", "savings"]
     ]
     # rename the columns
     last_10_months = last_10_months.rename(
         columns={
             "date": "Date",
             "total_change": "Month-to-Month Growth (PBs)",
-            "compressed_change": "Growth with File-Level Deduplication (PBs)",
             "savings": "Dedupe Savings (PBs)",
         }
     )
     return last_10_months
-def tabular_analysis(repo_sizes, cumulative_df, cumulative_df_compressed):
-    # create a new column in the repository sizes dataframe for "compressed size" and set it to empty at first
     repo_sizes["Deduped Size (PBs)"] = ""
     repo_sizes["Dedupe Savings (PBs)"] = ""
     for column in cumulative_df.columns:
         cum_repo_size = cumulative_df[column].iloc[-1]
-        comp_repo_size = cumulative_df_compressed[column].iloc[-1]
         repo_size_diff = cum_repo_size - comp_repo_size
         repo_sizes.loc[
             repo_sizes["Repository Type"] == column.capitalize(),
@@ -143,19 +143,19 @@ def tabular_analysis(repo_sizes, cumulative_df, cumulative_df_compressed):
             repo_sizes["Repository Type"] == column.capitalize(), "Dedupe Savings (PBs)"
         ] = repo_size_diff
-    # add a row that sums the total size and compressed size
     repo_sizes.loc["Total"] = repo_sizes.sum()
     repo_sizes.loc["Total", "Repository Type"] = "Total"
     return repo_sizes
-def cumulative_growth_plot_analysis(cumulative_df, cumulative_df_compressed):
     """
     Calculates the cumulative growth of models, spaces, and datasets over time and generates a plot and dataframe from the analysis.
     Args:
         df (DataFrame): The input dataframe containing the data.
-        df_compressed (DataFrame): The input dataframe containing the compressed data.
     Returns:
         tuple: A tuple containing two elements:
@@ -189,11 +189,11 @@ def cumulative_growth_plot_analysis(cumulative_df, cumulative_df_compressed):
         )
     # Add a scatter trace for each type
-    for column in cumulative_df_compressed.columns:
         fig.add_trace(
             go.Scatter(
-                x=cumulative_df_compressed.index,
-                y=cumulative_df_compressed[column] / 1e15,  # Convert to petabytes
                 mode="lines",
                 name=column.capitalize() + " (File-Level Deduplication)",
                 line=dict(color=color_map.get(column, "black"), dash="dash"),
@@ -390,16 +390,16 @@ with gr.Blocks(theme="citrus") as demo:
     # Convert year and month into a datetime column
     df = month_year_to_date(df)
-    df_compressed = month_year_to_date(file_df)
     # Calculate the cumulative growth of models, spaces, and datasets over time
     cumulative_df = cumulative_growth_df(df)
-    cumulative_df_compressed = cumulative_growth_df(df_compressed)
-    last_10_months = compare_last_10_months(cumulative_df, cumulative_df_compressed)
     by_repo_type_analysis = tabular_analysis(
-        by_repo_type, cumulative_df, cumulative_df_compressed
     )
     # Add top level heading and introduction text
@@ -523,9 +523,7 @@ with gr.Blocks(theme="citrus") as demo:
     gr.Markdown(
         "The first improvement we can make to Hub storage is to add file-level deduplication. Since forking any Hub repository makes copies of the files, a scan of existing files unsurprisingly shows that some files match exactly. The following chart shows the storage growth chart from above with additional dashed lines showing the potential savings from deduplicating at the file level."
     )
-    dedupe_fig = cumulative_growth_plot_analysis(
-        cumulative_df, cumulative_df_compressed
-    )
     gr.Plot(dedupe_fig)
     gr.HTML(div_px(5))

     return cumulative_df
+def compare_last_10_months(_cumulative_df, _cumulative_df_deduped):
     last_10_months = _cumulative_df.tail(10).copy()
     last_10_months["total"] = last_10_months.sum(axis=1)
     last_10_months["total_change"] = last_10_months["total"].diff()
+    last_10_months["deduped_change"] = (
+        _cumulative_df_deduped.tail(10).sum(axis=1).diff()
     )
     last_10_months["savings"] = (
+        last_10_months["total_change"] - last_10_months["deduped_change"]
     )
     last_10_months = format_dataframe_size_column(
+        last_10_months, ["total_change", "deduped_change", "savings"]
     )
     last_10_months["date"] = _cumulative_df.tail(10).index
     last_10_months = last_10_months.drop(last_10_months.index[0])
     # order the columns date, total, total_change
     last_10_months = last_10_months[
+        ["date", "total_change", "deduped_change", "savings"]
     ]
     # rename the columns
     last_10_months = last_10_months.rename(
         columns={
             "date": "Date",
             "total_change": "Month-to-Month Growth (PBs)",
+            "deduped_change": "Growth with File-Level Deduplication (PBs)",
             "savings": "Dedupe Savings (PBs)",
         }
     )
     return last_10_months
+def tabular_analysis(repo_sizes, cumulative_df, cumulative_df_deduped):
+    # create a new column in the repository sizes dataframe for "deduped size" and set it to empty at first
     repo_sizes["Deduped Size (PBs)"] = ""
     repo_sizes["Dedupe Savings (PBs)"] = ""
     for column in cumulative_df.columns:
         cum_repo_size = cumulative_df[column].iloc[-1]
+        comp_repo_size = cumulative_df_deduped[column].iloc[-1]
         repo_size_diff = cum_repo_size - comp_repo_size
         repo_sizes.loc[
             repo_sizes["Repository Type"] == column.capitalize(),
             repo_sizes["Repository Type"] == column.capitalize(), "Dedupe Savings (PBs)"
         ] = repo_size_diff
+    # add a row that sums the total size and deduped size
     repo_sizes.loc["Total"] = repo_sizes.sum()
     repo_sizes.loc["Total", "Repository Type"] = "Total"
     return repo_sizes
+def cumulative_growth_plot_analysis(cumulative_df, cumulative_df_deduped):
     """
     Calculates the cumulative growth of models, spaces, and datasets over time and generates a plot and dataframe from the analysis.
     Args:
         df (DataFrame): The input dataframe containing the data.
+        df_deduped (DataFrame): The input dataframe containing the deduped data.
     Returns:
         tuple: A tuple containing two elements:
         )
     # Add a scatter trace for each type
+    for column in cumulative_df_deduped.columns:
         fig.add_trace(
             go.Scatter(
+                x=cumulative_df_deduped.index,
+                y=cumulative_df_deduped[column] / 1e15,  # Convert to petabytes
                 mode="lines",
                 name=column.capitalize() + " (File-Level Deduplication)",
                 line=dict(color=color_map.get(column, "black"), dash="dash"),
     # Convert year and month into a datetime column
     df = month_year_to_date(df)
+    df_deduped = month_year_to_date(file_df)
     # Calculate the cumulative growth of models, spaces, and datasets over time
     cumulative_df = cumulative_growth_df(df)
+    cumulative_df_deduped = cumulative_growth_df(df_deduped)
+    last_10_months = compare_last_10_months(cumulative_df, cumulative_df_deduped)
     by_repo_type_analysis = tabular_analysis(
+        by_repo_type, cumulative_df, cumulative_df_deduped
     )
     # Add top level heading and introduction text
     gr.Markdown(
         "The first improvement we can make to Hub storage is to add file-level deduplication. Since forking any Hub repository makes copies of the files, a scan of existing files unsurprisingly shows that some files match exactly. The following chart shows the storage growth chart from above with additional dashed lines showing the potential savings from deduplicating at the file level."
     )
+    dedupe_fig = cumulative_growth_plot_analysis(cumulative_df, cumulative_df_deduped)
     gr.Plot(dedupe_fig)
     gr.HTML(div_px(5))