meg-huggingface committed on
Commit
335424f
·
1 Parent(s): 85cf91c

Some additional modularizing and caching of the text lengths widget

Browse files
data_measurements/dataset_statistics.py CHANGED
@@ -219,6 +219,7 @@ class DatasetStatisticsCacheClass:
219
  self.avg_length = None
220
  self.std_length = None
221
  self.general_stats_dict = None
 
222
  # clustering text by embeddings
223
  # the hierarchical clustering tree is represented as a list of nodes,
224
  # the first is the root
@@ -351,6 +352,7 @@ class DatasetStatisticsCacheClass:
351
  self.length_stats_dict = json.load(f)
352
  self.avg_length = self.length_stats_dict["avg length"]
353
  self.std_length = self.length_stats_dict["std length"]
 
354
  else:
355
  self.prepare_text_length_stats()
356
  if save:
@@ -367,14 +369,16 @@ class DatasetStatisticsCacheClass:
367
  )
368
 
369
  def prepare_text_length_stats(self):
370
- if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns:
371
  self.prepare_length_df()
372
  avg_length = sum(self.tokenized_df[LENGTH_FIELD])/len(self.tokenized_df[LENGTH_FIELD])
373
  self.avg_length = round(avg_length, 1)
374
  std_length = statistics.stdev(self.tokenized_df[LENGTH_FIELD])
375
  self.std_length = round(std_length, 1)
 
376
  self.length_stats_dict = {"avg length": self.avg_length,
377
- "std length": self.std_length}
 
378
 
379
  def prepare_fig_text_lengths(self):
380
  if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns:
 
219
  self.avg_length = None
220
  self.std_length = None
221
  self.general_stats_dict = None
222
+ self.num_uniq_lengths = 0
223
  # clustering text by embeddings
224
  # the hierarchical clustering tree is represented as a list of nodes,
225
  # the first is the root
 
352
  self.length_stats_dict = json.load(f)
353
  self.avg_length = self.length_stats_dict["avg length"]
354
  self.std_length = self.length_stats_dict["std length"]
355
+ self.num_uniq_lengths = self.length_stats_dict["num lengths"]
356
  else:
357
  self.prepare_text_length_stats()
358
  if save:
 
369
  )
370
 
371
  def prepare_text_length_stats(self):
372
+ if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns or self.length_df is None:
373
  self.prepare_length_df()
374
  avg_length = sum(self.tokenized_df[LENGTH_FIELD])/len(self.tokenized_df[LENGTH_FIELD])
375
  self.avg_length = round(avg_length, 1)
376
  std_length = statistics.stdev(self.tokenized_df[LENGTH_FIELD])
377
  self.std_length = round(std_length, 1)
378
+ self.num_uniq_lengths = len(self.length_df["length"].unique())
379
  self.length_stats_dict = {"avg length": self.avg_length,
380
+ "std length": self.std_length,
381
+ "num lengths": self.num_uniq_lengths}
382
 
383
  def prepare_fig_text_lengths(self):
384
  if self.tokenized_df is None or LENGTH_FIELD not in self.tokenized_df.columns:
data_measurements/streamlit_utils.py CHANGED
@@ -147,9 +147,7 @@ def expander_label_distribution(fig_labels, column_id):
147
  st.markdown("No labels were found in the dataset")
148
 
149
 
150
- def expander_text_lengths(dstats,
151
- column_id,
152
- ):
153
  _TEXT_LENGTH_CAPTION = (
154
  "Use this widget to identify outliers, particularly suspiciously long outliers."
155
  )
@@ -176,7 +174,7 @@ def expander_text_lengths(dstats,
176
  start_id_show_lengths = st.slider(
177
  f"Show the shortest sentences{column_id} starting at:",
178
  0,
179
- len(dstats.length_df["length"].unique()),
180
  value=0,
181
  step=1,
182
  )
 
147
  st.markdown("No labels were found in the dataset")
148
 
149
 
150
+ def expander_text_lengths(dstats, column_id):
 
 
151
  _TEXT_LENGTH_CAPTION = (
152
  "Use this widget to identify outliers, particularly suspiciously long outliers."
153
  )
 
174
  start_id_show_lengths = st.slider(
175
  f"Show the shortest sentences{column_id} starting at:",
176
  0,
177
+ dstats.num_uniq_lengths,
178
  value=0,
179
  step=1,
180
  )