derek-thomas HF staff committed on
Commit
303cbb8
·
1 Parent(s): 86f872b

Comment out print statements

Browse files
Files changed (3) hide show
  1. memory_states.py +2 -2
  2. plot.py +3 -3
  3. utilities.py +21 -21
memory_states.py CHANGED
@@ -24,10 +24,10 @@ def get_my_memory_states(proj_dir, dataset, my_collection):
24
  prediction.reset_index(drop=True, inplace=True)
25
  prediction.sort_values(by=['r_history'], inplace=True)
26
  prediction.to_csv(proj_dir / "prediction.tsv", sep='\t', index=None)
27
- print("prediction.tsv saved.")
28
  prediction['difficulty'] = prediction['difficulty'].map(lambda x: int(round(x)))
29
  difficulty_distribution = prediction.groupby(by=['difficulty'])['count'].sum() / prediction['count'].sum()
30
- print(difficulty_distribution)
31
  difficulty_distribution_padding = np.zeros(10)
32
  for i in range(10):
33
  if i + 1 in difficulty_distribution.index:
 
24
  prediction.reset_index(drop=True, inplace=True)
25
  prediction.sort_values(by=['r_history'], inplace=True)
26
  prediction.to_csv(proj_dir / "prediction.tsv", sep='\t', index=None)
27
+ # print("prediction.tsv saved.")
28
  prediction['difficulty'] = prediction['difficulty'].map(lambda x: int(round(x)))
29
  difficulty_distribution = prediction.groupby(by=['difficulty'])['count'].sum() / prediction['count'].sum()
30
+ # print(difficulty_distribution)
31
  difficulty_distribution_padding = np.zeros(10)
32
  for i in range(10):
33
  if i + 1 in difficulty_distribution.index:
plot.py CHANGED
@@ -41,7 +41,7 @@ def make_plot(proj_dir, type_sequence, w, difficulty_distribution_padding, progr
41
  return w[9] * np.power(d, w[10]) * np.power(s, w[11]) * np.exp((1 - r) * w[12])
42
 
43
  stability_list = np.array([np.power(base, i - index_offset) for i in range(index_len)])
44
- print(f"terminal stability: {stability_list.max(): .2f}")
45
  df = pd.DataFrame(columns=["retention", "difficulty", "repetitions"])
46
 
47
  for percentage in trange(96, 70, -2, desc='Repetition vs Retention plot'):
@@ -73,7 +73,7 @@ def make_plot(proj_dir, type_sequence, w, difficulty_distribution_padding, progr
73
 
74
  df.sort_values(by=["difficulty", "retention"], inplace=True)
75
  df.to_csv(proj_dir/"expected_repetitions.csv", index=False)
76
- print("expected_repetitions.csv saved.")
77
 
78
  optimal_retention_list = np.zeros(10)
79
  df2 = pd.DataFrame()
@@ -87,6 +87,6 @@ def make_plot(proj_dir, type_sequence, w, difficulty_distribution_padding, progr
87
 
88
  fig = px.line(df2, x="retention", y="expected repetitions", color='d', log_y=True)
89
 
90
- print(f"\n-----suggested retention: {np.inner(difficulty_distribution_padding, optimal_retention_list):.2f}-----")
91
  suggested_retention_markdown = f"""# Suggested Retention: `{np.inner(difficulty_distribution_padding, optimal_retention_list):.2f}`"""
92
  return fig, suggested_retention_markdown
 
41
  return w[9] * np.power(d, w[10]) * np.power(s, w[11]) * np.exp((1 - r) * w[12])
42
 
43
  stability_list = np.array([np.power(base, i - index_offset) for i in range(index_len)])
44
+ # print(f"terminal stability: {stability_list.max(): .2f}")
45
  df = pd.DataFrame(columns=["retention", "difficulty", "repetitions"])
46
 
47
  for percentage in trange(96, 70, -2, desc='Repetition vs Retention plot'):
 
73
 
74
  df.sort_values(by=["difficulty", "retention"], inplace=True)
75
  df.to_csv(proj_dir/"expected_repetitions.csv", index=False)
76
+ # print("expected_repetitions.csv saved.")
77
 
78
  optimal_retention_list = np.zeros(10)
79
  df2 = pd.DataFrame()
 
87
 
88
  fig = px.line(df2, x="retention", y="expected repetitions", color='d', log_y=True)
89
 
90
+ # print(f"\n-----suggested retention: {np.inner(difficulty_distribution_padding, optimal_retention_list):.2f}-----")
91
  suggested_retention_markdown = f"""# Suggested Retention: `{np.inner(difficulty_distribution_padding, optimal_retention_list):.2f}`"""
92
  return fig, suggested_retention_markdown
utilities.py CHANGED
@@ -26,7 +26,7 @@ def extract(file, prefix):
26
  proj_dir = Path(f'projects/{prefix}_{file.orig_name.replace(".", "_").replace("@", "_")}')
27
  with ZipFile(file, 'r') as zip_ref:
28
  zip_ref.extractall(proj_dir)
29
- print(f"Extracted {file.orig_name} successfully!")
30
  return proj_dir
31
 
32
 
@@ -63,7 +63,7 @@ def create_time_series_features(revlog_start_date, timezone, next_day_starts_at,
63
  df.sort_values(by=['cid', 'id'], inplace=True, ignore_index=True)
64
  type_sequence = np.array(df['type'])
65
  df.to_csv(proj_dir / "revlog.csv", index=False)
66
- print("revlog.csv saved.")
67
  df = df[(df['type'] == 0) | (df['type'] == 1)].copy()
68
  df['real_days'] = df['review_date'] - timedelta(hours=next_day_starts_at)
69
  df['real_days'] = pd.DatetimeIndex(df['real_days'].dt.floor('D')).to_julian_date()
@@ -94,7 +94,7 @@ def create_time_series_features(revlog_start_date, timezone, next_day_starts_at,
94
  df["t_history"] = df["t_history"].map(lambda x: x[1:] if len(x) > 1 else x)
95
  df["r_history"] = df["r_history"].map(lambda x: x[1:] if len(x) > 1 else x)
96
  df.to_csv(proj_dir / 'revlog_history.tsv', sep="\t", index=False)
97
- print("Trainset saved.")
98
 
99
  def cal_retention(group: pd.DataFrame) -> pd.DataFrame:
100
  group['retention'] = round(group['r'].map(lambda x: {1: 0, 2: 1, 3: 1, 4: 1}[x]).mean(), 4)
@@ -103,7 +103,7 @@ def create_time_series_features(revlog_start_date, timezone, next_day_starts_at,
103
 
104
  tqdm.pandas(desc='Calculating Retention')
105
  df = df.groupby(by=['r_history', 'delta_t']).progress_apply(cal_retention)
106
- print("Retention calculated.")
107
  df = df.drop(columns=['id', 'cid', 'usn', 'ivl', 'last_lvl', 'factor', 'time', 'type', 'create_date', 'review_date',
108
  'real_days', 'r', 't_history'])
109
  df.drop_duplicates(inplace=True)
@@ -128,7 +128,7 @@ def create_time_series_features(revlog_start_date, timezone, next_day_starts_at,
128
 
129
  tqdm.pandas(desc='Calculating Stability')
130
  df = df.groupby(by=['r_history']).progress_apply(cal_stability)
131
- print("Stability calculated.")
132
  df.reset_index(drop=True, inplace=True)
133
  df.drop_duplicates(inplace=True)
134
  df.sort_values(by=['r_history'], inplace=True, ignore_index=True)
@@ -144,11 +144,11 @@ def create_time_series_features(revlog_start_date, timezone, next_day_starts_at,
144
  df['last_recall'] = df['r_history'].map(lambda x: x[-1])
145
  df = df[df.groupby(['i', 'r_history'])['group_cnt'].transform(max) == df['group_cnt']]
146
  df.to_csv(proj_dir / 'stability_for_analysis.tsv', sep='\t', index=None)
147
- print("1:again, 2:hard, 3:good, 4:easy\n")
148
- print(df[df['r_history'].str.contains(r'^[1-4][^124]*$', regex=True)][
149
- ['r_history', 'avg_interval', 'avg_retention', 'stability', 'factor', 'group_cnt']].to_string(
150
- index=False))
151
- print("Analysis saved!")
152
 
153
  df_out = df[df['r_history'].str.contains(r'^[1-4][^124]*$', regex=True)][
154
  ['r_history', 'avg_interval', 'avg_retention', 'stability', 'factor', 'group_cnt']]
@@ -168,7 +168,7 @@ def train_model(proj_dir, progress=gr.Progress(track_tqdm=True)):
168
  tqdm.pandas(desc='Tensorizing Line')
169
  dataset['tensor'] = dataset.progress_apply(lambda x: lineToTensor(list(zip([x['t_history']], [x['r_history']]))[0]),
170
  axis=1)
171
- print("Tensorized!")
172
 
173
  pre_train_set = dataset[dataset['i'] == 2]
174
  # pretrain
@@ -187,7 +187,7 @@ def train_model(proj_dir, progress=gr.Progress(track_tqdm=True)):
187
  {1: 0, 2: 1, 3: 1, 4: 1}[row['r']])
188
  if np.isnan(loss.data.item()):
189
  # Exception Case
190
- print(row, output_t)
191
  raise Exception('error case')
192
  loss.backward()
193
  optimizer.step()
@@ -195,7 +195,7 @@ def train_model(proj_dir, progress=gr.Progress(track_tqdm=True)):
195
  pbar.update()
196
  pbar.close()
197
  for name, param in model.named_parameters():
198
- print(f"{name}: {list(map(lambda x: round(float(x), 4), param))}")
199
 
200
  train_set = dataset[dataset['i'] > 2]
201
  epoch_len = len(train_set)
@@ -214,7 +214,7 @@ def train_model(proj_dir, progress=gr.Progress(track_tqdm=True)):
214
  {1: 0, 2: 1, 3: 1, 4: 1}[row['r']])
215
  if np.isnan(loss.data.item()):
216
  # Exception Case
217
- print(row, output_t)
218
  raise Exception('error case')
219
  loss.backward()
220
  for param in model.parameters():
@@ -223,15 +223,15 @@ def train_model(proj_dir, progress=gr.Progress(track_tqdm=True)):
223
  model.apply(clipper)
224
  pbar.update()
225
 
226
- if (k * epoch_len + i) % print_len == 0:
227
- print(f"iteration: {k * epoch_len + i + 1}")
228
- for name, param in model.named_parameters():
229
- print(f"{name}: {list(map(lambda x: round(float(x), 4), param))}")
230
  pbar.close()
231
 
232
  w = list(map(lambda x: round(float(x), 4), dict(model.named_parameters())['w'].data))
233
 
234
- print("\nTraining finished!")
235
  return w, dataset
236
 
237
 
@@ -271,12 +271,12 @@ def my_loss(dataset, w):
271
  my_collection = Collection(init_w)
272
  tqdm.pandas(desc='Calculating Loss before Training')
273
  dataset = dataset.progress_apply(partial(log_loss, my_collection), axis=1)
274
- print(f"Loss before training: {dataset['log_loss'].mean():.4f}")
275
  loss_before = f"{dataset['log_loss'].mean():.4f}"
276
  my_collection = Collection(w)
277
  tqdm.pandas(desc='Calculating Loss After Training')
278
  dataset = dataset.progress_apply(partial(log_loss, my_collection), axis=1)
279
- print(f"Loss after training: {dataset['log_loss'].mean():.4f}")
280
  loss_after = f"{dataset['log_loss'].mean():.4f}"
281
  return f"""
282
  *Loss before training*: {loss_before}
 
26
  proj_dir = Path(f'projects/{prefix}_{file.orig_name.replace(".", "_").replace("@", "_")}')
27
  with ZipFile(file, 'r') as zip_ref:
28
  zip_ref.extractall(proj_dir)
29
+ # print(f"Extracted {file.orig_name} successfully!")
30
  return proj_dir
31
 
32
 
 
63
  df.sort_values(by=['cid', 'id'], inplace=True, ignore_index=True)
64
  type_sequence = np.array(df['type'])
65
  df.to_csv(proj_dir / "revlog.csv", index=False)
66
+ # print("revlog.csv saved.")
67
  df = df[(df['type'] == 0) | (df['type'] == 1)].copy()
68
  df['real_days'] = df['review_date'] - timedelta(hours=next_day_starts_at)
69
  df['real_days'] = pd.DatetimeIndex(df['real_days'].dt.floor('D')).to_julian_date()
 
94
  df["t_history"] = df["t_history"].map(lambda x: x[1:] if len(x) > 1 else x)
95
  df["r_history"] = df["r_history"].map(lambda x: x[1:] if len(x) > 1 else x)
96
  df.to_csv(proj_dir / 'revlog_history.tsv', sep="\t", index=False)
97
+ # print("Trainset saved.")
98
 
99
  def cal_retention(group: pd.DataFrame) -> pd.DataFrame:
100
  group['retention'] = round(group['r'].map(lambda x: {1: 0, 2: 1, 3: 1, 4: 1}[x]).mean(), 4)
 
103
 
104
  tqdm.pandas(desc='Calculating Retention')
105
  df = df.groupby(by=['r_history', 'delta_t']).progress_apply(cal_retention)
106
+ # print("Retention calculated.")
107
  df = df.drop(columns=['id', 'cid', 'usn', 'ivl', 'last_lvl', 'factor', 'time', 'type', 'create_date', 'review_date',
108
  'real_days', 'r', 't_history'])
109
  df.drop_duplicates(inplace=True)
 
128
 
129
  tqdm.pandas(desc='Calculating Stability')
130
  df = df.groupby(by=['r_history']).progress_apply(cal_stability)
131
+ # print("Stability calculated.")
132
  df.reset_index(drop=True, inplace=True)
133
  df.drop_duplicates(inplace=True)
134
  df.sort_values(by=['r_history'], inplace=True, ignore_index=True)
 
144
  df['last_recall'] = df['r_history'].map(lambda x: x[-1])
145
  df = df[df.groupby(['i', 'r_history'])['group_cnt'].transform(max) == df['group_cnt']]
146
  df.to_csv(proj_dir / 'stability_for_analysis.tsv', sep='\t', index=None)
147
+ # print("1:again, 2:hard, 3:good, 4:easy\n")
148
+ # print(df[df['r_history'].str.contains(r'^[1-4][^124]*$', regex=True)][
149
+ # ['r_history', 'avg_interval', 'avg_retention', 'stability', 'factor', 'group_cnt']].to_string(
150
+ # index=False))
151
+ # print("Analysis saved!")
152
 
153
  df_out = df[df['r_history'].str.contains(r'^[1-4][^124]*$', regex=True)][
154
  ['r_history', 'avg_interval', 'avg_retention', 'stability', 'factor', 'group_cnt']]
 
168
  tqdm.pandas(desc='Tensorizing Line')
169
  dataset['tensor'] = dataset.progress_apply(lambda x: lineToTensor(list(zip([x['t_history']], [x['r_history']]))[0]),
170
  axis=1)
171
+ # print("Tensorized!")
172
 
173
  pre_train_set = dataset[dataset['i'] == 2]
174
  # pretrain
 
187
  {1: 0, 2: 1, 3: 1, 4: 1}[row['r']])
188
  if np.isnan(loss.data.item()):
189
  # Exception Case
190
+ # print(row, output_t)
191
  raise Exception('error case')
192
  loss.backward()
193
  optimizer.step()
 
195
  pbar.update()
196
  pbar.close()
197
  for name, param in model.named_parameters():
198
+ # print(f"{name}: {list(map(lambda x: round(float(x), 4), param))}")
199
 
200
  train_set = dataset[dataset['i'] > 2]
201
  epoch_len = len(train_set)
 
214
  {1: 0, 2: 1, 3: 1, 4: 1}[row['r']])
215
  if np.isnan(loss.data.item()):
216
  # Exception Case
217
+ # print(row, output_t)
218
  raise Exception('error case')
219
  loss.backward()
220
  for param in model.parameters():
 
223
  model.apply(clipper)
224
  pbar.update()
225
 
226
+ # if (k * epoch_len + i) % print_len == 0:
227
+ # print(f"iteration: {k * epoch_len + i + 1}")
228
+ # for name, param in model.named_parameters():
229
+ # print(f"{name}: {list(map(lambda x: round(float(x), 4), param))}")
230
  pbar.close()
231
 
232
  w = list(map(lambda x: round(float(x), 4), dict(model.named_parameters())['w'].data))
233
 
234
+ # print("\nTraining finished!")
235
  return w, dataset
236
 
237
 
 
271
  my_collection = Collection(init_w)
272
  tqdm.pandas(desc='Calculating Loss before Training')
273
  dataset = dataset.progress_apply(partial(log_loss, my_collection), axis=1)
274
+ # print(f"Loss before training: {dataset['log_loss'].mean():.4f}")
275
  loss_before = f"{dataset['log_loss'].mean():.4f}"
276
  my_collection = Collection(w)
277
  tqdm.pandas(desc='Calculating Loss After Training')
278
  dataset = dataset.progress_apply(partial(log_loss, my_collection), axis=1)
279
+ # print(f"Loss after training: {dataset['log_loss'].mean():.4f}")
280
  loss_after = f"{dataset['log_loss'].mean():.4f}"
281
  return f"""
282
  *Loss before training*: {loss_before}