Commit
·
303cbb8
1
Parent(s):
86f872b
Comment out print statements
Browse files
- memory_states.py +2 -2
- plot.py +3 -3
- utilities.py +21 -21
memory_states.py
CHANGED
@@ -24,10 +24,10 @@ def get_my_memory_states(proj_dir, dataset, my_collection):
|
|
24 |
prediction.reset_index(drop=True, inplace=True)
|
25 |
prediction.sort_values(by=['r_history'], inplace=True)
|
26 |
prediction.to_csv(proj_dir / "prediction.tsv", sep='\t', index=None)
|
27 |
-
print("prediction.tsv saved.")
|
28 |
prediction['difficulty'] = prediction['difficulty'].map(lambda x: int(round(x)))
|
29 |
difficulty_distribution = prediction.groupby(by=['difficulty'])['count'].sum() / prediction['count'].sum()
|
30 |
-
print(difficulty_distribution)
|
31 |
difficulty_distribution_padding = np.zeros(10)
|
32 |
for i in range(10):
|
33 |
if i + 1 in difficulty_distribution.index:
|
|
|
24 |
prediction.reset_index(drop=True, inplace=True)
|
25 |
prediction.sort_values(by=['r_history'], inplace=True)
|
26 |
prediction.to_csv(proj_dir / "prediction.tsv", sep='\t', index=None)
|
27 |
+
# print("prediction.tsv saved.")
|
28 |
prediction['difficulty'] = prediction['difficulty'].map(lambda x: int(round(x)))
|
29 |
difficulty_distribution = prediction.groupby(by=['difficulty'])['count'].sum() / prediction['count'].sum()
|
30 |
+
# print(difficulty_distribution)
|
31 |
difficulty_distribution_padding = np.zeros(10)
|
32 |
for i in range(10):
|
33 |
if i + 1 in difficulty_distribution.index:
|
plot.py
CHANGED
@@ -41,7 +41,7 @@ def make_plot(proj_dir, type_sequence, w, difficulty_distribution_padding, progr
|
|
41 |
return w[9] * np.power(d, w[10]) * np.power(s, w[11]) * np.exp((1 - r) * w[12])
|
42 |
|
43 |
stability_list = np.array([np.power(base, i - index_offset) for i in range(index_len)])
|
44 |
-
print(f"terminal stability: {stability_list.max(): .2f}")
|
45 |
df = pd.DataFrame(columns=["retention", "difficulty", "repetitions"])
|
46 |
|
47 |
for percentage in trange(96, 70, -2, desc='Repetition vs Retention plot'):
|
@@ -73,7 +73,7 @@ def make_plot(proj_dir, type_sequence, w, difficulty_distribution_padding, progr
|
|
73 |
|
74 |
df.sort_values(by=["difficulty", "retention"], inplace=True)
|
75 |
df.to_csv(proj_dir/"expected_repetitions.csv", index=False)
|
76 |
-
print("expected_repetitions.csv saved.")
|
77 |
|
78 |
optimal_retention_list = np.zeros(10)
|
79 |
df2 = pd.DataFrame()
|
@@ -87,6 +87,6 @@ def make_plot(proj_dir, type_sequence, w, difficulty_distribution_padding, progr
|
|
87 |
|
88 |
fig = px.line(df2, x="retention", y="expected repetitions", color='d', log_y=True)
|
89 |
|
90 |
-
print(f"\n-----suggested retention: {np.inner(difficulty_distribution_padding, optimal_retention_list):.2f}-----")
|
91 |
suggested_retention_markdown = f"""# Suggested Retention: `{np.inner(difficulty_distribution_padding, optimal_retention_list):.2f}`"""
|
92 |
return fig, suggested_retention_markdown
|
|
|
41 |
return w[9] * np.power(d, w[10]) * np.power(s, w[11]) * np.exp((1 - r) * w[12])
|
42 |
|
43 |
stability_list = np.array([np.power(base, i - index_offset) for i in range(index_len)])
|
44 |
+
# print(f"terminal stability: {stability_list.max(): .2f}")
|
45 |
df = pd.DataFrame(columns=["retention", "difficulty", "repetitions"])
|
46 |
|
47 |
for percentage in trange(96, 70, -2, desc='Repetition vs Retention plot'):
|
|
|
73 |
|
74 |
df.sort_values(by=["difficulty", "retention"], inplace=True)
|
75 |
df.to_csv(proj_dir/"expected_repetitions.csv", index=False)
|
76 |
+
# print("expected_repetitions.csv saved.")
|
77 |
|
78 |
optimal_retention_list = np.zeros(10)
|
79 |
df2 = pd.DataFrame()
|
|
|
87 |
|
88 |
fig = px.line(df2, x="retention", y="expected repetitions", color='d', log_y=True)
|
89 |
|
90 |
+
# print(f"\n-----suggested retention: {np.inner(difficulty_distribution_padding, optimal_retention_list):.2f}-----")
|
91 |
suggested_retention_markdown = f"""# Suggested Retention: `{np.inner(difficulty_distribution_padding, optimal_retention_list):.2f}`"""
|
92 |
return fig, suggested_retention_markdown
|
utilities.py
CHANGED
@@ -26,7 +26,7 @@ def extract(file, prefix):
|
|
26 |
proj_dir = Path(f'projects/{prefix}_{file.orig_name.replace(".", "_").replace("@", "_")}')
|
27 |
with ZipFile(file, 'r') as zip_ref:
|
28 |
zip_ref.extractall(proj_dir)
|
29 |
-
print(f"Extracted {file.orig_name} successfully!")
|
30 |
return proj_dir
|
31 |
|
32 |
|
@@ -63,7 +63,7 @@ def create_time_series_features(revlog_start_date, timezone, next_day_starts_at,
|
|
63 |
df.sort_values(by=['cid', 'id'], inplace=True, ignore_index=True)
|
64 |
type_sequence = np.array(df['type'])
|
65 |
df.to_csv(proj_dir / "revlog.csv", index=False)
|
66 |
-
print("revlog.csv saved.")
|
67 |
df = df[(df['type'] == 0) | (df['type'] == 1)].copy()
|
68 |
df['real_days'] = df['review_date'] - timedelta(hours=next_day_starts_at)
|
69 |
df['real_days'] = pd.DatetimeIndex(df['real_days'].dt.floor('D')).to_julian_date()
|
@@ -94,7 +94,7 @@ def create_time_series_features(revlog_start_date, timezone, next_day_starts_at,
|
|
94 |
df["t_history"] = df["t_history"].map(lambda x: x[1:] if len(x) > 1 else x)
|
95 |
df["r_history"] = df["r_history"].map(lambda x: x[1:] if len(x) > 1 else x)
|
96 |
df.to_csv(proj_dir / 'revlog_history.tsv', sep="\t", index=False)
|
97 |
-
print("Trainset saved.")
|
98 |
|
99 |
def cal_retention(group: pd.DataFrame) -> pd.DataFrame:
|
100 |
group['retention'] = round(group['r'].map(lambda x: {1: 0, 2: 1, 3: 1, 4: 1}[x]).mean(), 4)
|
@@ -103,7 +103,7 @@ def create_time_series_features(revlog_start_date, timezone, next_day_starts_at,
|
|
103 |
|
104 |
tqdm.pandas(desc='Calculating Retention')
|
105 |
df = df.groupby(by=['r_history', 'delta_t']).progress_apply(cal_retention)
|
106 |
-
print("Retention calculated.")
|
107 |
df = df.drop(columns=['id', 'cid', 'usn', 'ivl', 'last_lvl', 'factor', 'time', 'type', 'create_date', 'review_date',
|
108 |
'real_days', 'r', 't_history'])
|
109 |
df.drop_duplicates(inplace=True)
|
@@ -128,7 +128,7 @@ def create_time_series_features(revlog_start_date, timezone, next_day_starts_at,
|
|
128 |
|
129 |
tqdm.pandas(desc='Calculating Stability')
|
130 |
df = df.groupby(by=['r_history']).progress_apply(cal_stability)
|
131 |
-
print("Stability calculated.")
|
132 |
df.reset_index(drop=True, inplace=True)
|
133 |
df.drop_duplicates(inplace=True)
|
134 |
df.sort_values(by=['r_history'], inplace=True, ignore_index=True)
|
@@ -144,11 +144,11 @@ def create_time_series_features(revlog_start_date, timezone, next_day_starts_at,
|
|
144 |
df['last_recall'] = df['r_history'].map(lambda x: x[-1])
|
145 |
df = df[df.groupby(['i', 'r_history'])['group_cnt'].transform(max) == df['group_cnt']]
|
146 |
df.to_csv(proj_dir / 'stability_for_analysis.tsv', sep='\t', index=None)
|
147 |
-
print("1:again, 2:hard, 3:good, 4:easy\n")
|
148 |
-
print(df[df['r_history'].str.contains(r'^[1-4][^124]*$', regex=True)][
|
149 |
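-
['r_history', 'avg_interval', 'avg_retention', 'stability', 'factor', 'group_cnt']].to_string(
|
150 |
-
index=False))
|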
151 |
-
print("Analysis saved!")
|
152 |
|
153 |
df_out = df[df['r_history'].str.contains(r'^[1-4][^124]*$', regex=True)][
|
154 |
['r_history', 'avg_interval', 'avg_retention', 'stability', 'factor', 'group_cnt']]
|
@@ -168,7 +168,7 @@ def train_model(proj_dir, progress=gr.Progress(track_tqdm=True)):
|
|
168 |
tqdm.pandas(desc='Tensorizing Line')
|
169 |
dataset['tensor'] = dataset.progress_apply(lambda x: lineToTensor(list(zip([x['t_history']], [x['r_history']]))[0]),
|
170 |
axis=1)
|
171 |
-
print("Tensorized!")
|
172 |
|
173 |
pre_train_set = dataset[dataset['i'] == 2]
|
174 |
# pretrain
|
@@ -187,7 +187,7 @@ def train_model(proj_dir, progress=gr.Progress(track_tqdm=True)):
|
|
187 |
{1: 0, 2: 1, 3: 1, 4: 1}[row['r']])
|
188 |
if np.isnan(loss.data.item()):
|
189 |
# Exception Case
|
190 |
-
print(row, output_t)
|
191 |
raise Exception('error case')
|
192 |
loss.backward()
|
193 |
optimizer.step()
|
@@ -195,7 +195,7 @@ def train_model(proj_dir, progress=gr.Progress(track_tqdm=True)):
|
|
195 |
pbar.update()
|
196 |
pbar.close()
|
197 |
for name, param in model.named_parameters():
|
198 |
-
print(f"{name}: {list(map(lambda x: round(float(x), 4), param))}")
|
199 |
|
200 |
train_set = dataset[dataset['i'] > 2]
|
201 |
epoch_len = len(train_set)
|
@@ -214,7 +214,7 @@ def train_model(proj_dir, progress=gr.Progress(track_tqdm=True)):
|
|
214 |
{1: 0, 2: 1, 3: 1, 4: 1}[row['r']])
|
215 |
if np.isnan(loss.data.item()):
|
216 |
# Exception Case
|
217 |
-
print(row, output_t)
|
218 |
raise Exception('error case')
|
219 |
loss.backward()
|
220 |
for param in model.parameters():
|
@@ -223,15 +223,15 @@ def train_model(proj_dir, progress=gr.Progress(track_tqdm=True)):
|
|
223 |
model.apply(clipper)
|
224 |
pbar.update()
|
225 |
|
226 |
-
if (k * epoch_len + i) % print_len == 0:
|
227 |
-
print(f"iteration: {k * epoch_len + i + 1}")
|
228 |
-
for name, param in model.named_parameters():
|
229 |
-
print(f"{name}: {list(map(lambda x: round(float(x), 4), param))}")
|
230 |
pbar.close()
|
231 |
|
232 |
w = list(map(lambda x: round(float(x), 4), dict(model.named_parameters())['w'].data))
|
233 |
|
234 |
-
print("\nTraining finished!")
|
235 |
return w, dataset
|
236 |
|
237 |
|
@@ -271,12 +271,12 @@ def my_loss(dataset, w):
|
|
271 |
my_collection = Collection(init_w)
|
272 |
tqdm.pandas(desc='Calculating Loss before Training')
|
273 |
dataset = dataset.progress_apply(partial(log_loss, my_collection), axis=1)
|
274 |
-
print(f"Loss before training: {dataset['log_loss'].mean():.4f}")
|
275 |
loss_before = f"{dataset['log_loss'].mean():.4f}"
|
276 |
my_collection = Collection(w)
|
277 |
tqdm.pandas(desc='Calculating Loss After Training')
|
278 |
dataset = dataset.progress_apply(partial(log_loss, my_collection), axis=1)
|
279 |
-
print(f"Loss after training: {dataset['log_loss'].mean():.4f}")
|
280 |
loss_after = f"{dataset['log_loss'].mean():.4f}"
|
281 |
return f"""
|
282 |
*Loss before training*: {loss_before}
|
|
|
26 |
proj_dir = Path(f'projects/{prefix}_{file.orig_name.replace(".", "_").replace("@", "_")}')
|
27 |
with ZipFile(file, 'r') as zip_ref:
|
28 |
zip_ref.extractall(proj_dir)
|
29 |
+
# print(f"Extracted {file.orig_name} successfully!")
|
30 |
return proj_dir
|
31 |
|
32 |
|
|
|
63 |
df.sort_values(by=['cid', 'id'], inplace=True, ignore_index=True)
|
64 |
type_sequence = np.array(df['type'])
|
65 |
df.to_csv(proj_dir / "revlog.csv", index=False)
|
66 |
+
# print("revlog.csv saved.")
|
67 |
df = df[(df['type'] == 0) | (df['type'] == 1)].copy()
|
68 |
df['real_days'] = df['review_date'] - timedelta(hours=next_day_starts_at)
|
69 |
df['real_days'] = pd.DatetimeIndex(df['real_days'].dt.floor('D')).to_julian_date()
|
|
|
94 |
df["t_history"] = df["t_history"].map(lambda x: x[1:] if len(x) > 1 else x)
|
95 |
df["r_history"] = df["r_history"].map(lambda x: x[1:] if len(x) > 1 else x)
|
96 |
df.to_csv(proj_dir / 'revlog_history.tsv', sep="\t", index=False)
|
97 |
+
# print("Trainset saved.")
|
98 |
|
99 |
def cal_retention(group: pd.DataFrame) -> pd.DataFrame:
|
100 |
group['retention'] = round(group['r'].map(lambda x: {1: 0, 2: 1, 3: 1, 4: 1}[x]).mean(), 4)
|
|
|
103 |
|
104 |
tqdm.pandas(desc='Calculating Retention')
|
105 |
df = df.groupby(by=['r_history', 'delta_t']).progress_apply(cal_retention)
|
106 |
+
# print("Retention calculated.")
|
107 |
df = df.drop(columns=['id', 'cid', 'usn', 'ivl', 'last_lvl', 'factor', 'time', 'type', 'create_date', 'review_date',
|
108 |
'real_days', 'r', 't_history'])
|
109 |
df.drop_duplicates(inplace=True)
|
|
|
128 |
|
129 |
tqdm.pandas(desc='Calculating Stability')
|
130 |
df = df.groupby(by=['r_history']).progress_apply(cal_stability)
|
131 |
+
# print("Stability calculated.")
|
132 |
df.reset_index(drop=True, inplace=True)
|
133 |
df.drop_duplicates(inplace=True)
|
134 |
df.sort_values(by=['r_history'], inplace=True, ignore_index=True)
|
|
|
144 |
df['last_recall'] = df['r_history'].map(lambda x: x[-1])
|
145 |
df = df[df.groupby(['i', 'r_history'])['group_cnt'].transform(max) == df['group_cnt']]
|
146 |
df.to_csv(proj_dir / 'stability_for_analysis.tsv', sep='\t', index=None)
|
147 |
+
# print("1:again, 2:hard, 3:good, 4:easy\n")
|
148 |
+
# print(df[df['r_history'].str.contains(r'^[1-4][^124]*$', regex=True)][
|
149 |
+
# ['r_history', 'avg_interval', 'avg_retention', 'stability', 'factor', 'group_cnt']].to_string(
|
150 |
+
# index=False))
|
151 |
+
# print("Analysis saved!")
|
152 |
|
153 |
df_out = df[df['r_history'].str.contains(r'^[1-4][^124]*$', regex=True)][
|
154 |
['r_history', 'avg_interval', 'avg_retention', 'stability', 'factor', 'group_cnt']]
|
|
|
168 |
tqdm.pandas(desc='Tensorizing Line')
|
169 |
dataset['tensor'] = dataset.progress_apply(lambda x: lineToTensor(list(zip([x['t_history']], [x['r_history']]))[0]),
|
170 |
axis=1)
|
171 |
+
# print("Tensorized!")
|
172 |
|
173 |
pre_train_set = dataset[dataset['i'] == 2]
|
174 |
# pretrain
|
|
|
187 |
{1: 0, 2: 1, 3: 1, 4: 1}[row['r']])
|
188 |
if np.isnan(loss.data.item()):
|
189 |
# Exception Case
|
190 |
+
# print(row, output_t)
|
191 |
raise Exception('error case')
|
192 |
loss.backward()
|
193 |
optimizer.step()
|
|
|
195 |
pbar.update()
|
196 |
pbar.close()
|
197 |
for name, param in model.named_parameters():
|
198 |
+
# print(f"{name}: {list(map(lambda x: round(float(x), 4), param))}")
|
199 |
|
200 |
train_set = dataset[dataset['i'] > 2]
|
201 |
epoch_len = len(train_set)
|
|
|
214 |
{1: 0, 2: 1, 3: 1, 4: 1}[row['r']])
|
215 |
if np.isnan(loss.data.item()):
|
216 |
# Exception Case
|
217 |
+
# print(row, output_t)
|
218 |
raise Exception('error case')
|
219 |
loss.backward()
|
220 |
for param in model.parameters():
|
|
|
223 |
model.apply(clipper)
|
224 |
pbar.update()
|
225 |
|
226 |
+
# if (k * epoch_len + i) % print_len == 0:
|
227 |
+
# print(f"iteration: {k * epoch_len + i + 1}")
|
228 |
+
# for name, param in model.named_parameters():
|
229 |
+
# print(f"{name}: {list(map(lambda x: round(float(x), 4), param))}")
|
230 |
pbar.close()
|
231 |
|
232 |
w = list(map(lambda x: round(float(x), 4), dict(model.named_parameters())['w'].data))
|
233 |
|
234 |
+
# print("\nTraining finished!")
|
235 |
return w, dataset
|
236 |
|
237 |
|
|
|
271 |
my_collection = Collection(init_w)
|
272 |
tqdm.pandas(desc='Calculating Loss before Training')
|
273 |
dataset = dataset.progress_apply(partial(log_loss, my_collection), axis=1)
|
274 |
+
# print(f"Loss before training: {dataset['log_loss'].mean():.4f}")
|
275 |
loss_before = f"{dataset['log_loss'].mean():.4f}"
|
276 |
my_collection = Collection(w)
|
277 |
tqdm.pandas(desc='Calculating Loss After Training')
|
278 |
dataset = dataset.progress_apply(partial(log_loss, my_collection), axis=1)
|
279 |
+
# print(f"Loss after training: {dataset['log_loss'].mean():.4f}")
|
280 |
loss_after = f"{dataset['log_loss'].mean():.4f}"
|
281 |
return f"""
|
282 |
*Loss before training*: {loss_before}
|