Joshua Lansford committed on
Commit
2f7253d
·
1 Parent(s): a880e5e

Fix Target contains only one unique value error (Issue #1)

Browse files
Files changed (3) hide show
  1. .vscode/launch.json +237 -40
  2. README.md +7 -7
  3. transmorgrify.py +80 -55
.vscode/launch.json CHANGED
@@ -12,6 +12,221 @@
12
  "program": "${file}",
13
  "console": "integratedTerminal",
14
  "justMyCode": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  },{
16
  "name": "Train phonetic 4000 gpu",
17
  "type": "python",
@@ -21,29 +236,14 @@
21
  "justMyCode": true,
22
  "args": [
23
  "--train",
24
- "--in_csv", "/home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic.csv",
25
  "--a_header", "English",
26
  "--b_header", "Phonetic",
27
  "--device", "0:1",
28
  "--model", "phonetics_forward.tm"
29
  ]
30
- },{
31
- "name": "Train reverse phonetic 4000 gpu",
32
- "type": "python",
33
- "request": "launch",
34
- "program": "transmorgrify.py",
35
- "console": "integratedTerminal",
36
- "justMyCode": true,
37
- "args": [
38
- "--train",
39
- "--in_csv", "/home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic.csv",
40
- "--b_header", "English",
41
- "--a_header", "Phonetic",
42
- "--device", "0:1",
43
- "--model", "phonetics_backwards.tm"
44
- ]
45
  },{
46
- "name": "Train short phonetic 4000 gpu",
47
  "type": "python",
48
  "request": "launch",
49
  "program": "transmorgrify.py",
@@ -51,31 +251,30 @@
51
  "justMyCode": true,
52
  "args": [
53
  "--train",
54
- "--in_csv", "/home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic_short.csv",
55
- "--a_header", "English",
56
- "--b_header", "Phonetic",
57
  "--device", "0:1",
58
- "--model", "phonetics_small.tm"
59
  ]
60
  },{
61
- "name": "Execute phonetic gpu",
62
  "type": "python",
63
  "request": "launch",
64
  "program": "transmorgrify.py",
65
  "console": "integratedTerminal",
66
  "justMyCode": true,
67
  "args": [
68
- "--execute",
69
- "--in_csv", "/home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic.csv",
70
- "--out_csv", "./phonetic_out.csv",
71
  "--a_header", "English",
72
  "--b_header", "Phonetic",
73
  "--device", "0:1",
74
- "--model", "phonetics_forward.tm",
75
- "--verbose",
76
  ]
77
  },{
78
- "name": "short Execute phonetic",
79
  "type": "python",
80
  "request": "launch",
81
  "program": "transmorgrify.py",
@@ -83,16 +282,13 @@
83
  "justMyCode": true,
84
  "args": [
85
  "--execute",
86
- "--in_csv", "/home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic_short.csv",
87
- "--out_csv", "./phonetic_out.csv",
88
- "--a_header", "English",
89
- "--b_header", "Phonetic",
90
- "--model", "phonetics_forward.tm",
91
  "--verbose",
92
- "--include_stats",
93
  ]
94
  },{
95
- "name": "short Execute reverse phonetic",
96
  "type": "python",
97
  "request": "launch",
98
  "program": "transmorgrify.py",
@@ -100,11 +296,12 @@
100
  "justMyCode": true,
101
  "args": [
102
  "--execute",
103
- "--in_csv", "/home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic_short.csv",
104
- "--out_csv", "./reverse_phonetic_out.csv",
105
- "--b_header", "English",
106
- "--a_header", "Phonetic",
107
- "--model", "phonetics_backwards.tm",
 
108
  "--verbose",
109
  "--include_stats",
110
  ]
 
12
  "program": "${file}",
13
  "console": "integratedTerminal",
14
  "justMyCode": true
15
+ },{
16
+ "name": "Train \"latin\" 4000",
17
+ "type": "python",
18
+ "request": "launch",
19
+ "program": "transmorgrify.py",
20
+ "console": "integratedTerminal",
21
+ "justMyCode": true,
22
+ "args": [
23
+ "--train",
24
+ "--in_csv", "./examples/latin/training_latin.txt",
25
+ "--a_header", "original",
26
+ "--b_header", "split",
27
+ "--device", "0:1",
28
+ "--model", "latin_forward.tm",
29
+ "--train_percentage", "50",
30
+ ]
31
+ },{
32
+ "name": "Train \"latin\" 100",
33
+ "type": "python",
34
+ "request": "launch",
35
+ "program": "transmorgrify.py",
36
+ "console": "integratedTerminal",
37
+ "justMyCode": true,
38
+ "args": [
39
+ "--train",
40
+ "--in_csv", "./examples/latin/training_latin.txt",
41
+ "--a_header", "original",
42
+ "--b_header", "split",
43
+ "--device", "0:1",
44
+ "--model", "latin_forward_100.tm",
45
+ "-n", "100",
46
+ "--train_percentage", "50",
47
+ ]
48
+ },{
49
+ "name": "Train \"latin\" 10000",
50
+ "type": "python",
51
+ "request": "launch",
52
+ "program": "transmorgrify.py",
53
+ "console": "integratedTerminal",
54
+ "justMyCode": true,
55
+ "args": [
56
+ "--train",
57
+ "--in_csv", "./examples/latin/training_latin.txt",
58
+ "--a_header", "original",
59
+ "--b_header", "split",
60
+ "--device", "0:1",
61
+ "--model", "latin_forward_10000.tm",
62
+ "-n", "10000",
63
+ "--train_percentage", "50",
64
+ ]
65
+ },{
66
+ "name": "Execute \"latin\" 4000",
67
+ "type": "python",
68
+ "request": "launch",
69
+ "program": "transmorgrify.py",
70
+ "console": "integratedTerminal",
71
+ "justMyCode": true,
72
+ "args": [
73
+ "--execute",
74
+ "--in_csv", "./examples/latin/training_latin.txt",
75
+ "--a_header", "original",
76
+ "--b_header", "split",
77
+ "--device", "0:1",
78
+ "--model", "latin_forward.tm",
79
+ "--verbose",
80
+ "--include_stats",
81
+ "--out_csv", "latin_4000.csv",
82
+ "--train_percentage", "50",
83
+ ]
84
+ },{
85
+ "name": "Execute \"latin\" 100",
86
+ "type": "python",
87
+ "request": "launch",
88
+ "program": "transmorgrify.py",
89
+ "console": "integratedTerminal",
90
+ "justMyCode": true,
91
+ "args": [
92
+ "--execute",
93
+ "--in_csv", "./examples/latin/training_latin.txt",
94
+ "--a_header", "original",
95
+ "--b_header", "split",
96
+ "--device", "0:1",
97
+ "--model", "latin_forward_100.tm",
98
+ "--verbose",
99
+ "--include_stats",
100
+ "--out_csv", "latin_100.csv",
101
+ "--train_percentage", "50",
102
+ ]
103
+ },{
104
+ "name": "Execute \"latin\" 10000",
105
+ "type": "python",
106
+ "request": "launch",
107
+ "program": "transmorgrify.py",
108
+ "console": "integratedTerminal",
109
+ "justMyCode": true,
110
+ "args": [
111
+ "--execute",
112
+ "--in_csv", "./examples/latin/training_latin.txt",
113
+ "--a_header", "original",
114
+ "--b_header", "split",
115
+ "--device", "0:1",
116
+ "--model", "latin_forward_10000.tm",
117
+ "--verbose",
118
+ "--include_stats",
119
+ "--out_csv", "latin_10000.csv",
120
+ "--train_percentage", "50",
121
+ ]
122
+ },{
123
+ "name": "Demo \"latin\" 4000",
124
+ "type": "python",
125
+ "request": "launch",
126
+ "program": "transmorgrify.py",
127
+ "console": "integratedTerminal",
128
+ "justMyCode": true,
129
+ "args": [
130
+ "--gradio",
131
+ "--in_csv", "./examples/latin/training_latin.txt",
132
+ "--a_header", "original",
133
+ //"--b_header", "split",
134
+ //"--device", "0:1",
135
+ "--model", "latin_forward.tm"
136
+ ]
137
+ },{
138
+ "name": "Demo \"latin\" 100",
139
+ "type": "python",
140
+ "request": "launch",
141
+ "program": "transmorgrify.py",
142
+ "console": "integratedTerminal",
143
+ "justMyCode": true,
144
+ "args": [
145
+ "--gradio",
146
+ "--in_csv", "./examples/latin/training_latin.txt",
147
+ "--a_header", "original",
148
+ //"--b_header", "split",
149
+ //"--device", "0:1",
150
+ "--model", "latin_forward_100.tm"
151
+ ]
152
+ },{
153
+ "name": "Demo \"latin\" 10000",
154
+ "type": "python",
155
+ "request": "launch",
156
+ "program": "transmorgrify.py",
157
+ "console": "integratedTerminal",
158
+ "justMyCode": true,
159
+ "args": [
160
+ "--gradio",
161
+ "--in_csv", "./examples/latin/training_latin.txt",
162
+ "--a_header", "original",
163
+ //"--b_header", "split",
164
+ //"--device", "0:1",
165
+ "--model", "latin_forward_10000.tm"
166
+ ]
167
+ },{
168
+ "name": "Train \"latin\" mod",
169
+ "type": "python",
170
+ "request": "launch",
171
+ "program": "transmorgrify.py",
172
+ "console": "integratedTerminal",
173
+ "justMyCode": true,
174
+ "args": [
175
+ "--train",
176
+ "--in_csv", "./examples/latin/training_latin_mod.csv",
177
+ "--a_header", "original",
178
+ "--b_header", "split",
179
+ "--device", "0:1",
180
+ "--model", "latin_mod_forward.tm"
181
+ ]
182
+ },{
183
+ "name": "Execute \"latin\" mod",
184
+ "type": "python",
185
+ "request": "launch",
186
+ "program": "transmorgrify.py",
187
+ "console": "integratedTerminal",
188
+ "justMyCode": true,
189
+ "args": [
190
+ "--execute",
191
+ "--in_csv", "./examples/latin/training_latin_mod.csv",
192
+ "--a_header", "original",
193
+ //"--b_header", "split",
194
+ "--device", "0:1",
195
+ "--model", "latin_mod_forward.tm",
196
+ "--out_csv", "latin_mod_out.csv",
197
+ ]
198
+ },{
199
+ "name": "Execute mixup",
200
+ "type": "python",
201
+ "request": "launch",
202
+ "program": "transmorgrify.py",
203
+ "console": "integratedTerminal",
204
+ "justMyCode": true,
205
+ "args": [
206
+ "--execute",
207
+ "--in_csv", "./examples/latin/training_latin_mod.csv",
208
+ "--a_header", "original",
209
+ //"--b_header", "split",
210
+ "--device", "0:1",
211
+ //"--model", "latin_mod_forward.tm",
212
+ "--model", "phonetics_small.tm",
213
+ "--out_csv", "mixum_out.csv",
214
+ ]
215
+ },{
216
+ "name": "Demo \"latin\" mod",
217
+ "type": "python",
218
+ "request": "launch",
219
+ "program": "transmorgrify.py",
220
+ "console": "integratedTerminal",
221
+ "justMyCode": true,
222
+ "args": [
223
+ "--gradio",
224
+ "--in_csv", "./examples/latin/training_latin_mod.csv",
225
+ "--a_header", "original",
226
+ //"--b_header", "split",
227
+ //"--device", "0:1",
228
+ "--model", "latin_mod_forward.tm"
229
+ ]
230
  },{
231
  "name": "Train phonetic 4000 gpu",
232
  "type": "python",
 
236
  "justMyCode": true,
237
  "args": [
238
  "--train",
239
+ "--in_csv", "./examples/phonetic/phonetic.csv",
240
  "--a_header", "English",
241
  "--b_header", "Phonetic",
242
  "--device", "0:1",
243
  "--model", "phonetics_forward.tm"
244
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  },{
246
+ "name": "Train reverse phonetic 4000 gpu",
247
  "type": "python",
248
  "request": "launch",
249
  "program": "transmorgrify.py",
 
251
  "justMyCode": true,
252
  "args": [
253
  "--train",
254
+ "--in_csv", "./examples/phonetic/phonetic.csv",
255
+ "--b_header", "English",
256
+ "--a_header", "Phonetic",
257
  "--device", "0:1",
258
+ "--model", "phonetics_backwards.tm"
259
  ]
260
  },{
261
+ "name": "Train short phonetic 4000 gpu",
262
  "type": "python",
263
  "request": "launch",
264
  "program": "transmorgrify.py",
265
  "console": "integratedTerminal",
266
  "justMyCode": true,
267
  "args": [
268
+ "--train",
269
+ "--in_csv", "./examples/phonetic/phonetic_short.csv",
 
270
  "--a_header", "English",
271
  "--b_header", "Phonetic",
272
  "--device", "0:1",
273
+ "-n", "100",
274
+ "--model", "phonetics_small.tm"
275
  ]
276
  },{
277
+ "name": "Execute phonetic gpu",
278
  "type": "python",
279
  "request": "launch",
280
  "program": "transmorgrify.py",
 
282
  "justMyCode": true,
283
  "args": [
284
  "--execute",
285
+ "--in_csv", "./examples/phonetic/phonetic.csv",
286
+ "--a_header", "original",
287
+ //"--b_header", "split",
 
 
288
  "--verbose",
 
289
  ]
290
  },{
291
+ "name": "Execute short phonetic",
292
  "type": "python",
293
  "request": "launch",
294
  "program": "transmorgrify.py",
 
296
  "justMyCode": true,
297
  "args": [
298
  "--execute",
299
+ "--in_csv", "./examples/phonetic/phonetic_short.csv",
300
+ "--out_csv", "./phonetic_out.csv",
301
+ "--a_header", "English",
302
+ "--b_header", "Phonetic",
303
+ "--model", "phonetics_small.tm",
304
+ "--device", "0:1",
305
  "--verbose",
306
  "--include_stats",
307
  ]
README.md CHANGED
@@ -13,10 +13,10 @@ license: apache-2.0
13
  ## Sentence Transmorgrifier
14
 
15
  # What is the Sentence Transmorgrifier?
16
- - The Sentence Transmorgrifier is a framework to make text to text conversion models which uses a categorical gradiant boost library, [catboost](https://catboost.ai/), as its back end.
17
  - This library does not use neural net or word embeddings but does the transformation on the character level.
18
  - For Sentence Transmorgrifier to work, there have to be some common characters between the from and to conversions.
19
- - The model uses a modified form of the [logest common subsequence algorithm](https://en.wikipedia.org/wiki/Longest_common_subsequence_problem) to transform the sentence conversion into a sequence of three types of operations:
20
  1. Match: Pass the character from input to output
21
  2. Drop: Remove the incoming character from the input.
22
  3. Insert: Generate a character and add it to the output.
@@ -31,7 +31,7 @@ license: apache-2.0
31
  - The project has been configured to be able to be used in two different ways.
32
 
33
  ## Shell access
34
- - The transmorgrify.py script can be called directly with arguments specifying an input csv file, what lables are from and to and what to save the resulting model as or to process the input csv to an output. Here is an example:
35
 
36
  ```sh
37
  python transmorgrify.py \
@@ -49,7 +49,7 @@ python transmorgrify.py \
49
  - `--b_header` This indicates the to column
50
  - `--device` This specifies the gpu if you have one or type `cpu` if you do not have a gpu.
51
  - `--model` This indicates where to save the model
52
- - `--verbose` Self explanitory
53
  - `--iterations` This indicates how many catboost iterations should be executed on your input data.
54
  - `--train_percentage` If you are going to use the same file for testing as well as the training, giving a train percentage will only use the percentage specified for training.
55
 
@@ -71,7 +71,7 @@ python transmorgrify.py \
71
  - `--b_header` This indicates the to column. The to column must be specified if `--include_stats` is also specified.
72
  - `--device` This specifies the gpu if you have one or type `cpu` if you do not have a gpu.
73
  - `--model` This indicates where to load the model
74
- - `--verbose` Self explanitory
75
  - `--include_stats` This adds editing distance to the output csv so that you can sort and graph how well the model did. It reports the Levenshtein Distance from input to output before and after transformation and the percent improvement.
76
  - `--out_csv` This indicates where the data should be saved after being processed by the model.
77
  - `--train_percentage` If you are going to use the same file for testing as well as the training, give the same train percentage as was given for training and the execution will only use the remaining data not used for training.
@@ -107,7 +107,7 @@ model -- The filename of the model to load. (default my_model.tm)
107
  ```
108
  - `execute`
109
  ```
110
- Runs the data from from_sentaces. The results are returned
111
  using yield so you need to wrap this in list() if you want
112
  to index it. from_sentences can be an array or a generator.
113
 
@@ -150,4 +150,4 @@ my_model.load( "my_model.tm" )
150
  results = list( my_model.execute( inference_data["from_header"] ) )
151
  ```
152
  # What is the license?
153
- - The licence has been set to apache-2.0 to match catboost so I don't have to think about compatibility issues.
 
13
  ## Sentence Transmorgrifier
14
 
15
  # What is the Sentence Transmorgrifier?
16
+ - The Sentence Transmorgrifier is a framework to make text to text conversion models which uses a categorical gradient boost library, [catboost](https://catboost.ai/), as its back end.
17
  - This library does not use neural net or word embeddings but does the transformation on the character level.
18
  - For Sentence Transmorgrifier to work, there have to be some common characters between the from and to conversions.
19
+ - The model uses a modified form of the [longest common subsequence algorithm](https://en.wikipedia.org/wiki/Longest_common_subsequence_problem) to transform the sentence conversion into a sequence of three types of operations:
20
  1. Match: Pass the character from input to output
21
  2. Drop: Remove the incoming character from the input.
22
  3. Insert: Generate a character and add it to the output.
 
31
  - The project has been configured to be able to be used in two different ways.
32
 
33
  ## Shell access
34
+ - The transmorgrify.py script can be called directly with arguments specifying an input csv file, what labels are from and to and what to save the resulting model as or to process the input csv to an output. Here is an example:
35
 
36
  ```sh
37
  python transmorgrify.py \
 
49
  - `--b_header` This indicates the to column
50
  - `--device` This specifies the gpu if you have one or type `cpu` if you do not have a gpu.
51
  - `--model` This indicates where to save the model
52
+ - `--verbose` Self explanatory
53
  - `--iterations` This indicates how many catboost iterations should be executed on your input data.
54
  - `--train_percentage` If you are going to use the same file for testing as well as the training, giving a train percentage will only use the percentage specified for training.
55
 
 
71
  - `--b_header` This indicates the to column. The to column must be specified if `--include_stats` is also specified.
72
  - `--device` This specifies the gpu if you have one or type `cpu` if you do not have a gpu.
73
  - `--model` This indicates where to load the model
74
+ - `--verbose` Self explanatory
75
  - `--include_stats` This adds editing distance to the output csv so that you can sort and graph how well the model did. It reports the Levenshtein Distance from input to output before and after transformation and the percent improvement.
76
  - `--out_csv` This indicates where the data should be saved after being processed by the model.
77
  - `--train_percentage` If you are going to use the same file for testing as well as the training, give the same train percentage as was given for training and the execution will only use the remaining data not used for training.
 
107
  ```
108
  - `execute`
109
  ```
110
+ Runs the data from from_sentences. The results are returned
111
  using yield so you need to wrap this in list() if you want
112
  to index it. from_sentences can be an array or a generator.
113
 
 
150
  results = list( my_model.execute( inference_data["from_header"] ) )
151
  ```
152
  # What is the license?
153
+ - The license has been set to apache-2.0 to match catboost so I don't have to think about compatibility issues.
transmorgrify.py CHANGED
@@ -36,7 +36,21 @@ class Transmorgrifier:
36
  #and the char model
37
  #slice through where only the action is insert.
38
  insert_indexes = Y['action'] == INSERT_TO
39
- self.char_model = _train_catboost( X[insert_indexes], Y['char'][insert_indexes], iterations, verbose=verbose, device=device, model_piece='char' )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
  self.trailing_context = trailing_context
42
  self.leading_context = leading_context
@@ -52,19 +66,24 @@ class Transmorgrifier:
52
  model -- The pathname to save the model such as "my_model.tm" (default my_model.tm)
53
  """
54
  self.name = model
55
- with zipfile.ZipFile( model, mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=9 ) as myzip:
56
- with myzip.open( 'params.json', mode='w' ) as out:
57
- out.write( json.dumps({
58
  'version': FILE_VERSION,
59
  'leading_context': self.leading_context,
60
  'trailing_context': self.trailing_context,
61
  'iterations': self.iterations,
62
- }).encode())
 
 
 
 
63
  temp_filename = _mktemp()
64
  self.action_model.save_model( temp_filename )
65
- myzip.write( temp_filename, "action.cb" )
66
- self.char_model.save_model( temp_filename )
67
- myzip.write( temp_filename, "char.cb" )
 
68
  os.unlink( temp_filename )
69
 
70
  return self
@@ -78,21 +97,26 @@ class Transmorgrifier:
78
  """
79
  self.name = model
80
  with zipfile.ZipFile( model, mode='r' ) as zip:
81
- with zip.open( 'params.json' ) as fin:
82
- params = json.loads( fin.read().decode() )
83
  if params['version'] > FILE_VERSION: raise Exception( f"Version {params['version']} greater than {FILE_VERSION}" )
84
  self.leading_context = int(params['leading_context'])
85
  self.trailing_context = int(params['trailing_context'])
86
  self.iterations = int(params['iterations'])
87
  temp_filename = _mktemp()
88
- with zip.open( 'action.cb' ) as fin:
89
- with open( temp_filename, "wb" ) as fout:
90
- fout.write( fin.read() )
91
  self.action_model = CatBoostClassifier().load_model( temp_filename )
92
- with zip.open( 'char.cb' ) as fin:
93
- with open( temp_filename, "wb" ) as fout:
94
- fout.write( fin.read() )
95
- self.char_model = CatBoostClassifier().load_model( temp_filename )
 
 
 
 
 
96
 
97
  os.unlink( temp_filename)
98
 
@@ -101,7 +125,7 @@ class Transmorgrifier:
101
 
102
  def execute( self, from_sentences, verbose=False ):
103
  """
104
- Runs the data from from_sentaces. The results are returned
105
  using yield so you need to wrap this in list() if you want
106
  to index it. from_sentences can be an array or a generator.
107
 
@@ -113,6 +137,7 @@ class Transmorgrifier:
113
  yield _do_reconstruct(
114
  action_model=self.action_model,
115
  char_model=self.char_model,
 
116
  text=from_sentence,
117
  num_pre_context_chars=self.leading_context,
118
  num_post_context_chars=self.trailing_context )
@@ -133,15 +158,15 @@ class Transmorgrifier:
133
  demo.launch( share=share )
134
 
135
  def _list_trace( trace ):
136
- if trace.parrent is None:
137
  result = [trace]
138
  else:
139
- result = _list_trace( trace.parrent )
140
  result.append( trace )
141
  return result
142
 
143
  class _edit_trace_hop():
144
- parrent = None
145
  edit_distance = None
146
  char = None
147
  from_row_i = None
@@ -182,7 +207,7 @@ def _trace_edits( from_sentence, to_sentence, print_debug=False ):
182
  #root case.
183
  if from_row_i == 0 and to_column_i == 0:
184
  best_option = _edit_trace_hop()
185
- best_option.parrent = None
186
  best_option.edit_distance = 0
187
  best_option.char = ""
188
  best_option.from_row_i = from_row_i
@@ -193,8 +218,8 @@ def _trace_edits( from_sentence, to_sentence, print_debug=False ):
193
  if to_column_i > 0:
194
  if best_option is None or current_row[to_column_i-1].edit_distance + 1 < best_option.edit_distance:
195
  best_option = _edit_trace_hop()
196
- best_option.parrent = current_row[to_column_i-1]
197
- best_option.edit_distance = best_option.parrent.edit_distance + 1
198
  best_option.char = to_sentence[to_column_i-1]
199
  best_option.from_row_i = from_row_i
200
  best_option.to_column_i = to_column_i
@@ -204,8 +229,8 @@ def _trace_edits( from_sentence, to_sentence, print_debug=False ):
204
  if from_row_i > 0:
205
  if best_option is None or last_row[to_column_i].edit_distance + 1 < best_option.edit_distance:
206
  best_option = _edit_trace_hop()
207
- best_option.parrent = last_row[to_column_i]
208
- best_option.edit_distance = best_option.parrent.edit_distance + 1
209
  best_option.char = from_sentence[from_row_i-1]
210
  best_option.from_row_i = from_row_i
211
  best_option.to_column_i = to_column_i
@@ -216,8 +241,8 @@ def _trace_edits( from_sentence, to_sentence, print_debug=False ):
216
  if to_sentence[to_column_i-1] == from_sentence[from_row_i-1]:
217
  if best_option is None or last_row[to_column_i-1].edit_distance <= best_option.edit_distance: #prefer match so use <= than <
218
  best_option = _edit_trace_hop()
219
- best_option.parrent = last_row[to_column_i-1]
220
- best_option.edit_distance = best_option.parrent.edit_distance + 1
221
  best_option.char = from_sentence[from_row_i-1]
222
  best_option.from_row_i = from_row_i
223
  best_option.to_column_i = to_column_i
@@ -231,8 +256,8 @@ def _trace_edits( from_sentence, to_sentence, print_debug=False ):
231
 
232
  if print_debug:
233
  def print_diffs( current_node ):
234
- if current_node.parrent is not None:
235
- print_diffs( current_node.parrent )
236
 
237
  if current_node.action == START:
238
  print( "start" )
@@ -344,7 +369,7 @@ def _parse_single_for_training( from_sentence, to_sentence, num_pre_context_char
344
  result_split_into_dict['action'] = action_slice
345
  result_split_into_dict['char'] = char_slice
346
 
347
- #now return it as a dataframe.
348
  return pd.DataFrame( context_split_into_dict ), pd.DataFrame( result_split_into_dict )
349
 
350
 
@@ -391,8 +416,8 @@ def _train_catboost( X, y, iterations, device, verbose, model_piece, learning_ra
391
 
392
 
393
  def _mktemp():
394
- #I know mktemp exists in the library but it has been depricated suggesting using
395
- #mkstemp but catboost can't write to a filehandle yet, so I need an actual
396
  #filename.
397
  number = 0
398
  while os.path.exists( f".temp_{number}~" ):
@@ -400,24 +425,21 @@ def _mktemp():
400
  return f".temp_{number}~"
401
 
402
 
403
- def _do_reconstruct( action_model, char_model, text, num_pre_context_chars, num_post_context_chars ):
404
- # result = ""
405
- # for i in range(len(text)):
406
- # pre_context = ( (" " * num_pre_context_chars) + result[max(0,len(result)-num_pre_context_chars):])[-num_pre_context_chars:]
407
- # post_context = (text[i:min(len(text),i+num_post_context_chars)] + (" " * num_post_context_chars))[:num_post_context_chars]
408
- # full_context = pre_context + post_context
409
- # context_as_dictionary = { 'c'+str(c):[full_context[c]] for c in range(len(full_context)) }
410
- # context_as_pd = pd.DataFrame( context_as_dictionary )
411
-
412
- # model_result = model.predict( context_as_pd )[0]
413
-
414
- # if not quite and len( result ) % 500 == 0: print( "%" + str(i*100/len(text))[:4] + " " + result[-100:])
415
-
416
- # if model_result: result += " "
417
- # result += text[i]
418
 
419
- # pass
420
- # return result
421
 
422
  #test for nan.
423
  if text != text: text = ''
@@ -451,7 +473,7 @@ def _do_reconstruct( action_model, char_model, text, num_pre_context_chars, num_
451
  context_as_pd = pd.DataFrame( context_as_dictionary )
452
 
453
  #run the model
454
- action_model_result = action_model.predict( context_as_pd )[0][0]
455
 
456
  #stop run away. If we have added more chars then our context, nothing is going to change.
457
  if action_model_result == INSERT_TO and continuous_added >= num_post_context_chars:
@@ -461,8 +483,11 @@ def _do_reconstruct( action_model, char_model, text, num_pre_context_chars, num_
461
  if action_model_result == START:
462
  pass
463
  elif action_model_result == INSERT_TO:
464
- #for an insert ask the char model what to insert
465
- char_model_result = char_model.predict( context_as_pd )[0][0]
 
 
 
466
 
467
  working_to += char_model_result
468
  continuous_added += 1
@@ -507,7 +532,7 @@ def train( in_csv, a_header, b_header, model, iterations, device, leading_contex
507
  split_index = int( train_percentage/100*len(full_data) )
508
  train_data = full_data.iloc[:split_index,:].reset_index(drop=True)
509
 
510
- if verbose: print( "parcing data for training" )
511
 
512
 
513
  tm = Transmorgrifier()
@@ -645,7 +670,7 @@ def main():
645
  tm = Transmorgrifier()
646
  tm.load( args.model )
647
 
648
- tm.demo( args.share is not None )
649
 
650
 
651
  if __name__ == '__main__':
 
36
  #and the char model
37
  #slice through where only the action is insert.
38
  insert_indexes = Y['action'] == INSERT_TO
39
+
40
+ #if there is only one char to insert, we can't train the second model and need to handle that as a boundary case.
41
+ if Y['char'][insert_indexes].nunique() > 1:
42
+ self.char_model = _train_catboost( X[insert_indexes], Y['char'][insert_indexes], iterations, verbose=verbose, device=device, model_piece='char' )
43
+ self.constant_output = None
44
+ else:
45
+ self.char_model = None
46
+ if Y['char'][insert_indexes].nunique() == 1:
47
+ self.constant_output = Y['char'][insert_indexes].unique()[0]
48
+ else:
49
+ #If there is never an insertion handle it as always inserting a space,
50
+ #because it will never insert, but it handles the boundary case so the saving and loading code works.
51
+ self.constant_output = ' '
52
+
53
+
54
 
55
  self.trailing_context = trailing_context
56
  self.leading_context = leading_context
 
66
  model -- The pathname to save the model such as "my_model.tm" (default my_model.tm)
67
  """
68
  self.name = model
69
+ with zipfile.ZipFile( model, mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=9 ) as my_zip:
70
+ with my_zip.open( 'params.json', mode='w' ) as out:
71
+ params = {
72
  'version': FILE_VERSION,
73
  'leading_context': self.leading_context,
74
  'trailing_context': self.trailing_context,
75
  'iterations': self.iterations,
76
+ }
77
+ if self.constant_output is not None:
78
+ params['constant_output'] = self.constant_output
79
+
80
+ out.write( json.dumps(params).encode())
81
  temp_filename = _mktemp()
82
  self.action_model.save_model( temp_filename )
83
+ my_zip.write( temp_filename, "action.cb" )
84
+ if not self.char_model is None:
85
+ self.char_model.save_model( temp_filename )
86
+ my_zip.write( temp_filename, "char.cb" )
87
  os.unlink( temp_filename )
88
 
89
  return self
 
97
  """
98
  self.name = model
99
  with zipfile.ZipFile( model, mode='r' ) as zip:
100
+ with zip.open( 'params.json' ) as f_in:
101
+ params = json.loads( f_in.read().decode() )
102
  if params['version'] > FILE_VERSION: raise Exception( f"Version {params['version']} greater than {FILE_VERSION}" )
103
  self.leading_context = int(params['leading_context'])
104
  self.trailing_context = int(params['trailing_context'])
105
  self.iterations = int(params['iterations'])
106
  temp_filename = _mktemp()
107
+ with zip.open( 'action.cb' ) as f_in:
108
+ with open( temp_filename, "wb" ) as f_out:
109
+ f_out.write( f_in.read() )
110
  self.action_model = CatBoostClassifier().load_model( temp_filename )
111
+ if 'constant_output' not in params:
112
+ with zip.open( 'char.cb' ) as f_in:
113
+ with open( temp_filename, "wb" ) as f_out:
114
+ f_out.write( f_in.read() )
115
+ self.char_model = CatBoostClassifier().load_model( temp_filename )
116
+ self.constant_output = None
117
+ else:
118
+ self.constant_output = params['constant_output']
119
+ self.char_model = None
120
 
121
  os.unlink( temp_filename)
122
 
 
125
 
126
  def execute( self, from_sentences, verbose=False ):
127
  """
128
+ Runs the data from from_sentences. The results are returned
129
  using yield so you need to wrap this in list() if you want
130
  to index it. from_sentences can be an array or a generator.
131
 
 
137
  yield _do_reconstruct(
138
  action_model=self.action_model,
139
  char_model=self.char_model,
140
+ constant_output=self.constant_output,
141
  text=from_sentence,
142
  num_pre_context_chars=self.leading_context,
143
  num_post_context_chars=self.trailing_context )
 
158
  demo.launch( share=share )
159
 
160
  def _list_trace( trace ):
161
+ if trace.parent is None:
162
  result = [trace]
163
  else:
164
+ result = _list_trace( trace.parent )
165
  result.append( trace )
166
  return result
167
 
168
  class _edit_trace_hop():
169
+ parent = None
170
  edit_distance = None
171
  char = None
172
  from_row_i = None
 
207
  #root case.
208
  if from_row_i == 0 and to_column_i == 0:
209
  best_option = _edit_trace_hop()
210
+ best_option.parent = None
211
  best_option.edit_distance = 0
212
  best_option.char = ""
213
  best_option.from_row_i = from_row_i
 
218
  if to_column_i > 0:
219
  if best_option is None or current_row[to_column_i-1].edit_distance + 1 < best_option.edit_distance:
220
  best_option = _edit_trace_hop()
221
+ best_option.parent = current_row[to_column_i-1]
222
+ best_option.edit_distance = best_option.parent.edit_distance + 1
223
  best_option.char = to_sentence[to_column_i-1]
224
  best_option.from_row_i = from_row_i
225
  best_option.to_column_i = to_column_i
 
229
  if from_row_i > 0:
230
  if best_option is None or last_row[to_column_i].edit_distance + 1 < best_option.edit_distance:
231
  best_option = _edit_trace_hop()
232
+ best_option.parent = last_row[to_column_i]
233
+ best_option.edit_distance = best_option.parent.edit_distance + 1
234
  best_option.char = from_sentence[from_row_i-1]
235
  best_option.from_row_i = from_row_i
236
  best_option.to_column_i = to_column_i
 
241
  if to_sentence[to_column_i-1] == from_sentence[from_row_i-1]:
242
  if best_option is None or last_row[to_column_i-1].edit_distance <= best_option.edit_distance: #prefer match so use <= than <
243
  best_option = _edit_trace_hop()
244
+ best_option.parent = last_row[to_column_i-1]
245
+ best_option.edit_distance = best_option.parent.edit_distance + 1
246
  best_option.char = from_sentence[from_row_i-1]
247
  best_option.from_row_i = from_row_i
248
  best_option.to_column_i = to_column_i
 
256
 
257
  if print_debug:
258
  def print_diffs( current_node ):
259
+ if current_node.parent is not None:
260
+ print_diffs( current_node.parent )
261
 
262
  if current_node.action == START:
263
  print( "start" )
 
369
  result_split_into_dict['action'] = action_slice
370
  result_split_into_dict['char'] = char_slice
371
 
372
+ #now return it as a data_frame.
373
  return pd.DataFrame( context_split_into_dict ), pd.DataFrame( result_split_into_dict )
374
 
375
 
 
416
 
417
 
418
  def _mktemp():
419
+ #I know mktemp exists in the library but it has been deprecated suggesting using
420
+ #mkstemp but catboost can't write to a file handle yet, so I need an actual
421
  #filename.
422
  number = 0
423
  while os.path.exists( f".temp_{number}~" ):
 
425
  return f".temp_{number}~"
426
 
427
 
428
def predict_wrapper( model, model_input ):
    """Return the model's first prediction as a plain scalar.

    CatBoost has shown itself to be unstable about whether predict()
    yields a single value or a single-element array per row.  This was
    traced back to the saved model, and the saved model to the data it
    was trained on, but the exact trigger in the data could not be
    identified — so unwrap one level here so callers get a scalar
    either way.

    Keyword arguments:
    model -- a fitted CatBoost model (or anything exposing predict())
    model_input -- the single-row feature frame for one context window
    """
    result = model.predict( model_input )[0]
    try:
        # Array-like results (e.g. numpy arrays/lists) unwrap one level;
        # true scalars raise and pass through unchanged.  Only catch the
        # exceptions subscripting can produce — a bare except here would
        # also swallow KeyboardInterrupt and real bugs.
        result = result[0]
    except (TypeError, IndexError):
        pass
    return result
 
 
441
 
442
+ def _do_reconstruct( action_model, char_model, constant_output, text, num_pre_context_chars, num_post_context_chars ):
 
443
 
444
  #test for nan.
445
  if text != text: text = ''
 
473
  context_as_pd = pd.DataFrame( context_as_dictionary )
474
 
475
  #run the model
476
+ action_model_result = predict_wrapper(action_model,context_as_pd )
477
 
478
  #stop run away. If we have added more chars then our context, nothing is going to change.
479
  if action_model_result == INSERT_TO and continuous_added >= num_post_context_chars:
 
483
  if action_model_result == START:
484
  pass
485
  elif action_model_result == INSERT_TO:
486
+ if constant_output is None:
487
+ #for an insert ask the char model what to insert
488
+ char_model_result = predict_wrapper(char_model, context_as_pd )
489
+ else:
490
+ char_model_result = constant_output
491
 
492
  working_to += char_model_result
493
  continuous_added += 1
 
532
  split_index = int( train_percentage/100*len(full_data) )
533
  train_data = full_data.iloc[:split_index,:].reset_index(drop=True)
534
 
535
+ if verbose: print( "parsing data for training" )
536
 
537
 
538
  tm = Transmorgrifier()
 
670
  tm = Transmorgrifier()
671
  tm.load( args.model )
672
 
673
+ tm.demo( share=args.share )
674
 
675
 
676
  if __name__ == '__main__':