Spaces:
Runtime error
Runtime error
Joshua Lansford
committed on
Commit
·
2f7253d
1
Parent(s):
a880e5e
Fix Target contains only one unique value error (Issue #1)
Browse files- .vscode/launch.json +237 -40
- README.md +7 -7
- transmorgrify.py +80 -55
.vscode/launch.json
CHANGED
@@ -12,6 +12,221 @@
|
|
12 |
"program": "${file}",
|
13 |
"console": "integratedTerminal",
|
14 |
"justMyCode": true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
},{
|
16 |
"name": "Train phonetic 4000 gpu",
|
17 |
"type": "python",
|
@@ -21,29 +236,14 @@
|
|
21 |
"justMyCode": true,
|
22 |
"args": [
|
23 |
"--train",
|
24 |
-
"--in_csv", "
|
25 |
"--a_header", "English",
|
26 |
"--b_header", "Phonetic",
|
27 |
"--device", "0:1",
|
28 |
"--model", "phonetics_forward.tm"
|
29 |
]
|
30 |
-
},{
|
31 |
-
"name": "Train reverse phonetic 4000 gpu",
|
32 |
-
"type": "python",
|
33 |
-
"request": "launch",
|
34 |
-
"program": "transmorgrify.py",
|
35 |
-
"console": "integratedTerminal",
|
36 |
-
"justMyCode": true,
|
37 |
-
"args": [
|
38 |
-
"--train",
|
39 |
-
"--in_csv", "/home/lansford/Sync/projects/tf_over/sentence_transmogrifier/examples/phonetic/phonetic.csv",
|
40 |
-
"--b_header", "English",
|
41 |
-
"--a_header", "Phonetic",
|
42 |
-
"--device", "0:1",
|
43 |
-
"--model", "phonetics_backwards.tm"
|
44 |
-
]
|
45 |
},{
|
46 |
-
"name": "Train
|
47 |
"type": "python",
|
48 |
"request": "launch",
|
49 |
"program": "transmorgrify.py",
|
@@ -51,31 +251,30 @@
|
|
51 |
"justMyCode": true,
|
52 |
"args": [
|
53 |
"--train",
|
54 |
-
"--in_csv", "
|
55 |
-
"--
|
56 |
-
"--
|
57 |
"--device", "0:1",
|
58 |
-
"--model", "
|
59 |
]
|
60 |
},{
|
61 |
-
"name": "
|
62 |
"type": "python",
|
63 |
"request": "launch",
|
64 |
"program": "transmorgrify.py",
|
65 |
"console": "integratedTerminal",
|
66 |
"justMyCode": true,
|
67 |
"args": [
|
68 |
-
"--
|
69 |
-
"--in_csv", "
|
70 |
-
"--out_csv", "./phonetic_out.csv",
|
71 |
"--a_header", "English",
|
72 |
"--b_header", "Phonetic",
|
73 |
"--device", "0:1",
|
74 |
-
"
|
75 |
-
"--
|
76 |
]
|
77 |
},{
|
78 |
-
"name": "
|
79 |
"type": "python",
|
80 |
"request": "launch",
|
81 |
"program": "transmorgrify.py",
|
@@ -83,16 +282,13 @@
|
|
83 |
"justMyCode": true,
|
84 |
"args": [
|
85 |
"--execute",
|
86 |
-
"--in_csv", "
|
87 |
-
"--
|
88 |
-
"--
|
89 |
-
"--b_header", "Phonetic",
|
90 |
-
"--model", "phonetics_forward.tm",
|
91 |
"--verbose",
|
92 |
-
"--include_stats",
|
93 |
]
|
94 |
},{
|
95 |
-
"name": "
|
96 |
"type": "python",
|
97 |
"request": "launch",
|
98 |
"program": "transmorgrify.py",
|
@@ -100,11 +296,12 @@
|
|
100 |
"justMyCode": true,
|
101 |
"args": [
|
102 |
"--execute",
|
103 |
-
"--in_csv", "
|
104 |
-
"--out_csv", "./
|
105 |
-
"--
|
106 |
-
"--
|
107 |
-
"--model", "
|
|
|
108 |
"--verbose",
|
109 |
"--include_stats",
|
110 |
]
|
|
|
12 |
"program": "${file}",
|
13 |
"console": "integratedTerminal",
|
14 |
"justMyCode": true
|
15 |
+
},{
|
16 |
+
"name": "Train \"latin\" 4000",
|
17 |
+
"type": "python",
|
18 |
+
"request": "launch",
|
19 |
+
"program": "transmorgrify.py",
|
20 |
+
"console": "integratedTerminal",
|
21 |
+
"justMyCode": true,
|
22 |
+
"args": [
|
23 |
+
"--train",
|
24 |
+
"--in_csv", "./examples/latin/training_latin.txt",
|
25 |
+
"--a_header", "original",
|
26 |
+
"--b_header", "split",
|
27 |
+
"--device", "0:1",
|
28 |
+
"--model", "latin_forward.tm",
|
29 |
+
"--train_percentage", "50",
|
30 |
+
]
|
31 |
+
},{
|
32 |
+
"name": "Train \"latin\" 100",
|
33 |
+
"type": "python",
|
34 |
+
"request": "launch",
|
35 |
+
"program": "transmorgrify.py",
|
36 |
+
"console": "integratedTerminal",
|
37 |
+
"justMyCode": true,
|
38 |
+
"args": [
|
39 |
+
"--train",
|
40 |
+
"--in_csv", "./examples/latin/training_latin.txt",
|
41 |
+
"--a_header", "original",
|
42 |
+
"--b_header", "split",
|
43 |
+
"--device", "0:1",
|
44 |
+
"--model", "latin_forward_100.tm",
|
45 |
+
"-n", "100",
|
46 |
+
"--train_percentage", "50",
|
47 |
+
]
|
48 |
+
},{
|
49 |
+
"name": "Train \"latin\" 10000",
|
50 |
+
"type": "python",
|
51 |
+
"request": "launch",
|
52 |
+
"program": "transmorgrify.py",
|
53 |
+
"console": "integratedTerminal",
|
54 |
+
"justMyCode": true,
|
55 |
+
"args": [
|
56 |
+
"--train",
|
57 |
+
"--in_csv", "./examples/latin/training_latin.txt",
|
58 |
+
"--a_header", "original",
|
59 |
+
"--b_header", "split",
|
60 |
+
"--device", "0:1",
|
61 |
+
"--model", "latin_forward_10000.tm",
|
62 |
+
"-n", "10000",
|
63 |
+
"--train_percentage", "50",
|
64 |
+
]
|
65 |
+
},{
|
66 |
+
"name": "Execute \"latin\" 4000",
|
67 |
+
"type": "python",
|
68 |
+
"request": "launch",
|
69 |
+
"program": "transmorgrify.py",
|
70 |
+
"console": "integratedTerminal",
|
71 |
+
"justMyCode": true,
|
72 |
+
"args": [
|
73 |
+
"--execute",
|
74 |
+
"--in_csv", "./examples/latin/training_latin.txt",
|
75 |
+
"--a_header", "original",
|
76 |
+
"--b_header", "split",
|
77 |
+
"--device", "0:1",
|
78 |
+
"--model", "latin_forward.tm",
|
79 |
+
"--verbose",
|
80 |
+
"--include_stats",
|
81 |
+
"--out_csv", "latin_4000.csv",
|
82 |
+
"--train_percentage", "50",
|
83 |
+
]
|
84 |
+
},{
|
85 |
+
"name": "Execute \"latin\" 100",
|
86 |
+
"type": "python",
|
87 |
+
"request": "launch",
|
88 |
+
"program": "transmorgrify.py",
|
89 |
+
"console": "integratedTerminal",
|
90 |
+
"justMyCode": true,
|
91 |
+
"args": [
|
92 |
+
"--execute",
|
93 |
+
"--in_csv", "./examples/latin/training_latin.txt",
|
94 |
+
"--a_header", "original",
|
95 |
+
"--b_header", "split",
|
96 |
+
"--device", "0:1",
|
97 |
+
"--model", "latin_forward_100.tm",
|
98 |
+
"--verbose",
|
99 |
+
"--include_stats",
|
100 |
+
"--out_csv", "latin_100.csv",
|
101 |
+
"--train_percentage", "50",
|
102 |
+
]
|
103 |
+
},{
|
104 |
+
"name": "Execute \"latin\" 10000",
|
105 |
+
"type": "python",
|
106 |
+
"request": "launch",
|
107 |
+
"program": "transmorgrify.py",
|
108 |
+
"console": "integratedTerminal",
|
109 |
+
"justMyCode": true,
|
110 |
+
"args": [
|
111 |
+
"--execute",
|
112 |
+
"--in_csv", "./examples/latin/training_latin.txt",
|
113 |
+
"--a_header", "original",
|
114 |
+
"--b_header", "split",
|
115 |
+
"--device", "0:1",
|
116 |
+
"--model", "latin_forward_10000.tm",
|
117 |
+
"--verbose",
|
118 |
+
"--include_stats",
|
119 |
+
"--out_csv", "latin_10000.csv",
|
120 |
+
"--train_percentage", "50",
|
121 |
+
]
|
122 |
+
},{
|
123 |
+
"name": "Demo \"latin\" 4000",
|
124 |
+
"type": "python",
|
125 |
+
"request": "launch",
|
126 |
+
"program": "transmorgrify.py",
|
127 |
+
"console": "integratedTerminal",
|
128 |
+
"justMyCode": true,
|
129 |
+
"args": [
|
130 |
+
"--gradio",
|
131 |
+
"--in_csv", "./examples/latin/training_latin.txt",
|
132 |
+
"--a_header", "original",
|
133 |
+
//"--b_header", "split",
|
134 |
+
//"--device", "0:1",
|
135 |
+
"--model", "latin_forward.tm"
|
136 |
+
]
|
137 |
+
},{
|
138 |
+
"name": "Demo \"latin\" 100",
|
139 |
+
"type": "python",
|
140 |
+
"request": "launch",
|
141 |
+
"program": "transmorgrify.py",
|
142 |
+
"console": "integratedTerminal",
|
143 |
+
"justMyCode": true,
|
144 |
+
"args": [
|
145 |
+
"--gradio",
|
146 |
+
"--in_csv", "./examples/latin/training_latin.txt",
|
147 |
+
"--a_header", "original",
|
148 |
+
//"--b_header", "split",
|
149 |
+
//"--device", "0:1",
|
150 |
+
"--model", "latin_forward_100.tm"
|
151 |
+
]
|
152 |
+
},{
|
153 |
+
"name": "Demo \"latin\" 10000",
|
154 |
+
"type": "python",
|
155 |
+
"request": "launch",
|
156 |
+
"program": "transmorgrify.py",
|
157 |
+
"console": "integratedTerminal",
|
158 |
+
"justMyCode": true,
|
159 |
+
"args": [
|
160 |
+
"--gradio",
|
161 |
+
"--in_csv", "./examples/latin/training_latin.txt",
|
162 |
+
"--a_header", "original",
|
163 |
+
//"--b_header", "split",
|
164 |
+
//"--device", "0:1",
|
165 |
+
"--model", "latin_forward_10000.tm"
|
166 |
+
]
|
167 |
+
},{
|
168 |
+
"name": "Train \"latin\" mod",
|
169 |
+
"type": "python",
|
170 |
+
"request": "launch",
|
171 |
+
"program": "transmorgrify.py",
|
172 |
+
"console": "integratedTerminal",
|
173 |
+
"justMyCode": true,
|
174 |
+
"args": [
|
175 |
+
"--train",
|
176 |
+
"--in_csv", "./examples/latin/training_latin_mod.csv",
|
177 |
+
"--a_header", "original",
|
178 |
+
"--b_header", "split",
|
179 |
+
"--device", "0:1",
|
180 |
+
"--model", "latin_mod_forward.tm"
|
181 |
+
]
|
182 |
+
},{
|
183 |
+
"name": "Execute \"latin\" mod",
|
184 |
+
"type": "python",
|
185 |
+
"request": "launch",
|
186 |
+
"program": "transmorgrify.py",
|
187 |
+
"console": "integratedTerminal",
|
188 |
+
"justMyCode": true,
|
189 |
+
"args": [
|
190 |
+
"--execute",
|
191 |
+
"--in_csv", "./examples/latin/training_latin_mod.csv",
|
192 |
+
"--a_header", "original",
|
193 |
+
//"--b_header", "split",
|
194 |
+
"--device", "0:1",
|
195 |
+
"--model", "latin_mod_forward.tm",
|
196 |
+
"--out_csv", "latin_mod_out.csv",
|
197 |
+
]
|
198 |
+
},{
|
199 |
+
"name": "Execute mixup",
|
200 |
+
"type": "python",
|
201 |
+
"request": "launch",
|
202 |
+
"program": "transmorgrify.py",
|
203 |
+
"console": "integratedTerminal",
|
204 |
+
"justMyCode": true,
|
205 |
+
"args": [
|
206 |
+
"--execute",
|
207 |
+
"--in_csv", "./examples/latin/training_latin_mod.csv",
|
208 |
+
"--a_header", "original",
|
209 |
+
//"--b_header", "split",
|
210 |
+
"--device", "0:1",
|
211 |
+
//"--model", "latin_mod_forward.tm",
|
212 |
+
"--model", "phonetics_small.tm",
|
213 |
+
"--out_csv", "mixum_out.csv",
|
214 |
+
]
|
215 |
+
},{
|
216 |
+
"name": "Demo \"latin\" mod",
|
217 |
+
"type": "python",
|
218 |
+
"request": "launch",
|
219 |
+
"program": "transmorgrify.py",
|
220 |
+
"console": "integratedTerminal",
|
221 |
+
"justMyCode": true,
|
222 |
+
"args": [
|
223 |
+
"--gradio",
|
224 |
+
"--in_csv", "./examples/latin/training_latin_mod.csv",
|
225 |
+
"--a_header", "original",
|
226 |
+
//"--b_header", "split",
|
227 |
+
//"--device", "0:1",
|
228 |
+
"--model", "latin_mod_forward.tm"
|
229 |
+
]
|
230 |
},{
|
231 |
"name": "Train phonetic 4000 gpu",
|
232 |
"type": "python",
|
|
|
236 |
"justMyCode": true,
|
237 |
"args": [
|
238 |
"--train",
|
239 |
+
"--in_csv", "./examples/phonetic/phonetic.csv",
|
240 |
"--a_header", "English",
|
241 |
"--b_header", "Phonetic",
|
242 |
"--device", "0:1",
|
243 |
"--model", "phonetics_forward.tm"
|
244 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
245 |
},{
|
246 |
+
"name": "Train reverse phonetic 4000 gpu",
|
247 |
"type": "python",
|
248 |
"request": "launch",
|
249 |
"program": "transmorgrify.py",
|
|
|
251 |
"justMyCode": true,
|
252 |
"args": [
|
253 |
"--train",
|
254 |
+
"--in_csv", "./examples/phonetic/phonetic.csv",
|
255 |
+
"--b_header", "English",
|
256 |
+
"--a_header", "Phonetic",
|
257 |
"--device", "0:1",
|
258 |
+
"--model", "phonetics_backwards.tm"
|
259 |
]
|
260 |
},{
|
261 |
+
"name": "Train short phonetic 4000 gpu",
|
262 |
"type": "python",
|
263 |
"request": "launch",
|
264 |
"program": "transmorgrify.py",
|
265 |
"console": "integratedTerminal",
|
266 |
"justMyCode": true,
|
267 |
"args": [
|
268 |
+
"--train",
|
269 |
+
"--in_csv", "./examples/phonetic/phonetic_short.csv",
|
|
|
270 |
"--a_header", "English",
|
271 |
"--b_header", "Phonetic",
|
272 |
"--device", "0:1",
|
273 |
+
"-n", "100",
|
274 |
+
"--model", "phonetics_small.tm"
|
275 |
]
|
276 |
},{
|
277 |
+
"name": "Execute phonetic gpu",
|
278 |
"type": "python",
|
279 |
"request": "launch",
|
280 |
"program": "transmorgrify.py",
|
|
|
282 |
"justMyCode": true,
|
283 |
"args": [
|
284 |
"--execute",
|
285 |
+
"--in_csv", "./examples/phonetic/phonetic.csv",
|
286 |
+
"--a_header", "original",
|
287 |
+
//"--b_header", "split",
|
|
|
|
|
288 |
"--verbose",
|
|
|
289 |
]
|
290 |
},{
|
291 |
+
"name": "Execute short phonetic",
|
292 |
"type": "python",
|
293 |
"request": "launch",
|
294 |
"program": "transmorgrify.py",
|
|
|
296 |
"justMyCode": true,
|
297 |
"args": [
|
298 |
"--execute",
|
299 |
+
"--in_csv", "./examples/phonetic/phonetic_short.csv",
|
300 |
+
"--out_csv", "./phonetic_out.csv",
|
301 |
+
"--a_header", "English",
|
302 |
+
"--b_header", "Phonetic",
|
303 |
+
"--model", "phonetics_small.tm",
|
304 |
+
"--device", "0:1",
|
305 |
"--verbose",
|
306 |
"--include_stats",
|
307 |
]
|
README.md
CHANGED
@@ -13,10 +13,10 @@ license: apache-2.0
|
|
13 |
## Sentence Transmorgrifier
|
14 |
|
15 |
# What is the Sentence Transmorgrifier?
|
16 |
-
- The Sentence Transmorgrifier is a framework to make text to text conversion models which uses a categorical
|
17 |
- This library does not use neural net or word embeddings but does the transformation on the character level.
|
18 |
- For Sentence Transmorgrifier to work, there have to be some common characters between the from and to sides of the conversion.
|
19 |
-
- The model uses a modified form of the [
|
20 |
1. Match: Pass the character from input to output
|
21 |
2. Drop: Remove the incoming character from the input.
|
22 |
3. Insert: Generate a character and add it to the output.
|
@@ -31,7 +31,7 @@ license: apache-2.0
|
|
31 |
- The project has been configured to be able to be used in two different ways.
|
32 |
|
33 |
## Shell access
|
34 |
-
- The transmorgrify.py script can be called directly with arguments specifying an input csv file, what
|
35 |
|
36 |
```sh
|
37 |
python transmorgrify.py \
|
@@ -49,7 +49,7 @@ python transmorgrify.py \
|
|
49 |
- `--b_header` This indicates the to column
|
50 |
- `--device` This specifies the gpu if you have one or type `cpu` if you do not have a gpu.
|
51 |
- `--model` This indicates where to save the model
|
52 |
-
- `--verbose` Self
|
53 |
- `--iterations` This indicates how many catboost iterations should be executed on your input data.
|
54 |
- `--train_percentage` If you are going to use the same file for testing as well as the training, giving a train percentage will only use the percentage specified for training.
|
55 |
|
@@ -71,7 +71,7 @@ python transmorgrify.py \
|
|
71 |
- `--b_header` This indicates the to column. The to column must be specified if `--include_stats` is also specified.
|
72 |
- `--device` This specifies the gpu if you have one or type `cpu` if you do not have a gpu.
|
73 |
- `--model` This indicates where to load the model
|
74 |
-
- `--verbose` Self
|
75 |
- `--include_stats` This adds editing distance to the output csv so that you can sort and graph how well the model did. It reports the Levenshtein Distance from input to output before and after transformation and the percent improvement.
|
76 |
- `--out_csv` This indicates where the data should be saved after being processed by the model.
|
77 |
- `--train_percentage` If you are going to use the same file for testing as well as the training, give the same train percentage as was given for training and the execution will only use the remaining data not used for training.
|
@@ -107,7 +107,7 @@ model -- The filename of the model to load. (default my_model.tm)
|
|
107 |
```
|
108 |
- `execute`
|
109 |
```
|
110 |
-
Runs the data from
|
111 |
using yield so you need to wrap this in list() if you want
|
112 |
to index it. from_sentences can be an array or a generator.
|
113 |
|
@@ -150,4 +150,4 @@ my_model.load( "my_model.tm" )
|
|
150 |
results = list( my_model.execute( inference_data["from_header"] ) )
|
151 |
```
|
152 |
# What is the license?
|
153 |
-
- The
|
|
|
13 |
## Sentence Transmorgrifier
|
14 |
|
15 |
# What is the Sentence Transmorgrifier?
|
16 |
+
- The Sentence Transmorgrifier is a framework to make text to text conversion models which uses a categorical gradient boost library, [catboost](https://catboost.ai/), as its back end.
|
17 |
- This library does not use neural net or word embeddings but does the transformation on the character level.
|
18 |
- For Sentence Transmorgrifier to work, there have to be some common characters between the from and to sides of the conversion.
|
19 |
+
- The model uses a modified form of the [longest common subsequence algorithm](https://en.wikipedia.org/wiki/Longest_common_subsequence_problem) to transform the sentence conversion into a sequence of three types of operations:
|
20 |
1. Match: Pass the character from input to output
|
21 |
2. Drop: Remove the incoming character from the input.
|
22 |
3. Insert: Generate a character and add it to the output.
|
|
|
31 |
- The project has been configured to be able to be used in two different ways.
|
32 |
|
33 |
## Shell access
|
34 |
+
- The transmorgrify.py script can be called directly with arguments specifying an input csv file, what labels are from and to and what to save the resulting model as or to process the input csv to an output. Here is an example:
|
35 |
|
36 |
```sh
|
37 |
python transmorgrify.py \
|
|
|
49 |
- `--b_header` This indicates the to column
|
50 |
- `--device` This specifies the gpu if you have one or type `cpu` if you do not have a gpu.
|
51 |
- `--model` This indicates where to save the model
|
52 |
+
- `--verbose` Self explanatory
|
53 |
- `--iterations` This indicates how many catboost iterations should be executed on your input data.
|
54 |
- `--train_percentage` If you are going to use the same file for testing as well as the training, giving a train percentage will only use the percentage specified for training.
|
55 |
|
|
|
71 |
- `--b_header` This indicates the to column. The to column must be specified if `--include_stats` is also specified.
|
72 |
- `--device` This specifies the gpu if you have one or type `cpu` if you do not have a gpu.
|
73 |
- `--model` This indicates where to load the model
|
74 |
+
- `--verbose` Self explanatory
|
75 |
- `--include_stats` This adds editing distance to the output csv so that you can sort and graph how well the model did. It reports the Levenshtein Distance from input to output before and after transformation and the percent improvement.
|
76 |
- `--out_csv` This indicates where the data should be saved after being processed by the model.
|
77 |
- `--train_percentage` If you are going to use the same file for testing as well as the training, give the same train percentage as was given for training and the execution will only use the remaining data not used for training.
|
|
|
107 |
```
|
108 |
- `execute`
|
109 |
```
|
110 |
+
Runs the data from from_sentences. The results are returned
|
111 |
using yield so you need to wrap this in list() if you want
|
112 |
to index it. from_sentences can be an array or a generator.
|
113 |
|
|
|
150 |
results = list( my_model.execute( inference_data["from_header"] ) )
|
151 |
```
|
152 |
# What is the license?
|
153 |
+
- The license has been set to apache-2.0 to match catboost so I don't have to think about compatibility issues.
|
transmorgrify.py
CHANGED
@@ -36,7 +36,21 @@ class Transmorgrifier:
|
|
36 |
#and the char model
|
37 |
#slice through where only the action is insert.
|
38 |
insert_indexes = Y['action'] == INSERT_TO
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
self.trailing_context = trailing_context
|
42 |
self.leading_context = leading_context
|
@@ -52,19 +66,24 @@ class Transmorgrifier:
|
|
52 |
model -- The pathname to save the model such as "my_model.tm" (default my_model.tm)
|
53 |
"""
|
54 |
self.name = model
|
55 |
-
with zipfile.ZipFile( model, mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=9 ) as
|
56 |
-
with
|
57 |
-
|
58 |
'version': FILE_VERSION,
|
59 |
'leading_context': self.leading_context,
|
60 |
'trailing_context': self.trailing_context,
|
61 |
'iterations': self.iterations,
|
62 |
-
}
|
|
|
|
|
|
|
|
|
63 |
temp_filename = _mktemp()
|
64 |
self.action_model.save_model( temp_filename )
|
65 |
-
|
66 |
-
self.char_model
|
67 |
-
|
|
|
68 |
os.unlink( temp_filename )
|
69 |
|
70 |
return self
|
@@ -78,21 +97,26 @@ class Transmorgrifier:
|
|
78 |
"""
|
79 |
self.name = model
|
80 |
with zipfile.ZipFile( model, mode='r' ) as zip:
|
81 |
-
with zip.open( 'params.json' ) as
|
82 |
-
params = json.loads(
|
83 |
if params['version'] > FILE_VERSION: raise Exception( f"Version {params['version']} greater than {FILE_VERSION}" )
|
84 |
self.leading_context = int(params['leading_context'])
|
85 |
self.trailing_context = int(params['trailing_context'])
|
86 |
self.iterations = int(params['iterations'])
|
87 |
temp_filename = _mktemp()
|
88 |
-
with zip.open( 'action.cb' ) as
|
89 |
-
with open( temp_filename, "wb" ) as
|
90 |
-
|
91 |
self.action_model = CatBoostClassifier().load_model( temp_filename )
|
92 |
-
|
93 |
-
with open(
|
94 |
-
|
95 |
-
|
|
|
|
|
|
|
|
|
|
|
96 |
|
97 |
os.unlink( temp_filename)
|
98 |
|
@@ -101,7 +125,7 @@ class Transmorgrifier:
|
|
101 |
|
102 |
def execute( self, from_sentences, verbose=False ):
|
103 |
"""
|
104 |
-
Runs the data from
|
105 |
using yield so you need to wrap this in list() if you want
|
106 |
to index it. from_sentences can be an array or a generator.
|
107 |
|
@@ -113,6 +137,7 @@ class Transmorgrifier:
|
|
113 |
yield _do_reconstruct(
|
114 |
action_model=self.action_model,
|
115 |
char_model=self.char_model,
|
|
|
116 |
text=from_sentence,
|
117 |
num_pre_context_chars=self.leading_context,
|
118 |
num_post_context_chars=self.trailing_context )
|
@@ -133,15 +158,15 @@ class Transmorgrifier:
|
|
133 |
demo.launch( share=share )
|
134 |
|
135 |
def _list_trace( trace ):
|
136 |
-
if trace.
|
137 |
result = [trace]
|
138 |
else:
|
139 |
-
result = _list_trace( trace.
|
140 |
result.append( trace )
|
141 |
return result
|
142 |
|
143 |
class _edit_trace_hop():
|
144 |
-
|
145 |
edit_distance = None
|
146 |
char = None
|
147 |
from_row_i = None
|
@@ -182,7 +207,7 @@ def _trace_edits( from_sentence, to_sentence, print_debug=False ):
|
|
182 |
#root case.
|
183 |
if from_row_i == 0 and to_column_i == 0:
|
184 |
best_option = _edit_trace_hop()
|
185 |
-
best_option.
|
186 |
best_option.edit_distance = 0
|
187 |
best_option.char = ""
|
188 |
best_option.from_row_i = from_row_i
|
@@ -193,8 +218,8 @@ def _trace_edits( from_sentence, to_sentence, print_debug=False ):
|
|
193 |
if to_column_i > 0:
|
194 |
if best_option is None or current_row[to_column_i-1].edit_distance + 1 < best_option.edit_distance:
|
195 |
best_option = _edit_trace_hop()
|
196 |
-
best_option.
|
197 |
-
best_option.edit_distance = best_option.
|
198 |
best_option.char = to_sentence[to_column_i-1]
|
199 |
best_option.from_row_i = from_row_i
|
200 |
best_option.to_column_i = to_column_i
|
@@ -204,8 +229,8 @@ def _trace_edits( from_sentence, to_sentence, print_debug=False ):
|
|
204 |
if from_row_i > 0:
|
205 |
if best_option is None or last_row[to_column_i].edit_distance + 1 < best_option.edit_distance:
|
206 |
best_option = _edit_trace_hop()
|
207 |
-
best_option.
|
208 |
-
best_option.edit_distance = best_option.
|
209 |
best_option.char = from_sentence[from_row_i-1]
|
210 |
best_option.from_row_i = from_row_i
|
211 |
best_option.to_column_i = to_column_i
|
@@ -216,8 +241,8 @@ def _trace_edits( from_sentence, to_sentence, print_debug=False ):
|
|
216 |
if to_sentence[to_column_i-1] == from_sentence[from_row_i-1]:
|
217 |
if best_option is None or last_row[to_column_i-1].edit_distance <= best_option.edit_distance: #prefer match so use <= than <
|
218 |
best_option = _edit_trace_hop()
|
219 |
-
best_option.
|
220 |
-
best_option.edit_distance = best_option.
|
221 |
best_option.char = from_sentence[from_row_i-1]
|
222 |
best_option.from_row_i = from_row_i
|
223 |
best_option.to_column_i = to_column_i
|
@@ -231,8 +256,8 @@ def _trace_edits( from_sentence, to_sentence, print_debug=False ):
|
|
231 |
|
232 |
if print_debug:
|
233 |
def print_diffs( current_node ):
|
234 |
-
if current_node.
|
235 |
-
print_diffs( current_node.
|
236 |
|
237 |
if current_node.action == START:
|
238 |
print( "start" )
|
@@ -344,7 +369,7 @@ def _parse_single_for_training( from_sentence, to_sentence, num_pre_context_char
|
|
344 |
result_split_into_dict['action'] = action_slice
|
345 |
result_split_into_dict['char'] = char_slice
|
346 |
|
347 |
-
#now return it as a
|
348 |
return pd.DataFrame( context_split_into_dict ), pd.DataFrame( result_split_into_dict )
|
349 |
|
350 |
|
@@ -391,8 +416,8 @@ def _train_catboost( X, y, iterations, device, verbose, model_piece, learning_ra
|
|
391 |
|
392 |
|
393 |
def _mktemp():
|
394 |
-
#I know mktemp exists in the library but it has been
|
395 |
-
#mkstemp but catboost can't write to a
|
396 |
#filename.
|
397 |
number = 0
|
398 |
while os.path.exists( f".temp_{number}~" ):
|
@@ -400,24 +425,21 @@ def _mktemp():
|
|
400 |
return f".temp_{number}~"
|
401 |
|
402 |
|
403 |
-
def
|
404 |
-
#
|
405 |
-
#
|
406 |
-
#
|
407 |
-
#
|
408 |
-
#
|
409 |
-
#
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
# if model_result: result += " "
|
417 |
-
# result += text[i]
|
418 |
|
419 |
-
|
420 |
-
# return result
|
421 |
|
422 |
#test for nan.
|
423 |
if text != text: text = ''
|
@@ -451,7 +473,7 @@ def _do_reconstruct( action_model, char_model, text, num_pre_context_chars, num_
|
|
451 |
context_as_pd = pd.DataFrame( context_as_dictionary )
|
452 |
|
453 |
#run the model
|
454 |
-
action_model_result = action_model
|
455 |
|
456 |
#stop runaway. If we have added more chars than our context, nothing is going to change.
|
457 |
if action_model_result == INSERT_TO and continuous_added >= num_post_context_chars:
|
@@ -461,8 +483,11 @@ def _do_reconstruct( action_model, char_model, text, num_pre_context_chars, num_
|
|
461 |
if action_model_result == START:
|
462 |
pass
|
463 |
elif action_model_result == INSERT_TO:
|
464 |
-
|
465 |
-
|
|
|
|
|
|
|
466 |
|
467 |
working_to += char_model_result
|
468 |
continuous_added += 1
|
@@ -507,7 +532,7 @@ def train( in_csv, a_header, b_header, model, iterations, device, leading_contex
|
|
507 |
split_index = int( train_percentage/100*len(full_data) )
|
508 |
train_data = full_data.iloc[:split_index,:].reset_index(drop=True)
|
509 |
|
510 |
-
if verbose: print( "
|
511 |
|
512 |
|
513 |
tm = Transmorgrifier()
|
@@ -645,7 +670,7 @@ def main():
|
|
645 |
tm = Transmorgrifier()
|
646 |
tm.load( args.model )
|
647 |
|
648 |
-
tm.demo( args.share
|
649 |
|
650 |
|
651 |
if __name__ == '__main__':
|
|
|
36 |
#and the char model
|
37 |
#slice through where only the action is insert.
|
38 |
insert_indexes = Y['action'] == INSERT_TO
|
39 |
+
|
40 |
+
#if there is only one char to insert, we can't train the second model and need to handle that as a boundary case.
|
41 |
+
if Y['char'][insert_indexes].nunique() > 1:
|
42 |
+
self.char_model = _train_catboost( X[insert_indexes], Y['char'][insert_indexes], iterations, verbose=verbose, device=device, model_piece='char' )
|
43 |
+
self.constant_output = None
|
44 |
+
else:
|
45 |
+
self.char_model = None
|
46 |
+
if Y['char'][insert_indexes].nunique() == 1:
|
47 |
+
self.constant_output = Y['char'][insert_indexes].unique()[0]
|
48 |
+
else:
|
49 |
+
#If there is never an insertion handle it as always inserting a space,
|
50 |
+
#because it will never insert, but it handles the boundary case so the saving and loading code works.
|
51 |
+
self.constant_output = ' '
|
52 |
+
|
53 |
+
|
54 |
|
55 |
self.trailing_context = trailing_context
|
56 |
self.leading_context = leading_context
|
|
|
66 |
model -- The pathname to save the model such as "my_model.tm" (default my_model.tm)
|
67 |
"""
|
68 |
self.name = model
|
69 |
+
with zipfile.ZipFile( model, mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=9 ) as my_zip:
|
70 |
+
with my_zip.open( 'params.json', mode='w' ) as out:
|
71 |
+
params = {
|
72 |
'version': FILE_VERSION,
|
73 |
'leading_context': self.leading_context,
|
74 |
'trailing_context': self.trailing_context,
|
75 |
'iterations': self.iterations,
|
76 |
+
}
|
77 |
+
if self.constant_output is not None:
|
78 |
+
params['constant_output'] = self.constant_output
|
79 |
+
|
80 |
+
out.write( json.dumps(params).encode())
|
81 |
temp_filename = _mktemp()
|
82 |
self.action_model.save_model( temp_filename )
|
83 |
+
my_zip.write( temp_filename, "action.cb" )
|
84 |
+
if not self.char_model is None:
|
85 |
+
self.char_model.save_model( temp_filename )
|
86 |
+
my_zip.write( temp_filename, "char.cb" )
|
87 |
os.unlink( temp_filename )
|
88 |
|
89 |
return self
|
|
|
97 |
"""
|
98 |
self.name = model
|
99 |
with zipfile.ZipFile( model, mode='r' ) as zip:
|
100 |
+
with zip.open( 'params.json' ) as f_in:
|
101 |
+
params = json.loads( f_in.read().decode() )
|
102 |
if params['version'] > FILE_VERSION: raise Exception( f"Version {params['version']} greater than {FILE_VERSION}" )
|
103 |
self.leading_context = int(params['leading_context'])
|
104 |
self.trailing_context = int(params['trailing_context'])
|
105 |
self.iterations = int(params['iterations'])
|
106 |
temp_filename = _mktemp()
|
107 |
+
with zip.open( 'action.cb' ) as f_in:
|
108 |
+
with open( temp_filename, "wb" ) as f_out:
|
109 |
+
f_out.write( f_in.read() )
|
110 |
self.action_model = CatBoostClassifier().load_model( temp_filename )
|
111 |
+
if 'constant_output' not in params:
|
112 |
+
with zip.open( 'char.cb' ) as f_in:
|
113 |
+
with open( temp_filename, "wb" ) as f_out:
|
114 |
+
f_out.write( f_in.read() )
|
115 |
+
self.char_model = CatBoostClassifier().load_model( temp_filename )
|
116 |
+
self.constant_output = None
|
117 |
+
else:
|
118 |
+
self.constant_output = params['constant_output']
|
119 |
+
self.char_model = None
|
120 |
|
121 |
os.unlink( temp_filename)
|
122 |
|
|
|
125 |
|
126 |
def execute( self, from_sentences, verbose=False ):
|
127 |
"""
|
128 |
+
Runs the data from from_sentences. The results are returned
|
129 |
using yield so you need to wrap this in list() if you want
|
130 |
to index it. from_sentences can be an array or a generator.
|
131 |
|
|
|
137 |
yield _do_reconstruct(
|
138 |
action_model=self.action_model,
|
139 |
char_model=self.char_model,
|
140 |
+
constant_output=self.constant_output,
|
141 |
text=from_sentence,
|
142 |
num_pre_context_chars=self.leading_context,
|
143 |
num_post_context_chars=self.trailing_context )
|
|
|
158 |
demo.launch( share=share )
|
159 |
|
160 |
def _list_trace( trace ):
|
161 |
+
if trace.parent is None:
|
162 |
result = [trace]
|
163 |
else:
|
164 |
+
result = _list_trace( trace.parent )
|
165 |
result.append( trace )
|
166 |
return result
|
167 |
|
168 |
class _edit_trace_hop():
|
169 |
+
parent = None
|
170 |
edit_distance = None
|
171 |
char = None
|
172 |
from_row_i = None
|
|
|
207 |
#root case.
|
208 |
if from_row_i == 0 and to_column_i == 0:
|
209 |
best_option = _edit_trace_hop()
|
210 |
+
best_option.parent = None
|
211 |
best_option.edit_distance = 0
|
212 |
best_option.char = ""
|
213 |
best_option.from_row_i = from_row_i
|
|
|
218 |
if to_column_i > 0:
|
219 |
if best_option is None or current_row[to_column_i-1].edit_distance + 1 < best_option.edit_distance:
|
220 |
best_option = _edit_trace_hop()
|
221 |
+
best_option.parent = current_row[to_column_i-1]
|
222 |
+
best_option.edit_distance = best_option.parent.edit_distance + 1
|
223 |
best_option.char = to_sentence[to_column_i-1]
|
224 |
best_option.from_row_i = from_row_i
|
225 |
best_option.to_column_i = to_column_i
|
|
|
229 |
if from_row_i > 0:
|
230 |
if best_option is None or last_row[to_column_i].edit_distance + 1 < best_option.edit_distance:
|
231 |
best_option = _edit_trace_hop()
|
232 |
+
best_option.parent = last_row[to_column_i]
|
233 |
+
best_option.edit_distance = best_option.parent.edit_distance + 1
|
234 |
best_option.char = from_sentence[from_row_i-1]
|
235 |
best_option.from_row_i = from_row_i
|
236 |
best_option.to_column_i = to_column_i
|
|
|
241 |
if to_sentence[to_column_i-1] == from_sentence[from_row_i-1]:
|
242 |
if best_option is None or last_row[to_column_i-1].edit_distance <= best_option.edit_distance: #prefer match so use <= than <
|
243 |
best_option = _edit_trace_hop()
|
244 |
+
best_option.parent = last_row[to_column_i-1]
|
245 |
+
best_option.edit_distance = best_option.parent.edit_distance + 1
|
246 |
best_option.char = from_sentence[from_row_i-1]
|
247 |
best_option.from_row_i = from_row_i
|
248 |
best_option.to_column_i = to_column_i
|
|
|
256 |
|
257 |
if print_debug:
|
258 |
def print_diffs( current_node ):
|
259 |
+
if current_node.parent is not None:
|
260 |
+
print_diffs( current_node.parent )
|
261 |
|
262 |
if current_node.action == START:
|
263 |
print( "start" )
|
|
|
369 |
result_split_into_dict['action'] = action_slice
|
370 |
result_split_into_dict['char'] = char_slice
|
371 |
|
372 |
+
#now return it as a data_frame.
|
373 |
return pd.DataFrame( context_split_into_dict ), pd.DataFrame( result_split_into_dict )
|
374 |
|
375 |
|
|
|
416 |
|
417 |
|
418 |
def _mktemp():
|
419 |
+
#I know mktemp exists in the library but it has been deprecated suggesting using
|
420 |
+
#mkstemp but catboost can't write to a file handle yet, so I need an actual
|
421 |
#filename.
|
422 |
number = 0
|
423 |
while os.path.exists( f".temp_{number}~" ):
|
|
|
425 |
return f".temp_{number}~"
|
426 |
|
427 |
|
428 |
+
def predict_wrapper( model, model_input ):
|
429 |
+
#Big hack. Catboost has shown itself to be unstable on producing
|
430 |
+
#either a single value or an array with a single value in it.
|
431 |
+
#I traced it back to the saved model, and then the model to what
|
432 |
+
#data it is trained on. But I couldn't figure out what it was
|
433 |
+
#in the data which would make the saved model be one way or the other
|
434 |
+
#so I am going to use the results this way so that it works either way.
|
435 |
+
result = model.predict( model_input )[0]
|
436 |
+
try:
|
437 |
+
result = result[0]
|
438 |
+
except:
|
439 |
+
pass
|
440 |
+
return result
|
|
|
|
|
441 |
|
442 |
+
def _do_reconstruct( action_model, char_model, constant_output, text, num_pre_context_chars, num_post_context_chars ):
|
|
|
443 |
|
444 |
#test for nan.
|
445 |
if text != text: text = ''
|
|
|
473 |
context_as_pd = pd.DataFrame( context_as_dictionary )
|
474 |
|
475 |
#run the model
|
476 |
+
action_model_result = predict_wrapper(action_model,context_as_pd )
|
477 |
|
478 |
#stop runaway. If we have added more chars than our context, nothing is going to change.
|
479 |
if action_model_result == INSERT_TO and continuous_added >= num_post_context_chars:
|
|
|
483 |
if action_model_result == START:
|
484 |
pass
|
485 |
elif action_model_result == INSERT_TO:
|
486 |
+
if constant_output is None:
|
487 |
+
#for an insert ask the char model what to insert
|
488 |
+
char_model_result = predict_wrapper(char_model, context_as_pd )
|
489 |
+
else:
|
490 |
+
char_model_result = constant_output
|
491 |
|
492 |
working_to += char_model_result
|
493 |
continuous_added += 1
|
|
|
532 |
split_index = int( train_percentage/100*len(full_data) )
|
533 |
train_data = full_data.iloc[:split_index,:].reset_index(drop=True)
|
534 |
|
535 |
+
if verbose: print( "parsing data for training" )
|
536 |
|
537 |
|
538 |
tm = Transmorgrifier()
|
|
|
670 |
tm = Transmorgrifier()
|
671 |
tm.load( args.model )
|
672 |
|
673 |
+
tm.demo( share=args.share )
|
674 |
|
675 |
|
676 |
if __name__ == '__main__':
|