c-dunlap committed on
Commit
b4dd09c
·
1 Parent(s): 4416953

Added model vocab

Browse files
Files changed (2) hide show
  1. smiles.py +132 -0
  2. vocab.txt +608 -0
smiles.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ © Battelle Memorial Institute 2023
3
+ Made available under the GNU General Public License v 2.0
4
+
5
+ BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
6
+ FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
7
+ OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
8
+ PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
9
+ OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
10
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
11
+ TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
12
+ PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
13
+ REPAIR OR CORRECTION.
14
+ """
15
+
16
+ import re
17
+ import torch
18
+ import pandas as pd
19
+ from rdkit import Chem
20
+ from rdkit.Chem.SaltRemover import SaltRemover
21
+
22
+
23
class InvalidSmile(Exception):
    """Raised when a SMILES string cannot be turned into a molecule."""
25
+
26
+
27
def load_vocab(vocab_file_name):
    """
    Load an existing vocabulary from a file. Assumes a single
    token definition per line of the file.

    The file is read as plain text rather than parsed as CSV, so every
    token is kept verbatim as a string: tokens that a CSV reader would
    treat specially (e.g. ``NA``, ``nan``, purely numeric tokens, or tokens
    containing ``,`` or ``"``) are not coerced, split or dropped.

    Parameters
    ----------
    vocab_file_name : str
        The file name of the vocabulary to load.

    Returns
    -------
    vocab_dict : dict
        A dict of tokens as the keys and the corresponding
        token index as the items. Indices are zero-based and follow the
        order of the non-empty lines in the file.

    """
    with open(vocab_file_name, encoding="utf-8") as vocab_file:
        # Only strip the line terminator; any other character is part of
        # the token itself.
        lines = [line.rstrip("\r\n") for line in vocab_file]

    # Blank lines carry no token and must not consume an index.
    vocab = [token for token in lines if token]
    vocab_dict = {token: ind for ind, token in enumerate(vocab)}

    return vocab_dict
49
+
50
+
51
# Token grammar for SMILES strings, compiled once at import time. Alternatives
# are tried left to right: bracket atoms first, then two-letter halogens
# (Br, Cl) before their one-letter prefixes, then the remaining atoms, bond
# and branch symbols, two-digit ring closures (%NN) and single digits.
_SMILES_TOKEN_RE = re.compile(
    r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
)


def smiles_tokenizer(smiles):
    """
    Tokenize a SMILES string.

    Parameters
    ----------
    smiles : str
        A SMILES string to turn into tokens.

    Returns
    -------
    tokens : list
        A list of tokens after tokenizing the input string. An empty
        input string yields an empty list.

    Raises
    ------
    ValueError
        If the input contains characters that are not part of any
        recognized token. ``ValueError`` is a subclass of ``Exception``,
        so handlers written for a plain ``Exception`` still catch it.

    """
    tokens = _SMILES_TOKEN_RE.findall(smiles)
    # findall silently skips anything the pattern cannot match, so a
    # shortfall in total token length means some characters were dropped.
    # solution based on https://stackoverflow.com/a/3879574
    if len("".join(tokens)) < len(smiles):
        raise ValueError(
            "Input smiles string contained invalid characters."
        )

    return tokens
77
+
78
+
79
def smiles_to_tensor(
    smiles, vocab_dict, max_seq_len, desalt=True, canonical=True, isomeric=True
):
    """
    Converts a SMILES string to a tensor using the provided vocabulary.

    The SMILES is parsed with RDKit, optionally desalted, written back to a
    SMILES string, tokenized, wrapped as ``[CLS] [EDGE] <tokens> [EDGE]`` and
    right-padded with ``[PAD]`` up to ``max_seq_len``.

    Parameters
    ----------
    smiles : str
        A SMILES string to convert to a tensor.
    vocab_dict : dict
        A dictionary of SMILES tokens and integer value as the dictionary key
        and item, respectively. Must contain the special tokens ``[CLS]``,
        ``[EDGE]`` and ``[PAD]``.
    max_seq_len : int
        The maximum sequence length allowed for SMILES strings. Smaller
        strings are padded to the maximum length using the [PAD] token
        from the vocabulary provided.
    desalt : bool, optional
        Flag for removing salts and solvents from SMILES string, by default True.
    canonical : bool, optional
        Flag enabling the conversion of the SMILES to canonical form, by default True.
    isomeric : bool, optional
        Flag enabling the conversion of the SMILES to isomeric form, by default True.

    Returns
    -------
    smiles_ten_long : tensor
        A tensor representing the converted SMILES string based on the provided
        vocabulary with shape (1, max_seq_len).

    Raises
    ------
    InvalidSmile
        If RDKit cannot build a molecule from ``smiles``.
    KeyError
        If a special token or any token of the regenerated SMILES is
        missing from ``vocab_dict``.

    Notes
    -----
    No truncation is performed: if the wrapped token sequence is longer than
    ``max_seq_len`` the final slice assignment fails with a shape-mismatch
    error raised by torch.

    """
    # Convert the SMILES to molecule
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise InvalidSmile('Molecule could not be constructed from smile string')
    # Remove the salts/solvents; the remover is only built when it is needed
    if desalt:
        mol = SaltRemover().StripMol(mol, dontRemoveEverything=True)
    # Convert back to SMILES
    smiles = Chem.MolToSmiles(mol, canonical=canonical, isomericSmiles=isomeric)
    # Tokenize the SMILES and map tokens to vocabulary indices,
    # framing the sequence as [CLS] [EDGE] ... [EDGE]
    tok = [vocab_dict["[CLS]"], vocab_dict["[EDGE]"]]
    tok += [vocab_dict[x] for x in smiles_tokenizer(smiles)]
    tok.append(vocab_dict["[EDGE]"])
    smiles_ten = torch.tensor(tok, dtype=torch.long)
    # Start from a row filled with [PAD] and overwrite the leading positions
    smiles_ten_long = torch.full(
        (1, max_seq_len), vocab_dict["[PAD]"], dtype=torch.long
    )
    smiles_ten_long[0, : smiles_ten.shape[0]] = smiles_ten

    return smiles_ten_long
vocab.txt ADDED
@@ -0,0 +1,608 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [PAD]
2
+ [CLS]
3
+ [EDGE]
4
+ [MASK]
5
+ [UNK]
6
+ [SEP]
7
+ [unused1]
8
+ [unused2]
9
+ [unused3]
10
+ [unused4]
11
+ [unused5]
12
+ [unused6]
13
+ [unused7]
14
+ [unused8]
15
+ [unused9]
16
+ [unused10]
17
+ c
18
+ C
19
+ (
20
+ )
21
+ O
22
+ 1
23
+ 2
24
+ =
25
+ N
26
+ .
27
+ n
28
+ 3
29
+ F
30
+ Cl
31
+ >>
32
+ ~
33
+ -
34
+ 4
35
+ [C@H]
36
+ S
37
+ [C@@H]
38
+ [O-]
39
+ Br
40
+ #
41
+ /
42
+ [nH]
43
+ [N+]
44
+ s
45
+ 5
46
+ o
47
+ P
48
+ [Na+]
49
+ [Si]
50
+ I
51
+ [Na]
52
+ [Pd]
53
+ [K+]
54
+ [K]
55
+ [P]
56
+ B
57
+ [C@]
58
+ [C@@]
59
+ [Cl-]
60
+ 6
61
+ [OH-]
62
+ \
63
+ [N-]
64
+ [Li]
65
+ [H]
66
+ [2H]
67
+ [NH4+]
68
+ [c-]
69
+ [P-]
70
+ [Cs+]
71
+ [Li+]
72
+ [Cs]
73
+ [NaH]
74
+ [H-]
75
+ [O+]
76
+ [BH4-]
77
+ [Cu]
78
+ 7
79
+ [Mg]
80
+ [Fe+2]
81
+ [n+]
82
+ [Sn]
83
+ [BH-]
84
+ [Pd+2]
85
+ [CH]
86
+ [I-]
87
+ [Br-]
88
+ [C-]
89
+ [Zn]
90
+ [B-]
91
+ [F-]
92
+ [Al]
93
+ [P+]
94
+ [BH3-]
95
+ [Fe]
96
+ [C]
97
+ [AlH4]
98
+ [Ni]
99
+ [SiH]
100
+ 8
101
+ [Cu+2]
102
+ [Mn]
103
+ [AlH]
104
+ [nH+]
105
+ [AlH4-]
106
+ [O-2]
107
+ [Cr]
108
+ [Mg+2]
109
+ [NH3+]
110
+ [S@]
111
+ [Pt]
112
+ [Al+3]
113
+ [S@@]
114
+ [S-]
115
+ [Ti]
116
+ [Zn+2]
117
+ [PH]
118
+ [NH2+]
119
+ [Ru]
120
+ [Ag+]
121
+ [S+]
122
+ [I+3]
123
+ [NH+]
124
+ [Ca+2]
125
+ [Ag]
126
+ 9
127
+ [Os]
128
+ [Se]
129
+ [SiH2]
130
+ [Ca]
131
+ [Ti+4]
132
+ [Ac]
133
+ [Cu+]
134
+ [S]
135
+ [Rh]
136
+ [Cl+3]
137
+ [cH-]
138
+ [Zn+]
139
+ [O]
140
+ [Cl+]
141
+ [SH]
142
+ [H+]
143
+ [Pd+]
144
+ [se]
145
+ [PH+]
146
+ [I]
147
+ [Pt+2]
148
+ [C+]
149
+ [Mg+]
150
+ [Hg]
151
+ [W]
152
+ [SnH]
153
+ [SiH3]
154
+ [Fe+3]
155
+ [NH]
156
+ [Mo]
157
+ [CH2+]
158
+ %10
159
+ [CH2-]
160
+ [CH2]
161
+ [n-]
162
+ [Ce+4]
163
+ [NH-]
164
+ [Co]
165
+ [I+]
166
+ [PH2]
167
+ [Pt+4]
168
+ [Ce]
169
+ [B]
170
+ [Sn+2]
171
+ [Ba+2]
172
+ %11
173
+ [Fe-3]
174
+ [18F]
175
+ [SH-]
176
+ [Pb+2]
177
+ [Os-2]
178
+ [Zr+4]
179
+ [N]
180
+ [Ir]
181
+ [Bi]
182
+ [Ni+2]
183
+ [P@]
184
+ [Co+2]
185
+ [s+]
186
+ [As]
187
+ [P+3]
188
+ [Hg+2]
189
+ [Yb+3]
190
+ [CH-]
191
+ [Zr+2]
192
+ [Mn+2]
193
+ [CH+]
194
+ [In]
195
+ [KH]
196
+ [Ce+3]
197
+ [Zr]
198
+ [AlH2-]
199
+ [OH2+]
200
+ [Ti+3]
201
+ [Rh+2]
202
+ [Sb]
203
+ [S-2]
204
+ %12
205
+ [P@@]
206
+ [Si@H]
207
+ [Mn+4]
208
+ p
209
+ [Ba]
210
+ [NH2-]
211
+ [Ge]
212
+ [Pb+4]
213
+ [Cr+3]
214
+ [Au]
215
+ [LiH]
216
+ [Sc+3]
217
+ [o+]
218
+ [Rh-3]
219
+ %13
220
+ [Br]
221
+ [Sb-]
222
+ [S@+]
223
+ [I+2]
224
+ [Ar]
225
+ [V]
226
+ [Cu-]
227
+ [Al-]
228
+ [Te]
229
+ [13c]
230
+ [13C]
231
+ [Cl]
232
+ [PH4+]
233
+ [SiH4]
234
+ [te]
235
+ [CH3-]
236
+ [S@@+]
237
+ [Rh+3]
238
+ [SH+]
239
+ [Bi+3]
240
+ [Br+2]
241
+ [La]
242
+ [La+3]
243
+ [Pt-2]
244
+ [N@@]
245
+ [PH3+]
246
+ [N@]
247
+ [Si+4]
248
+ [Sr+2]
249
+ [Al+]
250
+ [Pb]
251
+ [SeH]
252
+ [Si-]
253
+ [V+5]
254
+ [Y+3]
255
+ [Re]
256
+ [Ru+]
257
+ [Sm]
258
+ *
259
+ [3H]
260
+ [NH2]
261
+ [Ag-]
262
+ [13CH3]
263
+ [OH+]
264
+ [Ru+3]
265
+ [OH]
266
+ [Gd+3]
267
+ [13CH2]
268
+ [In+3]
269
+ [Si@@]
270
+ [Si@]
271
+ [Ti+2]
272
+ [Sn+]
273
+ [Cl+2]
274
+ [AlH-]
275
+ [Pd-2]
276
+ [SnH3]
277
+ [B+3]
278
+ [Cu-2]
279
+ [Nd+3]
280
+ [Pb+3]
281
+ [13cH]
282
+ [Fe-4]
283
+ [Ga]
284
+ [Sn+4]
285
+ [Hg+]
286
+ [11CH3]
287
+ [Hf]
288
+ [Pr]
289
+ [Y]
290
+ [S+2]
291
+ [Cd]
292
+ [Cr+6]
293
+ [Zr+3]
294
+ [Rh+]
295
+ [CH3]
296
+ [N-3]
297
+ [Hf+2]
298
+ [Th]
299
+ [Sb+3]
300
+ %14
301
+ [Cr+2]
302
+ [Ru+2]
303
+ [Hf+4]
304
+ [14C]
305
+ [Ta]
306
+ [Tl+]
307
+ [B+]
308
+ [Os+4]
309
+ [PdH2]
310
+ [Pd-]
311
+ [Cd+2]
312
+ [Co+3]
313
+ [S+4]
314
+ [Nb+5]
315
+ [123I]
316
+ [c+]
317
+ [Rb+]
318
+ [V+2]
319
+ [CH3+]
320
+ [Ag+2]
321
+ [cH+]
322
+ [Mn+3]
323
+ [Se-]
324
+ [As-]
325
+ [Eu+3]
326
+ [SH2]
327
+ [Sm+3]
328
+ [IH+]
329
+ %15
330
+ [OH3+]
331
+ [PH3]
332
+ [IH2+]
333
+ [SH2+]
334
+ [Ir+3]
335
+ [AlH3]
336
+ [Sc]
337
+ [Yb]
338
+ [15NH2]
339
+ [Lu]
340
+ [sH+]
341
+ [Gd]
342
+ [18F-]
343
+ [SH3+]
344
+ [SnH4]
345
+ [TeH]
346
+ [Si@@H]
347
+ [Ga+3]
348
+ [CaH2]
349
+ [Tl]
350
+ [Ta+5]
351
+ [GeH]
352
+ [Br+]
353
+ [Sr]
354
+ [Tl+3]
355
+ [Sm+2]
356
+ [PH5]
357
+ %16
358
+ [N@@+]
359
+ [Au+3]
360
+ [C-4]
361
+ [Nd]
362
+ [Ti+]
363
+ [IH]
364
+ [N@+]
365
+ [125I]
366
+ [Eu]
367
+ [Sn+3]
368
+ [Nb]
369
+ [Er+3]
370
+ [123I-]
371
+ [14c]
372
+ %17
373
+ [SnH2]
374
+ [YH]
375
+ [Sb+5]
376
+ [Pr+3]
377
+ [Ir+]
378
+ [N+3]
379
+ [AlH2]
380
+ [19F]
381
+ %18
382
+ [Tb]
383
+ [14CH]
384
+ [Mo+4]
385
+ [Si+]
386
+ [BH]
387
+ [Be]
388
+ [Rb]
389
+ [pH]
390
+ %19
391
+ %20
392
+ [Xe]
393
+ [Ir-]
394
+ [Be+2]
395
+ [C+4]
396
+ [RuH2]
397
+ [15NH]
398
+ [U+2]
399
+ [Au-]
400
+ %21
401
+ %22
402
+ [Au+]
403
+ [15n]
404
+ [Al+2]
405
+ [Tb+3]
406
+ [15N]
407
+ [V+3]
408
+ [W+6]
409
+ [14CH3]
410
+ [Cr+4]
411
+ [ClH+]
412
+ b
413
+ [Ti+6]
414
+ [Nd+]
415
+ [Zr+]
416
+ [PH2+]
417
+ [Fm]
418
+ [N@H+]
419
+ [RuH]
420
+ [Dy+3]
421
+ %23
422
+ [Hf+3]
423
+ [W+4]
424
+ [11C]
425
+ [13CH]
426
+ [Er]
427
+ [124I]
428
+ [LaH]
429
+ [F]
430
+ [siH]
431
+ [Ga+]
432
+ [Cm]
433
+ [GeH3]
434
+ [IH-]
435
+ [U+6]
436
+ [SeH+]
437
+ [32P]
438
+ [SeH-]
439
+ [Pt-]
440
+ [Ir+2]
441
+ [se+]
442
+ [U]
443
+ [F+]
444
+ [BH2]
445
+ [As+]
446
+ [Cf]
447
+ [ClH2+]
448
+ [Ni+]
449
+ [TeH3]
450
+ [SbH2]
451
+ [Ag+3]
452
+ %24
453
+ [18O]
454
+ [PH4]
455
+ [Os+2]
456
+ [Na-]
457
+ [Sb+2]
458
+ [V+4]
459
+ [Ho+3]
460
+ [68Ga]
461
+ [PH-]
462
+ [Bi+2]
463
+ [Ce+2]
464
+ [Pd+3]
465
+ [99Tc]
466
+ [13C@@H]
467
+ [Fe+6]
468
+ [c]
469
+ [GeH2]
470
+ [10B]
471
+ [Cu+3]
472
+ [Mo+2]
473
+ [Cr+]
474
+ [Pd+4]
475
+ [Dy]
476
+ [AsH]
477
+ [Ba+]
478
+ [SeH2]
479
+ [In+]
480
+ [TeH2]
481
+ [BrH+]
482
+ [14cH]
483
+ [W+]
484
+ [13C@H]
485
+ [AsH2]
486
+ [In+2]
487
+ [N+2]
488
+ [N@@H+]
489
+ [SbH]
490
+ [60Co]
491
+ [AsH4+]
492
+ [AsH3]
493
+ [18OH]
494
+ [Ru-2]
495
+ [Na-2]
496
+ [CuH2]
497
+ [31P]
498
+ [Ti+5]
499
+ [35S]
500
+ [P@@H]
501
+ [ArH]
502
+ [Co+]
503
+ [Zr-2]
504
+ [BH2-]
505
+ [131I]
506
+ [SH5]
507
+ [VH]
508
+ [B+2]
509
+ [Yb+2]
510
+ [14C@H]
511
+ [211At]
512
+ [NH3+2]
513
+ [IrH]
514
+ [IrH2]
515
+ [Rh-]
516
+ [Cr-]
517
+ [Sb+]
518
+ [Ni+3]
519
+ [TaH3]
520
+ [Tl+2]
521
+ [64Cu]
522
+ [Tc]
523
+ [Cd+]
524
+ [1H]
525
+ [15nH]
526
+ [AlH2+]
527
+ [FH+2]
528
+ [BiH3]
529
+ [Ru-]
530
+ [Mo+6]
531
+ [AsH+]
532
+ [BaH2]
533
+ [BaH]
534
+ [Fe+4]
535
+ [229Th]
536
+ [Th+4]
537
+ [As+3]
538
+ [NH+3]
539
+ [P@H]
540
+ [Li-]
541
+ [7NaH]
542
+ [Bi+]
543
+ [PtH+2]
544
+ [p-]
545
+ [Re+5]
546
+ [NiH]
547
+ [Ni-]
548
+ [Xe+]
549
+ [Ca+]
550
+ [11c]
551
+ [Rh+4]
552
+ [AcH]
553
+ [HeH]
554
+ [Sc+2]
555
+ [Mn+]
556
+ [UH]
557
+ [14CH2]
558
+ [SiH4+]
559
+ [18OH2]
560
+ [Ac-]
561
+ [Re+4]
562
+ [118Sn]
563
+ [153Sm]
564
+ [P+2]
565
+ [9CH]
566
+ [9CH3]
567
+ [Y-]
568
+ [NiH2]
569
+ [Si+2]
570
+ [Mn+6]
571
+ [ZrH2]
572
+ [C-2]
573
+ [Bi+5]
574
+ [24NaH]
575
+ [Fr]
576
+ [15CH]
577
+ [Se+]
578
+ [At]
579
+ [P-3]
580
+ [124I-]
581
+ [CuH2-]
582
+ [Nb+4]
583
+ [Nb+3]
584
+ [MgH]
585
+ [Ir+4]
586
+ [67Ga+3]
587
+ [67Ga]
588
+ [13N]
589
+ [15OH2]
590
+ [2NH]
591
+ [Ho]
592
+ [Cn]
593
+ [P@@+]
594
+ [P@+]
595
+ [IH2]
596
+ [B@-]
597
+ [S@@H]
598
+ [B@@-]
599
+ [SnH2+]
600
+ [25O]
601
+ [SnH4+2]
602
+ [SH3]
603
+ [17O]
604
+ [SnH6+3]
605
+ [Sn-]
606
+ [S@H]
607
+ [si]
608
+ [p+]