Spaces:
Running
Running
jannisborn
commited on
update
Browse files- README.md +1 -1
- app.py +63 -133
- model_cards/article.md +68 -0
- model_cards/description.md +7 -0
- model_cards/examples.smi +13 -0
- model_cards/regression_transformer.png +0 -0
- model_cards/regression_transformer_article.md +0 -113
- model_cards/regression_transformer_description.md +0 -13
- model_cards/regression_transformer_examples.csv +0 -9
- requirements.txt +1 -1
- utils.py +24 -139
README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
emoji: 💡
|
4 |
colorFrom: green
|
5 |
colorTo: blue
|
|
|
1 |
---
|
2 |
+
title: Molecular properties
|
3 |
emoji: 💡
|
4 |
colorFrom: green
|
5 |
colorTo: blue
|
app.py
CHANGED
@@ -2,168 +2,98 @@ import logging
|
|
2 |
import pathlib
|
3 |
|
4 |
import gradio as gr
|
|
|
5 |
import pandas as pd
|
6 |
-
from gt4sd.
|
7 |
-
|
8 |
-
|
9 |
-
from gt4sd.algorithms.registry import ApplicationsRegistry
|
10 |
-
from utils import (
|
11 |
-
draw_grid_generate,
|
12 |
-
draw_grid_predict,
|
13 |
-
get_application,
|
14 |
-
get_inference_dict,
|
15 |
-
get_rt_name,
|
16 |
-
)
|
17 |
|
18 |
logger = logging.getLogger(__name__)
|
19 |
logger.addHandler(logging.NullHandler())
|
20 |
|
|
|
|
|
21 |
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
substructures_to_mask: str,
|
35 |
-
substructures_to_keep: str,
|
36 |
-
):
|
37 |
-
|
38 |
-
if task == "Predict" and wrapper:
|
39 |
-
logger.warning(
|
40 |
-
f"For prediction, no sampling_wrapper will be used, ignoring: fraction_to_mask: {fraction_to_mask}, "
|
41 |
-
f"tokens_to_mask: {tokens_to_mask}, substructures_to_mask={substructures_to_mask}, "
|
42 |
-
f"substructures_to_keep: {substructures_to_keep}."
|
43 |
-
)
|
44 |
-
sampling_wrapper = {}
|
45 |
-
elif not wrapper:
|
46 |
-
sampling_wrapper = {}
|
47 |
-
else:
|
48 |
-
substructures_to_mask = (
|
49 |
-
[]
|
50 |
-
if substructures_to_mask == ""
|
51 |
-
else substructures_to_mask.replace(" ", "").split(",")
|
52 |
-
)
|
53 |
-
substructures_to_keep = (
|
54 |
-
[]
|
55 |
-
if substructures_to_keep == ""
|
56 |
-
else substructures_to_keep.replace(" ", "").split(",")
|
57 |
-
)
|
58 |
-
tokens_to_mask = [] if tokens_to_mask == "" else tokens_to_mask.split(",")
|
59 |
-
|
60 |
-
property_goals = {}
|
61 |
-
if property_goal == "":
|
62 |
-
raise ValueError(
|
63 |
-
"For conditional generation you have to specify `property_goal`."
|
64 |
-
)
|
65 |
-
for line in property_goal.split(","):
|
66 |
-
property_goals[line.split(":")[0].strip()] = float(line.split(":")[1])
|
67 |
-
|
68 |
-
sampling_wrapper = {
|
69 |
-
"substructures_to_keep": substructures_to_keep,
|
70 |
-
"substructures_to_mask": substructures_to_mask,
|
71 |
-
"text_filtering": False,
|
72 |
-
"fraction_to_mask": fraction_to_mask,
|
73 |
-
"property_goal": property_goals,
|
74 |
-
}
|
75 |
-
algorithm_application = get_application(algorithm.split(":")[0])
|
76 |
-
algorithm_version = algorithm.split(" ")[-1].lower()
|
77 |
-
config = algorithm_application(
|
78 |
-
algorithm_version=algorithm_version,
|
79 |
-
search=search.lower(),
|
80 |
-
temperature=temperature,
|
81 |
-
tolerance=tolerance,
|
82 |
-
sampling_wrapper=sampling_wrapper,
|
83 |
)
|
84 |
-
model =
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
return draw_grid_predict(samples[0], target, domain=algorithm.split(":")[0])
|
100 |
else:
|
101 |
-
|
|
|
|
|
|
|
|
|
102 |
|
103 |
|
104 |
if __name__ == "__main__":
|
105 |
|
106 |
# Preparation (retrieve all available algorithms)
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
properties = {}
|
114 |
-
for algo in rt_algos:
|
115 |
-
application = get_application(
|
116 |
-
algo["algorithm_application"].split("Transformer")[-1]
|
117 |
-
)
|
118 |
-
data = get_inference_dict(
|
119 |
-
application=application, algorithm_version=algo["algorithm_version"]
|
120 |
-
)
|
121 |
-
properties[get_rt_name(algo)] = data
|
122 |
-
properties
|
123 |
|
124 |
# Load metadata
|
125 |
metadata_root = pathlib.Path(__file__).parent.joinpath("model_cards")
|
126 |
|
127 |
-
examples =
|
128 |
-
metadata_root.joinpath("
|
129 |
-
|
|
|
|
|
|
|
|
|
|
|
130 |
|
131 |
-
with open(metadata_root.joinpath("
|
132 |
article = f.read()
|
133 |
-
with open(
|
134 |
-
metadata_root.joinpath("regression_transformer_description.md"), "r"
|
135 |
-
) as f:
|
136 |
description = f.read()
|
137 |
|
138 |
demo = gr.Interface(
|
139 |
-
fn=
|
140 |
-
title="
|
141 |
inputs=[
|
142 |
-
gr.Dropdown(
|
143 |
-
gr.Radio(choices=["Predict", "Generate"], label="Task", value="Generate"),
|
144 |
gr.Textbox(
|
145 |
-
label="
|
146 |
-
|
147 |
-
|
148 |
-
minimum=1, maximum=50, value=10, label="Number of samples", step=1
|
149 |
),
|
150 |
-
gr.
|
151 |
-
|
152 |
-
|
153 |
-
gr.Radio(choices=[True, False], label="Sampling Wrapper", value=True),
|
154 |
-
gr.Slider(minimum=0, maximum=1, value=0.5, label="Fraction to mask"),
|
155 |
-
gr.Textbox(label="Property goal", placeholder="<qed>:0.75", lines=1),
|
156 |
-
gr.Textbox(label="Tokens to mask", placeholder="N, C", lines=1),
|
157 |
-
gr.Textbox(
|
158 |
-
label="Substructures to mask", placeholder="C(=O), C#C", lines=1
|
159 |
-
),
|
160 |
-
gr.Textbox(
|
161 |
-
label="Substructures to keep", placeholder="C1=CC=C(Cl)C=C1", lines=1
|
162 |
),
|
163 |
],
|
164 |
outputs=gr.HTML(label="Output"),
|
165 |
article=article,
|
166 |
description=description,
|
167 |
-
examples=examples
|
168 |
)
|
169 |
demo.launch(debug=True, show_error=True)
|
|
|
2 |
import pathlib
|
3 |
|
4 |
import gradio as gr
|
5 |
+
import numpy as np
|
6 |
import pandas as pd
|
7 |
+
from gt4sd.properties.molecules import MOLECULE_PROPERTY_PREDICTOR_FACTORY
|
8 |
+
|
9 |
+
from utils import draw_grid_predict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
logger = logging.getLogger(__name__)
|
12 |
logger.addHandler(logging.NullHandler())
|
13 |
|
14 |
+
REMOVE = ["docking", "docking_tdc", "molecule_one", "askcos", "plogp"]
|
15 |
+
REMOVE.extend(["similarity_seed", "activity_against_target", "organtox"])
|
16 |
|
17 |
+
MODEL_PROP_DESCRIPTION = {
|
18 |
+
"Tox21": "NR-AR, NR-AR-LBD, NR-AhR, NR-Aromatase, NR-ER, NR-ER-LBD, NR-PPAR-gamma, SR-ARE, SR-ATAD5, SR-HSE, SR-MMP, SR-p53",
|
19 |
+
"Sider": "Hepatobiliary disorders,Metabolism and nutrition disorders,Product issues,Eye disorders,Investigations,Musculoskeletal disorders,Gastrointestinal disorders,Social circumstances,Immune system disorders,Reproductive system and breast disorders,Bening & malignant,General disorders,Endocrine disorders,Surgical & medical procedures,Vascular disorders,Blood & lymphatic disorders,Skin & subcutaneous disorders,Congenital & genetic disorders,Infections,Respiratory & thoracic disorders,Psychiatric disorders,Renal & urinary disorders,Pregnancy conditions,Ear disorders,Cardiac disorders,Nervous system disorders,Injury & procedural complications",
|
20 |
+
"Clintox": "FDA approval, Clinical trial failure",
|
21 |
+
}
|
22 |
+
|
23 |
+
|
24 |
+
def main(property: str, smiles: str, smiles_file: str):
|
25 |
+
|
26 |
+
algo, config = MOLECULE_PROPERTY_PREDICTOR_FACTORY[property.lower()]
|
27 |
+
kwargs = (
|
28 |
+
{"algorithm_version": "v0"} if property in MODEL_PROP_DESCRIPTION.keys() else {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
)
|
30 |
+
model = algo(config(**kwargs))
|
31 |
+
if smiles is not None and smiles_file is not None:
|
32 |
+
raise ValueError("Pass either smiles or smiles_file, not both.")
|
33 |
+
elif smiles is not None:
|
34 |
+
smiles = [smiles]
|
35 |
+
elif smiles_file is not None:
|
36 |
+
smiles = pd.read_csv(smiles_file.name, header=None, sep="\t")[0].tolist()
|
37 |
+
props = np.array(list(map(model, smiles))).round(2)
|
38 |
+
|
39 |
+
# Expand to 2D array if needed
|
40 |
+
if len(props.shape) == 1:
|
41 |
+
props = np.expand_dims(np.array(props), -1)
|
42 |
+
|
43 |
+
if property in MODEL_PROP_DESCRIPTION.keys():
|
44 |
+
property_names = MODEL_PROP_DESCRIPTION[property].split(",")
|
|
|
45 |
else:
|
46 |
+
property_names = [property]
|
47 |
+
|
48 |
+
return draw_grid_predict(
|
49 |
+
smiles, props, property_names=property_names, domain="Molecules"
|
50 |
+
)
|
51 |
|
52 |
|
53 |
if __name__ == "__main__":
|
54 |
|
55 |
# Preparation (retrieve all available algorithms)
|
56 |
+
properties = list(MOLECULE_PROPERTY_PREDICTOR_FACTORY.keys())[::-1]
|
57 |
+
for prop in REMOVE:
|
58 |
+
prop_to_idx = dict(zip(properties, range(len(properties))))
|
59 |
+
properties.pop(prop_to_idx[prop])
|
60 |
+
properties = list(map(lambda x: x.capitalize(), properties))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
|
62 |
# Load metadata
|
63 |
metadata_root = pathlib.Path(__file__).parent.joinpath("model_cards")
|
64 |
|
65 |
+
examples = [
|
66 |
+
["Qed", None, metadata_root.joinpath("examples.smi")],
|
67 |
+
[
|
68 |
+
"Esol",
|
69 |
+
"CN1CCN(CCCOc2ccc(N3C(=O)C(=Cc4ccc(Oc5ccc([N+](=O)[O-])cc5)cc4)SC3=S)cc2)CC1",
|
70 |
+
None,
|
71 |
+
],
|
72 |
+
]
|
73 |
|
74 |
+
with open(metadata_root.joinpath("article.md"), "r") as f:
|
75 |
article = f.read()
|
76 |
+
with open(metadata_root.joinpath("description.md"), "r") as f:
|
|
|
|
|
77 |
description = f.read()
|
78 |
|
79 |
demo = gr.Interface(
|
80 |
+
fn=main,
|
81 |
+
title="Molecular properties",
|
82 |
inputs=[
|
83 |
+
gr.Dropdown(properties, label="Property", value="qed"),
|
|
|
84 |
gr.Textbox(
|
85 |
+
label="Single SMILES",
|
86 |
+
placeholder="CC(C#C)N(C)C(=O)NC1=CC=C(Cl)C=C1",
|
87 |
+
lines=1,
|
|
|
88 |
),
|
89 |
+
gr.File(
|
90 |
+
file_types=[".smi"],
|
91 |
+
label="Multiple SMILES (tab-separated, `.smi` file)",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
),
|
93 |
],
|
94 |
outputs=gr.HTML(label="Output"),
|
95 |
article=article,
|
96 |
description=description,
|
97 |
+
examples=examples,
|
98 |
)
|
99 |
demo.launch(debug=True, show_error=True)
|
model_cards/article.md
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Supported molecular properties
|
2 |
+
|
3 |
+
|
4 |
+
### ClinTox
|
5 |
+
A [ToxSmi model](https://github.com/PaccMann/toxsmi) trained on the [ClinTox](https://moleculenet.org/datasets-1) dataset, which has two endpoints: probability of FDA approval and probability of failure in clinical trials. When using this model, please cite [Born et al. (2023)](#toxsmi-citation).
|
6 |
+
|
7 |
+
### SIDER
|
8 |
+
A [ToxSmi model](https://github.com/PaccMann/toxsmi) trained on the [SIDER](https://moleculenet.org/datasets-1) dataset for 27 different types of side effects of drugs. When using this model, please cite [Born et al. (2023)](#toxsmi-citation).
|
9 |
+
|
10 |
+
### Tox21
|
11 |
+
A [ToxSmi model](https://github.com/PaccMann/toxsmi) trained on the [Tox21](https://tripod.nih.gov/tox/) dataset with 12 different types of environmental toxicities. When using this model, please cite [Born et al. (2023)](#toxsmi-citation).
|
12 |
+
|
13 |
+
### SCScore
|
14 |
+
Predict the synthetic complexity score (SCScore) as presented in [Coley et al. (*J. Chem. Inf. Model.*; 2018)](https://pubs.acs.org/doi/full/10.1021/acs.jcim.7b00622).
|
15 |
+
|
16 |
+
### SAS
|
17 |
+
Estimate the synthetic accessibility score (SAS) as presented in [Ertl et al. (*Journal of Cheminformatics*; 2009)](https://jcheminf.biomedcentral.com/articles/10.1186/1758-2946-1-8).
|
18 |
+
|
19 |
+
### Lipinski
|
20 |
+
Measure whether a molecule conforms to the Lipinski-rule-of-five as presented in [Lipinski et al. (*Advanced Drug Delivery Reviews*; 2001)](https://www.sciencedirect.com/science/article/abs/pii/S0169409X00001290?via%3Dihub).
|
21 |
+
|
22 |
+
### Penalized logP
|
23 |
+
Measure the penalized logP (partition coefficient) score as presented in [Gomez-Bombarelli et al. (*ACS Central Science*; 2018)](https://arxiv.org/abs/1610.02415v1). This is the logP minus the number of rings with > 6 atoms minus the SAS.
|
24 |
+
|
25 |
+
### QED
|
26 |
+
Measure the drug-likeness as presented in [Bickerton et al. (*Nature Chemistry*; 2012)](https://www.nature.com/articles/nchem.1243).
|
27 |
+
|
28 |
+
### LogP
|
29 |
+
Measure the logP (partition coefficient) of a molecule as presented in [Wildman et al. (*J. Chem. Inf. Comput. Sci.*; 1999)](https://pubs.acs.org/doi/full/10.1021/ci990307l).
|
30 |
+
|
31 |
+
### Bertz
|
32 |
+
Calculate the first general index of molecular complexity as presented in [Bertz (*Journal of the American Chemical Society*; 1981)](https://pubs.acs.org/doi/pdf/10.1021/ja00402a071).
|
33 |
+
|
34 |
+
### TPSA
|
35 |
+
Calculate the total polar surface area of a molecule as presented in [Ertl et al. (*Journal of Medicinal Chemistry*; 2000)](https://pubs.acs.org/doi/full/10.1021/jm000942e).
|
36 |
+
|
37 |
+
### Is-Scaffold
|
38 |
+
Whether the molecule is identical to its [Murcko scaffold](https://rdkit.org/docs/source/rdkit.Chem.Scaffolds.MurckoScaffold.html).
|
39 |
+
|
40 |
+
### Number-Of-X
|
41 |
+
Calculated with [RDKit](https://www.rdkit.org/docs/source/rdkit.Chem.rdchem.html).
|
42 |
+
|
43 |
+
### Molecular Weight
|
44 |
+
Calculated with [RDKit](https://www.rdkit.org/docs/source/rdkit.Chem.rdchem.html).
|
45 |
+
|
46 |
+
|
47 |
+
### ToxSmi citation
|
48 |
+
```bib
|
49 |
+
@article{born2023chemical,
|
50 |
+
title={Chemical representation learning for toxicity prediction},
|
51 |
+
author={Born, Jannis and Markert, Greta and Janakarajan, Nikita and Kimber, Talia B. and Volkamer, Andrea and Rodriguez Martinez, Maria and Manica, Matteo},
|
52 |
+
journal={Under review at Digital Discovery},
|
53 |
+
year={2023}
|
54 |
+
}
|
55 |
+
```
|
56 |
+
|
57 |
+
|
58 |
+
### Unsupported properties
|
59 |
+
The following molecular properties are available via the GT4SD API but not in this UI:
|
60 |
+
- [MoleculeOne](https://tdcommons.ai/functions/oracles/#moleculeone) endpoint for retrosynthesis
|
61 |
+
- [ASKCOS](https://tdcommons.ai/functions/oracles/#askcos) endpoint for retrosynthesis
|
62 |
+
- [TDC-Docking](https://tdcommons.ai/functions/oracles/#docking-scores) endpoint for docking against a user-provided target
|
63 |
+
- [TDC-Docking](https://tdcommons.ai/functions/oracles/#docking-scores) endpoint for docking against *3pbl*.
|
64 |
+
- [Protein-ligand binding](https://tdcommons.ai/functions/oracles/#dopamine-receptor-d2-drd2) against one of the targets *drd2*, *gsk3b*, *jnk3*, *fpscores*, *cyp3a4_veith*, *drd2_current*, *gsk3b_current* or *jnk3_current*.
|
65 |
+
- [Tanimoto similarity](https://tdcommons.ai/functions/oracles/#similaritydissimilarity) to a seed molecule.
|
66 |
+
|
67 |
+
|
68 |
+
Moreover, GT4SD also includes properties on other entities such as [proteins](https://gt4sd.github.io/gt4sd-core/api/gt4sd.properties.proteins.html) and [crystals](https://gt4sd.github.io/gt4sd-core/api/gt4sd.properties.crystals.html).
|
model_cards/description.md
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
|
3 |
+
<img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
|
4 |
+
|
5 |
+
### Molecular property prediction
|
6 |
+
|
7 |
+
This is the GT4SD web-app for prediction of various molecular properties. For **examples** and **documentation** of the supported properties, please see below. Please note that this API does not expose **all** properties that are supported in GT4SD (a list of the non-supported ones can be found at the bottom).
|
model_cards/examples.smi
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Cc1cc2c(c3oc(CCCC#N)cc13)C(=O)c1c(O)cccc1C2=O
|
2 |
+
C=CCN1C(=O)C(=NNC(=S)NC2OC(COC(C)=O)C(OC(C)=O)C(OC(C)=O)C2OC(C)=O)c2ccccc21
|
3 |
+
O=C1C(=Cc2ccc(F)cc2)CCOc2c1ccc1ccccc21
|
4 |
+
CC(C)CNc1cc(NCC(C)C)nc(NCC(C)C)n1
|
5 |
+
CN1CCN(CCCOc2ccc(N3C(=O)C(=Cc4ccc(Oc5ccc([N+](=O)[O-])cc5)cc4)SC3=S)cc2)CC1
|
6 |
+
COc1ccc2ccccc2c1C1CC1NC(C)=O
|
7 |
+
Cc1ccc(-n2c(=O)[nH]cc(C(=O)Nc3ccc4c(c3)OCCO4)c2=O)cc1
|
8 |
+
Cc1ccc(NCc2nnc(SCC(=O)NCCc3ccccc3)n2C)cc1
|
9 |
+
CCCNC(=O)c1ccc2c(c1)N=C(C)c1c(C)ccc(C)c1S2
|
10 |
+
COc1ccc(Cn2ccn(CC(=O)Nc3cc(C)ccc3C)c(=O)c2=O)cc1
|
11 |
+
Cn1nccc1C(=O)NN=Cc1c(O)ccc2ccccc12
|
12 |
+
CCOC(=O)Nc1cc(N)c2c(n1)NC(C)C(c1ccccc1)=N2
|
13 |
+
Cn1nc(N)c2ncc(C(Cl)(Cl)Cl)nc21
|
model_cards/regression_transformer.png
DELETED
Binary file (225 kB)
|
|
model_cards/regression_transformer_article.md
DELETED
@@ -1,113 +0,0 @@
|
|
1 |
-
# Model documentation & parameters
|
2 |
-
|
3 |
-
## Parameters
|
4 |
-
|
5 |
-
### Algorithm Version
|
6 |
-
Which model checkpoint to use (trained on different datasets).
|
7 |
-
|
8 |
-
### Task
|
9 |
-
Whether the multitask model should be used for property prediction or conditional generation (default).
|
10 |
-
|
11 |
-
### Input
|
12 |
-
The input sequence. In the default setting (where `Task` is *Generate* and `Sampling Wrapper` is *True*) this can be a seed SMILES (for the molecule models) or amino-acid sequence (for the protein models). The model will locally adapt the seed sequence by masking `Fraction to mask` of the tokens.
|
13 |
-
If the `Task` is *Predict*, the sequences are given as SELFIES for the molecule models. Moreover, the tokens that should be predicted (`[MASK]` in the input) have to be given explicitly. Populate the examples to understand better.
|
14 |
-
NOTE: When setting `Task` to *Generate*, and `Sampling Wrapper` to *False*, the user has maximal control over the generative process and can explicitly decide which tokens should be masked.
|
15 |
-
|
16 |
-
### Number of samples
|
17 |
-
How many samples should be generated (between 1 and 50). If `Task` is *Predict*, this has to be set to 1.
|
18 |
-
|
19 |
-
### Search
|
20 |
-
Decoding search method. Use *Sample* if `Task` is *Generate*. If `Task` is *Predict*, use *Greedy*.
|
21 |
-
|
22 |
-
### Tolerance
|
23 |
-
Precision tolerance; only used if `Task` is *Generate*. This is a single float between 0 and 100 for the tolerated deviation between desired/primed property and predicted property of the generated molecule. Given in percentage with respect to the property range encountered during training.
|
24 |
-
NOTE: The tolerance is *only* used for post-hoc filtering of the generated samples.
|
25 |
-
|
26 |
-
### Sampling Wrapper
|
27 |
-
Only used if `Task` is *Generate*. If set to *False*, the user has to provide a full RT-sequence as `Input` and has to **explicitly** decide which tokens are masked (see example below). This gives full control but is tedious. Instead, if `Sampling Wrapper` is set to *True*, the RT stochastically determines which parts of the sequence are masked.
|
28 |
-
**NOTE**: All below arguments only apply if `Sampling Wrapper` is *True*.
|
29 |
-
|
30 |
-
#### Fraction to mask
|
31 |
-
Specifies the ratio of tokens that can be changed by the model. Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
|
32 |
-
|
33 |
-
#### Property goal
|
34 |
-
Specifies the desired target properties for the generation. Need to be given in the format `<prop>:value`. If the model supports multiple properties, give them separated by a comma `,`. Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
|
35 |
-
|
36 |
-
#### Tokens to mask
|
37 |
-
Optionally specifies which tokens (atoms, bonds etc) can be masked. Please separate multiple tokens by comma (`,`). If not specified, all tokens can be masked. Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
|
38 |
-
|
39 |
-
#### Substructures to mask
|
40 |
-
Optionally specifies a list of substructures that should *definitely* be masked (excluded from stochastic masking). Given in SMILES format. If multiple are provided, separate by comma (`,`). Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
|
41 |
-
*NOTE*: Most models operate on SELFIES and the matching of the substructures occurs in SELFIES simply on a string level.
|
42 |
-
|
43 |
-
#### Substructures to keep
|
44 |
-
Optionally specifies a list of substructures that should definitely be present in the target sample (i.e., excluded from stochastic masking). Given in SMILES format. Argument only applies if `Task` is *Generate* and `Sampling Wrapper` is *True*.
|
45 |
-
*NOTE*: This keeps tokens even if they are included in `tokens_to_mask`.
|
46 |
-
*NOTE*: Most models operate on SELFIES and the matching of the substructures occurs in SELFIES simply on a string level.
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
# Model card -- Regression Transformer
|
51 |
-
|
52 |
-
**Model Details**: The [Regression Transformer](https://arxiv.org/abs/2202.01338) is a multitask Transformer that reformulates regression as a conditional sequence modeling task. This yields a dichotomous language model that seamlessly integrates property prediction with property-driven conditional generation.
|
53 |
-
|
54 |
-
**Developers**: Jannis Born and Matteo Manica from IBM Research.
|
55 |
-
|
56 |
-
**Distributors**: Original authors' code wrapped and distributed by GT4SD Team (2023) from IBM Research.
|
57 |
-
|
58 |
-
**Model date**: Preprint released in 2022, currently under review at *Nature Machine Intelligence*.
|
59 |
-
|
60 |
-
**Algorithm version**: Models trained and distributed by the original authors.
|
61 |
-
- **Molecules: QED**: Model trained on 1.6M molecules (SELFIES) from ChEMBL and their QED scores.
|
62 |
-
- **Molecules: Solubility**: QED model finetuned on the ESOL dataset from [Delaney et al (2004), *J. Chem. Inf. Comput. Sci.*](https://pubs.acs.org/doi/10.1021/ci034243x) to predict water solubility. Model trained on augmented SELFIES.
|
63 |
-
- **Molecules: USPTO**: Model trained on 2.8M [chemical reactions](https://figshare.com/articles/dataset/Chemical_reactions_from_US_patents_1976-Sep2016_/5104873) from the US patent office. The model used SELFIES and a synthetic property (total molecular weight of all precursors).
|
64 |
-
- **Molecules: Polymer**: Model finetuned on 600 ROPs (ring-opening polymerizations) with monomer-catalyst pairs. Model used three properties: conversion (`<conv>`), PDI (`<pdi>`) and Molecular Weight (`<molwt>`). Model trained with augmented SELFIES, optimized only to generate catalysts, given a monomer and the property constraints. See the example for details.
|
65 |
-
- **Molecules: Cosmo_acdl**: Model finetuned on 56k molecules with two properties (*pKa_ACDL* and *pKa_COSMO*). Model used augmented SELFIES.
|
66 |
-
- **Molecules: Pfas**: Model finetuned on ~1k PFAS (Perfluoroalkyl and Polyfluoroalkyl Substances) molecules with 9 properties including some experimentally measured ones (biodegradability, LD50 etc) and some synthetic ones (SCScore, molecular weight). Model trained on augmented SELFIES.
|
67 |
-
- **Molecules: Logp_and_synthesizability**: Model trained on 2.9M molecules (SELFIES) from PubChem with **two** synthetic properties, the logP (partition coefficient) and the [SCScore by Coley et al. (2018); *J. Chem. Inf. Model.*](https://pubs.acs.org/doi/full/10.1021/acs.jcim.7b00622?casa_token=JZzOrdWlQ_QAAAAA%3A3_ynCfBJRJN7wmP2gyAR0EWXY-pNW_l-SGwSSU2SGfl5v5SxcvqhoaPNDhxq4THberPoyyYqTZELD4Ck)
|
68 |
-
- **Molecules: Crippen_logp**: Model trained on 2.9M molecules (SMILES) from PubChem, but *only* on logP (partition coefficient).
|
69 |
-
- **Proteins: Stability**: Model pretrained on 2.6M peptides from UniProt with the Boman index as property. Finetuned on the [**Stability**](https://www.science.org/doi/full/10.1126/science.aan0693) dataset from the [TAPE benchmark](https://proceedings.neurips.cc/paper/2019/hash/37f65c068b7723cd7809ee2d31d7861c-Abstract.html) which has ~65k samples.
|
70 |
-
|
71 |
-
**Model type**: A Transformer-based language model that is trained on alphanumeric sequence to simultaneously perform sequence regression or conditional sequence generation.
|
72 |
-
|
73 |
-
**Information about training algorithms, parameters, fairness constraints or other applied approaches, and features**:
|
74 |
-
All models are trained with an alternating training scheme that alternates between optimizing the cross-entropy loss on the property tokens ("regression") and the self-consistency objective on the molecular tokens. See the [Regression Transformer](https://arxiv.org/abs/2202.01338) paper for details.
|
75 |
-
|
76 |
-
**Paper or other resource for more information**:
|
77 |
-
The [Regression Transformer](https://arxiv.org/abs/2202.01338) paper. See the [source code](https://github.com/IBM/regression-transformer) for details.
|
78 |
-
|
79 |
-
**License**: MIT
|
80 |
-
|
81 |
-
**Where to send questions or comments about the model**: Open an issue on [GT4SD repository](https://github.com/GT4SD/gt4sd-core).
|
82 |
-
|
83 |
-
**Intended Use. Use cases that were envisioned during development**: Chemical research, in particular drug discovery.
|
84 |
-
|
85 |
-
**Primary intended uses/users**: Researchers and computational chemists using the model for model comparison or research exploration purposes.
|
86 |
-
|
87 |
-
**Out-of-scope use cases**: Production-level inference, producing molecules with harmful properties.
|
88 |
-
|
89 |
-
**Factors**: Not applicable.
|
90 |
-
|
91 |
-
**Metrics**: High predictive power for the properties of that specific algorithm version.
|
92 |
-
|
93 |
-
**Datasets**: Different ones, as described under **Algorithm version**.
|
94 |
-
|
95 |
-
**Ethical Considerations**: No specific considerations as no private/personal data is involved. Please consult with the authors in case of questions.
|
96 |
-
|
97 |
-
**Caveats and Recommendations**: Please consult with original authors in case of questions.
|
98 |
-
|
99 |
-
Model card prototype inspired by [Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)
|
100 |
-
|
101 |
-
|
102 |
-
## Citation
|
103 |
-
|
104 |
-
```bib
|
105 |
-
@article{born2022regression,
|
106 |
-
title={Regression Transformer: Concurrent Conditional Generation and Regression by Blending Numerical and Textual Tokens},
|
107 |
-
author={Born, Jannis and Manica, Matteo},
|
108 |
-
journal={arXiv preprint arXiv:2202.01338},
|
109 |
-
note={Spotlight talk at ICLR workshop on Machine Learning for Drug Discovery},
|
110 |
-
year={2022}
|
111 |
-
}
|
112 |
-
```
|
113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
model_cards/regression_transformer_description.md
DELETED
@@ -1,13 +0,0 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
<img align="right" src="https://raw.githubusercontent.com/GT4SD/gt4sd-core/main/docs/_static/gt4sd_logo.png" alt="logo" width="120" >
|
4 |
-
|
5 |
-
### Concurrent sequence regression and generation for molecular language modeling
|
6 |
-
|
7 |
-
The [Regression Transformer](https://arxiv.org/abs/2202.01338) is a multitask Transformer that reformulates regression as a conditional sequence modeling task.
|
8 |
-
This yields a dichotomous language model that seamlessly integrates property prediction with property-driven conditional generation. For details see the [arXiv preprint](https://arxiv.org/abs/2202.01338), the [development code](https://github.com/IBM/regression-transformer) and the [GT4SD endpoint](https://github.com/GT4SD/gt4sd-core) for inference.
|
9 |
-
|
10 |
-
Each `algorithm_version` refers to one trained model. Each model can be used for **two tasks**, either to *predict* one (or multiple) properties of a molecule or to *generate* a molecule (given a seed molecule and a property constraint).
|
11 |
-
|
12 |
-
For **examples** and **documentation** of the model parameters, please see below.
|
13 |
-
Moreover, we provide a **model card** ([Mitchell et al. (2019)](https://dl.acm.org/doi/abs/10.1145/3287560.3287596?casa_token=XD4eHiE2cRUAAAAA:NL11gMa1hGPOUKTAbtXnbVQBDBbjxwcjGECF_i-WC_3g1aBgU1Hbz_f2b4kI_m1in-w__1ztGeHnwHs)) at the bottom of this page.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
model_cards/regression_transformer_examples.csv
DELETED
@@ -1,9 +0,0 @@
|
|
1 |
-
Molecules: Logp_and_synthesizability,Generate,CCOC1=NC=NC(=C1C)NCCOC(C)C,3,Sample,1.2,20,True,0.3,"<logp>:0.390, <scs>:2.628",N,(C)C,CCO
|
2 |
-
Molecules: Qed,Generate,CC(C#C)N(C)C(=O)NC1=CC=C(Cl)C=C1,10,Sample,1.0,30,True,0.5,<qed>:0.75,"N, C","C(=O), CC",C1=CC=C(Cl)C=C1
|
3 |
-
Molecules: Logp_and_synthesizability,Predict,<logp>[MASK][MASK][MASK][MASK][MASK]|<scs>[MASK][MASK][MASK][MASK][MASK]|[C][C][O][C][=N][C][=N][C][Branch1_2][Branch1_1][=C][Ring1][Branch1_2][C][N][C][C][O][C][Branch1_1][C][C][C],1,Greedy,1.0,30,False,0.0,,,,
|
4 |
-
Proteins: Stability,Predict,<stab>[MASK][MASK][MASK][MASK][MASK]|GSQEVNSGTQTYKNASPEEAERIARKAGATTWTEKGNKWEIRI,1,Greedy,1.0,1,False,0.0,,,,
|
5 |
-
Proteins: Stability,Generate,GSQEVNSGTQTYKNASPEEAERIARKAGATTWTEKGNKWEIRI,10,Sample,1.2,30,True,0.3,<stab>:0.393,,SQEVNSGTQTYKN,WTEK
|
6 |
-
Molecules: Qed,Generate,<qed>0.717|[MASK][MASK][MASK][MASK][MASK][C][Branch2_1][Ring1][Ring1][MASK][MASK][=C][C][Branch1_1][C][C][=N][C][MASK][MASK][=C][C][=C][Ring1][O][Ring1][Branch1_2][=C][Ring2][MASK][MASK],10,Sample,1.2,30,False,0.0,,,,
|
7 |
-
Molecules: Solubility,Generate,ClC(Cl)C(Cl)Cl,5,Sample,1.3,40,True,0.4,<esol>:0.754,,,
|
8 |
-
Molecules: Polymer,Predict,<conv>[MASK][MASK][MASK][MASK]|<pdi>[MASK][MASK][MASK][MASK][MASK]|<molwt>[MASK][MASK][MASK][MASK][MASK]|[C][Branch1_2][C][=O][O][C@@Hexpl][Branch1_1][C][C][C][Branch1_2][C][=O][O][C@Hexpl][Ring1][Branch2_2][C].[C][C][C][Branch2_1][Ring1][Ring1][N][C][Branch1_1][=C][N][C][=C][C][=C][Branch1_1][Ring1][O][C][C][=C][Ring1][Branch2_1][=S][C][C][C][Ring2][Ring1][C],1,Greedy,1,0,False,,,,,
|
9 |
-
Molecules: Polymer,Generate,C1(=O)O[C@@H](C)C(=O)O[C@H]1C.C2CC(NC(NC1=CC=C(OC)C=C1)=S)CCC2,10,Sample,1.3,50,True,0.5,"<pdi>:3.490, <conv>:0.567, <molwt>:3.567",,,C1(=O)O[C@@H](C)C(=O)O[C@H]1C
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
@@ -8,7 +8,7 @@ torch-sparse
|
|
8 |
torch-geometric
|
9 |
torchvision==0.13.1
|
10 |
torchaudio==0.12.1
|
11 |
-
gt4sd>=1.
|
12 |
molgx>=0.22.0a1
|
13 |
molecule_generation
|
14 |
nglview
|
|
|
8 |
torch-geometric
|
9 |
torchvision==0.13.1
|
10 |
torchaudio==0.12.1
|
11 |
+
gt4sd>=1.1.1
|
12 |
molgx>=0.22.0a1
|
13 |
molecule_generation
|
14 |
nglview
|
utils.py
CHANGED
@@ -1,136 +1,25 @@
|
|
1 |
-
import json
|
2 |
import logging
|
3 |
-
import
|
4 |
-
|
5 |
-
from typing import Dict, List, Tuple
|
6 |
-
|
7 |
import mols2grid
|
8 |
import pandas as pd
|
9 |
-
from gt4sd.algorithms import (
|
10 |
-
RegressionTransformerMolecules,
|
11 |
-
RegressionTransformerProteins,
|
12 |
-
)
|
13 |
-
from gt4sd.algorithms.core import AlgorithmConfiguration
|
14 |
from rdkit import Chem
|
15 |
-
from terminator.selfies import decoder
|
16 |
|
17 |
logger = logging.getLogger(__name__)
|
18 |
logger.addHandler(logging.NullHandler())
|
19 |
|
20 |
|
21 |
-
def
|
22 |
-
|
23 |
-
Convert application name to AlgorithmConfiguration.
|
24 |
-
|
25 |
-
Args:
|
26 |
-
application: Molecules or Proteins
|
27 |
-
|
28 |
-
Returns:
|
29 |
-
The corresponding AlgorithmConfiguration
|
30 |
-
"""
|
31 |
-
if application == "Molecules":
|
32 |
-
application = RegressionTransformerMolecules
|
33 |
-
elif application == "Proteins":
|
34 |
-
application = RegressionTransformerProteins
|
35 |
-
else:
|
36 |
-
raise ValueError(
|
37 |
-
"Currently only models for molecules and proteins are supported"
|
38 |
-
)
|
39 |
-
return application
|
40 |
-
|
41 |
-
|
42 |
-
def get_inference_dict(
|
43 |
-
application: AlgorithmConfiguration, algorithm_version: str
|
44 |
-
) -> Dict:
|
45 |
-
"""
|
46 |
-
Get inference dictionary for a given application and algorithm version.
|
47 |
-
|
48 |
-
Args:
|
49 |
-
application: algorithm application (Molecules or Proteins)
|
50 |
-
algorithm_version: algorithm version (e.g. qed)
|
51 |
-
|
52 |
-
Returns:
|
53 |
-
A dictionary with the inference parameters.
|
54 |
-
"""
|
55 |
-
config = application(algorithm_version=algorithm_version)
|
56 |
-
with open(os.path.join(config.ensure_artifacts(), "inference.json"), "r") as f:
|
57 |
-
data = json.load(f)
|
58 |
-
return data
|
59 |
-
|
60 |
-
|
61 |
-
def get_rt_name(x: Dict) -> str:
|
62 |
-
"""
|
63 |
-
Get the UI display name of the regression transformer.
|
64 |
-
|
65 |
-
Args:
|
66 |
-
x: dictionary with the inference parameters
|
67 |
-
|
68 |
-
Returns:
|
69 |
-
The display name
|
70 |
-
"""
|
71 |
-
return (
|
72 |
-
x["algorithm_application"].split("Transformer")[-1]
|
73 |
-
+ ": "
|
74 |
-
+ x["algorithm_version"].capitalize()
|
75 |
-
)
|
76 |
-
|
77 |
-
|
78 |
-
def draw_grid_predict(prediction: str, target: str, domain: str) -> str:
|
79 |
-
"""
|
80 |
-
Uses mols2grid to draw a HTML grid for the prediction
|
81 |
-
|
82 |
-
Args:
|
83 |
-
prediction: Predicted sequence.
|
84 |
-
target: Target molecule
|
85 |
-
domain: Domain of the prediction (molecules or proteins)
|
86 |
-
|
87 |
-
Returns:
|
88 |
-
HTML to display
|
89 |
-
"""
|
90 |
-
|
91 |
-
if domain not in ["Molecules", "Proteins"]:
|
92 |
-
raise ValueError(f"Unsupported domain {domain}")
|
93 |
-
|
94 |
-
seq = target.split("|")[-1]
|
95 |
-
converter = (
|
96 |
-
decoder
|
97 |
-
if domain == "Molecules"
|
98 |
-
else lambda x: Chem.MolToSmiles(Chem.MolFromFASTA(x))
|
99 |
-
)
|
100 |
-
try:
|
101 |
-
seq = converter(seq)
|
102 |
-
except Exception:
|
103 |
-
logger.warning(f"Could not draw sequence {seq}")
|
104 |
-
|
105 |
-
result = {"SMILES": [seq], "Name": ["Target"]}
|
106 |
-
# Add properties
|
107 |
-
for prop in prediction.split("<")[1:]:
|
108 |
-
result[
|
109 |
-
prop.split(">")[0]
|
110 |
-
] = f"{prop.split('>')[0].capitalize()} = {prop.split('>')[1]}"
|
111 |
-
result_df = pd.DataFrame(result)
|
112 |
-
obj = mols2grid.display(
|
113 |
-
result_df,
|
114 |
-
tooltip=list(result.keys()),
|
115 |
-
height=900,
|
116 |
-
n_cols=1,
|
117 |
-
name="Results",
|
118 |
-
size=(600, 700),
|
119 |
-
)
|
120 |
-
return obj.data
|
121 |
-
|
122 |
-
|
123 |
-
def draw_grid_generate(
|
124 |
-
samples: List[Tuple[str]], domain: str, n_cols: int = 5, size=(140, 200)
|
125 |
) -> str:
|
126 |
"""
|
127 |
-
Uses mols2grid to draw a HTML grid for the
|
128 |
|
129 |
Args:
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
|
135 |
Returns:
|
136 |
HTML to display
|
@@ -140,29 +29,25 @@ def draw_grid_generate(
|
|
140 |
raise ValueError(f"Unsupported domain {domain}")
|
141 |
|
142 |
if domain == "Proteins":
|
143 |
-
|
144 |
-
smis = list(
|
145 |
-
map(lambda x: Chem.MolToSmiles(Chem.MolFromFASTA(x[0])), samples)
|
146 |
-
)
|
147 |
-
except Exception:
|
148 |
-
logger.warning(f"Could not convert some sequences {samples}")
|
149 |
else:
|
150 |
-
|
151 |
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
for prop in properties:
|
160 |
-
value = float(sample[1].split(prop)[-1][1:].split("<")[0])
|
161 |
-
result[prop].append(f"{prop} = {value}")
|
162 |
|
163 |
-
|
|
|
|
|
|
|
|
|
164 |
obj = mols2grid.display(
|
165 |
-
|
166 |
tooltip=list(result.keys()),
|
167 |
height=1100,
|
168 |
n_cols=n_cols,
|
|
|
|
|
1 |
import logging
|
2 |
+
from typing import List
|
3 |
+
import numpy as np
|
|
|
|
|
4 |
import mols2grid
|
5 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
6 |
from rdkit import Chem
|
|
|
7 |
|
8 |
logger = logging.getLogger(__name__)
|
9 |
logger.addHandler(logging.NullHandler())
|
10 |
|
11 |
|
12 |
+
def draw_grid_predict(
|
13 |
+
sequences: List[str], properties: np.array, property_names: List[str], domain: str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
) -> str:
|
15 |
"""
|
16 |
+
Uses mols2grid to draw a HTML grid for the prediction
|
17 |
|
18 |
Args:
|
19 |
+
sequences: Sequences for which properties are predicted.
|
20 |
+
properties: Predicted properties. Array of shape (n_samples, n_properties).
|
21 |
+
names: List of property names
|
22 |
+
domain: Domain of the prediction (molecules or proteins).
|
23 |
|
24 |
Returns:
|
25 |
HTML to display
|
|
|
29 |
raise ValueError(f"Unsupported domain {domain}")
|
30 |
|
31 |
if domain == "Proteins":
|
32 |
+
converter = lambda x: Chem.MolToSmiles(Chem.MolFromFASTA(x))
|
|
|
|
|
|
|
|
|
|
|
33 |
else:
|
34 |
+
converter = lambda x: x
|
35 |
|
36 |
+
smiles = []
|
37 |
+
for sequence in sequences:
|
38 |
+
try:
|
39 |
+
seq = converter(sequence)
|
40 |
+
smiles.append(seq)
|
41 |
+
except Exception:
|
42 |
+
logger.warning(f"Could not draw sequence {seq}")
|
|
|
|
|
|
|
43 |
|
44 |
+
result = pd.DataFrame({"SMILES": smiles})
|
45 |
+
for i, name in enumerate(property_names):
|
46 |
+
result[name] = properties[:, i]
|
47 |
+
n_cols = min(3, len(result))
|
48 |
+
size = (140, 200) if len(result) > 3 else (600, 700)
|
49 |
obj = mols2grid.display(
|
50 |
+
result,
|
51 |
tooltip=list(result.keys()),
|
52 |
height=1100,
|
53 |
n_cols=n_cols,
|