Spaces:
Runtime error
initial commit
- .gitignore +2 -0
- app.py +78 -0
- gradio.py +40 -0
- requirements.txt +4 -0
- src/__init__.py +0 -0
- src/tokenizers.py +239 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
+.mypy_cache
+__pycache__
app.py
ADDED
@@ -0,0 +1,78 @@
+"""
+Main Streamlit app.
+"""
+
+import streamlit as st
+from transformers import AutoModelForSeq2SeqLM
+
+from src.tokenizers import IndoNLGTokenizer
+
+
+@st.cache(allow_output_mutation=True)
+def fetch_tokenizer_model():
+    """
+    Fetch the tokenizer and the model (cached across reruns).
+    """
+    tokenizer = IndoNLGTokenizer.from_pretrained("indobenchmark/indobart-v2")
+    model = AutoModelForSeq2SeqLM.from_pretrained("haryoaw/id-recigen-bart")
+    return tokenizer, model
+
+
+tokenizer, model = fetch_tokenizer_model()
+
+
+def predict_recipe(food: str) -> str:
+    """
+    Predict the ingredients of a food.
+
+    Parameters
+    ----------
+    food: str
+        The food name to generate ingredients for.
+
+    Returns
+    -------
+    str
+        The generated ingredient list, one ingredient per line.
+    """
+    inp = tokenizer(food.lower(), return_tensors="pt")["input_ids"]
+    generated = model.generate(
+        inp, max_length=500, do_sample=False, num_beams=10, num_beam_groups=2
+    )
+    returned_input: str = tokenizer.decode(generated[0], skip_special_tokens=True)
+    returned_input = "\n".join([x.strip() for x in returned_input.split("||")])
+    return returned_input
+
+
+def create_frontend() -> None:
+    """
+    Create the Streamlit front end.
+    """
+    st.markdown("# Food Ingredients Generator Indonesia Showcase!")
+    st.write("🥑 Generate your ingredients here!")
+
+    with st.form("my_form"):
+        food_name = st.text_input(
+            "Food", value="Nasi Goreng Ayam", help="Input your food here!"
+        )
+        submitted = st.form_submit_button("Submit")
+        if submitted:
+            predicted = predict_recipe(food_name)
+            st.markdown(f"## Bahan ( Ingredients ) `{food_name}`:")
+            st.text(predicted)
+    st.markdown("## Additional Note")
+    st.write(
+        "❗ Please note that the model was trained only on dishes that use the following main ingredients:"
+    )
+    for i, ingr in enumerate(("ayam", "tempe", "ikan", "kambing", "telur", "tahu", "sapi")):
+        st.write(f"{i+1}. {ingr}")
+
+    st.markdown("## Models")
+    st.markdown(
+        "🤗 Huggingface Model: [Link](https://huggingface.co/haryoaw/id-recigen-bart)"
+    )
+    st.write("Thank you 😊")
+
+
+if __name__ == "__main__":
+    create_frontend()
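Note: app.py is the Streamlit entry point of this Space, normally started with `streamlit run app.py`. As a minimal, hypothetical smoke test of the generation pipeline outside Streamlit (assuming the packages in requirements.txt are installed and the two checkpoints can be downloaded), one could reuse the function defined above:

# Hypothetical smoke test; not part of this commit.
# Importing app.py also triggers the tokenizer/model download via fetch_tokenizer_model().
from app import predict_recipe

print(predict_recipe("Nasi Goreng Ayam"))  # prints one generated ingredient per line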
gradio.py
ADDED
@@ -0,0 +1,40 @@
+"""
+Gradio demo app.
+"""
+
+import gradio as gr
+from transformers import AutoModelForSeq2SeqLM
+
+from src.tokenizers import IndoNLGTokenizer
+
+
+tokenizer = IndoNLGTokenizer.from_pretrained("indobenchmark/indobart-v2")
+model = AutoModelForSeq2SeqLM.from_pretrained("haryoaw/id-recigen-bart")
+
+
+def predict_recipe(food: str) -> str:
+    """
+    Predict the ingredients of a food.
+
+    Parameters
+    ----------
+    food: str
+        The food name to generate ingredients for.
+    """
+    inp = tokenizer(food, return_tensors="pt")["input_ids"]
+    generated = model.generate(
+        inp, max_length=500, do_sample=False, num_beams=10, num_beam_groups=2
+    )
+    returned_input: str = tokenizer.decode(generated[0], skip_special_tokens=True)
+    returned_input = "\n".join([x.strip() for x in returned_input.split("||")])
+    return returned_input
+
+
+iface = gr.Interface(
+    fn=predict_recipe,
+    inputs=[gr.inputs.Textbox(placeholder="Food Name")],
+    outputs="textbox",
+)
+
+if __name__ == "__main__":
+    app, local_url, share_url = iface.launch(share=False)
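Note: gradio.py is an alternative local demo of the same model; gradio itself is not pinned in requirements.txt, so running it assumes a separate gradio install. A hypothetical variant of the final block above (not part of this commit) would also expose a temporary public URL:

# Hypothetical variant of the last lines of gradio.py; assumes gradio is installed.
if __name__ == "__main__":
    # share=True serves locally and additionally returns a temporary public share URL.
    app, local_url, share_url = iface.launch(share=True)
    print("local:", local_url, "public:", share_url)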
requirements.txt
ADDED
@@ -0,0 +1,4 @@
+sentencepiece>=0.1.95
+transformers
+torch
+streamlit==1.8.1
src/__init__.py
ADDED
File without changes
src/tokenizers.py
ADDED
@@ -0,0 +1,239 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+"""Tokenization classes for the IndoNLG model."""
+
+import os
+from shutil import copyfile
+from typing import List, Optional, Tuple
+from transformers import PreTrainedTokenizer
+
+import sentencepiece as spm
+
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "indobart": "https://huggingface.co/indobart/resolve/main/sentencepiece.bpe.model",
+        "indogpt": "https://huggingface.co/indogpt/resolve/main/sentencepiece.bpe.model",
+        "indobart-v2": "https://huggingface.co/indobart-v2/resolve/main/sentencepiece.bpe.model"
+    }
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "indobenchmark/indobart": 768,
+    "indobenchmark/indogpt": 768,
+    "indobenchmark/indobart-v2": 768
+}
+
+SHARED_MODEL_IDENTIFIERS = [
+    # Load with
+    "indobenchmark/indobart",
+    "indobenchmark/indogpt",
+    "indobenchmark/indobart-v2"
+]
+
+SPIECE_UNDERLINE = "▁"
+
+class IndoNLGTokenizer(PreTrainedTokenizer):
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        decode_special_token=True,
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        additional_special_tokens=["[java]", "[sunda]", "[indonesia]", "<mask>"],
+        **kwargs
+    ):
+        super().__init__(
+            vocab_file=vocab_file,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            additional_special_tokens=additional_special_tokens,
+            **kwargs,
+        )
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(str(vocab_file))
+        self.vocab_file = vocab_file
+        self.decode_special_token = decode_special_token
+        self.model_max_length = 1024
+
+        # HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual
+        # sentencepiece vocabulary (this is the case for <s> and </s>).
+        self.special_tokens_to_ids = {
+            "[java]": 40000,
+            "[sunda]": 40001,
+            "[indonesia]": 40002,
+            "<mask>": 40003
+        }
+        self.special_ids_to_tokens = {v: k for k, v in self.special_tokens_to_ids.items()}
+
+        # Store language token IDs
+        self.javanese_token = '[javanese]'
+        self.javanese_token_id = 40000
+        self.sundanese_token = '[sundanese]'
+        self.sundanese_token_id = 40001
+        self.indonesian_token = '[indonesia]'
+        self.indonesian_token_id = 40002
+
+        self.special_token_ids = [
+            self.bos_token_id, self.eos_token_id, self.sep_token_id, self.cls_token_id,
+            self.unk_token_id, self.pad_token_id, self.mask_token_id,
+            self.javanese_token_id, self.sundanese_token_id, self.indonesian_token_id
+        ]
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
+        and adding special tokens. An IndoNLG sequence has the following format:
+        - single sequence: ``<s> X </s>``
+        - pair of sequences: ``<s> A </s></s> B </s>``
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+        """
+
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer ``prepare_for_model`` method.
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs.
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+        Returns:
+            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        if token_ids_1 is None:
+            return [1] + ([0] * len(token_ids_0)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. IndoNLG, like
+        RoBERTa, does not make use of token type ids, therefore a list of zeros is returned.
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs.
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            :obj:`List[int]`: List of zeros.
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+
+    @property
+    def vocab_size(self):
+        return 4 + len(self.sp_model)
+
+    def get_vocab(self):
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, text: str) -> List[str]:
+        return self.sp_model.encode(text, out_type=str)
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) to an id using the vocab."""
+        if token in self.special_tokens_to_ids:
+            return self.special_tokens_to_ids[token]
+        return self.sp_model.PieceToId(token)
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) to a token (str) using the vocab."""
+        if not self.decode_special_token and index in self.special_token_ids:
+            return ''
+
+        if index in self.special_ids_to_tokens:
+            return self.special_ids_to_tokens[index]
+
+        return self.sp_model.IdToPiece(index)
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__ = d
+
+        # for backward compatibility
+        if not hasattr(self, "sp_model_kwargs"):
+            self.sp_model_kwargs = {}
+
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(self.vocab_file)
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (strings for sub-words) into a single string."""
+        return self.sp_model.decode(tokens)
+
+    def decode(self, inputs, skip_special_tokens=False):
+        prev_val = self.decode_special_token
+        self.decode_special_token = not skip_special_tokens
+
+        outputs = super().decode(inputs, skip_special_tokens=skip_special_tokens)
+        self.decode_special_token = prev_val
+
+        return outputs
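For context, a minimal sketch of how the two apps above exercise this tokenizer (assuming the dependencies in requirements.txt are installed and the indobenchmark/indobart-v2 checkpoint can be downloaded):

# Hypothetical round-trip check for IndoNLGTokenizer; not part of this commit.
from src.tokenizers import IndoNLGTokenizer

tokenizer = IndoNLGTokenizer.from_pretrained("indobenchmark/indobart-v2")
ids = tokenizer("nasi goreng ayam", return_tensors="pt")["input_ids"]
print(ids)  # tensor of subword ids with <s>/</s> added by build_inputs_with_special_tokens
print(tokenizer.decode(ids[0], skip_special_tokens=True))  # should roughly reproduce the input text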