haryoaw committed on
Commit
acd7000
·
1 Parent(s): 570233d

initial commit

Browse files
Files changed (6) hide show
  1. .gitignore +2 -0
  2. app.py +78 -0
  3. gradio.py +40 -0
  4. requirements.txt +4 -0
  5. src/__init__.py +0 -0
  6. src/tokenizers.py +239 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ .mypy_cache
2
+ __pycache__
app.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Main App
3
+ """
4
+
5
+ import streamlit as st
6
+ from transformers import AutoModelForSeq2SeqLM
7
+
8
+ from src.tokenizers import IndoNLGTokenizer
9
+
10
+
11
@st.cache(allow_output_mutation=True)
def fetch_tokenizer_model():
    """
    Load the tokenizer and the seq2seq model used by the app.

    Returns
    -------
    tuple
        ``(tokenizer, model)``: the ``IndoNLGTokenizer`` loaded from
        ``indobenchmark/indobart-v2`` and the ``AutoModelForSeq2SeqLM``
        loaded from ``haryoaw/id-recigen-bart``.
    """
    return (
        IndoNLGTokenizer.from_pretrained("indobenchmark/indobart-v2"),
        AutoModelForSeq2SeqLM.from_pretrained("haryoaw/id-recigen-bart"),
    )


# Module-level handles read by ``predict_recipe``.
tokenizer, model = fetch_tokenizer_model()
22
+
23
+
24
def predict_recipe(food: str) -> str:
    """
    Generate the ingredient list for a dish name.

    Parameters
    ----------
    food: str
        Name of the dish; it is lower-cased before tokenization.

    Returns
    -------
    str
        The decoded model output, split on ``||``, with every piece
        stripped and placed on its own line.
    """
    input_ids = tokenizer(food.lower(), return_tensors="pt")["input_ids"]
    sequences = model.generate(
        input_ids,
        max_length=500,
        do_sample=False,
        num_beams=10,
        num_beam_groups=2,
    )
    # Only the first returned sequence is decoded.
    decoded: str = tokenizer.decode(sequences[0], skip_special_tokens=True)
    pieces = (piece.strip() for piece in decoded.split("||"))
    return "\n".join(pieces)
45
+
46
+
47
def create_frontend() -> None:
    """
    Render the Streamlit page: title, input form, prediction and notes.
    """
    st.markdown("# Food Ingredients Generator Indonesia Showcase!")
    st.write("🥑Generate your ingredients here!")

    # NOTE(review): the nesting below was reconstructed from a flattened
    # diff view; confirm that the result section belongs inside the form
    # and that the notes/models sections are rendered unconditionally.
    with st.form("my_form"):
        food_name = st.text_input(
            "Food", value="Nasi Goreng Ayam", help="Input your food here!"
        )
        if st.form_submit_button("Submit"):
            st_result = predict_recipe(food_name)
            st.markdown(f"## Bahan ( Ingredients ) `{food_name}`:")
            st.text(st_result)

    st.markdown("## Additional Note")
    st.write(
        "❗Please note that the model is trained with the food that use:"
    )
    trained_ingredients = ("ayam", "tempe", "ikan", "kambing", "telur", "tahu", "sapi")
    for position, ingredient in enumerate(trained_ingredients, start=1):
        st.write(f"{position}. {ingredient}")

    st.markdown("## Models")
    st.markdown(
        "🤗 Huggingface Model: [Link](https://huggingface.co/haryoaw/id-recigen-bart)"
    )
    st.write("Thank you 😊")


if __name__ == "__main__":
    create_frontend()
gradio.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Main App
3
+ """
4
+
5
+ import gradio as gr
6
+ from transformers import AutoModelForSeq2SeqLM
7
+
8
+ from src.tokenizers import IndoNLGTokenizer
9
+
10
+
11
# NOTE(review): this file is named ``gradio.py`` and imports ``gradio``;
# when run from its own directory the import may resolve to this file
# instead of the installed package — consider renaming the script.
# Both objects are created at import time and read by ``predict_recipe``.
tokenizer = IndoNLGTokenizer.from_pretrained("indobenchmark/indobart-v2")
model = AutoModelForSeq2SeqLM.from_pretrained("haryoaw/id-recigen-bart")
13
+
14
+
15
def predict_recipe(food: str) -> str:
    """
    Generate the ingredient list for a dish name.

    Parameters
    ----------
    food: str
        Name of the dish. It is lower-cased before tokenization so this
        entry point feeds the model the same way as ``predict_recipe`` in
        ``app.py``.

    Returns
    -------
    str
        The decoded model output, split on ``||``, with every piece
        stripped and placed on its own line.
    """
    inp = tokenizer(food.lower(), return_tensors="pt")["input_ids"]
    generated = model.generate(
        inp, max_length=500, do_sample=False, num_beams=10, num_beam_groups=2
    )
    # Only the first returned sequence is decoded.
    decoded: str = tokenizer.decode(generated[0], skip_special_tokens=True)
    return "\n".join(piece.strip() for piece in decoded.split("||"))
31
+
32
+
33
# One text box in (dish name), one text box out (generated ingredients).
food_box = gr.inputs.Textbox(placeholder="Food Name")
iface = gr.Interface(
    fn=predict_recipe,
    inputs=[food_box],
    outputs="textbox",
)

if __name__ == "__main__":
    # The launch result is unpacked into three names; none is used afterwards.
    app, local_url, share_url = iface.launch(share=False)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ sentencepiece>=0.1.95
2
+ transformers
3
+ torch
4
+ streamlit==1.8.1
src/__init__.py ADDED
File without changes
src/tokenizers.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License
15
+ """ Tokenization classes for IndoNLG model."""
16
+
17
+ import os
18
+ from shutil import copyfile
19
+ from typing import List, Optional, Tuple
20
+ from transformers import PreTrainedTokenizer
21
+
22
+ import sentencepiece as spm
23
+
24
+ from transformers.utils import logging
25
+
26
+
27
+ logger = logging.get_logger(__name__)
28
+
29
# File name of the SentencePiece model expected for the "vocab_file" argument.
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}

# Shortcut name -> download URL of the SentencePiece model.
# NOTE(review): these keys and URLs omit the ``indobenchmark/`` namespace that
# the other tables in this module use — confirm they resolve on the Hub.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "indobart": "https://huggingface.co/indobart/resolve/main/sentencepiece.bpe.model",
        # Fixed: the path separator before "resolve" was missing.
        "indogpt": "https://huggingface.co/indogpt/resolve/main/sentencepiece.bpe.model",
        "indobart-v2": "https://huggingface.co/indobart-v2/resolve/main/sentencepiece.bpe.model",
    }
}

# Model identifier -> positional-embedding size.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "indobenchmark/indobart": 768,
    # Fixed: key previously read "ndobenchmark/indogpt".
    "indobenchmark/indogpt": 768,
    "indobenchmark/indobart-v2": 768,
}

# Hub model identifiers (presumably the checkpoints that share this
# tokenizer — the original comment here was truncated; confirm).
SHARED_MODEL_IDENTIFIERS = [
    "indobenchmark/indobart",
    "indobenchmark/indogpt",
    "indobenchmark/indobart-v2",
]

# Character SentencePiece uses to mark a word boundary.
SPIECE_UNDERLINE = "▁"
53
+
54
class IndoNLGTokenizer(PreTrainedTokenizer):
    """
    SentencePiece-backed tokenizer for the IndoNLG models.

    On top of the SentencePiece vocabulary, four extra tokens (``[java]``,
    ``[sunda]``, ``[indonesia]`` and ``<mask>``) are mapped to the fixed ids
    40000-40003.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        decode_special_token=True,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        additional_special_tokens=None,
        **kwargs
    ):
        """
        Args:
            vocab_file: Path to the SentencePiece model file.
            decode_special_token: When ``False``, ids listed in
                ``special_token_ids`` are converted to an empty string.
            bos_token, eos_token, sep_token, cls_token, unk_token, pad_token,
            mask_token: Special token strings forwarded to the base class.
            additional_special_tokens: Extra special tokens; defaults to
                ``["[java]", "[sunda]", "[indonesia]", "<mask>"]``.
            **kwargs: Forwarded to ``PreTrainedTokenizer.__init__``.
        """
        # Build a fresh list per instance instead of sharing one mutable
        # default between every tokenizer that is ever created.
        if additional_special_tokens is None:
            additional_special_tokens = ["[java]", "[sunda]", "[indonesia]", "<mask>"]
        else:
            additional_special_tokens = list(additional_special_tokens)

        # Everything the token<->id conversion helpers read is set up before
        # calling the base constructor: some transformers releases query the
        # vocabulary from inside ``PreTrainedTokenizer.__init__``.
        self.vocab_file = vocab_file
        self.decode_special_token = decode_special_token
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(str(vocab_file))

        # HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual
        # sentencepiece vocabulary (this is the case for <s> and </s>)
        self.special_tokens_to_ids = {
            "[java]": 40000,
            "[sunda]": 40001,
            "[indonesia]": 40002,
            "<mask>": 40003,
        }
        self.special_ids_to_tokens = {v: k for k, v in self.special_tokens_to_ids.items()}
        # Placeholder so ``_convert_id_to_token`` works during base-class
        # initialisation; the real list is filled in below.
        self.special_token_ids = []

        super().__init__(
            vocab_file=vocab_file,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )
        self.model_max_length = 1024

        # Store language token ids.
        # NOTE(review): '[javanese]' / '[sundanese]' do not match the
        # '[java]' / '[sunda]' keys registered above — confirm which
        # spelling is intended before relying on these attributes.
        self.javanese_token = '[javanese]'
        self.javanese_token_id = 40000
        self.sundanese_token = '[sundanese]'
        self.sundanese_token_id = 40001
        self.indonesian_token = '[indonesia]'
        self.indonesian_token_id = 40002

        self.special_token_ids = [
            self.bos_token_id, self.eos_token_id, self.sep_token_id, self.cls_token_id,
            self.unk_token_id, self.pad_token_id, self.mask_token_id,
            self.javanese_token_id, self.sundanese_token_id, self.indonesian_token_id,
        ]

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences by adding
        the cls/sep special tokens. With the default tokens the format is:

        - single sequence: ``<s> X </s>``
        - pair of sequences: ``<s> A </s></s> B </s>``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.
        Returns:
            :obj:`List[int]`: List of input IDs with the special tokens added.
        """
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Return a mask marking where special tokens sit (or would be added).

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether the token list is already formatted with special
                tokens; if so the base-class implementation is used.
        Returns:
            :obj:`List[int]`: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Mirrors the layout produced by ``build_inputs_with_special_tokens``.
        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Return token type ids for one sequence or a sequence pair. This
        tokenizer always returns zeros, one per position of the input once
        special tokens are added.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.
        Returns:
            :obj:`List[int]`: List of zeros.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

    @property
    def vocab_size(self):
        """Size of the SentencePiece vocabulary plus 4 (presumably the four extra tokens)."""
        return 4 + len(self.sp_model)

    def get_vocab(self):
        """Return a token -> id dict covering ``range(vocab_size)`` plus any added tokens."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        """Split ``text`` into SentencePiece pieces."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        # The extra tokens have fixed ids and take precedence over SentencePiece.
        if token in self.special_tokens_to_ids:
            return self.special_tokens_to_ids[token]
        return self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        # While special-token decoding is switched off, special ids map to ''.
        if not self.decode_special_token and index in self.special_token_ids:
            return ''

        if index in self.special_ids_to_tokens:
            return self.special_ids_to_tokens[index]

        return self.sp_model.IdToPiece(index)

    def __getstate__(self):
        """Return the instance state without the SentencePiece processor."""
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        """Restore state and reload the SentencePiece model from ``vocab_file``."""
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        # str() matches how __init__ loads the file.
        self.sp_model.Load(str(self.vocab_file))

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        return self.sp_model.decode(tokens)

    def decode(self, inputs, skip_special_tokens=False, **kwargs):
        """
        Decode ids to text, temporarily aligning ``decode_special_token``
        with ``skip_special_tokens``.

        Args:
            inputs: Token ids to decode.
            skip_special_tokens: Whether special tokens are dropped.
            **kwargs: Extra options forwarded to the base ``decode`` (for
                example ``clean_up_tokenization_spaces``).
        Returns:
            The decoded string.
        """
        prev_val = self.decode_special_token
        self.decode_special_token = not skip_special_tokens
        try:
            return super().decode(inputs, skip_special_tokens=skip_special_tokens, **kwargs)
        finally:
            # Restore the flag even if decoding raises.
            self.decode_special_token = prev_val