Files changed (2)
  1. load_model.py +253 -0
  2. ui.py +97 -0
load_model.py ADDED
@@ -0,0 +1,253 @@
+ import sys
+ import os
+ import pickle
+
+ import numpy as np
+ import torch
+ import tensorflow as tf
+ from tensorflow.keras.models import load_model, Model
+ from tensorflow.keras.layers import Input
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
+ from transformers import (
+     AutoTokenizer, EncoderDecoderModel, T5Tokenizer,
+     T5ForConditionalGeneration, MarianMTModel, MarianTokenizer,
+ )
+
+ # Make the sibling model packages importable before importing from them.
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+ from GRU_with_attention_ver4.load_GRU_model import translate_GRU
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ def load_model_BERT_BARTPho():
+     encoder_model_name = "bert-base-uncased"
+     decoder_model_name = "vinai/bartpho-word"
+     model_path = "./EncoderDecoder_6"
+
+     encoder = AutoTokenizer.from_pretrained(encoder_model_name)
+     decoder = AutoTokenizer.from_pretrained(decoder_model_name)
+     model = EncoderDecoderModel.from_pretrained(model_path).to(device)
+     return encoder, decoder, model
+
+ def translate_BERT_BARTPho(input_text, encoder=None, decoder=None, model=None):
+     if encoder is None or decoder is None or model is None:
+         encoder, decoder, model = load_model_BERT_BARTPho()
+
+     inputs = encoder(
+         input_text,
+         return_tensors="pt",
+         padding=True,
+         truncation=True
+     )
+     # Move the input tensors to the same device as the model.
+     inputs = {key: value.to(model.device) for key, value in inputs.items()}
+     outputs = model.generate(inputs["input_ids"], max_length=64, num_beams=4)
+
+     return decoder.decode(outputs[0], skip_special_tokens=True)
+
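+ # Usage sketch (assumes the fine-tuned ./EncoderDecoder_6 checkpoint exists
+ # locally); pre-loading once avoids re-reading the weights on every call:
+ #
+ #     encoder, decoder, model = load_model_BERT_BARTPho()
+ #     print(translate_BERT_BARTPho("I go to school", encoder, decoder, model))
+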
+ def load_model_T5():
+     model_folder = "./T5_ver3"
+     decoder_path = model_folder + "/vi_tokenizer_32128.model"
+
+     encoder = T5Tokenizer.from_pretrained("t5-small", skip_special_tokens=True)
+     decoder = T5Tokenizer.from_pretrained(decoder_path, skip_special_tokens=True)
+     model = T5ForConditionalGeneration.from_pretrained(model_folder, max_length=64).to(device)
+     return encoder, decoder, model
+
+ def translate_T5(input_text, encoder=None, decoder=None, model=None):
+     if encoder is None or decoder is None or model is None:
+         encoder, decoder, model = load_model_T5()
+
+     # Run the translation.
+     inputs = encoder(input_text, return_tensors="pt").to(device)
+     outputs = model.generate(inputs['input_ids'])
+     output_text = decoder.decode(outputs[0].tolist(), skip_special_tokens=True)
+
+     return output_text
+
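+ # Note: translate_T5 relies on the max_length=64 configured at load time;
+ # explicit generation arguments (e.g. num_beams=4) could be passed to
+ # model.generate as in translate_BERT_BARTPho above if needed.
+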
+ def load_model_BiLSTM():
+     model_folder = "./BiLSTM_2"
+     encoder_path = model_folder + "/english_tokenizer.pkl"
+     decoder_path = model_folder + "/vietnamese_tokenizer.pkl"
+     model_path = model_folder + "/my_model_1.keras"
+
+     with open(encoder_path, "rb") as f:
+         encoder = pickle.load(f)
+     with open(decoder_path, "rb") as f:
+         decoder = pickle.load(f)
+
+     model = load_model(model_path)
+     return encoder, decoder, model
+
+ def translate_BiLSTM(input_text, encoder=None, decoder=None, model=None):
+     if encoder is None or decoder is None or model is None:
+         encoder, decoder, model = load_model_BiLSTM()
+
+     # Extract components from the trained model
+     encoder_input = model.input[0]  # Input tensor for the encoder
+     encoder_output = model.get_layer("bidirectional").output[0]
+     encoder_state_h = model.get_layer("state_h_concat").output
+     encoder_state_c = model.get_layer("state_c_concat").output
+
+     # Build the encoder inference model
+     encoder_model = Model(encoder_input, [encoder_output, encoder_state_h, encoder_state_c])
+
+     # Extract decoder components
+     decoder_embedding = model.get_layer("decoder_embedding")
+     decoder_lstm = model.get_layer("decoder_lstm")
+     decoder_dense = model.get_layer("decoder_dense")
+
+     # Define decoder inference inputs (states are units * 2 wide because the
+     # encoder is bidirectional)
+     units = 128  # LSTM units
+     decoder_state_input_h = Input(shape=(units * 2,), name="decoder_state_input_h")
+     decoder_state_input_c = Input(shape=(units * 2,), name="decoder_state_input_c")
+     decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
+
+     # Reuse the embedding and LSTM layers
+     decoder_input = Input(shape=(1,), name="decoder_input")  # One time step
+     decoder_embedding_inf = decoder_embedding(decoder_input)
+     decoder_output_inf, state_h_inf, state_c_inf = decoder_lstm(
+         decoder_embedding_inf, initial_state=decoder_states_inputs
+     )
+     decoder_states_inf = [state_h_inf, state_c_inf]
+
+     # Dense layer for token probabilities
+     decoder_output_inf = decoder_dense(decoder_output_inf)
+
+     # Build the decoder inference model
+     decoder_model = Model(
+         [decoder_input] + decoder_states_inputs,   # Inputs
+         [decoder_output_inf] + decoder_states_inf  # Outputs
+     )
+
+     # Helper functions
+     def preprocess_sentence(sentence, tokenizer, max_length):
+         """Preprocess and tokenize an input sentence."""
+         sequence = tokenizer.texts_to_sequences([sentence])
+         return pad_sequences(sequence, maxlen=max_length, padding='post')
+
+     def decode_sequence(input_seq):
+         """Generate a Vietnamese sentence from an English input sequence."""
+         # Encode the input sequence to get the initial decoder states
+         encoder_output, state_h, state_c = encoder_model.predict(input_seq)
+
+         # Initialize the decoder input with the <SOS> token
+         target_seq = np.zeros((1, 1))  # Shape: (batch_size, 1)
+         target_seq[0, 0] = decoder.texts_to_sequences(["<SOS>"])[0][0]
+
+         # Initialize states
+         states = [state_h, state_c]
+
+         # Generate the output sequence token by token
+         decoded_sentence = []
+         for _ in range(232):
+             output_tokens, h, c = decoder_model.predict([target_seq] + states)
+
+             # Pick the most likely next token
+             sampled_token_index = np.argmax(output_tokens[0, -1, :])
+             sampled_token = decoder.index_word.get(sampled_token_index, '<unk>')
+             if sampled_token == '<eos>':
+                 break
+
+             decoded_sentence.append(sampled_token)
+
+             # Feed the predicted token back in as the next decoder input
+             target_seq[0, 0] = sampled_token_index
+
+             # Update states
+             states = [h, c]
+
+         return ' '.join(decoded_sentence)
+
+     max_input_length = 193  # Adjust to match the tokenizer setup used in training
+     input_sequence = preprocess_sentence(input_text, encoder, max_input_length)
+
+     # Generate translation
+     translation = decode_sequence(input_sequence)
+
+     return translation
+
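+ # Note: the inference models above are rebuilt from the trained checkpoint by
+ # layer name ("bidirectional", "state_h_concat", "state_c_concat",
+ # "decoder_embedding", "decoder_lstm", "decoder_dense"); `units = 128`,
+ # `max_input_length = 193`, and the 232-step decode cap are assumptions that
+ # must match the training configuration of my_model_1.keras.
+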
+ # translate_GRU is imported from GRU_with_attention_ver4.load_GRU_model above;
+ # the earlier local GRU loader/wrapper (GRU_with_attention_ver3) was removed
+ # in favor of that import.
+
+ def load_model_LSTM():
+     encoder_model_name = "bert-base-uncased"
+     decoder_model_name = "vinai/phobert-base"
+     model_path = "./LSTM_Attention_2/best_model.keras"
+
+     encoder = AutoTokenizer.from_pretrained(encoder_model_name)
+     decoder = AutoTokenizer.from_pretrained(decoder_model_name)
+     model = load_model(model_path)
+     return encoder, decoder, model
+
+ def translate_LSTM(input_text, encoder=None, decoder=None, model=None):
+     max_length = 50
+
+     if encoder is None or decoder is None or model is None:
+         encoder, decoder, model = load_model_LSTM()
+
+     def greedy_decode(input_sequence, model, decoder, max_length=50):
+         input_sequence = tf.constant([input_sequence], dtype=tf.int64)
+
+         # Start the target sequence with only the start token
+         start_token = decoder.cls_token_id
+         end_token = decoder.sep_token_id
+
+         target_sequence = [start_token]
+
+         for _ in range(max_length):
+             # Prepare input for the decoder
+             decoder_input = tf.constant([target_sequence], dtype=tf.int64)
+
+             # Predict next-token probabilities
+             predictions = model.predict([input_sequence, decoder_input], verbose=0)
+
+             # Take the last time step and pick the highest-probability token
+             next_token = tf.argmax(predictions[:, -1, :], axis=-1).numpy()[0]
+
+             # Append the predicted token to the target sequence
+             target_sequence.append(next_token)
+
+             # Stop if the end token is predicted
+             if next_token == end_token:
+                 break
+
+         # Decode the target sequence to text (dropping the start token)
+         translated_sentence = decoder.decode(target_sequence[1:], skip_special_tokens=True)
+         return translated_sentence
+
+     input_tokens = encoder.encode(input_text, add_special_tokens=True)
+     translated_text = greedy_decode(input_tokens, model, decoder, max_length=max_length)
+     return translated_text
+
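+ # Greedy decoding above makes one model.predict call per generated token (up
+ # to max_length = 50); the Hugging Face models in this file use beam search
+ # (num_beams=4) via model.generate instead.
+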
+ def load_model_MarianMT():
+     tokenizer_model_name = "Helsinki-NLP/opus-mt-en-vi"
+     model_path = "./MarianMT_ver2"
+
+     tokenizer = MarianTokenizer.from_pretrained(tokenizer_model_name)
+     model = MarianMTModel.from_pretrained(model_path).to(device)
+     return tokenizer, model
+
+ def translate_MarianMT(input_text, model=None, tokenizer=None):
+     if model is None or tokenizer is None:
+         tokenizer, model = load_model_MarianMT()
+
+     inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
+     inputs = {key: value.to(device) for key, value in inputs.items()}
+
+     outputs = model.generate(**inputs, max_length=64, num_beams=4)
+     translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+     return translated_text
+
+ if __name__ == "__main__":
+     # Avoid shadowing the built-in input()
+     input_text = "I go to school"
+
+     translated_text = translate_LSTM(input_text)
+
+     print(translated_text)
ui.py ADDED
@@ -0,0 +1,97 @@
+ import re
+
+ import streamlit as st
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+
+ from load_model import (
+     translate_T5, translate_BERT_BARTPho, translate_LSTM,
+     translate_BiLSTM, translate_GRU, translate_MarianMT,
+ )
+
+ # Run with: python -m streamlit run ui.py
+
+ MAX_LENGTH = 64
+
+ st.title("Machine Translation")
+ st.markdown('<p style="font-size:24px; font-weight:bold;">English - Vietnamese</p>',
+             unsafe_allow_html=True)
+
+ # Load the summarization model once per session
+ if 'summarize_model' not in st.session_state:
+     summarize_model_dir = "./Summarization"
+     st.session_state.summarize_tokenizer = AutoTokenizer.from_pretrained(summarize_model_dir)
+     st.session_state.summarize_model = AutoModelForSeq2SeqLM.from_pretrained(summarize_model_dir)
+     print("Summarize model loaded")
+
+ model_name = st.selectbox("Select Model", ["BERT_BARTPho", "T5", "BiLSTM", "GRU", "LSTM", "MarianMT"], index=None, placeholder="Select a Model")
+
+ input_text = st.text_area(
+     "Input Text:",
+     placeholder="Enter your text here...",
+     height=150,
+     key="input_text",
+     help=f"If your input text is more than {MAX_LENGTH} words, it will be summarized and then translated.",
+     value="Today, I go to school"
+ )
+
+ def summarize(input_text):
+     """Summarize inputs longer than MAX_LENGTH words; return short inputs as-is."""
+     if len(input_text.split()) <= MAX_LENGTH:
+         return input_text
+
+     st.write(f"Your input paragraph is more than {MAX_LENGTH} words!")
+
+     summarize_tokenizer = st.session_state.summarize_tokenizer
+     summarize_model = st.session_state.summarize_model
+
+     inputs = summarize_tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
+     outputs = summarize_model.generate(**inputs, max_length=100, num_beams=5, length_penalty=2.0, early_stopping=True)
+
+     summarized_input_text = summarize_tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+     return summarized_input_text
+
+ def cut_sentence(input_text):
+     sentences = re.split(r'(?<=[.!?]) +', input_text.strip())
+     return sentences
+
+ # Summarize over-length inputs before translating, as the help text promises
+ input_text = summarize(input_text)
+ st.write(input_text)
+
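+ # cut_sentence is currently unused. A minimal sketch of sentence-by-sentence
+ # translation as an alternative to summarizing long inputs (assumes the
+ # selected translate_* function, e.g. translate_MarianMT):
+ #
+ #     translated = " ".join(translate_MarianMT(s) for s in cut_sentence(input_text))
+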
+ st.markdown(
+     """
+     <style>
+     input[type=text] {
+         width: 500%;
+     }
+     </style>
+     """,
+     unsafe_allow_html=True
+ )
+
+ if st.button("Translate"):
+     if model_name == "BERT_BARTPho":
+         translated_text = translate_BERT_BARTPho(input_text)
+         st.write(f"Translation for {model_name}:")
+         st.write(translated_text)
+
+     elif model_name == "T5":
+         translated_text = translate_T5(input_text)
+         st.write(f"Translation for {model_name}:")
+         st.write(translated_text)
+
+     elif model_name == "BiLSTM":
+         translated_text = translate_BiLSTM(input_text)
+         st.write(f"Translation for {model_name}:")
+         st.write(translated_text)
+
+     elif model_name == "GRU":
+         translated_text = translate_GRU(input_text)
+         st.write(f"Translation for {model_name}:")
+         st.write(translated_text)
+
+     elif model_name == "LSTM":
+         translated_text = translate_LSTM(input_text)
+         st.write(f"Translation for {model_name}:")
+         st.write(translated_text)
+
+     elif model_name == "MarianMT":
+         translated_text = translate_MarianMT(input_text)
+         st.write(f"Translation for {model_name}:")
+         st.write(translated_text)
+
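+ # Note: each Translate click reloads the selected model from disk inside the
+ # translate_* helpers. A minimal caching sketch using st.session_state
+ # (hypothetical key; assumes load_model_BERT_BARTPho is also imported):
+ #
+ #     if "bartpho" not in st.session_state:
+ #         st.session_state.bartpho = load_model_BERT_BARTPho()
+ #     encoder, decoder, model = st.session_state.bartpho
+ #     translated_text = translate_BERT_BARTPho(input_text, encoder, decoder, model)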