UI update #45
opened by khaihoang004

Files changed:
- load_model.py +253 -0
- ui.py +97 -0
load_model.py
ADDED
@@ -0,0 +1,253 @@
import sys
import os

# The repository root must be on sys.path before importing sibling packages
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

import pickle
import numpy as np
import torch
import tensorflow as tf
from transformers import (
    AutoTokenizer,
    EncoderDecoderModel,
    T5Tokenizer,
    T5ForConditionalGeneration,
    MarianMTModel,
    MarianTokenizer,
)
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.layers import Input
from tensorflow.keras.preprocessing.sequence import pad_sequences

from GRU_with_attention_ver4.load_GRU_model import translate_GRU

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def load_model_BERT_BARTPho():
    encoder_model_name = "bert-base-uncased"
    decoder_model_name = "vinai/bartpho-word"
    model_path = "./EncoderDecoder_6"

    encoder = AutoTokenizer.from_pretrained(encoder_model_name)
    decoder = AutoTokenizer.from_pretrained(decoder_model_name)
    model = EncoderDecoderModel.from_pretrained(model_path).to(device)
    return encoder, decoder, model


def translate_BERT_BARTPho(input_text, encoder=None, decoder=None, model=None):
    if encoder is None or decoder is None or model is None:
        encoder, decoder, model = load_model_BERT_BARTPho()

    inputs = encoder(
        input_text,
        return_tensors="pt",
        padding=True,
        truncation=True
    )
    inputs = {key: value.to(model.device) for key, value in inputs.items()}
    outputs = model.generate(inputs["input_ids"], max_length=64, num_beams=4)

    return decoder.decode(outputs[0], skip_special_tokens=True)


def load_model_T5():
    model_folder = "./T5_ver3"
    decoder_path = model_folder + "/vi_tokenizer_32128.model"

    encoder = T5Tokenizer.from_pretrained("t5-small", skip_special_tokens=True)
    decoder = T5Tokenizer.from_pretrained(pretrained_model_name_or_path=decoder_path, skip_special_tokens=True)
    model = T5ForConditionalGeneration.from_pretrained(model_folder, max_length=64).to(device)
    return encoder, decoder, model


def translate_T5(input_text, encoder=None, decoder=None, model=None):
    if encoder is None or decoder is None or model is None:
        encoder, decoder, model = load_model_T5()

    # Run the translation
    inputs = encoder(input_text, return_tensors="pt").to(device)
    outputs = model.generate(inputs['input_ids'])
    output_text = decoder.decode(outputs[0].tolist(), skip_special_tokens=True)

    return output_text


def load_model_BiLSTM():
    model_folder = "./BiLSTM_2"
    encoder_path = model_folder + "/english_tokenizer.pkl"
    decoder_path = model_folder + "/vietnamese_tokenizer.pkl"
    model_path = model_folder + "/my_model_1.keras"

    with open(encoder_path, "rb") as f:
        encoder = pickle.load(f)
    with open(decoder_path, "rb") as f:
        decoder = pickle.load(f)

    model = load_model(model_path)
    return encoder, decoder, model


def translate_BiLSTM(input_text, encoder=None, decoder=None, model=None):
    if encoder is None or decoder is None or model is None:
        encoder, decoder, model = load_model_BiLSTM()

    # Extract components from the trained model
    encoder_input = model.input[0]  # Input tensor for the encoder
    encoder_output = model.get_layer("bidirectional").output[0]
    encoder_state_h = model.get_layer("state_h_concat").output
    encoder_state_c = model.get_layer("state_c_concat").output

    # Build the encoder inference model
    encoder_model = Model(encoder_input, [encoder_output, encoder_state_h, encoder_state_c])

    # Extract decoder components
    decoder_embedding = model.get_layer("decoder_embedding")
    decoder_lstm = model.get_layer("decoder_lstm")
    decoder_dense = model.get_layer("decoder_dense")

    # Define decoder inference inputs (state size is doubled by the bidirectional encoder)
    units = 128  # LSTM units
    decoder_state_input_h = Input(shape=(units * 2,), name="decoder_state_input_h")
    decoder_state_input_c = Input(shape=(units * 2,), name="decoder_state_input_c")
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

    # Reuse the embedding and LSTM layers
    decoder_input = Input(shape=(1,), name="decoder_input")  # Decoder input for one time step
    decoder_embedding_inf = decoder_embedding(decoder_input)
    decoder_output_inf, state_h_inf, state_c_inf = decoder_lstm(
        decoder_embedding_inf, initial_state=decoder_states_inputs
    )
    decoder_states_inf = [state_h_inf, state_c_inf]

    # Dense layer for token probabilities
    decoder_output_inf = decoder_dense(decoder_output_inf)

    # Build the decoder inference model
    decoder_model = Model(
        [decoder_input] + decoder_states_inputs,   # Inputs
        [decoder_output_inf] + decoder_states_inf  # Outputs
    )

    # Helper functions
    def preprocess_sentence(sentence, tokenizer, max_length):
        """Preprocess and tokenize an input sentence."""
        sequence = tokenizer.texts_to_sequences([sentence])
        return pad_sequences(sequence, maxlen=max_length, padding='post')

    def decode_sequence(input_seq):
        """Generate a Vietnamese sentence from an English input sequence."""
        # Encode the input sequence to get the initial states
        encoder_output, state_h, state_c = encoder_model.predict(input_seq)

        # Initialize the decoder input with the <SOS> token
        target_seq = np.zeros((1, 1))  # Shape: (batch_size, 1)
        target_seq[0, 0] = decoder.texts_to_sequences(["<SOS>"])[0][0]

        # Initialize states
        states = [state_h, state_c]

        # Generate the output sequence token by token
        decoded_sentence = []
        for _ in range(232):  # Maximum output length
            output_tokens, h, c = decoder_model.predict([target_seq] + states)

            # Greedily sample the next token
            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            sampled_token = decoder.index_word.get(sampled_token_index, '<unk>')
            if sampled_token == '<eos>':
                break

            decoded_sentence.append(sampled_token)

            # Update the target sequence (input to the decoder)
            target_seq[0, 0] = sampled_token_index

            # Update states
            states = [h, c]

        return ' '.join(decoded_sentence)

    max_input_length = 193  # Adjust based on your tokenizer setup
    input_sequence = preprocess_sentence(input_text, encoder, max_input_length)

    # Generate the translation
    translation = decode_sequence(input_sequence)

    return translation


# def load_model_GRU():
#     model_folder = "./GRU_with_attention_ver3"
#     return encoder, decoder, model

# def translate_GRU(input_text, encoder=None, decoder=None, model=None):
#     translation = translate_GRU(input_text)
#     return translation


def load_model_LSTM():
    encoder_model_name = "bert-base-uncased"
    decoder_model_name = "vinai/phobert-base"
    model_path = os.path.join("LSTM_Attention_2", "best_model.keras")  # portable path instead of a hard-coded Windows one

    encoder = AutoTokenizer.from_pretrained(encoder_model_name)
    decoder = AutoTokenizer.from_pretrained(decoder_model_name)
    model = load_model(model_path)
    return encoder, decoder, model


def translate_LSTM(input_text, encoder=None, decoder=None, model=None):
    max_length = 50

    if encoder is None or decoder is None or model is None:
        encoder, decoder, model = load_model_LSTM()

    def greedy_decode(input_sequence, model, decoder, max_length=50):
        input_sequence = tf.constant([input_sequence], dtype=tf.int64)

        # Start with a target sequence containing only the start token
        start_token = decoder.cls_token_id
        end_token = decoder.sep_token_id

        target_sequence = [start_token]

        for _ in range(max_length):
            # Prepare input for the decoder
            decoder_input = tf.constant([target_sequence], dtype=tf.int64)

            # Predict next-token probabilities
            predictions = model.predict([input_sequence, decoder_input], verbose=0)

            # Take the last time step and pick the highest-probability token
            next_token = tf.argmax(predictions[:, -1, :], axis=-1).numpy()[0]

            # Append the predicted token to the target sequence
            target_sequence.append(next_token)

            # Stop if the end token is predicted
            if next_token == end_token:
                break

        # Decode the target sequence to text
        translated_sentence = decoder.decode(target_sequence[1:], skip_special_tokens=True)
        return translated_sentence

    input_tokens = encoder.encode(input_text, add_special_tokens=True)
    translated_text = greedy_decode(input_tokens, model, decoder, max_length)
    return translated_text


def load_model_MarianMT():
    tokenizer_model_name = "Helsinki-NLP/opus-mt-en-vi"
    model_path = "./MarianMT_ver2"

    tokenizer = MarianTokenizer.from_pretrained(tokenizer_model_name)
    model = MarianMTModel.from_pretrained(model_path).to(device)
    return tokenizer, model


def translate_MarianMT(input_text, model=None, tokenizer=None):
    if model is None or tokenizer is None:
        tokenizer, model = load_model_MarianMT()

    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    outputs = model.generate(**inputs, max_length=64, num_beams=4)
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return translated_text


if __name__ == "__main__":
    input_text = "I go to school"  # renamed from `input` to avoid shadowing the built-in

    translated_text = translate_LSTM(input_text)
    print(translated_text)
ui.py
ADDED
@@ -0,0 +1,97 @@
import streamlit as st
import re
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from load_model import translate_T5, translate_BERT_BARTPho, translate_LSTM, translate_BiLSTM, translate_GRU, translate_MarianMT


# Run with: python -m streamlit run ui.py

MAX_LENGTH = 64

st.title("Machine Translation")
st.markdown('<p style="font-size:24px; font-weight:bold;">English - Vietnamese</p>',
            unsafe_allow_html=True)

# Load the summarization model once and cache it in the session state
if 'summarize_model' not in st.session_state:
    summarize_model_dir = "./Summarization"
    st.session_state.summarize_tokenizer = AutoTokenizer.from_pretrained(summarize_model_dir)
    st.session_state.summarize_model = AutoModelForSeq2SeqLM.from_pretrained(summarize_model_dir)
    print("Summarize model loaded")

model_name = st.selectbox("Select Model", ["BERT_BARTPho", "T5", "BiLSTM", "GRU", "LSTM", "MarianMT"], index=None, placeholder="Select a Model")

input_text = st.text_area(
    "Input Text:",
    placeholder="Enter your text here...",
    height=150,
    key="input_text",
    help=f"If your input text is longer than {MAX_LENGTH} words, it will be summarized before translation",
    value="Today, I go to school"
)

def summarize(input_text):
    # Short inputs are translated as-is; only longer ones are summarized first
    if len(input_text.split()) <= MAX_LENGTH:
        return input_text

    st.write(f"Your input paragraph is more than {MAX_LENGTH} words!")

    summarize_tokenizer = st.session_state.summarize_tokenizer
    summarize_model = st.session_state.summarize_model

    inputs = summarize_tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    outputs = summarize_model.generate(**inputs, max_length=100, num_beams=5, length_penalty=2.0, early_stopping=True)

    summarized_input_text = summarize_tokenizer.decode(outputs[0], skip_special_tokens=True)

    return summarized_input_text

def cut_sentence(input_text):
    sentences = re.split(r'(?<=[.!?]) +', input_text.strip())
    return sentences

# Summarize long inputs before translation, as described in the input help text
input_text = summarize(input_text)
st.write(input_text)

st.markdown(
    """
    <style>
    input[type=text] {
        width: 500%;
    }
    </style>
    """,
    unsafe_allow_html=True
)

if st.button("Translate"):
    translated_text = None
    if model_name == "BERT_BARTPho":
        translated_text = translate_BERT_BARTPho(input_text)
    elif model_name == "T5":
        translated_text = translate_T5(input_text)
    elif model_name == "BiLSTM":
        translated_text = translate_BiLSTM(input_text)
    elif model_name == "GRU":
        translated_text = translate_GRU(input_text)
    elif model_name == "LSTM":
        translated_text = translate_LSTM(input_text)
    elif model_name == "MarianMT":
        translated_text = translate_MarianMT(input_text)

    if translated_text is not None:
        st.write(f"Translation for {model_name}:")
        st.write(translated_text)
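A possible follow-up, not part of this diff: cache each translation model in st.session_state the same way the summarizer is cached above, so checkpoints are not reloaded on every press of the Translate button. A minimal sketch for MarianMT, assuming load_model keeps the interfaces shown earlier:

# Hypothetical caching sketch: reuse loaded components across Streamlit reruns.
if "marianmt_components" not in st.session_state:
    from load_model import load_model_MarianMT
    st.session_state.marianmt_components = load_model_MarianMT()  # (tokenizer, model)

tokenizer, model = st.session_state.marianmt_components
st.write(translate_MarianMT(input_text, model=model, tokenizer=tokenizer))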