|
|
|
|
|
|
|
import gradio as gr |
|
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer |
|
|
|
# Checkpoint identifier shared by the model weights and the tokenizer, so the
# two are always loaded from the same pre-trained release.
_checkpoint = "facebook/m2m100_1.2B"

# Both objects are loaded once at import time and reused by every request.
model = M2M100ForConditionalGeneration.from_pretrained(_checkpoint)
tokenizer = M2M100Tokenizer.from_pretrained(_checkpoint)
|
|
|
|
|
# Text passed as the interface description: names the checkpoint in use and
# lists a sample of language names with their codes.
this_description = "\n".join(
    [
        "",
        "Using facebook/m2m100_1.2B pre-trained model.",
        "",
        "Chinese(zh)",
        "English(en)",
        "Hindi(hi)",
        "Japanese(ja)",
        "Sinhalese(si)",
        "Thai(th)",
        "Vietnamese(vi)",
        "...",
        "",
        "",
    ]
)
|
|
|
|
|
# Maps a human-readable language name to the language code handed to the
# tokenizer (as ``src_lang`` and via ``get_lang_id``) in ``m2m_translate``.
#
# Names are looked up verbatim, so every name offered in the UI must appear
# here exactly as spelled there.
lang_codes = {
    "Afrikaans": "af",
    "Amharic": "am",
    "Arabic": "ar",
    "Asturian": "ast",
    "Azerbaijani": "az",
    "Bashkir": "ba",
    "Belarusian": "be",
    "Bulgarian": "bg",
    "Bengali": "bn",
    "Breton": "br",
    "Bosnian": "bs",
    "Catalan; Valencian": "ca",
    "Cebuano": "ceb",
    "Czech": "cs",
    "Welsh": "cy",
    "Danish": "da",
    "German": "de",
    "Greek": "el",  # was misspelled "Greeek"
    "English": "en",
    "Spanish": "es",
    "Estonian": "et",
    "Persian": "fa",
    "Fulah": "ff",
    "Finnish": "fi",
    "French": "fr",
    "Western Frisian": "fy",
    "Irish": "ga",
    "Gaelic; Scottish Gaelic": "gd",
    "Galician": "gl",
    "Gujarati": "gu",
    "Hausa": "ha",
    "Hebrew": "he",
    "Hindi": "hi",
    "Croatian": "hr",
    "Haitian; Haitian Creole": "ht",
    "Hungarian": "hu",
    "Armenian": "hy",
    "Indonesian": "id",
    "Igbo": "ig",
    "Iloko": "ilo",
    "Icelandic": "is",
    "Italian": "it",
    "Japanese": "ja",
    "Javanese": "jv",
    "Georgian": "ka",
    "Kazakh": "kk",
    "Central Khmer": "km",
    "Kannada": "kn",
    "Korean": "ko",
    "Luxembourgish; Letzeburgesch": "lb",
    "Ganda": "lg",
    "Lingala": "ln",
    "Lao": "lo",
    "Lithuanian": "lt",
    "Latvian": "lv",
    "Malagasy": "mg",
    "Macedonian": "mk",
    "Malayalam": "ml",
    "Mongolian": "mn",
    "Marathi": "mr",
    "Malay": "ms",
    "Burmese": "my",
    "Nepali": "ne",
    "Dutch; Flemish": "nl",
    "Norwegian": "no",
    "Northern Sotho": "ns",
    "Occitan": "oc",
    "Oriya": "or",
    "Panjabi; Punjabi": "pa",
    "Polish": "pl",
    "Pushto": "ps",
    "Portuguese": "pt",
    "Romanian; Moldavian; Moldovan": "ro",
    "Russian": "ru",
    "Sindhi": "sd",
    # Short alias: the radio buttons offer plain 'Sinhala', which previously
    # had no matching key and raised KeyError when selected.
    "Sinhala": "si",
    "Sinhala; Sinhalese": "si",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Somali": "so",
    "Albanian": "sq",
    "Serbian": "sr",
    "Swati": "ss",
    "Sundanese": "su",
    "Swedish": "sv",
    "Swahili": "sw",
    "Tamil": "ta",
    "Thai": "th",
    "Tagalog": "tl",
    "Tswana": "tn",
    "Turkish": "tr",
    "Ukrainian": "uk",
    "Urdu": "ur",
    "Uzbek": "uz",
    "Vietnamese": "vi",
    "Wolof": "wo",
    "Xhosa": "xh",
    "Yiddish": "yi",
    "Yoruba": "yo",
    "Chinese": "zh",
    "Zulu": "zu",
}
|
|
|
|
|
def m2m_translate(Input_Text, from_lang, to_lang):
    """Translate ``Input_Text`` from one language to another.

    Args:
        Input_Text: The text to translate.
        from_lang: Source language name; must be a key of ``lang_codes``.
        to_lang: Target language name; must be a key of ``lang_codes``.

    Returns:
        The first decoded sequence produced by the model, or ``""`` when
        ``Input_Text`` is empty or contains only whitespace.

    Raises:
        KeyError: If ``from_lang`` or ``to_lang`` is not a key of
            ``lang_codes``.
    """
    # Nothing to translate: skip tokenisation and generation entirely rather
    # than asking the model to produce output for an empty prompt.
    if not Input_Text or not Input_Text.strip():
        return ""

    # Resolve both codes up front so an unknown language name fails before
    # the shared tokenizer's state is changed.
    source_code = lang_codes[from_lang]
    target_code = lang_codes[to_lang]

    tokenizer.src_lang = source_code
    encoded_input = tokenizer(Input_Text, return_tensors="pt")

    generated_tokens = model.generate(
        **encoded_input,
        max_new_tokens=200,
        # Forces the first generated token to be the target-language id.
        forced_bos_token_id=tokenizer.get_lang_id(target_code),
    )

    decoded = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    return decoded[0]
|
|
|
|
|
# Language names offered by both radio groups. m2m_translate looks each
# selected name up in lang_codes, so every entry needs a matching key there.
_ui_languages = (
    'Burmese',
    'Chinese',
    'English',
    'Hindi',
    'Japanese',
    'Sinhala',
    'Thai',
    'Vietnamese',
)

# Input components, created in the order m2m_translate receives its arguments:
# the text, then the source language, then the target language.
_text_box = gr.Textbox(lines=5, placeholder="Enter text", label="Text input")
_source_radio = gr.Radio(
    label='From language',
    choices=list(_ui_languages),
    value='Vietnamese',
)
_target_radio = gr.Radio(
    label='To language',
    choices=list(_ui_languages),
    value='English',
)

iface = gr.Interface(
    fn=m2m_translate,
    inputs=[_text_box, _source_radio, _target_radio],
    outputs="text",
    title="M2M100 Text Translation",
    description=this_description,
)

# Start serving the interface as soon as the script runs.
iface.launch()
|
|