Daryl Lim committed on
Commit
43b39f0
·
1 Parent(s): 35d4340

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -29
app.py CHANGED
@@ -1,82 +1,124 @@
1
- import spaces
 
 
 
 
 
2
  import gradio as gr
 
3
  import torch
4
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
5
 
6
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
7
 
8
- tokenizer_3b_mt = AutoTokenizer.from_pretrained("google/madlad400-3b-mt", use_fast=True)
9
- language_codes = [token for token in tokenizer_3b_mt.get_vocab().keys() if token.startswith("<2")]
10
- remove_codes = ['<2>', '<2en_xx_simple>', '<2translate>', '<2back_translated>', '<2zxx_xx_dtynoise>', '<2transliterate>']
11
- language_codes = [token for token in language_codes if token not in remove_codes]
12
 
13
- model_choices = [
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  "google/madlad400-3b-mt",
15
  "google/madlad400-7b-mt",
16
  "google/madlad400-10b-mt",
17
  "google/madlad400-7b-mt-bt"
18
  ]
19
 
20
- model_resources = {}
21
 
22
- def load_tokenizer_model(model_name):
23
  """
24
  Load tokenizer and model for a chosen model name.
 
 
 
 
 
 
25
  """
26
- if model_name not in model_resources:
27
- # Load tokenizer and model for first time
28
  tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
29
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.float16)
30
- model.to_bettertransformer()
31
- model.to(device)
32
- model_resources[model_name] = (tokenizer, model)
33
- return model_resources[model_name]
34
 
35
  @spaces.GPU
36
- def translate(text, target_language, model_name):
37
  """
38
  Translate the input text from English to another language.
 
 
 
 
 
 
 
 
39
  """
 
 
 
 
 
 
40
  # Load tokenizer and model if not already loaded
41
  tokenizer, model = load_tokenizer_model(model_name)
42
 
43
- text = target_language + text
44
- input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
45
 
46
  outputs = model.generate(input_ids=input_ids, max_new_tokens=128000)
47
  text_translated = tokenizer.batch_decode(outputs, skip_special_tokens=True)
48
 
49
  return text_translated[0]
50
 
51
- title = "MADLAD-400 Translation"
52
- description = """
53
- Translation from English to over 400 languages based on [research](https://arxiv.org/pdf/2309.04662) by Google DeepMind and Google Research. Initial inference will be slow as models load.
 
54
  """
55
 
 
56
  input_text = gr.Textbox(
57
  label="Text",
58
  placeholder="Enter text here"
59
  )
 
60
  target_language = gr.Dropdown(
61
- choices=language_codes,
62
- value="<2haw>",
63
  label="Target language"
64
  )
 
65
  model_choice = gr.Dropdown(
66
- choices=model_choices,
67
  value="google/madlad400-3b-mt",
68
  label="Model"
69
  )
 
70
  output_text = gr.Textbox(label="Translation")
71
 
 
72
  demo = gr.Interface(
73
  fn=translate,
74
  inputs=[input_text, target_language, model_choice],
75
  outputs=output_text,
76
- title=title,
77
- description=description
78
  )
79
 
80
- demo.queue()
81
-
82
- demo.launch()
 
1
+ """
2
+ This module provides an interface for translation using the MADLAD-400 models.
3
+ The interface allows users to enter English text, select the target language, and choose a model.
4
+ The user will receive the translated text.
5
+ """
6
+
7
  import gradio as gr
8
+ import spaces
9
  import torch
10
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
11
+ from LangMap.langid_mapping import langid_to_language
12
 
13
# Use CUDA when torch reports it as available, otherwise the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer of the 3B checkpoint; its vocabulary is scanned below for language codes.
TOKENIZER_3B_MT = AutoTokenizer.from_pretrained("google/madlad400-3b-mt", use_fast=True)

# Vocabulary tokens that are also keys of langid_to_language (vocabulary order).
LANGUAGE_CODES = [
    token for token in TOKENIZER_3B_MT.get_vocab() if token in langid_to_language
]

# Language code -> human readable name, limited to codes found in the vocabulary.
LANGUAGE_MAP = {
    code: name
    for code, name in langid_to_language.items()
    if code in LANGUAGE_CODES
}

# Reverse lookup: human readable name -> language code.
NAME_TO_CODE_MAP = dict(zip(LANGUAGE_MAP.values(), LANGUAGE_MAP.keys()))

# Names offered in the Gradio "Target language" dropdown.
LANGUAGE_NAMES = [*LANGUAGE_MAP.values()]

# Checkpoints offered in the Gradio "Model" dropdown.
MODEL_CHOICES = [
    "google/madlad400-3b-mt",
    "google/madlad400-7b-mt",
    "google/madlad400-10b-mt",
    "google/madlad400-7b-mt-bt",
]

# Cache of already loaded (tokenizer, model) pairs, keyed by model name.
MODEL_RESOURCES = {}
39
 
40
def load_tokenizer_model(model_name: str):
    """
    Return the tokenizer and model for ``model_name``, loading them on first use.

    Loaded pairs are stored in ``MODEL_RESOURCES`` so that later calls with the
    same name reuse the existing objects instead of loading them again.

    Args:
        model_name (str): The name of the model to load.

    Returns:
        tuple: ``(tokenizer, model)`` for the specified model.
    """
    try:
        return MODEL_RESOURCES[model_name]
    except KeyError:
        pass  # Not cached yet: fall through and load it below.

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.float16)
    model.to(DEVICE)

    resources = (tokenizer, model)
    MODEL_RESOURCES[model_name] = resources
    return resources
 
57
 
58
  @spaces.GPU
59
def translate(text: str, target_language_name: str, model_name: str) -> str:
    """
    Translate the input text from English to another language.

    Args:
        text (str): The input text to be translated.
        target_language_name (str): The human readable target language name;
            it is looked up in ``NAME_TO_CODE_MAP``.
        model_name (str): The model name for translation.

    Returns:
        str: The translated text, or an empty string when ``text`` is empty
        or contains only whitespace.

    Raises:
        ValueError: If ``target_language_name`` has no entry in
            ``NAME_TO_CODE_MAP``.
    """
    # Convert the selected language name back to its corresponding language code
    target_language_code = NAME_TO_CODE_MAP.get(target_language_name)
    if target_language_code is None:
        raise ValueError(f"Unsupported language: {target_language_name}")

    # Nothing to translate: avoid loading a model and running generation on a
    # prompt that would consist of the language code alone.
    if not text or not text.strip():
        return ""

    # Load tokenizer and model if not already loaded
    tokenizer, model = load_tokenizer_model(model_name)

    # The language code is prepended to the text before tokenization
    prompt = target_language_code + text
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(DEVICE)

    outputs = model.generate(input_ids=input_ids, max_new_tokens=128000)
    text_translated = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return text_translated[0]
87
 
88
# Static texts shown at the top of the page
TITLE = "MADLAD-400 Translation"
DESCRIPTION = """
Translation from English to (almost) 400 languages based on [research](https://arxiv.org/pdf/2309.04662)
by Google DeepMind and Google Research.
"""

# Input widgets, in the order translate() receives their values
input_text = gr.Textbox(label="Text", placeholder="Enter text here")

target_language = gr.Dropdown(
    label="Target language",
    choices=LANGUAGE_NAMES,  # human readable names rather than raw codes
    value="Hawaiian",  # preselected entry
)

model_choice = gr.Dropdown(
    label="Model",
    choices=MODEL_CHOICES,
    value="google/madlad400-3b-mt",
)

# Output widget that shows the value returned by translate()
output_text = gr.Textbox(label="Translation")

# Wire the widgets to the translate function
demo = gr.Interface(
    fn=translate,
    inputs=[input_text, target_language, model_choice],
    outputs=output_text,
    title=TITLE,
    description=DESCRIPTION,
)

# Start serving the interface
demo.launch()