Spaces:
Sleeping
Sleeping
Update app.py
Browse files+other target languages
app.py
CHANGED
@@ -550,7 +550,7 @@ logger = logging.getLogger(__name__)
|
|
550 |
|
551 |
# Main function to handle the translation workflow
|
552 |
# Main function to handle the translation workflow
|
553 |
-
def main(dataset_url, model_type, output_dataset_name, range_specification, token: gr.OAuthToken | None, profile: gr.OAuthProfile | None):
|
554 |
try:
|
555 |
# Login to Hugging Face
|
556 |
if token is None or profile is None or token.token is None or profile.username is None:
|
@@ -574,6 +574,7 @@ def main(dataset_url, model_type, output_dataset_name, range_specification, toke
|
|
574 |
# Load the tokenizer
|
575 |
tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name)
|
576 |
tokenizer.src_lang = "en"
|
|
|
577 |
logger.info("Tokenizer loaded successfully.")
|
578 |
|
579 |
# Define the task based on user input
|
@@ -581,10 +582,11 @@ def main(dataset_url, model_type, output_dataset_name, range_specification, toke
|
|
581 |
"url": dataset_url,
|
582 |
"local_path": "train.parquet",
|
583 |
"input_file": f"{model_type}_en.jsonl",
|
584 |
-
"output_file": f"{model_type}
|
585 |
-
"raw_file": f"{model_type}
|
586 |
"range_spec": range_specification,
|
587 |
-
"model_type": model_type
|
|
|
588 |
}
|
589 |
|
590 |
# Call the translate_dataset function with the provided parameters
|
@@ -601,6 +603,7 @@ def main(dataset_url, model_type, output_dataset_name, range_specification, toke
|
|
601 |
model_type=task["model_type"],
|
602 |
translator=translator,
|
603 |
tokenizer=tokenizer,
|
|
|
604 |
)
|
605 |
logger.info("Dataset translation completed!")
|
606 |
return "Dataset translation completed!\n\n### Logs:\n" + log_stream.getvalue()
|
@@ -608,15 +611,17 @@ def main(dataset_url, model_type, output_dataset_name, range_specification, toke
|
|
608 |
return "Login failed. Please try again."
|
609 |
except Exception as e:
|
610 |
logger.error(f"An error occurred in the main function: {e}")
|
611 |
-
# Ensure logs are flushed and captured
|
612 |
return f"An error occurred: {e}\n\n### Logs:\n{log_stream.getvalue()}"
|
613 |
|
|
|
614 |
# Gradio interface setup
|
615 |
gradio_title = "π§ WMT21 Dataset Translation"
|
616 |
-
gradio_desc = """This tool translates datasets using the WMT21 translation model.
|
617 |
## π What Does This Tool Do:
|
618 |
-
- Translates datasets based on the selected model type.
|
619 |
-
-
|
|
|
|
|
620 |
datasets_desc = """## π Dataset Types:
|
621 |
- **mix**:
|
622 |
- `prompt`: List of dictionaries with 'content' and 'role' fields (multi-turn conversation).
|
@@ -650,12 +655,14 @@ with gr.Blocks(theme=theme) as demo:
|
|
650 |
model_type = gr.Dropdown(choices=["mix", "ufb_cached", "ufb"], label="Dataset Type")
|
651 |
output_dataset_name = gr.Textbox(label="Output Dataset Name", lines=1, placeholder = "cstr/translated_datasets")
|
652 |
range_specification = gr.Textbox(label="Range Specification", lines=1, placeholder="e.g., 1-100")
|
653 |
-
|
|
|
654 |
with gr.Column():
|
655 |
output = gr.Markdown(label="Output")
|
656 |
|
657 |
submit_btn = gr.Button("Translate Dataset", variant="primary")
|
658 |
-
submit_btn.click(main, inputs=[dataset_url, model_type, output_dataset_name, range_specification], outputs=output)
|
|
|
659 |
|
660 |
gr.Markdown(datasets_desc)
|
661 |
|
|
|
550 |
|
551 |
# Main function to handle the translation workflow
|
552 |
# Main function to handle the translation workflow
|
553 |
+
def main(dataset_url, model_type, output_dataset_name, range_specification, target_language, token: gr.OAuthToken | None, profile: gr.OAuthProfile | None):
|
554 |
try:
|
555 |
# Login to Hugging Face
|
556 |
if token is None or profile is None or token.token is None or profile.username is None:
|
|
|
574 |
# Load the tokenizer
|
575 |
tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name)
|
576 |
tokenizer.src_lang = "en"
|
577 |
+
tokenizer.tgt_lang = target_language # Set target language
|
578 |
logger.info("Tokenizer loaded successfully.")
|
579 |
|
580 |
# Define the task based on user input
|
|
|
582 |
"url": dataset_url,
|
583 |
"local_path": "train.parquet",
|
584 |
"input_file": f"{model_type}_en.jsonl",
|
585 |
+
"output_file": f"{model_type}_{target_language}.jsonl", # Include target language in the filename
|
586 |
+
"raw_file": f"{model_type}_{target_language}_raw.jsonl",
|
587 |
"range_spec": range_specification,
|
588 |
+
"model_type": model_type,
|
589 |
+
"target_language": target_language # Include target language in the task
|
590 |
}
|
591 |
|
592 |
# Call the translate_dataset function with the provided parameters
|
|
|
603 |
model_type=task["model_type"],
|
604 |
translator=translator,
|
605 |
tokenizer=tokenizer,
|
606 |
+
target_language=task["target_language"] # Pass the target language
|
607 |
)
|
608 |
logger.info("Dataset translation completed!")
|
609 |
return "Dataset translation completed!\n\n### Logs:\n" + log_stream.getvalue()
|
|
|
611 |
return "Login failed. Please try again."
|
612 |
except Exception as e:
|
613 |
logger.error(f"An error occurred in the main function: {e}")
|
|
|
614 |
return f"An error occurred: {e}\n\n### Logs:\n{log_stream.getvalue()}"
|
615 |
|
616 |
+
|
617 |
# Gradio interface setup
|
618 |
gradio_title = "π§ WMT21 Dataset Translation"
|
619 |
+
gradio_desc = """This tool translates english datasets using the WMT21 translation model.
|
620 |
## π What Does This Tool Do:
|
621 |
+
- Translates datasets with structures based on the selected model type.
|
622 |
+
- The translation model (facebook/wmt21-dense-24-wide-en-x) supports as target languages: Hausa (ha), Icelandic (is), Japanese (ja), Czech (cs), Russian (ru), Chinese (zh), German (de)
|
623 |
+
- Uploads the translated dataset to Hugging Face.
|
624 |
+
- At the moment, this works only on CPU, and therefore is very very slow (>1 minute per item depending on string lengths)."""
|
625 |
datasets_desc = """## π Dataset Types:
|
626 |
- **mix**:
|
627 |
- `prompt`: List of dictionaries with 'content' and 'role' fields (multi-turn conversation).
|
|
|
655 |
model_type = gr.Dropdown(choices=["mix", "ufb_cached", "ufb"], label="Dataset Type")
|
656 |
output_dataset_name = gr.Textbox(label="Output Dataset Name", lines=1, placeholder = "cstr/translated_datasets")
|
657 |
range_specification = gr.Textbox(label="Range Specification", lines=1, placeholder="e.g., 1-100")
|
658 |
+
target_language = gr.Dropdown(choices=["ha", "is", "ja", "cs", "ru", "zh", "de"], label="Target Language") # New dropdown for target language
|
659 |
+
|
660 |
with gr.Column():
|
661 |
output = gr.Markdown(label="Output")
|
662 |
|
663 |
submit_btn = gr.Button("Translate Dataset", variant="primary")
|
664 |
+
submit_btn.click(main, inputs=[dataset_url, model_type, output_dataset_name, range_specification, target_language], outputs=output)
|
665 |
+
|
666 |
|
667 |
gr.Markdown(datasets_desc)
|
668 |
|