# Translator / app.py
# ProfessorLeVesseur's picture
# Update app.py
# 4447788 verified
import streamlit as st
import requests
import os
import PyPDF2
import docx
import time
#------------------------------------------------------------------------
# Configurations
#------------------------------------------------------------------------
# Browser-tab metadata and layout for the page. The menu entries are named
# separately so the set_page_config call itself stays short.
_PAGE_MENU_ITEMS = {
    'Get Help': 'mailto:[email protected]',
    'About': "This app is built to support translation tasks",
}

st.set_page_config(
    page_title="Text Translator",
    page_icon=":speech_balloon:",
    layout="centered",
    initial_sidebar_state="auto",
    menu_items=_PAGE_MENU_ITEMS,
)
#------------------------------------------------------------------------
# Title
#------------------------------------------------------------------------
# Main heading followed by a one-paragraph description of how to use the app.
# The description keeps its leading and trailing newline.
_APP_DESCRIPTION = (
    "\n"
    "Choose a target language, enter your text or upload a document, "
    "and click **Translate** to get the translated text.\n"
)

st.title("Text Translator")
st.write(_APP_DESCRIPTION)
#------------------------------------------------------------------------
# Sidebar
#------------------------------------------------------------------------
with st.sidebar:
    # Password gate (currently disabled):
    # password = st.text_input("Enter Password:", type="password")

    # Logo display (currently disabled); image_width is the intended logo
    # width in pixels and is only used by the commented-out st.image call.
    image_width = 300
    # image_path = "MTSSai_logo.png"
    # st.image(image_path, width=image_width)

    # Sidebar heading
    st.title("MTSS.ai")

    # Collapsible contact details for help requests and bug reports
    with st.expander("Need help and report a bug"):
        st.write(
            "\n"
            "**Contact**: Cheyne LeVesseur, PhD\n"
            "**Email**: [email protected]\n"
        )

    st.divider()
    st.subheader('User Instructions')

    # Step-by-step usage instructions, rendered as a Markdown bullet list
    User_Instructions = (
        "\n"
        "- **Step 1**: Provide either text input or upload a document for translation.\n"
        "- **Step 2**: Click Translate.\n"
        "- **Step 3**: Sit back, relax, and let the magic happen!\n"
    )
    st.markdown(User_Instructions)
#------------------------------------------------------------------------
# Functions
#------------------------------------------------------------------------
# Every supported target language uses a Helsinki-NLP "opus-mt-en-<suffix>"
# model, so only the display name and the model suffix are listed here.
# The order of the pairs is the order of the resulting mapping.
_OPUS_MT_EN_PREFIX = "Helsinki-NLP/opus-mt-en-"
_LANGUAGE_MODEL_SUFFIXES = (
    ("Spanish", "es"),
    ("Arabic", "ar"),
    ("Chinese", "zh"),
    ("Albanian", "sq"),
    ("French", "fr"),
    ("German", "de"),
    ("Japanese", "jap"),
    ("Italian", "it"),
    ("Dutch", "nl"),
    ("Hindi", "hi"),
    ("Russian", "ru"),
    ("Indonesian", "id"),
    ("Greek", "el"),
    ("Danish", "da"),
    ("Swedish", "sv"),
    ("Czech", "cs"),
    ("Catalan", "ca"),
    ("Bulgarian", "bg"),
    ("Estonian", "et"),
    ("Basque", "eu"),
    ("Vietnamese", "vi"),
    ("Finnish", "fi"),
    ("Hebrew", "he"),
    ("Azerbaijani", "az"),
    ("Afrikaans", "af"),
    ("Armenian", "hy"),
    ("Hungarian", "hu"),
)

# Language display name -> Hugging Face model id
language_model_mapping = {
    language_name: _OPUS_MT_EN_PREFIX + suffix
    for language_name, suffix in _LANGUAGE_MODEL_SUFFIXES
}
# Target-language dropdown; the options are the mapping's keys in order.
_language_choices = list(language_model_mapping)
language = st.selectbox("Select target language", _language_choices)

# Radio button choosing between typing text and uploading a document.
_input_methods = ("Text Input", "Upload Document")
input_option = st.radio("Select input method:", _input_methods)

# Text to translate; stays empty until one of the input widgets supplies it.
input_text = ""
# Functions to extract text from files
def extract_text_from_pdf(pdf_file):
    """Return the text of every page in ``pdf_file``, one page per line group.

    Each page's extracted text is followed by a newline; pages for which
    ``extract_text()`` returns nothing are skipped. If reading or extraction
    raises, the error is shown with ``st.error`` and ``""`` is returned.
    """
    try:
        reader = PyPDF2.PdfReader(pdf_file)
        page_texts = (page.extract_text() for page in reader.pages)
        return "".join(page_text + "\n" for page_text in page_texts if page_text)
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
        return ""
def extract_text_from_docx(docx_file):
    """Return the paragraph text of a Word document, one paragraph per line.

    Every paragraph (including empty ones) contributes its text followed by
    a newline. If the document cannot be read, the error is shown with
    ``st.error`` and ``""`` is returned.
    """
    try:
        document = docx.Document(docx_file)
        return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)
    except Exception as e:
        st.error(f"Error extracting text from Word document: {e}")
        return ""
# Collect the text to translate from whichever input method was selected.
# Uploaded files are dispatched by lower-cased extension to the matching
# extractor, each with its own spinner message.
_EXTRACTORS_BY_EXTENSION = {
    ".pdf": ("Extracting text from PDF...", extract_text_from_pdf),
    ".docx": ("Extracting text from Word document...", extract_text_from_docx),
}

if input_option == "Text Input":
    input_text = st.text_area("Enter text to translate", height=200)
elif input_option == "Upload Document":
    uploaded_file = st.file_uploader("Choose a file", type=["pdf", "docx"])
    if uploaded_file is not None:
        _, raw_extension = os.path.splitext(uploaded_file.name)
        file_extension = raw_extension.lower()
        if file_extension in _EXTRACTORS_BY_EXTENSION:
            spinner_message, extractor = _EXTRACTORS_BY_EXTENSION[file_extension]
            with st.spinner(spinner_message):
                input_text = extractor(uploaded_file)
        else:
            st.error("Unsupported file type.")
            input_text = ""
# Function to split text into chunks
def split_text_into_chunks(text, max_chunk_size):
    """Split ``text`` into consecutive chunks of at most ``max_chunk_size`` characters.

    Chunk boundaries are placed at whitespace whenever the current window
    contains any, so words are not cut in half (a word cut in two would be
    translated as two fragments). A run of non-whitespace longer than
    ``max_chunk_size`` is still hard-split at the size limit. No characters
    are dropped or added: ``"".join(result) == text``.

    Args:
        text: The string to split.
        max_chunk_size: Maximum length of each chunk; must be positive.

    Returns:
        A list of non-empty strings, in order; ``[]`` for empty ``text``.

    Raises:
        ValueError: If ``max_chunk_size`` is not positive.
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive integer")

    chunks = []
    total = len(text)
    start = 0
    while start < total:
        end = min(start + max_chunk_size, total)
        # Only look for a better cut when the size limit would land in the
        # middle of a word, i.e. the character just after the cut is not
        # whitespace and there is more text to come.
        if end < total and not text[end].isspace():
            # Move the cut back to just after the last whitespace character
            # in the window; keep the hard cut if the window has none.
            end = next(
                (cut for cut in range(end, start, -1) if text[cut - 1].isspace()),
                end,
            )
        chunks.append(text[start:end])
        start = end
    return chunks
# Function to perform the translation with retry mechanism
def translate_text(text, target_lang, max_retries=5, backoff_factor=2, timeout=30):
    """Translate ``text`` into ``target_lang`` through the Hugging Face Inference API.

    The text is split into chunks, each chunk is posted to the model mapped
    to ``target_lang`` in ``language_model_mapping``, and the translated
    chunks are joined with single spaces. A chunk is retried with
    exponential backoff (``backoff_factor ** attempt`` seconds) when the
    service answers 503 or the request fails.

    Args:
        text: The text to translate.
        target_lang: A key of ``language_model_mapping``.
        max_retries: Maximum number of attempts per chunk.
        backoff_factor: Base of the exponential wait between attempts.
        timeout: Seconds to wait for each HTTP request before treating it
            as a failed attempt.

    Returns:
        The translated text, or ``None`` when the language is unsupported,
        the ``HF_API_KEY`` environment variable is missing, the API reports
        an error, or a chunk still fails after ``max_retries`` attempts.
        Every ``None`` path first reports the problem with ``st.error``.
    """
    model = language_model_mapping.get(target_lang)
    if not model:
        st.error("Unsupported language selected.")
        return None

    # Retrieve Hugging Face API key from environment variables
    hf_api_key = os.getenv('HF_API_KEY')
    if not hf_api_key:
        st.error("Hugging Face API key not set in environment variables.")
        return None

    API_URL = f"https://api-inference.huggingface.co/models/{model}"
    headers = {"Authorization": f"Bearer {hf_api_key}"}

    # Split the text into manageable chunks
    max_chunk_size = 500  # Adjust based on API limitations
    text_chunks = split_text_into_chunks(text, max_chunk_size)

    translated_chunks = []
    for chunk_index, chunk in enumerate(text_chunks):
        last_error = None
        for attempt in range(1, max_retries + 1):
            try:
                # Without a timeout a stalled connection would block the app forever.
                response = requests.post(
                    API_URL,
                    headers=headers,
                    json={"inputs": chunk},
                    timeout=timeout,
                )
                if response.status_code == 503:
                    # Service Unavailable: fall through to the backoff below.
                    last_error = "503 Service Unavailable"
                else:
                    response.raise_for_status()  # Raise an error for bad status codes
                    result = response.json()

                    # Handle possible errors from the API
                    if isinstance(result, dict) and result.get("error"):
                        st.error(f"Error from translation API: {result['error']}")
                        return None

                    # The API might return a list of translations or a single dict
                    if (
                        isinstance(result, list)
                        and len(result) > 0
                        and isinstance(result[0], dict)
                    ):
                        translated_text = result[0].get("translation_text", "No translation found.")
                    elif isinstance(result, dict) and "translation_text" in result:
                        translated_text = result["translation_text"]
                    else:
                        translated_text = "Unexpected response format from the API."

                    translated_chunks.append(translated_text)
                    break  # Chunk done; leave the retry loop
            except (requests.exceptions.RequestException, ValueError) as e:
                # ValueError covers a non-JSON body on requests versions whose
                # JSON decode error is not a RequestException.
                last_error = e

            # Back off before the next attempt, but not after the final one:
            # there is nothing left to wait for.
            if attempt < max_retries:
                time.sleep(backoff_factor ** attempt)
        else:
            # All retry attempts failed for this chunk
            detail = f" Last error: {last_error}" if last_error else ""
            st.error(
                f"Failed to translate chunk {chunk_index + 1} after {max_retries} attempts.{detail}"
            )
            return None

    return " ".join(translated_chunks)
# Translate button: on click, translate the collected input and show either
# the result or a failure message. Blank input only produces a warning.
translate_clicked = st.button("Translate")
if translate_clicked:
    if input_text.strip():
        with st.spinner("Translation service loading..."):
            translated = translate_text(input_text, language)
        if translated:
            st.subheader("Translated Text:")
            st.write(translated)
        else:
            st.error("Translation failed. Please try again later.")
    else:
        st.warning("Please enter some text to translate.")