# spanish-gpt2 / app.py
# Repository page header captured with this file:
#   mariagrandury -- "Update application" -- commit 08d4272
import json
import random
import requests
from mtranslate import translate
import streamlit as st
# Logo shown as the page icon and at the top of the sidebar.
LOGO = "https://raw.githubusercontent.com/nlp-en-es/assets/main/logo.png"

# Display name of each model -> hosted inference endpoint that serves it.
MODELS = {
    "Model trained on OSCAR": {
        "url": "https://api-inference.huggingface.co/models/flax-community/gpt-2-spanish"
    },
    "Model trained on the Large Spanish Corpus": {
        "url": "https://api-inference.huggingface.co/models/mrm8488/spanish-gpt2"
    },
}

# Prompt label -> list of candidate starting texts for that label.
PROMPT_LIST = {
    "Érase una vez...": ["Érase una vez "],
    "¡Hola!": ["¡Hola! Me llamo "],
    "¿Ser o no ser?": ["En mi opinión, 'ser' es "],
}
def query(payload, model_name, timeout=60):
    """POST ``payload`` to the inference endpoint registered for ``model_name``.

    Args:
        payload: JSON-serialisable request body.
        model_name: Key of ``MODELS`` whose ``"url"`` entry is called.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the app indefinitely.

    Returns:
        The decoded JSON body of the response. If the request itself fails,
        or the body is not valid UTF-8 JSON, a dict of the form
        ``{"error": "<message>"}`` is returned instead of raising, so the
        failure can be displayed like any other API error.

    Raises:
        KeyError: If ``model_name`` is not a key of ``MODELS``.
    """
    data = json.dumps(payload)
    url = MODELS[model_name]["url"]
    print("model url:", url)
    try:
        response = requests.post(url, headers={}, data=data, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # Covers connection errors and timeouts alike.
        return {"error": f"Request to the inference API failed: {exc}"}
    try:
        return json.loads(response.content.decode("utf-8"))
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError both derive from
        # ValueError (e.g. an HTML error page from a gateway).
        return {
            "error": (
                "The inference API returned a response that is not valid "
                f"JSON (HTTP {response.status_code})"
            )
        }
def process(
    text: str,
    model_name: str,
    max_len: int,
    temp: float,
    top_k: int,
    top_p: float,
    do_sample: bool = True,
):
    """Build a text-generation request for ``text`` and send it via ``query``.

    Args:
        text: Prompt to be continued.
        model_name: Key of ``MODELS`` selecting which model to call.
        max_len: Sent as ``max_new_tokens``.
        temp: Sent as ``temperature``.
        top_k: Sent as ``top_k``.
        top_p: Sent as ``top_p``.
        do_sample: Sent as ``do_sample``. Defaults to ``True`` so callers
            that omit it keep sampling enabled (presumably the hosted API's
            own default -- confirm against the API documentation).

    Returns:
        Whatever ``query`` returns for the request.
    """
    payload = {
        "inputs": text,
        "parameters": {
            "max_new_tokens": max_len,
            "top_k": top_k,
            "top_p": top_p,
            "temperature": temp,
            "repetition_penalty": 2.0,
            "do_sample": do_sample,
        },
        "options": {
            "use_cache": True,
        },
    }
    return query(payload, model_name)


# Page
st.set_page_config(page_title="Spanish GPT-2 Demo", page_icon=LOGO)
st.title("Spanish GPT-2")

# Sidebar
st.sidebar.image(LOGO)
st.sidebar.subheader("Configurable parameters")
max_len = st.sidebar.number_input(
    "Maximum length",
    value=100,
    help="The maximum length of the sequence to be generated.",
)
temp = st.sidebar.slider(
    "Temperature",
    value=1.0,
    min_value=0.1,
    max_value=100.0,
    help="The value used to module the next token probabilities.",
)
top_k = st.sidebar.number_input(
    "Top k",
    value=10,
    help="The number of highest probability vocabulary tokens to keep for top-k-filtering.",
)
top_p = st.sidebar.number_input(
    "Top p",
    value=0.95,
    help=" If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.",
)
do_sample = st.sidebar.selectbox(
    "Sampling?",
    (True, False),
    help="Whether or not to use sampling; use greedy decoding otherwise.",
)

# Body
st.markdown(
    """
Spanish GPT-2 models trained from scratch on two different datasets. One
model is trained on the Spanish portion of
[OSCAR](https://huggingface.co/datasets/viewer/?dataset=oscar)
and the other on the
[large_spanish_corpus](https://huggingface.co/datasets/viewer/?dataset=large_spanish_corpus)
aka BETO's corpus.
The models are trained with Flax and using TPUs sponsored by Google since this is part of the
[Flax/Jax Community Week](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104)
organised by HuggingFace.
"""
)

model_name = st.selectbox("Model", (list(MODELS.keys())))

# "Custom" is appended last and pre-selected through ``index``.
ALL_PROMPTS = list(PROMPT_LIST.keys()) + ["Custom"]
prompt = st.selectbox("Prompt", ALL_PROMPTS, index=len(ALL_PROMPTS) - 1)
if prompt == "Custom":
    prompt_box = "Enter your text here"
else:
    prompt_box = random.choice(PROMPT_LIST[prompt])
text = st.text_area("Enter text", prompt_box)

if st.button("Run"):
    with st.spinner(text="Getting results..."):
        st.subheader("Result")
        print(f"maxlen:{max_len}, temp:{temp}, top_k:{top_k}, top_p:{top_p}")
        result = process(
            text=text,
            model_name=model_name,
            max_len=int(max_len),
            temp=temp,
            top_k=int(top_k),
            top_p=float(top_p),
            # Forward the "Sampling?" choice; it used to be collected but
            # never sent with the request.
            do_sample=bool(do_sample),
        )
        print("result:", result)
        if "error" in result:
            error = result["error"]
            if isinstance(error, str):
                # ``st.write`` has no ``end`` option (that belongs to
                # ``print``), so the message is written on its own.
                st.write(f"{error}.")
                if "estimated_time" in result:
                    st.write(
                        f'Please try again in about {result["estimated_time"]:.0f} seconds.'
                    )
            elif isinstance(error, list):
                for item in error:
                    st.write(f"{item}")
            else:
                # Any other error payload is shown as-is rather than dropped.
                st.write(f"{error}")
        elif (
            isinstance(result, list)
            and result
            and isinstance(result[0], dict)
            and "generated_text" in result[0]
        ):
            result = result[0]["generated_text"]
            # Two trailing spaces before a newline form a Markdown hard line
            # break, which keeps the line structure of the generated text.
            st.write(result.replace("\n", "  \n"))
            st.text("English translation")
            st.write(translate(result, "en", "es").replace("\n", "  \n"))
        else:
            # The response has neither an "error" key nor generated text.
            st.write("Unexpected response from the inference API.")
# Footer: team credits and links to the model cards.
# Model cards live at https://huggingface.co/<owner>/<model>; the OSCAR link
# previously carried an extra "/models" path segment.
st.markdown(
    """
### Team members
- Manuel Romero ([mrm8488](https://huggingface.co/mrm8488))
- María Grandury ([mariagrandury](https://huggingface.co/mariagrandury))
- Pablo González de Prado ([Pablogps](https://huggingface.co/Pablogps))
- Daniel Vera ([daveni](https://huggingface.co/daveni))
- Sri Lakshmi ([srisweet](https://huggingface.co/srisweet))
- José Posada ([jdposa](https://huggingface.co/jdposa))
- Santiago Hincapie ([shpotes](https://huggingface.co/shpotes))
- Jorge ([jorgealro](https://huggingface.co/jorgealro))
### More information
You can find more information about these models in their cards:
- [Model trained on OSCAR](https://huggingface.co/flax-community/gpt-2-spanish)
- [Model trained on the Large Spanish Corpus](https://huggingface.co/mrm8488/spanish-gpt2)
"""
)