# Hugging Face Spaces status banner captured with this file: "Runtime error"
from huggingface_hub import from_pretrained_keras | |
import numpy as np | |
import gradio as gr | |
import transformers | |
import tensorflow as tf | |
class BertSemanticDataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that turns sentence pairs into batches of BERT inputs.

    Each item is one batch ``[input_ids, attention_masks, token_type_ids]``
    (``int32`` arrays padded/truncated to ``max_length`` tokens), optionally
    paired with the matching labels.

    Args:
        sentence_pairs: NumPy array holding one ``[sentence1, sentence2]``
            row per sample. It is indexed with an index array and converted
            with ``.tolist()``, so a plain Python list will not work.
        labels: Array of labels indexed like ``sentence_pairs``. Only read
            when ``include_targets`` is true, so it may be ``None`` otherwise.
        batch_size: Number of sentence pairs per batch.
        shuffle: Whether to shuffle the sample order at construction time and
            every time ``on_epoch_end`` is called.
        include_targets: Return ``(inputs, labels)`` when true (training /
            validation), or only ``inputs`` when false (inference).
        max_length: Token length every encoded pair is padded or truncated to.
    """

    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size=32,
        shuffle=True,
        include_targets=True,
        max_length=128,
    ):
        # Initialize the Keras base class before adding our own state.
        super().__init__()
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        self.max_length = max_length
        # Load the BERT tokenizer used to encode the text.
        # We use the bert-base-uncased pretrained vocabulary.
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-base-uncased", do_lower_case=True
        )
        self.indexes = np.arange(len(self.sentence_pairs))
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches per epoch.

        Floor division is used, so a trailing partial batch is dropped.
        """
        return len(self.sentence_pairs) // self.batch_size

    def __getitem__(self, idx):
        """Encode and return the batch at position ``idx``."""
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # batch_encode_plus encodes both sentences of each pair together,
        # separated by the [SEP] token.
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=self.max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
            # Explicit spelling of the deprecated ``pad_to_max_length=True``:
            # pad every pair to max_length and truncate anything longer.
            padding="max_length",
            truncation=True,
            return_tensors="tf",
        )

        # Convert the encoded features to NumPy arrays.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        # Targets are only attached when the generator feeds training/validation.
        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], labels
        return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        """Reshuffle the sample order when ``shuffle`` is enabled.

        Without this override the base-class no-op runs and the ``shuffle``
        flag is silently ignored.
        """
        if self.shuffle:
            np.random.shuffle(self.indexes)
# Pretrained Keras model, loaded once at import time by its Hub repo id.
model = from_pretrained_keras("keras-io/bert-semantic-similarity")

# Class names, in the order used to index the model's output vector.
labels = [
    "contradiction",
    "entailment",
    "neutral",
]
def predict(sentence1, sentence2):
    """Score how two sentences relate and return one score per label.

    Both inputs are coerced to ``str`` and sent through the model as a single
    one-pair batch. The result maps every entry of ``labels`` to the float at
    the same position in the model's output row.
    """
    pair_batch = np.array([[str(sentence1), str(sentence2)]])
    generator = BertSemanticDataGenerator(
        pair_batch,
        labels=None,
        batch_size=1,
        shuffle=False,
        include_targets=False,
    )
    # The generator holds exactly one batch; keep the model's first output row.
    first_batch = generator[0]
    scores = model.predict(first_batch)[0]
    return {name: float(scores[pos]) for pos, name in enumerate(labels)}
#idx = np.argmax(proba) | |
#proba = f"{proba[idx]*100:.2f}%" | |
#pred = labels[idx] | |
#return f'The semantic similarity of two input sentences is {pred} with {proba} of probability' | |
# The model compares two free-text sentences, so the UI takes two text boxes.
# (This used to build an unrelated, never-used audio-upload component.)
inputs = ["text", "text"]

# Sentence pairs offered as ready-made examples in the UI.
examples = [
    ["Two women are observing something together.", "Two women are standing with their eyes closed."],
    ["A smiling costumed woman is holding an umbrella", "A happy woman in a fairy costume holds an umbrella"],
    ["A soccer game with multiple males playing", "Some men are playing a sport"],
]

# Build the demo and start serving it immediately.
gr.Interface(
    fn=predict,
    title="Semantic Similarity with BERT",
    # NOTE(review): the trailing "π°" looks like a mis-decoded emoji; confirm
    # the intended character before changing this user-facing string.
    description="Natural Language Inference by fine-tuning BERT model on SNLI Corpus π°",
    inputs=inputs,
    examples=examples,
    # Top-level component API instead of the legacy ``gr.outputs`` namespace.
    outputs=gr.Label(num_top_classes=3, label="Semantic similarity score"),
    cache_examples=False,
    article="Author: <a href=\"https://huggingface.co/vumichien\">Vu Minh Chien</a>. Based on the keras example from <a href=\"https://keras.io/examples/nlp/semantic_similarity_with_bert/\">Mohamad Merchant</a>",
    # NOTE(review): ``enable_queue`` is deprecated in later Gradio releases in
    # favor of ``.queue()``; kept as-is until the pinned Gradio version is confirmed.
).launch(debug=True, enable_queue=True)