import gradio as gr
from transformers import AutoModel, AutoTokenizer
import torch
import json
# Load a small sentence-embedding model that runs on CPU for text-to-vector conversion
model_name = "sentence-transformers/all-mpnet-base-v2"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
def text_to_vector(texts):
    results = []
    # Parse the Textbox input, which is expected to be a JSON array of sentences
    try:
        sentences = json.loads(texts)
    except json.JSONDecodeError as e:
        return {"error": f"Invalid JSON input: {str(e)}"}
    if not isinstance(sentences, list):
        return {"error": "Input must be a JSON array of sentences"}
    # Process each sentence individually so one failure does not abort the whole batch
    for sentence in sentences:
        try:
            # Tokenize the sentence
            inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
            # Check that tokenization produced at least one token
            if inputs["input_ids"].shape[1] == 0:
                raise ValueError(f"Tokenization failed for sentence: '{sentence}'")
            # Run the model without tracking gradients
            with torch.no_grad():
                outputs = model(**inputs)
            # Take the embedding from pooler_output, or report an error
            if outputs.pooler_output is None:
                raise ValueError(f"No vector generated for sentence: '{sentence}'")
            # Convert the vector to a plain list of floats
            vector = outputs.pooler_output.squeeze().numpy().tolist()
            # Append the result as a sentence/vector pair
            results.append({
                "sentence": sentence,
                "vector": vector
            })
        except Exception as e:
            # Record the error for this sentence and keep processing the rest
            results.append({
                "sentence": sentence,
                "vector": f"Error: {str(e)}"
            })
    return results
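# Quick sanity check of the function itself (assumes the model downloaded successfully);
# the float values shown are illustrative, not real output:
#   text_to_vector('["Hello world"]')
#   -> [{"sentence": "Hello world", "vector": [0.0123, -0.0456, ...]}]  # 768 floats per vector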
demo = gr.Interface(
    fn=text_to_vector,
    inputs=gr.Textbox(label="Enter JSON array", placeholder="Enter an array of sentences as a JSON string"),
    outputs=gr.JSON(label="Sentence and Vector Pairs"),
    title="Batch Text to Vector",
    description="This demo converts a JSON array of sentences to vectors and returns one object per sentence with its vector."
)
demo.launch()
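# Example Textbox input once the demo is running (a JSON-encoded array of sentences):
#   ["The weather is nice today.", "Gradio makes building demos simple."]
# Each returned object pairs the original sentence with its 768-dimensional embedding.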