File size: 1,996 Bytes
0c07570
 
0e41473
0c07570
 
226cae8
0c07570
 
 
56518e1
0e41473
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
885a800
0e41473
0c07570
 
 
d118cf4
885a800
0e41473
885a800
0c07570
 
 
0e41473
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import json

import gradio as gr
import torch
from transformers import AutoModel, AutoTokenizer

# Checkpoint identifier shared by the tokenizer and the encoder so the two
# always come from the same pretrained model.
model_name = "sentence-transformers/all-mpnet-base-v2"

# Both objects are created once at import time and reused for every request.
# No device is selected here, so PyTorch's default placement applies.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def text_to_vector(texts):
    """Convert a JSON array of sentences into sentence/vector pairs.

    Args:
        texts: Either the raw text of the input box, expected to hold a JSON
            array of strings (a single JSON string is treated as a
            one-element array), or an already-decoded iterable of sentences.
            ``None`` and blank text yield an empty result.

    Returns:
        A list with one dict per sentence, each holding ``"sentence"`` (the
        input item) and ``"vector"`` (a list of floats on success, or an
        ``"Error: ..."`` string when that item could not be processed).
        If the text is not valid JSON, or decodes to something other than an
        array or string, the list holds a single error entry whose
        ``"sentence"`` is the raw input text.
    """
    if texts is None:
        return []

    if isinstance(texts, str):
        # The Textbox delivers one raw string; iterating it directly would
        # walk it character by character, so decode the JSON payload first.
        if not texts.strip():
            return []
        try:
            decoded = json.loads(texts)
        except json.JSONDecodeError as exc:
            return [{
                "sentence": texts,
                "vector": f"Error: Invalid JSON input: {exc}"
            }]

        if isinstance(decoded, str):
            sentences = [decoded]
        elif isinstance(decoded, list):
            sentences = decoded
        else:
            return [{
                "sentence": texts,
                "vector": (
                    "Error: Expected a JSON array of sentences, "
                    f"got {type(decoded).__name__}"
                )
            }]
    else:
        # Already-decoded input (e.g. a list passed programmatically).
        sentences = texts

    results = []

    # Process each sentence individually so one failure does not discard
    # the vectors of the others.
    for sentence in sentences:
        try:
            if not isinstance(sentence, str):
                raise TypeError(
                    f"Expected a string, got {type(sentence).__name__}"
                )

            # Tokenize the sentence
            inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)

            # Check if tokenization results in valid tokens
            if inputs['input_ids'].shape[1] == 0:
                raise ValueError(f"Tokenization failed for sentence: '{sentence}'")

            # Inference only: skip gradient tracking
            with torch.no_grad():
                outputs = model(**inputs)

            # NOTE(review): the sentence-transformers model card describes
            # mean pooling over token embeddings for this checkpoint;
            # pooler_output is kept here to preserve the existing output.
            # Confirm which vector downstream consumers expect.
            if outputs.pooler_output is None:
                raise ValueError(f"No vector generated for sentence: '{sentence}'")

            # One sentence per call, so the batch axis has a single row
            vector = outputs.pooler_output[0].tolist()

            results.append({
                "sentence": sentence,
                "vector": vector
            })
        except Exception as e:
            # Deliberate per-item boundary: report the failure in the result
            # entry instead of failing the whole request.
            results.append({
                "sentence": sentence,
                "vector": f"Error: {str(e)}"
            })

    return results

# Web UI: a single text box feeds text_to_vector and the returned
# sentence/vector pairs are rendered as JSON.
demo = gr.Interface(
    title="Batch Text to Vector",
    description="This demo converts an array of sentences to vectors and returns objects with sentence and vector.",
    fn=text_to_vector,
    inputs=gr.Textbox(
        placeholder="Enter an array of sentences as a JSON string",
        label="Enter JSON array",
    ),
    outputs=gr.JSON(label="Sentence and Vector Pairs"),
)

# Start the Gradio server as soon as this module is executed.
demo.launch()