0xalfroz committed on
Commit
0e41473
·
verified ·
1 Parent(s): 885a800

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -13
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import gradio as gr
2
  from transformers import AutoModel, AutoTokenizer
3
- import numpy as np
4
 
5
  # Load a small CPU model for text to vector processing
6
  model_name = "sentence-transformers/all-mpnet-base-v2"
@@ -8,25 +8,50 @@ model = AutoModel.from_pretrained(model_name)
8
  tokenizer = AutoTokenizer.from_pretrained(model_name)
9
 
10
  def text_to_vector(texts):
11
- # Tokenize the input array of sentences
12
- inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
13
- outputs = model(**inputs)
14
- vectors = outputs.pooler_output.detach().numpy()
15
-
16
- # Convert each vector to a string representation and create an object
17
- result = [
18
- {"sentence": sentence, "vector": ", ".join(map(str, vector))}
19
- for sentence, vector in zip(texts, vectors)
20
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
- return result
23
 
24
  demo = gr.Interface(
25
  fn=text_to_vector,
26
  inputs=gr.Textbox(label="Enter JSON array", placeholder="Enter an array of sentences as a JSON string"),
27
  outputs=gr.JSON(label="Sentence and Vector Pairs"),
28
- title="Batch Text to Vector 769 dim",
29
  description="This demo converts an array of sentences to vectors and returns objects with sentence and vector."
30
  )
31
 
32
  demo.launch()
 
 
1
  import gradio as gr
2
  from transformers import AutoModel, AutoTokenizer
3
+ import torch
4
 
5
  # Load a small CPU model for text to vector processing
6
  model_name = "sentence-transformers/all-mpnet-base-v2"
 
8
  tokenizer = AutoTokenizer.from_pretrained(model_name)
9
 
def _parse_sentences(texts):
    """Normalize the raw input into a list of sentences.

    A text box hands over one string, so a JSON array typed by the user has
    to be decoded before iterating; looping over the raw string would yield
    one "sentence" per character.
    """
    # Kept local so this block does not rely on a module-level json import.
    import json

    if texts is None:
        return []
    if not isinstance(texts, str):
        # Already a collection of sentences (e.g. a direct programmatic call).
        return list(texts)

    stripped = texts.strip()
    if not stripped:
        return []
    try:
        parsed = json.loads(stripped)
    except json.JSONDecodeError:
        # Not JSON at all: treat the whole input as a single sentence.
        return [stripped]
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, str):
        return [parsed]
    # Valid JSON that is neither an array nor a string (number, object, ...):
    # fall back to the text as typed.
    return [stripped]


def text_to_vector(texts):
    """Convert sentences to vectors and return sentence/vector pairs.

    Args:
        texts: A JSON string encoding an array of sentences (the form a text
            box delivers), a plain string (treated as one sentence), or an
            already-decoded iterable of sentences.

    Returns:
        A list with one dict per sentence, each holding the keys
        ``"sentence"`` and ``"vector"``. ``"vector"`` is a list of floats on
        success, or a string starting with ``"Error: "`` when processing that
        sentence failed. Empty input yields an empty list.
    """
    results = []

    # Sentences are processed one at a time so that a failure on one entry
    # is reported for that entry only instead of aborting the whole batch.
    for sentence in _parse_sentences(texts):
        try:
            inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)

            # Guard against a tokenization that produced no tokens at all.
            if inputs["input_ids"].shape[1] == 0:
                raise ValueError(f"Tokenization failed for sentence: '{sentence}'")

            # Inference only: no gradients needed.
            with torch.no_grad():
                outputs = model(**inputs)

            if outputs.pooler_output is None:
                raise ValueError(f"No vector generated for sentence: '{sentence}'")

            # NOTE(review): the sentence-transformers model card appears to
            # recommend mean pooling over token embeddings for this
            # checkpoint; pooler_output is kept so emitted vectors do not
            # change -- confirm which pooling is intended.
            vector = outputs.pooler_output.squeeze().numpy().tolist()
        except Exception as e:
            # Deliberate best-effort: record the error against this sentence
            # and continue with the remaining ones.
            results.append({
                "sentence": sentence,
                "vector": f"Error: {str(e)}",
            })
        else:
            results.append({
                "sentence": sentence,
                "vector": vector,
            })

    return results
47
 
# Gradio front end: a single text box in, a JSON view out, wired to
# text_to_vector.
_sentence_box = gr.Textbox(
    label="Enter JSON array",
    placeholder="Enter an array of sentences as a JSON string",
)
_pairs_view = gr.JSON(label="Sentence and Vector Pairs")

demo = gr.Interface(
    fn=text_to_vector,
    inputs=_sentence_box,
    outputs=_pairs_view,
    title="Batch Text to Vector",
    description="This demo converts an array of sentences to vectors and returns objects with sentence and vector.",
)

# Start serving the interface.
demo.launch()
57
+