Hammedalmodel committed on
Commit
9f8dfc0
·
verified ·
1 Parent(s): 3387487

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -49
app.py CHANGED
@@ -1,8 +1,12 @@
 
 
1
  from transformers import MllamaForConditionalGeneration, AutoProcessor
2
  from PIL import Image
3
  import torch
4
- import gradio as gr
5
- import spaces
 
 
6
 
7
  # Initialize model and processor
8
  ckpt = "unsloth/Llama-3.2-11B-Vision-Instruct"
@@ -12,52 +16,50 @@ model = MllamaForConditionalGeneration.from_pretrained(
12
  ).to("cuda")
13
  processor = AutoProcessor.from_pretrained(ckpt)
14
 
15
- @spaces.GPU
16
- def extract_text(image):
17
- # Convert image to RGB
18
- image = Image.open(image).convert("RGB")
19
-
20
- # Create message structure
21
- messages = [
22
- {
23
- "role": "user",
24
- "content": [
25
- {"type": "text", "text": "Extract handwritten text from the image and output only the extracted text without any additional description or commentary in output"},
26
- {"type": "image"}
27
- ]
28
- }
29
- ]
30
-
31
- # Process input
32
- texts = processor.apply_chat_template(messages, add_generation_prompt=True)
33
- inputs = processor(text=texts, images=[image], return_tensors="pt").to("cuda")
34
-
35
-
36
- # Generate output
37
- outputs = model.generate(**inputs, max_new_tokens=250)
38
- result = processor.decode(outputs[0], skip_special_tokens=True)
39
-
40
- print(result)
41
-
42
- # Clean up the output to remove the prompt and assistant text
43
- if "assistant" in result.lower():
44
- result = result[result.lower().find("assistant") + len("assistant"):].strip()
45
-
46
- # Remove any remaining conversation markers
47
- result = result.replace("user", "").replace("Extract handwritten text from the image and output only the extracted text without any additional description or commentary in output", "").strip()
48
-
49
- print(result)
50
-
51
- return result
52
 
53
- # Create Gradio interface
54
- demo = gr.Interface(
55
- fn=extract_text,
56
- inputs=gr.Image(type="filepath", label="Upload Image"),
57
- outputs=gr.Textbox(label="Extracted Text"),
58
- title="Handwritten Text Extractor",
59
- description="Upload an image containing handwritten text to extract its content.",
60
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
- # Launch the app
63
- demo.launch(debug=True)
 
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel
3
  from transformers import MllamaForConditionalGeneration, AutoProcessor
4
  from PIL import Image
5
  import torch
6
+ import requests
7
+ from io import BytesIO
8
+
9
+ app = FastAPI()
10
 
11
  # Initialize model and processor
12
  ckpt = "unsloth/Llama-3.2-11B-Vision-Instruct"
 
16
  ).to("cuda")
17
  processor = AutoProcessor.from_pretrained(ckpt)
18
 
19
class ImageRequest(BaseModel):
    """JSON request body accepted by the ``/extract_text`` endpoint."""

    # Despite the name, the handler passes this value to ``requests.get``,
    # so it is expected to be a URL rather than a local filesystem path.
    image_path: str
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
# Upper bound (seconds) for downloading the input image so a slow or
# unresponsive host cannot stall a request indefinitely.
_IMAGE_FETCH_TIMEOUT = 30


def _download_image(url):
    """Fetch *url* and return its content as an RGB PIL image.

    Raises:
        HTTPException: status 400 when the URL cannot be fetched, answers
            with a non-200 status, or does not contain a readable image.
    """
    # NOTE(security): the URL is caller-supplied, so this makes the server
    # issue requests to arbitrary hosts (SSRF). Restrict the allowed hosts
    # before exposing this endpoint to untrusted clients.
    try:
        response = requests.get(url, timeout=_IMAGE_FETCH_TIMEOUT)
    except requests.RequestException as exc:
        raise HTTPException(
            status_code=400, detail="Failed to fetch image from URL"
        ) from exc

    if response.status_code != 200:
        raise HTTPException(status_code=400, detail="Failed to fetch image from URL")

    try:
        return Image.open(BytesIO(response.content)).convert("RGB")
    except OSError as exc:
        # PIL signals unreadable/unsupported image data with OSError subclasses.
        raise HTTPException(
            status_code=400, detail="URL did not return a readable image"
        ) from exc


@app.post("/extract_text")
async def extract_text(request: ImageRequest):
    """Extract handwritten text from the image referenced by the request.

    Args:
        request: Body whose ``image_path`` field holds the URL of the image.

    Returns:
        A dict ``{"text": "\\n<extracted text>\\n"}``.

    Raises:
        HTTPException: status 400 when the image cannot be downloaded or
            decoded; status 500 for any other failure during processing.
    """
    # NOTE(review): the download and the generation below are blocking calls
    # inside an async handler, so other requests wait while one is running.
    try:
        image = _download_image(request.image_path)

        # Create message structure
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Extract handwritten text from the image and output only the extracted text without any additional description or commentary in output"},
                    {"type": "image"},
                ],
            }
        ]

        # Process input
        texts = processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = processor(text=texts, images=[image], return_tensors="pt").to("cuda")

        # Generate output
        outputs = model.generate(**inputs, max_new_tokens=250)

        # Decode only the newly generated tokens. Decoding the full sequence
        # and then stripping "user"/"assistant" substrings would also delete
        # those words from the extracted handwriting itself.
        prompt_length = inputs["input_ids"].shape[-1]
        result = processor.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()

        return {"text": f"\n{result}\n"}

    except HTTPException:
        # Keep deliberate client errors (e.g. the 400s above) intact instead
        # of letting the generic handler below turn them into 500s.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
62
 
63
if __name__ == "__main__":
    # uvicorn is only needed when this module is executed directly, so it is
    # imported here rather than at the top of the file.
    import uvicorn

    # Listen on all interfaces, port 7860.
    uvicorn.run(app, port=7860, host="0.0.0.0")