Mageia committed on
Commit
4d31938
·
unverified ·
1 Parent(s): 0e016b0

fix: process pdf once

Browse files
Files changed (1) hide show
  1. app.py +33 -188
app.py CHANGED
@@ -1,209 +1,54 @@
1
  import base64
2
- import multiprocessing
3
  import os
4
- import shutil
5
  import uuid
6
- from functools import partial
7
 
8
- import fitz # PyMuPDF
9
- import gradio as gr
10
- import spaces
11
- from PIL import Image, ImageEnhance
12
- from transformers import AutoModel, AutoTokenizer
13
 
14
# Global model/tokenizer placeholders.
# NOTE(review): these are never reassigned anywhere in this file —
# initialize_model() returns locals and worker_process keeps its own copies.
# Presumably leftovers from an earlier single-process design; confirm before removing.
model = None
tokenizer = None
17
 
 
18
 
19
def initialize_model():
    """Load the GOT-OCR2 checkpoint and its tokenizer, ready for inference.

    Returns:
        tuple: (model in eval mode with device_map="auto", tokenizer).
    """
    checkpoint = "ucaslcl/GOT-OCR2_0"
    tok = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    ocr_model = AutoModel.from_pretrained(
        checkpoint,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        device_map="auto",
    ).eval()
    return ocr_model, tok
25
 
 
 
 
 
 
26
 
27
# Working folders: uploaded PDFs land in UPLOAD_FOLDER, rendered page
# images are written to RESULTS_FOLDER.
UPLOAD_FOLDER = "./uploads"
RESULTS_FOLDER = "./results"

# Make sure the required folders exist before any request is served.
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(RESULTS_FOLDER, exist_ok=True)
33
 
34
 
35
def pdf_to_images(pdf_path, zoom=10, contrast=1.5):
    """Render each page of a PDF into a high-resolution, contrast-enhanced image.

    Args:
        pdf_path: Path to the PDF file on disk.
        zoom: Render scale factor for both axes (default 10, matching the
            previous hard-coded value; large values use a lot of memory).
        contrast: Factor passed to ImageEnhance.Contrast (default 1.5 = +50%).

    Returns:
        list[Image.Image]: One RGB PIL image per page, in page order.
    """
    images = []
    pdf_document = fitz.open(pdf_path)
    try:
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            mat = fitz.Matrix(zoom, zoom)
            pix = page.get_pixmap(matrix=mat, alpha=False)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            # Boost contrast to help OCR on faint scans.
            enhancer = ImageEnhance.Contrast(img)
            img = enhancer.enhance(contrast)

            images.append(img)
    finally:
        # BUG FIX: close the document even if a page fails to render
        # (previously the handle leaked on any exception above).
        pdf_document.close()
    return images
53
-
54
-
55
def process_pdf(pdf_file):
    """Convert an uploaded PDF into per-page PNG files for the gallery preview.

    Args:
        pdf_file: Gradio file object (its ``.name`` holds the temp path),
            or None when nothing was uploaded.

    Returns:
        list[str] | None: Paths of the rendered page PNGs, or None for no input.
    """
    if pdf_file is None:
        return None

    # Copy the upload to a uniquely named temporary location.
    temp_pdf_path = os.path.join(UPLOAD_FOLDER, f"{uuid.uuid4()}.pdf")
    shutil.copy(pdf_file.name, temp_pdf_path)
    try:
        images = pdf_to_images(temp_pdf_path)
    finally:
        # BUG FIX: remove the temp copy even when rendering fails
        # (previously it leaked on any pdf_to_images exception).
        os.remove(temp_pdf_path)

    # BUG FIX: prefix page files with a unique batch id so successive or
    # concurrent uploads no longer overwrite each other's page_N.png files.
    batch_id = uuid.uuid4().hex
    image_paths = []
    for i, img in enumerate(images):
        img_path = os.path.join(RESULTS_FOLDER, f"{batch_id}_page_{i + 1}.png")
        img.save(img_path, "PNG")
        image_paths.append(img_path)

    return image_paths
 
 
 
75
 
 
76
 
77
@spaces.GPU()
def got_ocr(model, tokenizer, image_path, got_mode="format texts OCR", fine_grained_mode="", ocr_color="", ocr_box=""):
    """Run GOT-OCR on a single image in the requested mode.

    Args:
        model: GOT-OCR2 model (moved to GPU for the call, back to CPU after).
        tokenizer: Matching tokenizer.
        image_path: Path to the image file to OCR.
        got_mode: One of the six "plain/format ..." mode strings.
        fine_grained_mode: Kept for interface compatibility; not used here.
        ocr_color: Color hint for fine-grained modes ("" to disable).
        ocr_box: Box hint "[x1,y1,x2,y2]" for fine-grained modes ("" to disable).

    Returns:
        tuple: (text result, base64-encoded rendered HTML or None).
        On any exception returns ("错误: <message>", None).
    """
    # Move the model onto the GPU only for the duration of this call.
    model = model.cuda()
    try:
        result_path = None
        if got_mode == "plain texts OCR":
            res = model.chat(tokenizer, image_path, ocr_type="ocr")
            return res, None
        elif got_mode == "plain multi-crop OCR":
            res = model.chat_crop(tokenizer, image_path, ocr_type="ocr")
            return res, None
        elif got_mode == "plain fine-grained OCR":
            res = model.chat(tokenizer, image_path, ocr_type="ocr", ocr_box=ocr_box, ocr_color=ocr_color)
            return res, None
        elif got_mode == "format texts OCR":
            result_path = f"{os.path.splitext(image_path)[0]}_result.html"
            res = model.chat(tokenizer, image_path, ocr_type="format", render=True, save_render_file=result_path)
        elif got_mode == "format multi-crop OCR":
            result_path = f"{os.path.splitext(image_path)[0]}_result.html"
            res = model.chat_crop(tokenizer, image_path, ocr_type="format", render=True, save_render_file=result_path)
        elif got_mode == "format fine-grained OCR":
            result_path = f"{os.path.splitext(image_path)[0]}_result.html"
            res = model.chat(tokenizer, image_path, ocr_type="format", ocr_box=ocr_box, ocr_color=ocr_color, render=True, save_render_file=result_path)
        else:
            # BUG FIX: an unrecognized mode previously raised UnboundLocalError
            # on `res`; fail with a clear message instead (still caught below).
            raise ValueError(f"unknown got_mode: {got_mode!r}")

        # Format modes: read back the rendered HTML, base64-encode it.
        if result_path is not None and os.path.exists(result_path):
            with open(result_path, "r") as f:
                html_content = f.read()
            encoded_html = base64.b64encode(html_content.encode("utf-8")).decode("utf-8")
            return res, encoded_html
        else:
            return res, None

    except Exception as e:
        return f"错误: {str(e)}", None
    finally:
        # Release GPU memory between calls by parking the model on the CPU.
        model = model.cpu()
116
-
117
-
118
def worker_process(task_queue, result_queue):
    """OCR worker loop: owns its own model copy and serves queued tasks.

    Pulls (image_path, got_mode, fine_grained_mode, ocr_color, ocr_box)
    tuples from ``task_queue`` until a ``None`` sentinel arrives, pushing
    each OCR text result onto ``result_queue``.
    """
    model, tokenizer = initialize_model()
    while (task := task_queue.get()) is not None:
        image_path, got_mode, fine_grained_mode, ocr_color, ocr_box = task
        text, _ = got_ocr(model, tokenizer, image_path, got_mode, fine_grained_mode, ocr_color, ocr_box)
        result_queue.put(text)
127
-
128
-
129
def perform_ocr(image_gallery, got_mode, fine_grained_type, color, box):
    """OCR every gallery page through a dedicated worker process.

    Returns a Markdown preview (base64 download link + first 1000 chars)
    and the full concatenated result for storage in ``gr.State``.
    """
    tasks = multiprocessing.Queue()
    answers = multiprocessing.Queue()

    worker = multiprocessing.Process(target=worker_process, args=(tasks, answers))
    worker.start()

    # Only the selected fine-grained hint is forwarded; the other stays "".
    ocr_color = color if fine_grained_type == "color" else ""
    ocr_box = box if fine_grained_type == "box" else ""

    pages = []
    progress = gr.Progress()
    for idx, image_info in enumerate(progress.tqdm(image_gallery)):
        tasks.put((image_info[0], got_mode, fine_grained_type, ocr_color, ocr_box))
        # One result per task, in order: block until the worker replies.
        pages.append(f"第 {idx+1} 页结果:\n{answers.get()}\n\n")

    tasks.put(None)  # stop sentinel for the worker
    worker.join()

    combined_result = "".join(pages)
    encoded_result = base64.b64encode(combined_result.encode("utf-8")).decode("utf-8")
    download_link = f'<a href="data:text/plain;base64,{encoded_result}" download="ocr_result.txt">下载完整OCR结果</a>'

    return gr.Markdown(f"{download_link}\n\n{combined_result[:1000]}..."), combined_result
156
-
157
-
158
def task_update(task):
    """Show the fine-grained dropdown only for fine-grained modes.

    The color and box widgets always start hidden; they are revealed later
    by ``fine_grained_update`` once a sub-type is chosen.
    """
    show_fine_grained = "fine-grained" in task
    return [
        gr.update(visible=show_fine_grained),
        gr.update(visible=False),
        gr.update(visible=False),
    ]
163
-
164
-
165
def fine_grained_update(fine_grained_type):
    """Toggle the color dropdown / box textbox to match the chosen sub-type.

    "color" shows only the color dropdown, "box" shows only the box textbox,
    anything else hides both.
    """
    return [
        gr.update(visible=fine_grained_type == "color"),
        gr.update(visible=fine_grained_type == "box"),
    ]
172
-
173
-
174
# --- Gradio UI wiring -------------------------------------------------------
with gr.Blocks() as demo:
    # PDF upload; on upload, pages are rendered to images and previewed.
    pdf_input = gr.File(label="上传PDF文件")
    image_gallery = gr.Gallery(
        label="PDF页面预览",
        columns=3,
        height=600,
        object_fit="contain",
        preview=True,
    )
    pdf_input.upload(fn=process_pdf, inputs=pdf_input, outputs=image_gallery)

    # OCR mode selection; fine-grained widgets stay hidden until relevant.
    task_dropdown = gr.Dropdown(
        choices=["plain texts OCR", "format texts OCR", "plain multi-crop OCR", "format multi-crop OCR", "plain fine-grained OCR", "format fine-grained OCR"],
        label="选择GOT模式",
        value="format texts OCR",
    )
    fine_grained_dropdown = gr.Dropdown(choices=["box", "color"], label="fine-grained类型", visible=False)
    color_dropdown = gr.Dropdown(choices=["red", "green", "blue"], label="颜色列表", visible=False)
    box_input = gr.Textbox(label="输入框: [x1,y1,x2,y2]", placeholder="例如: [0,0,100,100]", visible=False)

    ocr_button = gr.Button("开始OCR识别")
    ocr_result = gr.Markdown(label="OCR结果预览")
    # Holds the complete OCR text across interactions (preview is truncated).
    full_result = gr.State()

    # Visibility plumbing for the fine-grained option widgets.
    task_dropdown.change(task_update, inputs=[task_dropdown], outputs=[fine_grained_dropdown, color_dropdown, box_input])
    fine_grained_dropdown.change(fine_grained_update, inputs=[fine_grained_dropdown], outputs=[color_dropdown, box_input])

    ocr_button.click(
        fn=perform_ocr,
        inputs=[image_gallery, task_dropdown, fine_grained_dropdown, color_dropdown, box_input],
        outputs=[ocr_result, full_result],
    )

if __name__ == "__main__":
    # "spawn" start method — presumably so CUDA initializes safely in the
    # worker process (fork after CUDA init is unsafe); confirm if changed.
    multiprocessing.set_start_method("spawn")
    demo.launch()
 
 
1
import base64
import os
import uuid

import torch
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
from transformers import AutoConfig, AutoModel, AutoTokenizer

from got_ocr import got_ocr

app = FastAPI()

# Model / tokenizer initialization (runs once at import time).
model_name = "ucaslcl/GOT-OCR2_0"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
# BUG FIX: device_map was hard-coded to "cuda", which crashes on CPU-only
# hosts even though `device` above already falls back to "cpu".
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, low_cpu_mem_usage=True, device_map=device, use_safetensors=True)
model = model.eval().to(device)
# Reuse EOS as the pad token — presumably the checkpoint ships without one;
# confirm against the model card if this ever changes.
model.config.pad_token_id = tokenizer.eos_token_id

UPLOAD_FOLDER = "./uploads"

# Ensure the upload folder exists before the first request.
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 
28
 
29
 
30
+ @app.post("/ocr")
31
+ async def perform_ocr(image: UploadFile = File(...)):
32
+ # 保存上传的图片
33
+ image_path = os.path.join(UPLOAD_FOLDER, f"{uuid.uuid4()}.png")
34
+ with open(image_path, "wb") as buffer:
35
+ buffer.write(await image.read())
 
 
 
 
36
 
37
+ # 执行OCR
38
+ result, html_content = got_ocr(model, tokenizer, image_path, got_mode="format texts OCR")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
+ # 删除临时文件
41
+ os.remove(image_path)
 
 
 
 
42
 
43
+ # 准备响应
44
+ response = {"result": result}
45
+ if html_content:
46
+ response["html_content"] = base64.b64encode(html_content.encode("utf-8")).decode("utf-8")
47
 
48
+ return JSONResponse(content=response)
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
  if __name__ == "__main__":
52
+ import uvicorn
53
+
54
+ uvicorn.run(app, host="0.0.0.0", port=8000)