File size: 5,601 Bytes
4dcdb35
18def71
4dcdb35
 
8ec1357
4dcdb35
18def71
 
 
 
 
 
 
 
 
4dcdb35
 
 
 
18def71
 
 
 
4dcdb35
8ec1357
 
 
 
 
18def71
 
 
 
8ec1357
18def71
1612798
18def71
 
 
8ec1357
 
 
4dcdb35
 
8ec1357
18def71
 
 
8ec1357
18def71
 
 
 
8ec1357
 
18def71
 
 
 
 
 
 
 
 
4dcdb35
 
18def71
 
 
 
4dcdb35
 
7ee1423
6d6872c
7ee1423
 
 
70b6f98
7ee1423
 
 
 
 
61f1538
6d6872c
 
 
 
 
 
 
 
61f1538
 
3db4b7a
 
 
 
61f1538
3db4b7a
4dcdb35
 
70b6f98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4dcdb35
8ec1357
86b715d
 
 
aff59fd
 
 
 
 
 
86b715d
18def71
6d6872c
 
 
 
7ee1423
70b6f98
7ee1423
fffa248
7ee1423
 
 
 
 
70b6f98
61f1538
7ee1423
 
 
 
 
 
6d6872c
 
 
7ee1423
 
e9361c0
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import os
import shutil
import uuid

import fitz  # PyMuPDF
import gradio as gr
from modelscope import AutoModel, AutoTokenizer
from PIL import Image, ImageEnhance

from got_ocr import got_ocr

# Initialize the model and tokenizer.
# Both are loaded once at import time and reused by every OCR request.
# NOTE(review): trust_remote_code=True lets the loader run Python code shipped
# with the model repository — confirm the source is trusted.
tokenizer = AutoTokenizer.from_pretrained("stepfun-ai/GOT-OCR2_0", trust_remote_code=True)
model = AutoModel.from_pretrained("stepfun-ai/GOT-OCR2_0", trust_remote_code=True, low_cpu_mem_usage=True, device_map="cuda", use_safetensors=True)
# Switch to evaluation mode and place the model on the CUDA device.
model = model.eval().cuda()

# Working directories: uploaded PDFs are copied into UPLOAD_FOLDER, rendered
# page images are written to RESULTS_FOLDER.
UPLOAD_FOLDER = "./uploads"
RESULTS_FOLDER = "./results"

# Make sure the required folders exist.
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(RESULTS_FOLDER, exist_ok=True)


def pdf_to_images(pdf_path, zoom=4, contrast=1.5):
    """Render every page of a PDF into a contrast-enhanced RGB PIL image.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        zoom: Scale factor applied on both axes when rasterizing a page;
            larger values give higher-resolution images. Defaults to 4.
        contrast: Factor passed to ``ImageEnhance.Contrast.enhance``;
            1.5 raises contrast by 50%. Defaults to 1.5.

    Returns:
        A list of ``PIL.Image.Image`` objects, one per page, in page order.
    """
    # The transform is the same for every page, so build it once.
    render_matrix = fitz.Matrix(zoom, zoom)
    images = []

    # The context manager closes the document even if rendering a page fails.
    with fitz.open(pdf_path) as pdf_document:
        for page in pdf_document:
            pix = page.get_pixmap(matrix=render_matrix, alpha=False)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            # Boost contrast before the image is handed to OCR.
            img = ImageEnhance.Contrast(img).enhance(contrast)
            images.append(img)

    return images


def process_pdf(pdf_file):
    """Convert an uploaded PDF into per-page PNG files for the gallery.

    Args:
        pdf_file: The uploaded file as delivered by the upload event: either
            an object exposing the file path as ``.name`` or a plain path
            string. ``None`` means nothing was uploaded.

    Returns:
        A list of PNG file paths, one per page in page order, or ``None``
        when ``pdf_file`` is ``None``.
    """
    if pdf_file is None:
        return None

    # Accept both a file-like wrapper (path in ``.name``) and a bare path.
    source_path = getattr(pdf_file, "name", pdf_file)

    upload_id = str(uuid.uuid4())
    temp_pdf_path = os.path.join(UPLOAD_FOLDER, f"{upload_id}.pdf")

    try:
        # Copy the upload to a private temporary location before rendering.
        shutil.copy(source_path, temp_pdf_path)
        images = pdf_to_images(temp_pdf_path)
    finally:
        # Always drop the temporary copy, even when rendering fails.
        if os.path.exists(temp_pdf_path):
            os.remove(temp_pdf_path)

    # Each upload gets its own output directory so that concurrent or
    # successive uploads cannot overwrite each other's page_N.png files.
    # These directories are not cleaned up by this function.
    output_dir = os.path.join(RESULTS_FOLDER, upload_id)
    os.makedirs(output_dir, exist_ok=True)

    image_paths = []
    for page_number, img in enumerate(images, start=1):
        img_path = os.path.join(output_dir, f"page_{page_number}.png")
        img.save(img_path, "PNG")
        image_paths.append(img_path)

    return image_paths


def on_image_select(evt: gr.SelectData):
    """Return the index carried by a gallery selection event.

    The value is passed through unchanged, so an event whose ``index`` is
    ``None`` yields ``None``.
    """
    return evt.index


# OCR entry point wired to the "start OCR" button.
def perform_ocr(selected_index, image_gallery, got_mode, fine_grained_type, color, box):
    """Run GOT OCR on the gallery image the user selected.

    Args:
        selected_index: Index of the selected gallery entry, or ``None`` when
            nothing has been selected yet.
        image_gallery: Current gallery value; each entry is indexed with
            ``[0]`` to obtain the image (presumably ``(image, caption)``
            pairs — verify against the Gradio version in use). May be
            ``None`` or empty before a PDF is uploaded.
        got_mode: OCR task name forwarded to ``got_ocr``.
        fine_grained_type: ``"color"``, ``"box"`` or another value; decides
            which of ``color`` / ``box`` is forwarded.
        color: Color name, used only when ``fine_grained_type == "color"``.
        box: Box string, used only when ``fine_grained_type == "box"``.

    Returns:
        A prompt string when no valid image is selected; otherwise a
        ``gr.Markdown`` holding either a download link plus an iframe of the
        HTML result, or the plain OCR text.
    """
    # Covers "nothing selected" as well as a gallery that is None or empty.
    if selected_index is None or not image_gallery:
        return "请先选择一张图片"

    # The stored index can be stale, e.g. after a shorter PDF replaced the
    # previous one; treat it like a missing selection instead of crashing.
    if not 0 <= selected_index < len(image_gallery):
        return "请先选择一张图片"

    selected_image = image_gallery[selected_index][0]

    # Forward only the fine-grained parameter that matches the chosen type.
    ocr_color = color if fine_grained_type == "color" else ""
    ocr_box = box if fine_grained_type == "box" else ""

    result, html_content = got_ocr(
        model,
        tokenizer,
        selected_image,
        got_mode=got_mode,
        fine_grained_mode=fine_grained_type,
        ocr_color=ocr_color,
        ocr_box=ocr_box,
    )

    if html_content:
        # html_content is embedded in a base64 data URI, so it is assumed to
        # be base64-encoded HTML already — TODO confirm against got_ocr.
        iframe_src = f"data:text/html;base64,{html_content}"
        iframe = f'<iframe src="{iframe_src}" width="100%" height="600px"></iframe>'
        download_link = f'<a href="data:text/html;base64,{html_content}" download="result.html">下载完整结果</a>'
        return gr.Markdown(f"{download_link}\n\n{iframe}")

    # NOTE(review): the text is shown unescaped in an HTML output component;
    # confirm got_ocr's plain result is safe to render as-is.
    return gr.Markdown(result)


def task_update(task):
    """Build visibility updates for the fine-grained controls.

    Returns three ``gr.update`` objects, in order: the fine-grained type
    dropdown (visible only when ``task`` contains ``"fine-grained"``), then
    the color dropdown and the box input, both of which are always hidden
    by this handler.
    """
    show_fine_grained = "fine-grained" in task
    return [
        gr.update(visible=show_fine_grained),
        gr.update(visible=False),
        gr.update(visible=False),
    ]


def fine_grained_update(fine_grained_type):
    """Build visibility updates for the color dropdown and the box input.

    Returns two ``gr.update`` objects, in order: the color dropdown (visible
    only for ``"color"``) and the box input (visible only for ``"box"``).
    Any other value hides both.
    """
    color_visible = fine_grained_type == "color"
    box_visible = fine_grained_type == "box"
    return [gr.update(visible=color_visible), gr.update(visible=box_visible)]


# UI definition: upload a PDF, preview its pages in a gallery, choose an OCR
# mode and run OCR on the selected page.
with gr.Blocks() as demo:
    pdf_input = gr.File(label="上传PDF文件")
    image_gallery = gr.Gallery(
        label="PDF页面预览",
        columns=3,
        height=600,
        object_fit="contain",
        preview=True,
        show_download_button=True,  # show the download button
        show_fullscreen_button=True,  # show the fullscreen button
        allow_preview=True,  # allow preview (including fullscreen viewing)
    )
    # Holds the index of the gallery image the user last selected.
    selected_index = gr.State(None)
    # Uploading a PDF fills the gallery with its rendered pages.
    pdf_input.upload(fn=process_pdf, inputs=pdf_input, outputs=image_gallery)
    # Selecting a gallery image records its index in the state above.
    image_gallery.select(fn=on_image_select, inputs=[], outputs=selected_index)

    # ---- OCR options ----
    task_dropdown = gr.Dropdown(
        choices=["plain texts OCR", "format texts OCR", "plain multi-crop OCR", "format multi-crop OCR", "plain fine-grained OCR", "format fine-grained OCR"],
        label="选择GOT模式",
        value="format texts OCR",
    )
    # The fine-grained controls start hidden; the change handlers below
    # toggle their visibility.
    fine_grained_dropdown = gr.Dropdown(choices=["box", "color"], label="fine-grained类型", visible=False)
    color_dropdown = gr.Dropdown(choices=["red", "green", "blue"], label="颜色列表", visible=False)
    box_input = gr.Textbox(label="输入框: [x1,y1,x2,y2]", placeholder="例如: [0,0,100,100]", visible=False)

    ocr_button = gr.Button("开始OCR识别")
    ocr_result = gr.HTML(label="OCR结果")  # HTML component (changed from a Textbox)

    task_dropdown.change(task_update, inputs=[task_dropdown], outputs=[fine_grained_dropdown, color_dropdown, box_input])
    fine_grained_dropdown.change(fine_grained_update, inputs=[fine_grained_dropdown], outputs=[color_dropdown, box_input])

    # Click handler for ocr_button: passes every parameter perform_ocr needs.
    ocr_button.click(
        fn=perform_ocr,
        inputs=[selected_index, image_gallery, task_dropdown, fine_grained_dropdown, color_dropdown, box_input],
        outputs=ocr_result,
    )


# Start the Gradio app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()