import re

import streamlit as st
import torch
from PIL import Image
from transformers import DonutProcessor, VisionEncoderDecoderModel

# Document classifier: Donut fine-tuned on RVL-CDIP (16 document classes).
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip")
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
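
# Streamlit re-executes this whole script on every widget interaction, so the
# checkpoints above are reloaded on each rerun. A minimal caching sketch,
# assuming Streamlit >= 1.18 (which provides st.cache_resource):
#
# @st.cache_resource
# def load_donut(checkpoint):
#     return (DonutProcessor.from_pretrained(checkpoint),
#             VisionEncoderDecoderModel.from_pretrained(checkpoint))
#
# processor, model = load_donut("naver-clova-ix/donut-base-finetuned-rvlcdip")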


st.title("Classify Document Image")

file_name = st.file_uploader("Upload a candidate image")
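
# st.file_uploader also accepts a `type` filter to reject non-image uploads,
# e.g. st.file_uploader("Upload a candidate image", type=["png", "jpg", "jpeg", "tif", "tiff"])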

if file_name is not None:
    col1, col2, col3 = st.columns(3)

    # The uploaded file is file-like, so PIL can open it directly; Donut
    # expects three channels, hence the RGB conversion.
    image = Image.open(file_name)
    image = image.convert("RGB")


    # Donut is steered by a task-specific start token; "<s_rvlcdip>" selects
    # the document-classification task.
    task_prompt = "<s_rvlcdip>"
    decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids

    # The processor resizes and normalizes the image into the tensor layout the encoder expects.
    pixel_values = processor(image, return_tensors="pt").pixel_values

    # Greedy decoding until EOS; bad_words_ids suppresses the unknown token.
    outputs = model.generate(
        pixel_values.to(device),
        decoder_input_ids=decoder_input_ids.to(device),
        max_length=model.decoder.config.max_position_embeddings,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    sequence = processor.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
    classification = processor.token2json(sequence)
    print(classification)

    col1.image(image, use_column_width=True)

    col2.header("Results")
    col2.subheader(classification)
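
    # Sketch: the rvlcdip output normally has the shape {"class": "<label>"}
    # (an assumption about the task format), so showing just the label would
    # read more cleanly than the raw dict:
    # col2.subheader(classification.get("class", "unknown"))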
    
    # Information extractor: Donut fine-tuned on CORD-v2 (receipt key/value parsing).
    processor_ext = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
    model_ext = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
    model_ext.to(device)

    # Prepare decoder inputs: "<s_cord-v2>" selects the extraction task.
    task_prompt = "<s_cord-v2>"
    decoder_input_ids = processor_ext.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids

    pixel_values = processor_ext(image, return_tensors="pt").pixel_values

    outputs = model_ext.generate(
        pixel_values.to(device),
        decoder_input_ids=decoder_input_ids.to(device),
        max_length=model_ext.decoder.config.max_position_embeddings,
        pad_token_id=processor_ext.tokenizer.pad_token_id,
        eos_token_id=processor_ext.tokenizer.eos_token_id,
        use_cache=True,
        bad_words_ids=[[processor_ext.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    sequence = processor_ext.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(processor_ext.tokenizer.eos_token, "").replace(processor_ext.tokenizer.pad_token, "")
    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
    extraction = processor_ext.token2json(sequence)
    print(extraction)

    col3.header("Features")
    col3.subheader(extraction)
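
    # Sketch: st.json renders the nested extraction dict more readably than a
    # subheader, if preferred:
    # col3.json(extraction)

# To run the app locally (assuming this file is saved as app.py):
#   streamlit run app.py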