import re

import numpy as np
import streamlit as st
import torch
from datasets import load_dataset
from PIL import Image
from transformers import DonutProcessor, VisionEncoderDecoderModel
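
# Stage 1: document classification with Donut fine-tuned on RVL-CDIP
# (16 document classes, e.g. invoice, letter, form).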
processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip")
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip")
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
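
# Streamlit re-executes this script on every interaction; wrapping the
# from_pretrained() calls in a function decorated with @st.cache_resource
# would avoid reloading the weights on each rerun.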

# Local smoke test without Streamlit (uncomment to run):
#image = Image.open(r"C:\Invoices\Sample Invoices\sample invoice 1.tif")
#image = image.convert("RGB")
#print(np.array(image).shape)
st.title("Classify Document Image")
file_name = st.file_uploader("Upload a document image")
if file_name is not None:
    col1, col2, col3 = st.columns(3)
    image = Image.open(file_name)
    image = image.convert("RGB")
    # Alternative: load a sample document image from the Hugging Face Hub
    #dataset = load_dataset("hf-internal-testing/example-documents", split="test")
    #image = dataset[2]["image"]
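    # Donut frames classification as sequence generation: the decoder is
    # seeded with the task start token "<s_rvlcdip>" and emits the predicted
    # document class as special tokens.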
task_prompt = "<s_rvlcdip>"
decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
pixel_values = processor(image, return_tensors="pt").pixel_values
outputs = model.generate(
pixel_values.to(device),
decoder_input_ids=decoder_input_ids.to(device),
max_length=model.decoder.config.max_position_embeddings,
pad_token_id=processor.tokenizer.pad_token_id,
eos_token_id=processor.tokenizer.eos_token_id,
use_cache=True,
bad_words_ids=[[processor.tokenizer.unk_token_id]],
return_dict_in_generate=True,
)
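    # Decode the generated ids, strip the pad/eos special tokens, and drop
    # the leading task token before converting the sequence to JSON.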
    sequence = processor.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
    classification = processor.token2json(sequence)
    print(classification)
    col1.image(image, use_column_width=True)
    col2.header("Results")
    col2.subheader(classification)
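
    # Stage 2: field extraction with a Donut checkpoint fine-tuned on CORD v2
    # (a receipt dataset), which parses structured fields such as line items
    # and totals from the same image.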
    processor_ext = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
    model_ext = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
    model_ext.to(device)
    # prepare decoder inputs
    task_prompt = "<s_cord-v2>"
    decoder_input_ids = processor_ext.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
    pixel_values = processor_ext(image, return_tensors="pt").pixel_values
    outputs = model_ext.generate(
        pixel_values.to(device),
        decoder_input_ids=decoder_input_ids.to(device),
        max_length=model_ext.decoder.config.max_position_embeddings,
        pad_token_id=processor_ext.tokenizer.pad_token_id,
        eos_token_id=processor_ext.tokenizer.eos_token_id,
        use_cache=True,
        bad_words_ids=[[processor_ext.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )
    sequence = processor_ext.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(processor_ext.tokenizer.eos_token, "").replace(processor_ext.tokenizer.pad_token, "")
    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
    features = processor_ext.token2json(sequence)
    print(features)
    col3.header("Features")
    col3.subheader(features)
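
# To launch the app (assuming this file is saved as app.py):
#   streamlit run app.py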