File size: 2,032 Bytes
1d2c57b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
from typing import Any
from subprocess import run
from docquery import document, pipeline
import tempfile
import os
# from transformers import AutoConfig, AutoTokenizer, LayoutLMForQuestionAnswering
# Install the Tesseract OCR system package when this module is imported.
# Only the `tesseract-ocr` apt package is installed here; pytesseract is not.
# (Presumably required by docquery's OCR step - confirm against deployment docs.)
# An argument list is used rather than a shell string so no shell is spawned;
# check=True aborts the import if the installation fails.
run(["apt", "install", "-y", "tesseract-ocr"], check=True)
class EndpointHandler:
    """Callable handler that answers a fixed set of invoice questions about a PDF.

    The handler wraps a docquery ``document-question-answering`` pipeline and
    runs every question in ``QUESTIONS`` against each submitted document.
    """

    # Questions asked of every document; answers are returned in this order.
    QUESTIONS = ("What is the invoice number?", "What is the invoice total?")

    def __init__(self, path=""):
        """Create the question-answering pipeline.

        Args:
            path: Passed unchanged to docquery's ``pipeline`` as ``model``.
        """
        self.pipeline = pipeline('document-question-answering', model=path)

    def __call__(self, data: dict[str, bytes]) -> dict[str, list[Any]]:
        """Run every question in ``QUESTIONS`` against the submitted PDF.

        Args:
            data: Request payload. The PDF bytes are taken from the
                ``"inputs"`` key, which is removed from ``data``. When the key
                is absent the dict itself is used as the payload, which fails
                with ``TypeError`` when it is written to disk.

        Returns:
            ``{"predictions": [...]}`` holding one pipeline result per
            question, in ``QUESTIONS`` order.

        Raises:
            TypeError: If the payload is not a bytes-like object.
            Exception: Anything raised while loading the document or running
                the pipeline propagates unchanged.
        """
        # process input
        f_bytes = data.pop("inputs", data)

        # docquery loads documents from a path, so the bytes are spooled to a
        # private scratch directory. The context manager removes the directory
        # (and the PDF inside it) on success and on failure alike.
        with tempfile.TemporaryDirectory() as scratch_dir:
            temp_file_path = os.path.join(scratch_dir, 'document.pdf')
            with open(temp_file_path, 'wb') as temp_file:
                temp_file.write(f_bytes)

            doc = document.load_document(temp_file_path)
            results = [
                self.pipeline(question=q, **doc.context)
                for q in self.QUESTIONS
            ]

        return {"predictions": results}
|