File size: 1,086 Bytes
e651999 03b6d75 e651999 03b6d75 e651999 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 |
import pytesseract
from IdentifyModel.cardModel import parse_id_card
from transformers import BertTokenizer, BertForTokenClassification
from transformers import pipeline
# Load the pretrained tokenizer and token-classification model.
# The tokenizer comes from the "bert-base-chinese" checkpoint while the model
# weights come from "ckiplab/bert-base-chinese-ner".
# NOTE(review): BertTokenizer is the slow (pure-Python) tokenizer; per the
# transformers docs the NER pipeline only reports character offsets
# ('start'/'end') when the tokenizer can provide them — confirm whether
# BertTokenizerFast was intended.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
model = BertForTokenClassification.from_pretrained("ckiplab/bert-base-chinese-ner")
# Build the shared NER pipeline from the model and tokenizer loaded above.
# This runs at import time, so importing this module loads the model.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)
def extract_entities(text):
    """Run the module-level NER pipeline on *text* and merge tokens by label.

    Args:
        text: The string handed to ``ner_pipeline``.

    Returns:
        A dict mapping each ``'entity'`` label reported by the pipeline to the
        concatenation, in pipeline order, of the ``'word'`` values carrying
        that label.
    """
    merged = {}
    for hit in ner_pipeline(text):
        label, token = hit['entity'], hit['word']
        # First occurrence stores the token as-is; later ones are appended.
        merged[label] = merged[label] + token if label in merged else token
    return merged
def llm_recognition(image, validation_type, language):
    """OCR *image*, tag the text with NER, and pass both to ``parse_id_card``.

    Args:
        image: Image passed straight to ``pytesseract.image_to_string``.
        validation_type: Forwarded unchanged to ``parse_id_card``.
        language: Tesseract language code(s), used as the ``lang`` argument.

    Returns:
        Whatever ``parse_id_card(text, validation_type, entities)`` returns,
        where ``entities`` maps each NER label to the concatenated text of all
        tokens carrying that label, in pipeline order.
    """
    text = pytesseract.image_to_string(image, lang=language)

    entities = {}
    for result in ner_pipeline(text):
        start, end = result.get('start'), result.get('end')
        if start is None or end is None:
            # The pipeline only fills in offsets when the tokenizer provides
            # them; slicing with None:None would return the whole OCR text,
            # so fall back to the token string itself.
            piece = result['word']
        else:
            piece = text[start:end]
        label = result['entity']
        # Accumulate instead of overwriting: the pipeline emits one result per
        # token, so a plain dict assignment would keep only the last token of
        # each label.
        entities[label] = entities.get(label, '') + piece
    return parse_id_card(text, validation_type, entities)
|