File size: 1,086 Bytes
e651999
 
03b6d75
 
 
 
 
 
e651999
 
 
 
 
03b6d75
 
 
 
 
 
 
 
 
 
 
 
 
e651999
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import pytesseract
from IdentifyModel.cardModel import parse_id_card
from transformers import BertTokenizer, BertForTokenClassification
from transformers import pipeline

# Load the pretrained token-classification model and the tokenizer.
# The two loads are independent of each other.
model = BertForTokenClassification.from_pretrained("ckiplab/bert-base-chinese-ner")
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

# Build the shared NER pipeline around the Chinese BERT model loaded above.
# NOTE(review): BertTokenizer is the Python ("slow") tokenizer; the pipeline
# presumably cannot report character offsets ('start'/'end') with it --
# confirm against the transformers docs before relying on those fields.
ner_pipeline = pipeline(task="ner", model=model, tokenizer=tokenizer)


def extract_entities(text):
    """Run the NER pipeline on *text* and merge the tagged pieces by label.

    Args:
        text: String handed to the module-level ``ner_pipeline``.

    Returns:
        A dict mapping each ``'entity'`` label seen in the pipeline output to
        the concatenation, in output order, of the ``'word'`` values that
        carried that label.
    """
    merged = {}
    for token in ner_pipeline(text):
        label, piece = token['entity'], token['word']
        # The first piece for a label seeds its entry; later pieces are
        # appended directly, with no separator.
        merged[label] = merged[label] + piece if label in merged else piece
    return merged


def llm_recognition(image, validation_type, language):
    """OCR *image*, tag the recognized text with NER, and parse the result.

    Args:
        image: Image passed straight to ``pytesseract.image_to_string``.
        validation_type: Forwarded unchanged to ``parse_id_card``.
        language: Value used as the ``lang`` argument of the OCR call.

    Returns:
        Whatever ``parse_id_card(text, validation_type, entities)`` returns,
        where ``entities`` maps each NER label to the text of the last token
        that carried it.
    """
    text = pytesseract.image_to_string(image, lang=language)

    entities = {}
    for result in ner_pipeline(text):
        start, end = result.get('start'), result.get('end')
        if start is None or end is None:
            # Without character offsets, text[None:None] would silently copy
            # the entire OCR text into every entity; use the token text.
            span = result['word']
        else:
            span = text[start:end]
        # NOTE(review): a repeated label overwrites the earlier span
        # (last one wins), as in the original comprehension -- confirm this
        # is what parse_id_card expects.
        entities[result['entity']] = span

    return parse_id_card(text, validation_type, entities)