|
import pytesseract |
|
from IdentifyModel.cardModel import parse_id_card |
|
from transformers import BertTokenizer, BertForTokenClassification |
|
from transformers import pipeline |
|
|
|
|
|
# Checkpoints used to build the module-wide NER pipeline: the vocabulary comes
# from the base Chinese BERT checkpoint, the token-classification weights from
# the CKIP NER checkpoint.
_TOKENIZER_CHECKPOINT = "bert-base-chinese"
_NER_MODEL_CHECKPOINT = "ckiplab/bert-base-chinese-ner"

# Both objects are loaded once, at import time, and shared by every caller.
# NOTE(review): BertTokenizer is the slow (pure-Python) tokenizer; the pipeline
# presumably cannot report character offsets ('start'/'end') with it -- confirm
# against the transformers documentation before relying on those fields.
tokenizer = BertTokenizer.from_pretrained(_TOKENIZER_CHECKPOINT)
model = BertForTokenClassification.from_pretrained(_NER_MODEL_CHECKPOINT)

# Token-level NER pipeline (no aggregation strategy is configured here).
ner_pipeline = pipeline(task="ner", model=model, tokenizer=tokenizer)
|
|
|
|
|
def extract_entities(text):
    """Run the NER pipeline on *text* and merge the tagged tokens per label.

    Args:
        text: The string handed to ``ner_pipeline``.

    Returns:
        A dict mapping each ``entity`` label found in the pipeline output to
        the concatenation of the ``word`` values of every result carrying that
        label, in the order the pipeline produced them.
    """
    words_by_label = {}
    for hit in ner_pipeline(text):
        # Collect the pieces first and join once at the end.
        words_by_label.setdefault(hit['entity'], []).append(hit['word'])
    return {label: "".join(words) for label, words in words_by_label.items()}
|
|
|
|
|
def llm_recognition(image, validation_type, language):
    """OCR an image, tag the recognised text with NER and parse the result.

    Args:
        image: Image input forwarded unchanged to
            ``pytesseract.image_to_string``.
        validation_type: Forwarded unchanged to ``parse_id_card``.
        language: Tesseract language code(s), passed as ``lang``.

    Returns:
        Whatever ``parse_id_card(text, validation_type, entities)`` returns,
        where ``text`` is the OCR output and ``entities`` maps each NER label
        to the concatenated text of all tokens tagged with that label.
    """
    text = pytesseract.image_to_string(image, lang=language)

    pieces_by_label = {}
    for result in ner_pipeline(text):
        start = result.get('start')
        end = result.get('end')
        if start is None or end is None:
            # Character offsets are only reported when the tokenizer can
            # provide them; slicing with None would yield the *entire* text,
            # so fall back to the token string the pipeline returned.
            piece = result['word']
        else:
            piece = text[start:end]
        # The pipeline emits one result per token, so a label normally occurs
        # several times; keep every piece instead of letting the last one
        # overwrite the others.
        pieces_by_label.setdefault(result['entity'], []).append(piece)

    entities = {
        label: "".join(pieces) for label, pieces in pieces_by_label.items()
    }
    return parse_id_card(text, validation_type, entities)
|
|