"""OCR an ID-card image and extract named entities from the recognized text."""

import pytesseract
from transformers import BertForTokenClassification, BertTokenizer, pipeline

from IdentifyModel.cardModel import parse_id_card

# Load the pretrained model and tokenizer.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
model = BertForTokenClassification.from_pretrained("ckiplab/bert-base-chinese-ner")

# Initialize the Taiwanese BERT NER pipeline.
# NOTE(review): `BertTokenizer` is the slow (pure-Python) tokenizer; per the
# Transformers docs the pipeline only reports character offsets
# (`start`/`end`) with fast tokenizers, so those fields may be None here.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)


def extract_entities(text):
    """Run NER over *text* and concatenate the token strings of each label.

    Args:
        text: The string handed to ``ner_pipeline``.

    Returns:
        A dict mapping each ``entity`` label reported by the pipeline to the
        concatenation (in pipeline order) of the ``word`` values carrying
        that label.
    """
    words_by_label = {}
    for result in ner_pipeline(text):
        words_by_label.setdefault(result["entity"], []).append(result["word"])
    # Join once per label instead of growing strings with repeated `+=`.
    return {label: "".join(words) for label, words in words_by_label.items()}


def _entity_surface(text, result):
    """Return the piece of *text* a single NER result refers to.

    Uses the ``start``/``end`` character offsets when the pipeline supplies
    them and falls back to the result's ``word`` otherwise.
    """
    start = result.get("start")
    end = result.get("end")
    if start is None or end is None:
        # Without offsets, `text[None:None]` would silently be the whole
        # text, so use the token string instead.
        return result["word"]
    return text[start:end]


def llm_recognition(image, validation_type, language):
    """OCR *image*, tag the text with NER, and delegate to ``parse_id_card``.

    Args:
        image: Image passed to ``pytesseract.image_to_string``.
        validation_type: Forwarded unchanged to ``parse_id_card``.
        language: Tesseract language code(s), passed as ``lang``.

    Returns:
        Whatever ``parse_id_card(text, validation_type, entities)`` returns,
        where ``text`` is the OCR output and ``entities`` maps each NER label
        to the concatenated source text of all tokens carrying that label.
    """
    text = pytesseract.image_to_string(image, lang=language)

    # Accumulate every token per label (in order) rather than letting later
    # tokens overwrite earlier ones with the same label.
    parts_by_label = {}
    for result in ner_pipeline(text):
        surface = _entity_surface(text, result)
        parts_by_label.setdefault(result["entity"], []).append(surface)
    entities = {label: "".join(parts) for label, parts in parts_by_label.items()}

    return parse_id_card(text, validation_type, entities)