try: import torch import pandas as pd import streamlit as st import re from transformers import BertTokenizer from model import IndoBERTBiLSTM from stqdm import stqdm except Exception as e: print(e) # Config MAX_SEQ_LEN = 128 MODELS_PATH = "kadabengaran/IndoBERT-BiLSTM-Useful-App-Review" LABELS = {'Not Useful': 0, 'Useful': 1} def get_device(): if torch.cuda.is_available(): return torch.device('cuda') else: return torch.device('cpu') USE_CUDA = False device = get_device() if device.type == 'cuda': USE_CUDA = True # Get the Keys def get_key(val, my_dict): for key, value in my_dict.items(): if val == value: return key def load_tokenizer(model_path): tokenizer = BertTokenizer.from_pretrained(model_path) return tokenizer def remove_special_characters(text): # case folding text = text.lower() # menghapus karakter khusus text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text) text = re.sub(r'[0-9]', ' ', text) # replace multiple whitespace characters with a single space text = re.sub(r"\s+", " ", text) return text def preprocess(text, tokenizer, max_seq=MAX_SEQ_LEN): return tokenizer.encode_plus(text, add_special_tokens=True, max_length=max_seq, pad_to_max_length=True, return_attention_mask=True, return_tensors='pt' ) def load_model(): model = IndoBERTBiLSTM.from_pretrained(MODELS_PATH) return model def classify_single(text, model, tokenizer, device): if device.type == 'cuda': model.cuda() # We need Token IDs and Attention Mask for inference on the new sentence test_ids = [] test_attention_mask = [] # Apply preprocessing to the new sentence new_sentence = remove_special_characters(text) encoding = preprocess(new_sentence, tokenizer) # Extract IDs and Attention Mask test_ids.append(encoding['input_ids']) test_attention_mask.append(encoding['attention_mask']) test_ids = torch.cat(test_ids, dim=0) test_attention_mask = torch.cat(test_attention_mask, dim=0) # Forward pass, calculate logit with torch.no_grad(): outputs = model(test_ids.to(device), test_attention_mask.to(device)) print("output ", outputs) result = torch.argmax(outputs, dim=-1) print("output ", result) return result.item() def classify_multiple(data, model, tokenizer, device): if device.type == 'cuda': model.cuda() input_ids = [] attention_masks = [] for row in data.tolist(): text = remove_special_characters(row) text = preprocess(text, tokenizer) input_ids.append(text['input_ids']) attention_masks.append(text['attention_mask']) result_list = [] with torch.no_grad(): for i in stqdm(range(len(input_ids))): test_ids = input_ids[i] test_attention_mask = attention_masks[i] outputs = model(test_ids.to(device), test_attention_mask.to(device)) result = torch.argmax(outputs, dim= -1) result_label = get_key(result.item(), LABELS) result_list.append(result_label) return result_list tab_labels = ["Single Input", "Multiple Input"] class App: def __init__(self): self.fileTypes = ["csv"] self.default_tab_selected = tab_labels[0] self.input_text = None self.csv_input = None self.csv_process = None def run(self): self.init_session_state() # Initialize session state tokenizer = load_tokenizer(MODELS_PATH) model = load_model() """App Review Classifier""" html_temp = """