# Streamlit app: classify Indonesian app-store reviews as Useful / Not Useful
# with an IndoBERT-BiLSTM model.
#
# NOTE: best-effort import guard kept from the original — a missing dependency
# is reported instead of crashing at import time. torch / pandas / re are
# imported first so the pure helpers below stay usable even if the UI
# dependencies (streamlit, transformers, stqdm) are absent.
try:
    import torch
    import pandas as pd
    import re
    import streamlit as st
    from transformers import BertTokenizer
    from model import IndoBERTBiLSTM
    from stqdm import stqdm
except Exception as e:
    print(e)

# Config
MAX_SEQ_LEN = 128
MODELS_PATH = "kadabengaran/IndoBERT-BiLSTM-Useful-App-Review"
LABELS = {'Not Useful': 0, 'Useful': 1}


def get_device():
    """Return the CUDA device when available, otherwise the CPU device."""
    if torch.cuda.is_available():
        return torch.device('cuda')
    return torch.device('cpu')


USE_CUDA = False
device = get_device()
if device.type == 'cuda':
    USE_CUDA = True


def get_key(val, my_dict):
    """Reverse lookup: return the first key of *my_dict* whose value == *val*.

    Returns None when no value matches.
    """
    for key, value in my_dict.items():
        if val == value:
            return key
    return None


def load_tokenizer(model_path):
    """Load the BERT tokenizer for *model_path* from the HuggingFace hub."""
    return BertTokenizer.from_pretrained(model_path)


def remove_special_characters(text):
    """Lowercase *text* and strip everything but ASCII letters and whitespace.

    Digits and punctuation become spaces; runs of whitespace collapse to a
    single space (a trailing/leading single space may remain, as in the
    original implementation).
    """
    # Case folding first so one character class covers all letters.
    text = text.lower()
    # Single pass replaces the original two-step (strip specials, then strip
    # digits) — net effect is identical: only [a-z] and whitespace survive.
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Collapse runs of whitespace to a single space.
    text = re.sub(r'\s+', ' ', text)
    return text


def preprocess(text, tokenizer, max_seq=MAX_SEQ_LEN):
    """Tokenize *text* into fixed-length ``input_ids``/``attention_mask`` tensors.

    Pads to *max_seq* and truncates longer inputs so the model always sees a
    (1, max_seq) tensor.
    """
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_seq,
        # 'padding'/'truncation' replace the deprecated pad_to_max_length=True.
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )


def load_model():
    """Load the fine-tuned IndoBERT-BiLSTM classifier from the hub."""
    return IndoBERTBiLSTM.from_pretrained(MODELS_PATH)


def classify_single(text, model, tokenizer, device):
    """Classify one review *text*; return the integer label index (see LABELS)."""
    if device.type == 'cuda':
        model.cuda()

    # Clean and tokenize the sentence; encode_plus already returns batched
    # (1, max_seq) tensors, so no list/torch.cat juggling is needed.
    encoding = preprocess(remove_special_characters(text), tokenizer)

    # Forward pass without gradient tracking.
    with torch.no_grad():
        outputs = model(encoding['input_ids'].to(device),
                        encoding['attention_mask'].to(device))
    # argmax over the class dimension gives the predicted label index.
    return torch.argmax(outputs, dim=-1).item()


def classify_multiple(data, model, tokenizer, device):
    """Classify every row of the pandas Series *data*.

    Returns a list of label names ('Useful' / 'Not Useful'), one per row,
    in the original order.
    """
    if device.type == 'cuda':
        model.cuda()

    # Pre-tokenize all rows up front.
    encodings = []
    for row in data.tolist():
        cleaned = remove_special_characters(row)
        encodings.append(preprocess(cleaned, tokenizer))

    result_list = []
    with torch.no_grad():
        # stqdm renders a progress bar in the Streamlit UI.
        for enc in stqdm(encodings):
            outputs = model(enc['input_ids'].to(device),
                            enc['attention_mask'].to(device))
            prediction = torch.argmax(outputs, dim=-1)
            result_list.append(get_key(prediction.item(), LABELS))
    return result_list


tab_labels = ["Single Input", "Multiple Input"]


class App:
    """Streamlit UI: single-text or CSV-batch review classification."""

    def __init__(self):
        self.fileTypes = ["csv"]
        self.default_tab_selected = tab_labels[0]
        self.input_text = None    # text from the single-input tab
        self.csv_input = None     # full uploaded DataFrame
        self.csv_process = None   # selected column (Series of str) to classify

    def run(self):
        """App Review Classifier — main entry point."""
        self.init_session_state()  # Initialize session state
        tokenizer = load_tokenizer(MODELS_PATH)
        # TODO(review): consider st.cache_resource so the model/tokenizer are
        # not reloaded on every Streamlit rerun.
        model = load_model()
        html_temp = """

Klasifikasi Ulasan Aplikasi yang Berguna

"""
        st.markdown(html_temp, unsafe_allow_html=True)
        st.markdown("")
        self.render_tabs()
        st.divider()
        self.render_process_button(model, tokenizer, device)

    def init_session_state(self):
        if "tab_selected" not in st.session_state:
            st.session_state.tab_selected = tab_labels[0]

    def render_tabs(self):
        """Render the sidebar input-type selector and the matching input widget."""
        tab_selected = st.sidebar.radio("Select Input Type", tab_labels)
        if USE_CUDA:
            st.sidebar.markdown(footer, unsafe_allow_html=True)
        if tab_selected == tab_labels[0]:
            self.render_single_input()
        elif tab_selected == tab_labels[1]:
            self.render_multiple_input()
        st.session_state.tab_selected = tab_selected

    def render_single_input(self):
        self.input_text = st.text_area("Enter Text Here", placeholder="Type Here")

    def render_multiple_input(self):
        """Upload a CSV file and pick the column to classify."""
        st.markdown("Upload file")
        file = st.file_uploader(
            "To ensure a smooth process, please use a maximum of 500 rows of data in the CSV file.",
            type=self.fileTypes)
        if not file:
            st.info("Please upload a file of type: " + ", ".join(self.fileTypes))
            return
        data = pd.read_csv(file)
        placeholder = st.empty()
        placeholder.dataframe(data.head(10))
        header_list = data.columns.tolist()
        # Sentinel first entry forces the user to make an explicit choice.
        header_list.insert(0, "---------- select column -------------")
        ques = st.radio("Select column to process", header_list, index=0)
        if header_list.index(ques) == 0:
            st.warning("Please select a column to process")
            return
        df_process = data[ques].astype(str)
        self.csv_input = data
        self.csv_process = df_process

    def render_process_button(self, model, tokenizer, device):
        """Run classification for the active tab when 'Process' is clicked."""
        if not st.button("Process"):
            return
        if st.session_state.tab_selected == tab_labels[0]:
            input_text = self.input_text
            if input_text:
                classification = classify_single(input_text, model, tokenizer, device)
                classification_label = get_key(classification, LABELS)
                st.write("Classification result:", classification_label)
            else:
                st.warning('Please enter text to process', icon="⚠️")
        elif st.session_state.tab_selected == tab_labels[1]:
            df_process = self.csv_process
            if df_process is not None:
                classification = classify_multiple(df_process, model, tokenizer, device)
                st.divider()
                st.write("Classification Result")
                input_file = self.csv_input
                input_file["classification_result"] = classification
                st.dataframe(input_file.head(10))
                st.download_button(
                    label="Download Result",
                    data=input_file.to_csv().encode("utf-8"),
                    file_name="classification_result.csv",
                    mime="text/csv",
                )
            else:
                st.warning('Please upload a file to process', icon="⚠️")


# Sidebar footer markup (content was empty in the original source).
footer = """ """

if __name__ == "__main__":
    app = App()
    app.run()