# Import the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import nltk
import tensorflow as tf
import re

# Download the tokenizer models and stopword lists
nltk.download('punkt')
nltk.download('stopwords')

# Import NLTK components
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

# Import scikit-learn and transformers components
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.manifold import TSNE
from transformers import BertTokenizer, TFBertForSequenceClassification
df = pd.read_csv("dataset_tweet_sentiment_pilkada_DKI_2017.csv")
df.rename(columns={
    'Sentiment': 'sentiment',
    'Pasangan Calon': 'calon',
    'Text Tweet': 'text'
}, inplace=True)
df.dropna(inplace=True)
# Preprocessing
def clean_text(text):
    text = re.sub(r"https?://\S+|www\.\S+", "", text)  # remove URLs
    text = re.sub(r"@\S+", "", text)                   # remove mentions
    text = re.sub(r"#\S+", "", text)                   # remove hashtags
    text = re.sub(r"\d+", "", text)                    # remove numbers
    text = re.sub(r"[^\w\s]", "", text)                # remove punctuation
    text = re.sub(r"(.)\1{2,}", r"\1", text)           # collapse characters repeated 3+ times
    text = text.strip()                                # trim leading/trailing whitespace
    text = text.lower()                                # lowercase
    return text
# Merge the domain-specific stopword list with NLTK's Indonesian stopwords
stopword_pilkada = pd.read_csv("stopword_tweet_pilkada_DKI_2017.csv", header=None)
stopword_pilkada.columns = ['stopword']
stop_words = set(stopwords.words('indonesian'))
additional_sw = set(stopword_pilkada.stopword.values)
stop_words = stop_words.union(additional_sw)
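# Optional: a minimal sketch of persisting the merged stopword set for reuse,
# using the pickle import above. The filename is hypothetical, not part of the
# original pipeline.
with open("stopwords_combined.pkl", "wb") as f:
    pickle.dump(stop_words, f)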
def remove_stopwords(text):
    word_tokens = word_tokenize(text)
    filtered_sentence = [w for w in word_tokens if w not in stop_words]
    return " ".join(filtered_sentence)

def preprocess_text(text):
    text = clean_text(text)
    text = remove_stopwords(text)
    return text

text_to_process = "sangat gak bagus pak ahok"
processed_text = preprocess_text(text_to_process)
print(processed_text)
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=42)
print("Train Data Size: ", len(df_train))       # 70%
print("Validation Data Size: ", len(df_val))    # 15%
print("Test Data Size: ", len(df_test))         # 15%
PRETRAINED_MODEL = "indobenchmark/indobert-base-p2"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL)
vocab = tokenizer.get_vocab()
# Check the token-length distribution to pick a maximum sequence length
token_lens = []
for txt in df["text"]:
    tokens = tokenizer.encode(txt)
    token_lens.append(len(tokens))
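# Visualizing the distribution helps justify the MAX_LEN choice below. A
# minimal sketch using the seaborn and matplotlib imports above (histplot
# assumes seaborn >= 0.11):
sns.histplot(token_lens)
plt.xlabel("Token count per tweet")
plt.show()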
MAX_LEN = 60

# Convert string labels to numeric for all three splits (df_test is mapped
# here too, so its labels are numeric before it is encoded below)
df_train['sentiment'] = df_train['sentiment'].map({'positive': 1, 'negative': 0})
df_val['sentiment'] = df_val['sentiment'].map({'positive': 1, 'negative': 0})
df_test['sentiment'] = df_test['sentiment'].map({'positive': 1, 'negative': 0})
def encode_sentence(sent):
    return tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        padding='max_length',
        truncation='longest_first',
        max_length=MAX_LEN,
        return_attention_mask=True,
        return_token_type_ids=True
    )

def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "token_type_ids": token_type_ids,
    }, label
def encode_dataset(ds, limit=-1):
    input_ids_list = []
    attention_mask_list = []
    token_type_ids_list = []
    label_list = []
    # enumerate() provides a running counter; the DataFrame index is not
    # sequential after train_test_split, so it cannot be used for the limit check
    for i, (_, row) in enumerate(ds.iterrows()):
        if limit > 0 and i >= limit:
            break
        encoded = encode_sentence(row["text"])  # encode once per row
        input_ids_list.append(encoded["input_ids"])
        attention_mask_list.append(encoded["attention_mask"])
        token_type_ids_list.append(encoded["token_type_ids"])
        label_list.append(row["sentiment"])
    return tf.data.Dataset.from_tensor_slices((
        input_ids_list,
        attention_mask_list,
        token_type_ids_list,
        label_list
    )).map(map_example_to_dict)
EPOCH = 1
BATCH_SIZE = 42
LEARNING_RATE = 1e-5

df_train_shuffled = df_train.sample(frac=1, random_state=42)
train_data = encode_dataset(df_train_shuffled).batch(BATCH_SIZE)
val_data = encode_dataset(df_val).batch(BATCH_SIZE)
test_data = encode_dataset(df_test).batch(BATCH_SIZE)
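# Sanity check, a minimal sketch: inspect one batch to confirm the pipeline
# yields (features, label) pairs with sequences padded to MAX_LEN.
features, batch_labels = next(iter(train_data))
print(features["input_ids"].shape)  # expected: (BATCH_SIZE, MAX_LEN)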
model = TFBertForSequenceClassification.from_pretrained(PRETRAINED_MODEL, num_labels=2)
model.summary()

optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer, loss=loss, metrics=[metric])
# batch_size must not be passed to fit() when the input is an already-batched
# tf.data.Dataset; Keras raises a ValueError otherwise
history = model.fit(
    train_data,
    epochs=EPOCH,
    validation_data=val_data
)
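# With more than one epoch, early stopping helps avoid overfitting on this
# small dataset. A minimal sketch, not part of the original single-epoch run:
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=2, restore_best_weights=True
)
# history = model.fit(train_data, epochs=5, validation_data=val_data,
#                     callbacks=[early_stop])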
# Evaluate the model on the held-out test set (df_test labels were already
# mapped to numeric values alongside the train and validation splits above)
model.evaluate(test_data)
y_pred = model.predict(test_data)
y_actual = np.concatenate([y for x, y in test_data], axis=0)
labels = ["negative", "positive"]
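# classification_report and confusion_matrix are imported above but never
# used; a minimal sketch of a fuller evaluation, assuming model.predict
# returns an output object with a .logits attribute (newer transformers
# versions; older versions return a plain tuple of arrays):
y_pred_labels = np.argmax(y_pred.logits, axis=1)
print(classification_report(y_actual, y_pred_labels, target_names=labels))
sns.heatmap(confusion_matrix(y_actual, y_pred_labels),
            annot=True, fmt="d",
            xticklabels=labels, yticklabels=labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()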
def predict(text):
    encoded = encode_sentence(text)  # encode once instead of three times
    input_ids = tf.expand_dims(encoded["input_ids"], 0)
    attention_mask = tf.expand_dims(encoded["attention_mask"], 0)
    token_type_ids = tf.expand_dims(encoded["token_type_ids"], 0)
    # Pass the inputs as a dict keyed by the names the model expects
    outputs = model({
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids,
    })
    return labels[np.argmax(tf.nn.softmax(outputs.logits, axis=1).numpy()[0])]
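# Usage sketch; the predicted label depends on the trained weights:
print(predict("sangat gak bagus pak ahok"))  # e.g. "negative"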