import pickle
import nltk
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
from nltk.tokenize import word_tokenize
from datasets import load_dataset
import numpy as np
from tqdm import tqdm
import gradio as gr
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import KFold

nltk.download('stopwords')
nltk.download('punkt_tab')

SW = set(nltk.corpus.stopwords.words("english"))
PUNCT = set([".", ",", "!", "?", ":", ";", "-", "(", ")", "[", "]", "{", "}", "'", '"'])

Features_count = 6  # number of features returned by NEI.vectorize()
SEED = 42


class NEI:
    def __init__(self):
        self.model = None
        self.scaler = StandardScaler()
        self.vectorizer = DictVectorizer(sparse=True)
        # Label order matches sklearn's sorted class values (0 = No-Name, 1 = Name),
        # so the confusion-matrix ticks line up with the matrix rows/columns.
        self.tagset = ['No-Name[0]', 'Name[1]']

    def load_dataset(self, file):
        """Read a CoNLL-style file (word POS chunk tag per line) into a list of sentences."""
        sentences = []
        sentence = []
        with open(file, 'r', encoding='utf-8') as fp:
            for line in fp:
                if line.strip() == "":
                    if sentence:
                        sentences.append(sentence)
                        sentence = []
                    continue
                word_info = line.strip().split()
                if len(word_info) != 4:
                    continue
                word, pos, chunk, nei = word_info
                sentence.append((word, pos, nei))
        if sentence:
            sentences.append(sentence)
        return sentences

    def sent2features(self, sentence):
        return [self.word2features(sentence, i) for i in range(len(sentence))]

    def sent2labels(self, sentence):
        return [label for _, _, label in sentence]

    def word2features(self, sentence, i):
        """Build a dictionary of orthographic and contextual features for token i."""
        word = sentence[i][0]
        pos_tag = sentence[i][1]
        features = {
            'word': word,
            'pos_tag': pos_tag,
            'word.isupper': int(word.isupper()),
            'word.islower': int(word.islower()),
            'word.istitle': int(word.istitle()),
            'word.isdigit': int(word.isdigit()),
            'word.prefix2': word[:2],
            'word.prefix3': word[:3],
            'word.suffix2': word[-2:],
            'word.suffix3': word[-3:],
        }
        # Add context features from the previous and next tokens
        if i > 0:
            prv_word = sentence[i - 1][0]
            prv_pos_tag = sentence[i - 1][1]
            features.update({
                '-1:word': prv_word,
                '-1:pos_tag': prv_pos_tag,
                '-1:word.isupper': int(prv_word.isupper()),
                '-1:word.istitle': int(prv_word.istitle()),
            })
        else:
            features['BOS'] = True
        if i < len(sentence) - 1:
            next_word = sentence[i + 1][0]
            next_pos_tag = sentence[i + 1][1]
            features.update({
                '+1:word': next_word,
                '+1:pos_tag': next_pos_tag,
                '+1:word.isupper': int(next_word.isupper()),
                '+1:word.istitle': int(next_word.istitle()),
            })
        else:
            features['EOS'] = True
        return features

    def performance(self, y_true, y_pred):
        """Print the classification report and weighted precision/recall/F-beta scores."""
        print(classification_report(y_true, y_pred))
        precision = metrics.precision_score(y_true, y_pred, average='weighted', zero_division=0)
        recall = metrics.recall_score(y_true, y_pred, average='weighted', zero_division=0)
        f05_score = metrics.fbeta_score(y_true, y_pred, beta=0.5, average='weighted', zero_division=0)
        f1_score = metrics.fbeta_score(y_true, y_pred, beta=1, average='weighted', zero_division=0)
        f2_score = metrics.fbeta_score(y_true, y_pred, beta=2, average='weighted', zero_division=0)
        print(f"Average Precision = {precision:.2f}, Average Recall = {recall:.2f}, "
              f"Average F0.5-Score = {f05_score:.2f}, Average F1-Score = {f1_score:.2f}, "
              f"Average F2-Score = {f2_score:.2f}")

    def confusion_matrix(self, y_true, y_pred):
        """Plot a row-normalized confusion matrix and save it to Confusion_Matrix.png."""
        matrix = metrics.confusion_matrix(y_true, y_pred)
        normalized_matrix = matrix / np.sum(matrix, axis=1, keepdims=True)
        _, ax = plt.subplots()
        ax.tick_params(top=True)
        plt.xticks(np.arange(len(self.tagset)), self.tagset)
        plt.yticks(np.arange(len(self.tagset)), self.tagset)
        plt.imshow(normalized_matrix, interpolation='nearest', cmap=plt.cm.GnBu)
        for i in range(normalized_matrix.shape[0]):
            for j in range(normalized_matrix.shape[1]):
                plt.text(j, i, format(normalized_matrix[i, j], '0.2f'),
                         horizontalalignment="center")
        plt.colorbar()
        plt.savefig('Confusion_Matrix.png')

    def vectorize(self, w, scaled_position):
        """Map a single token to its numeric feature vector (Features_count values)."""
        title = 1 if w[0].isupper() else 0
        allcaps = 1 if w.isupper() else 0
        sw = 1 if w.lower() in SW else 0
        punct = 1 if w in PUNCT else 0
        return [title, allcaps, len(w), sw, punct, scaled_position]

    def create_data(self, data):
        """Convert a Hugging Face dataset split into word, feature, and binary-label arrays."""
        words, features, labels = [], [], []
        for d in tqdm(data):
            tags = d["ner_tags"]
            tokens = d["tokens"]
            for i, token in enumerate(tokens):
                x = self.vectorize(token, scaled_position=(i / len(tokens)))
                y = 1 if tags[i] > 0 else 0  # any non-O tag counts as a name
                features.append(x)
                labels.append(y)
            words.extend(tokens)
        return (np.array(words, dtype="object"),
                np.array(features, dtype=np.float32),
                np.array(labels, dtype=np.float32))

    def train(self, train_dataset):
        _, X_train, y_train = self.create_data(train_dataset)
        self.scaler.fit(X_train)
        X_train = self.scaler.transform(X_train)
        self.model = SVC(C=1.0, kernel="linear", class_weight="balanced",
                         random_state=SEED, verbose=True)
        self.model.fit(X_train, y_train)

    def evaluate(self, val_data):
        _, X_val, y_val = self.create_data(val_data)
        X_val = self.scaler.transform(X_val)
        y_pred_val = self.model.predict(X_val)
        # print(classification_report(y_true=y_val, y_pred=y_pred_val))
        self.confusion_matrix(y_val, y_pred_val)
        self.performance(y_val, y_pred_val)

    def infer(self, sentence):
        """Tokenize a raw sentence and return (token, predicted_label) pairs."""
        tokens = word_tokenize(sentence)
        features = [self.vectorize(token, i / len(tokens)) for i, token in enumerate(tokens)]
        features = np.array(features, dtype=np.float32)
        scaled_features = self.scaler.transform(features)
        y_pred = self.model.predict(scaled_features)
        return list(zip(tokens, y_pred))


data = load_dataset("conll2003", trust_remote_code=True)

nei_model = NEI()

# Training the model
nei_model.train(data["train"])

# Evaluating the model
nei_model.evaluate(data["validation"])


def annotate(text):
    predictions = nei_model.infer(text)
    annotated_output = " ".join(f"{word}_{int(label)}" for word, label in predictions)
    return annotated_output


interface = gr.Interface(
    fn=annotate,
    inputs=gr.Textbox(
        label="Input Sentence",
        placeholder="Enter your sentence here...",
    ),
    outputs=gr.Textbox(
        label="Tagged Output",
        placeholder="Tagged sentence appears here...",
    ),
    title="Named Entity Recognition",
    description="CS626 Assignment 2 (Autumn 2024)",
    theme=gr.themes.Soft(),
)

interface.launch()
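

# Persistence sketch (an addition, not part of the original pipeline): `pickle` is
# imported above but never used. If the fitted SVC and scaler are to be reused
# without retraining, they could be dumped and restored roughly as below. The helper
# names and the "nei_model.pkl" path are illustrative assumptions; in practice the
# dump call would sit right after nei_model.evaluate(...), since interface.launch()
# blocks until the app is stopped.
def save_model(path="nei_model.pkl"):
    # Store the trained classifier together with the scaler fitted on the training data.
    with open(path, "wb") as fp:
        pickle.dump({"model": nei_model.model, "scaler": nei_model.scaler}, fp)


def load_model(path="nei_model.pkl"):
    # Rebuild an NEI instance whose infer() works without retraining.
    restored = NEI()
    with open(path, "rb") as fp:
        state = pickle.load(fp)
    restored.model = state["model"]
    restored.scaler = state["scaler"]
    return restored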