import json

import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

# Path to the News Category dataset (one JSON object per line)
file_path = 'news classifier/News_Category_Dataset_v3.json'

# Lists to store data
links = []
headlines = []
categories = []
short_descriptions = []
authors = []
dates = []

# Read the file line by line; each line is a standalone JSON record
with open(file_path, 'r') as file:
    for line in file:
        data = json.loads(line)
        links.append(data['link'])
        headlines.append(data['headline'])
        categories.append(data['category'])
        short_descriptions.append(data['short_description'])
        authors.append(data['authors'])
        dates.append(data['date'])

# Create a DataFrame
df = pd.DataFrame({
    'link': links,
    'headline': headlines,
    'category': categories,
    'short_description': short_descriptions,
    'authors': authors,
    'date': dates
})

# Combine headline and short_description into a single input text
df['text'] = df['headline'] + ' ' + df['short_description']

x = df['text']
y = df['category']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# One-hot encode the category labels
label_binarizer = LabelBinarizer()
y_train_one_hot = label_binarizer.fit_transform(y_train)
y_test_one_hot = label_binarizer.transform(y_test)

# Batch size
batch_size = 32

# Text vectorization: map raw strings to integer token sequences
VOCAB_SIZE = 20000  # Increased vocabulary size
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(X_train.to_list())


# Simple additive attention: scores each timestep, then returns the
# attention-weighted sum of the LSTM outputs as a fixed-size vector.
class AttentionLayer(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)
        # Accept the mask produced by the Embedding layer (mask_zero=True)
        self.supports_masking = True

    def build(self, input_shape):
        # Trainable projection and bias for the attention scores.
        # The bias is a scalar so the layer works with variable-length
        # sequences (the timestep dimension is None at build time).
        self.W = self.add_weight(name='att_weight',
                                 shape=(input_shape[-1], 1),
                                 initializer='random_normal')
        self.b = self.add_weight(name='att_bias',
                                 shape=(1,),
                                 initializer='zeros')
        super(AttentionLayer, self).build(input_shape)

    def call(self, x, mask=None):
        # x: (batch, time, features)
        et = tf.nn.tanh(tf.matmul(x, self.W) + self.b)  # (batch, time, 1)
        if mask is not None:
            # Push padded timesteps to a large negative score so they
            # receive ~zero attention weight after the softmax
            mask = tf.cast(tf.expand_dims(mask, axis=-1), et.dtype)
            et += (1.0 - mask) * -1e9
        at = tf.nn.softmax(et, axis=1)                  # weights over time
        at = tf.transpose(at, perm=[0, 2, 1])           # (batch, 1, time)
        output = tf.matmul(at, x)                       # (batch, 1, features)
        return tf.squeeze(output, axis=1)               # (batch, features)

    def compute_mask(self, inputs, mask=None):
        # The output is a fixed-size vector, so no mask is propagated
        return None

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])


# Model: BiLSTM with attention pooling and dropout to reduce overfitting
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(input_dim=len(encoder.get_vocabulary()),
                              output_dim=256,
                              mask_zero=True),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(256, return_sequences=True, dropout=0.5)),
    AttentionLayer(),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(len(label_binarizer.classes_), activation='softmax')
])

# Compile the model (1e-3 is Adam's default learning rate)
model.compile(loss='categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
              metrics=['accuracy'])

# Build tf.data pipelines; convert the pandas Series to NumPy so
# from_tensor_slices receives plain string arrays, and shuffle the
# training set each epoch
train_dataset = (tf.data.Dataset
                 .from_tensor_slices((X_train.to_numpy(), y_train_one_hot))
                 .shuffle(buffer_size=10000)
                 .batch(batch_size))
test_dataset = (tf.data.Dataset
                .from_tensor_slices((X_test.to_numpy(), y_test_one_hot))
                .batch(batch_size))

history = model.fit(train_dataset, epochs=10, validation_data=test_dataset)

# Save the model (TensorFlow SavedModel format)
model.save('news_classifier_optimized')

# ~63% validation accuracy
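

# --- Usage sketch: reload the SavedModel and classify a new headline.
# Assumptions: TF 2.x, and `AttentionLayer` / `label_binarizer` are still
# in scope as above; in a fresh session the binarizer would need to be
# persisted separately (e.g. with pickle), since it is not stored inside
# the SavedModel. The sample headline is purely illustrative.
import numpy as np

# Passing custom_objects is the safe route for the custom layer
loaded_model = tf.keras.models.load_model(
    'news_classifier_optimized',
    custom_objects={'AttentionLayer': AttentionLayer})

sample_texts = np.array(['Stocks rally as inflation cools more than expected'])
probs = loaded_model.predict(sample_texts)          # shape: (1, num_classes)
predicted_category = label_binarizer.classes_[probs.argmax(axis=-1)]
print(predicted_category)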