import json

import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

# Path to the News Category dataset (one JSON object per line)
file_path = 'news classifier/News_Category_Dataset_v3.json'

# Lists to store data
links = []
headlines = []
categories = []
short_descriptions = []
authors = []
dates = []

# Read the file line by line; each line is a standalone JSON record
with open(file_path, 'r') as file:
    for line in file:
        data = json.loads(line)
        links.append(data['link'])
        headlines.append(data['headline'])
        categories.append(data['category'])
        short_descriptions.append(data['short_description'])
        authors.append(data['authors'])
        dates.append(data['date'])

# Create a DataFrame
df = pd.DataFrame({
    'link': links,
    'headline': headlines,
    'category': categories,
    'short_description': short_descriptions,
    'authors': authors,
    'date': dates
})

# Combine headline and short_description into a single input text
df['text'] = df['headline'] + ' ' + df['short_description']

x = df['text']
y = df['category']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# One-hot encode the category labels
label_binarizer = LabelBinarizer()
y_train_one_hot = label_binarizer.fit_transform(y_train)
y_test_one_hot = label_binarizer.transform(y_test)

# Batch size
batch_size = 32

# Text vectorization: map raw strings to integer token sequences
VOCAB_SIZE = 20000  # Increased vocabulary size
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(X_train.to_list())


# Simple additive attention: scores each timestep, then returns the
# attention-weighted sum of the LSTM outputs as a fixed-size vector.
class AttentionLayer(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)
        # Accept the mask produced by the Embedding layer (mask_zero=True)
        self.supports_masking = True

    def build(self, input_shape):
        # Trainable projection and bias for the attention scores.
        # The bias is a scalar so the layer works with variable-length
        # sequences (the timestep dimension is None at build time).
        self.W = self.add_weight(name='att_weight',
                                 shape=(input_shape[-1], 1),
                                 initializer='random_normal')
        self.b = self.add_weight(name='att_bias',
                                 shape=(1,),
                                 initializer='zeros')
        super(AttentionLayer, self).build(input_shape)

    def call(self, x, mask=None):
        # x: (batch, time, features)
        et = tf.nn.tanh(tf.matmul(x, self.W) + self.b)  # (batch, time, 1)
        if mask is not None:
            # Push padded timesteps to a large negative score so they
            # receive ~zero attention weight after the softmax
            mask = tf.cast(tf.expand_dims(mask, axis=-1), et.dtype)
            et += (1.0 - mask) * -1e9
        at = tf.nn.softmax(et, axis=1)                  # weights over time
        at = tf.transpose(at, perm=[0, 2, 1])           # (batch, 1, time)
        output = tf.matmul(at, x)                       # (batch, 1, features)
        return tf.squeeze(output, axis=1)               # (batch, features)

    def compute_mask(self, inputs, mask=None):
        # The output is a fixed-size vector, so no mask is propagated
        return None

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])


# Model: BiLSTM with attention pooling and dropout to reduce overfitting
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(input_dim=len(encoder.get_vocabulary()),
                              output_dim=256,
                              mask_zero=True),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(256, return_sequences=True, dropout=0.5)),
    AttentionLayer(),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(len(label_binarizer.classes_), activation='softmax')
])

# Compile the model (1e-3 is Adam's default learning rate)
model.compile(loss='categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
              metrics=['accuracy'])

# Build tf.data pipelines; convert the pandas Series to NumPy so
# from_tensor_slices receives plain string arrays, and shuffle the
# training set each epoch
train_dataset = (tf.data.Dataset
                 .from_tensor_slices((X_train.to_numpy(), y_train_one_hot))
                 .shuffle(buffer_size=10000)
                 .batch(batch_size))
test_dataset = (tf.data.Dataset
                .from_tensor_slices((X_test.to_numpy(), y_test_one_hot))
                .batch(batch_size))

history = model.fit(train_dataset, epochs=10, validation_data=test_dataset)

# Save the model (TensorFlow SavedModel format)
model.save('news_classifier_optimized')

# ~63% validation accuracy
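

# --- Usage sketch: reload the SavedModel and classify a new headline.
# Assumptions: TF 2.x, and `AttentionLayer` / `label_binarizer` are still
# in scope as above; in a fresh session the binarizer would need to be
# persisted separately (e.g. with pickle), since it is not stored inside
# the SavedModel. The sample headline is purely illustrative.
import numpy as np

# Passing custom_objects is the safe route for the custom layer
loaded_model = tf.keras.models.load_model(
    'news_classifier_optimized',
    custom_objects={'AttentionLayer': AttentionLayer})

sample_texts = np.array(['Stocks rally as inflation cools more than expected'])
probs = loaded_model.predict(sample_texts)          # shape: (1, num_classes)
predicted_category = label_binarizer.classes_[probs.argmax(axis=-1)]
print(predicted_category)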