import streamlit as st
import pandas as pd
from newspaper import Article, Config
from langdetect import detect
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from deep_translator import GoogleTranslator
import torch
import logging
from gnews import GNews
import nltk

# newspaper's article.nlp() relies on the NLTK punkt tokenizer
nltk.download('punkt')

## ............................................... ##
# Set page configuration (call this once, before any other Streamlit command)
st.set_page_config(page_title='News Scraping', layout='wide', page_icon=':rocket:')

## ............................................... ##
# Set up logging
logging.basicConfig(filename='news_processing.log', level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Initialize the DataFrame
df = pd.DataFrame(columns=['Publish_Date', 'Language', 'URL', 'Title', 'Authors',
                           'Keywords', 'Summary', 'Text', 'Translate', 'Status', 'Sentiment'])

## ............................................... ##
# Function for translation
def translate_text(text, source='auto', target='en'):
    try:
        if source != target:
            text = GoogleTranslator(source=source, target=target).translate(text)
        return text
    except Exception as e:
        logging.error(f"Translation error: {str(e)}")
        return text

# Function for sentiment analysis
def predict_sentiment(text, model, tokenizer):
    try:
        tokens_info = tokenizer(text, truncation=True, return_tensors="pt")
        with torch.no_grad():
            raw_predictions = model(**tokens_info).logits

        predicted_class_id = raw_predictions.argmax().item()
        predict = model.config.id2label[predicted_class_id]

        # Heuristic trust score: the probability of class 1, scaled to 0-100
        softmaxed = int(torch.nn.functional.softmax(raw_predictions[0], dim=0)[1] * 100)
        if softmaxed > 70:
            status = 'Not trust'
        elif softmaxed > 40:
            status = 'Not sure'
        else:
            status = 'Trust'
        return status, predict
    except Exception as e:
        logging.error(f"Sentiment analysis error: {str(e)}")
        return 'N/A', 'N/A'

# Cache the model and tokenizer so they load once per session, not on every rerun
@st.cache_resource
def get_models_and_tokenizers():
    model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

# Function to process an article (model and tokenizer are passed in explicitly
# rather than read from globals)
def process_article(url, config, model, tokenizer):
    try:
        article = Article(url=url, config=config)
        article.download()
        article.parse()

        # Get the article data
        title = article.title
        authors = article.authors

        # Check that publish_date is not None before formatting it
        publish_date = article.publish_date
        if publish_date is not None:
            publish_date = publish_date.strftime('%Y-%m-%d %H:%M:%S%z')
        else:
            publish_date = 'N/A'

        text = article.text

        article.nlp()
        keywords = article.keywords
        summary = article.summary

        concated_text = title + '| ' + summary
        language = detect(concated_text)
        tl = translate_text(concated_text, source=language, target='en')
        status, predict = predict_sentiment(tl, model, tokenizer)

        return publish_date, language, url, title, authors, keywords, summary, text, tl, status, predict
    except Exception as e:
        logging.error(f"Article processing error: {str(e)}")
        return 'N/A', 'N/A', url, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'

## ............................................... ##
# Initialize Streamlit app
st.title('News Article Scraping')
st.write("Created by Bayhaqy")

## ............................................... ##
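# A quick smoke test of the helpers above, as a sketch (not part of the app flow;
# the sample sentence and expected values below are illustrative assumptions):
#
#   model, tokenizer = get_models_and_tokenizers()
#   status, label = predict_sentiment("The ceasefire brought welcome relief.", model, tokenizer)
#   # status is one of 'Trust' / 'Not sure' / 'Not trust';
#   # label is 'POSITIVE' or 'NEGATIVE' (this model's id2label mapping)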
# Input search parameters
search_term = st.text_input('Enter a search term:', 'palestina')
max_results = st.number_input('Maximum number of results:', min_value=1, value=10)
country = st.text_input('Country:', 'Indonesia')
language = st.text_input('Language:', 'indonesian')
start_date = st.date_input('Start Date', pd.to_datetime('2023-09-01'))

# Track whether the data has been processed. Note: Streamlit reruns this script
# on every interaction, so df and data_processed reset on each rerun;
# st.session_state could be used to persist them across reruns.
data_processed = False

# Fetch news and process articles
if st.button('Fetch and Process News'):
    # Fetch news metadata from Google News
    google_news = GNews()
    google_news.max_results = max_results
    google_news.country = country
    google_news.language = language
    google_news.start_date = (start_date.year, start_date.month, start_date.day)
    news = google_news.get_news(search_term)

    # Initialize the model and tokenizer (cached across reruns)
    model, tokenizer = get_models_and_tokenizers()

    # Create a custom configuration to disable SSL certificate verification
    config = Config()
    config.ignore_ssl = True

    # Process articles
    for x in news:
        (publish_date, language, url, title, authors, keywords, summary,
         text, tl, status, predict) = process_article(x['url'], config, model, tokenizer)
        temp_df = pd.DataFrame({'Publish_Date': [publish_date], 'Language': [language],
                                'URL': [url], 'Title': [title], 'Authors': [authors],
                                'Keywords': [keywords], 'Summary': [summary], 'Text': [text],
                                'Translate': [tl], 'Status': [status], 'Sentiment': [predict]})
        df = pd.concat([df, temp_df], ignore_index=True)

    # Mark the data as successfully processed
    data_processed = True

# Add a button to download the data as a CSV file
if data_processed:
    st.markdown("### Download Processed Data as CSV")
    st.write("Click the button below to download the processed data as a CSV file.")

    # Create a downloadable CSV
    csv_data = df.to_csv(index=False).encode()
    st.download_button(
        label="Download CSV",
        data=csv_data,
        file_name="processed_data.csv",
    )

# Display processed data
if data_processed:
    st.write(df.head())
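## ............................................... ##
# Usage (a sketch, assuming this file is saved as app.py; exact package names
# and versions may differ in your environment):
#   pip install streamlit pandas newspaper3k langdetect transformers torch deep-translator gnews nltk
#   streamlit run app.py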