from streamlit_pandas_profiling import st_profile_report
from ydata_profiling import ProfileReport
import streamlit as st
import pandas as pd
from newspaper import Article, Config
from langdetect import detect
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from deep_translator import GoogleTranslator
import torch
import logging
from gnews import GNews
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from urllib.parse import urlparse
import nltk

# newspaper's article.nlp() needs the punkt tokenizer
nltk.download('punkt')

## ............................................... ##
# Set page configuration (must be the first Streamlit call in the script)
st.set_page_config(page_title='News Scraping', layout='wide', page_icon=':rocket:')

with st.container():
    # Initialize Streamlit app
    st.title('News Article Scraping')
    st.write("Created by Bayhaqy")

## ............................................... ##
# Set up logging
logging.basicConfig(filename='news_processing.log', level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

## ............................................... ##
# Load the sentiment model and tokenizer once and cache them across reruns
@st.cache_resource
def get_models_and_tokenizers():
    model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    model.eval()  # inference mode: disables dropout
    return model, tokenizer

# Sentiment / trust analysis with DistilBERT. Results are cached per text;
# the underscore prefix on _model and _tokenizer tells Streamlit not to hash
# those arguments.
@st.cache_data
def analyze_sentiment_distilbert(text, _model, _tokenizer):
    try:
        tokens_info = _tokenizer(text, truncation=True, return_tensors="pt")
        with torch.no_grad():
            raw_predictions = _model(**tokens_info).logits

        predicted_class_id = raw_predictions.argmax().item()
        predict = _model.config.id2label[predicted_class_id]

        # Heuristic trust label from the positive-class probability (0-100)
        softmaxed = int(torch.nn.functional.softmax(raw_predictions[0], dim=0)[1] * 100)
        if softmaxed > 70:
            status = 'Not trust'
        elif softmaxed > 40:
            status = 'Not sure'
        else:
            status = 'Trust'
        return status, predict
    except Exception as e:
        logging.error(f"Sentiment analysis error: {str(e)}")
        return 'N/A', 'N/A'

# Sentiment analysis with VADER
@st.cache_data
def analyze_sentiment_vader(text):
    analyzer = SentimentIntensityAnalyzer()
    compound_score = analyzer.polarity_scores(text)['compound']
    if compound_score >= 0.05:
        return 'Positive'
    elif compound_score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# Sentiment analysis with TextBlob
@st.cache_data
def analyze_sentiment_textblob(text):
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'Positive'
    elif polarity < 0:
        return 'Negative'
    else:
        return 'Neutral'

## ............................................... ##
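# Illustration (a hedged sketch, kept in comments so the app's behavior is
# unchanged): how the three analyzers above label the same English sentence.
# The DistilBERT tuple is (status, label) per analyze_sentiment_distilbert;
# the exact values depend on the downloaded model weights.
#
#   model, tokenizer = get_models_and_tokenizers()
#   analyze_sentiment_distilbert("The markets rallied today.", model, tokenizer)
#   # -> e.g. ('Not trust', 'POSITIVE')   # positive prob > 70 maps to 'Not trust'
#   analyze_sentiment_vader("The markets rallied today.")      # -> 'Positive'
#   analyze_sentiment_textblob("The markets rallied today.")   # -> 'Positive'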
# Download, parse, and summarize a single article (cached per URL)
@st.cache_data
def process_article(url, _config):
    try:
        article = Article(url=url, config=_config)
        article.download()
        article.parse()

        # Skip articles without a publish date
        if article.publish_date is None:
            return None

        # Skip articles with essentially no text
        if len(article.text) <= 5:
            return None

        # Collect the article data
        text = article.text
        url = article.canonical_link
        source_url = urlparse(url).netloc
        title = article.title
        authors = article.authors
        publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M')

        article.nlp()
        keywords = article.meta_keywords
        summary = article.summary
        language = detect(title)

        return publish_date, language, url, source_url, title, authors, keywords, text, summary
    except Exception as e:
        logging.error(f"Article processing error: {str(e)}")
        return None

# Translate text to English (no-op when source == target)
@st.cache_data
def translate_text(text, source='auto', target='en'):
    try:
        if source != target:
            text = GoogleTranslator(source=source, target=target).translate(text)
        return text
    except Exception as e:
        logging.error(f"Translation error: {str(e)}")
        return text

## ............................................... ##
with st.container():
    # Input search parameters
    search_term = st.text_input('Enter a search term :', 'Indonesia')

    col1, col2, col3 = st.columns(3)
    with col1:
        period = st.text_input('Enter a news period :', '7d')
        max_results = st.number_input('Maximum number of results :', min_value=1, value=10)
    with col2:
        country = st.text_input('Country :', 'Indonesia')
        language = st.text_input('Language :', 'indonesian')
    with col3:
        start_date = st.date_input('Start Date :', pd.to_datetime('2023-01-01'))
        end_date = st.date_input('End Date :', pd.to_datetime('2023-12-01'))

## ............................................... ##
with st.container():
    col1, col2 = st.columns(2)
    with col1:
        # Checkbox options for the optional processing steps
        include_translation = st.checkbox("Include Translation", value=False)
        include_sentiment_analysis = st.checkbox("Include Sentiment Analysis", value=False)
    with col2:
        include_sentiment_vader = st.checkbox("Include VADER Sentiment Analysis", value=False)
        include_sentiment_textblob = st.checkbox("Include TextBlob Sentiment Analysis", value=False)

## ............................................... ##
# Track whether the data has been processed
data_processed = False

## ............................................... ##
# Custom newspaper configuration
config = Config()
config.number_threads = 200
config.request_timeout = 3

## ............................................... ##
# Initialize the results DataFrame
df = pd.DataFrame(columns=['Publish_Date', 'Language', 'URL', 'Source_Url', 'Title',
                           'Authors', 'Keywords', 'Text', 'Summary'])

# Initialize the model and tokenizer
model, tokenizer = get_models_and_tokenizers()

## ............................................... ##
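# Illustration (a minimal sketch in comments; the URL is hypothetical):
# process_article returns a 9-tuple on success and None when the article has
# no publish date, (almost) no text, or fails to download.
#
#   result = process_article('https://example.com/some-article', _config=config)
#   if result is not None:
#       publish_date, lang, url, source, title, authors, keywords, text, summary = result
#       translate_text(title, source=lang, target='en')  # no-op when lang == 'en'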
with st.container():
    # Fetch news and process articles
    if st.button('Fetch and Process News'):
        # Configure the Google News client from the inputs above
        google_news = GNews()
        google_news.period = period            # e.g. '7d' for the last 7 days
        google_news.max_results = max_results  # number of results for the keyword
        google_news.country = country          # news from a specific country
        google_news.language = language        # news in a specific language
        #google_news.exclude_websites = ['yahoo.com', 'cnn.com']  # optionally exclude specific websites
        google_news.start_date = (start_date.year, start_date.month, start_date.day)
        google_news.end_date = (end_date.year, end_date.month, end_date.day)

        news = google_news.get_news(search_term)

        ## ............................................... ##
        # Progress bar for fetching and processing news
        progress_bar = st.progress(0)
        total_news = len(news)

        # 'news' is a list of dicts; each entry's 'url' points at the article
        for idx, x in enumerate(news):
            result = process_article(x['url'], _config=config)
            if result is not None:
                publish_date, language, url, source_url, title, authors, keywords, text, summary = result
                temp_df = pd.DataFrame({'Publish_Date': [publish_date], 'Language': [language], 'URL': [url],
                                        'Source_Url': [source_url], 'Title': [title], 'Authors': [authors],
                                        'Keywords': [keywords], 'Text': [text], 'Summary': [summary]})
                df = pd.concat([df, temp_df], ignore_index=True)

            # Update the progress bar
            progress_bar.progress((idx + 1) / total_news)

        # Conditionally add a 'Translation' column
        if include_translation:
            df['Translation'] = df.apply(lambda row: translate_text((row['Title'] + ' | ' + row['Summary']),
                                                                    source=row['Language'], target='en'), axis=1)

        # The sentiment steps below read from 'Translation'; if translation was
        # skipped, fall back to the untranslated title and summary so they
        # don't raise a KeyError
        if (include_sentiment_analysis or include_sentiment_vader or include_sentiment_textblob) \
                and 'Translation' not in df.columns:
            df['Translation'] = df['Title'] + ' | ' + df['Summary']

        # Conditionally apply the DistilBERT trust/sentiment analysis
        if include_sentiment_analysis:
            df[['Fake_Check', 'Sentiment_Distilbert']] = df['Translation'].apply(
                lambda text: pd.Series(analyze_sentiment_distilbert(text, model, tokenizer)))

        # Conditionally apply VADER sentiment analysis
        if include_sentiment_vader:
            df['Sentiment_VADER'] = df['Translation'].apply(analyze_sentiment_vader)

        # Conditionally apply TextBlob sentiment analysis
        if include_sentiment_textblob:
            df['Sentiment_TextBlob'] = df['Translation'].apply(analyze_sentiment_textblob)

        # Mark the data as processed
        data_processed = True

## ............................................... ##
# Offer the processed data as a CSV download
if data_processed:
    st.markdown("### Download Processed Data as CSV")
    st.write("Click the button below to download the processed data as a CSV file.")
    csv_data = df.to_csv(index=False).encode()
    st.download_button(
        label="Download CSV",
        data=csv_data,
        file_name="processed_data.csv",
    )

with st.expander("See Table"):
    ## ............................................... ##
    # Display the processed data
    if data_processed:
        st.dataframe(df)

with st.expander("See EDA"):
    ## ............................................... ##
    # Generate an exploratory profiling report
    if data_processed:
        pr = ProfileReport(df)
        st_profile_report(pr)
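## ............................................... ##
# To run the app locally with the standard Streamlit CLI (the filename is
# whatever this script is saved as):
#
#   streamlit run app.py
#
# Note: ProfileReport in the "See EDA" expander can be slow on large result sets.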