Spaces:

Bayhaqy
/

Classification-News-Analysis-and-Prediction

Sleeping

App Files Files Community

Bayhaqy committed on Oct 12, 2023

Commit

6593698

1 Parent(s): ce083b1

Upload News_Scrapping.py

Browse files

Files changed (1) hide show

pages/News_Scrapping.py +165 -0

pages/News_Scrapping.py ADDED Viewed

	@@ -0,0 +1,165 @@

# Standard library
import logging

# Third-party
import nltk
import pandas as pd
import requests
import streamlit as st
import torch
from deep_translator import GoogleTranslator
from gnews import GNews
from langdetect import detect
from newspaper import Article, Config
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Fetch the NLTK 'punkt' data up front.
# NOTE(review): presumably required by article.nlp() further down - confirm.
nltk.download('punkt')

## ............................................... ##
# Page configuration (call this once and adjust the values as needed)
st.set_page_config(
    page_title='News Scrapping',
    layout='wide',
    page_icon=':rocket:',
)

## ............................................... ##
# Logging: INFO and above is written to news_processing.log
logging.basicConfig(
    filename='news_processing.log',
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)

# Empty result table; one row per processed article is added later on
df = pd.DataFrame(
    columns=[
        'Publish_Date',
        'Language',
        'URL',
        'Title',
        'Authors',
        'Keywords',
        'Summary',
        'Text',
        'Translate',
        'Status',
        'Sentiment',
    ]
)
## ............................................... ##
# Function for translation
def translate_text(text, source='auto', target='en'):
    """Translate ``text`` from ``source`` to ``target`` using GoogleTranslator.

    This is a best-effort helper: it never raises. When there is nothing to
    translate, when ``source`` equals ``target``, or when the translation
    call fails, the input ``text`` is returned unchanged.

    Args:
        text: Text to translate.
        source: Source language code, or ``'auto'``.
        target: Target language code.

    Returns:
        The translated text, or the original ``text`` as described above.
    """
    # Nothing to translate: skip the translator call for None/empty/blank input.
    if not text or (isinstance(text, str) and not text.strip()):
        return text
    if source == target:
        return text
    try:
        return GoogleTranslator(source=source, target=target).translate(text)
    except Exception as e:
        # Deliberate best-effort: record the failure (with traceback) and
        # fall back to the untranslated text.
        logging.exception("Translation error: %s", e)
        return text
# Function for sentiment analysis
def predict_sentiment(text, model, tokenizer):
    """Classify ``text`` and derive a trust status from the class-1 probability.

    Args:
        text: Text to classify.
        model: Callable sequence-classification model; its output must expose
            ``.logits`` and the model must expose ``.config.id2label``.
        tokenizer: Callable that turns ``text`` into the keyword arguments
            passed to ``model``.

    Returns:
        tuple: ``(status, label)``. ``label`` is ``id2label`` of the
        highest-scoring class. ``status`` depends on the softmax probability
        of class index 1, as an integer percentage: above 70 is
        ``'Not trust'``, above 40 is ``'Not sure'``, otherwise ``'Trust'``.
        ``('N/A', 'N/A')`` is returned if anything fails.
    """
    try:
        encoded = tokenizer(text, truncation=True, return_tensors="pt")
        with torch.no_grad():
            logits = model(**encoded).logits
        # A single text is scored, so use the first row for both the label
        # and the probability (keeps the two values consistent).
        row = logits[0]
        label = model.config.id2label[row.argmax().item()]
        class1_pct = int(torch.nn.functional.softmax(row, dim=0)[1] * 100)
    except Exception as e:
        logging.exception("Sentiment analysis error: %s", e)
        return 'N/A', 'N/A'

    if class1_pct > 70:
        status = 'Not trust'
    elif class1_pct > 40:
        status = 'Not sure'
    else:
        status = 'Trust'
    return status, label
def get_models_and_tokenizers():
    """Load the SST-2 DistilBERT sequence classifier and its tokenizer.

    Returns:
        tuple: ``(model, tokenizer)``, with the model switched to eval mode.
    """
    checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    classifier = AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=2,
    )
    # Inference only: put the network into evaluation mode.
    classifier.eval()
    return classifier, tokenizer
# Function to process an article
def process_article(url, config):
    """Download, parse and score a single news article.

    The title and summary are joined, language-detected, translated to
    English with ``translate_text`` and scored with ``predict_sentiment``.

    NOTE: this function reads the module-level names ``model`` and
    ``tokenizer``; they must be defined before it is called.

    Args:
        url: Article URL.
        config: Configuration object handed to ``Article``.

    Returns:
        tuple: ``(publish_date, language, url, title, authors, keywords,
        summary, text, translated, status, sentiment)``. If any step fails,
        every field except ``url`` is ``'N/A'``.
    """
    try:
        article = Article(url=url, config=config)
        article.download()
        article.parse()

        # Data available right after parsing
        title = article.title
        authors = article.authors
        text = article.text

        # publish_date can be None; only format it when a date is present
        published = article.publish_date
        if published is None:
            publish_date = 'N/A'
        else:
            publish_date = published.strftime('%Y-%m-%d %H:%M:%S%z')

        # keywords and summary are read after the nlp() step
        article.nlp()
        keywords = article.keywords
        summary = article.summary

        concated_text = title + '| ' + summary
        language = detect(concated_text)
        tl = translate_text(concated_text, source=language, target='en')
        status, predict = predict_sentiment(tl, model, tokenizer)
        return (publish_date, language, url, title, authors, keywords,
                summary, text, tl, status, predict)
    except Exception as e:
        # Best-effort: log which URL failed (with traceback) and return a
        # placeholder row so the caller can keep going.
        logging.exception("Article processing error for %s: %s", url, e)
        missing = 'N/A'
        return (missing, missing, url) + (missing,) * 8
## ............................................... ##
# Initialize Streamlit app
st.title('News Article Scrapping')
st.write("Created by Bayhaqy")
## ............................................... ##
# Input search parameters
search_term = st.text_input('Enter a search term:', 'palestina')
max_results = st.number_input('Maximum number of results:', min_value=1, value=10)
country = st.text_input('Country:', 'Indonesia')
language = st.text_input('Language:', 'indonesian')
start_date = st.date_input('Start Date', pd.to_datetime('2023-09-01'))

# Fetch news and process articles
if st.button('Fetch and Process News'):
    # Configure the Google News query from the widgets above
    google_news = GNews()
    google_news.max_results = max_results
    google_news.country = country
    google_news.language = language
    google_news.start_date = (start_date.year, start_date.month, start_date.day)
    news = google_news.get_news(search_term) or []

    # process_article() reads these two module-level names
    model, tokenizer = get_models_and_tokenizers()

    # Custom newspaper configuration with the ignore_ssl flag set
    config = Config()
    config.ignore_ssl = True

    # One result tuple per article, in the same order as the df columns;
    # build the table once instead of concatenating inside the loop.
    rows = [process_article(item['url'], config) for item in news]
    df = pd.DataFrame(rows, columns=df.columns)

    # Streamlit reruns the whole script on every interaction and the button
    # is True only on the run triggered by its click, so keep the result in
    # session state; otherwise it vanishes as soon as "Download CSV" is used.
    st.session_state['processed_news'] = df

# True once a fetch has completed in this browser session
data_processed = 'processed_news' in st.session_state

if data_processed:
    df = st.session_state['processed_news']

    # Offer the processed data as a CSV download
    st.markdown("### Download Processed Data as CSV")
    st.write("Click the button below to download the processed data as a CSV file.")
    csv_data = df.to_csv(index=False).encode()
    st.download_button(
        label="Download CSV",
        data=csv_data,
        file_name="processed_data.csv",
    )

    # Display processed data
    st.write(df.head())