import streamlit as st
import pandas as pd
from newspaper import Article, Config
from langdetect import detect
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from deep_translator import GoogleTranslator
import torch
import logging
from gnews import GNews
import nltk

# newspaper's article.nlp() relies on the NLTK punkt tokenizer
nltk.download('punkt')

## ............................................... ##
# Set page configuration (call this once, before any other Streamlit command)
st.set_page_config(page_title='News Scraping', layout='wide', page_icon=':rocket:')

## ............................................... ##
# Set up logging
logging.basicConfig(filename='news_processing.log', level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Initialize the DataFrame
df = pd.DataFrame(columns=['Publish_Date', 'Language', 'URL', 'Title', 'Authors',
                           'Keywords', 'Summary', 'Text', 'Translate', 'Status', 'Sentiment'])

## ............................................... ##
# Function for translation
def translate_text(text, source='auto', target='en'):
    try:
        if source != target:
            text = GoogleTranslator(source=source, target=target).translate(text)
        return text
    except Exception as e:
        logging.error(f"Translation error: {str(e)}")
        return text

# Function for sentiment analysis
def predict_sentiment(text, model, tokenizer):
    try:
        tokens_info = tokenizer(text, truncation=True, return_tensors="pt")
        with torch.no_grad():
            raw_predictions = model(**tokens_info).logits

        predicted_class_id = raw_predictions.argmax().item()
        predict = model.config.id2label[predicted_class_id]

        # Heuristic trust score: the probability of class 1, scaled to 0-100
        softmaxed = int(torch.nn.functional.softmax(raw_predictions[0], dim=0)[1] * 100)
        if softmaxed > 70:
            status = 'Not trust'
        elif softmaxed > 40:
            status = 'Not sure'
        else:
            status = 'Trust'
        return status, predict
    except Exception as e:
        logging.error(f"Sentiment analysis error: {str(e)}")
        return 'N/A', 'N/A'

# Cache the model and tokenizer so they load once per session, not on every rerun
@st.cache_resource
def get_models_and_tokenizers():
    model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

# Function to process an article (model and tokenizer are passed in explicitly
# rather than read from globals)
def process_article(url, config, model, tokenizer):
    try:
        article = Article(url=url, config=config)
        article.download()
        article.parse()

        # Get the article data
        title = article.title
        authors = article.authors

        # Check that publish_date is not None before formatting it
        publish_date = article.publish_date
        if publish_date is not None:
            publish_date = publish_date.strftime('%Y-%m-%d %H:%M:%S%z')
        else:
            publish_date = 'N/A'

        text = article.text

        article.nlp()
        keywords = article.keywords
        summary = article.summary

        concated_text = title + '| ' + summary
        language = detect(concated_text)
        tl = translate_text(concated_text, source=language, target='en')
        status, predict = predict_sentiment(tl, model, tokenizer)

        return publish_date, language, url, title, authors, keywords, summary, text, tl, status, predict
    except Exception as e:
        logging.error(f"Article processing error: {str(e)}")
        return 'N/A', 'N/A', url, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'

## ............................................... ##
# Initialize Streamlit app
st.title('News Article Scraping')
st.write("Created by Bayhaqy")

## ............................................... ##
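# A quick smoke test of the helpers above, as a sketch (not part of the app flow;
# the sample sentence and expected values below are illustrative assumptions):
#
#   model, tokenizer = get_models_and_tokenizers()
#   status, label = predict_sentiment("The ceasefire brought welcome relief.", model, tokenizer)
#   # status is one of 'Trust' / 'Not sure' / 'Not trust';
#   # label is 'POSITIVE' or 'NEGATIVE' (this model's id2label mapping)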
# Input search parameters
search_term = st.text_input('Enter a search term:', 'palestina')
max_results = st.number_input('Maximum number of results:', min_value=1, value=10)
country = st.text_input('Country:', 'Indonesia')
language = st.text_input('Language:', 'indonesian')
start_date = st.date_input('Start Date', pd.to_datetime('2023-09-01'))

# Track whether the data has been processed. Note: Streamlit reruns this script
# on every interaction, so df and data_processed reset on each rerun;
# st.session_state could be used to persist them across reruns.
data_processed = False

# Fetch news and process articles
if st.button('Fetch and Process News'):
    # Fetch news metadata from Google News
    google_news = GNews()
    google_news.max_results = max_results
    google_news.country = country
    google_news.language = language
    google_news.start_date = (start_date.year, start_date.month, start_date.day)
    news = google_news.get_news(search_term)

    # Initialize the model and tokenizer (cached across reruns)
    model, tokenizer = get_models_and_tokenizers()

    # Create a custom configuration to disable SSL certificate verification
    config = Config()
    config.ignore_ssl = True

    # Process articles
    for x in news:
        (publish_date, language, url, title, authors, keywords, summary,
         text, tl, status, predict) = process_article(x['url'], config, model, tokenizer)
        temp_df = pd.DataFrame({'Publish_Date': [publish_date], 'Language': [language],
                                'URL': [url], 'Title': [title], 'Authors': [authors],
                                'Keywords': [keywords], 'Summary': [summary], 'Text': [text],
                                'Translate': [tl], 'Status': [status], 'Sentiment': [predict]})
        df = pd.concat([df, temp_df], ignore_index=True)

    # Mark the data as successfully processed
    data_processed = True

# Add a button to download the data as a CSV file
if data_processed:
    st.markdown("### Download Processed Data as CSV")
    st.write("Click the button below to download the processed data as a CSV file.")

    # Create a downloadable CSV
    csv_data = df.to_csv(index=False).encode()
    st.download_button(
        label="Download CSV",
        data=csv_data,
        file_name="processed_data.csv",
    )

# Display processed data
if data_processed:
    st.write(df.head())
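## ............................................... ##
# Usage (a sketch, assuming this file is saved as app.py; exact package names
# and versions may differ in your environment):
#   pip install streamlit pandas newspaper3k langdetect transformers torch deep-translator gnews nltk
#   streamlit run app.py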