Bayhaqy committed
Commit 6593698 · 1 Parent(s): ce083b1

Upload News_Scrapping.py

Files changed (1)
  1. pages/News_Scrapping.py +165 -0
pages/News_Scrapping.py ADDED
@@ -0,0 +1,165 @@
+ import streamlit as st
+ import pandas as pd
+ from newspaper import Article, Config
+ from langdetect import detect
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
+ from deep_translator import GoogleTranslator
+ import torch
+ import logging
+ from gnews import GNews
+ import nltk
+
+ # newspaper's Article.nlp() needs the NLTK 'punkt' tokenizer for keywords and summaries
+ nltk.download('punkt')
+
+ ## ............................................... ##
+ # Set page configuration (must be called once, before any other Streamlit command)
+ st.set_page_config(page_title='News Scraping', layout='wide', page_icon=':rocket:')
+
+ ## ............................................... ##
+ # Set up logging
+ logging.basicConfig(filename='news_processing.log', level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+ # Initialize the DataFrame
+ df = pd.DataFrame(columns=['Publish_Date', 'Language', 'URL', 'Title', 'Authors', 'Keywords', 'Summary', 'Text', 'Translate', 'Status', 'Sentiment'])
+
+ ## ............................................... ##
+ # Function for translation
+ def translate_text(text, source='auto', target='en'):
+     try:
+         if source != target:
+             text = GoogleTranslator(source=source, target=target).translate(text)
+         return text
+     except Exception as e:
+         logging.error(f"Translation error: {str(e)}")
+         return text
+
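+ # Note: deep_translator's GoogleTranslator rejects very long inputs (roughly 5,000
+ # characters per request), so longer texts would need to be split into chunks first.
+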
+ # Function for sentiment analysis
+ def predict_sentiment(text, model, tokenizer):
+     try:
+         tokens_info = tokenizer(text, truncation=True, return_tensors="pt")
+         with torch.no_grad():
+             raw_predictions = model(**tokens_info).logits
+
+         predicted_class_id = raw_predictions.argmax().item()
+         predict = model.config.id2label[predicted_class_id]
+
+         # Probability (in percent) of class id 1, which is POSITIVE for this sst-2 model
+         softmaxed = int(torch.nn.functional.softmax(raw_predictions[0], dim=0)[1] * 100)
+
+         if softmaxed > 70:
+             status = 'Not trust'
+         elif softmaxed > 40:
+             status = 'Not sure'
+         else:
+             status = 'Trust'
+
+         return status, predict
+     except Exception as e:
+         logging.error(f"Sentiment analysis error: {str(e)}")
+         return 'N/A', 'N/A'
+
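+ # Note: the thresholds above are an ad-hoc cutoff on the POSITIVE-class probability,
+ # not a calibrated trust score.
+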
+ # Load the sentiment model and tokenizer
+ def get_models_and_tokenizers():
+     model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
+     model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
+     model.eval()
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+     return model, tokenizer
+
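+ # Note: wrapping this loader with @st.cache_resource (available in recent Streamlit
+ # releases) would keep the model in memory across reruns instead of reloading it on
+ # every button click.
+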
+ # Function to process an article: download, parse, summarize, translate, and score it
+ def process_article(url, config):
+     try:
+         article = Article(url=url, config=config)
+         article.download()
+         article.parse()
+
+         # Get the article data
+         title = article.title
+         authors = article.authors
+
+         publish_date = article.publish_date
+
+         # Format publish_date only if the parser found one
+         if publish_date is not None:
+             publish_date = publish_date.strftime('%Y-%m-%d %H:%M:%S%z')
+         else:
+             publish_date = 'N/A'
+
+         text = article.text
+         article.nlp()
+         keywords = article.keywords
+         summary = article.summary
+
+         combined_text = title + ' | ' + summary
+         language = detect(combined_text)
+         tl = translate_text(combined_text, source=language, target='en')
+         status, predict = predict_sentiment(tl, model, tokenizer)
+
+         return publish_date, language, url, title, authors, keywords, summary, text, tl, status, predict
+     except Exception as e:
+         logging.error(f"Article processing error: {str(e)}")
+         return 'N/A', 'N/A', url, 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'
+
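+ # Note: process_article reads the global model and tokenizer, which are initialized inside
+ # the button handler below before the processing loop calls this function. It returns an
+ # 11-tuple in the same order as the df columns defined earlier.
+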
+ ## ............................................... ##
+ # Initialize Streamlit app
+ st.title('News Article Scraping')
+ st.write("Created by Bayhaqy")
+
+ ## ............................................... ##
+ # Input search parameters
+ search_term = st.text_input('Enter a search term:', 'palestina')
+ max_results = st.number_input('Maximum number of results:', min_value=1, value=10)
+ country = st.text_input('Country:', 'Indonesia')
+ language = st.text_input('Language:', 'indonesian')
+ start_date = st.date_input('Start Date', pd.to_datetime('2023-09-01'))
+
+ # Track whether the data has been processed in this run
+ data_processed = False
+
+ # Fetch news and process articles
+ if st.button('Fetch and Process News'):
+
+     # News retrieval via Google News
+     google_news = GNews()
+
+     google_news.max_results = max_results
+     google_news.country = country
+     google_news.language = language
+     google_news.start_date = (start_date.year, start_date.month, start_date.day)
+
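+     # Note: GNews maps full country/language names such as 'Indonesia' and 'indonesian' to
+     # their codes internally; the accepted names are listed in gnews' constants.
+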
+     news = google_news.get_news(search_term)
+
+     # Initialize the model and tokenizer
+     model, tokenizer = get_models_and_tokenizers()
+
+     # Create a custom configuration to disable SSL certificate verification
+     config = Config()
+     config.ignore_ssl = True
+
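+     # Note: 'ignore_ssl' is not a documented newspaper3k Config attribute in every release;
+     # if the installed version does not support it, this assignment is silently ignored.
+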
+     # Process articles
+     for x in news:
+         publish_date, language, url, title, authors, keywords, summary, text, tl, status, predict = process_article(x['url'], config)
+         temp_df = pd.DataFrame({'Publish_Date': [publish_date], 'Language': [language], 'URL': [url], 'Title': [title], 'Authors': [authors], 'Keywords': [keywords],
+                                 'Summary': [summary], 'Text': [text], 'Translate': [tl], 'Status': [status], 'Sentiment': [predict]})
+
+         df = pd.concat([df, temp_df], ignore_index=True)
+
+     # Set data_processed to True once all articles have been processed
+     data_processed = True
+
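+ # Note: data_processed and df only live for the current script run; Streamlit reruns the
+ # script on the next interaction (e.g. clicking Download CSV), so the sections below
+ # disappear again unless the results are also kept in st.session_state.
+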
+ # Add a button to download the data as a CSV file
+ if data_processed:
+     st.markdown("### Download Processed Data as CSV")
+     st.write("Click the button below to download the processed data as a CSV file.")
+
+     # Create the downloadable CSV payload
+     csv_data = df.to_csv(index=False).encode()
+     st.download_button(
+         label="Download CSV",
+         data=csv_data,
+         file_name="processed_data.csv",
+     )
+
+ # Display processed data
+ if data_processed:
+     st.write(df.head())