hsuvaskakoty committed on
Commit 0d0a4e0 · verified · 1 Parent(s): 8dbd54d

Upload 9 files

collect_data_es.py ADDED
@@ -0,0 +1,226 @@
1
+ import requests
2
+ import pandas as pd
3
+ from bs4 import BeautifulSoup
4
+ import re
5
+
6
+ #################### Spanish Wikipedia ####################
7
+
8
+ ###############
9
+ # Title based #
10
+ ###############
11
+
12
+ def extract_result_resultado(sentence):
13
+ match = re.search(r"(RESULTADO:|El resultado fue)\s*(\w+)", sentence, flags=re.IGNORECASE)
14
+ return match.group(2).strip() if match else None
15
+
16
+ def extract_result(sentence):
17
+ #print(f"Extracting result from sentence: {sentence}")
18
+ match = re.search(r"se\s+decidió\s+(\w+)", sentence, flags=re.IGNORECASE)
19
+ if match:
20
+ #print(f"Match found for 'se decidió': {match.groups()}")
21
+ return match.group(1).strip()
22
+ #print("No match found for 'se decidió'.")
23
+ return None
24
+
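A quick sanity check of the two regex helpers defined above, on made-up Spanish closing sentences (the expected values follow directly from the patterns):

print(extract_result_resultado("RESULTADO: BORRAR por falta de fuentes"))  # 'BORRAR'
print(extract_result("Tras la consulta se decidió mantener el artículo"))  # 'mantener'
print(extract_result("Comentario sin frase de cierre"))                    # None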
25
+ def clean_comments_with_no_text_after_timestamp(content_div):
26
+ for ol in content_div.find_all('ol'):
27
+ for li in ol.find_all('li'):
28
+ li_text = li.get_text(strip=True)
29
+ if "(CEST)" in li_text or "(CET)" in li_text:
30
+ match = re.search(r"\(C[ES]T\)\s*(.*)", li_text)
31
+ if match:
32
+ after_timestamp = match.group(1).strip()
33
+ if not after_timestamp:
34
+ li.decompose()
35
+ else:
36
+ li.decompose()
37
+ return content_div
38
+
39
+ def extract_cleaned_spanish_discussion_and_result(url):
40
+ response = requests.get(url)
41
+ if response.status_code != 200:
42
+ print(f"Error: Received status code {response.status_code} for URL: {url}")
43
+ return pd.DataFrame(columns=['title', 'discussion_uncleaned', 'discussion', 'result_sentence', 'result', 'text_url', 'discussion_url'])
44
+
45
+ soup = BeautifulSoup(response.content, 'html.parser')
46
+ title = url.split('/')[-1].replace('_', ' ').replace(':', '')
47
+ text_url = f"https://es.wikipedia.org/wiki/{url.split('/')[-1]}"
48
+ discussion_url = url
49
+
50
+ content_div = soup.find('div', class_='mw-content-ltr mw-parser-output')
51
+ if not content_div:
52
+ print("Error: Main discussion container not found")
53
+ return pd.DataFrame(columns=['title', 'discussion_uncleaned', 'discussion', 'result_sentence', 'result', 'text_url', 'discussion_url'])
54
+
55
+ discussion_uncleaned = content_div.prettify()
56
+ discussion = ''
57
+ result_sentence = ''
58
+ result = None
59
+
60
+ try:
61
+ result_p = next(
62
+ (p for p in content_div.find_all('p') if "El resultado fue" in p.get_text() or "RESULTADO:" in p.get_text()), None
63
+ )
64
+
65
+ if result_p:
66
+ result_sentence = result_p.get_text(strip=True)
67
+ bold_tag = result_p.find('b')
68
+ if bold_tag:
69
+ result = bold_tag.get_text(strip=True)
70
+ else:
71
+ match = re.search(r"(El resultado fue|RESULTADO:)\s*(.+?)\.", result_sentence, flags=re.IGNORECASE)
72
+ result = match.group(2).strip() if match else None
73
+ #print(f"Extracted result from sentence: {result}")
74
+
75
+ content_div = clean_comments_with_no_text_after_timestamp(content_div)
76
+ discussion_text_parts = content_div.find_all(recursive=False)
77
+ cleaned_text_parts = []
78
+ for part in discussion_text_parts:
79
+ cleaned_text_parts.append(part.get_text(strip=True))
80
+ discussion = "\n".join(cleaned_text_parts)
81
+
82
+ if not result:
83
+ result_div = content_div.find('div', class_='messagebox')
84
+ if result_div:
85
+ result_dl = result_div.find('dl')
86
+ if result_dl:
87
+ result_sentence = result_dl.get_text(strip=True)
88
+ #print(f"Extracted result sentence from messagebox: {result_sentence}")
89
+ result = extract_result(result_sentence)
90
+ if not result and not result_sentence:
91
+ result_p = next((p for p in content_div.find_all('p') if "RESULTADO:" in p.get_text() or "se decidió" in p.get_text()), None)
92
+ if result_p:
93
+ result_sentence = result_p.get_text(strip=True)
94
+ #print(f"Extracted result sentence from paragraph: {result_sentence}")
95
+ result = extract_result(result_sentence)
96
+
97
+ if not result and not result_sentence:
98
+ voting_sentence = next((p for p in content_div.find_all('p') if "se decidió" in p.get_text()), None)
99
+ if voting_sentence:
100
+ result_sentence = voting_sentence.get_text(strip=True)
101
+ #print(f"Extracted voting sentence: {result_sentence}")
102
+ result = extract_result(result_sentence)
103
+
104
+ # if result:
105
+ # print(f"Final extracted result: {result}")
106
+
107
+ if "Votación" in discussion:
108
+ discussion = discussion.split("Votación", 1)[1].strip()
109
+
110
+ except Exception as e:
111
+ print(f"Error processing discussion: {e}")
112
+ data = [[title, discussion_uncleaned, discussion, result_sentence, result, text_url, discussion_url]]
113
+ df = pd.DataFrame(data, columns=['title', 'discussion_uncleaned', 'discussion', 'result_sentence', 'result', 'text_url', 'discussion_url'])
114
+ df['result'] = df['result'].apply(lambda x: extract_result_resultado(x) if isinstance(x, str) and len(x.split()) > 1 else x)
115
+ return df
116
+
117
+ # url = 'https://es.wikipedia.org/wiki/Wikipedia:Consultas_de_borrado/!Hispahack' #'https://es.wikipedia.org/wiki/Wikipedia:Consultas_de_borrado/:Country_Club_La_Planicie'
118
+ # df = extract_cleaned_spanish_discussion_and_result(url)
119
+ # df
120
+
121
+ ###############
122
+ # Date based #
123
+ ###############
124
+
125
+
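+ # Note: this 'Date based' variant redefines extract_result above, switching the pattern from 'se decidió' to (El resultado fue|RESULTADO:).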
126
+ def extract_result(sentence):
127
+ match = re.search(r"(El resultado fue|RESULTADO:)\s*(\w+)", sentence, flags=re.IGNORECASE)
128
+ return match.group(2).strip() if match else None
129
+
130
+ def extract_multiple_discussions(url):
131
+ response = requests.get(url)
132
+ if response.status_code != 200:
133
+ print(f"Error: Received status code {response.status_code} for URL: {url}")
134
+ return pd.DataFrame(columns=['title', 'discussion_uncleaned', 'discussion', 'result_sentence', 'result', 'text_url', 'discussion_url'])
135
+
136
+ soup = BeautifulSoup(response.content, 'html.parser')
137
+ content_div = soup.find('div', class_='mw-content-ltr mw-parser-output')
138
+ if not content_div:
139
+ print("Error: Main discussion container not found")
140
+ return pd.DataFrame(columns=['title', 'discussion_uncleaned', 'discussion', 'result_sentence', 'result', 'text_url', 'discussion_url'])
141
+ data = []
142
+ headings = content_div.find_all('div', class_='mw-heading mw-heading3')
143
+ for idx, heading in enumerate(headings):
144
+ try:
145
+ title_tag = heading.find('a', class_='new') or heading.find('a')
146
+ if title_tag:
147
+ title = title_tag.text.strip()
148
+ text_url = f"https://es.wikipedia.org{title_tag['href']}"
149
+ else:
150
+ title = f"{url.split('/')[-1]}_{idx + 1}"
151
+ text_url = f"https://es.wikipedia.org/wiki/{title}"
152
+ previous_sibling = heading.find_previous_sibling()
153
+ result_sentence = None
154
+ result = None
155
+ while previous_sibling:
156
+ if previous_sibling.name == 'p' and "El resultado fue" in previous_sibling.get_text():
157
+ normalized_text = previous_sibling.get_text(separator=" ", strip=True)
158
+ result_sentence = normalized_text
159
+ result = extract_result(result_sentence)
160
+ break
161
+ previous_sibling = previous_sibling.find_previous_sibling()
162
+ if not result_sentence:
163
+ result_p = content_div.find('p', string=lambda text: text and "RESULTADO:" in text)
164
+ if result_p:
165
+ result_sentence = result_p.get_text(strip=True)
166
+ result = extract_result(result_sentence)
167
+ discussion_html = ""
168
+ current = heading.find_next_sibling()
169
+ while current and not (current.name == 'div' and 'mw-heading mw-heading3' in current.get('class', [])):
170
+ discussion_html += str(current)
171
+ current = current.find_next_sibling()
172
+
173
+ discussion_uncleaned = discussion_html
174
+ discussion = BeautifulSoup(discussion_html, 'html.parser').get_text(strip=True)
175
+ data.append([title, discussion_uncleaned, discussion, result_sentence, result, text_url, url])
176
+ except Exception as e:
177
+ print(f"Error processing heading: {e}")
178
+ df = pd.DataFrame(data, columns=['title', 'discussion_uncleaned', 'discussion', 'result_sentence', 'result', 'text_url', 'discussion_url'])
179
+ return df
180
+
181
+ # url = 'https://es.wikipedia.org/wiki/Wikipedia:Consultas_de_borrado/Registro/10_de_septiembre_de_2009'
182
+ # df = extract_multiple_discussions(url)
183
+ # df
184
+
185
+ ###############
186
+ # Collect ES #
187
+ ###############
188
+
189
+ def collect_es(mode='title', title='', url = '',date=''):
190
+ if mode not in ['title', 'year', 'url']:
191
+ raise ValueError("mode must be 'title', 'year', or 'url'")
192
+
193
+ if mode == 'title':
194
+ if not title or date:
195
+ raise ValueError("For 'title' mode, 'title' must be provided and 'date' must be empty.")
196
+ url = f"https://es.wikipedia.org/wiki/Wikipedia:Consultas_de_borrado/{title}"
197
+ df = extract_cleaned_spanish_discussion_and_result(url)
198
+ if df.empty:
199
+ print(f"No data found for url: {url}")
200
+ return df
201
+ elif mode == 'url':
202
+ if not url or title or date:
203
+ raise ValueError("For 'url' mode, 'url' must be provided and 'title' and 'date' must be empty.")
204
+ df = extract_cleaned_spanish_discussion_and_result(url)
205
+ return df
206
+
207
+ elif mode == 'year':
208
+ if title or not date:
209
+ raise ValueError("For 'year' mode, 'date' must be provided and 'title' must be empty.")
210
+ month_map = {
211
+ '01': 'enero', '02': 'febrero', '03': 'marzo', '04': 'abril', '05': 'mayo', '06': 'junio',
212
+ '07': 'julio', '08': 'agosto', '09': 'septiembre', '10': 'octubre', '11': 'noviembre', '12': 'diciembre'
213
+ }
214
+
215
+ match = re.match(r'(\d{2})/(\d{2})/(\d{4})', date)
216
+ if not match:
217
+ raise ValueError("Date must be in the format dd/mm/yyyy")
218
+
219
+ day, month, year = match.groups()
220
+ if month not in month_map:
221
+ raise ValueError("Invalid month in date")
222
+
223
+ date_str = f"{int(day)}_de_{month_map[month]}_de_{year}"
224
+ url = f"https://es.wikipedia.org/wiki/Wikipedia:Consultas_de_borrado/Registro/{date_str}"
225
+ df = extract_multiple_discussions(url)
226
+ return df
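A minimal usage sketch for collect_es, assuming network access; the page title and date are placeholders taken from the commented examples above:

# Single deletion discussion, looked up by page title
df_title = collect_es(mode='title', title='!Hispahack')

# Every discussion archived on one day (date format dd/mm/yyyy)
df_day = collect_es(mode='year', date='10/09/2009')

# Direct URL to a single consulta de borrado
df_url = collect_es(mode='url',
                    url='https://es.wikipedia.org/wiki/Wikipedia:Consultas_de_borrado/!Hispahack')
print(df_title[['title', 'result']])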
collect_data_gr.py ADDED
@@ -0,0 +1,378 @@
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import pandas as pd
4
+ import re
5
+ import pysbd
6
+
7
+ ###############################################
8
+ # Functions from Code 1 (collapsible approach)#
9
+ ###############################################
10
+
11
+ def extract_result(sentence):
12
+ match = re.search(r"(Διαγραφή|Παραμονή|Άλλο αποτέλεσμα|διαγραφή|Συγχώνευση|Διατήρηση)", sentence, flags=re.IGNORECASE)
13
+ delete_cases = [
14
+ 'Μη εγκυκλοπαιδικό', 'Πράγματι δεν φαίνεται πως το λήμμα είναι εγκυκλοπαιδικό',
15
+ 'Δεν διαπιστώθηκε εγκυκλοπαιδικότητα', 'Μη εγκυκλοπαιδικό λήμμα',
16
+ 'Το λήμμα κρίθηκε ότι είναι καταλληλότερο για κάποιο άλλο αδελφό εγχείρημα, παρά για την Βικιπαίδεια + ατεκμηρίωτο.',
17
+ 'Δεν υπάρχουν επαρκείς αναφορές για την βιογραφούμενη'
18
+ ]
19
+ if match:
20
+ outcome = match.group(1).strip()
21
+ elif sentence in delete_cases:
22
+ outcome = 'Διαγραφή'
23
+ else:
24
+ outcome = 'Δεν υπάρχει συναίνεση'
25
+ return normalize_outcome(outcome)
26
+
27
+ def normalize_outcome(o):
28
+ lowered = o.lower()
29
+ if 'διαγρ' in lowered: # covers 'διαγραφή'
30
+ return 'Διαγραφή'
31
+ elif 'διατήρη' in lowered or 'παραμονή' in lowered:
32
+ return 'Διατήρηση'
33
+ elif 'συγχών' in lowered:
34
+ return 'συγχώνευση'
35
+ else:
36
+ # Covers 'Άλλο αποτέλεσμα' and unknown cases
37
+ return 'Δεν υπάρχει συναίνεση'
38
+
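A few illustrative mappings for normalize_outcome (inputs are sample closing phrases; outputs follow from the substring checks above):

print(normalize_outcome('Παραμονή'))         # 'Διατήρηση'
print(normalize_outcome('διαγράφηκε'))       # 'Διαγραφή'
print(normalize_outcome('Συγχώνευση'))       # 'συγχώνευση'
print(normalize_outcome('Άλλο αποτέλεσμα'))  # 'Δεν υπάρχει συναίνεση'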
39
+ def extract_discussions_from_page_collapsible(url):
40
+ response = requests.get(url)
41
+ if response.status_code != 200:
42
+ return pd.DataFrame(columns=['title', 'discussion', 'result_sentence', 'result', 'text_url'])
43
+
44
+ soup = BeautifulSoup(response.content, 'html.parser')
45
+ discussion_sections = soup.find_all('div', class_='mw-heading mw-heading2 ext-discussiontools-init-section')
46
+ titles = []
47
+ for section in discussion_sections:
48
+ try:
49
+ h2_tag = section.find('h2')
50
+ if not h2_tag:
51
+ continue
52
+ title_link = h2_tag.find('a')
53
+ title = title_link.text.strip() if title_link else h2_tag.get_text(strip=True)
54
+ titles.append(title)
55
+ except:
56
+ pass
57
+
58
+ discussion_tables = soup.find_all('table')
59
+ if not discussion_tables:
60
+ return pd.DataFrame(columns=['title', 'discussion', 'result_sentence', 'result', 'text_url'])
61
+
62
+ data = []
63
+ for idx, table in enumerate(discussion_tables):
64
+ try:
65
+ decision_row = table.find('tr')
66
+ decision_cell = decision_row.find('th') if decision_row else None
67
+ if decision_cell:
68
+ result_match = re.search(
69
+ r"Η συζήτηση τελείωσε, το αποτέλεσμα ήταν: <i>(.*?)</i>", str(decision_cell), re.DOTALL
70
+ )
71
+ result_sentence = result_match.group(1).strip() if result_match else "No result found"
72
+ else:
73
+ result_sentence = "No result found"
74
+
75
+ discussion_row = decision_row.find_next_sibling('tr') if decision_row else None
76
+ discussion_cell = discussion_row.find('td', class_='plainlinks') if discussion_row else None
77
+ discussion_content = discussion_cell.get_text(separator="\n") if discussion_cell else "No discussion content found"
78
+ discussion_content = discussion_content.split('\nμητρώο\n)\n\n\n\n\n')[-1].replace('\n','')
79
+
80
+ title = titles[idx] if idx < len(titles) else f"Discussion {idx + 1}"
81
+ data.append({
82
+ "title": title,
83
+ "discussion": discussion_content,
84
+ "result_sentence": result_sentence,
85
+ "result": extract_result(result_sentence),
86
+ "text_url": url
87
+ })
88
+ except:
89
+ pass
90
+
91
+ return pd.DataFrame(data, columns=['title', 'discussion', 'result_sentence', 'result', 'text_url'])
92
+
93
+ ###########################################
94
+ # Functions from Code 2 (non-collapsible) #
95
+ ###########################################
96
+
97
+ def extract_discussions_from_page_non_collapsible(url):
98
+ response = requests.get(url)
99
+ if response.status_code != 200:
100
+ return pd.DataFrame(columns=['title', 'discussion', 'result_sentence', 'result', 'text_url'])
101
+
102
+ soup = BeautifulSoup(response.content, 'html.parser')
103
+ discussion_sections = soup.find_all('div', class_='mw-heading mw-heading2 ext-discussiontools-init-section')
104
+ titles = []
105
+ for section in discussion_sections:
106
+ try:
107
+ h2_tag = section.find('h2')
108
+ if not h2_tag:
109
+ continue
110
+ title_link = h2_tag.find('a')
111
+ title = title_link.text.strip() if title_link else h2_tag.get_text(strip=True)
112
+ titles.append(title)
113
+ except:
114
+ pass
115
+
116
+ discussion_tables = soup.find_all('table', class_='pagediscussion')
117
+ if not discussion_tables:
118
+ return pd.DataFrame(columns=['title', 'discussion', 'result_sentence', 'result', 'text_url'])
119
+
120
+ data = []
121
+ for idx, table in enumerate(discussion_tables):
122
+ try:
123
+ decision_row = table.find('tr')
124
+ decision_cell = decision_row.find('th') if decision_row else None
125
+ if decision_cell:
126
+ result_match = re.search(
127
+ r"Η συζήτηση τελείωσε, το αποτέλεσμα ήταν: <i>(.*?)</i>", str(decision_cell), re.DOTALL
128
+ )
129
+ result_sentence = result_match.group(1).strip() if result_match else "No result found"
130
+ else:
131
+ result_sentence = "No result found"
132
+
133
+ discussion_row = decision_row.find_next_sibling('tr') if decision_row else None
134
+ discussion_cell = discussion_row.find('td', class_='plainlinks') if discussion_row else None
135
+ discussion_content = discussion_cell.get_text(separator="\n") if discussion_cell else "No discussion content found"
136
+ discussion_content = discussion_content.split('\nμητρώο\n)\n\n\n\n\n')[-1].replace('\n','')
137
+
138
+ title = titles[idx] if idx < len(titles) else f"Discussion {idx + 1}"
139
+ data.append({
140
+ "title": title,
141
+ "discussion": discussion_content,
142
+ "result_sentence": result_sentence,
143
+ "result": extract_result(result_sentence),
144
+ "text_url": url
145
+ })
146
+ except:
147
+ pass
148
+
149
+ return pd.DataFrame(data, columns=['title', 'discussion', 'result_sentence', 'result', 'text_url'])
150
+
151
+ ###########################################
152
+ # Title-based extraction with fallback #
153
+ ###########################################
154
+
155
+ def html_to_plaintext(html_content):
156
+ soup = BeautifulSoup(html_content, 'html.parser')
157
+ for tag in soup.find_all(['p', 'li', 'dd', 'dl', 'ul']):
158
+ tag.insert_before('\n')
159
+ tag.insert_after('\n')
160
+ for br in soup.find_all('br'):
161
+ br.replace_with('\n')
162
+ text = soup.get_text(separator=' ', strip=True)
163
+ text = '\n'.join([line.strip() for line in text.splitlines() if line.strip()])
164
+ return text
165
+
166
+ def split_text_into_sentences(text):
167
+ seg = pysbd.Segmenter(language="el", clean=False)
168
+ sentences = seg.segment(text)
169
+ return ' '.join(sentences)
170
+
171
+ def clean_discussion_text(text):
172
+ return text.strip()
173
+
174
+ def extract_outcome_from_text(text):
175
+ outcomes = ['Διαγραφή', 'Παραμονή', 'διαγραφή', 'Συγχώνευση', 'Διατήρηση', 'Άλλο αποτέλεσμα']
176
+ lowered = text.lower()
177
+ found_outcome = None
178
+ for outcome in outcomes:
179
+ if outcome.lower() in lowered:
180
+ found_outcome = outcome
181
+ break
182
+ if not found_outcome:
183
+ found_outcome = 'Δεν υπάρχει συναίνεση'
184
+ return normalize_outcome(found_outcome)
185
+
186
+ def extract_discussion_section(soup, title):
187
+ t = title.replace(' ', '_')
188
+ h2_tag = soup.find('h2', id=t)
189
+ if not h2_tag:
190
+ return '', '', ''
191
+
192
+ heading_div = h2_tag.find_parent('div', class_='mw-heading mw-heading2 ext-discussiontools-init-section')
193
+ if not heading_div:
194
+ return '', '', ''
195
+
196
+ next_heading_div = heading_div.find_next('div', class_='mw-heading mw-heading2 ext-discussiontools-init-section')
197
+
198
+ html_fragments = []
199
+ current = heading_div.next_sibling
200
+ while current and current != next_heading_div:
201
+ if hasattr(current, 'prettify'):
202
+ html_fragments.append(current.prettify())
203
+ else:
204
+ html_fragments.append(str(current))
205
+ current = current.next_sibling
206
+
207
+ discussion_html = ''.join(html_fragments).strip()
208
+ if not discussion_html:
209
+ return '', '', ''
210
+
211
+ sub_soup = BeautifulSoup(discussion_html, 'html.parser')
212
+ discussion_tags = sub_soup.find_all(['p', 'ul', 'dl'])
213
+
214
+ if not discussion_tags:
215
+ return '', '', ''
216
+
217
+ cleaned_parts = []
218
+ for tag in discussion_tags:
219
+ for unwanted in tag.find_all(['span', 'img', 'a', 'div', 'table'], recursive=True):
220
+ unwanted.decompose()
221
+ text = tag.get_text(separator=' ', strip=True)
222
+ if text:
223
+ cleaned_parts.append(text)
224
+
225
+ cleaned_discussion = ' '.join(cleaned_parts)
226
+ label = extract_outcome_from_text(cleaned_discussion)
227
+
228
+ return discussion_html, label, cleaned_discussion
229
+
230
+ def extract_fallback_discussion(url, title):
231
+ response = requests.get(url)
232
+ if response.status_code != 200:
233
+ return '', None
234
+
235
+ soup = BeautifulSoup(response.text, 'html.parser')
236
+ discussion_tables = soup.find_all('table')
237
+ if not discussion_tables:
238
+ return '', None
239
+ for table in discussion_tables:
240
+ table_text = table.get_text(separator='\n', strip=True)
241
+ if title in table_text:
242
+ decision_row = table.find('tr')
243
+ decision_cell = decision_row.find('th') if decision_row else None
244
+ if decision_cell:
245
+ result_match = re.search(r"Η συζήτηση τελείωσε, το αποτέλεσμα ήταν: <i>(.*?)</i>", str(decision_cell), re.DOTALL)
246
+ result_sentence = result_match.group(1).strip() if result_match else "No result found"
247
+ else:
248
+ result_sentence = "No result found"
249
+
250
+ discussion_row = decision_row.find_next_sibling('tr') if decision_row else None
251
+ discussion_cell = discussion_row.find('td', class_='plainlinks') if discussion_row else None
252
+ discussion_content = ''
253
+ if discussion_cell:
254
+ discussion_content = discussion_cell.get_text(separator=' ', strip=True)
255
+
256
+ if discussion_content:
257
+ outcome = extract_result(result_sentence)
258
+ return discussion_content, outcome
259
+
260
+ return '', None
261
+
262
+ def extract_div_from_title_with_fallback(title, url ='', date=''):
263
+ if not date:
264
+ raise ValueError("For 'title' mode, 'date' must be provided in the format: mm/yyyy")
265
+
266
+ month_map = {
267
+ '01': 'Ιανουαρίου', '02': 'Φεβρουαρίου', '03': 'Μαρτίου', '04': 'Απριλίου', '05': 'Μαΐου', '06': 'Ιουνίου',
268
+ '07': 'Ιουλίου', '08': 'Αυγούστου', '09': 'Σεπτεμβρίου', '10': 'Οκτωβρίου', '11': 'Νοεμβρίου', '12': 'Δεκεμβρίου'
269
+ }
270
+ if '_' in date and date.split('_')[0] in month_map.values():
271
+ # If date is already in 'Month_Year' format
272
+ date_str = date
273
+ else:
274
+ # Try to parse date in 'mm/yyyy' format
275
+ match = re.match(r'(\d{2})/(\d{4})', date)
276
+ if not match:
277
+ raise ValueError("Date must be in the format mm/yyyy or Month_Year")
278
+ mm, yyyy = match.groups()
279
+ if mm not in month_map:
280
+ raise ValueError(f"Invalid month: {mm}")
281
+
282
+ date_str = f"{month_map[mm]}_{yyyy}" # Convert to 'Month_Year' format
283
+ base_url = 'https://el.wikipedia.org/wiki/Βικιπαίδεια:Σελίδες_για_διαγραφή'
284
+ url = f"{base_url}/{date_str}#{title}"
285
+
286
+ response = requests.get(url)
287
+ if response.status_code != 200:
288
+ return pd.DataFrame(columns=['title', 'discussion_url', 'discussion', 'outcome'])
289
+
290
+ soup = BeautifulSoup(response.content, 'html.parser')
291
+ discussion_html, label, cleaned_discussion = extract_discussion_section(soup, title)
292
+
293
+ text_url = f"{base_url}/{date_str}"
294
+ discussion_url = text_url + '#' + title
295
+
296
+ cleaned_discussion = html_to_plaintext(cleaned_discussion)
297
+ cleaned_discussion = split_text_into_sentences(cleaned_discussion)
298
+ cleaned_discussion = clean_discussion_text(cleaned_discussion)
299
+
300
+ if not cleaned_discussion.strip():
301
+ fallback_url = f"{base_url}/{date_str}"
302
+ discussion_content, outcome = extract_fallback_discussion(fallback_url, title)
303
+ cleaned_discussion = html_to_plaintext(discussion_content)
304
+ cleaned_discussion = split_text_into_sentences(cleaned_discussion)
305
+ cleaned_discussion = clean_discussion_text(cleaned_discussion)
306
+ if outcome:
307
+ label = normalize_outcome(outcome)
308
+
309
+ df = pd.DataFrame([[title, discussion_url, cleaned_discussion, label]],
310
+ columns=['title', 'discussion_url', 'discussion', 'outcome'])
311
+ return df
312
+
313
+ def normalize_outcome(o):
314
+ lowered = o.lower()
315
+ if 'διαγρ' in lowered:
316
+ return 'Διαγραφή'
317
+ elif 'διατήρη' in lowered or 'παραμονή' in lowered:
318
+ return 'Διατήρηση'
319
+ elif 'συγχών' in lowered:
320
+ return 'συγχώνευση'
321
+ else:
322
+ return 'Δεν υπάρχει συναίνεση'
323
+
324
+ ###################################
325
+ # The collect_gr() function #
326
+ ###################################
327
+
328
+ def collect_gr(mode='url', title='', url = '', years=[]):
329
+ if mode not in ['title', 'year', 'url']:
330
+ raise ValueError("mode must be 'title', 'year', or 'url'.")
331
+
332
+ if mode == 'title':
333
+ if not title or not years or len(years) != 1:
334
+ raise ValueError("For 'title' mode, 'title' must be provided and 'years' must be a single-element list like ['mm/yyyy'].")
335
+ date = years[0]
336
+ df = extract_div_from_title_with_fallback(title, date=date)
337
+ return df[['title', 'discussion_url', 'discussion', 'outcome']]
338
+
339
+ elif mode == 'url':
340
+ if title or years:
341
+ raise ValueError("For 'url' mode, 'title' must be empty and 'years' must be empty.")
342
+ #collect the title and date from the url like: base_url = 'https://el.wikipedia.org/wiki/Βικιπαίδεια:Σελίδες_για_διαγραφή'/{date_str}#{title}
343
+ match = re.search(r'Βικιπαίδεια:Σελίδες_για_διαγραφή/([^#]+)#(.+)', url)
344
+ if not match:
345
+ raise ValueError("URL format is incorrect.")
346
+ date_str, title = match.groups()
347
+ print(date_str, title)
348
+ df = extract_div_from_title_with_fallback(title, date=date_str)
349
+ return df[['title', 'discussion_url', 'discussion', 'outcome']]
350
+
351
+
352
+ elif mode == 'year':
353
+ if title or not years:
354
+ raise ValueError("For 'year' mode, 'title' must be empty and 'years' must be provided.")
355
+ if len(years) == 1:
356
+ start_year = end_year = years[0]
357
+ elif len(years) == 2:
358
+ start_year, end_year = min(years), max(years)
359
+ else:
360
+ raise ValueError("Invalid years input. Provide one year or two years for a range.")
361
+
362
+ all_data = []
363
+ for year in range(start_year, end_year + 1):
364
+ url = f"https://el.wikipedia.org/wiki/Βικιπαίδεια:Σελίδες_για_διαγραφή/Ιανουαρίου_{year}"
365
+ df = extract_discussions_from_page_collapsible(url)
366
+ if df.empty:
367
+ df = extract_discussions_from_page_non_collapsible(url)
368
+
369
+ if not df.empty:
370
+ df['result'] = df['result'].apply(normalize_outcome)
371
+ df['discussion_url'] = df.apply(lambda row: row['text_url'] + '#' + row['title'].replace(' ', '_'), axis=1)
372
+ df = df.rename(columns={'result':'outcome'})
373
+ all_data.append(df[['title', 'discussion_url', 'discussion', 'outcome']])
374
+
375
+ if all_data:
376
+ return pd.concat(all_data, ignore_index=True)
377
+ else:
378
+ return pd.DataFrame(columns=['title', 'discussion_url', 'discussion', 'outcome'])
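A minimal usage sketch for collect_gr, assuming network access; the title and dates are placeholders in the formats the function expects:

# Title mode: one discussion on the monthly archive page given as 'mm/yyyy'
df_title = collect_gr(mode='title', title='Παράδειγμα', years=['01/2023'])

# URL mode: a direct '.../Σελίδες_για_διαγραφή/<Μήνας_Έτος>#<Τίτλος>' link
df_url = collect_gr(mode='url',
                    url='https://el.wikipedia.org/wiki/Βικιπαίδεια:Σελίδες_για_διαγραφή/Ιανουαρίου_2023#Παράδειγμα')

# Year mode: scrapes the January archive page for each year in the range
df_years = collect_gr(mode='year', years=[2022, 2023])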
collect_data_wikidata_ent.py ADDED
@@ -0,0 +1,302 @@
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import pandas as pd
4
+ import pysbd
5
+ import re
6
+
7
+
8
+ ########################
9
+ ## Year based search ##
10
+ ########################
11
+
12
+ BASE_URL = "https://www.wikidata.org/wiki/Wikidata:Requests_for_deletions/Archive"
13
+
14
+ def get_soup(url):
15
+ response = requests.get(url)
16
+ response.raise_for_status()
17
+ return BeautifulSoup(response.text, 'html.parser')
18
+
19
+ def get_year_urls():
20
+ soup = get_soup(BASE_URL)
21
+ year_urls = {}
22
+ for link in soup.select('a[href^="/wiki/Wikidata:Requests_for_deletions/Archive/"]'):
23
+ year_url = link['href']
24
+ if year_url.endswith(tuple(str(year) for year in range(2012, 2025))):
25
+ year = year_url.split('/')[-1]
26
+ full_year_url = "https://www.wikidata.org" + year_url
27
+ year_urls[year] = full_year_url
28
+ return year_urls
29
+
30
+ def get_month_day_urls(year_url):
31
+ soup = get_soup(year_url)
32
+ month_day_urls = []
33
+ for link in soup.select('a[href^="/wiki/Wikidata:Requests_for_deletions/Archive/"]'):
34
+ date_url = link['href']
35
+ if len(date_url.split('/')) >= 7:
36
+ full_date_url = "https://www.wikidata.org" + date_url
37
+ if full_date_url not in month_day_urls:
38
+ month_day_urls.append(full_date_url)
39
+ return month_day_urls
40
+
41
+ def extract_outcome_from_dd(dd):
42
+ try:
43
+ result_tag = dd.find('b')
44
+ if result_tag:
45
+ return result_tag.get_text().strip()
46
+ return 'unknown'
47
+ except:
48
+ return 'unknown'
49
+
50
+ def extract_discussions(url):
51
+ soup = get_soup(url)
52
+ discussions = []
53
+ for h2 in soup.find_all('h2'):
54
+ title_tag = h2.find('a')
55
+ if title_tag and 'Q' in title_tag.get_text():
56
+ title = title_tag.get_text().strip()
57
+ discussion_parts = []
58
+ last_dd = None
59
+ for sibling in h2.find_all_next():
60
+ if sibling.name == 'h2':
61
+ break
62
+ if sibling.name == 'p':
63
+ discussion_parts.append(sibling.get_text(separator=' ', strip=True))
64
+ if sibling.name == 'dl':
65
+ dds = sibling.find_all('dd')
66
+ if dds:
67
+ for dd in dds[:-1]:
68
+ discussion_parts.append(dd.get_text(separator=' ', strip=True))
69
+ last_dd = dds[-1]
70
+ discussion_text = ' '.join(discussion_parts) if discussion_parts else 'No discussion found'
71
+ outcome = extract_outcome_from_dd(last_dd) if last_dd else 'Outcome not found'
72
+ entity_url = url + '#' + title
73
+ discussions.append({
74
+ "title": title,
75
+ "discussion": discussion_text,
76
+ "outcome": outcome,
77
+ "url": entity_url,
78
+ 'date': url.split('Archive/')[-1]
79
+ })
80
+ return discussions
81
+
82
+ def remove_first_sentence_if_q_number(text):
83
+ seg = pysbd.Segmenter(language="en", clean=False)
84
+ sentences = seg.segment(text)
85
+ if sentences and sentences[0].startswith('Q') and sentences[0][1:].isdigit():
86
+ return ' '.join(sentences[1:])
87
+ return text
88
+
89
+ def process_discussions_by_url_list(url_list):
90
+ all_discussions = []
91
+ for url in url_list:
92
+ discussions = extract_discussions(url)
93
+ all_discussions.extend(discussions)
94
+ df = pd.DataFrame(all_discussions)
95
+ if not df.empty:
96
+ df['discussion'] = df['discussion'].apply(remove_first_sentence_if_q_number)
97
+ return df
98
+
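A short sketch of how the year-based helpers above chain together (requires network access; the year key is an example):

year_urls = get_year_urls()                         # e.g. {'2023': 'https://www.wikidata.org/...'}
day_urls = get_month_day_urls(year_urls['2023'])    # archived day pages for that year
rfd_df = process_discussions_by_url_list(day_urls)  # one row per deletion discussion
print(rfd_df[['title', 'outcome']].head())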
99
+
100
+ ########################
101
+ ## Title based search ##
102
+ ########################
103
+
104
+ import requests
105
+ from bs4 import BeautifulSoup
106
+ import pandas as pd
107
+ import pysbd
108
+
109
+ def html_to_plaintext(html_content):
110
+ soup = BeautifulSoup(html_content, 'html.parser')
111
+ for tag in soup.find_all(['p', 'li', 'dd', 'dl', 'ul']):
112
+ tag.insert_before('\n')
113
+ tag.insert_after('\n')
114
+ for br in soup.find_all('br'):
115
+ br.replace_with('\n')
116
+ text = soup.get_text(separator=' ', strip=True)
117
+ text = '\n'.join([line.strip() for line in text.splitlines() if line.strip() != ''])
118
+ return text
119
+
120
+ def split_text_into_sentences(text):
121
+ seg = pysbd.Segmenter(language="en", clean=False)
122
+ sentences = seg.segment(text)
123
+ return ' '.join(sentences)
124
+
125
+ def clean_discussion_tag(tag):
126
+ for unwanted in tag.find_all(['span', 'img', 'a', 'div'], recursive=True):
127
+ unwanted.decompose()
128
+ return tag.get_text(separator=' ', strip=True)
129
+
130
+ def extract_outcome_from_text_elements(elements):
131
+ consensus_keywords = [
132
+ 'Deleted', 'Delete', 'delete', 'deleted',
133
+ 'kept', 'keep', 'Keep', 'Kept',
134
+ 'merge', 'Merge', 'Not done', 'No consensus', 'no consensus'
135
+ ]
136
+ for el in elements:
137
+ b_tags = el.find_all('b')
138
+ for b in b_tags:
139
+ if b.text.strip() in consensus_keywords:
140
+ return b.text.strip()
141
+ return ''
142
+
143
+ def extract_discussion_section(soup, title):
144
+ h2_tag = soup.find('h2', id=title)
145
+ if not h2_tag:
146
+ print(f"No heading found with id={title}")
147
+ return '', '', ''
148
+
149
+ heading_div = h2_tag.find_parent('div', class_='mw-heading mw-heading2 ext-discussiontools-init-section')
150
+ if not heading_div:
151
+ print(f"No heading div found for {title}")
152
+ return '', '', ''
153
+
154
+ next_heading_div = heading_div.find_next('div', class_='mw-heading mw-heading2 ext-discussiontools-init-section')
155
+ discussion_nodes = []
156
+ for sibling in heading_div.next_siblings:
157
+ if sibling == next_heading_div:
158
+ break
159
+ discussion_nodes.append(sibling)
160
+
161
+ discussion_tags = []
162
+ for node in discussion_nodes:
163
+ if getattr(node, 'name', None) in ['p', 'ul', 'dl']:
164
+ if node.has_attr('class') and 'plainlinks' in node['class']:
165
+ continue
166
+ if node.get('style', '').lower() == 'visibility:hidden;display:none':
167
+ continue
168
+ if node.find('span', id=title):
169
+ continue
170
+ discussion_tags.append(node)
171
+
172
+ if not discussion_tags:
173
+ return '', '', ''
174
+
175
+ label = extract_outcome_from_text_elements(discussion_tags)
176
+ discussion_html_parts = [str(tag) for tag in discussion_tags]
177
+ cleaned_parts = []
178
+ for tag in discussion_tags:
179
+ text = clean_discussion_tag(tag)
180
+ if text:
181
+ cleaned_parts.append(text)
182
+
183
+ cleaned_discussion = ' '.join(cleaned_parts)
184
+ discussion_html = '\n'.join(discussion_html_parts)
185
+ return discussion_html, label, cleaned_discussion
186
+
187
+ def extract_div_from_title(title, url=''):
188
+ if url=='' or not url:
189
+ base_url = 'https://www.wikidata.org/wiki/Wikidata:Requests_for_deletions'
190
+ url = base_url + '#' + title
191
+ text_url = base_url
192
+ discussion_url = text_url + '#' + title
193
+
194
+
195
+ response = requests.get(url)
196
+ if response.status_code != 200:
197
+ print(f"Could not fetch {url}")
198
+ return pd.DataFrame(columns=['title', 'text_url', 'discussion_url', 'discussion_cleaned', 'label'])
199
+ if title == '':
200
+ title = url.split('#')[-1]
201
+
202
+ soup = BeautifulSoup(response.content, 'html.parser')
203
+ discussion_html, label, cleaned_discussion = extract_discussion_section(soup, title)
204
+
205
+ text_url = 'https://www.wikidata.org/wiki/'+ url.split('#')[0]
206
+ discussion_url = url
207
+
208
+ df = pd.DataFrame([[title, text_url, discussion_url, cleaned_discussion, label]],
209
+ columns=['title', 'text_url', 'discussion_url', 'discussion_cleaned', 'label'])
210
+ if label:
211
+ df['label'] = df['label'].replace({
212
+ 'Deleted':'delete', 'Delete':'delete', 'delete':'delete', 'deleted':'delete',
213
+ 'kept':'keep', 'keep':'keep', 'Keep':'keep', 'Kept':'keep',
214
+ 'merge':'merge', 'Merge':'merge', 'Not done':'no_consensus',
215
+ 'No consensus':'no_consensus', 'no consensus':'no_consensus'
216
+ })
217
+ df['discussion_cleaned'] = df['discussion_cleaned'].apply(split_text_into_sentences)
218
+
219
+ return df
220
+
221
+
222
+ ########################
223
+ ## Collection function ##
224
+ ########################
225
+
226
+
227
+ import pandas as pd
228
+
229
+ def collect_wikidata_entity(mode='year', title='', url='', years=[]):
230
+ if mode not in ['title', 'year','url']:
231
+ raise ValueError("mode must be 'title', 'year', or 'url'")
232
+
233
+ if mode == 'title':
234
+ if not title or years:
235
+ raise ValueError("For 'title' mode, 'title' must be provided and 'years' must be empty.")
236
+ df = extract_div_from_title(title)
237
+ df = df.rename(columns={'label':'outcome', 'discussion_cleaned':'discussion'})
238
+ return df
239
+ elif mode == 'url':
240
+ if 'Archive' in url:
241
+ archived_url = url.split('#')[0]
242
+ title = url.split('#')[-1]
243
+ disc_df = process_discussions_by_url_list([archived_url])
244
+ disc_df['title'] = disc_df['title'].str.strip()
245
+ title = title.strip()
246
+ df = disc_df[disc_df['title'] == title]
247
+ print(f"Found {len(df)} discussions for title {title}")
248
+ if df.empty:
249
+ return pd.DataFrame(columns=['title', 'text_url', 'discussion_url', 'discussion_cleaned', 'label'])
250
+ df = df.rename(columns={'label':'outcome', 'discussion_cleaned':'discussion'})
251
+ return df
252
+ if not url or title or years:
253
+ raise ValueError("For 'url' mode, 'url' must be provided and 'title' must be empty.")
254
+ df = extract_div_from_title('', url)
255
+ df = df.rename(columns={'label':'outcome', 'discussion_cleaned':'discussion'})
256
+ return df
257
+
258
+ elif mode == 'year':
259
+ if title or not years:
260
+ raise ValueError("For 'year' mode, 'years' must be provided and 'title' must be empty.")
261
+ if isinstance(years, list) and len(years) == 2:
262
+ start_year, end_year = years
263
+ years = list(range(start_year, end_year + 1))
264
+ elif isinstance(years, int):
265
+ years = [years]
266
+ df = pd.DataFrame()
267
+ for year in years:
268
+ print(f"Processing year: {year}")
269
+
270
+ year_urls = get_year_urls()
271
+ if str(year) not in year_urls:
272
+ print(f"No URL found for year {year}")
273
+ continue
274
+ year_url = year_urls[str(year)]
275
+ month_day_urls = get_month_day_urls(year_url)
276
+ print(f"Found {len(month_day_urls)} month-day URLs for {year}")
277
+ discussions_df = process_discussions_by_url_list(month_day_urls)
278
+
279
+ if discussions_df.empty:
280
+ continue
281
+
282
+ discussions_df.rename(columns={'url':'discussion_url', 'outcome':'label', 'discussion':'discussion_cleaned'}, inplace=True)
283
+ text_url = year_url
284
+ discussions_df['text_url'] = text_url
285
+ discussions_df['label'] = discussions_df['label'].replace({
286
+ 'Deleted':'delete', 'Delete':'delete', 'delete':'delete', 'deleted':'delete',
287
+ 'kept':'keep', 'keep':'keep', 'Keep':'keep', 'Kept':'keep',
288
+ 'merge':'merge', 'Merge':'merge', 'Not done':'no_consensus',
289
+ 'No consensus':'no_consensus', 'no consensus':'no_consensus'
290
+ })
291
+
292
+ desired_columns = ['title', 'text_url', 'discussion_url', 'discussion_cleaned', 'label']
293
+ for col in desired_columns:
294
+ if col not in discussions_df.columns:
295
+ discussions_df[col] = ''
296
+ discussions_df = discussions_df[desired_columns]
297
+
298
+ df = pd.concat([df, discussions_df], ignore_index=True)
299
+ df = df.rename(columns={'label':'outcome', 'discussion_cleaned':'discussion'})
300
+ return df
301
+
302
+
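A minimal usage sketch for collect_wikidata_entity, assuming network access; the entity ID, archive date and years are placeholders:

# Title mode: an entity listed on the live Requests_for_deletions page
df_title = collect_wikidata_entity(mode='title', title='Q4115189')

# URL mode: an archived discussion addressed as '...Archive/<yyyy/mm/dd>#<Qid>'
df_url = collect_wikidata_entity(
    mode='url',
    url='https://www.wikidata.org/wiki/Wikidata:Requests_for_deletions/Archive/2023/01/15#Q4115189')

# Year mode: crawl every archived day page in the given range
df_year = collect_wikidata_entity(mode='year', years=[2022, 2023])
print(df_year[['title', 'outcome']].head())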
collect_data_wikidata_prop.py ADDED
@@ -0,0 +1,317 @@
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import pandas as pd
4
+ import pysbd
5
+ import re
6
+
7
+ #####################
8
+ # Utility functions #
9
+ #####################
10
+
11
+ def html_to_plaintext(html_content):
12
+ soup = BeautifulSoup(html_content, 'html.parser')
13
+ for tag in soup.find_all(['p', 'li', 'dd', 'dl', 'ul']):
14
+ tag.insert_before('\n')
15
+ tag.insert_after('\n')
16
+ for br in soup.find_all('br'):
17
+ br.replace_with('\n')
18
+
19
+ text = soup.get_text(separator=' ', strip=True)
20
+ text = '\n'.join([line.strip() for line in text.splitlines() if line.strip() != ''])
21
+ return text
22
+
23
+ def split_text_into_sentences(text):
24
+ seg = pysbd.Segmenter(language="en", clean=False)
25
+ sentences = seg.segment(text)
26
+ return ' '.join(sentences)
27
+
28
+ def process_html_to_plaintext(df):
29
+ if df.empty:
30
+ return df
31
+ if 'discussion' in df.columns:
32
+ df['discussion_cleaned'] = df['discussion'].apply(html_to_plaintext)
33
+ return df
34
+
35
+ def process_split_text_into_sentences(df):
36
+ if df.empty:
37
+ return df
38
+ if 'discussion_cleaned' in df.columns:
39
+ df['discussion_cleaned'] = df['discussion_cleaned'].apply(split_text_into_sentences)
40
+ return df
41
+
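A small end-to-end illustration of the two DataFrame helpers above on a one-row frame (hypothetical HTML; the expected output follows from get_text and pysbd):

df = pd.DataFrame({'discussion': ['<p>Keep.</p><p>Clearly notable subject.</p>']})
df = process_html_to_plaintext(df)           # adds 'discussion_cleaned'
df = process_split_text_into_sentences(df)   # re-joins pysbd sentences with single spaces
print(df['discussion_cleaned'][0])           # 'Keep. Clearly notable subject.'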
42
+ ###########################
43
+ # Year-based extraction #
44
+ ###########################
45
+
46
+ def extract_outcome_from_div(div):
47
+ try:
48
+ consensus_keywords = ['Deleted', 'Delete', 'delete', 'deleted', 'kept', 'keep', 'Keep', 'Kept', 'merge', 'Merge', 'Not done', 'No consensus', 'no consensus']
49
+ dd_tags = div.find_all('dd')
50
+ for dd in dd_tags:
51
+ b_tag = dd.find('b')
52
+ if b_tag and b_tag.text.strip() in consensus_keywords:
53
+ return b_tag.text.strip()
54
+ img_tag = dd.find('img')
55
+ if img_tag and 'X_mark.svg' in img_tag.get('src', ''):
56
+ next_b_tag = dd.find_next('b')
57
+ if next_b_tag and next_b_tag.text.strip() in consensus_keywords:
58
+ return next_b_tag.text.strip()
59
+
60
+ return 'no consensus'
61
+ except Exception as e:
62
+ print(f"Error extracting outcome: {e}")
63
+ return 'unknown'
64
+
65
+
66
+ def extract_cleaned_discussion(div):
67
+ discussion_parts = []
68
+ discussion_items = div.find_all(['li', 'dd'])
69
+
70
+ for item in discussion_items:
71
+ for tag in item.find_all(['span', 'img', 'a']):
72
+ tag.decompose()
73
+ cleaned_text = item.get_text(separator=' ', strip=True)
74
+ discussion_parts.append(cleaned_text)
75
+ return ' '.join(discussion_parts)
76
+
77
+ def extract_div_contents_with_additional_columns(url):
78
+ response = requests.get(url)
79
+ if response.status_code != 200:
80
+ return pd.DataFrame(columns=['title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'verdict', 'discussion'])
81
+
82
+ soup = BeautifulSoup(response.content, 'html.parser')
83
+ divs = soup.find_all('div', class_='boilerplate metadata discussion-archived mw-archivedtalk')
84
+ if len(divs) == 0:
85
+ print(f"No discussions found in {url}. Please check the structure.")
86
+
87
+ data = []
88
+ for i, div in enumerate(divs):
89
+ try:
90
+ heading_div = div.find_previous('div', class_='mw-heading mw-heading2 ext-discussiontools-init-section')
91
+ if heading_div:
92
+ h2_tag = heading_div.find('h2')
93
+ if h2_tag:
94
+ id = h2_tag.get('id', 'Unknown ID')
95
+ if id:
96
+ text_url = url+'#' + id
97
+ title = id.replace('(page does not exist)', '').strip()
98
+ else:
99
+ title = "Unknown Title"
100
+ text_url = "Unknown URL"
101
+ else:
102
+ title = "Unknown Title"
103
+ text_url = "Unknown URL"
104
+ else:
105
+ # fallback for rare cases
106
+ title = "Unknown Title"
107
+ text_url = "Unknown URL"
108
+
109
+ deletion_discussion = div.prettify()
110
+ label = extract_outcome_from_div(div)
111
+ cleaned_discussion = extract_cleaned_discussion(div)
112
+ parts = deletion_discussion.split('<div class="mw-heading mw-heading3">')
113
+ discussion = parts[0] if len(parts) > 0 else ''
114
+ verdict = '<div class="mw-heading mw-heading3">' + parts[1] if len(parts) > 1 else ''
115
+
116
+ data.append([title, text_url, deletion_discussion, label, '', cleaned_discussion, verdict])
117
+ except Exception as e:
118
+ print(f"Error processing div #{i} in {url}: {e}")
119
+ continue
120
+
121
+ df = pd.DataFrame(data, columns=['title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'discussion', 'verdict'])
122
+ return df
123
+
124
+ def scrape_wikidata_deletions(wikidata_url):
125
+ months_data = []
126
+ month_found = False
127
+ for month in range(1, 13):
128
+ month_url = f"{wikidata_url}/{month}"
129
+ print(f"Processing month: {month}")
130
+ response = requests.get(month_url)
131
+ if response.status_code == 200:
132
+ df = extract_div_contents_with_additional_columns(month_url)
133
+ if not df.empty:
134
+ df = process_html_to_plaintext(df)
135
+ df['discussion_cleaned'] = df['discussion_cleaned'].apply(lambda x: ' '.join(pysbd.Segmenter(language="en", clean=False).segment(x)[1:]) if x else x)
136
+ months_data.append(df)
137
+ month_found = True
138
+ else:
139
+ print(f"No month-specific page found for {month_url}.")
140
+
141
+ if month_found and months_data:
142
+ all_data = pd.concat(months_data, ignore_index=True)
143
+ return all_data
144
+
145
+ print(f"Attempting year-based extraction for base URL: {wikidata_url}")
146
+ df = extract_div_contents_with_additional_columns(wikidata_url)
147
+ if not df.empty:
148
+ df = process_html_to_plaintext(df)
149
+ df['discussion_cleaned'] = df['discussion_cleaned'].apply(lambda x: ' '.join(pysbd.Segmenter(language="en", clean=False).segment(x)[1:]) if x else x)
150
+ return df
151
+
152
+ print("No data found using month-specific or year-based extraction.")
153
+ return pd.DataFrame()
154
+
155
+ ############################
156
+ # Title-based extraction #
157
+ ############################
158
+
159
+ def extract_outcome_from_text_elements(elements):
160
+ consensus_keywords = [
161
+ 'Deleted', 'Delete', 'delete', 'deleted',
162
+ 'kept', 'keep', 'Keep', 'Kept',
163
+ 'merge', 'Merge', 'Not done', 'No consensus', 'no consensus'
164
+ ]
165
+ for el in elements:
166
+ b_tags = el.find_all('b')
167
+ for b in b_tags:
168
+ if b.text.strip() in consensus_keywords:
169
+ return b.text.strip()
170
+ return ''
171
+
172
+ def clean_discussion_tag(tag):
173
+ for unwanted in tag.find_all(['span', 'img', 'a', 'div'], recursive=True):
174
+ unwanted.decompose()
175
+ return tag.get_text(separator=' ', strip=True)
176
+
177
+ def extract_discussion_section(soup, title):
178
+ h2_tag = soup.find('h2', id=title)
179
+ if not h2_tag:
180
+ print(f"No heading found with id={title}")
181
+ return '', '', ''
182
+ heading_div = h2_tag.find_parent('div', class_='mw-heading mw-heading2 ext-discussiontools-init-section')
183
+ if not heading_div:
184
+ print(f"No heading div found for {title}")
185
+ return '', '', ''
186
+
187
+ next_heading_div = heading_div.find_next('div', class_='mw-heading mw-heading2 ext-discussiontools-init-section')
188
+ discussion_nodes = []
189
+ for sibling in heading_div.next_siblings:
190
+ if sibling == next_heading_div:
191
+ break
192
+ discussion_nodes.append(sibling)
193
+
194
+ discussion_tags = []
195
+ for node in discussion_nodes:
196
+ if getattr(node, 'name', None) in ['p', 'ul', 'dl']:
197
+ if node.find('span', id=title) or node.get('style', '').lower() == 'visibility:hidden;display:none':
198
+ continue
199
+ discussion_tags.append(node)
200
+
201
+ if not discussion_tags:
202
+ return '', '', ''
203
+
204
+ label = extract_outcome_from_text_elements(discussion_tags)
205
+ discussion_html_parts = [str(tag) for tag in discussion_tags]
206
+ cleaned_parts = []
207
+ for tag in discussion_tags:
208
+ text = clean_discussion_tag(tag)
209
+ if text:
210
+ cleaned_parts.append(text)
211
+
212
+ cleaned_discussion = ' '.join(cleaned_parts)
213
+ discussion_html = '\n'.join(discussion_html_parts)
214
+ return discussion_html, label, cleaned_discussion
215
+
216
+ def extract_div_from_title(url, title):
217
+ response = requests.get(url)
218
+ if response.status_code != 200:
219
+ print(f"Could not fetch {url}")
220
+ return pd.DataFrame(columns=['title', 'text_url', 'discussion_url', 'discussion_cleaned', 'label'])
221
+
222
+ soup = BeautifulSoup(response.content, 'html.parser')
223
+ discussion_html, label, cleaned_discussion = extract_discussion_section(soup, title)
224
+
225
+ text_url = 'https://www.wikidata.org/wiki/Wikidata:Properties_for_deletion'
226
+ discussion_url = text_url + '#' + title
227
+
228
+ data = [[title, text_url, discussion_url, cleaned_discussion, label]]
229
+ df = pd.DataFrame(data, columns=['title', 'text_url', 'discussion_url', 'discussion_cleaned', 'label'])
230
+ return df
231
+
232
+ ############################
233
+ # Unified collect function #
234
+ ############################
235
+
236
+ def collect_wikidata(mode='year', title='', url='', years=[]):
237
+ if mode not in ['title', 'year','url']:
238
+ raise ValueError("mode must be 'title', 'year', or 'url'.")
239
+
240
+ if mode == 'title':
241
+
242
+ if not title or years:
243
+ raise ValueError("For 'title' mode, 'title' must be provided and 'years' must be empty.")
244
+ url = 'https://www.wikidata.org/wiki/Wikidata:Properties_for_deletion#' + title
245
+ df = extract_div_from_title(url, title)
246
+ if not df.empty and 'label' in df.columns and df['label'].notnull().any():
247
+ df['label'] = df['label'].replace({
248
+ 'Deleted':'delete', 'Delete':'delete', 'delete':'delete', 'deleted':'delete',
249
+ 'kept':'keep', 'keep':'keep', 'Keep':'keep', 'Kept':'keep',
250
+ 'merge':'merge', 'Merge':'merge', 'Not done':'no_consensus',
251
+ 'No consensus':'no_consensus', 'no consensus':'no_consensus'
252
+ })
253
+ df = df.rename(columns={'discussion_cleaned':'discussion'})
254
+ return df
255
+
256
+ elif mode == 'url':
257
+ if not url or title or years:
258
+ raise ValueError("For 'url' mode, 'url' must be provided and 'title' must be empty.")
259
+ df = extract_div_contents_with_additional_columns(url)
260
+ if not df.empty and 'label' in df.columns and df['label'].notnull().any():
261
+ df['label'] = df['label'].replace({
262
+ 'Deleted':'delete', 'Delete':'delete', 'delete':'delete', 'deleted':'delete',
263
+ 'kept':'keep', 'keep':'keep', 'Keep':'keep', 'Kept':'keep',
264
+ 'merge':'merge', 'Merge':'merge', 'Not done':'no_consensus',
265
+ 'No consensus':'no_consensus', 'no consensus':'no_consensus'
266
+ })
267
+ else:
268
+ raise ValueError("No data found for the provided URL.")
269
+ df = df.rename(columns={'discussion_cleaned':'discussion'})
270
+ return df
271
+
272
+ elif mode == 'year':
273
+ if title or not years:
274
+ raise ValueError("For 'year' mode, 'years' must be provided and 'title' must be empty.")
275
+
276
+ if isinstance(years, list) and len(years) == 2:
277
+ start_year, end_year = years
278
+ years = list(range(start_year, end_year + 1))
279
+ elif isinstance(years, int):
280
+ years = [years]
281
+
282
+ df = pd.DataFrame()
283
+ for year in years:
284
+ wikidata_url = f'https://www.wikidata.org/wiki/Wikidata:Properties_for_deletion/Archive/{year}'
285
+ deletions_df = scrape_wikidata_deletions(wikidata_url)
286
+ if deletions_df.empty:
287
+ continue
288
+
289
+ columns_to_drop = ['confirmation', 'discussion', 'verdict', 'deletion_discussion']
290
+ deletions_df = deletions_df.drop(columns=[col for col in columns_to_drop if col in deletions_df.columns], errors='ignore')
291
+
292
+ if 'label' in deletions_df.columns:
293
+ deletions_df.rename(columns={'label':'label'}, inplace=True)
294
+ deletions_df['label'] = deletions_df['label'].replace({
295
+ 'Deleted':'delete', 'Delete':'delete', 'delete':'delete', 'deleted':'delete',
296
+ 'kept':'keep', 'keep':'keep', 'Keep':'keep', 'Kept':'keep',
297
+ 'merge':'merge', 'Merge':'merge', 'Not done':'no_consensus',
298
+ 'No consensus':'no_consensus', 'no consensus':'no_consensus'
299
+ })
300
+
301
+
302
+ if 'text_url' in deletions_df.columns:
303
+ deletions_df.rename(columns={'text_url':'discussion_url'}, inplace=True)
304
+ deletions_df['text_url'] = wikidata_url
305
+ if 'label' not in deletions_df.columns:
306
+ deletions_df['label'] = ''
307
+
308
+ for col in ['title', 'text_url', 'discussion_url', 'discussion_cleaned', 'label']:
309
+ if col not in deletions_df.columns:
310
+ deletions_df[col] = ''
311
+
312
+ deletions_df = deletions_df[['title', 'text_url', 'discussion_url', 'discussion_cleaned', 'label']]
313
+
314
+ deletions_df['year'] = year
315
+ df = pd.concat([df, deletions_df], ignore_index=True)
316
+ df = df.rename(columns={'discussion_cleaned':'discussion'})
317
+ return df
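A minimal usage sketch for collect_wikidata (properties), assuming network access; the property ID and years are placeholders:

# Title mode: a property currently listed on Properties_for_deletion
df_title = collect_wikidata(mode='title', title='P9999')

# URL mode: a specific archive page
df_url = collect_wikidata(mode='url',
                          url='https://www.wikidata.org/wiki/Wikidata:Properties_for_deletion/Archive/2023')

# Year mode: one or more archive years
df_year = collect_wikidata(mode='year', years=[2022, 2023])
print(df_year[['title', 'discussion', 'label']].head())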
collect_data_wikinews.py ADDED
@@ -0,0 +1,435 @@
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import pandas as pd
4
+ import pysbd
5
+ import re
6
+
7
+
8
+ ################################
9
+ # Year based data collection ###
10
+ ################################
11
+
12
+ def get_soup(url):
13
+ response = requests.get(url)
14
+ response.raise_for_status()
15
+ return BeautifulSoup(response.text, 'html.parser')
16
+
17
+ def html_to_plaintext(html_content):
18
+ soup = BeautifulSoup(html_content, 'html.parser')
19
+ for tag in soup.find_all(['p', 'li', 'dd', 'dl']):
20
+ tag.insert_before('\n')
21
+ tag.insert_after('\n')
22
+ for br in soup.find_all('br'):
23
+ br.replace_with('\n')
24
+
25
+ text = soup.get_text(separator=' ', strip=True)
26
+ text = '\n'.join([line.strip() for line in text.splitlines() if line.strip() != ''])
27
+ return text
28
+
29
+ def extract_fallback_discussion(html_content):
30
+ soup = BeautifulSoup(html_content, 'html.parser')
31
+ discussion_parts = []
32
+ for element in soup.find_all(['p', 'li', 'dd', 'ol'], recursive=False):
33
+ discussion_parts.append(element.get_text(separator=' ', strip=True))
34
+ return ' '.join(discussion_parts).strip()
35
+
36
+ def process_html_to_plaintext(df):
37
+ if df.empty:
38
+ return df
39
+ df['discussion_cleaned'] = df['discussion'].apply(html_to_plaintext)
40
+ for index, row in df.iterrows():
41
+ if not row['discussion_cleaned'].strip():
42
+ df.at[index, 'discussion_cleaned'] = extract_fallback_discussion(row['discussion_uncleaned'])
43
+ return df
44
+
45
+ def extract_outcome_from_div(div):
46
+ try:
47
+ result_phrase = div.find(text=re.compile(r'The result was to'))
48
+ if result_phrase:
49
+ result = result_phrase.find_next('b')
50
+ if result:
51
+ outcome_text = result.text.strip()
52
+ if outcome_text.lower() == "please do not modify it":
53
+ return extract_following_sentence(div) or 'unknown'
54
+ elif validate_outcome(outcome_text) != 'unknown':
55
+ return outcome_text
56
+ li_outcome = div.find('li')
57
+ if li_outcome and li_outcome.find('b'):
58
+ outcome_text = li_outcome.find('b').text.strip()
59
+ if outcome_text.lower() == "please do not modify it":
60
+ return extract_following_sentence(div) or 'unknown'
61
+ elif validate_outcome(outcome_text) != 'unknown':
62
+ return outcome_text
63
+
64
+ dl_outcome = div.find('dl')
65
+ if dl_outcome and dl_outcome.find('b'):
66
+ outcome_text = dl_outcome.find('b').text.strip()
67
+ if outcome_text.lower() == "please do not modify it":
68
+ return extract_following_sentence(div) or 'unknown'
69
+ elif validate_outcome(outcome_text) != 'unknown':
70
+ return outcome_text
71
+
72
+ outcome_italic = div.find('dd')
73
+ if outcome_italic and outcome_italic.find('i'):
74
+ outcome_text = outcome_italic.find('i').get_text(strip=True)
75
+ if outcome_text.lower() == "please do not modify it":
76
+ return extract_following_sentence(div) or 'unknown'
77
+ elif validate_outcome(outcome_text) != 'unknown':
78
+ return outcome_text
79
+ return extract_following_sentence(div) or 'unknown'
80
+
81
+ except Exception as e:
82
+ print(f"Error extracting outcome: {e}")
83
+ return 'unknown'
84
+
85
+
86
+ def extract_following_sentence(div):
87
+ try:
88
+ phrases = [
89
+ "No further edits should be made to this discussion",
90
+ "Please do not add any more comments and votes to this request",
91
+ "No further edits should be made to this discussion."
92
+ ]
93
+
94
+ for phrase in phrases:
95
+ phrase_location = div.find(text=re.compile(phrase))
96
+ if phrase_location:
97
+ following_text = ""
98
+ for sibling in phrase_location.find_all_next(string=True):
99
+ if "Please do not modify it" in sibling:
100
+ continue
101
+ following_text += sibling.strip() + " "
102
+ if "." in sibling:
103
+ break
104
+ sentence = following_text.split('.')[0].strip()
105
+ if validate_outcome(sentence) != 'unknown':
106
+ return sentence
107
+
108
+ return None
109
+
110
+ except Exception as e:
111
+ print(f"Error extracting following sentence: {e}")
112
+ return None
113
+
114
+ def validate_outcome(outcome_text):
115
+ label_mapping = {
116
+ 'delete': [
117
+ 'delete', 'delete ... unanimous', 'deleted', 'deleted as abandoned',
118
+ 'speedy delete', 'Delete', 'delete as redundant to existing template',
119
+ 'delete as unlikely to be used', 'delete but no prejudice against recreation when needed',
120
+ 'delete after Ottawahitech chose not to provide a rationale',
121
+ 'Delete, with no objection to recreation when needed.', 'Deleted',
122
+ 'delete the Cigarette redirect and keep the NHS redirect.', 'Delete all articles', 'Tentatively sending through the usual abandonment process',
123
+ 'Delete all articles','This was completed already.'
124
+ ],
125
+ 'speedy delete': [
126
+
127
+ 'speedy delete', 'speedy deleted', 'speedy deleted test page', 'Speedy-deleted', 'Speedy deleted', 'Speedy-deleted, no meaningful content',
128
+ 'Speeded as "old prep"', 'Speedied as "old prep" -- Pi zero ( talk ) 23:42, 10 February 2020 (UTC) [ reply ] __DTELLIPSISBUTTON__{"threadItem":{"timestamp":"2020-02-10T23:42:00'
129
+ ],
130
+
131
+ 'keep': [
132
+ 'keep',
133
+ 'Do not undelete. The content should be kept by the author off-wiki, and can be included as a part of another story that is current',
134
+ 'Personal details have been redacted and hidden from public view together with a NOINDEX flag',
135
+
136
+ ],
137
+ 'redirect': [
138
+ 'soft redirect'
139
+ ],
140
+ 'merge': [
141
+ 'convert near-clone of mainspace article to use {{topic cat}}; apply {{correction}} to mainspace article'
142
+ ],
143
+ 'no_consensus': [
144
+ 'No consensus to delete. However, there clearly is a consensus that if we are to have this template, we aren\'t to use it in its present form.',
145
+ 'no consensus', 'No consensus',
146
+ "At this time, it's unclear if there's a consensus to keep but abundantly clear there isn't one to delete."
147
+ ],
148
+ 'comment': [
149
+ 'Remove', 'SVT', 'withdraw the deletion request', 'On consideration, speedied as unused and lacking fair-use rationale',
150
+ 'Moved to userspace', 'Withdrawn to allow interview re-focus','More userspace drafts This is the second batch of a large number of draft articles in userspace',
151
+ 'This was completed already ', 'Do not undelete. The content should be kept by the author off-wiki, and can be included as a part of another story that is current',
152
+
153
+ ],
154
+ 'withdrawn': ['Withdrawn to allow interview re-focus',
155
+ ]
156
+ }
157
+
158
+
159
+
160
+ outcome_to_label = {outcome.lower(): label for label, outcomes in label_mapping.items() for outcome in outcomes}
161
+ return outcome_to_label.get(outcome_text.lower(), 'unknown')
162
+
163
+
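For clarity, a minimal sketch of the label collapse performed by validate_outcome above; the input strings come from its mapping, and anything unmapped falls back to 'unknown':

# Illustrative only; exercises validate_outcome as defined above.
assert validate_outcome('Speedy deleted') == 'speedy delete'
assert validate_outcome('No consensus') == 'no_consensus'
assert validate_outcome('completely novel closing text') == 'unknown'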
164
+ def update_unknown_outcomes(df):
165
+ base_url = "https://en.wikinews.org/w/index.php?title="
166
+
167
+ for i in df.index:
168
+ if df.at[i, 'outcome'] == 'unknown':
169
+ title = df.at[i, 'title'].replace(" ", "_")
170
+ url = f"{base_url}{title}&action=edit&redlink=1"
171
+ print(f"Checking page: {url}")
172
+
173
+ try:
174
+ response = requests.get(url)
175
+ if response.status_code == 200:
176
+ page_soup = BeautifulSoup(response.content, 'html.parser')
177
+
178
+ # Look for the specific warning div
179
+ warning_div = page_soup.find('div', class_='cdx-message cdx-message--block cdx-message--warning mw-warning-with-logexcerpt')
180
+ if warning_div:
181
+ df.at[i, 'outcome'] = 'delete'
182
+ else:
183
+ df.at[i, 'outcome'] = 'keep'
184
+ else:
185
+ print(f"Failed to retrieve page: {url}")
186
+
187
+ except Exception as e:
188
+ print(f"Error accessing {url}: {e}")
189
+
190
+ return df
191
+
192
+
193
+ def collect_wikinews_deletions(years=None):
194
+ base_url = 'https://en.wikinews.org/wiki/Wikinews:Deletion_requests/Archives'
195
+ response = requests.get(base_url)
196
+ if response.status_code != 200:
197
+ print("Failed to retrieve the archive page.")
198
+ return None
199
+
200
+ soup = get_soup(base_url)
201
+ titles = []
202
+ text_urls = []
203
+ outcomes = []
204
+ deletion_discussions = []
205
+ discussion_uncleaned = []
206
+ year_links = []
207
+ for a in soup.select('a[href^="/wiki/Wikinews:Deletion_requests/Archives/"]'):
208
+ year_text = re.findall(r'\d{4}', a.get_text())
209
+ if year_text:
210
+ year_links.append((year_text[0], a['href']))
211
+ if years:
212
+ if len(years) == 1:
213
+ start_year = end_year = years[0]
214
+ elif len(years) == 2:
215
+ start_year, end_year = min(years), max(years)
216
+ else:
217
+ print("Invalid years input. Provide one or two years.")
218
+ return None
219
+ year_links = [(year, link) for year, link in year_links if start_year <= int(year) <= end_year]
220
+ for year, year_link in year_links:
221
+ year_url = 'https://en.wikinews.org' + year_link
222
+ print(f"Processing year: {year_url}")
223
+ year_soup = get_soup(year_url)
224
+ discussion_divs = year_soup.find_all('div', class_=lambda x: x and 'boilerplate metadata' in x)
225
+
226
+ for div in discussion_divs:
227
+ title_tag = div.find(['h2', 'h3'])
228
+ if title_tag:
229
+ link_tag = title_tag.find('a', title=True)
230
+ if link_tag:
231
+ title = link_tag.get_text(strip=True)
232
+ titles.append(title)
233
+ text_url = year_url + '#' + link_tag['title'].replace(' ', '_')
234
+ text_urls.append(text_url)
235
+ else:
236
+ titles.append(title_tag.get_text(strip=True))
237
+ text_urls.append(year_url)
238
+ else:
239
+ dl_tag = div.find('dl')
240
+ if dl_tag and dl_tag.find('b'):
241
+ titles.append(dl_tag.find('b').get_text(strip=True))
242
+ else:
243
+ titles.append('No title found')
244
+ text_urls.append(year_url)
245
+ deletion_discussions.append(div.prettify())
246
+ discussion_uncleaned.append(div.prettify())
247
+ outcome = extract_outcome_from_div(div)
248
+ outcomes.append(outcome)
249
+
250
+ df = pd.DataFrame({
251
+ 'title': titles,
252
+ 'url': text_urls,
253
+ 'outcome': outcomes,
254
+ 'discussion': deletion_discussions,
255
+ 'discussion_uncleaned': discussion_uncleaned
256
+ })
257
+
258
+
259
+ df = process_html_to_plaintext(df)
260
+ for i in df.index:
261
+ if df.at[i,'outcome'] == 'Please do not modify it' or df.at[i,'outcome'] == 'Please do not modify it.':
262
+ df.at[i,'outcome'] = extract_following_sentence(BeautifulSoup(df.at[i,'discussion_uncleaned'], 'html.parser')) or 'unknown'
263
+ df['outcome'] = df['outcome'].apply(lambda x: validate_outcome(x) if x else 'unknown')
264
+ df = update_unknown_outcomes(df)
265
+ return df
266
+
267
+ def collect_wikinews(years=None):
268
+ df = collect_wikinews_deletions(years=years)
269
+ if df is None:
270
+ print('Error collecting Wikinews deletions.')
271
+ return None
272
+ return df
273
+
274
+
275
+ ##################################
276
+ ## Title based data collection ##
277
+ ##################################
278
+
279
+ def html_to_plaintext(html_content):
280
+ soup = BeautifulSoup(html_content, 'html.parser')
281
+ for tag in soup.find_all(['p', 'li', 'dd', 'dl', 'ul']):
282
+ tag.insert_before('\n')
283
+ tag.insert_after('\n')
284
+ for br in soup.find_all('br'):
285
+ br.replace_with('\n')
286
+ text = soup.get_text(separator=' ', strip=True)
287
+ text = '\n'.join([line.strip() for line in text.splitlines() if line.strip() != ''])
288
+ return text
289
+
290
+ def split_text_into_sentences(text):
291
+ seg = pysbd.Segmenter(language="en", clean=False)
292
+ sentences = seg.segment(text)
293
+ return ' '.join(sentences)
294
+
295
+ def clean_discussion_tag(tag):
296
+ for unwanted in tag.find_all(['span', 'img', 'a', 'div'], recursive=True):
297
+ unwanted.decompose()
298
+ return tag.get_text(separator=' ', strip=True)
299
+
300
+ def extract_outcome_from_text_elements(elements):
301
+ consensus_keywords = [
302
+ 'Deleted', 'Delete', 'delete', 'deleted',
303
+ 'kept', 'keep', 'Keep', 'Kept',
304
+ 'merge', 'Merge', 'Not done', 'No consensus', 'no consensus', 'Done'
305
+ ]
306
+ for el in elements:
307
+ b_tags = el.find_all('b')
308
+ for b in b_tags:
309
+ if b.text.strip() in consensus_keywords:
310
+ return b.text.strip()
311
+ return ''
312
+
313
+ def extract_discussion_section(soup, title):
314
+ """Extracts discussion section, label, and cleaned text."""
315
+ try:
316
+ h3_id = title.replace(" ", "_")
317
+ h3_tag = soup.find('h3', {'id': h3_id})
318
+
319
+ if not h3_tag:
320
+ print(f"h3 tag with id '{h3_id}' not found.")
321
+ return '', '', ''
322
+
323
+ heading_div = h3_tag.parent
324
+
325
+ if not heading_div:
326
+ print("Parent div not found.")
327
+ return '', '', ''
328
+
329
+ next_heading_div = heading_div.find_next_sibling('div', class_='mw-heading mw-heading3')
330
+ discussion_nodes = []
331
+ for sibling in heading_div.next_siblings:
332
+ if sibling == next_heading_div:
333
+ break
334
+ discussion_nodes.append(sibling)
335
+
336
+ discussion_tags = []
337
+ for node in discussion_nodes:
338
+ if getattr(node, 'name', None) in ['p', 'ul', 'dl']:
339
+ discussion_tags.append(node)
340
+
341
+ if not discussion_tags:
342
+ return '', '', ''
343
+
344
+ label = extract_outcome_from_text_elements(discussion_tags)
345
+
346
+ discussion_html_parts = [str(tag) for tag in discussion_tags]
347
+ cleaned_parts = []
348
+ for tag in discussion_tags:
349
+ text = clean_discussion_tag(tag)
350
+ if text:
351
+ cleaned_parts.append(text)
352
+
353
+ cleaned_discussion = ' '.join(cleaned_parts)
354
+ discussion_html = '\n'.join(discussion_html_parts)
355
+ return discussion_html, label, cleaned_discussion
356
+
357
+ except Exception as e:
358
+ print(f"Error processing title '{title}': {e}")
359
+ import traceback
360
+ traceback.print_exc()
361
+ return '', '', ''
362
+
363
+ def extract_div_from_title(title):
364
+ base_url = 'https://en.wikinews.org/wiki/Wikinews:Deletion_requests'
365
+ t = title.replace(' ', '_')
366
+ url = base_url + '#' + t
367
+
368
+ response = requests.get(url)
369
+ if response.status_code != 200:
370
+ return pd.DataFrame(columns=['title', 'text_url', 'discussion_url', 'discussion_cleaned', 'label'])
371
+
372
+ soup = BeautifulSoup(response.content, 'html.parser')
373
+ discussion_html, label, cleaned_discussion = extract_discussion_section(soup, title)
374
+
375
+ text_url = base_url
376
+ discussion_url = text_url + '#' + title.replace(' ', '_')
377
+
378
+ df = pd.DataFrame([[title, text_url, discussion_url, cleaned_discussion, label]],
379
+ columns=['title', 'text_url', 'discussion_url', 'discussion_cleaned', 'label'])
380
+
381
+ if label:
382
+ df['label'] = df['label'].replace({
383
+ 'Deleted':'delete', 'Delete':'delete', 'delete':'delete', 'deleted':'delete',
384
+ 'kept':'keep', 'keep':'keep', 'Keep':'keep', 'Kept':'keep',
385
+ 'merge':'merge', 'Merge':'merge', 'Not done':'no_consensus',
386
+ 'No consensus':'no_consensus', 'no consensus':'no_consensus', 'Done':'delete'
387
+ })
388
+
389
+ df['discussion_cleaned'] = df['discussion_cleaned'].apply(split_text_into_sentences)
390
+ df = df.rename(columns={'discussion_cleaned':'discussion'})
391
+ return df
392
+
393
+ ########################
394
+ ## Umbrella function ##
395
+ ########################
396
+
397
+ def collect_wikinews(mode, title=None, url ='', year=None):
398
+
399
+ if mode == 'title':
400
+ if not title:
401
+ raise ValueError("Title is required for 'title' mode.")
402
+ return extract_div_from_title(title)
403
+
404
+ elif mode == 'url':
405
+ if not url:
+ raise ValueError("URL is required for 'url' mode.")
+ if 'Archives' in url.split('/')[-2]:
406
+ year = int(url.split('/')[-1].split('#')[0])
407
+ print(f"Year extracted from URL: {year}")
408
+ df = collect_wikinews_deletions(years=[year])
409
+ #keep the row with the title only
410
+ df = df[df['title'] == url.split('#')[-1].replace('_', ' ')]
411
+ if df.empty:
412
+ return pd.DataFrame(columns=['title', 'text_url', 'discussion_url', 'discussion_cleaned', 'label'])
413
+ df = df[['title','url','discussion_cleaned','outcome']]
414
+ df = df.rename(columns={'discussion_cleaned':'discussion'})
415
+ return df
416
+
417
419
+ title = url.split('#')[-1].replace('_', ' ')
420
+ print(f"Title extracted from URL: {title}")
421
+ return extract_div_from_title(title)
422
+
423
+ elif mode == 'year':
424
+ if not year:
425
+ raise ValueError("Year or year range is required for 'year' mode.")
426
+ return collect_wikinews_deletions(years=year)
427
+
428
+ else:
429
+ raise ValueError("Invalid mode. Please specify 'title' or 'year' or 'url'.")
430
+
431
+ # year_df = collect_wikinews(mode='year', year=[2023])
432
+ # title_df = collect_wikinews(mode='title', title="NurMi spam")
433
+
434
+ # print(year_df)
435
+ # print(title_df)
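For completeness, a hedged usage sketch of the 'url' mode to sit alongside the commented examples above; the archive URL and fragment below are placeholders:

# url_df = collect_wikinews(mode='url',
#     url='https://en.wikinews.org/wiki/Wikinews:Deletion_requests/Archives/2023#Some_request')
# print(url_df)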
collect_data_wikiquote.py ADDED
@@ -0,0 +1,151 @@
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import pandas as pd
4
+ import pysbd
5
+ import re
6
+
7
+ def extract_outcome_from_div(div):
8
+ try:
9
+ # Extracting the decision from <b> tag that contains result like 'no consensus', 'deleted', etc.
10
+ result_tag = div.find(text=re.compile(r'The result was:'))
+ result = result_tag.find_next('b') if result_tag else None
11
+ if result:
12
+ return result.text.strip()
13
+ return 'no consensus'
14
+ except Exception as e:
15
+ print(f"Error extracting outcome: {e}")
16
+ return 'unknown'
17
+
18
+ def html_to_plaintext(html_content):
19
+ soup = BeautifulSoup(html_content, 'html.parser')
20
+ for tag in soup.find_all(['p', 'li', 'dd', 'dl']):
21
+ tag.insert_before('\n')
22
+ tag.insert_after('\n')
23
+ for br in soup.find_all('br'):
24
+ br.replace_with('\n')
25
+
26
+ text = soup.get_text(separator=' ', strip=True)
27
+ text = '\n'.join([line.strip() for line in text.splitlines() if line.strip() != ''])
28
+ return text
29
+
30
+ def process_html_to_plaintext(df):
31
+ if df.empty:
32
+ return df
33
+ df['discussion_cleaned'] = df['discussion'].apply(html_to_plaintext)
34
+ return df
35
+
36
+ def split_text_into_sentences(text):
37
+ seg = pysbd.Segmenter(language="en", clean=False)
38
+ sentences = seg.segment(text)
39
+ for i, sentence in enumerate(sentences):
40
+ if 'The result was:' in sentence:
41
+ return ' '.join(sentences[i+1:])
42
+ return ' '.join(sentences[1:])
43
+
44
+
45
+ def process_split_text_into_sentences(df):
46
+ if df.empty:
47
+ return df
48
+ df['discussion_cleaned'] = df['discussion_cleaned'].apply(split_text_into_sentences)
49
+ df['discussion_cleaned'] = df['discussion_cleaned'].apply(lambda x: x.replace("The above discussion is preserved as an archive of the debate. Please do not modify it. Subsequent comments should be made on the appropriate discussion page (such as the article's talk page or in a deletion review ). No further edits should be made to this page.", ''))
50
+ #df['discussion_cleaned'] = df['discussion_cleaned'].apply(cleanup_initial_sentences)
51
+ return df
52
+
53
+ def collect_wikiquote_title(title='all', base_url='https://en.wikiquote.org/wiki/Wikiquote:Votes_for_deletion_archive'):
54
+ titles = []
55
+ text_urls = []
56
+ labels = []
57
+ deletion_discussions = []
58
+ if title == 'all':
59
+ url = base_url
60
+ else:
61
+ url = base_url + '#' + title.replace(' ', '_')
62
+
63
+ response = requests.get(url)
64
+
65
+ if response.status_code == 200:
66
+ soup = BeautifulSoup(response.text, 'html.parser')
67
+
68
+ if title == 'all':
69
+ divs = soup.find_all('div', class_='boilerplate metadata vfd')
70
+ else:
71
+ # For specific title, find a div that matches the title
72
+ divs = soup.find_all('div', class_='boilerplate metadata vfd')
73
+ divs = [div for div in divs if div.find('div', class_="mw-heading mw-heading2 ext-discussiontools-init-section") and title in div.find('div', class_="mw-heading mw-heading2 ext-discussiontools-init-section").text]
74
+
75
+ no_divs = len(divs)
76
+ print(f"Found {no_divs} div(s) with the expected classes.")
77
+
78
+ if no_divs >= 1:
79
+ for div in divs:
80
+ heading_div = div.find('div', class_="mw-heading mw-heading2 ext-discussiontools-init-section")
81
+ if heading_div:
82
+ found_title = heading_div.text.strip()
83
+ titles.append(found_title.replace('[edit]', ''))
84
+ text_url = base_url + '#' + found_title.replace(' ', '_')
85
+ text_urls.append(text_url)
86
+ label = extract_outcome_from_div(div)
87
+ labels.append(label)
88
+ deletion_discussions.append(div.prettify())
89
+ else:
90
+ print("No heading div found with the expected classes.")
91
+
92
+ df = pd.DataFrame({'title': titles, 'text_url': text_urls, 'label': labels, 'discussion': deletion_discussions})
93
+ df = process_html_to_plaintext(df)
94
+ df = process_split_text_into_sentences(df)
95
+ df['label'] = df['label'].replace({
96
+ 'Deleted':'delete', 'Delete':'delete', 'delete':'delete', 'deleted':'delete', 'deleted.':'delete', 'speedy deleted test page':'delete', 'Deleted and protected with a message':'delete',
97
+ 'delete both':'delete', 'delete everything':'delete', 'Deleted due to copyvio':'delete', 'delete after various merges':'delete', 'delete 3 quoteless, keep 1 redirect':'delete',
98
+ 'Consensus to remove from Wikiquote, but only if it is not merged into another article':'delete', 'Consensus to remove from Wikiquote, but not how':'delete', 'delete, pending technical fix':'delete', 'delete all':'delete', 'delete Portal:portal, no consensus/keep Template:Wikimedia':'delete',
99
+ 'Speedy-deleted':'delete', 'Speedy deleted':'delete', 'Speedy-deleted, no meaningful content':'delete',
100
+ 'kept':'keep', 'Kept.':'keep', 'Keep':'keep', 'keep':'keep', 'Kept':'keep', 'No consensus/keep':'keep', 'kept/no consensus':'keep', 'Kept; lack of consensus':'keep', 'kept after copyvio removal':'keep',
101
+ 'Speedy-kept':'keep', 'Speedy kept':'keep',
102
+ 'merge':'merge', 'Merge':'merge', 'merged':'merge', 'Merged':'merge', 'merged into Azerbaijani proverbs':'merge', 'Merge with Stephen Covey':'merge', 'Merge with Lyrics':'merge',
103
+ 'merge and redirect':'merge', 'merge with Crusade (TV series)':'merge', 'Merged to Health':'merge', 'merge with 3rd Rock from the Sun':'merge',
104
+ 'redirect to List of proverbs':'redirect', 'keep as redirect':'redirect', 'Redirect to Inuyasha':'redirect', 'Redirected to Humor':'redirect', 'Redirected to Doctor Who':'redirect',
105
+ 'Redirect without text':'redirect', 'Proverbs turned to redirect to List of proverbs':'redirect', 'redirect to Drugs':'redirect', 'redirect to Advertising slogans':'redirect',
106
+ 'redirect to Jalal al-Din Muhammad Rumi':'redirect', 'redirect':'redirect', 'Redirected':'redirect', 'move to Category:United States Marines':'redirect', 'move to Die Hard: With a Vengeance':'redirect',
107
+ 'move to Star Wars Jedi Knight: Jedi Academy':'redirect', 'move to Lucien Lévy-Bruhl':'redirect', 'move to Dave Finlay':'redirect', 'move to User:Quenzer':'redirect', 'moved':'redirect',
108
+ 'moved to Monument inscriptions':'redirect', 'transwiki to Wikipedia, then delete':'redirect', 'Transwiki to Wikipedia':'redirect', 'Transwiki to Wikipedia':'redirect',
109
+ 'delete His Holiness the Dalai Lama, redirect Dalai Lama to Tenzin Gyatso, 14th Dalai Lama':'redirect',
110
+ 'move':'redirect', 'keep Just war theory, redirect Just war, delete Just War Theory':'no_consensus','move to Wikisource':'redirect',\
111
+ 'kept.':'keep', 'Keep as Redirect':'redirect', 'Deleted.':'delete', '1 delete, 1 redirect':'redirect', 'moved to User:Quenzer':'redirect',\
112
+ 'transwiki, then delete':'delete', 'merge with Lyrics':'redirect','Deleted all three images':'delete',\
113
+ 'No consensus':'no_consensus', 'no consensus':'no_consensus', 'inconclusive; no action taken.':'no_consensus', 'UNIDENTIFIED':'no_consensus'
114
+ })
115
+ return df
116
+ else:
117
+ print("No divs found with the expected classes.")
118
+ return None
119
+ else:
120
+ print("Failed to retrieve the page.")
121
+ return None
122
+
123
+ def collect_wikiquote(mode ='title',title = 'all', url = ''):
124
+ if mode not in ['title', 'url']:
125
+ raise ValueError("mode must be either 'title' or 'url'.")
126
+ if mode == 'title' and title == 'all':
127
+ base_url = 'https://en.wikiquote.org/wiki/Wikiquote:Votes_for_deletion_archive'
128
+ df = collect_wikiquote_title(title, base_url)
129
+ if df is not None:
130
+ if 'discussion_cleaned' in df.columns:
131
+ df = df[['title', 'text_url', 'label', 'discussion_cleaned']]
132
+ df = df.rename(columns={'discussion_cleaned': 'discussion'})
133
+ return df
134
+ elif mode == 'url':
135
+ df = collect_wikiquote_title('all', url)
136
+ title = url.split('#')[-1].replace('_', ' ')
137
+ df = df[df['title'].str.lower() == title.lower()].reset_index(drop=True)
138
+ if not df.empty:
139
+ if 'discussion_cleaned' in df.columns:
140
+ df = df[['title', 'text_url', 'label', 'discussion_cleaned']]
141
+ df = df.rename(columns={'discussion_cleaned': 'discussion'})
142
+ return df
143
+ else:
144
+ raise ValueError(f"No data found for the url: {url}")
145
+ else:
146
+ base_url = 'https://en.wikiquote.org/wiki/Wikiquote:Votes_for_deletion'
147
+ df = collect_wikiquote_title(title, base_url)
148
+ if 'discussion_cleaned' in df.columns:
149
+ df = df[['title', 'text_url', 'label', 'discussion_cleaned']]
150
+ df = df.rename(columns={'discussion_cleaned': 'discussion'})
151
+ return df
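A brief, hedged usage sketch for the Wikiquote collector defined above; the discussion title in the URL is a placeholder:

# all_df = collect_wikiquote(mode='title', title='all')
# one_df = collect_wikiquote(mode='url',
#     url='https://en.wikiquote.org/wiki/Wikiquote:Votes_for_deletion_archive#Some_page')
# print(all_df[['title', 'label']].head())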
data_collect.py ADDED
@@ -0,0 +1,180 @@
1
+
2
+ from datetime import datetime
3
+ from wide_analysis.data.process_data import prepare_dataset
4
+ from datasets import load_dataset
5
+
6
+ from collect_data_wikidata_ent import collect_wikidata_entity
7
+ from collect_data_wikidata_prop import collect_wikidata
8
+ from collect_data_wikinews import collect_wikinews
9
+ from collect_data_wikiquote import collect_wikiquote
10
+ from collect_data_es import collect_es
11
+ from collect_data_gr import collect_gr
12
+
13
+ def normalize_outcome(o):
14
+ lowered = o.lower()
15
+ if 'διαγρ' in lowered:
16
+ return 'Διαγραφή'
17
+ elif 'διατήρη' in lowered or 'παραμονή' in lowered:
18
+ return 'Διατήρηση'
19
+ elif 'συγχών' in lowered:
20
+ return 'συγχώνευση'
21
+ else:
22
+ return 'Δεν υπάρχει συναίνεση'
23
+
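As a quick illustration, normalize_outcome keys on Greek stems: text containing 'διαγρ' maps to deletion, 'διατήρη' or 'παραμονή' to keep, 'συγχών' to merge, and anything else to no consensus. A minimal sketch:

# normalize_outcome('Διαγραφή του λήμματος')  # -> 'Διαγραφή'
# normalize_outcome('Παραμονή')               # -> 'Διατήρηση'
# normalize_outcome('κάτι άλλο')              # -> 'Δεν υπάρχει συναίνεση'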
24
+ def collect(mode, start_date=None, end_date=None, url=None, title=None, output_path=None,
25
+ platform=None, lang=None, date=None, years=None):
26
+ if mode not in ['date_range', 'date', 'title','url','wide_2023']:
27
+ raise ValueError("Invalid mode. Choose from ['date_range', 'date', 'title','url','wide_2023']")
28
+
29
+ if mode == 'wide_2023':
30
+ dataset = load_dataset('hsuvaskakoty/wide_analysis')
31
+ print('Dataset loaded successfully as huggingface dataset')
32
+ print('The dataset has the following columns:', dataset.column_names)
33
+ return dataset
34
+ underlying_mode = mode
35
+ if mode in ['date', 'date_range']:
36
+ underlying_mode = 'year'
37
+ if mode == 'url':
38
+ underlying_mode = 'url'
39
+ if (platform is None and lang is None) or (platform=='wikipedia' and lang=='en'):
40
+ if mode in ['date_range', 'date', 'title']:
41
+ return prepare_dataset(
42
+ mode=mode,
43
+ start_date=start_date,
44
+ end_date=end_date,
45
+ url=url,
46
+ title=title,
47
+ output_path=output_path
48
+ )
49
+ else:
50
+ print("Invalid input. Choose from ['date_range', 'date', 'title','wide_2023']")
51
+ return None
52
+
53
+ if platform == 'wikidata_entity':
54
+ if underlying_mode == 'title':
55
+ if not title or (years and len(years)>0):
56
+ raise ValueError("For 'title' mode in wikidata entity, 'title' must be provided and 'years' must be empty.")
57
+ return collect_wikidata_entity(mode='title', title=title, years=[])
58
+
59
+ elif underlying_mode == 'year':
60
+ if start_date and end_date:
61
+ start_year = int(datetime.strptime(start_date, "%Y-%m-%d").year)
62
+ end_year = int(datetime.strptime(end_date, "%Y-%m-%d").year)
63
+ return collect_wikidata_entity(mode='year', years=[start_year, end_year])
64
+ elif start_date:
65
+ single_year = int(datetime.strptime(start_date, "%Y-%m-%d").year)
66
+ return collect_wikidata_entity(mode='year', years=single_year)
67
+ else:
68
+ raise ValueError("For 'year' mode in wikidata entity, start_date (and optionally end_date) is required.")
69
+ elif underlying_mode == 'url':
70
+ if not url:
71
+ raise ValueError("For 'url' mode in wikidata entity, 'url' must be provided.")
72
+ return collect_wikidata_entity(mode='url', url=url)
73
+ else:
74
+ raise ValueError("Invalid mode for wikidata entity. Use 'title' or 'year'.")
75
+
76
+
77
+ elif platform == 'wikidata_property':
78
+ if underlying_mode == 'title':
79
+ if not title or (years and len(years)>0):
80
+ raise ValueError("For 'title' mode in wikidata property, 'title' must be provided and 'years' must be empty.")
81
+ return collect_wikidata(mode='title', title=title, years=[])
82
+ elif underlying_mode == 'url':
83
+ if not url:
84
+ raise ValueError("For 'url' mode in wikidata property, 'url' must be provided.")
85
+ return collect_wikidata(mode='url', title='', url=url, years=[])
86
+
87
+ elif underlying_mode == 'year':
88
+ if start_date and end_date:
89
+ start_year = int(datetime.strptime(start_date, "%Y-%m-%d").year)
90
+ end_year = int(datetime.strptime(end_date, "%Y-%m-%d").year)
91
+ return collect_wikidata(mode='year', years=[start_year, end_year])
92
+ elif start_date:
93
+ single_year = int(datetime.strptime(start_date, "%Y-%m-%d").year)
94
+ return collect_wikidata(mode='year', years=single_year)
95
+ else:
96
+ raise ValueError("For 'year' mode in wikidata property, start_date (and optionally end_date) is required.")
97
+ else:
98
+ raise ValueError("Invalid mode for wikidata property. Use 'title' or 'year'.")
99
+
100
+ # else:
101
+ # raise ValueError("Invalid lang for wikidata. Use 'entity' or 'property'.")
102
+
103
+ elif platform == 'wikinews':
104
+ if underlying_mode == 'title':
105
+ if not title:
106
+ raise ValueError("For 'title' mode in wikinews, 'title' is required.")
107
+ return collect_wikinews(mode='title', title=title)
108
+ elif underlying_mode == 'url':
109
+ if not url:
110
+ raise ValueError("For 'url' mode in wikinews, 'url' is required.")
111
+ return collect_wikinews(mode='url', url=url)
112
+ elif underlying_mode == 'year':
113
+ if start_date and end_date:
114
+ start_y = int(datetime.strptime(start_date, "%Y-%m-%d").year)
115
+ end_y = int(datetime.strptime(end_date, "%Y-%m-%d").year)
116
+ return collect_wikinews(mode='year', year=[start_y, end_y])
117
+ elif start_date:
118
+ single_y = int(datetime.strptime(start_date, "%Y-%m-%d").year)
119
+ return collect_wikinews(mode='year', year=single_y)
120
+ else:
121
+ raise ValueError("For 'year' mode in wikinews, start_date (and optionally end_date) is required.")
122
+ else:
123
+ raise ValueError("Invalid mode for wikinews. Use 'title' or 'year' or 'url'.")
124
+
125
+ # elif platform == 'wikiquote':
126
+ # if underlying_mode != 'title':
127
+ # raise ValueError("Wikiquote collection currently only supports 'title' mode.")
128
+ # if not title:
129
+ # title = 'all'
130
+ # return collect_wikiquote(mode='title', title=title)
131
+ elif platform == 'wikiquote':
132
+ if underlying_mode not in ['title', 'url']:
133
+ raise ValueError("Wikiquote collection currently only supports 'title' or 'url' mode.")
134
+
135
+ if underlying_mode == 'title':
136
+ if not title:
137
+ title = 'all'
138
+ return collect_wikiquote(mode='title', title=title)
139
+ elif underlying_mode == 'url':
140
+ if not url:
141
+ raise ValueError("For 'url' mode in wikiquote, 'url' must be provided.")
142
+ return collect_wikiquote(mode='url', url=url)
143
+
144
+
145
+ elif platform == 'wikipedia':
146
+ if lang == 'es':
147
+ if underlying_mode == 'title':
148
+ if not title or date:
149
+ raise ValueError("For 'title' mode in spanish wikipedia, 'title' must be provided and 'date' must be empty.")
150
+ return collect_es(mode='title', title=title, date='')
151
+ elif underlying_mode == 'year':
152
+ if not date:
153
+ raise ValueError("For 'year' mode in spanish wikipedia, 'date' parameter (dd/mm/yyyy) is required.")
154
+ return collect_es(mode='year', title='', date=date)
155
+ else:
156
+ raise ValueError("Invalid mode for spanish wikipedia. Use 'title' or 'year'.")
157
+
158
+ elif lang == 'gr':
159
+ if underlying_mode == 'title':
160
+ if not title or not years or len(years) != 1:
161
+ raise ValueError("For 'title' mode in greek wikipedia, 'title' and a single-element list years=['mm/yyyy'] are required.")
162
+ return collect_gr(mode='title', title=title, years=years)
163
+ elif underlying_mode == 'year':
164
+ if start_date and end_date:
165
+ start_y = int(datetime.strptime(start_date, "%Y-%m-%d").year)
166
+ end_y = int(datetime.strptime(end_date, "%Y-%m-%d").year)
167
+ return collect_gr(mode='year', title='', years=[start_y,end_y])
168
+ elif start_date:
169
+ single_y = int(datetime.strptime(start_date, "%Y-%m-%d").year)
170
+ return collect_gr(mode='year', title='', years=[single_y])
171
+ else:
172
+ raise ValueError("For 'year' mode in greek wikipedia, start_date (and optionally end_date) is required.")
173
+ else:
174
+ raise ValueError("Invalid mode for greek wikipedia. Use 'title' or 'year'.")
175
+
176
+ else:
177
+ raise ValueError("Invalid lang for wikipedia. Use 'en', 'es', or 'gr'.")
178
+
179
+ else:
180
+ raise ValueError("Invalid platform. Use 'wikipedia', 'wikidata_entity', Wikidata_property', 'wikinews', or 'wikiquote'.")
model_predict.py CHANGED
@@ -1,25 +1,290 @@
1
- #using pipeline to predict the input text
2
  import pandas as pd
3
  from transformers import pipeline, AutoTokenizer
4
  import pysbd
5
 
6
  #-----------------Outcome Prediction-----------------
7
- def outcome(text):
8
- label_mapping = {
9
- 'delete': [0, 'LABEL_0'],
10
- 'keep': [1, 'LABEL_1'],
11
- 'merge': [2, 'LABEL_2'],
12
- 'no consensus': [3, 'LABEL_3'],
13
- 'speedy keep': [4, 'LABEL_4'],
14
- 'speedy delete': [5, 'LABEL_5'],
15
- 'redirect': [6, 'LABEL_6'],
16
- 'withdrawn': [7, 'LABEL_7']
17
- }
18
- model_name = "research-dump/roberta-large_deletion_multiclass_complete_final"
19
  tokenizer = AutoTokenizer.from_pretrained(model_name)
20
- model = pipeline("text-classification", model=model_name, return_all_scores=True)
21
-
22
- # Tokenize and truncate the text
23
  tokens = tokenizer(text, truncation=True, max_length=512)
24
  truncated_text = tokenizer.decode(tokens['input_ids'], skip_special_tokens=True)
25
 
@@ -31,10 +296,8 @@ def outcome(text):
31
  if result['label'] == value[1]:
32
  res_list.append({'sentence': truncated_text, 'outcome': key, 'score': result['score']})
33
  break
34
-
35
  return res_list
36
 
37
- #-----------------Stance Prediction-----------------
38
 
39
  def extract_response(text, model_name, label_mapping):
40
  tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -54,7 +317,7 @@ def extract_response(text, model_name, label_mapping):
54
 
55
  return final_scores
56
 
57
-
58
  def get_stance(text):
59
  label_mapping = {
60
  'delete': 0,
@@ -160,10 +423,9 @@ def get_offensive_label(text):
160
  return res
161
 
162
 
163
- #create the anchor function
164
- def predict_text(text, model_name):
165
  if model_name == 'outcome':
166
- return outcome(text)
167
  elif model_name == 'stance':
168
  return get_stance(text)
169
  elif model_name == 'policy':
@@ -173,4 +435,4 @@ def predict_text(text, model_name):
173
  elif model_name == 'offensive':
174
  return get_offensive_label(text)
175
  else:
176
- return "Invalid Task name"
 
1
+ # #using pipeline to predict the input text
2
+ # import pandas as pd
3
+ # from transformers import pipeline, AutoTokenizer
4
+ # import pysbd
5
+
6
+ # #-----------------Outcome Prediction-----------------
7
+ # def outcome(text):
8
+ # label_mapping = {
9
+ # 'delete': [0, 'LABEL_0'],
10
+ # 'keep': [1, 'LABEL_1'],
11
+ # 'merge': [2, 'LABEL_2'],
12
+ # 'no consensus': [3, 'LABEL_3'],
13
+ # 'speedy keep': [4, 'LABEL_4'],
14
+ # 'speedy delete': [5, 'LABEL_5'],
15
+ # 'redirect': [6, 'LABEL_6'],
16
+ # 'withdrawn': [7, 'LABEL_7']
17
+ # }
18
+ # model_name = "research-dump/roberta-large_deletion_multiclass_complete_final"
19
+ # tokenizer = AutoTokenizer.from_pretrained(model_name)
20
+ # model = pipeline("text-classification", model=model_name, return_all_scores=True)
21
+
22
+ # # Tokenize and truncate the text
23
+ # tokens = tokenizer(text, truncation=True, max_length=512)
24
+ # truncated_text = tokenizer.decode(tokens['input_ids'], skip_special_tokens=True)
25
+
26
+ # results = model(truncated_text)
27
+
28
+ # res_list = []
29
+ # for result in results[0]:
30
+ # for key, value in label_mapping.items():
31
+ # if result['label'] == value[1]:
32
+ # res_list.append({'sentence': truncated_text, 'outcome': key, 'score': result['score']})
33
+ # break
34
+
35
+ # return res_list
36
+
37
+
38
+ # #-----------------Stance Prediction-----------------
39
+
40
+ # def extract_response(text, model_name, label_mapping):
41
+ # tokenizer = AutoTokenizer.from_pretrained(model_name)
42
+ # pipe = pipeline("text-classification", model=model_name, tokenizer=tokenizer, top_k=None)
43
+
44
+ # tokens = tokenizer(text, truncation=True, max_length=512)
45
+ # truncated_text = tokenizer.decode(tokens['input_ids'], skip_special_tokens=True)
46
+
47
+ # results = pipe(truncated_text)
48
+
49
+ # final_scores = {key: 0.0 for key in label_mapping}
50
+ # for result in results[0]:
51
+ # for key, value in label_mapping.items():
52
+ # if result['label'] == f'LABEL_{value}':
53
+ # final_scores[key] = result['score']
54
+ # break
55
+
56
+ # return final_scores
57
+
58
+
59
+ # def get_stance(text):
60
+ # label_mapping = {
61
+ # 'delete': 0,
62
+ # 'keep': 1,
63
+ # 'merge': 2,
64
+ # 'comment': 3
65
+ # }
66
+ # seg = pysbd.Segmenter(language="en", clean=False)
67
+ # text_list = seg.segment(text)
68
+ # model = 'research-dump/bert-large-uncased_wikistance_v1'
69
+ # res_list = []
70
+ # for t in text_list:
71
+ # res = extract_response(t, model,label_mapping) #, access_token)
72
+ # highest_key = max(res, key=res.get)
73
+ # highest_score = res[highest_key]
74
+ # result = {'sentence':t,'stance': highest_key, 'score': highest_score}
75
+ # res_list.append(result)
76
+
77
+ # return res_list
78
+
79
+
80
+ # #-----------------Policy Prediction-----------------
81
+ # def get_policy(text):
82
+ # label_mapping = {'Wikipedia:Notability': 0,
83
+ # 'Wikipedia:What Wikipedia is not': 1,
84
+ # 'Wikipedia:Neutral point of view': 2,
85
+ # 'Wikipedia:Verifiability': 3,
86
+ # 'Wikipedia:Wikipedia is not a dictionary': 4,
87
+ # 'Wikipedia:Wikipedia is not for things made up one day': 5,
88
+ # 'Wikipedia:Criteria for speedy deletion': 6,
89
+ # 'Wikipedia:Deletion policy': 7,
90
+ # 'Wikipedia:No original research': 8,
91
+ # 'Wikipedia:Biographies of living persons': 9,
92
+ # 'Wikipedia:Arguments to avoid in deletion discussions': 10,
93
+ # 'Wikipedia:Conflict of interest': 11,
94
+ # 'Wikipedia:Articles for deletion': 12
95
+ # }
96
+
97
+
98
+ # seg = pysbd.Segmenter(language="en", clean=False)
99
+ # text_list = seg.segment(text)
100
+ # model = 'research-dump/bert-large-uncased_wikistance_policy_v1'
101
+ # res_list = []
102
+
103
+ # for t in text_list:
104
+ # res = extract_response(t, model,label_mapping)
105
+ # highest_key = max(res, key=res.get)
106
+ # highest_score = res[highest_key]
107
+ # result = {'sentence': t, 'policy': highest_key, 'score': highest_score}
108
+ # res_list.append(result)
109
+
110
+ # return res_list
111
+
112
+
113
+
114
+ # #-----------------Sentiment Analysis-----------------
115
+
116
+ # def extract_highest_score_label(res):
117
+ # flat_res = [item for sublist in res for item in sublist]
118
+ # highest_score_item = max(flat_res, key=lambda x: x['score'])
119
+ # highest_score_label = highest_score_item['label']
120
+ # highest_score_value = highest_score_item['score']
121
+ # return highest_score_label, highest_score_value
122
+
123
+
124
+ # def get_sentiment(text):
125
+ # #sentiment analysis
126
+ # model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
127
+ # tokenizer = AutoTokenizer.from_pretrained(model_name)
128
+ # model = pipeline("text-classification", model=model_name, top_k= None)
129
+
130
+ # #sentence tokenize the text using pysbd
131
+ # seg = pysbd.Segmenter(language="en", clean=False)
132
+ # text_list = seg.segment(text)
133
+
134
+ # res = []
135
+ # for t in text_list:
136
+ # results = model(t)
137
+ # highest_label, highest_score = extract_highest_score_label(results)
138
+ # result = {'sentence': t,'sentiment': highest_label, 'score': highest_score}
139
+ # res.append(result)
140
+ # return res
141
+
142
+
143
+ # #-----------------Toxicity Prediction-----------------
144
+
145
+ # def get_offensive_label(text):
146
+ # #offensive language detection model
147
+ # model_name = "cardiffnlp/twitter-roberta-base-offensive"
148
+ # tokenizer = AutoTokenizer.from_pretrained(model_name)
149
+ # model = pipeline("text-classification", model=model_name, top_k= None)
150
+
151
+ # #sentence tokenize the text using pysbd
152
+ # seg = pysbd.Segmenter(language="en", clean=False)
153
+ # text_list = seg.segment(text)
154
+
155
+ # res = []
156
+ # for t in text_list:
157
+ # results = model(t)
158
+ # highest_label, highest_score = extract_highest_score_label(results)
159
+ # result = {'sentence': t,'offensive_label': highest_label, 'score': highest_score}
160
+ # res.append(result)
161
+ # return res
162
+
163
+
164
+ # #create the anchor function
165
+ # def predict_text(text, model_name):
166
+ # if model_name == 'outcome':
167
+ # return outcome(text)
168
+ # elif model_name == 'stance':
169
+ # return get_stance(text)
170
+ # elif model_name == 'policy':
171
+ # return get_policy(text)
172
+ # elif model_name == 'sentiment':
173
+ # return get_sentiment(text)
174
+ # elif model_name == 'offensive':
175
+ # return get_offensive_label(text)
176
+ # else:
177
+ # return "Invalid model name"
178
+
179
+
180
  import pandas as pd
181
  from transformers import pipeline, AutoTokenizer
182
  import pysbd
183
+ import torch
184
+
185
+
186
+ label_mapping_wikipedia_en = {
187
+ 'delete': [0, 'LABEL_0'],
188
+ 'keep': [1, 'LABEL_1'],
189
+ 'merge': [2, 'LABEL_2'],
190
+ 'no consensus': [3, 'LABEL_3'],
191
+ 'speedy keep': [4, 'LABEL_4'],
192
+ 'speedy delete': [5, 'LABEL_5'],
193
+ 'redirect': [6, 'LABEL_6'],
194
+ 'withdrawn': [7, 'LABEL_7']
195
+ }
196
+
197
+ label_mapping_es = {
198
+ 'Borrar': [0, 'LABEL_0'],
199
+ 'Mantener': [1, 'LABEL_1'],
200
+ 'Fusionar': [2, 'LABEL_2'],
201
+ 'Otros': [3, 'LABEL_3']
202
+ }
203
+
204
+ label_mapping_gr = {
205
+ 'Διαγραφή': [0, 'LABEL_0'],
206
+ 'Δεν υπάρχει συναίνεση': [1, 'LABEL_1'],
207
+ 'Διατήρηση': [2, 'LABEL_2'],
208
+ 'συγχώνευση': [3, 'LABEL_3']
209
+ }
210
+
211
+ label_mapping_wikidata_ent = {
212
+ 'delete': [0, 'LABEL_0'],
213
+ 'no_consensus': [1, 'LABEL_1'],
214
+ 'merge': [2, 'LABEL_2'],
215
+ 'keep': [3, 'LABEL_3'],
216
+ 'comment': [4, 'LABEL_4'],
217
+ 'redirect': [5, 'LABEL_5']
218
+ }
219
+
220
+ label_mapping_wikidata_prop = {
221
+ 'deleted': [0, 'LABEL_0'],
222
+ 'keep': [1, 'LABEL_1'],
223
+ 'no_consensus': [2, 'LABEL_2']
224
+ }
225
+
226
+ label_mapping_wikinews = {
227
+ 'delete': [0, 'LABEL_0'],
228
+ 'no_consensus': [1, 'LABEL_1'],
229
+ 'speedy delete': [2, 'LABEL_2'],
230
+ 'keep': [3, 'LABEL_3'],
231
+ 'redirect': [4, 'LABEL_4'],
232
+ 'comment': [5, 'LABEL_5'],
233
+ 'merge': [6, 'LABEL_6'],
234
+ 'withdrawn': [7, 'LABEL_7']
235
+ }
236
+
237
+ label_mapping_wikiquote = {
238
+ 'merge': [0, 'LABEL_0'],
239
+ 'keep': [1, 'LABEL_1'],
240
+ 'no_consensus': [2, 'LABEL_2'],
241
+ 'redirect': [3, 'LABEL_3'],
242
+ 'delete': [4, 'LABEL_4']
243
+ }
244
+
245
+ best_models_tasks = {
246
+ 'wikipedia': 'research-dump/roberta-large_deletion_multiclass_complete_final_v2',
247
+ 'wikidata_entity': 'research-dump/roberta-large_wikidata_ent_outcome_prediction_v1',
248
+ 'wikidata_property': 'research-dump/roberta-large_wikidata_prop_outcome_prediction_v1',
249
+ 'wikinews': 'research-dump/all-roberta-large-v1_wikinews_outcome_prediction_v1',
250
+ 'wikiquote': 'research-dump/roberta-large_wikiquote_outcome_prediction_v1'
251
+ }
252
+
253
+ best_models_langs = {
254
+ 'en': 'research-dump/roberta-large_deletion_multiclass_complete_final_v2',
255
+ 'es': 'research-dump/xlm-roberta-large_deletion_multiclass_es',
256
+ 'gr': 'research-dump/xlm-roberta-large_deletion_multiclass_gr'
257
+ }
258
 
259
  #-----------------Outcome Prediction-----------------
260
+
261
+ def outcome(text, lang='en', platform='wikipedia', date='', years=None):
262
+ if lang == 'en':
263
+ if platform not in best_models_tasks:
264
+ raise ValueError(f"For lang='en', platform must be one of {list(best_models_tasks.keys())}")
265
+ model_name = best_models_tasks[platform]
266
+ if platform == 'wikipedia':
267
+ label_mapping = label_mapping_wikipedia_en
268
+ elif platform == 'wikidata_entity':
269
+ label_mapping = label_mapping_wikidata_ent
270
+ elif platform == 'wikidata_property':
271
+ label_mapping = label_mapping_wikidata_prop
272
+ elif platform == 'wikinews':
273
+ label_mapping = label_mapping_wikinews
274
+ elif platform == 'wikiquote':
275
+ label_mapping = label_mapping_wikiquote
276
+ elif lang in ['es', 'gr']:
277
+ if platform != 'wikipedia':
278
+ raise ValueError(f"For lang='{lang}', only platform='wikipedia' is supported.")
279
+ model_name = best_models_langs[lang]
280
+ label_mapping = label_mapping_es if lang == 'es' else label_mapping_gr
281
+ else:
282
+ raise ValueError("Invalid lang. Use 'en', 'es', or 'gr'.")
283
+
284
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
285
  tokenizer = AutoTokenizer.from_pretrained(model_name)
286
+ model = pipeline("text-classification", model=model_name, return_all_scores=True, device=device)
287
+
 
288
  tokens = tokenizer(text, truncation=True, max_length=512)
289
  truncated_text = tokenizer.decode(tokens['input_ids'], skip_special_tokens=True)
290
 
 
296
  if result['label'] == value[1]:
297
  res_list.append({'sentence': truncated_text, 'outcome': key, 'score': result['score']})
298
  break
 
299
  return res_list
300
 
 
301
 
302
  def extract_response(text, model_name, label_mapping):
303
  tokenizer = AutoTokenizer.from_pretrained(model_name)
 
317
 
318
  return final_scores
319
 
320
+ #-----------------Stance Detection-----------------
321
  def get_stance(text):
322
  label_mapping = {
323
  'delete': 0,
 
423
  return res
424
 
425
 
426
+ def predict_text(text, model_name, lang='en', platform='wikipedia', date='', years=None):
 
427
  if model_name == 'outcome':
428
+ return outcome(text, lang=lang, platform=platform, date=date, years=years)
429
  elif model_name == 'stance':
430
  return get_stance(text)
431
  elif model_name == 'policy':
 
435
  elif model_name == 'offensive':
436
  return get_offensive_label(text)
437
  else:
438
+ return "Invalid model name"