import requests
from bs4 import BeautifulSoup
import pandas as pd
import pysbd
import re


################################
## Year-based data collection ##
################################
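# The year-based collector walks the deletion-request archive index, visits
# each selected year page, and parses every 'boilerplate metadata' discussion
# div into one row of (title, url, outcome, discussion). Outcomes are
# normalised with validate_outcome(); rows still 'unknown' are resolved by
# update_unknown_outcomes().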

def get_soup(url):
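    """Fetch a URL and return its parsed BeautifulSoup tree (raises on HTTP errors)."""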
    response = requests.get(url)
    response.raise_for_status()  
    return BeautifulSoup(response.text, 'html.parser')

def html_to_plaintext(html_content):
    """Convert discussion HTML to plain text, preserving block-level line breaks."""
    soup = BeautifulSoup(html_content, 'html.parser')
    # Mark block-level boundaries with newlines so they survive get_text().
    for tag in soup.find_all(['p', 'li', 'dd', 'dl', 'ul']):
        tag.insert_before('\n')
        tag.insert_after('\n')
    for br in soup.find_all('br'):
        br.replace_with('\n')

    # Note: strip=True would discard the whitespace-only nodes inserted above,
    # collapsing everything onto one line; strip per line instead.
    text = soup.get_text(separator=' ')
    text = '\n'.join(line.strip() for line in text.splitlines() if line.strip())
    return text

def extract_fallback_discussion(html_content):
    """Fallback extraction: join the text of every block element in the fragment."""
    soup = BeautifulSoup(html_content, 'html.parser')
    # recursive=False would only inspect the top-level node (the outer div
    # itself), so nothing nested would ever be found; search the whole tree.
    discussion_parts = [
        element.get_text(separator=' ', strip=True)
        for element in soup.find_all(['p', 'li', 'dd', 'ol'])
    ]
    return ' '.join(discussion_parts).strip()

def process_html_to_plaintext(df):
    """Clean each discussion's HTML; fall back to the uncleaned HTML when empty."""
    if df.empty:
        return df
    df['discussion_cleaned'] = df['discussion'].apply(html_to_plaintext)
    for index, row in df.iterrows():
        if not row['discussion_cleaned'].strip():
            df.at[index, 'discussion_cleaned'] = extract_fallback_discussion(row['discussion_uncleaned'])
    return df

def extract_outcome_from_div(div):
    """Try the known markup patterns for a discussion's bolded closing outcome."""
    try:
        # Collect candidate outcome strings in order of reliability.
        candidates = []
        result_phrase = div.find(string=re.compile(r'The result was to'))
        if result_phrase:
            result = result_phrase.find_next('b')
            if result:
                candidates.append(result.text.strip())
        li_outcome = div.find('li')
        if li_outcome and li_outcome.find('b'):
            candidates.append(li_outcome.find('b').text.strip())
        dl_outcome = div.find('dl')
        if dl_outcome and dl_outcome.find('b'):
            candidates.append(dl_outcome.find('b').text.strip())
        dd_outcome = div.find('dd')
        if dd_outcome and dd_outcome.find('i'):
            candidates.append(dd_outcome.find('i').get_text(strip=True))

        for outcome_text in candidates:
            # The closing boilerplate is not an outcome; read the sentence after it.
            if outcome_text.lower() == "please do not modify it":
                return extract_following_sentence(div) or 'unknown'
            # validate_outcome() returns the string 'unknown' (truthy) on
            # failure, so compare explicitly rather than testing truthiness.
            if validate_outcome(outcome_text) != 'unknown':
                return outcome_text
        return extract_following_sentence(div) or 'unknown'

    except Exception as e:
        print(f"Error extracting outcome: {e}")
        return 'unknown'


def extract_following_sentence(div):
    """Return the validated sentence following the closing boilerplate, if any."""
    try:
        phrases = [
            "No further edits should be made to this discussion",
            "Please do not add any more comments and votes to this request",
        ]

        for phrase in phrases:
            phrase_location = div.find(string=re.compile(phrase))
            if not phrase_location:
                continue
            following_text = ""
            for sibling in phrase_location.find_all_next(string=True):
                if "Please do not modify it" in sibling:
                    continue
                following_text += sibling.strip() + " "
                if "." in sibling:
                    break
            sentence = following_text.split('.')[0].strip()
            # Only accept the sentence if it maps to a known outcome label.
            if validate_outcome(sentence) != 'unknown':
                return sentence

        return None

    except Exception as e:
        print(f"Error extracting following sentence: {e}")
        return None

def validate_outcome(outcome_text):
    """Map a raw outcome string to its canonical label, or 'unknown' if unrecognised."""
    label_mapping = {
        'delete': [
            'delete', 'delete ... unanimous', 'deleted', 'deleted as abandoned',
            'speedy delete', 'Delete', 'delete as redundant to existing template',
            'delete as unlikely to be used', 'delete but no prejudice against recreation when needed',
            'delete after Ottawahitech chose not to provide a rationale',
            'Delete, with no objection to recreation when needed.', 'Deleted',
            'delete the Cigarette redirect and keep the NHS redirect.', 'Delete all articles',
            'Tentatively sending through the usual abandonment process',
            'This was completed already.'
        ],
        'speedy delete': [
            'speedy delete', 'speedy deleted', 'speedy deleted test page', 'Speedy-deleted',
            'Speedy deleted', 'Speedy-deleted, no meaningful content',
            'Speeded as "old prep"',
            'Speedied as "old prep"  -- Pi zero ( talk ) 23:42, 10 February 2020 (UTC)   [  reply  ]  __DTELLIPSISBUTTON__{"threadItem":{"timestamp":"2020-02-10T23:42:00'
        ],
        'keep': [
            'keep',
            'Do not undelete. The content should be kept by the author off-wiki, and can be included as a part of another story that is current',
            'Personal details have been redacted and hidden from public view together with a NOINDEX flag',
        ],
        'redirect': [
            'soft redirect'
        ],
        'merge': [
            'convert near-clone of mainspace article to use {{topic cat}}; apply {{correction}} to mainspace article'
        ],
        'no_consensus': [
            'No consensus to delete. However, there clearly is a consensus that if we are to have this template, we aren\'t to use it in its present form.',
            'no consensus', 'No consensus',
            "At this time, it's unclear if there's a consensus to keep but abundantly clear there isn't one to delete."
        ],
        'comment': [
            'Remove', 'SVT', 'withdraw the deletion request',
            'On consideration, speedied as unused and lacking fair-use rationale',
            'Moved to userspace', 'Withdrawn to allow interview re-focus',
            'More userspace drafts       This is the second batch of a large number of draft articles in userspace',
            'This was completed already ',
            'Do not undelete. The content should be kept by the author off-wiki, and can be included as a part of another story that is current',
        ],
        'withdrawn': [
            'Withdrawn to allow interview re-focus'
        ]
    }

    # Build a lowercase lookup; when a phrase appears under two labels, the
    # later entry wins (e.g. 'Withdrawn to allow interview re-focus' maps to
    # 'withdrawn', not 'comment').
    outcome_to_label = {outcome.lower(): label for label, outcomes in label_mapping.items() for outcome in outcomes}
    return outcome_to_label.get(outcome_text.lower(), 'unknown')
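
# Illustrative mappings (derived from label_mapping above):
#   validate_outcome('Speedy deleted')  -> 'speedy delete'
#   validate_outcome('No consensus')    -> 'no_consensus'
#   validate_outcome('anything else')   -> 'unknown'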


def update_unknown_outcomes(df):
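    """Resolve rows still marked 'unknown' by probing the article itself: a
    deletion-log warning box on the page's edit view implies it was deleted;
    otherwise the article is assumed kept."""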
    base_url = "https://en.wikinews.org/w/index.php?title="

    for i in df.index:
        if df.at[i, 'outcome'] == 'unknown':
            title = df.at[i, 'title'].replace(" ", "_") 
            url = f"{base_url}{title}&action=edit&redlink=1"
            print(f"Checking page: {url}")

            try:
                response = requests.get(url)
                if response.status_code == 200:
                    page_soup = BeautifulSoup(response.content, 'html.parser')
                    
                    # Look for the specific warning div
                    warning_div = page_soup.find('div', class_='cdx-message cdx-message--block cdx-message--warning mw-warning-with-logexcerpt')
                    if warning_div:
                        df.at[i, 'outcome'] = 'delete'
                    else:
                        df.at[i, 'outcome'] = 'keep'
                else:
                    print(f"Failed to retrieve page: {url}")
            
            except Exception as e:
                print(f"Error accessing {url}: {e}")

    return df

    
def collect_wikinews_deletions(years=None):
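    """Scrape archived deletion discussions (optionally limited to a year or a
    [start, end] year range) and return a DataFrame of titles, URLs, outcomes,
    and discussions."""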
    base_url = 'https://en.wikinews.org/wiki/Wikinews:Deletion_requests/Archives'
    try:
        # Single fetch: get_soup() already raises for HTTP errors.
        soup = get_soup(base_url)
    except requests.RequestException:
        print("Failed to retrieve the archive page.")
        return None
    titles = []
    text_urls = []
    outcomes = []
    deletion_discussions = []  
    discussion_uncleaned = [] 
    year_links = []
    for a in soup.select('a[href^="/wiki/Wikinews:Deletion_requests/Archives/"]'):
        year_text = re.findall(r'\d{4}', a.get_text()) 
        if year_text:
            year_links.append((year_text[0], a['href']))
    if years:
        if len(years) == 1:
            start_year = end_year = years[0]
        elif len(years) == 2:
            start_year, end_year = min(years), max(years)
        else:
            print("Invalid years input. Provide one or two years.")
            return None
        year_links = [(year, link) for year, link in year_links if start_year <= int(year) <= end_year]
    for year, year_link in year_links:
        year_url = 'https://en.wikinews.org' + year_link
        print(f"Processing year: {year_url}")
        year_soup = get_soup(year_url)
        discussion_divs = year_soup.find_all('div', class_=lambda x: x and 'boilerplate metadata' in x)
        
        for div in discussion_divs:
            title_tag = div.find(['h2', 'h3'])
            if title_tag:
                link_tag = title_tag.find('a', title=True)
                if link_tag:
                    title = link_tag.get_text(strip=True)
                    titles.append(title)
                    text_url = year_url + '#' + link_tag['title'].replace(' ', '_')
                    text_urls.append(text_url)
                else:
                    titles.append(title_tag.get_text(strip=True))
                    text_urls.append(year_url)
            else:
                dl_tag = div.find('dl')
                if dl_tag and dl_tag.find('b'):
                    titles.append(dl_tag.find('b').get_text(strip=True))
                else:
                    titles.append('No title found')
                text_urls.append(year_url)
            deletion_discussions.append(div.prettify())
            discussion_uncleaned.append(div.prettify())
            outcome = extract_outcome_from_div(div)
            outcomes.append(outcome)

    df = pd.DataFrame({
        'title': titles,
        'url': text_urls,
        'outcome': outcomes,
        'discussion': deletion_discussions,
        'discussion_uncleaned': discussion_uncleaned  
    })


    df = process_html_to_plaintext(df)
    # Rows that captured the archive boilerplate need a second pass over the raw HTML.
    for i in df.index:
        if df.at[i, 'outcome'] in ('Please do not modify it', 'Please do not modify it.'):
            df.at[i, 'outcome'] = extract_following_sentence(BeautifulSoup(df.at[i, 'discussion_uncleaned'], 'html.parser')) or 'unknown'
    # Normalise outcomes to canonical labels, then probe the wiki for the rest.
    df['outcome'] = df['outcome'].apply(lambda x: validate_outcome(x) if x else 'unknown')
    df = update_unknown_outcomes(df)
    return df



#################################
## Title-based data collection ##
#################################
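# The title-based collector fetches the live Wikinews:Deletion_requests page,
# finds the <h3> heading whose id matches the requested title, and gathers the
# sibling blocks up to the next heading as the discussion.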


def split_text_into_sentences(text):
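    """Segment text into sentences with pysbd and rejoin them with single spaces."""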
    seg = pysbd.Segmenter(language="en", clean=False)
    sentences = seg.segment(text)
    return ' '.join(sentences)

def clean_discussion_tag(tag):
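    """Drop markup-heavy children (spans, images, links, nested divs) and return the remaining text."""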
    for unwanted in tag.find_all(['span', 'img', 'a', 'div'], recursive=True):
        unwanted.decompose()
    return tag.get_text(separator=' ', strip=True)

def extract_outcome_from_text_elements(elements):
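    """Return the first bolded keyword matching a known closing outcome, else ''."""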
    consensus_keywords = [
        'Deleted', 'Delete', 'delete', 'deleted', 
        'kept', 'keep', 'Keep', 'Kept', 
        'merge', 'Merge', 'Not done', 'No consensus', 'no consensus', 'Done'
    ]
    for el in elements:
        b_tags = el.find_all('b')
        for b in b_tags:
            if b.text.strip() in consensus_keywords:
                return b.text.strip()
    return ''

def extract_discussion_section(soup, title):
    """Extracts discussion section, label, and cleaned text."""
    try:
        h3_id = title.replace(" ", "_") 
        h3_tag = soup.find('h3', {'id': h3_id})

        if not h3_tag:
            print(f"h3 tag with id '{h3_id}' not found.")
            return '', '', ''

        heading_div = h3_tag.parent

        if not heading_div:
            print("Parent div not found.")
            return '', '', ''

        next_heading_div = heading_div.find_next_sibling('div', class_='mw-heading mw-heading3')
        discussion_nodes = []
        for sibling in heading_div.next_siblings:
            if sibling == next_heading_div:
                break
            discussion_nodes.append(sibling)

        discussion_tags = []
        for node in discussion_nodes:
            if getattr(node, 'name', None) in ['p', 'ul', 'dl']:
                discussion_tags.append(node)

        if not discussion_tags:
            return '', '', ''

        label = extract_outcome_from_text_elements(discussion_tags)

        discussion_html_parts = [str(tag) for tag in discussion_tags]
        cleaned_parts = []
        for tag in discussion_tags:
            text = clean_discussion_tag(tag)
            if text:
                cleaned_parts.append(text)

        cleaned_discussion = ' '.join(cleaned_parts)
        discussion_html = '\n'.join(discussion_html_parts)
        return discussion_html, label, cleaned_discussion

    except Exception as e:
        print(f"Error processing title '{title}': {e}")
        import traceback
        traceback.print_exc()  
        return '', '', ''

def extract_div_from_title(title):
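    """Scrape a single discussion from the live deletion-requests page by section title."""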
    base_url = 'https://en.wikinews.org/wiki/Wikinews:Deletion_requests'
    t = title.replace(' ', '_')
    url = base_url + '#' + t

    response = requests.get(url)
    if response.status_code != 200:
        # Return an empty frame with the same schema as the success path.
        return pd.DataFrame(columns=['title', 'text_url', 'discussion_url', 'discussion', 'label'])

    soup = BeautifulSoup(response.content, 'html.parser')
    discussion_html, label, cleaned_discussion = extract_discussion_section(soup, title)
 
    text_url = base_url
    discussion_url = text_url + '#' + title.replace(' ', '_')

    df = pd.DataFrame([[title, text_url, discussion_url, cleaned_discussion, label]],
                      columns=['title', 'text_url', 'discussion_url', 'discussion_cleaned', 'label'])

    if label:
        df['label'] = df['label'].replace({
            'Deleted':'delete', 'Delete':'delete', 'delete':'delete', 'deleted':'delete', 
            'kept':'keep', 'keep':'keep', 'Keep':'keep', 'Kept':'keep', 
            'merge':'merge', 'Merge':'merge', 'Not done':'no_consensus', 
            'No consensus':'no_consensus', 'no consensus':'no_consensus', 'Done':'delete'
        })

    df['discussion_cleaned'] = df['discussion_cleaned'].apply(split_text_into_sentences)
    df = df.rename(columns={'discussion_cleaned':'discussion'})
    return df

########################
## Umbrella function  ##
########################

def collect_wikinews(mode, title=None, url='', year=None):
    """Collect deletion discussions by 'title', 'url', or 'year' mode."""

    if mode == 'title':
        if not title:
            raise ValueError("Title is required for 'title' mode.")
        return extract_div_from_title(title)

    elif mode == 'url':
        if not url:
            raise ValueError("URL is required for 'url' mode.")
        # Archive URLs carry the year in the path and the title in the fragment.
        if 'Archives' in url.split('/')[-2]:
            year = int(url.split('/')[-1].split('#')[0])
            print(f"Year extracted from URL: {year}")
            df = collect_wikinews_deletions(years=[year])
            # Keep only the row whose title matches the URL fragment.
            df = df[df['title'] == url.split('#')[-1].replace('_', ' ')]
            if df.empty:
                return pd.DataFrame(columns=['title', 'url', 'discussion', 'outcome'])
            df = df[['title', 'url', 'discussion_cleaned', 'outcome']]
            df = df.rename(columns={'discussion_cleaned': 'discussion'})
            return df
        title = url.split('#')[-1].replace('_', ' ')
        print(f"Title extracted from URL: {title}")
        return extract_div_from_title(title)

    elif mode == 'year':
        if not year:
            raise ValueError("Year or year range is required for 'year' mode.")
        return collect_wikinews_deletions(years=year)

    else:
        raise ValueError("Invalid mode. Please specify 'title' or 'year' or 'url'.")

# Example usage (requires network access; runs only when executed directly):
if __name__ == '__main__':
    year_df = collect_wikinews(mode='year', year=[2023])
    title_df = collect_wikinews(mode='title', title="NurMi spam")

    print(year_df)
    print(title_df)