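"""Scraping utilities for Wikipedia Articles-for-Deletion (AfD) log pages.

Fetches closed discussion sections from daily AfD log pages, extracts the title,
closing label, confirmation note and discussion HTML, and cleans the discussion
into plain-text sentences.
"""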
import requests
import pandas as pd
from bs4 import BeautifulSoup
import pysbd
from datetime import datetime, timedelta
def extract_div_contents_with_additional_columns(url, log_date):
    """Scrape all closed AfD discussion divs from a daily log page into a DataFrame."""
    response = requests.get(url)
    if response.status_code != 200:
        return pd.DataFrame(columns=['log_date', 'title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'verdict', 'discussion'])
    soup = BeautifulSoup(response.content, 'html.parser')
    div_classes = ['boilerplate afd vfd xfd-closed', 'boilerplate afd vfd xfd-closed archived mw-archivedtalk']
    divs = []
    for div_class in div_classes:
        divs.extend(soup.find_all('div', class_=div_class))
    url_fragment = url.split('#')[-1].replace('_', ' ')
    data = []
    for div in divs:
        title_tag = div.find('a')
        if title_tag:
            title_span = div.find('span', {'data-mw-comment-start': True})
            if title_span:
                title_anchor = title_span.find_next_sibling('a')
                if title_anchor:
                    title = title_anchor.text
                    text_url = 'https://en.wikipedia.org' + title_anchor['href']
            else:
                title = title_tag.text
                text_url = 'https://en.wikipedia.org' + title_tag['href']
            deletion_discussion = div.prettify()
            # Extract label (the closing result paragraph)
            label = ''
            verdict_tag = div.find('p')
            if verdict_tag:
                label_b_tag = verdict_tag.find('b')
                if label_b_tag:
                    label = verdict_tag.prettify()
            # Extract confirmation (the italicised closing note inside the first <dd>, if any)
            confirmation = ''
            dd_tag = div.find('dd')
            discussion_tag = dd_tag.find('i') if dd_tag else None
            if discussion_tag:
                confirmation_b_tag = discussion_tag.find('b')
                if confirmation_b_tag:
                    confirmation = discussion_tag.prettify()
            parts = deletion_discussion.split('<div class="mw-heading mw-heading3">')
            discussion = parts[0] if len(parts) > 0 else ''
            verdict = '<div class="mw-heading mw-heading3">' + parts[1] if len(parts) > 1 else ''
            data.append([log_date, title, text_url, deletion_discussion, label, confirmation, discussion, verdict])
    df = pd.DataFrame(data, columns=['log_date', 'title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'verdict', 'discussion'])
    return df
def extract_div_contents_from_url(url, date):
    """Scrape the closed AfD discussion whose title matches the URL fragment."""
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error: Received status code {response.status_code} for URL: {url}")
        return pd.DataFrame(columns=['date', 'title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'discussion', 'verdict'])
    soup = BeautifulSoup(response.content, 'html.parser')
    div_classes = ['boilerplate afd vfd xfd-closed', 'boilerplate afd vfd xfd-closed archived mw-archivedtalk']
    divs = []
    for div_class in div_classes:
        divs.extend(soup.find_all('div', class_=div_class))
    url_fragment = url.split('#')[-1].replace('_', ' ')
    log_date = url.split('/')[-1]
    data = []
    for div in divs:
        try:
            title = None
            text_url = None
            title_tag = div.find('a')
            if title_tag:
                title_span = div.find('span', {'data-mw-comment-start': True})
                if title_span:
                    title_anchor = title_span.find_next_sibling('a')
                    if title_anchor:
                        title = title_anchor.text
                        text_url = 'https://en.wikipedia.org' + title_anchor['href']
                else:
                    title = title_tag.text
                    text_url = 'https://en.wikipedia.org' + title_tag['href']
            # Fall back to the section heading when the first anchor is not the article link.
            if title == 'talk page' or title is None:
                heading_tag = div.find('div', class_='mw-heading mw-heading3')
                if heading_tag:
                    title_tag = heading_tag.find('a')
                    if title_tag:
                        title = title_tag.text
                        text_url = 'https://en.wikipedia.org' + title_tag['href']
            if not title:
                continue
            if title.lower() != url_fragment.lower():
                continue
            deletion_discussion = div.prettify()
            label = ''
            verdict_tag = div.find('p')
            if verdict_tag:
                label_b_tag = verdict_tag.find('b')
                if label_b_tag:
                    label = label_b_tag.text.strip()
            confirmation = ''
            discussion_tag = div.find('dd')
            if discussion_tag:
                discussion_tag_i = discussion_tag.find('i')
                if discussion_tag_i:
                    confirmation_b_tag = discussion_tag_i.find('b')
                    if confirmation_b_tag:
                        confirmation = confirmation_b_tag.text.strip()
            parts = deletion_discussion.split('<div class="mw-heading mw-heading3">')
            discussion = parts[0] if len(parts) > 0 else ''
            verdict = '<div class="mw-heading mw-heading3">' + parts[1] if len(parts) > 1 else ''
            data.append([date, title, text_url, deletion_discussion, label, confirmation, verdict, discussion])
        except Exception as e:
            print(f"Error processing div: {e}")
            continue
    df = pd.DataFrame(data, columns=['date', 'title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'discussion', 'verdict'])
    return df
def extract_div_contents_from_url_new(url, date):
    """Fallback extractor: also matches heading divs and rebuilds the discussion from the siblings that follow them."""
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error: Received status code {response.status_code} for URL: {url}")
        return pd.DataFrame(columns=['date', 'title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'discussion', 'verdict'])
    soup = BeautifulSoup(response.content, 'html.parser')
    div_classes = ['boilerplate afd vfd xfd-closed', 'boilerplate afd vfd xfd-closed archived mw-archivedtalk', 'mw-heading mw-heading3']
    divs = []
    for div_class in div_classes:
        divs.extend(soup.find_all('div', class_=div_class))
    url_fragment = url.split('#')[-1].replace('_', ' ')
    log_date = url.split('/')[-1]
    data = []
    for i, div in enumerate(divs):
        try:
            title = None
            text_url = None
            title_tag = div.find('a')
            if title_tag:
                title_span = div.find('span', {'data-mw-comment-start': True})
                if title_span:
                    title_anchor = title_span.find_next_sibling('a')
                    if title_anchor:
                        title = title_anchor.text
                        text_url = 'https://en.wikipedia.org' + title_anchor['href']
                else:
                    title = title_tag.text
                    text_url = 'https://en.wikipedia.org' + title_tag['href']
            if title == 'talk page' or title is None:
                heading_tag = div.find('div', class_='mw-heading mw-heading3')
                if heading_tag:
                    title_tag = heading_tag.find('a')
                    if title_tag:
                        title = title_tag.text
                        text_url = 'https://en.wikipedia.org' + title_tag['href']
            if not title:
                continue
            if title.lower() != url_fragment.lower():
                continue
            # Collect everything between this div and the next section heading as the discussion HTML.
            next_div = div.find_next('div', class_='mw-heading mw-heading3')
            deletion_discussion = ''
            sibling = div.find_next_sibling()
            while sibling and sibling != next_div:
                deletion_discussion += str(sibling)
                sibling = sibling.find_next_sibling()
            label = ''
            verdict_tag = div.find('p')
            if verdict_tag:
                label_b_tag = verdict_tag.find('b')
                if label_b_tag:
                    label = label_b_tag.text.strip()
            confirmation = ''
            discussion_tag = div.find('dd')
            if discussion_tag:
                discussion_tag_i = discussion_tag.find('i')
                if discussion_tag_i:
                    confirmation_b_tag = discussion_tag_i.find('b')
                    if confirmation_b_tag:
                        confirmation = confirmation_b_tag.text.strip()
            parts = deletion_discussion.split('<div class="mw-heading mw-heading3">')
            discussion = parts[0] if len(parts) > 0 else ''
            verdict = '<div class="mw-heading mw-heading3">' + parts[1] if len(parts) > 1 else ''
            data.append([date, title, text_url, deletion_discussion, label, confirmation, verdict, discussion])
        except Exception as e:
            print(f"Error processing div: {e}")
            continue
    df = pd.DataFrame(data, columns=['date', 'title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'discussion', 'verdict'])
    return df
def extract_label(label_html):
    """Return the first bolded text in the label HTML (the closure result)."""
    soup = BeautifulSoup(label_html, 'html.parser')
    b_tag = soup.find('b')
    return b_tag.text.strip() if b_tag else ''

def process_labels(df):
    df['proper_label'] = df['label'].apply(extract_label)
    return df

def extract_confirmation(confirmation_html):
    """Return the bolded text inside the red confirmation note, if present."""
    soup = BeautifulSoup(confirmation_html, 'html.parser')
    span_tag = soup.find('span', {'style': 'color:red'})
    b_tag = span_tag.find('b') if span_tag else None
    return b_tag.text.strip() if b_tag else ''

def process_confirmations(df):
    df['confirmation'] = df['confirmation'].apply(extract_confirmation)
    return df
def extract_post_links_text(discussion_html):
    split_point = '<span class="plainlinks">'
    if split_point in discussion_html:
        parts = discussion_html.split(split_point)
        if len(parts) > 1:
            return parts[1]
    return discussion_html

def process_discussion(df):
    df['discussion_cleaned'] = df['discussion'].apply(extract_post_links_text)
    return df
def html_to_plaintext(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    for tag in soup.find_all(['p', 'li', 'dd', 'dl']):
        tag.insert_before('\n')
        tag.insert_after('\n')
    for br in soup.find_all('br'):
        br.replace_with('\n')
    text = soup.get_text(separator=' ', strip=True)
    text = '\n'.join([line.strip() for line in text.splitlines() if line.strip() != ''])
    return text

def process_html_to_plaintext(df):
    df['discussion_cleaned'] = df['discussion_cleaned'].apply(html_to_plaintext)
    return df
def split_text_into_sentences(text):
    # Segment with pysbd and drop the first sentence before rejoining.
    seg = pysbd.Segmenter(language="en", clean=False)
    sentences = seg.segment(text)
    return ' '.join(sentences[1:])

def process_split_text_into_sentences(df):
    df['discussion_cleaned'] = df['discussion_cleaned'].apply(split_text_into_sentences)
    return df
def process_data(url, date):
    """Extract and clean a single AfD discussion, falling back to the sibling-based extractor if needed."""
    df = extract_div_contents_from_url(url, date)
    #print('Discussion: ', df.discussion.tolist())
    if df.discussion.tolist() == []:
        #print('Empty Discussion')
        df = extract_div_contents_from_url_new(url, date)
    #print(df.head())
    df = process_discussion(df)
    #print(df.at[0,'discussion'])
    df = process_html_to_plaintext(df)
    df = process_split_text_into_sentences(df)
    if not df.empty:
        return df
    else:
        return 'Empty DataFrame'
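# Minimal usage sketch for process_data. The URL below is hypothetical and only
# illustrates the expected AfD section-link format; it is not part of the original script:
#
#   url = 'https://en.wikipedia.org/wiki/Wikipedia:Articles_for_deletion/Log/2024_January_1#Example_article'
#   result = process_data(url, '2024-01-01')
#   if isinstance(result, pd.DataFrame):
#       print(result[['title', 'label', 'discussion_cleaned']].head())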
def collect_deletion_discussions(start_date, end_date):
    """Crawl daily AfD log pages between start_date and end_date and collect all closed discussions."""
    base_url = 'https://en.wikipedia.org/wiki/Wikipedia:Articles_for_deletion/Log/'
    all_data = pd.DataFrame()
    current_date = start_date
    while current_date <= end_date:
        try:
            print(f"Processing {current_date.strftime('%Y-%B-%d')}")
            date_str = current_date.strftime('%Y_%B_%d')
            url = base_url + date_str
            log_date = current_date.strftime('%Y-%m-%d')
            df = extract_div_contents_with_additional_columns(url, log_date)
            if not df.empty:
                df = process_labels(df)
                df = process_confirmations(df)
                df = process_discussion(df)
                df = process_html_to_plaintext(df)
                df = process_split_text_into_sentences(df)
                all_data = pd.concat([all_data, df], ignore_index=True)
            current_date += timedelta(days=1)
        except Exception as e:
            print(f"Error processing {current_date.strftime('%Y-%B-%d')}: {e}")
            current_date += timedelta(days=1)
            continue
    return all_data
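
# A minimal example of running the collector as a script. The date range below is
# an arbitrary illustration (an assumption), not part of the original code.
if __name__ == "__main__":
    start = datetime(2024, 1, 1)
    end = datetime(2024, 1, 3)
    afd_df = collect_deletion_discussions(start, end)
    print(afd_df.shape)
    print(afd_df.head())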