Spaces:
Runtime error
Runtime error
File size: 6,793 Bytes
9ae1b66 0952d73 8bedcfb 0515cdc 7bde858 9ae1b66 8bedcfb 9ae1b66 8bedcfb ba7deb1 9ae1b66 28757aa 9ae1b66 ba7deb1 9ae1b66 2216ba6 9ae1b66 85c9513 f2b70d1 85c9513 b6f8c08 fdda521 b6f8c08 9ae1b66 8686a5f 08d3eb0 aeb2044 9ae1b66 aeb2044 9ae1b66 85c9513 9ae1b66 7bde858 bb1fcb1 d0819c0 5802d42 d0819c0 5802d42 9ae1b66 7bde858 0952d73 c9245bc 7bde858 5a23388 c1815e2 5802d42 3bd2613 7bde858 5802d42 0952d73 bb1fcb1 8bedcfb bb1fcb1 9ae1b66 7fa626d ef9cbc8 7fa626d 9ae1b66 ba7deb1 2216ba6 9ae1b66 ef9cbc8 ba7deb1 88b7fb3 8bedcfb 9ae1b66 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
import os
from pathlib import Path
import gradio as gr
import requests
from fastapi import BackgroundTasks, Response, status
from huggingface_hub import WebhookPayload, WebhooksServer
from huggingface_hub.utils import build_hf_headers, get_session
from src.build_nomic import build_nomic
from src.my_logger import setup_logger
from src.readme_update import update_dataset_readme
from src.utilities import load_datasets, merge_and_update_datasets
from src.visualize_logs import log_file_to_html_string
# Directory that contains this module. ``__file__`` is the module's path on
# disk; ``__name__`` is only the module's name (e.g. "__main__"), so building
# a Path from it and taking ``.parent`` always yielded "." regardless of where
# the file actually lives.
proj_dir = Path(__file__).parent

logger = setup_logger(__name__)
logger.info("Starting Application...")

# Required settings: a missing variable raises KeyError while the module loads.
SUBREDDIT = os.environ["SUBREDDIT"]
USERNAME = os.environ["USERNAME"]
OG_DATASET = f"{USERNAME}/dataset-creator-reddit-{SUBREDDIT}"
PROCESSED_DATASET = os.environ['PROCESSED_DATASET']
# HF_TOKEN = os.environ["HF_TOKEN"]

# NOTE(review): when HF_WEBHOOK_SECRET is unset this falls back to the literal
# string 'secret'. Make sure the variable is configured in any real deployment.
WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET", 'secret')
# Markdown rendered in the "Application" tab, above the embedded map.
# This is user-facing runtime text: edit the wording deliberately.
intro_md = """
# Processing BORU
This is a space to visually search the subreddit [/r/bestofredditorupdates](https://www.reddit.com/r/BestofRedditorUpdates/).
Have you ever been curious to search for stories that are similar to one of your favorites? This can help!
- Each dot represents a post (try clicking on one)
- Closer dots are similar in topic
- Use the filters on the left to help you narrow down what you are looking for
- The lasso can help you search in a smaller range that you drag with your mouse
- The filter can help you narrow by field,
- Filtering posts that are `CONCLUDED`
- Filtering popular posts
- Filtering by date
- The search can help you look by keyword
Check out the original on [Nomic](https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map)
"""
# Markdown rendered in the "Details" tab; it describes how the data shown in
# this space is produced. This is user-facing runtime text.
details_md = """
# Details
## Creation Details
1. This space is triggered by a webhook for changes on [reddit-tools-HF/dataset-creator-reddit-bestofredditorupdates](https://huggingface.co/datasets/reddit-tools-HF/dataset-creator-reddit-bestofredditorupdates).
2. It then takes the updates from that dataset and get embeddings by making leveraging [reddit-tools-HF/nomic-embeddings](https://huggingface.co/spaces/reddit-tools-HF/nomic-embeddings)
- [reddit-tools-HF/nomic-embeddings](https://huggingface.co/spaces/reddit-tools-HF/nomic-embeddings) is using [zero-spaces](https://huggingface.co/zero-gpu-explorers) a free GPU service to compute the model [nomic-ai/nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5)
- Im calling this via [gradio_client](https://www.gradio.app/docs/client) which allows any space to be used as an API
3. The calculated embeddings are stored in this dataset [reddit-tools-HF/reddit-bestofredditorupdates-processed](https://huggingface.co/datasets/reddit-tools-HF/reddit-bestofredditorupdates-processed)
4. These get visualized by [nomic atlas](https://docs.nomic.ai/atlas/introduction/quick-start). You can see how I process it in [build_nomic.py](https://huggingface.co/spaces/reddit-tools-HF/processing-bestofredditorupdates/blob/main/src/build_nomic.py)
"""
# Address of the Nomic Atlas map that is embedded in the "Application" tab.
url = "https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map"

# Markup for the embedded map. The ``src`` value is quoted and the element is
# explicitly closed: <iframe> requires an end tag, and without it everything
# that follows in the same container is parsed as the iframe's content.
html_str = (
    f'<iframe src="{url}" style="border:none;height:1024px;width:100%" '
    'allow="clipboard-read; clipboard-write" title="Nomic Atlas"></iframe>'
)
# Assemble the Gradio interface: three tabs, declared in display order.
ui = gr.Blocks()
with ui:
    # Tab 1: introduction text followed by the embedded map.
    with gr.Tab("Application"):
        gr.Markdown(intro_md)
        gr.HTML(html_str)

    # Tab 2: log viewer. The value is a callable, and ``every=1`` asks Gradio
    # to keep re-evaluating it on that interval so the view stays current.
    with gr.Tab("Logs"):
        gr.Markdown("# Logs")
        output = gr.HTML(log_file_to_html_string, every=1)

    # Tab 3: static description of the processing pipeline.
    with gr.Tab("Details"):
        gr.Markdown(details_md)

# Serve the queued UI and the webhook endpoints from one server; incoming
# webhook calls are checked against WEBHOOK_SECRET.
app = WebhooksServer(webhook_secret=WEBHOOK_SECRET, ui=ui.queue())
def _changed_files_from_raw_diff(raw_diff):
    """Return one path per non-blank line of *raw_diff*.

    The path is taken to be the last tab-separated field of the line, with
    surrounding whitespace (e.g. a trailing carriage return) removed.
    """
    return [
        line.split('\t')[-1].strip()
        for line in raw_diff.split('\n')
        if line.strip()
    ]


def _is_readme_only_change(changed_files):
    """Return True only if *changed_files* is non-empty and all are README.md.

    An empty list is NOT treated as a README-only change: ``all()`` over an
    empty sequence is vacuously true, which would silently skip processing
    whenever the diff could not be parsed into any file. The file name is
    compared exactly (on the last path component) so that names which merely
    contain "README.md", such as "README.md.bak", do not count.
    """
    if not changed_files:
        return False
    return all(path.rsplit('/', 1)[-1] == 'README.md' for path in changed_files)


@app.add_webhook("/dataset_repo")
async def handle_repository_changes(payload: WebhookPayload, task_queue: BackgroundTasks):
    """Decide whether a repository webhook should trigger a processing run.

    Responses:
        200 - nothing scheduled (event scope is not "repo*", no usable updated
              ref, change not on ``refs/heads/main``, or README-only change).
        501 - an unexpected error occurred while inspecting the changed files.
        202 - ``_process_webhook`` was queued on *task_queue*.
    """
    if not payload.event.scope.startswith("repo"):
        return Response("No task scheduled", status_code=status.HTTP_200_OK)

    # Only run if change is on main branch. A payload without updated refs
    # (None, empty list, or missing attribute) is answered with a plain
    # "No task scheduled"; only those specific failures are caught so that
    # unrelated errors are not hidden.
    try:
        updated_ref = payload.updatedRefs[0]
        ref_name = updated_ref.ref
    except (AttributeError, IndexError, TypeError):
        response_content = "No task scheduled"
        logger.info(response_content)
        return Response(response_content, status_code=status.HTTP_200_OK)

    if ref_name != 'refs/heads/main':
        response_content = "No task scheduled: Change not on main branch"
        logger.info(response_content)
        return Response(response_content, status_code=status.HTTP_200_OK)

    # No need to run for README only updates
    try:
        commit_files_url = f"""{payload.repo.url.api}/compare/{updated_ref.oldSha}..{updated_ref.newSha}?raw=true"""
        # NOTE(review): this is a synchronous HTTP request made from an async
        # handler and no explicit timeout is passed; consider moving it off
        # the event loop if slow responses become a problem.
        response_text = get_session().get(commit_files_url, headers=build_hf_headers()).text
        logger.info(f"Git Compare URl: {commit_files_url}")

        changed_files = _changed_files_from_raw_diff(response_text)
        logger.info(f"Changed files: {changed_files}")

        if _is_readme_only_change(changed_files):
            response_content = "No task scheduled: its a README only update."
            logger.info(response_content)
            return Response(response_content, status_code=status.HTTP_200_OK)
    except Exception as e:
        # Boundary handler: report the failure to the webhook sender instead
        # of letting the request crash.
        logger.error(f"{str(e)}")
        response_content = "Unexpected issue :'("
        logger.info(response_content)
        return Response(response_content, status_code=status.HTTP_501_NOT_IMPLEMENTED)

    logger.info(f"Webhook received from {payload.repo.name} indicating a repo {payload.event.action}")
    task_queue.add_task(_process_webhook, payload=payload)
    return Response("Task scheduled.", status_code=status.HTTP_202_ACCEPTED)
def _process_webhook(payload: WebhookPayload):
    """Refresh the processed dataset and the Nomic map.

    Steps, in order: load the datasets, merge/update rows, push the result to
    PROCESSED_DATASET on the Hub, update that dataset's README with the number
    of updated rows, then rebuild Nomic. Progress is logged around each step.

    *payload* is accepted because the function is scheduled with it, but it is
    not read here.
    """
    # Step 1: load
    logger.info("Loading new dataset...")
    ds, original_ds = load_datasets()
    logger.info("Loaded new dataset")

    # Step 2: merge
    logger.info("Merging and Updating rows...")
    ds, n_updated = merge_and_update_datasets(ds, original_ds)
    logger.info("Merged and Updated rows")

    # Step 3: publish the augmented dataset to the Hugging Face hub
    logger.info("Pushing processed data to the Hugging Face Hub...")
    ds.push_to_hub(PROCESSED_DATASET)
    logger.info("Pushed processed data to the Hugging Face Hub")

    # Step 4: README
    update_dataset_readme(
        dataset_name=PROCESSED_DATASET,
        subreddit=SUBREDDIT,
        new_rows=n_updated,
    )
    logger.info("Updated README.")

    # Step 5: Nomic
    logger.info("Building Nomic...")
    build_nomic(dataset=ds)
    logger.info("Built Nomic")

    logger.info("Update from webhook completed!")
if __name__ == "__main__":
    # Script entry point: start the combined UI/webhook server, listening on
    # every interface at port 7860 with error display enabled.
    app.launch(
        server_port=7860,
        server_name="0.0.0.0",
        show_error=True,
    )
|