File size: 6,793 Bytes
9ae1b66
 
 
 
0952d73
8bedcfb
0515cdc
7bde858
9ae1b66
8bedcfb
9ae1b66
8bedcfb
ba7deb1
9ae1b66
 
 
 
 
28757aa
9ae1b66
 
 
ba7deb1
9ae1b66
2216ba6
9ae1b66
 
 
 
85c9513
 
f2b70d1
 
 
 
 
 
 
 
 
 
 
85c9513
 
 
 
 
 
b6f8c08
 
 
fdda521
b6f8c08
 
9ae1b66
 
8686a5f
08d3eb0
aeb2044
9ae1b66
 
 
aeb2044
 
 
9ae1b66
85c9513
 
9ae1b66
 
 
 
 
7bde858
bb1fcb1
 
d0819c0
 
 
5802d42
 
 
d0819c0
5802d42
 
 
9ae1b66
7bde858
0952d73
c9245bc
7bde858
5a23388
c1815e2
 
 
 
 
 
 
 
 
 
 
5802d42
 
3bd2613
7bde858
 
5802d42
 
0952d73
bb1fcb1
 
 
 
8bedcfb
bb1fcb1
9ae1b66
 
 
 
7fa626d
ef9cbc8
7fa626d
9ae1b66
 
ba7deb1
2216ba6
9ae1b66
 
ef9cbc8
 
 
 
ba7deb1
 
 
 
88b7fb3
 
8bedcfb
9ae1b66
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import os
from pathlib import Path

import gradio as gr
import requests
from fastapi import BackgroundTasks, Response, status
from huggingface_hub import WebhookPayload, WebhooksServer
from huggingface_hub.utils import build_hf_headers, get_session

from src.build_nomic import build_nomic
from src.my_logger import setup_logger
from src.readme_update import update_dataset_readme
from src.utilities import load_datasets, merge_and_update_datasets
from src.visualize_logs import log_file_to_html_string

# Directory that contains this file. ``__file__`` is the module's path, whereas
# ``__name__`` is only the module's *name* (e.g. "__main__"), so
# ``Path(__name__).parent`` always evaluated to "." regardless of location.
proj_dir = Path(__file__).parent

logger = setup_logger(__name__)
logger.info("Starting Application...")

# Required settings: a missing variable raises KeyError while the module loads.
SUBREDDIT = os.environ["SUBREDDIT"]
USERNAME = os.environ["USERNAME"]
# Name of the source dataset repo, derived from the user name and subreddit.
OG_DATASET = f"{USERNAME}/dataset-creator-reddit-{SUBREDDIT}"
# Dataset repo that the processed data is pushed to (see _process_webhook).
PROCESSED_DATASET = os.environ['PROCESSED_DATASET']
# HF_TOKEN = os.environ["HF_TOKEN"]
# Secret handed to WebhooksServer as ``webhook_secret``.
# NOTE(review): when HF_WEBHOOK_SECRET is unset this falls back to the literal
# 'secret', which is guessable -- make sure the variable is set in deployment.
WEBHOOK_SECRET = os.getenv("HF_WEBHOOK_SECRET", 'secret')

# Markdown rendered at the top of the "Application" tab, directly above the
# embedded map. It is user-facing text: editing it changes what the UI shows.
intro_md = """
# Processing BORU
This is a space to visually search the subreddit [/r/bestofredditorupdates](https://www.reddit.com/r/BestofRedditorUpdates/). 
Have you ever been curious to search for stories that are similar to one of your favorites? This can help!

- Each dot represents a post (try clicking on one)
- Closer dots are similar in topic
- Use the filters on the left to help you narrow down what you are looking for
    - The lasso can help you search in a smaller range that you drag with your mouse
    - The filter can help you narrow by field, 
        - Filtering posts that are `CONCLUDED`
        - Filtering popular posts
        - Filtering by date
    - The search can help you look by keyword
    
Check out the original on [Nomic](https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map)
"""

# Markdown rendered in the "Details" tab; describes how the data shown in this
# space is produced. Also user-facing text.
details_md = """
# Details
## Creation Details
1. This space is triggered by a webhook for changes on [reddit-tools-HF/dataset-creator-reddit-bestofredditorupdates](https://huggingface.co/datasets/reddit-tools-HF/dataset-creator-reddit-bestofredditorupdates).
2. It then takes the updates from that dataset and get embeddings by making leveraging [reddit-tools-HF/nomic-embeddings](https://huggingface.co/spaces/reddit-tools-HF/nomic-embeddings)
    - [reddit-tools-HF/nomic-embeddings](https://huggingface.co/spaces/reddit-tools-HF/nomic-embeddings) is using [zero-spaces](https://huggingface.co/zero-gpu-explorers) a free GPU service to compute the model [nomic-ai/nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5)
    - Im calling this via [gradio_client](https://www.gradio.app/docs/client) which allows any space to be used as an API
3. The calculated embeddings are stored in this dataset [reddit-tools-HF/reddit-bestofredditorupdates-processed](https://huggingface.co/datasets/reddit-tools-HF/reddit-bestofredditorupdates-processed)
4. These get visualized by [nomic atlas](https://docs.nomic.ai/atlas/introduction/quick-start). You can see how I process it in [build_nomic.py](https://huggingface.co/spaces/reddit-tools-HF/processing-bestofredditorupdates/blob/main/src/build_nomic.py)
"""

# Nomic Atlas map that is embedded in the "Application" tab.
url = "https://atlas.nomic.ai/data/derek2/boru-subreddit-neural-search/map"
# Well-formed iframe markup: the src attribute is quoted and the element is
# explicitly closed (an iframe has no implicit end tag), so the result does not
# depend on the browser's error recovery.
html_str = (
    f'<iframe src="{url}" style="border:none;height:1024px;width:100%" '
    'allow="clipboard-read; clipboard-write" title="Nomic Atlas"></iframe>'
)

# Three-tab UI: the embedded map, a live view of the log file, and static details.
with gr.Blocks() as ui:
    with gr.Tab("Application"):
        gr.Markdown(intro_md)
        gr.HTML(html_str)
    with gr.Tab("Logs"):
        gr.Markdown("# Logs")
        # The value is a callable and ``every=1`` is set, so Gradio is asked to
        # re-evaluate it periodically (every second) to refresh the log view.
        output = gr.HTML(log_file_to_html_string, every=1)
    with gr.Tab("Details"):
        gr.Markdown(details_md)

# Serve the Gradio UI and the webhook endpoints from one server; incoming
# webhooks are validated against WEBHOOK_SECRET.
app = WebhooksServer(ui=ui.queue(), webhook_secret=WEBHOOK_SECRET)


@app.add_webhook("/dataset_repo")
async def handle_repository_changes(payload: WebhookPayload, task_queue: BackgroundTasks):
    """Webhook endpoint: schedule dataset processing for relevant repo changes.

    Nothing is scheduled (HTTP 200) when:
      * the event scope does not start with ``"repo"``;
      * the payload has no usable first updated ref, or that ref is not
        ``refs/heads/main``;
      * the compare between the ref's old and new SHA lists at least one file
        and every listed file name contains ``README.md``.

    Otherwise ``_process_webhook`` is added to ``task_queue`` and HTTP 202 is
    returned. If building/fetching/parsing the compare raises, the error is
    logged and HTTP 501 is returned without scheduling anything.

    Args:
        payload: The webhook payload describing the event, repo and updated refs.
        task_queue: Background task queue used to schedule ``_process_webhook``.

    Returns:
        A ``Response`` whose body explains the decision.
    """
    if not payload.event.scope.startswith("repo"):
        return Response("No task scheduled", status_code=status.HTTP_200_OK)

    # Only run if change is on main branch.
    # ``updatedRefs`` may be None (TypeError) or empty (IndexError); only those
    # lookup failures mean "nothing to do". A bare ``except:`` here would also
    # swallow unrelated errors and BaseExceptions such as task cancellation.
    try:
        pushed_ref = payload.updatedRefs[0]
        ref_name = pushed_ref.ref
    except (AttributeError, IndexError, TypeError):
        response_content = "No task scheduled"
        logger.info(response_content)
        return Response(response_content, status_code=status.HTTP_200_OK)

    if ref_name != 'refs/heads/main':
        response_content = "No task scheduled: Change not on main branch"
        logger.info(response_content)
        return Response(response_content, status_code=status.HTTP_200_OK)

    # No need to run for README only updates
    try:
        commit_files_url = f"""{payload.repo.url.api}/compare/{pushed_ref.oldSha}..{pushed_ref.newSha}?raw=true"""
        # Log the URL before the request so it is available even if the call fails.
        logger.info(f"Git Compare URl: {commit_files_url}")
        # NOTE(review): this is a blocking HTTP call inside an async handler.
        # The timeout bounds how long it can stall; without one a hung
        # connection would block indefinitely.
        response_text = get_session().get(commit_files_url, headers=build_hf_headers(), timeout=30).text

        # One changed file per non-blank line; the path is the last tab-separated field.
        changed_files = [line.split('\t')[-1] for line in response_text.split('\n') if line.strip()]
        logger.info(f"Changed files: {changed_files}")

        # Checking if only README.md has been changed. ``all()`` of an empty
        # list is True, so require at least one parsed file: an empty compare
        # result is not evidence of a README-only change.
        if changed_files and all('README.md' in file for file in changed_files):
            response_content = "No task scheduled: its a README only update."
            logger.info(response_content)
            return Response(response_content, status_code=status.HTTP_200_OK)
    except Exception as e:
        # Boundary handler: report the failure to the caller instead of crashing.
        logger.error(f"{str(e)}")
        response_content = "Unexpected issue :'("
        logger.info(response_content)
        # NOTE(review): 501 is kept for compatibility with existing behaviour;
        # 500 would describe an unexpected server-side failure more accurately.
        return Response(response_content, status_code=status.HTTP_501_NOT_IMPLEMENTED)

    logger.info(f"Webhook received from {payload.repo.name} indicating a repo {payload.event.action}")
    task_queue.add_task(_process_webhook, payload=payload)
    return Response("Task scheduled.", status_code=status.HTTP_202_ACCEPTED)


def _process_webhook(payload: WebhookPayload):
    """Run the refresh pipeline queued by the webhook handler.

    In order: load the datasets, merge/update their rows, push the result to
    ``PROCESSED_DATASET`` on the Hub, update that dataset's README with the
    updated-row count, and rebuild the Nomic map. Progress is logged around
    each step.

    Args:
        payload: The webhook payload that triggered the run. It is part of the
            task signature but is not read by this function.
    """
    # Step 1: load
    logger.info("Loading new dataset...")
    loaded, original = load_datasets()
    logger.info("Loaded new dataset")

    # Step 2: merge
    logger.info("Merging and Updating rows...")
    merged, rows_updated = merge_and_update_datasets(loaded, original)
    logger.info("Merged and Updated rows")

    # Step 3: publish the merged dataset to the Hugging Face hub
    logger.info("Pushing processed data to the Hugging Face Hub...")
    merged.push_to_hub(PROCESSED_DATASET)
    logger.info("Pushed processed data to the Hugging Face Hub")

    # Step 4: refresh the dataset README
    update_dataset_readme(
        dataset_name=PROCESSED_DATASET,
        subreddit=SUBREDDIT,
        new_rows=rows_updated,
    )
    logger.info("Updated README.")

    # Step 5: rebuild the Nomic map from the merged dataset
    logger.info("Building Nomic...")
    build_nomic(dataset=merged)
    logger.info("Built Nomic")

    logger.info("Update from webhook completed!")


if __name__ == "__main__":
    # Executed as a script: serve on every network interface at port 7860,
    # with error display enabled.
    app.launch(
        show_error=True,
        server_name="0.0.0.0",
        server_port=7860,
    )