"""Gradio Space that recommends similar papers for a Hugging Face Papers page.

Recommendations come from the Semantic Scholar recommendations API. They can
optionally be posted back to the paper page as a Librarian Bot comment, and
every posted comment is logged to a dataset repo through a CommitScheduler.
"""

import json
import os
from functools import lru_cache
from pathlib import Path
from typing import Tuple

import gradio as gr
import requests
from bs4 import BeautifulSoup
from cachetools import cached, TTLCache
from dotenv import load_dotenv
from httpx import Client
from huggingface_hub import CommitScheduler

load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")
CACHE_TIME = 60 * 60 * 6  # 6 hours
# Seconds to wait for any outbound HTTP call; without a timeout `requests`
# blocks forever when the remote end stalls.
REQUEST_TIMEOUT = 30

client = Client()

REPO_ID = "librarian-bots/paper-recommendations-v2"

# Periodically pushes the local "comments" folder to the dataset repo.
scheduler = CommitScheduler(
    repo_id=REPO_ID,
    repo_type="dataset",
    folder_path="comments",
    path_in_repo="data",
    every=5,
    token=HF_TOKEN,
)


def parse_arxiv_id_from_paper_url(url):
    """Return the last path segment of `url`, which is used as the arXiv ID."""
    return url.split("/")[-1]


@cached(cache=TTLCache(maxsize=500, ttl=CACHE_TIME))
def get_recommendations_from_semantic_scholar(semantic_scholar_id: str):
    """
    Fetch recommended papers for a paper from the Semantic Scholar API.

    Results are cached per ID for CACHE_TIME seconds.

    Args:
        semantic_scholar_id (str): Paper identifier sent as the single positive
            example (the caller passes it in the form "ArXiv:<id>").

    Returns:
        The "recommendedPapers" list from the API response.

    Raises:
        gr.Error: If the response has no "recommendedPapers" key.
        requests.exceptions.RequestException: If the request fails or times out.
    """
    try:
        r = requests.post(
            "https://api.semanticscholar.org/recommendations/v1/papers/",
            json={
                "positivePaperIds": [semantic_scholar_id],
            },
            params={"fields": "externalIds,title,year", "limit": 14},
            timeout=REQUEST_TIMEOUT,
        )
        return r.json()["recommendedPapers"]
    except KeyError as e:
        raise gr.Error(
            "Error getting recommendations, if this is a new paper it may not yet have"
            " been indexed by Semantic Scholar."
        ) from e


def filter_recommendations(recommendations, max_paper_count=7):
    """
    Keep only recommendations that have an arXiv ID, capped at `max_paper_count`.

    Args:
        recommendations: Iterable of paper dicts, each with an "externalIds" dict.
        max_paper_count (int): Maximum number of papers to return.

    Returns:
        list: The first `max_paper_count` papers that have an "ArXiv" external ID.
    """
    # include only arxiv papers
    arxiv_paper = [
        r for r in recommendations if r["externalIds"].get("ArXiv", None) is not None
    ]
    # Slicing is a no-op when the list is already short enough.
    return arxiv_paper[:max_paper_count]


@cached(cache=TTLCache(maxsize=500, ttl=CACHE_TIME))
def get_paper_title_from_arxiv_id(arxiv_id):
    """
    Look up a paper title through the Hugging Face papers API.

    Results are cached per ID for CACHE_TIME seconds.

    Args:
        arxiv_id: ID appended to the Hugging Face papers API URL.

    Returns:
        The "title" field of the API response.

    Raises:
        gr.Error: If the request fails or the response has no title.
    """
    try:
        return requests.get(
            f"https://huggingface.co/api/papers/{arxiv_id}",
            timeout=REQUEST_TIMEOUT,
        ).json()["title"]
    except Exception as e:
        print(f"Error getting paper title for {arxiv_id}: {e}")
        # The f-prefix was missing here, so users saw literal "{arxiv_id}: {e}".
        raise gr.Error(f"Error getting paper title for {arxiv_id}: {e}") from e


def format_recommendation_into_markdown(arxiv_id, recommendations):
    """
    Render recommendations as a markdown bullet list of Hugging Face paper links.

    Args:
        arxiv_id: Currently unused; kept so existing callers keep working.
        recommendations: Iterable of paper dicts with "externalIds" (containing
            "ArXiv"), "title" and "year" keys.

    Returns:
        str: A header line followed by one markdown bullet per recommendation.
    """
    # title = get_paper_title_from_arxiv_id(arxiv_id)
    # url = f"https://huggingface.co/papers/{arxiv_id}"
    # comment = f"Recommended papers for [{title}]({url})\n\n"
    header = "The following papers were recommended by the Semantic Scholar API \n\n"
    bullets = (
        f"* [{r['title']}](https://huggingface.co/papers/{r['externalIds']['ArXiv']})"
        f" ({r['year']})\n"
        for r in recommendations
    )
    return header + "".join(bullets)


def format_comment(result: str):
    """
    Wrap the recommendation markdown in the Librarian Bot comment boilerplate.

    Args:
        result (str): The markdown list of recommendations.

    Returns:
        str: The full comment text to post on the paper page.
    """
    result = (
        "This is an automated message from the [Librarian Bot](https://huggingface.co/librarian-bots). I found the following papers similar to this paper. \n\n"
        + result
    )
    result += "\n\n Please give a thumbs up to this comment if you found it helpful!"
    result += "\n\n If you want recommendations for any Paper on Hugging Face checkout [this](https://huggingface.co/spaces/librarian-bots/recommend_similar_papers) Space"
    result += "\n\n You can directly ask Librarian Bot for paper recommendations by tagging it in a comment: `@librarian-bot recommend`"
    return result


def post_comment(
    paper_url: str, comment: str, comment_id: str | None = None, token: str = HF_TOKEN
) -> Tuple[bool, str]:
    """
    Post a comment on a paper or a reply to a comment using the Hugging Face API.

    Args:
        paper_url (str): The URL of the paper to post the comment on.
        comment (str): The text of the comment or reply to post.
        comment_id (str, optional): The ID of the comment to reply to. If provided,
            the function will post a reply to the specified comment. Defaults to None.
        token (str, optional): The authentication token to use for the API request.
            Defaults to HF_TOKEN.

    Returns:
        Tuple[bool, str]: A tuple containing two elements:
            - bool: True if the comment or reply was posted successfully (HTTP 201),
              False otherwise.
            - str: The ID of the posted comment or reply if successful, an empty
              string otherwise.

    Request errors (including timeouts) are caught, printed and reported as
    ``(False, "")``; they are not propagated to the caller.
    """
    try:
        paper_id = paper_url.split("/")[-1]
        if comment_id:
            url = f"https://huggingface.co/api/papers/{paper_id}/comment/{comment_id}/reply"
            gr.Info(f"Replying to comment {comment_id}")
            print(f"Replying to comment {comment_id}")
        else:
            url = f"https://huggingface.co/api/papers/{paper_id}/comment"
            print(f"Posting comment for {paper_url}")
            gr.Info(f"Posting comment for {paper_url}")

        headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json",
        }
        comment_data = {"comment": comment}
        response = requests.post(
            url, json=comment_data, headers=headers, timeout=REQUEST_TIMEOUT
        )

        if response.status_code == 201:
            posted_comment_id = response.json().get("id", "")
            if comment_id:
                print(
                    f"Reply posted successfully to comment {comment_id} for {paper_url}. Reply ID: {posted_comment_id}"
                )
            else:
                print(
                    f"Comment posted successfully for {paper_url}. Comment ID: {posted_comment_id}"
                )
            return True, posted_comment_id
        else:
            print(
                f"Failed to post {'reply' if comment_id else 'comment'} for {paper_url}. Status code: {response.status_code}"
            )
            print(f"Response text: {response.text}")
            return False, ""
    except requests.exceptions.RequestException as e:
        print(
            f"Error posting {'reply' if comment_id else 'comment'} for {paper_url}: {e}"
        )
        return False, ""


# @lru_cache(maxsize=500)
# def is_comment_from_librarian_bot(html: str) -> bool:
#     """
#     Checks if the given HTML contains a comment from the librarian-bot.
#     Args:
#         html (str): The HTML content to check.
#     Returns:
#         bool: True if a comment from the librarian-bot is found, False otherwise.
#     """
#     soup = BeautifulSoup(html, "lxml")
#     librarian_bot_links = soup.find_all("a", string="librarian-bot")
#     return any(librarian_bot_links)


def check_if_lib_bot_comment_exists(paper_url: str) -> Tuple[bool, str]:
    """
    Check if a comment or reply from the librarian-bot exists for a given paper URL
    using the Hugging Face API.

    Args:
        paper_url (str): The URL of the paper to check for librarian-bot comments.

    Returns:
        Tuple[bool, str]: A tuple containing two elements:
            - bool: True if a comment or reply from the librarian-bot is found,
              False otherwise.
            - str: The ID of the top-level comment that is from, or was replied to
              by, the librarian-bot; an empty string otherwise.

    Any error while retrieving or parsing the comments is caught and reported as
    ``(True, "")`` so that callers err on the side of NOT posting a duplicate.
    A non-200 response is reported as ``(False, "")``.
    """
    try:
        paper_id = paper_url.split("/")[-1]
        url = f"https://huggingface.co/api/papers/{paper_id}/?field=comments"
        headers = {"Authorization": f"Bearer {HF_TOKEN}"}
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)

        if response.status_code == 200:
            paper_data = response.json()
            comments = paper_data.get("comments", [])
            for comment in comments:
                comment_author = comment.get("author", {}).get("name")
                if comment_author == "librarian-bot":
                    return True, comment.get("id")
                replies = comment.get("replies", [])
                for reply in replies:
                    reply_author = reply.get("author", {}).get("name")
                    if reply_author == "librarian-bot":
                        # Report the parent comment's ID, not the reply's.
                        return True, comment.get("id")
        else:
            print(
                f"Failed to retrieve comments for {paper_url}. Status code: {response.status_code}"
            )
        return False, ""
    except Exception as e:
        print(f"Error checking if comment exists for {paper_url}: {e}")
        return True, ""  # default to not posting comment


def log_comments(paper_url: str, comment: str):
    """
    Logs comments for a given paper URL.

    Writes ``comments/<paper_id>.json`` unless that file already exists. The write
    happens under the scheduler lock so it does not interleave with a commit.

    Args:
        paper_url (str): The URL of the paper.
        comment (str): The comment to be logged.

    Returns:
        None
    """
    paper_id = paper_url.split("/")[-1]
    file_path = Path(f"comments/{paper_id}.json")
    if not file_path.exists():
        with scheduler.lock:
            with open(file_path, "w") as f:
                data = {"paper_url": paper_url, "comment": comment}
                json.dump(data, f)


def return_recommendations(
    url: str, comment_id: str | None, post_to_paper: bool = True
) -> str:
    """
    Build the recommendation markdown for a paper and optionally post it.

    Args:
        url (str): Hugging Face Papers URL; its last path segment is the arXiv ID.
        comment_id (str | None): If truthy, the recommendations are posted as a
            reply to this comment; otherwise as a new top-level comment.
        post_to_paper (bool): Whether to post the recommendations to the paper page.

    Returns:
        str: The markdown list of recommendations (returned whether or not posting
        happened or succeeded).
    """
    arxiv_id = parse_arxiv_id_from_paper_url(url)
    recommendations = get_recommendations_from_semantic_scholar(f"ArXiv:{arxiv_id}")
    filtered_recommendations = filter_recommendations(recommendations)
    formatted_recommendation = format_recommendation_into_markdown(
        arxiv_id, filtered_recommendations
    )  # Assign early

    if not post_to_paper:
        return formatted_recommendation

    comment = format_comment(formatted_recommendation)
    # Check if a librarian-bot comment already exists.
    existing_comments, existing_comment_id = check_if_lib_bot_comment_exists(url)
    if existing_comments:
        gr.Info(
            f"Librarian-bot already commented on this paper. Comment ID: {existing_comment_id}. No further action will be taken."
        )
        return formatted_recommendation

    # No existing librarian-bot comment: reply if a comment_id was given,
    # otherwise post a new comment. `comment_status` is always bound from here
    # on, so the failure check below can never hit an unbound name.
    if comment_id:
        comment_status, posted_comment_id = post_comment(
            url, comment, comment_id, token=HF_TOKEN
        )
        success_message = f"Posted reply to comment {posted_comment_id}"
    else:
        comment_status, posted_comment_id = post_comment(url, comment, token=HF_TOKEN)
        success_message = f"Posted new comment {posted_comment_id}"

    if comment_status:
        log_comments(url, comment)
        gr.Info(success_message)
    else:
        gr.Info("Failed to post comment")
    return formatted_recommendation


title = "Semantic Scholar Paper Recommender"
description = (
    "Paste a link to a paper on Hugging Face Papers and get recommendations for similar"
    " papers from Semantic Scholar. **Note**: Some papers may not have recommendations"
    " yet if they are new or have not been indexed by Semantic Scholar."
)
examples = [
    ["https://huggingface.co/papers/2309.12307", None, False],
    ["https://huggingface.co/papers/2211.10086", None, False],
]
interface = gr.Interface(
    return_recommendations,
    [
        gr.Textbox(lines=1),
        gr.Textbox(None, lines=1, label="Comment ID (only for API)", visible=False),
        gr.Checkbox(False, label="Post recommendations to Paper page?"),
    ],
    gr.Markdown(),
    examples=examples,
    title=title,
    description=description,
)
interface.queue()
interface.launch()