import html
import http.client as http_client
import json
import logging
import os
import re
import string

import gradio as gr
import requests


def mark_tokens_bold(string, tokens, escape=False):
    """Return ``string`` with every occurrence of a token wrapped in ``<b>`` tags.

    Matching is literal and case-sensitive, and tokens are matched as plain
    substrings (no word-boundary check). All tokens are matched in a single
    pass, longest first, so a short token can never match inside a ``<b>`` tag
    that was inserted for another token.

    Args:
        string: Text to highlight. (The parameter name shadows the ``string``
            module inside this function; it is kept for caller compatibility.)
        tokens: Iterable of substrings to emphasise. Empty tokens are ignored.
        escape: When true, HTML-escape the text itself (matched and unmatched
            parts alike) so that only the inserted ``<b>`` tags are markup.

    Returns:
        The highlighted text.
    """
    quote = html.escape if escape else (lambda piece: piece)

    # Deduplicate, drop empty tokens (an empty pattern matches at every
    # position) and try longer tokens first so "foobar" wins over "foo".
    wanted = sorted({t for t in (tokens or ()) if t}, key=lambda t: (-len(t), t))
    if not wanted:
        return quote(string)

    pattern = re.compile("|".join(re.escape(t) for t in wanted))

    # Assemble the output piece by piece instead of using a replacement
    # template: a template would interpret backslashes inside the token.
    pieces = []
    cursor = 0
    for match in pattern.finditer(string):
        pieces.append(quote(string[cursor:match.start()]))
        pieces.append("<b>" + quote(match.group(0)) + "</b>")
        cursor = match.end()
    pieces.append(quote(string[cursor:]))
    return "".join(pieces)


def process_results(results, highlight_terms):
    """Render search hits as an HTML fragment.

    Each hit must provide ``text``, ``repo_name``, ``repo_path`` and
    ``repo_license`` (an iterable of strings); ``meta["url"]`` is optional and
    is rendered as a link when present. Every value taken from a hit is
    HTML-escaped before being placed in the fragment.

    NOTE(review): the markup that originally surrounded these strings is not
    recoverable from the source; minimal ``<p>``/``<a>`` tags are used so the
    fields render on separate lines - confirm against the intended styling.

    Args:
        results: List of hit dictionaries.
        highlight_terms: Substrings to render in bold inside each hit's text.

    Returns:
        The HTML fragment, or a "No results retrieved." notice when
        ``results`` is empty.
    """
    if not results:
        return "<p>No results retrieved.</p>\n"

    cards = []
    for result in results:
        # Escape and highlight in one step so entities produced by escaping
        # can never be split by a highlight tag.
        text_html = mark_tokens_bold(result["text"], highlight_terms, escape=True)

        url = (result.get("meta") or {}).get("url")
        if url:
            safe_url = html.escape(str(url))
            meta_html = '<p><a class="underline-on-hover" href="{}">{}</a></p>\n'.format(
                safe_url, safe_url
            )
        else:
            meta_html = ""

        licenses = " | ".join(result["repo_license"])
        cards.append(
            "{}"
            "<p>Repository name: {}</p>\n"
            "<p>Repository path: {}</p>\n"
            "<p>Repository licenses: {}</p>\n"
            "<p>{}</p>\n".format(
                meta_html,
                html.escape(str(result["repo_name"])),
                html.escape(str(result["repo_path"])),
                html.escape(licenses),
                text_html,
            )
        )
    return "".join(cards) + "\n"


def scisearch(query, language, num_results=10):
    """Send ``query`` to the search backend and return the hits as HTML.

    The backend URL is read from the ``address`` environment variable. The
    request body is JSON with the keys ``query`` and ``k``.

    Args:
        query: Raw user query; runs of whitespace are collapsed to one space.
        language: Accepted for interface compatibility; it is not sent to the
            backend.
        num_results: Maximum number of hits to request.

    Returns:
        An HTML fragment from ``process_results``, or ``""`` when the query is
        ``None`` or blank.

    Raises:
        RuntimeError: If the ``address`` environment variable is not set.
        requests.HTTPError: If the backend answers with an error status.
    """
    # Check for None before calling str methods on the query.
    if query is None:
        return ""
    query = " ".join(query.split())
    if not query:
        return ""

    address = os.environ.get("address")
    if not address:
        raise RuntimeError(
            'The "address" environment variable must hold the search endpoint URL.'
        )

    post_data = {"query": query, "k": num_results}
    response = requests.post(
        address,
        headers={"Content-type": "application/json"},
        data=json.dumps(post_data),
        timeout=60,
    )
    # Fail on HTTP errors here rather than while parsing an error page as JSON.
    response.raise_for_status()

    payload = json.loads(response.text)
    return process_results(payload["results"], payload.get("highlight_terms") or [])


# Markdown shown at the top of the page.
description = """# 🌸 🔎 ROOTS search tool 🔍 🌸

The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose of training the Multilingual Large Language Model [BLOOM](https://huggingface.co/bigscience/bloom). This tool allows you to search through the ROOTS corpus. We serve a BM25 index for each language or group of languages included in ROOTS. You can read more about the details of the tool design [here](https://huggingface.co/spaces/bigscience-data/scisearch/blob/main/roots_search_tool_specs.pdf). For more information and instructions on how to access the full corpus check [this form](https://forms.gle/qyYswbEL5kA23Wu99)."""


def main():
    """Build the Gradio interface and start serving it."""
    demo = gr.Blocks(
        css=(
            ".underline-on-hover:hover { text-decoration: underline; } "
            ".flagging { font-size:12px; background-color:#20233fff; } "
            ".gradio-container {background-color: #20233fff}"
        )
    )

    with demo:
        with gr.Row():
            gr.Markdown(value=description)
        with gr.Row():
            query_box = gr.Textbox(
                lines=1,
                max_lines=1,
                placeholder="Type your query here...",
                label="Query",
            )
        with gr.Row():
            k_slider = gr.Slider(1, 100, value=10, step=1, label="Max Results")
        with gr.Row():
            submit_btn = gr.Button("Submit")
        with gr.Row():
            results_box = gr.HTML(label="Results")

        def submit(query, k, lang="en"):
            """Run a search for the textbox content and return the results HTML."""
            if query is None:
                return ""
            query = query.strip()
            if not query:
                # Exactly one value: the handler has a single output component.
                return ""
            return scisearch(query, lang, k)

        # Pressing Enter in the textbox and clicking the button do the same thing.
        query_box.submit(fn=submit, inputs=[query_box, k_slider], outputs=[results_box])
        submit_btn.click(submit, inputs=[query_box, k_slider], outputs=[results_box])

    demo.launch(enable_queue=True, debug=True)


if __name__ == "__main__":
    main()