import http.client as http_client import json import logging import os import re import string import gradio as gr import requests lang = "en" def process_results(results, highlight_terms): if len(results) == 0: return """

No results retrieved.



""" results_html = "" for result in results: tokens = result["text"].split() tokens_html = [] for token in tokens: if token in highlight_terms: tokens_html.append("{}".format(token)) else: tokens_html.append(token) tokens_html = " ".join(tokens_html) meta_html = ( """

{}

""".format( result["meta"]["url"], result["meta"]["url"] ) if "meta" in result and result["meta"] is not None and "url" in result["meta"] else "" ) docid_html = str(result["docid"]) results_html += """{}

Document ID: {}

Language: {}

{}


""".format( meta_html, docid_html, result["lang"], tokens_html ) return results_html + "
" def scisearch(query, language, num_results=10): query = " ".join(query.split()) if query == "" or query is None: return "" post_data = {"query": query, "k": num_results} output = requests.post( os.environ.get("address"), headers={"Content-type": "application/json"}, data=json.dumps(post_data), timeout=60, ) payload = json.loads(output.text) results = payload["results"] highlight_terms = payload["highlight_terms"] return process_results(results, highlight_terms) description = """#

🌸 🔎 ROOTS search tool 🔍 🌸

The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose of training the Multilingual Large Language Model [BLOOM](https://huggingface.co/bigscience/bloom). This tool allows you to search through the ROOTS corpus. We serve a BM25 index for each language or group of languages included in ROOTS. You can read more about the details of the tool design [here](https://huggingface.co/spaces/bigscience-data/scisearch/blob/main/roots_search_tool_specs.pdf). For more information and instructions on how to access the full corpus check [this form](https://forms.gle/qyYswbEL5kA23Wu99).""" if __name__ == "__main__": demo = gr.Blocks( css=".underline-on-hover:hover { text-decoration: underline; } .flagging { font-size:12px; color:Silver; }" ) with demo: with gr.Row(): gr.Markdown(value=description) with gr.Row(): query = gr.Textbox(lines=1, max_lines=1, placeholder="Type your query here...", label="Query") with gr.Row(): k = gr.Slider(1, 100, value=10, step=1, label="Max Results") with gr.Row(): submit_btn = gr.Button("Submit") with gr.Row(): results = gr.HTML(label="Results") def submit(query, lang, k): query = query.strip() if query is None or query == "": return "", "" return { results: scisearch(query, lang, k), } query.submit(fn=submit, inputs=[query, lang, k], outputs=[results]) submit_btn.click(submit, inputs=[query, lang, k], outputs=[results]) demo.launch(enable_queue=True, debug=True)