Spaces:
Runtime error
Runtime error
File size: 4,436 Bytes
cb35e87 385bf5d fb9e6d1 96d3a8b 385bf5d cb35e87 385bf5d cb35e87 655c971 e146ae1 cb35e87 e146ae1 cb35e87 e146ae1 cb35e87 655c971 cb35e87 655c971 cb35e87 655c971 cb35e87 655c971 cb35e87 655c971 cb35e87 25cf3d1 cb35e87 655c971 cb35e87 01b1b14 cb35e87 01b1b14 cb35e87 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
import http.client as http_client
import json
import logging
import os
import re
import string
import gradio as gr
import requests
def mark_tokens_bold(string, tokens):
    """Wrap every occurrence of the given tokens in red, bold HTML markup.

    Matching is literal, case-sensitive and not anchored to word boundaries.

    Args:
        string: Text to search. (The parameter name shadows the ``string``
            module inside this function only.)
        tokens: Iterable of substrings to highlight. Empty tokens and
            duplicates are ignored.

    Returns:
        ``string`` with each match wrapped in
        ``<span style='color: red;'><b>...</b></span>``.
    """
    # Empty tokens are dropped because an empty alternative matches at every
    # position. Longest-first ordering makes "foobar" win over "foo" when
    # both are requested.
    unique_tokens = sorted({token for token in tokens if token}, key=len, reverse=True)
    if not unique_tokens:
        return string

    pattern = re.compile("|".join(re.escape(token) for token in unique_tokens))

    # One pass over the text, so a token such as "span" or "red" can never
    # match inside markup inserted for another token. The replacement is a
    # callable so backslashes in the matched text are emitted verbatim rather
    # than being interpreted as template escapes.
    return pattern.sub(
        lambda match: "<span style='color: red;'><b>" + match.group(0) + "</b></span>",
        string,
    )
def process_results(results, highlight_terms):
    """Render search results as a single HTML fragment.

    Args:
        results: Sequence of result dicts. Each entry must provide ``text``,
            ``repo_license`` (an iterable of strings), ``repo_name`` and
            ``repo_path``. An optional ``meta`` dict may carry a ``url``,
            which is rendered as a link above the result.
        highlight_terms: Substrings to highlight inside each result's text.

    Returns:
        A "No results retrieved." notice when ``results`` is empty;
        otherwise one HTML section per result followed by a closing ``<hr>``.
    """
    if not results:
        return (
            "<br><p style='font-family: Arial; color:Silver; text-align: center;'>\n"
            "No results retrieved.</p><br><hr>"
        )

    link_template = (
        "\n"
        "<p class='underline-on-hover' style='font-size:12px; font-family: Arial; color:#585858; text-align: left;'>\n"
        "<a href='{}' target='_blank'>{}</a></p>"
    )
    result_template = (
        "{}\n"
        "<p style='font-size:16px; font-family: Arial; text-align: left;'>Repository name: <span style='color: #20233fff;'>{}</span></p>\n"
        "<p style='font-size:16px; font-family: Arial; text-align: left;'>Repository path: <span style='color: #20233fff;'>{}</span></p>\n"
        "<p style='font-size:16px; font-family: Arial; text-align: left;'>Repository licenses: <span style='color: #20233fff;'>{}</span></p>\n"
        "<pre style='height: 600px; overflow: scroll;'><code>{}</code></pre>\n"
        "<br>\n"
    )

    # NOTE(review): the text and metadata values are interpolated into the
    # HTML without escaping -- confirm the backend returns HTML-safe values.
    sections = []
    for result in results:
        text_html = mark_tokens_bold(result["text"], highlight_terms)

        # The source link is optional: it is only rendered when the result
        # carries a non-None "meta" mapping with a "url" entry.
        meta = result.get("meta")
        if meta is not None and "url" in meta:
            meta_html = link_template.format(meta["url"], meta["url"])
        else:
            meta_html = ""

        licenses = " | ".join(result["repo_license"])
        sections.append(
            result_template.format(
                meta_html,
                result["repo_name"],
                result["repo_path"],
                licenses,
                text_html,
            )
        )

    # Join once instead of growing a string with += on every iteration.
    return "".join(sections) + "<hr>"
def scisearch(query, language, num_results=10):
    """Send a query to the search backend and return the results as HTML.

    Args:
        query: Raw query string. Runs of whitespace are collapsed to single
            spaces before the query is sent. ``None`` or a blank string
            short-circuits to an empty result.
        language: Accepted for interface compatibility; not used by this
            function.
        num_results: Maximum number of results to request (sent as ``k``).

    Returns:
        The HTML produced by ``process_results``, or ``""`` when the query
        is ``None`` or blank.

    Raises:
        requests.HTTPError: If the backend answers with an error status.
    """
    # The None check has to come before .split(); otherwise a None query
    # raises AttributeError and the guard is never reached.
    if query is None:
        return ""
    query = " ".join(query.split())
    if not query:
        return ""

    post_data = {"query": query, "k": num_results}
    # The backend URL is read from the "address" environment variable.
    response = requests.post(
        os.environ.get("address"),
        headers={"Content-type": "application/json"},
        data=json.dumps(post_data),
        timeout=60,
    )
    # Surface HTTP failures directly instead of trying to parse an error
    # page as JSON and failing later with a confusing decode/KeyError.
    response.raise_for_status()

    payload = json.loads(response.text)
    return process_results(payload["results"], payload["highlight_terms"])
# Markdown header rendered at the top of the page through gr.Markdown.
description = """# <p style="text-align: center;"> 🌸 🔎 ROOTS search tool 🔍 🌸 </p>
The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
of training the Multilingual Large Language Model [BLOOM](https://huggingface.co/bigscience/bloom). This tool allows
you to search through the ROOTS corpus. We serve a BM25 index for each language or group of languages included in
ROOTS. You can read more about the details of the tool design
[here](https://huggingface.co/spaces/bigscience-data/scisearch/blob/main/roots_search_tool_specs.pdf). For more
information and instructions on how to access the full corpus check [this form](https://forms.gle/qyYswbEL5kA23Wu99)."""
if __name__ == "__main__":
    # Build the single-page UI: header, query box, result-count slider,
    # submit button and an HTML pane that receives the rendered results.
    demo = gr.Blocks(
        css=".underline-on-hover:hover { text-decoration: underline; } .flagging { font-size:12px; background-color:#20233fff; } .gradio-container {background-color: #20233fff}"
    )

    with demo:
        with gr.Row():
            gr.Markdown(value=description)
        with gr.Row():
            query = gr.Textbox(lines=1, max_lines=1, placeholder="Type your query here...", label="Query")
        with gr.Row():
            k = gr.Slider(1, 100, value=10, step=1, label="Max Results")
        with gr.Row():
            submit_btn = gr.Button("Submit")
        with gr.Row():
            results = gr.HTML(label="Results")

        def submit(query_text, num_results, lang="en"):
            """Run a search for the textbox value and update the results pane.

            Returns a dict keyed by the ``results`` component. Exactly one
            output is wired to this handler, so every branch must yield
            exactly one value.
            """
            # Check for None before stripping, and clear the pane on a
            # blank query instead of returning two values for one output.
            if query_text is None or not query_text.strip():
                return {results: ""}
            return {results: scisearch(query_text.strip(), lang, num_results)}

        # Pressing Enter in the textbox and clicking the button both search.
        query.submit(fn=submit, inputs=[query, k], outputs=[results])
        submit_btn.click(submit, inputs=[query, k], outputs=[results])

    # Queueing is enabled through Blocks.queue(); the enable_queue argument
    # of launch() is deprecated and no longer accepted by newer Gradio.
    demo.queue().launch(debug=True)
|