"""Gradio front end for searching a code dataset through a remote search service.

The service URL is read from the ``address`` environment variable; queries are
POSTed to it as JSON and the returned hits are rendered as text/HTML.
"""

import http.client as http_client
import json
import logging
import os
import re
import string

import gradio as gr
import requests

# Returned by process_results() when the backend yields no hits.
_NO_RESULTS_HTML = """

No results retrieved.



"""

# One rendered hit; positional fields are: repository name, repository path,
# joined licenses, highlighted text.
_RESULT_TEMPLATE = """\

Repository name: {}

Repository path: {}

Repository licenses: {}


{}



"""


def mark_tokens_bold(string, tokens):
    """Wrap every occurrence of each token in ``string`` with highlight markup.

    Tokens are matched literally and anywhere in the text (not only at word
    boundaries). Empty tokens are skipped.

    Args:
        string: The text to highlight. (The parameter name shadows the
            ``string`` module inside this function; it is kept so existing
            callers are unaffected.)
        tokens: Iterable of literal substrings to highlight.

    Returns:
        The text with each token occurrence replaced by its wrapped form.
    """
    for token in tokens:
        if not token:
            # An empty token matches between every character; nothing to mark.
            continue
        # NOTE(review): the wrapper strings are empty here, so the output
        # currently equals the input -- confirm whether markup such as bold
        # tags was meant to surround the token.
        # Literal replacement is used instead of re.sub() because re.sub()
        # interprets backslashes in the replacement, which breaks on tokens
        # that contain them (common in source code).
        string = string.replace(token, "" + token + "")
    return string


def process_results(results, highlight_terms):
    """Render search hits as a single string for display.

    Args:
        results: Sequence of hit mappings. Each hit is read for the keys
            ``"text"``, ``"repo_license"`` (an iterable of strings),
            ``"repo_name"`` and ``"repo_path"``.
        highlight_terms: Tokens to highlight inside each hit's text.

    Returns:
        The concatenated rendering of all hits, or a fixed "no results"
        message when ``results`` is empty.
    """
    if not results:
        return _NO_RESULTS_HTML

    # NOTE(review): hit text is inserted as-is; if the backend does not
    # already escape it, consider HTML-escaping before display -- confirm.
    rendered = [
        _RESULT_TEMPLATE.format(
            result["repo_name"],
            result["repo_path"],
            " | ".join(result["repo_license"]),
            mark_tokens_bold(result["text"], highlight_terms),
        )
        for result in results
    ]
    return "".join(rendered)


def scisearch(query, language, num_results=10):
    """Send ``query`` to the search service and return the rendered hits.

    Args:
        query: Free-text query; runs of whitespace are collapsed to single
            spaces before sending. ``None`` or a blank query returns ``""``
            without contacting the service.
        language: Accepted for interface compatibility; not used here.
        num_results: Maximum number of hits to request (sent as ``"k"``).

    Returns:
        The string produced by ``process_results`` for the service response.

    Raises:
        RuntimeError: If the ``address`` environment variable is not set.
        requests.RequestException: On network failure, timeout, or an HTTP
            error status from the service.
    """
    # The None check must come first: calling .split() on None would raise.
    if query is None:
        return ""
    query = " ".join(query.split())
    if not query:
        return ""

    address = os.environ.get("address")
    if not address:
        raise RuntimeError(
            "The 'address' environment variable must be set to the search service URL."
        )

    post_data = {"query": query, "k": num_results}
    output = requests.post(
        address,
        headers={"Content-type": "application/json"},
        data=json.dumps(post_data),
        timeout=60,
    )
    # Surface HTTP errors directly instead of failing later while parsing
    # an error page as JSON.
    output.raise_for_status()

    payload = json.loads(output.text)
    return process_results(payload["results"], payload["highlight_terms"])


# Markdown shown at the top of the page.
# NOTE(review): the heading says "SantaCoder" while the body says "IceCoder",
# and the sentence after "the code requires" appears to be missing words --
# confirm the intended wording before changing this user-facing text.
description = """#

🎅 SantaCoder: Dataset Search 🔍

When you use IceCoder to generate code it might produce exact copies of code in the pretraining dataset. In that case the code requires and with this search tool we aim to provide help to finding out where the code came from."""


if __name__ == "__main__":
    demo = gr.Blocks(
        css=".gradio-container {background-color: #20233fff; color:white}"
    )

    with demo:
        with gr.Row():
            gr.Markdown(value=description)
        with gr.Row():
            query = gr.Textbox(lines=5, placeholder="Type your query here...", label="Query")
        with gr.Row():
            k = gr.Slider(1, 100, value=10, step=1, label="Max Results")
        with gr.Row():
            submit_btn = gr.Button("Submit")
        with gr.Row():
            results = gr.HTML(label="Results", value="contact")

        def submit(query, k, lang="en"):
            """Run a search for the textbox value and update the results pane."""
            # Check for None before stripping, and always return exactly one
            # value for the single ``results`` output component.
            if query is None:
                return {results: ""}
            query = query.strip()
            if query == "":
                return {results: ""}
            return {
                results: scisearch(query, lang, k),
            }

        # Both pressing Enter in the textbox and clicking the button search.
        query.submit(fn=submit, inputs=[query, k], outputs=[results])
        submit_btn.click(submit, inputs=[query, k], outputs=[results])

    demo.launch(enable_queue=True, debug=True)