liujch1998 committed · Commit 6477832 · 1 Parent(s): 0067690

Bug fix: find_result cache breaks down upon concurrent users
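For context on the bug being fixed: the Space cached the latest search result in a module-level find_result global, and Gradio runs every user's event handlers in the same Python process, so concurrent users overwrote each other's cached results. The fix threads the result through a per-session gr.State value instead. A minimal sketch of the two patterns (illustrative names only, not code from this repo):

import gradio as gr

# Old, buggy pattern: one module-level value shared by every session.
find_result = None

def search_buggy(query):
    global find_result
    find_result = {'query': query}  # user B's search clobbers user A's cache
    return str(find_result)

# Fixed pattern: the result rides along in a per-session gr.State value.
def search_fixed(query, state):
    state = {'query': query}
    return str(state), state

with gr.Blocks() as demo:
    state = gr.State(value=None)  # Gradio keeps one copy per browser session
    query = gr.Textbox(label='Query')
    result = gr.Textbox(label='Result')
    query.submit(search_fixed, inputs=[query, state], outputs=[result, state])

if __name__ == '__main__':
    demo.launch()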
README.md CHANGED

@@ -4,7 +4,7 @@ emoji: π
 colorFrom: blue
 colorTo: green
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.44.1
 app_file: app.py
 pinned: false
 license: cc-by-nc-sa-4.0
app.py CHANGED

@@ -150,16 +150,16 @@ def search_docs(index_desc, query, maxnum, max_disp_len, max_clause_freq, max_diff_tokens):
         docs.append([])
     return tuple([latency, tokenization_info, message] + metadatas + docs)
 
-
-
-def search_docs_new(index_desc, query, max_disp_len, max_clause_freq, max_diff_tokens):
-    global find_result
+def search_docs_new(index_desc, query, max_disp_len, max_clause_freq, max_diff_tokens, state):
     if ' AND ' in query or ' OR ' in query: # CNF query
         find_result = process('find_cnf', index_desc, query=query, max_clause_freq=max_clause_freq, max_diff_tokens=max_diff_tokens)
         find_result['type'] = 'cnf'
     else: # simple query
         find_result = process('find', index_desc, query=query)
         find_result['type'] = 'simple'
+
+    state = find_result
+
     latency = '' if 'latency' not in find_result else f'{find_result["latency"]:.3f}'
     tokenization_info = format_tokenization_info(find_result)
     if 'error' in find_result:
@@ -167,7 +167,7 @@ def search_docs_new(index_desc, query, max_disp_len, max_clause_freq, max_diff_tokens):
         idx = gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
         metadata = ''
         doc = []
-        return latency, tokenization_info, message, idx, metadata, doc
+        return latency, tokenization_info, message, idx, metadata, doc, state
 
     if ' AND ' in query or ' OR ' in query: # CNF query
         ptrs_by_shard = find_result['ptrs_by_shard']
@@ -183,21 +183,20 @@ def search_docs_new(index_desc, query, max_disp_len, max_clause_freq, max_diff_tokens):
         idx = gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
         metadata = ''
         doc = []
-        return latency, tokenization_info, message, idx, metadata, doc
+        return latency, tokenization_info, message, idx, metadata, doc, state
     idx = random.randint(0, cnt_retrievable-1)
-    metadata, doc = get_another_doc(index_desc, idx, max_disp_len)
+    metadata, doc = get_another_doc(index_desc, idx, max_disp_len, state)
     idx = gr.Number(minimum=0, maximum=cnt_retrievable-1, step=1, value=idx, interactive=True)
-    return latency, tokenization_info, message, idx, metadata, doc
+    return latency, tokenization_info, message, idx, metadata, doc, state
 
-def clear_search_docs_new():
-    global find_result
-    find_result = None
+def clear_search_docs_new(state):
+    state = None
     idx = gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
-    return idx
+    return idx, state
 
-def get_another_doc(index_desc, idx, max_disp_len):
-    global find_result
-    if not (type(idx) == int and 0 <= idx and idx < find_result['cnt']):
+def get_another_doc(index_desc, idx, max_disp_len, state):
+    find_result = state
+    if find_result is None or not (type(idx) == int and 0 <= idx and idx < find_result['cnt']):
         metadata = ''
         doc = []
         return metadata, doc
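Note the extra guard on the read side: a brand-new session, or one that was just cleared, has no cached result yet, so the state arrives as None. A minimal sketch of that defensive read (hypothetical field names, not code from this repo):

def get_doc(idx, state):
    find_result = state
    # No search has run in this session yet, or idx is out of range.
    if find_result is None or not (isinstance(idx, int) and 0 <= idx < find_result['cnt']):
        return '', []
    return find_result['metadatas'][idx], find_result['docs'][idx]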
@@ -230,10 +229,10 @@ def get_another_doc(index_desc, idx, max_disp_len):
 with gr.Blocks() as demo:
     with gr.Column():
         gr.HTML(
-            '''<h1 text-align="center">Infini-gram: An Engine
+            '''<h1 text-align="center">Infini-gram: An Efficient Search Engine over the Massive Pretraining Datasets of Language Models</h1>
 
-            <p style='font-size: 16px;'>This
-            <p style='font-size: 16px;'>The engine is developed by <a href="https://liujch1998.github.io">Jiacheng
+            <p style='font-size: 16px;'>This engine does exact-match search over several open pretraining datasets of language models. Please first select the corpus and the type of query, then enter your query and submit.</p>
+            <p style='font-size: 16px;'>The engine is developed by <a href="https://liujch1998.github.io">Jiacheng Liu</a> and documented in our paper: <a href="https://huggingface.co/papers/2401.17377">Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens</a>. Feel free to check out our <a href="https://infini-gram.io">Project Homepage</a>.</p>
             <p style='font-size: 16px;'><b>API Endpoint:</b> If you'd like to issue batch queries to infini-gram, you may invoke our API endpoint. Please refer to the <a href="https://infini-gram.io/api_doc">API documentation</a>.</p>
             <p style='font-size: 16px;'><b>Note:</b> The query is <b>case-sensitive</b>. Your query will be tokenized with the Llama-2 tokenizer (unless otherwise specified).</p>
             '''
@@ -482,10 +481,29 @@ with gr.Blocks() as demo:
     search_docs_new_idx = gr.Slider(label='', minimum=0, maximum=0, step=1, value=0, interactive=False)
     search_docs_new_metadata = gr.Textbox(label='Metadata', lines=3, max_lines=3, interactive=False)
     search_docs_new_output = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
+    search_docs_state = gr.State(value=None)
     search_docs_new_clear.add([search_docs_new_query, search_docs_new_latency, search_docs_new_tokenized, search_docs_new_message, search_docs_new_idx, search_docs_new_metadata, search_docs_new_output])
-    search_docs_new_clear.click(
-
-
+    search_docs_new_clear.click(
+        clear_search_docs_new,
+        inputs=[search_docs_state],
+        outputs=[search_docs_new_idx, search_docs_state]
+    )
+    search_docs_new_submit.click(
+        search_docs_new,
+        inputs=[index_desc, search_docs_new_query, search_docs_new_max_disp_len,
+                search_docs_new_max_clause_freq, search_docs_new_max_diff_tokens,
+                search_docs_state],
+        outputs=[search_docs_new_latency, search_docs_new_tokenized,
+                 search_docs_new_message, search_docs_new_idx,
+                 search_docs_new_metadata, search_docs_new_output,
+                 search_docs_state]
+    )
+    search_docs_new_idx.input(
+        get_another_doc,
+        inputs=[index_desc, search_docs_new_idx, search_docs_new_max_disp_len,
+                search_docs_state],
+        outputs=[search_docs_new_metadata, search_docs_new_output]
+    )
 
     with gr.Row():
         gr.Markdown('''
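The wiring above follows Gradio's state convention: a gr.State component listed in inputs hands the handler the session's current value, and the same component listed in outputs stores the handler's returned value back into the session. A toy round trip of that convention (hypothetical counter, not from app.py):

import gradio as gr

def bump(state):
    state = (state or 0) + 1  # read the current value, return the updated one
    return f'clicked {state} times', state

with gr.Blocks() as demo:
    state = gr.State(value=None)
    count = gr.Textbox(label='Count')
    button = gr.Button('Click')
    button.click(bump, inputs=[state], outputs=[count, state])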