import gradio as gr
import datetime
import json
import random
import requests
from constants import *
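# `constants` is expected to define, among others: API_URL, DEBUG, INDEX_BY_DESC,
# INDEX_DESCS, MAXNUM, MAX_DISP_LEN, MAX_SUPPORT, MAX_CLAUSE_FREQ, MAX_DIFF_TOKENS,
# MAX_CLAUSES_PER_CNF, MAX_TERMS_PER_CLAUSE, DEFAULT_CONCURRENCY_LIMIT, MAX_SIZE,
# MAX_THREADS, and the lowercase slider defaults (max_clause_freq, max_diff_tokens,
# max_support, maxnum, max_disp_len) -- every such name used below.


# Send one query to the infini-gram API endpoint and return the decoded JSON result.
# `kwargs` carries the query-type-specific fields (query, maxnum, max_support, ...).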
def process(query_type, index_desc, **kwargs):
timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
index = INDEX_BY_DESC[index_desc]
data = {
'source': 'hf' if not DEBUG else 'hf-dev',
'timestamp': timestamp,
'query_type': query_type,
'index': index,
}
data.update(kwargs)
print(json.dumps(data))
if API_URL is None:
        raise ValueError('API_URL environment variable is not set!')
try:
response = requests.post(API_URL, json=data, timeout=10)
except requests.exceptions.Timeout:
raise ValueError('Web request timed out. Please try again later.')
except requests.exceptions.RequestException as e:
raise ValueError(f'Web request error: {e}')
if response.status_code == 200:
result = response.json()
else:
raise ValueError(f'HTTP error {response.status_code}: {response.json()}')
if DEBUG:
print(result)
return result
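# Render token ids and token strings for display. A simple query yields one flat
# list of ids; a CNF query yields lists nested per clause and per term, so the
# output has one line per term and a blank line between clauses.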
def format_tokenization_info(result):
if not ('token_ids' in result and 'tokens' in result):
return ''
token_ids = result['token_ids']
tokens = result['tokens']
    def render(ids, toks):
        return '[' + ' '.join('"' + tok.replace('Ġ', ' ') + '"' for tok in toks) + '] ' + str(ids)
    if isinstance(token_ids, list) and all(isinstance(token_id, int) for token_id in token_ids):
        # simple query: one flat list of token ids
        output = render(token_ids, tokens)
    else:
        # CNF query: ids/tokens are nested per clause, then per term; render one
        # line per term and separate clauses with a blank line
        clause_strs = []
        for clause_ids, clause_tokens in zip(token_ids, tokens):
            term_strs = [render(ids, toks) for ids, toks in zip(clause_ids, clause_tokens)]
            clause_strs.append('\n'.join(term_strs))
        output = '\n\n'.join(clause_strs)
return output
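# Summarize a retrieved document: its index, token length (plus the truncated
# display length, if different), and its metadata string.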
def format_doc_metadata(doc):
formatted = f'Document #{doc["doc_ix"]}\n'
if doc['doc_len'] == doc['disp_len']:
formatted += f'Length: {doc["doc_len"]} tokens\n'
else:
formatted += f'Length: {doc["doc_len"]} tokens ({doc["disp_len"]} tokens displayed)\n'
metadata = doc['metadata'].strip("\n")
formatted += f'Metadata: {metadata}'
return formatted
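# Handlers for the query tabs below. Each takes the selected corpus plus the
# form inputs, calls process(), and returns values in the order of the tab's
# output components.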
def count(index_desc, query, max_clause_freq, max_diff_tokens):
if ' AND ' in query or ' OR ' in query: # CNF query
result = process('count', index_desc, query=query, max_clause_freq=max_clause_freq, max_diff_tokens=max_diff_tokens)
else: # simple query
result = process('count', index_desc, query=query)
latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
tokenization_info = format_tokenization_info(result)
if 'error' in result:
count = result['error']
else:
count = f'{result["count"]:,}'
return latency, tokenization_info, count
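# Query Type 2: P(last token | preceding tokens), reported with its count ratio.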
def prob(index_desc, query):
result = process('prob', index_desc, query=query)
latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
tokenization_info = format_tokenization_info(result)
if 'error' in result:
prob = result['error']
elif result['prompt_cnt'] == 0:
prob = '(n-1)-gram is not found in the corpus'
else:
prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
return latency, tokenization_info, prob
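# Query Type 3: full next-token distribution of an (n-1)-gram, returned as a
# {label: probability} dict for the gr.Label component.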
def ntd(index_desc, query, max_support):
result = process('ntd', index_desc, query=query, max_support=max_support)
latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
tokenization_info = format_tokenization_info(result)
if 'error' in result:
ntd = result['error']
else:
result_by_token_id = result['result_by_token_id']
ntd = {}
for token_id, r in result_by_token_id.items():
ntd[f'{r["token"]} ({r["cont_cnt"]} / {result["prompt_cnt"]})'] = r['prob']
if ntd == {}:
ntd = '(n-1)-gram is not found in the corpus'
return latency, tokenization_info, ntd
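# Query Type 4: ∞-gram probability, conditioning on the longest suffix of the
# input that is found in the corpus.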
def infgram_prob(index_desc, query):
result = process('infgram_prob', index_desc, query=query)
latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
tokenization_info = format_tokenization_info(result)
if 'error' in result:
longest_suffix = ''
prob = result['error']
else:
longest_suffix = result['longest_suffix']
prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
return latency, tokenization_info, longest_suffix, prob
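# Query Type 5: ∞-gram next-token distribution over the longest found suffix.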
def infgram_ntd(index_desc, query, max_support):
result = process('infgram_ntd', index_desc, query=query, max_support=max_support)
latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
tokenization_info = format_tokenization_info(result)
if 'error' in result:
longest_suffix = ''
ntd = result['error']
else:
longest_suffix = result['longest_suffix']
result_by_token_id = result['result_by_token_id']
ntd = {}
for token_id, r in result_by_token_id.items():
ntd[f'{r["token"]} ({r["cont_cnt"]} / {result["prompt_cnt"]})'] = r['prob']
return latency, tokenization_info, longest_suffix, ntd
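# Legacy Query Type 6: fetch up to MAXNUM random matching documents in one shot,
# padding the metadata/doc outputs so the number of return values is fixed.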
def search_docs(index_desc, query, maxnum, max_disp_len, max_clause_freq, max_diff_tokens):
if ' AND ' in query or ' OR ' in query: # CNF query
result = process('search_docs', index_desc, query=query, maxnum=maxnum, max_disp_len=max_disp_len, max_clause_freq=max_clause_freq, max_diff_tokens=max_diff_tokens)
else: # simple query
result = process('search_docs', index_desc, query=query, maxnum=maxnum, max_disp_len=max_disp_len)
latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
tokenization_info = format_tokenization_info(result)
if 'error' in result:
message = result['error']
metadatas = ['' for _ in range(MAXNUM)]
docs = [[] for _ in range(MAXNUM)]
else:
message = result['message']
metadatas = [format_doc_metadata(doc) for doc in result['documents']]
docs = [doc['spans'] for doc in result['documents']]
metadatas = metadatas[:maxnum]
docs = docs[:maxnum]
while len(metadatas) < MAXNUM:
metadatas.append('')
while len(docs) < MAXNUM:
docs.append([])
return tuple([latency, tokenization_info, message] + metadatas + docs)
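# New Query Type 6: first locate all matches ('find' / 'find_cnf') and stash the
# result in per-session state, then fetch one document at a time by index.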
def search_docs_new(index_desc, query, max_disp_len, max_clause_freq, max_diff_tokens, state):
if ' AND ' in query or ' OR ' in query: # CNF query
find_result = process('find_cnf', index_desc, query=query, max_clause_freq=max_clause_freq, max_diff_tokens=max_diff_tokens)
find_result['type'] = 'cnf'
else: # simple query
find_result = process('find', index_desc, query=query)
find_result['type'] = 'simple'
state = find_result
latency = '' if 'latency' not in find_result else f'{find_result["latency"]:.3f}'
tokenization_info = format_tokenization_info(find_result)
if 'error' in find_result:
message = find_result['error']
        idx = gr.Slider(minimum=0, maximum=0, step=1, value=0, interactive=False)
metadata = ''
doc = []
return latency, tokenization_info, message, idx, metadata, doc, state
if ' AND ' in query or ' OR ' in query: # CNF query
ptrs_by_shard = find_result['ptrs_by_shard']
        cnt_retrievable = sum(len(ptrs) for ptrs in ptrs_by_shard)
if find_result["approx"]:
message = f'Approximately {find_result["cnt"]} occurrences found, of which {cnt_retrievable} are retrievable'
else:
message = f'{find_result["cnt"]} occurrences found'
else: # simple query
message = f'{find_result["cnt"]} occurrences found'
cnt_retrievable = find_result['cnt']
if cnt_retrievable == 0:
        idx = gr.Slider(minimum=0, maximum=0, step=1, value=0, interactive=False)
metadata = ''
doc = []
return latency, tokenization_info, message, idx, metadata, doc, state
idx = random.randint(0, cnt_retrievable-1)
metadata, doc = get_another_doc(index_desc, idx, max_disp_len, state)
    idx = gr.Slider(minimum=0, maximum=cnt_retrievable-1, step=1, value=idx, interactive=True)
return latency, tokenization_info, message, idx, metadata, doc, state
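# Reset the document-index control and drop the cached find result.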
def clear_search_docs_new(state):
state = None
    idx = gr.Slider(minimum=0, maximum=0, step=1, value=0, interactive=False)
return idx, state
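# Map a global document index onto the shards: walk the per-shard counts to find
# the shard `s` and the local offset, then fetch the document by pointer (CNF
# query) or by rank (simple query).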
def get_another_doc(index_desc, idx, max_disp_len, state):
find_result = state
    if find_result is None or not (isinstance(idx, int) and 0 <= idx < find_result['cnt']):
metadata = ''
doc = []
return metadata, doc
if find_result['type'] == 'cnf':
ptrs_by_shard = find_result['ptrs_by_shard']
cnt_by_shard = [len(ptrs) for ptrs in ptrs_by_shard]
s = 0
while idx >= cnt_by_shard[s]:
idx -= cnt_by_shard[s]
s += 1
ptr = ptrs_by_shard[s][idx]
result = process('get_doc_by_ptr', index_desc, s=s, ptr=ptr, max_disp_len=max_disp_len, query_ids=find_result['token_ids'])
else: # simple query
segment_by_shard = find_result['segment_by_shard']
cnt_by_shard = [end - start for (start, end) in segment_by_shard]
s = 0
while idx >= cnt_by_shard[s]:
idx -= cnt_by_shard[s]
s += 1
rank = segment_by_shard[s][0] + idx
result = process('get_doc_by_rank', index_desc, s=s, rank=rank, max_disp_len=max_disp_len, query_ids=find_result['token_ids'])
if 'error' in result:
metadata = result['error']
doc = []
return metadata, doc
metadata = format_doc_metadata(result)
doc = result['spans']
return metadata, doc
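# Gradio UI: a corpus selector on the left and one tab per query type on the right.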
with gr.Blocks() as demo:
with gr.Column():
gr.HTML(
'''<h1 style="text-align: center;">Infini-gram: An Efficient Search Engine over the Massive Pretraining Datasets of Language Models</h1>
<p style='font-size: 16px;'>This engine does exact-match search over several open pretraining datasets of language models. Please first select the corpus and the type of query, then enter your query and submit.</p>
<p style='font-size: 16px;'>The engine is developed by <a href="https://liujch1998.github.io">Jiacheng Liu</a> and documented in our paper: <a href="https://huggingface.co/papers/2401.17377">Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens</a>. Feel free to check out our <a href="https://infini-gram.io">Project Homepage</a>.</p>
<p style='font-size: 16px;'><b>API Endpoint:</b> If you'd like to issue batch queries to infini-gram, you may invoke our API endpoint. Please refer to the <a href="https://infini-gram.io/api_doc">API documentation</a>.</p>
<p style='font-size: 16px;'><b>Note:</b> The query is <b>case-sensitive</b>. Your query will be tokenized with the Llama-2 tokenizer (unless otherwise specified).</p>
'''
)
with gr.Row():
with gr.Column(scale=1, min_width=240):
index_desc = gr.Radio(choices=INDEX_DESCS, label='Corpus', value=INDEX_DESCS[0])
with gr.Column(scale=7):
with gr.Tab('1. Count an n-gram'):
with gr.Column():
gr.HTML('<h2>1. Count an n-gram</h2>')
with gr.Accordion(label='Click to view instructions', open=False):
gr.HTML(f'''<p style="font-size: 16px;">This counts the number of times an n-gram appears in the corpus. If you submit an empty input, it will return the total number of tokens in the corpus. You can also make more complex queries by connecting multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>.</p>
<br>
<p style="font-size: 16px;">Example queries:</p>
<ul style="font-size: 16px;">
<li><b>natural language processing</b> (the output is the number of occurrences of "natural language processing")</li>
<li><b>natural language processing AND deep learning</b> (the output is the number of co-occurrences of "natural language processing" and "deep learning")</li>
<li><b>natural language processing OR artificial intelligence AND deep learning OR machine learning</b> (the output is the number of co-occurrences of [one of "natural language processing" / "artificial intelligence"] and [one of "deep learning" / "machine learning"])</li>
</ul>
<br>
<p style="font-size: 16px;">Notes on CNF queries:</p>
<ul style="font-size: 16px;">
<li>A CNF query may contain up to {MAX_CLAUSES_PER_CNF} clauses, and each clause may contain up to {MAX_TERMS_PER_CLAUSE} n-gram terms.</li>
<li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
<li>In AND queries, we can only examine co-occurrences where adjacent clauses are separated by no more than {max_diff_tokens} tokens. This value can be adjusted within range [1, {MAX_DIFF_TOKENS}] in "Advanced options".</li>
<li>In AND queries, if a clause has more than {max_clause_freq} matches, we will estimate the count by examining a random subset of {max_clause_freq} occurrences of that clause. This value can be adjusted within range [1, {MAX_CLAUSE_FREQ}] in "Advanced options".</li>
<li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
</ul>
''')
with gr.Row():
with gr.Column(scale=1):
count_query = gr.Textbox(placeholder='Enter a string (an n-gram) here', label='Query', interactive=True)
with gr.Accordion(label='Advanced options', open=False):
with gr.Row():
count_max_clause_freq = gr.Slider(minimum=1, maximum=MAX_CLAUSE_FREQ, value=max_clause_freq, step=1, label='max_clause_freq')
count_max_diff_tokens = gr.Slider(minimum=1, maximum=MAX_DIFF_TOKENS, value=max_diff_tokens, step=1, label='max_diff_tokens')
with gr.Row():
count_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
count_submit = gr.Button(value='Submit', variant='primary', visible=True)
count_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
count_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
with gr.Column(scale=1):
count_count = gr.Label(label='Count', num_top_classes=0)
count_clear.add([count_query, count_latency, count_tokenized, count_count])
count_submit.click(count, inputs=[index_desc, count_query, count_max_clause_freq, count_max_diff_tokens], outputs=[count_latency, count_tokenized, count_count], api_name=False)
with gr.Tab('2. Prob of the last token'):
with gr.Column():
gr.HTML('<h2>2. Compute the probability of the last token in an n-gram</h2>')
with gr.Accordion(label='Click to view instructions', open=False):
gr.HTML(f'''<p style="font-size: 16px;">This computes the n-gram probability of the last token conditioned on the previous tokens (i.e. (n-1)-gram)).</p>
<br>
<p style="font-size: 16px;">Example query: <b>natural language processing</b> (the output is P(processing | natural language), by counting the appearance of the 3-gram "natural language processing" and the 2-gram "natural language", and take the division between the two)</p>
<br>
<p style="font-size: 16px;">Notes:</p>
<ul style="font-size: 16px;">
<li>The (n-1)-gram needs to exist in the corpus. If the (n-1)-gram is not found in the corpus, an error message will appear.</li>
</ul>
''')
with gr.Row():
with gr.Column(scale=1):
prob_query = gr.Textbox(placeholder='Enter a string (an n-gram) here', label='Query', interactive=True)
with gr.Row():
prob_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
prob_submit = gr.Button(value='Submit', variant='primary', visible=True)
prob_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
prob_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
with gr.Column(scale=1):
prob_probability = gr.Label(label='Probability', num_top_classes=0)
prob_clear.add([prob_query, prob_latency, prob_tokenized, prob_probability])
prob_submit.click(prob, inputs=[index_desc, prob_query], outputs=[prob_latency, prob_tokenized, prob_probability], api_name=False)
with gr.Tab('3. Next-token distribution'):
with gr.Column():
gr.HTML('<h2>3. Compute the next-token distribution of an (n-1)-gram</h2>')
with gr.Accordion(label='Click to view instructions', open=False):
gr.HTML(f'''<p style="font-size: 16px;">This is an extension of the Query Type 2: It interprets your input as the (n-1)-gram and gives you the full next-token distribution.</p>
<br>
<p style="font-size: 16px;">Example query: <b>natural language</b> (the output is P(* | natural language), for the top-10 tokens *)</p>
<br>
<p style="font-size: 16px;">Notes:</p>
<ul style="font-size: 16px;">
<li>The (n-1)-gram needs to exist in the corpus. If the (n-1)-gram is not found in the corpus, an error message will appear.</li>
<li>If the (n-1)-gram appears more than {max_support} times in the corpus, the result will be approximate: we will estimate the distribution by examining a subset of {max_support} occurrences of the (n-1)-gram. This value can be adjusted within range [1, {MAX_SUPPORT}] in "Advanced options".</li>
</ul>
''')
with gr.Row():
with gr.Column(scale=1):
ntd_query = gr.Textbox(placeholder='Enter a string (an (n-1)-gram) here', label='Query', interactive=True)
with gr.Accordion(label='Advanced options', open=False):
ntd_max_support = gr.Slider(minimum=1, maximum=MAX_SUPPORT, value=MAX_SUPPORT, step=1, label='max_support')
with gr.Row():
ntd_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
ntd_submit = gr.Button(value='Submit', variant='primary', visible=True)
ntd_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
ntd_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
with gr.Column(scale=1):
ntd_distribution = gr.Label(label='Distribution', num_top_classes=10)
ntd_clear.add([ntd_query, ntd_latency, ntd_tokenized, ntd_distribution])
ntd_submit.click(ntd, inputs=[index_desc, ntd_query, ntd_max_support], outputs=[ntd_latency, ntd_tokenized, ntd_distribution], api_name=False)
with gr.Tab('4. ∞-gram prob'):
with gr.Column():
gr.HTML('<h2>4. Compute the ∞-gram probability of the last token</h2>')
with gr.Accordion(label='Click to view instructions', open=False):
gr.HTML(f'''<p style="font-size: 16px;">This computes the ∞-gram probability of the last token conditioned on the previous tokens. Compared to Query Type 2 (which uses your entire input for n-gram modeling), here we take the longest suffix that we can find in the corpus.</p>
<br>
<p style="font-size: 16px;">Example query: <b>I love natural language processing</b> (if "natural language" appears in the corpus but "love natural language" doesn't, the output is P(processing | natural language); in this case the effective n = 3)</p>
<br>
<p style="font-size: 16px;">Notes:</p>
<ul style="font-size: 16px;">
<li>The effective n may be as small as 1, i.e., the longest found suffix may be empty, in which case the result reduces to the uni-gram probability of the last token.</li>
</ul>
''')
with gr.Row():
with gr.Column(scale=1):
infgram_prob_query = gr.Textbox(placeholder='Enter a string here', label='Query', interactive=True)
with gr.Row():
infgram_prob_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
infgram_prob_submit = gr.Button(value='Submit', variant='primary', visible=True)
infgram_prob_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
infgram_prob_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
infgram_prob_longest_suffix = gr.Textbox(label='Longest Found Suffix', interactive=False)
with gr.Column(scale=1):
infgram_prob_probability = gr.Label(label='Probability', num_top_classes=0)
infgram_prob_clear.add([infgram_prob_query, infgram_prob_latency, infgram_prob_tokenized, infgram_prob_longest_suffix, infgram_prob_probability])
infgram_prob_submit.click(infgram_prob, inputs=[index_desc, infgram_prob_query], outputs=[infgram_prob_latency, infgram_prob_tokenized, infgram_prob_longest_suffix, infgram_prob_probability], api_name=False)
with gr.Tab('5. ∞-gram next-token distribution'):
with gr.Column():
gr.HTML('<h2>5. Compute the ∞-gram next-token distribution</h2>')
with gr.Accordion(label='Click to view instructions', open=False):
gr.HTML(f'''<p style="font-size: 16px;">This is similar to Query Type 3, but with ∞-gram instead of n-gram.</p>
<br>
<p style="font-size: 16px;">Example query: <b>I love natural language</b> (if "natural language" appears in the corpus but "love natural language" doesn't, the output is P(* | natural language), for the top-10 tokens *)</p>
''')
with gr.Row():
with gr.Column(scale=1):
infgram_ntd_query = gr.Textbox(placeholder='Enter a string here', label='Query', interactive=True)
with gr.Accordion(label='Advanced options', open=False):
infgram_ntd_max_support = gr.Slider(minimum=1, maximum=MAX_SUPPORT, value=MAX_SUPPORT, step=1, label='max_support')
with gr.Row():
infgram_ntd_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
infgram_ntd_submit = gr.Button(value='Submit', variant='primary', visible=True)
infgram_ntd_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
infgram_ntd_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
infgram_ntd_longest_suffix = gr.Textbox(label='Longest Found Suffix', interactive=False)
with gr.Column(scale=1):
infgram_ntd_distribution = gr.Label(label='Distribution', num_top_classes=10)
infgram_ntd_clear.add([infgram_ntd_query, infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution])
infgram_ntd_submit.click(infgram_ntd, inputs=[index_desc, infgram_ntd_query, infgram_ntd_max_support], outputs=[infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution], api_name=False)
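                    # Legacy document-search tab: an earlier one-shot version, kept hidden in favor of the stateful version below.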
with gr.Tab('6. Search documents', visible=False):
with gr.Column():
gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>''')
with gr.Accordion(label='Click to view instructions', open=False):
gr.HTML(f'''<p style="font-size: 16px;">This displays a few random documents in the corpus that satisfies your query. You can simply enter an n-gram, in which case the document displayed would contain your n-gram. You can also connect multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>, in which case the displayed document contains n-grams such that it satisfies this logical constraint.</p>
<br>
<p style="font-size: 16px;">Example queries:</p>
<ul style="font-size: 16px;">
<li><b>natural language processing</b> (the displayed document would contain "natural language processing")</li>
<li><b>natural language processing AND deep learning</b> (the displayed document would contain both "natural language processing" and "deep learning")</li>
<li><b>natural language processing OR artificial intelligence AND deep learning OR machine learning</b> (the displayed document would contain at least one of "natural language processing" / "artificial intelligence", and also at least one of "deep learning" / "machine learning")</li>
</ul>
<br>
<p style="font-size: 16px;">If you want another batch of random documents, simply hit the Submit button again :)</p>
<br>
<p style="font-size: 16px;">Notes on CNF queries:</p>
<ul style="font-size: 16px;">
<li>A CNF query may contain up to {MAX_CLAUSES_PER_CNF} clauses, and each clause may contain up to {MAX_TERMS_PER_CLAUSE} n-gram terms.</li>
<li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
<li>In AND queries, we can only examine co-occurrences where adjacent clauses are separated by no more than {max_diff_tokens} tokens. This value can be adjusted within range [1, {MAX_DIFF_TOKENS}] in "Advanced options".</li>
<li>In AND queries, if a clause has more than {max_clause_freq} matches, we will estimate the count by examining a random subset of {max_clause_freq} documents out of all documents containing that clause. This value can be adjusted within range [1, {MAX_CLAUSE_FREQ}] in "Advanced options".</li>
<li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
</ul>
<br>
<p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>
''')
with gr.Row():
with gr.Column(scale=1):
search_docs_query = gr.Textbox(placeholder='Enter a query here', label='Query', interactive=True)
search_docs_maxnum = gr.Slider(minimum=1, maximum=MAXNUM, value=maxnum, step=1, label='Number of documents to display')
search_docs_max_disp_len = gr.Slider(minimum=1, maximum=MAX_DISP_LEN, value=max_disp_len, step=1, label='Number of tokens to display')
with gr.Accordion(label='Advanced options', open=False):
with gr.Row():
search_docs_max_clause_freq = gr.Slider(minimum=1, maximum=MAX_CLAUSE_FREQ, value=max_clause_freq, step=1, label='max_clause_freq')
search_docs_max_diff_tokens = gr.Slider(minimum=1, maximum=MAX_DIFF_TOKENS, value=max_diff_tokens, step=1, label='max_diff_tokens')
with gr.Row():
search_docs_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
search_docs_submit = gr.Button(value='Submit', variant='primary', visible=True)
search_docs_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
search_docs_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
with gr.Column(scale=2):
search_docs_message = gr.Label(label='Message', num_top_classes=0)
search_docs_metadatas = []
search_docs_outputs = []
for i in range(MAXNUM):
with gr.Tab(label=str(i+1)):
search_docs_metadatas.append(gr.Textbox(label='Metadata', lines=3, interactive=False))
search_docs_outputs.append(gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"}))
search_docs_clear.add([search_docs_query, search_docs_latency, search_docs_tokenized, search_docs_message] + search_docs_metadatas + search_docs_outputs)
search_docs_submit.click(search_docs, inputs=[index_desc, search_docs_query, search_docs_maxnum, search_docs_max_disp_len, search_docs_max_clause_freq, search_docs_max_diff_tokens], outputs=[search_docs_latency, search_docs_tokenized, search_docs_message] + search_docs_metadatas + search_docs_outputs, api_name=False)
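                    # Current document-search tab: locate matches once, then browse documents one at a time via the index control.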
with gr.Tab('6. Search documents'):
with gr.Column():
gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>''')
with gr.Accordion(label='Click to view instructions', open=False):
gr.HTML(f'''<p style="font-size: 16px;">This displays the documents in the corpus that satisfies your query. You can simply enter an n-gram, in which case the document displayed would contain your n-gram. You can also connect multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>, in which case the displayed document contains n-grams such that it satisfies this logical constraint.</p>
<br>
<p style="font-size: 16px;">Example queries:</p>
<ul style="font-size: 16px;">
<li><b>natural language processing</b> (the displayed document would contain "natural language processing")</li>
<li><b>natural language processing AND deep learning</b> (the displayed document would contain both "natural language processing" and "deep learning")</li>
<li><b>natural language processing OR artificial intelligence AND deep learning OR machine learning</b> (the displayed document would contain at least one of "natural language processing" / "artificial intelligence", and also at least one of "deep learning" / "machine learning")</li>
</ul>
<br>
<p style="font-size: 16px;">Notes on CNF queries:</p>
<ul style="font-size: 16px;">
<li>A CNF query may contain up to {MAX_CLAUSES_PER_CNF} clauses, and each clause may contain up to {MAX_TERMS_PER_CLAUSE} n-gram terms.</li>
<li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
<li>In AND queries, we can only examine co-occurrences where adjacent clauses are separated by no more than {max_diff_tokens} tokens. This value can be adjusted within range [1, {MAX_DIFF_TOKENS}] in "Advanced options".</li>
<li>In AND queries, if a clause has more than {max_clause_freq} matches, we will estimate the count by examining a random subset of {max_clause_freq} occurrences of that clause. This value can be adjusted within range [1, {MAX_CLAUSE_FREQ}] in "Advanced options".</li>
<li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
</ul>
<br>
<p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>
''')
with gr.Row():
with gr.Column(scale=1):
search_docs_new_query = gr.Textbox(placeholder='Enter a query here', label='Query', interactive=True)
search_docs_new_max_disp_len = gr.Slider(minimum=1, maximum=MAX_DISP_LEN, value=max_disp_len, step=1, label='Number of tokens to display')
with gr.Accordion(label='Advanced options', open=False):
with gr.Row():
search_docs_new_max_clause_freq = gr.Slider(minimum=1, maximum=MAX_CLAUSE_FREQ, value=max_clause_freq, step=1, label='max_clause_freq')
search_docs_new_max_diff_tokens = gr.Slider(minimum=1, maximum=MAX_DIFF_TOKENS, value=max_diff_tokens, step=1, label='max_diff_tokens')
with gr.Row():
search_docs_new_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
search_docs_new_submit = gr.Button(value='Submit', variant='primary', visible=True)
search_docs_new_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
search_docs_new_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
with gr.Column(scale=2):
search_docs_new_message = gr.Label(label='Message', num_top_classes=0)
search_docs_new_idx = gr.Slider(label='', minimum=0, maximum=0, step=1, value=0, interactive=False)
search_docs_new_metadata = gr.Textbox(label='Metadata', lines=3, max_lines=3, interactive=False)
search_docs_new_output = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
search_docs_state = gr.State(value=None)
search_docs_new_clear.add([search_docs_new_query, search_docs_new_latency, search_docs_new_tokenized, search_docs_new_message, search_docs_new_idx, search_docs_new_metadata, search_docs_new_output])
search_docs_new_clear.click(
clear_search_docs_new,
inputs=[search_docs_state],
outputs=[search_docs_new_idx, search_docs_state]
)
search_docs_new_submit.click(
search_docs_new,
inputs=[index_desc, search_docs_new_query, search_docs_new_max_disp_len,
search_docs_new_max_clause_freq, search_docs_new_max_diff_tokens,
search_docs_state],
outputs=[search_docs_new_latency, search_docs_new_tokenized,
search_docs_new_message, search_docs_new_idx,
search_docs_new_metadata, search_docs_new_output,
search_docs_state]
)
search_docs_new_idx.input(
get_another_doc,
inputs=[index_desc, search_docs_new_idx, search_docs_new_max_disp_len,
search_docs_state],
outputs=[search_docs_new_metadata, search_docs_new_output]
)
with gr.Row():
gr.Markdown('''
If you find this tool useful, please kindly cite our paper:
```bibtex
@article{Liu2024InfiniGram,
title={Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens},
author={Liu, Jiacheng and Min, Sewon and Zettlemoyer, Luke and Choi, Yejin and Hajishirzi, Hannaneh},
journal={arXiv preprint arXiv:2401.17377},
year={2024}
}
```
''')
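# Queue requests and launch the app; the concurrency, queue-size, and thread
# limits all come from constants.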
demo.queue(
default_concurrency_limit=DEFAULT_CONCURRENCY_LIMIT,
max_size=MAX_SIZE,
api_open=False,
).launch(
max_threads=MAX_THREADS,
debug=DEBUG,
show_api=False,
)