File size: 41,044 Bytes
d8d9fba
4641d03
d8d9fba
8c4a00c
d8d9fba
e418a75
d8d9fba
555cd42
0d3c7d8
555cd42
d8d9fba
cb08e07
26b368d
d8d9fba
555cd42
d8d9fba
555cd42
26b368d
cb08e07
 
aa7da7f
 
 
 
 
 
d8d9fba
 
 
26b368d
9282a5a
4641d03
d8d9fba
 
619c9ac
 
 
 
 
2195005
 
 
 
 
 
 
 
 
 
 
 
 
3649303
 
619c9ac
3649303
619c9ac
3649303
 
 
619c9ac
 
3649303
 
 
 
 
5f2c7e6
619c9ac
5f2c7e6
 
 
 
619c9ac
5f2c7e6
555cd42
 
5f2c7e6
619c9ac
5f2c7e6
 
106f995
 
5f2c7e6
 
619c9ac
5f2c7e6
3649303
 
5f2c7e6
619c9ac
5f2c7e6
 
 
619c9ac
 
 
 
106f995
 
619c9ac
5f2c7e6
555cd42
 
5f2c7e6
619c9ac
5f2c7e6
 
 
 
 
 
619c9ac
5f2c7e6
3649303
 
5f2c7e6
619c9ac
5f2c7e6
 
 
 
 
619c9ac
 
 
 
 
5f2c7e6
3649303
 
 
 
 
5f2c7e6
2195005
5f2c7e6
 
3649303
555cd42
5f2c7e6
 
3649303
 
 
5f2c7e6
3649303
 
555cd42
5f2c7e6
3649303
26b368d
6477832
8c4a00c
 
 
 
 
 
6477832
 
 
8c4a00c
 
 
 
 
 
 
6477832
8c4a00c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6477832
8c4a00c
6477832
8c4a00c
6477832
8c4a00c
6477832
 
8c4a00c
6477832
8c4a00c
6477832
 
 
8c4a00c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d8d9fba
 
 
6477832
d8d9fba
6477832
 
c4bc2a0
506e239
d8d9fba
 
 
3649303
555cd42
d8d9fba
555cd42
9282a5a
 
 
3649303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8c4a00c
3649303
 
 
d8d9fba
9282a5a
 
3649303
 
 
 
9282a5a
 
 
 
3649303
9282a5a
 
 
3649303
d8d9fba
9282a5a
 
 
3649303
 
 
 
 
 
 
 
 
 
d8d9fba
9282a5a
 
 
 
 
 
3649303
9282a5a
 
 
555cd42
d8d9fba
9282a5a
 
 
3649303
 
 
 
 
 
 
 
 
 
 
 
d8d9fba
9282a5a
 
3649303
 
9282a5a
 
 
 
3649303
9282a5a
 
 
3649303
d8d9fba
9282a5a
 
 
3649303
 
 
 
 
 
 
 
 
 
d8d9fba
9282a5a
 
 
 
 
 
3649303
9282a5a
 
 
 
555cd42
d8d9fba
9282a5a
 
 
3649303
 
 
 
 
d8d9fba
9282a5a
 
3649303
 
9282a5a
 
 
 
3649303
9282a5a
 
 
 
3649303
26b368d
8c4a00c
9282a5a
3649303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d8d9fba
3649303
9282a5a
3649303
 
 
 
 
 
9282a5a
 
 
 
3649303
 
9282a5a
3649303
555cd42
 
 
3649303
555cd42
3649303
 
d8d9fba
8c4a00c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6477832
8c4a00c
6477832
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8c4a00c
4641d03
 
 
4a045f5
88fb6c3
 
 
 
 
 
4641d03
 
 
d8d9fba
9282a5a
 
4a045f5
d8d9fba
9282a5a
 
4bd71cd
d8d9fba
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
import gradio as gr
import datetime
import json
import random
import requests
from constants import *

def process(query_type, index_desc, **kwargs):
    """Send one query to the backend API and return the decoded JSON reply.

    Args:
        query_type: Name of the query, forwarded as ``query_type``.
        index_desc: Human-readable corpus description; it is mapped to the
            index identifier through ``INDEX_BY_DESC``.
        **kwargs: Extra request fields merged into the payload (they may
            override the default fields).

    Returns:
        The JSON-decoded response body.

    Raises:
        KeyError: If ``index_desc`` is not a key of ``INDEX_BY_DESC``.
        ValueError: If ``API_URL`` is unset, the request fails or times out,
            the server answers with a non-200 status, or the body of a 200
            response is not valid JSON.
    """
    timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    index = INDEX_BY_DESC[index_desc]
    data = {
        'source': 'hf' if not DEBUG else 'hf-dev',
        'timestamp': timestamp,
        'query_type': query_type,
        'index': index,
    }
    data.update(kwargs)
    # Log every outgoing request payload.
    print(json.dumps(data))
    if API_URL is None:
        raise ValueError('API_URL envvar is not set!')
    try:
        response = requests.post(API_URL, json=data, timeout=10)
    except requests.exceptions.Timeout:
        raise ValueError('Web request timed out. Please try again later.') from None
    except requests.exceptions.RequestException as e:
        raise ValueError(f'Web request error: {e}') from e
    if response.status_code != 200:
        # Error bodies are not guaranteed to be JSON (e.g. a proxy error
        # page); fall back to the raw text so the status code is still
        # reported instead of a JSON decode failure.
        try:
            detail = response.json()
        except ValueError:
            detail = response.text
        raise ValueError(f'HTTP error {response.status_code}: {detail}')
    try:
        result = response.json()
    except ValueError as e:
        raise ValueError(f'Invalid JSON in API response: {e}') from e
    if DEBUG:
        print(result)
    return result

def _format_token_list(tokens, token_ids):
    """Render one token sequence as ``["tok" "tok"] [id, id]``."""
    quoted = " ".join('"' + token.replace('Ġ', ' ') + '"' for token in tokens)
    return '[' + quoted + '] ' + str(token_ids)


def format_tokenization_info(result):
    """Build the text shown in the "Tokenized" box from an API result.

    Args:
        result: Mapping returned by the API. It is only formatted when it
            contains both ``'token_ids'`` and ``'tokens'``.

    Returns:
        ``''`` when either key is missing. When ``token_ids`` is a flat list
        of ints, a single ``[tokens] [ids]`` line. Otherwise ``token_ids`` and
        ``tokens`` are treated as two-level nested lists: one line per inner
        entry, inner entries joined by a newline and outer groups joined by a
        blank line.
    """
    if 'token_ids' not in result or 'tokens' not in result:
        return ''
    token_ids = result['token_ids']
    tokens = result['tokens']
    if isinstance(token_ids, list) and all(isinstance(token_id, int) for token_id in token_ids):
        return _format_token_list(tokens, token_ids)
    groups = []
    for group_ids, group_tokens in zip(token_ids, tokens):
        lines = [
            _format_token_list(term_tokens, term_ids)
            for term_ids, term_tokens in zip(group_ids, group_tokens)
        ]
        groups.append('\n'.join(lines))
    return '\n\n'.join(groups)
def format_doc_metadata(doc):
    """Return a three-line summary (index, length, metadata) for a document.

    ``doc`` must provide ``doc_ix``, ``doc_len``, ``disp_len`` and
    ``metadata``. The displayed length is only mentioned when it differs from
    the full document length, and newlines around the metadata are trimmed.
    """
    length_line = f'Length: {doc["doc_len"]} tokens'
    if doc['doc_len'] != doc['disp_len']:
        length_line += f' ({doc["disp_len"]} tokens displayed)'
    trimmed = doc['metadata'].strip("\n")
    return '\n'.join([
        f'Document #{doc["doc_ix"]}',
        length_line,
        f'Metadata: {trimmed}',
    ])

def count(index_desc, query, max_clause_freq, max_diff_tokens):
    """Run a 'count' query and return (latency, tokenization info, count).

    Queries containing ' AND ' or ' OR ' are treated as CNF queries and also
    forward ``max_clause_freq`` and ``max_diff_tokens``; other queries send
    the query string only. The third element is the API error message when
    the result carries one, otherwise the count with thousands separators.
    """
    params = {'query': query}
    if ' AND ' in query or ' OR ' in query:  # CNF query
        params['max_clause_freq'] = max_clause_freq
        params['max_diff_tokens'] = max_diff_tokens
    result = process('count', index_desc, **params)

    latency = f'{result["latency"]:.3f}' if 'latency' in result else ''
    tokenization_info = format_tokenization_info(result)
    count_text = result['error'] if 'error' in result else f'{result["count"]:,}'
    return latency, tokenization_info, count_text

def prob(index_desc, query):
    """Run a 'prob' query and return (latency, tokenization info, probability).

    The third element is the API error message if present, a fixed notice
    when ``prompt_cnt`` is zero, and otherwise the probability followed by
    the ``cont_cnt / prompt_cnt`` counts it was derived from.
    """
    result = process('prob', index_desc, query=query)
    latency = f'{result["latency"]:.3f}' if 'latency' in result else ''
    tokenization_info = format_tokenization_info(result)

    if 'error' in result:
        outcome = result['error']
    elif result['prompt_cnt'] != 0:
        outcome = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
    else:
        outcome = '(n-1)-gram is not found in the corpus'
    return latency, tokenization_info, outcome

def ntd(index_desc, query, max_support):
    """Run an 'ntd' query and return (latency, tokenization info, distribution).

    The distribution maps a ``token (cont_cnt / prompt_cnt)`` label to that
    token's probability. It is replaced by the API error message when the
    result carries one, or by a fixed notice when no continuation is returned.
    """
    result = process('ntd', index_desc, query=query, max_support=max_support)
    latency = f'{result["latency"]:.3f}' if 'latency' in result else ''
    tokenization_info = format_tokenization_info(result)

    if 'error' in result:
        return latency, tokenization_info, result['error']

    distribution = {
        f'{entry["token"]} ({entry["cont_cnt"]} / {result["prompt_cnt"]})': entry['prob']
        for entry in result['result_by_token_id'].values()
    }
    if not distribution:
        distribution = '(n-1)-gram is not found in the corpus'
    return latency, tokenization_info, distribution

def infgram_prob(index_desc, query):
    """Run an 'infgram_prob' query.

    Returns (latency, tokenization info, longest suffix, probability). On an
    API error the suffix is empty and the probability slot holds the error
    message.
    """
    result = process('infgram_prob', index_desc, query=query)
    latency = f'{result["latency"]:.3f}' if 'latency' in result else ''
    tokenization_info = format_tokenization_info(result)

    if 'error' in result:
        return latency, tokenization_info, '', result['error']

    longest_suffix = result['longest_suffix']
    probability = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
    return latency, tokenization_info, longest_suffix, probability

def infgram_ntd(index_desc, query, max_support):
    """Run an 'infgram_ntd' query.

    Returns (latency, tokenization info, longest suffix, distribution), where
    the distribution maps a ``token (cont_cnt / prompt_cnt)`` label to that
    token's probability. On an API error the suffix is empty and the
    distribution slot holds the error message.
    """
    result = process('infgram_ntd', index_desc, query=query, max_support=max_support)
    latency = f'{result["latency"]:.3f}' if 'latency' in result else ''
    tokenization_info = format_tokenization_info(result)

    if 'error' in result:
        return latency, tokenization_info, '', result['error']

    longest_suffix = result['longest_suffix']
    distribution = {
        f'{entry["token"]} ({entry["cont_cnt"]} / {result["prompt_cnt"]})': entry['prob']
        for entry in result['result_by_token_id'].values()
    }
    return latency, tokenization_info, longest_suffix, distribution

def search_docs(index_desc, query, maxnum, max_disp_len, max_clause_freq, max_diff_tokens):
    """Run a 'search_docs' query and flatten the result into one tuple.

    Queries containing ' AND ' or ' OR ' are treated as CNF queries and also
    forward ``max_clause_freq`` and ``max_diff_tokens``.

    Returns:
        ``(latency, tokenization_info, message, *metadatas, *docs)``. Both
        ``metadatas`` and ``docs`` are cut to ``maxnum`` entries and then
        padded with ``''`` / ``[]`` up to ``MAXNUM`` entries each.
    """
    params = {'query': query, 'maxnum': maxnum, 'max_disp_len': max_disp_len}
    if ' AND ' in query or ' OR ' in query:  # CNF query
        params['max_clause_freq'] = max_clause_freq
        params['max_diff_tokens'] = max_diff_tokens
    result = process('search_docs', index_desc, **params)

    latency = f'{result["latency"]:.3f}' if 'latency' in result else ''
    tokenization_info = format_tokenization_info(result)

    if 'error' in result:
        message = result['error']
        metadatas = [''] * MAXNUM
        docs = [[] for _ in range(MAXNUM)]
    else:
        message = result['message']
        documents = result['documents']
        metadatas = [format_doc_metadata(document) for document in documents]
        docs = [document['spans'] for document in documents]

    # Keep at most `maxnum` entries, then pad so the tuple width is fixed.
    metadatas = metadatas[:maxnum]
    docs = docs[:maxnum]
    metadatas.extend('' for _ in range(MAXNUM - len(metadatas)))
    docs.extend([] for _ in range(MAXNUM - len(docs)))
    return (latency, tokenization_info, message, *metadatas, *docs)

def search_docs_new(index_desc, query, max_disp_len, max_clause_freq, max_diff_tokens, state):
    """Locate occurrences of a query and fetch one random matching document.

    Queries containing ' AND ' or ' OR ' are sent as 'find_cnf' queries, all
    others as 'find'. The find result, tagged with a ``'type'`` of ``'cnf'``
    or ``'simple'``, replaces the incoming ``state``.

    Returns:
        ``(latency, tokenization_info, message, idx, metadata, doc, state)``.
        ``idx`` is a ``gr.Number``: disabled and fixed at 0 when the API
        reports an error or nothing is retrievable, otherwise interactive,
        bounded by the number of retrievable occurrences and set to the
        randomly chosen position.
    """
    is_cnf = ' AND ' in query or ' OR ' in query
    if is_cnf:
        find_result = process(
            'find_cnf', index_desc, query=query,
            max_clause_freq=max_clause_freq, max_diff_tokens=max_diff_tokens,
        )
        find_result['type'] = 'cnf'
    else:
        find_result = process('find', index_desc, query=query)
        find_result['type'] = 'simple'

    state = find_result

    latency = f'{find_result["latency"]:.3f}' if 'latency' in find_result else ''
    tokenization_info = format_tokenization_info(find_result)

    def disabled_idx():
        # Index selector used when there is no document to browse.
        return gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)

    if 'error' in find_result:
        return latency, tokenization_info, find_result['error'], disabled_idx(), '', [], state

    if is_cnf:
        cnt_retrievable = sum(len(ptrs) for ptrs in find_result['ptrs_by_shard'])
        if find_result["approx"]:
            message = f'Approximately {find_result["cnt"]} occurrences found, of which {cnt_retrievable} are retrievable'
        else:
            message = f'{find_result["cnt"]} occurrences found'
    else:
        message = f'{find_result["cnt"]} occurrences found'
        cnt_retrievable = find_result['cnt']

    if cnt_retrievable == 0:
        return latency, tokenization_info, message, disabled_idx(), '', [], state

    chosen = random.randint(0, cnt_retrievable - 1)
    metadata, doc = get_another_doc(index_desc, chosen, max_disp_len, state)
    idx = gr.Number(minimum=0, maximum=cnt_retrievable - 1, step=1, value=chosen, interactive=True)
    return latency, tokenization_info, message, idx, metadata, doc, state

def clear_search_docs_new(state):
    """Reset the document browser: a disabled index field and an empty state.

    The incoming ``state`` is discarded; ``None`` is returned in its place.
    """
    disabled_idx = gr.Number(minimum=0, maximum=0, step=1, value=0, interactive=False)
    return disabled_idx, None

def get_another_doc(index_desc, idx, max_disp_len, state):
    """Fetch the ``idx``-th retrievable occurrence of a previous find result.

    Args:
        index_desc: Corpus description forwarded to ``process``.
        idx: Zero-based position across all shards; must be an ``int``.
        max_disp_len: Forwarded to the API as ``max_disp_len``.
        state: Find result stored by ``search_docs_new`` (with a ``'type'`` of
            ``'cnf'`` or anything else for a simple query), or ``None``.

    Returns:
        ``(metadata, doc)``. ``('', [])`` is returned when there is no usable
        state or ``idx`` is not a valid position; ``(error message, [])`` when
        the document request itself reports an error.
    """
    find_result = state
    # A state that holds an API error has no occurrences to browse.
    if find_result is None or 'error' in find_result:
        return '', []
    if not isinstance(idx, int) or isinstance(idx, bool):
        return '', []

    is_cnf = find_result['type'] == 'cnf'
    if is_cnf:
        ptrs_by_shard = find_result['ptrs_by_shard']
        cnt_by_shard = [len(ptrs) for ptrs in ptrs_by_shard]
    else:  # simple query
        segment_by_shard = find_result['segment_by_shard']
        cnt_by_shard = [end - start for (start, end) in segment_by_shard]

    # Validate against what the shards actually hold: for CNF queries the
    # reported 'cnt' can exceed the number of retrievable pointers, which
    # would otherwise run the shard walk off the end of the list.
    if not 0 <= idx < sum(cnt_by_shard):
        return '', []

    # Translate the global position into (shard, offset within shard).
    s = 0
    for s, shard_cnt in enumerate(cnt_by_shard):
        if idx < shard_cnt:
            break
        idx -= shard_cnt

    if is_cnf:
        ptr = ptrs_by_shard[s][idx]
        result = process('get_doc_by_ptr', index_desc, s=s, ptr=ptr, max_disp_len=max_disp_len, query_ids=find_result['token_ids'])
    else:
        rank = segment_by_shard[s][0] + idx
        result = process('get_doc_by_rank', index_desc, s=s, rank=rank, max_disp_len=max_disp_len, query_ids=find_result['token_ids'])

    if 'error' in result:
        return result['error'], []
    return format_doc_metadata(result), result['spans']

with gr.Blocks() as demo:
    with gr.Column():
        gr.HTML(
            '''<h1 text-align="center">Infini-gram: An Efficient Search Engine over the Massive Pretraining Datasets of Language Models</h1>

            <p style='font-size: 16px;'>This engine does exact-match search over several open pretraining datasets of language models. Please first select the corpus and the type of query, then enter your query and submit.</p>
            <p style='font-size: 16px;'>The engine is developed by <a href="https://liujch1998.github.io">Jiacheng Liu</a> and documented in our paper: <a href="https://huggingface.co/papers/2401.17377">Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens</a>. Feel free to check out our <a href="https://infini-gram.io">Project Homepage</a>.</p>
            <p style='font-size: 16px;'><b>API Endpoint:</b> If you'd like to issue batch queries to infini-gram, you may invoke our API endpoint. Please refer to the <a href="https://infini-gram.io/api_doc">API documentation</a>.</p>
            <p style='font-size: 16px;'><b>Note:</b> The query is <b>case-sensitive</b>. Your query will be tokenized with the Llama-2 tokenizer (unless otherwise specified).</p>
            '''
        )
        with gr.Row():
            with gr.Column(scale=1, min_width=240):
                index_desc = gr.Radio(choices=INDEX_DESCS, label='Corpus', value=INDEX_DESCS[0])

            with gr.Column(scale=7):
                with gr.Tab('1. Count an n-gram'):
                    with gr.Column():
                        gr.HTML('<h2>1. Count an n-gram</h2>')
                        with gr.Accordion(label='Click to view instructions', open=False):
                            gr.HTML(f'''<p style="font-size: 16px;">This counts the number of times an n-gram appears in the corpus. If you submit an empty input, it will return the total number of tokens in the corpus. You can also make more complex queries by connecting multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>.</p>
                                        <br>
                                        <p style="font-size: 16px;">Example queries:</p>
                                        <ul style="font-size: 16px;">
                                            <li><b>natural language processing</b> (the output is number of occurrences of "natural language processing")</li>
                                            <li><b>natural language processing AND deep learning</b> (the output is the number of co-occurrences of "natural language processing" and "deep learning")</li>
                                            <li><b>natural language processing OR artificial intelligence AND deep learning OR machine learning</b> (the output is the number of co-occurrences of [one of "natural language processing" / "artificial intelligence"] and [one of "deep learning" / "machine learning"])</li>
                                        </ul>
                                        <br>
                                        <p style="font-size: 16px;">Notes on CNF queries:</p>
                                        <ul style="font-size: 16px;">
                                            <li>A CNF query may contain up to {MAX_CLAUSES_PER_CNF} clauses, and each clause may contain up to {MAX_TERMS_PER_CLAUSE} n-gram terms.</li>
                                            <li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
                                            <li>In AND queries, we can only examine co-occurrences where adjacent clauses are separated by no more than {max_diff_tokens} tokens. This value can be adjusted within range [1, {MAX_DIFF_TOKENS}] in "Advanced options".</li>
                                            <li>In AND queries, if a clause has more than {max_clause_freq} matches, we will estimate the count by examining a random subset of {max_clause_freq} occurrences of clause. This value can be adjusted within range [1, {MAX_CLAUSE_FREQ}] in "Advanced options".</li>
                                            <li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
                                        </ul>
                                    ''')
                        with gr.Row():
                            with gr.Column(scale=1):
                                count_query = gr.Textbox(placeholder='Enter a string (an n-gram) here', label='Query', interactive=True)
                                with gr.Accordion(label='Advanced options', open=False):
                                    with gr.Row():
                                        count_max_clause_freq = gr.Slider(minimum=1, maximum=MAX_CLAUSE_FREQ, value=max_clause_freq, step=1, label='max_clause_freq')
                                        count_max_diff_tokens = gr.Slider(minimum=1, maximum=MAX_DIFF_TOKENS, value=max_diff_tokens, step=1, label='max_diff_tokens')
                                with gr.Row():
                                    count_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
                                    count_submit = gr.Button(value='Submit', variant='primary', visible=True)
                                count_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
                                count_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
                            with gr.Column(scale=1):
                                count_count = gr.Label(label='Count', num_top_classes=0)
                    count_clear.add([count_query, count_latency, count_tokenized, count_count])
                    count_submit.click(count, inputs=[index_desc, count_query, count_max_clause_freq, count_max_diff_tokens], outputs=[count_latency, count_tokenized, count_count], api_name=False)

                with gr.Tab('2. Prob of the last token'):
                    with gr.Column():
                        gr.HTML('<h2>2. Compute the probability of the last token in an n-gram</h2>')
                        with gr.Accordion(label='Click to view instructions', open=False):
                            gr.HTML(f'''<p style="font-size: 16px;">This computes the n-gram probability of the last token conditioned on the previous tokens (i.e. (n-1)-gram)).</p>
                                        <br>
                                        <p style="font-size: 16px;">Example query: <b>natural language processing</b> (the output is P(processing | natural language), by counting the appearance of the 3-gram "natural language processing" and the 2-gram "natural language", and take the division between the two)</p>
                                        <br>
                                        <p style="font-size: 16px;">Notes:</p>
                                        <ul style="font-size: 16px;">
                                            <li>The (n-1)-gram needs to exist in the corpus. If the (n-1)-gram is not found in the corpus, an error message will appear.</li>
                                        </ul>
                                    ''')
                        with gr.Row():
                            with gr.Column(scale=1):
                                prob_query = gr.Textbox(placeholder='Enter a string (an n-gram) here', label='Query', interactive=True)
                                with gr.Row():
                                    prob_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
                                    prob_submit = gr.Button(value='Submit', variant='primary', visible=True)
                                prob_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
                                prob_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
                            with gr.Column(scale=1):
                                prob_probability = gr.Label(label='Probability', num_top_classes=0)
                    prob_clear.add([prob_query, prob_latency, prob_tokenized, prob_probability])
                    prob_submit.click(prob, inputs=[index_desc, prob_query], outputs=[prob_latency, prob_tokenized, prob_probability], api_name=False)

                with gr.Tab('3. Next-token distribution'):
                    with gr.Column():
                        gr.HTML('<h2>3. Compute the next-token distribution of an (n-1)-gram</h2>')
                        with gr.Accordion(label='Click to view instructions', open=False):
                            gr.HTML(f'''<p style="font-size: 16px;">This is an extension of the Query Type 2: It interprets your input as the (n-1)-gram and gives you the full next-token distribution.</p>
                                        <br>
                                        <p style="font-size: 16px;">Example query: <b>natural language</b> (the output is P(* | natural language), for the top-10 tokens *)</p>
                                        <br>
                                        <p style="font-size: 16px;">Notes:</p>
                                        <ul style="font-size: 16px;">
                                            <li>The (n-1)-gram needs to exist in the corpus. If the (n-1)-gram is not found in the corpus, an error message will appear.</li>
                                            <li>If the (n-1)-gram appears more than {max_support} times in the corpus, the result will be approximate: we will estimate the distribution by examining a subset of {max_support} occurrences of the (n-1)-gram. This value can be adjusted within range [1, {MAX_SUPPORT}] in "Advanced options".</li>
                                        </ul>
                                    ''')

                        with gr.Row():
                            with gr.Column(scale=1):
                                ntd_query = gr.Textbox(placeholder='Enter a string (an (n-1)-gram) here', label='Query', interactive=True)
                                with gr.Accordion(label='Advanced options', open=False):
                                    ntd_max_support = gr.Slider(minimum=1, maximum=MAX_SUPPORT, value=MAX_SUPPORT, step=1, label='max_support')
                                with gr.Row():
                                    ntd_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
                                    ntd_submit = gr.Button(value='Submit', variant='primary', visible=True)
                                ntd_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
                                ntd_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
                            with gr.Column(scale=1):
                                ntd_distribution = gr.Label(label='Distribution', num_top_classes=10)
                    ntd_clear.add([ntd_query, ntd_latency, ntd_tokenized, ntd_distribution])
                    ntd_submit.click(ntd, inputs=[index_desc, ntd_query, ntd_max_support], outputs=[ntd_latency, ntd_tokenized, ntd_distribution], api_name=False)

                with gr.Tab('4. ∞-gram prob'):
                    with gr.Column():
                        gr.HTML('<h2>4. Compute the ∞-gram probability of the last token</h2>')
                        with gr.Accordion(label='Click to view instructions', open=False):
                            gr.HTML(f'''<p style="font-size: 16px;">This computes the ∞-gram probability of the last token conditioned on the previous tokens. Compared to Query Type 2 (which uses your entire input for n-gram modeling), here we take the longest suffix that we can find in the corpus.</p>
                                        <br>
                                        <p style="font-size: 16px;">Example query: <b>I love natural language processing</b> (if "natural language" appears in the corpus but "love natural language" doesn't, the output is P(processing | natural language); in this case the effective n = 3)</p>
                                        <br>
                                        <p style="font-size: 16px;">Notes:</p>
                                        <ul style="font-size: 16px;">
                                            <li>It may be possible that the effective n = 1, i.e. longest found suffix is empty, in which case it reduces to the uni-gram probability of the last token.</li>
                                        </ul>
                                    ''')
                        with gr.Row():
                            with gr.Column(scale=1):
                                infgram_prob_query = gr.Textbox(placeholder='Enter a string here', label='Query', interactive=True)
                                with gr.Row():
                                    infgram_prob_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
                                    infgram_prob_submit = gr.Button(value='Submit', variant='primary', visible=True)
                                infgram_prob_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
                                infgram_prob_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
                                infgram_prob_longest_suffix = gr.Textbox(label='Longest Found Suffix', interactive=False)
                            with gr.Column(scale=1):
                                infgram_prob_probability = gr.Label(label='Probability', num_top_classes=0)
                    infgram_prob_clear.add([infgram_prob_query, infgram_prob_latency, infgram_prob_tokenized, infgram_prob_longest_suffix, infgram_prob_probability])
                    infgram_prob_submit.click(infgram_prob, inputs=[index_desc, infgram_prob_query], outputs=[infgram_prob_latency, infgram_prob_tokenized, infgram_prob_longest_suffix, infgram_prob_probability], api_name=False)

                with gr.Tab('5. ∞-gram next-token distribution'):
                    with gr.Column():
                        gr.HTML('<h2>5. Compute the ∞-gram next-token distribution</h2>')
                        with gr.Accordion(label='Click to view instructions', open=False):
                            gr.HTML(f'''<p style="font-size: 16px;">This is similar to Query Type 3, but with ∞-gram instead of n-gram.</p>
                                        <br>
                                        <p style="font-size: 16px;">Example query: <b>I love natural language</b> (if "natural language" appears in the corpus but "love natural language" doesn't, the output is P(* | natural language), for the top-10 tokens *)</p>
                                    ''')
                        with gr.Row():
                            with gr.Column(scale=1):
                                infgram_ntd_query = gr.Textbox(placeholder='Enter a string here', label='Query', interactive=True)
                                with gr.Accordion(label='Advanced options', open=False):
                                    infgram_ntd_max_support = gr.Slider(minimum=1, maximum=MAX_SUPPORT, value=MAX_SUPPORT, step=1, label='max_support')
                                with gr.Row():
                                    infgram_ntd_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
                                    infgram_ntd_submit = gr.Button(value='Submit', variant='primary', visible=True)
                                infgram_ntd_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
                                infgram_ntd_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
                                infgram_ntd_longest_suffix = gr.Textbox(label='Longest Found Suffix', interactive=False)
                            with gr.Column(scale=1):
                                infgram_ntd_distribution = gr.Label(label='Distribution', num_top_classes=10)
                    infgram_ntd_clear.add([infgram_ntd_query, infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution])
                    infgram_ntd_submit.click(infgram_ntd, inputs=[index_desc, infgram_ntd_query, infgram_ntd_max_support], outputs=[infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution], api_name=False)

                # Earlier variant of tab 6: one sub-tab per displayed document.
                # It is still built and wired up, but created with visible=False.
                with gr.Tab('6. Search documents', visible=False):
                    with gr.Column():
                        gr.HTML('''<h2>6. Search for documents containing n-gram(s)</h2>''')
                        with gr.Accordion(label='Click to view instructions', open=False):
                            gr.HTML(f'''<p style="font-size: 16px;">This displays a few random documents in the corpus that satisfies your query. You can simply enter an n-gram, in which case the document displayed would contain your n-gram. You can also connect multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>, in which case the displayed document contains n-grams such that it satisfies this logical constraint.</p>
                                        <br>
                                        <p style="font-size: 16px;">Example queries:</p>
                                        <ul style="font-size: 16px;">
                                            <li><b>natural language processing</b> (the displayed document would contain "natural language processing")</li>
                                            <li><b>natural language processing AND deep learning</b> (the displayed document would contain both "natural language processing" and "deep learning")</li>
                                            <li><b>natural language processing OR artificial intelligence AND deep learning OR machine learning</b> (the displayed document would contain at least one of "natural language processing" / "artificial intelligence", and also at least one of "deep learning" / "machine learning")</li>
                                        </ul>
                                        <br>
                                        <p style="font-size: 16px;">If you want another batch of random documents, simply hit the Submit button again :)</p>
                                        <br>
                                        <p style="font-size: 16px;">Notes on CNF queries:</p>
                                        <ul style="font-size: 16px;">
                                            <li>A CNF query may contain up to {MAX_CLAUSES_PER_CNF} clauses, and each clause may contain up to {MAX_TERMS_PER_CLAUSE} n-gram terms.</li>
                                            <li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
                                            <li>In AND queries, we can only examine co-occurrences where adjacent clauses are separated by no more than {max_diff_tokens} tokens. This value can be adjusted within range [1, {MAX_DIFF_TOKENS}] in "Advanced options".</li>
                                            <li>In AND queries, if a clause has more than {max_clause_freq} matches, we will estimate the count by examining a random subset of {max_clause_freq} documents out of all documents containing that clause. This value can be adjusted within range [1, {MAX_CLAUSE_FREQ}] in "Advanced options".</li>
                                            <li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
                                        </ul>
                                        <br>
                                        <p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>
                                    ''')
                        with gr.Row():
                            # Left column: query input, display controls and read-only diagnostics.
                            with gr.Column(scale=1):
                                search_docs_query = gr.Textbox(
                                    placeholder='Enter a query here',
                                    label='Query',
                                    interactive=True,
                                )
                                search_docs_maxnum = gr.Slider(
                                    minimum=1,
                                    maximum=MAXNUM,
                                    value=maxnum,
                                    step=1,
                                    label='Number of documents to display',
                                )
                                search_docs_max_disp_len = gr.Slider(
                                    minimum=1,
                                    maximum=MAX_DISP_LEN,
                                    value=max_disp_len,
                                    step=1,
                                    label='Number of tokens to display',
                                )
                                with gr.Accordion(label='Advanced options', open=False):
                                    with gr.Row():
                                        search_docs_max_clause_freq = gr.Slider(
                                            minimum=1,
                                            maximum=MAX_CLAUSE_FREQ,
                                            value=max_clause_freq,
                                            step=1,
                                            label='max_clause_freq',
                                        )
                                        search_docs_max_diff_tokens = gr.Slider(
                                            minimum=1,
                                            maximum=MAX_DIFF_TOKENS,
                                            value=max_diff_tokens,
                                            step=1,
                                            label='max_diff_tokens',
                                        )
                                with gr.Row():
                                    search_docs_clear = gr.ClearButton(
                                        value='Clear', variant='secondary', visible=True,
                                    )
                                    search_docs_submit = gr.Button(
                                        value='Submit', variant='primary', visible=True,
                                    )
                                search_docs_latency = gr.Textbox(
                                    label='Latency (milliseconds)', interactive=False, lines=1,
                                )
                                search_docs_tokenized = gr.Textbox(
                                    label='Tokenized', lines=1, interactive=False,
                                )
                            # Right column: status message plus MAXNUM numbered sub-tabs,
                            # each holding a metadata box and a highlighted document view.
                            with gr.Column(scale=2):
                                search_docs_message = gr.Label(label='Message', num_top_classes=0)
                                search_docs_metadatas = []
                                search_docs_outputs = []
                                for i in range(MAXNUM):
                                    with gr.Tab(label=str(i + 1)):
                                        search_docs_metadatas.append(
                                            gr.Textbox(label='Metadata', lines=3, interactive=False)
                                        )
                                        search_docs_outputs.append(
                                            gr.HighlightedText(
                                                label='Document',
                                                show_legend=False,
                                                color_map={
                                                    "-": "red",
                                                    "0": "green",
                                                    "1": "cyan",
                                                    "2": "blue",
                                                    "3": "magenta",
                                                },
                                            )
                                        )
                    # Components reset by the Clear button.
                    search_docs_clear.add([
                        search_docs_query,
                        search_docs_latency,
                        search_docs_tokenized,
                        search_docs_message,
                        *search_docs_metadatas,
                        *search_docs_outputs,
                    ])
                    # Submit runs search_docs; outputs are latency, tokenization, message,
                    # then every metadata box followed by every document view.
                    search_docs_submit.click(
                        search_docs,
                        inputs=[
                            index_desc,
                            search_docs_query,
                            search_docs_maxnum,
                            search_docs_max_disp_len,
                            search_docs_max_clause_freq,
                            search_docs_max_diff_tokens,
                        ],
                        outputs=[
                            search_docs_latency,
                            search_docs_tokenized,
                            search_docs_message,
                            *search_docs_metadatas,
                            *search_docs_outputs,
                        ],
                        api_name=False,
                    )

                # Visible tab 6: a single document viewer with an index slider to
                # move between the documents of the current search.
                with gr.Tab('6. Search documents'):
                    with gr.Column():
                        gr.HTML('''<h2>6. Search for documents containing n-gram(s)</h2>''')
                        with gr.Accordion(label='Click to view instructions', open=False):
                            gr.HTML(f'''<p style="font-size: 16px;">This displays the documents in the corpus that satisfies your query. You can simply enter an n-gram, in which case the document displayed would contain your n-gram. You can also connect multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>, in which case the displayed document contains n-grams such that it satisfies this logical constraint.</p>
                                        <br>
                                        <p style="font-size: 16px;">Example queries:</p>
                                        <ul style="font-size: 16px;">
                                            <li><b>natural language processing</b> (the displayed document would contain "natural language processing")</li>
                                            <li><b>natural language processing AND deep learning</b> (the displayed document would contain both "natural language processing" and "deep learning")</li>
                                            <li><b>natural language processing OR artificial intelligence AND deep learning OR machine learning</b> (the displayed document would contain at least one of "natural language processing" / "artificial intelligence", and also at least one of "deep learning" / "machine learning")</li>
                                        </ul>
                                        <br>
                                        <p style="font-size: 16px;">Notes on CNF queries:</p>
                                        <ul style="font-size: 16px;">
                                            <li>A CNF query may contain up to {MAX_CLAUSES_PER_CNF} clauses, and each clause may contain up to {MAX_TERMS_PER_CLAUSE} n-gram terms.</li>
                                            <li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
                                            <li>In AND queries, we can only examine co-occurrences where adjacent clauses are separated by no more than {max_diff_tokens} tokens. This value can be adjusted within range [1, {MAX_DIFF_TOKENS}] in "Advanced options".</li>
                                            <li>In AND queries, if a clause has more than {max_clause_freq} matches, we will estimate the count by examining a random subset of {max_clause_freq} occurrences of that clause. This value can be adjusted within range [1, {MAX_CLAUSE_FREQ}] in "Advanced options".</li>
                                            <li>The above subsampling mechanism might cause a zero count on co-occurrences of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
                                        </ul>
                                        <br>
                                        <p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>
                                    ''')
                        with gr.Row():
                            # Left column: query input, display controls and read-only diagnostics.
                            with gr.Column(scale=1):
                                search_docs_new_query = gr.Textbox(placeholder='Enter a query here', label='Query', interactive=True)
                                search_docs_new_max_disp_len = gr.Slider(minimum=1, maximum=MAX_DISP_LEN, value=max_disp_len, step=1, label='Number of tokens to display')
                                with gr.Accordion(label='Advanced options', open=False):
                                    with gr.Row():
                                        search_docs_new_max_clause_freq = gr.Slider(minimum=1, maximum=MAX_CLAUSE_FREQ, value=max_clause_freq, step=1, label='max_clause_freq')
                                        search_docs_new_max_diff_tokens = gr.Slider(minimum=1, maximum=MAX_DIFF_TOKENS, value=max_diff_tokens, step=1, label='max_diff_tokens')
                                with gr.Row():
                                    search_docs_new_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
                                    search_docs_new_submit = gr.Button(value='Submit', variant='primary', visible=True)
                                search_docs_new_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
                                search_docs_new_tokenized = gr.Textbox(label='Tokenized', lines=1, interactive=False)
                            # Right column: status message, document index slider, and the
                            # metadata / highlighted text of the selected document.
                            with gr.Column(scale=2):
                                search_docs_new_message = gr.Label(label='Message', num_top_classes=0)
                                # Starts as a fixed, non-interactive 0..0 slider; the submit and
                                # clear handlers both write to it (it is one of their outputs).
                                search_docs_new_idx = gr.Slider(label='', minimum=0, maximum=0, step=1, value=0, interactive=False)
                                search_docs_new_metadata = gr.Textbox(label='Metadata', lines=3, max_lines=3, interactive=False)
                                search_docs_new_output = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
                    # Per-session state threaded through the handlers below: written by
                    # submit and clear, read by the slider handler.
                    # NOTE(review): presumably holds the result of the last search -- confirm
                    # against search_docs_new / get_another_doc.
                    search_docs_state = gr.State(value=None)
                    search_docs_new_clear.add([search_docs_new_query, search_docs_new_latency, search_docs_new_tokenized, search_docs_new_message, search_docs_new_idx, search_docs_new_metadata, search_docs_new_output])
                    # All listeners below pass api_name=False, matching the other
                    # listeners in this app, which is launched with api_open=False and
                    # show_api=False; without it they would be registered as named
                    # API endpoints.
                    search_docs_new_clear.click(
                        clear_search_docs_new,
                        inputs=[search_docs_state],
                        outputs=[search_docs_new_idx, search_docs_state],
                        api_name=False,
                    )
                    search_docs_new_submit.click(
                        search_docs_new,
                        inputs=[index_desc, search_docs_new_query, search_docs_new_max_disp_len,
                               search_docs_new_max_clause_freq, search_docs_new_max_diff_tokens,
                               search_docs_state],
                        outputs=[search_docs_new_latency, search_docs_new_tokenized,
                                search_docs_new_message, search_docs_new_idx,
                                search_docs_new_metadata, search_docs_new_output,
                                search_docs_state],
                        api_name=False,
                    )
                    # Moving the slider asks get_another_doc for the metadata and text of
                    # the selected index, using the stored state.
                    search_docs_new_idx.input(
                        get_another_doc,
                        inputs=[index_desc, search_docs_new_idx, search_docs_new_max_disp_len,
                               search_docs_state],
                        outputs=[search_docs_new_metadata, search_docs_new_output],
                        api_name=False,
                    )

        with gr.Row():
            gr.Markdown('''
If you find this tool useful, please kindly cite our paper:
```bibtex
@article{Liu2024InfiniGram,
  title={Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens},
  author={Liu, Jiacheng and Min, Sewon and Zettlemoyer, Luke and Choi, Yejin and Hajishirzi, Hannaneh},
  journal={arXiv preprint arXiv:2401.17377},
  year={2024}
}
```
''')

# Configure the request queue (concurrency limit, maximum queue size, API
# access closed) before starting the server.
demo.queue(
    default_concurrency_limit=DEFAULT_CONCURRENCY_LIMIT,
    max_size=MAX_SIZE,
    api_open=False,
)

# queue() returns the same Blocks object, so launching it as a separate
# statement is equivalent to chaining .launch() onto the queue() call.
demo.launch(
    max_threads=MAX_THREADS,
    debug=DEBUG,
    show_api=False,
)