liujch1998 committed on
Commit
9282a5a
·
1 Parent(s): 0d3c7d8

Sync changes

Browse files
Files changed (2) hide show
  1. app.py +195 -292
  2. constants.py +19 -12
app.py CHANGED
@@ -1,42 +1,23 @@
1
  import gradio as gr
2
  import datetime
3
  import json
4
- import os
5
  import requests
6
- import time
7
  from constants import *
8
 
9
- API_IPADDR = os.environ.get('API_IPADDR', None)
10
- default_concurrency_limit = os.environ.get('default_concurrency_limit', 10)
11
- max_size = os.environ.get('max_size', 100)
12
- max_threads = os.environ.get('max_threads', 40)
13
- debug = (os.environ.get('debug', 'False') != 'False')
14
-
15
- last_query_time_by_ip = {}
16
-
17
- def process(corpus_desc, query_desc, query, ret_num, request: gr.Request):
18
- global last_query_time_by_ip
19
- ip = request.client.host if request else ''
20
  timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
21
- t = time.time()
22
- last_query_time = 0 if ip == '' else last_query_time_by_ip.get(ip, 0)
23
- blocked = (t - last_query_time < MIN_QUERY_INTERVAL_SECONDS)
24
-
25
  corpus = CORPUS_BY_DESC[corpus_desc]
26
- query_type = QUERY_TYPE_BY_DESC[query_desc]
27
  data = {
28
  'timestamp': timestamp,
29
- 'ip': ip,
30
- 'blocked': blocked,
31
- 'corpus': corpus,
32
  'query_type': query_type,
 
 
33
  'query': query,
34
  }
 
 
35
  print(json.dumps(data))
36
- if blocked:
37
- return tuple([f'You queried too frequently. Please try again in {MIN_QUERY_INTERVAL_SECONDS} seconds.'] + [''] * (ret_num - 1))
38
- if ip != '':
39
- last_query_time_by_ip[ip] = t
40
  if API_IPADDR is None:
41
  raise ValueError(f'API_IPADDR envvar is not set!')
42
  response = requests.post(f'http://{API_IPADDR}:5000/', json=data)
@@ -44,58 +25,35 @@ def process(corpus_desc, query_desc, query, ret_num, request: gr.Request):
44
  result = response.json()
45
  else:
46
  raise ValueError(f'HTTP error {response.status_code}: {response.json()}')
47
- if debug:
48
  print(result)
49
  return result
50
 
51
- def process_1(corpus_desc, query_desc, query, request: gr.Request):
52
- return process(corpus_desc, query_desc, query, 1, request)
53
- def process_2(corpus_desc, query_desc, query, request: gr.Request):
54
- return process(corpus_desc, query_desc, query, 2, request)
55
- def process_3(corpus_desc, query_desc, query, request: gr.Request):
56
- return process(corpus_desc, query_desc, query, 3, request)
57
-
58
- def process_ard_cnf_multi(corpus_desc, query_desc, query, maxnum, request: gr.Request):
59
- global last_query_time_by_ip
60
- ip = request.client.host if request else ''
61
- timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
62
- t = time.time()
63
- last_query_time = 0 if ip == '' else last_query_time_by_ip.get(ip, 0)
64
- blocked = (t - last_query_time < MIN_QUERY_INTERVAL_SECONDS)
65
-
66
- corpus = CORPUS_BY_DESC[corpus_desc]
67
- query_type = QUERY_TYPE_BY_DESC[query_desc]
68
- timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
69
- data = {
70
- 'timestamp': timestamp,
71
- 'ip': ip,
72
- 'blocked': blocked,
73
- 'corpus': corpus,
74
- 'query_type': query_type,
75
- 'query': query,
76
- 'maxnum': maxnum,
77
- }
78
- print(json.dumps(data))
79
- if blocked:
80
- return tuple([f'You queried too frequently. Please try again in {MIN_QUERY_INTERVAL_SECONDS} seconds.'] + [''] * 11)
81
- if ip != '':
82
- last_query_time_by_ip[ip] = t
83
- if API_IPADDR is None:
84
- raise ValueError(f'API_IPADDR envvar is not set!')
85
- response = requests.post(f'http://{API_IPADDR}:5000/', json=data)
86
- if response.status_code == 200:
87
- result = response.json()
88
- else:
89
- raise ValueError(f'HTTP error {response.status_code}: {response.json()}')
90
- if debug:
91
- print(result)
92
- if len(result) != 3:
93
- raise ValueError(f'Invalid result: {result}')
94
- outputs, output_tokens, message = result[0], result[1], result[2]
95
  outputs = outputs[:maxnum]
96
  while len(outputs) < 10:
97
  outputs.append([])
98
- return message, output_tokens, outputs[0], outputs[1], outputs[2], outputs[3], outputs[4], outputs[5], outputs[6], outputs[7], outputs[8], outputs[9]
 
 
 
99
 
100
  with gr.Blocks() as demo:
101
  with gr.Column():
@@ -103,196 +61,181 @@ with gr.Blocks() as demo:
103
  '''<h1 text-align="center">Infini-gram: An Engine for n-gram / ∞-gram Language Models with Trillion-Token Corpora</h1>
104
 
105
  <p style='font-size: 16px;'>This is an engine that processes n-gram / ∞-gram queries on a text corpus. Please first select the corpus and the type of query, then enter your query and submit.</p>
106
- <p style='font-size: 16px;'>The engine is developed by <a href="https://liujch1998.github.io">Jiacheng (Gary) Liu</a> and documented in our paper: <a href="https://arxiv.org/abs/2401.17377">Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens</a></p>
107
- <p style='font-size: 16px;'>HF Paper Page: <a href="https://huggingface.co/papers/2401.17377">https://huggingface.co/papers/2401.17377</a></p>
108
  <p style='font-size: 16px;'><b>Note: We kindly ask you not to programmatically submit queries to the API at the moment. We will release a more stable API soon. Thank you :)</b></p>
109
  '''
110
  )
111
  with gr.Row():
112
  with gr.Column(scale=1):
113
  corpus_desc = gr.Radio(choices=CORPUS_DESCS, label='Corpus', value=CORPUS_DESCS[0])
114
- with gr.Column(scale=3):
115
- query_desc = gr.Radio(
116
- choices=QUERY_DESCS, label='Query Type', value=QUERY_DESCS[0],
117
- )
118
 
119
- with gr.Row(visible=True) as row_1:
120
- with gr.Column():
121
- gr.HTML('<h2>1. Count an n-gram</h2>')
122
- gr.HTML('<p style="font-size: 16px;">This counts the number of times an n-gram appears in the corpus. If you submit an empty input, it will return the total number of tokens in the corpus.</p>')
123
- gr.HTML('<p style="font-size: 16px;">Example query: <b>natural language processing</b> (the output is Cnt(natural language processing))</p>')
124
- with gr.Row():
125
- with gr.Column(scale=1):
126
- count_input = gr.Textbox(placeholder='Enter a string (an n-gram) here', label='Query', interactive=True)
127
  with gr.Row():
128
- count_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
129
- count_submit = gr.Button(value='Submit', variant='primary', visible=True)
130
- count_output_tokens = gr.Textbox(label='Tokenized', lines=2, interactive=False)
131
- with gr.Column(scale=1):
132
- count_output = gr.Label(label='Count', num_top_classes=0)
 
 
 
 
 
 
133
 
134
- with gr.Row(visible=False) as row_2:
135
- with gr.Column():
136
- gr.HTML('<h2>2. Compute the probability of the last token in an n-gram</h2>')
137
- gr.HTML('<p style="font-size: 16px;">This computes the n-gram probability of the last token conditioned on the previous tokens (i.e. (n-1)-gram)).</p>')
138
- gr.HTML('<p style="font-size: 16px;">Example query: <b>natural language processing</b> (the output is P(processing | natural language), by counting the appearance of the 3-gram "natural language processing" and the 2-gram "natural language", and take the division between the two)</p>')
139
- gr.HTML('<p style="font-size: 16px;">Note: The (n-1)-gram needs to exist in the corpus. If the (n-1)-gram is not found in the corpus, an error message will appear.</p>')
140
- with gr.Row():
141
- with gr.Column(scale=1):
142
- ngram_input = gr.Textbox(placeholder='Enter a string (an n-gram) here', label='Query', interactive=True)
143
  with gr.Row():
144
- ngram_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
145
- ngram_submit = gr.Button(value='Submit', variant='primary', visible=True)
146
- ngram_output_tokens = gr.Textbox(label='Tokenized', lines=2, interactive=False)
147
- with gr.Column(scale=1):
148
- ngram_output = gr.Label(label='Probability', num_top_classes=0)
 
 
 
 
 
 
149
 
150
- with gr.Row(visible=False) as row_3:
151
- with gr.Column():
152
- gr.HTML('<h2>3. Compute the next-token distribution of an (n-1)-gram</h2>')
153
- gr.HTML('<p style="font-size: 16px;">This is an extension of the Query 2: It interprets your input as the (n-1)-gram and gives you the full next-token distribution.</p>')
154
- gr.HTML('<p style="font-size: 16px;">Example query: <b>natural language</b> (the output is P(* | natural language), for the top-10 tokens *)</p>')
155
- gr.HTML(f'<p style="font-size: 16px;">Note: The (n-1)-gram needs to exist in the corpus. If the (n-1)-gram is not found in the corpus, an error message will appear. If the (n-1)-gram appears more than {MAX_CNT_FOR_NTD} times in the corpus, the result will be approximate.</p>')
156
- with gr.Row():
157
- with gr.Column(scale=1):
158
- ntd_input = gr.Textbox(placeholder='Enter a string (an (n-1)-gram) here', label='Query', interactive=True)
159
  with gr.Row():
160
- ntd_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
161
- ntd_submit = gr.Button(value='Submit', variant='primary', visible=True)
162
- ntd_output_tokens = gr.Textbox(label='Tokenized', lines=2, interactive=False)
163
- with gr.Column(scale=1):
164
- ntd_output = gr.Label(label='Distribution', num_top_classes=10)
 
 
 
 
 
 
165
 
166
- with gr.Row(visible=False) as row_4:
167
- with gr.Column():
168
- gr.HTML('<h2>4. Compute the ∞-gram probability of the last token</h2>')
169
- gr.HTML('<p style="font-size: 16px;">This computes the ∞-gram probability of the last token conditioned on the previous tokens. Compared to Query 2 (which uses your entire input for n-gram modeling), here we take the longest suffix that we can find in the corpus.</p>')
170
- gr.HTML('<p style="font-size: 16px;">Example query: <b>I love natural language processing</b> (the output is P(processing | natural language), because "natural language" appears in the corpus but "love natural language" doesn\'t; in this case the effective n = 3)</p>')
171
- gr.HTML('<p style="font-size: 16px;">Note: It may be possible that the effective n = 1, in which case it reduces to the uni-gram probability of the last token.</p>')
172
- with gr.Row():
173
- with gr.Column(scale=1):
174
- infgram_input = gr.Textbox(placeholder='Enter a string here', label='Query', interactive=True)
175
  with gr.Row():
176
- infgram_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
177
- infgram_submit = gr.Button(value='Submit', variant='primary', visible=True)
178
- infgram_output_tokens = gr.Textbox(label='Tokenized', lines=2, interactive=False)
179
- infgram_longest_suffix = gr.Textbox(label='Longest Found Suffix', interactive=False)
180
- with gr.Column(scale=1):
181
- infgram_output = gr.Label(label='Probability', num_top_classes=0)
 
 
 
 
 
 
182
 
183
- with gr.Row(visible=False) as row_5:
184
- with gr.Column():
185
- gr.HTML('<h2>5. Compute the ∞-gram next-token distribution</h2>')
186
- gr.HTML('<p style="font-size: 16px;">This is similar to Query 3, but with ∞-gram instead of n-gram.</p>')
187
- gr.HTML('<p style="font-size: 16px;">Example query: <b>I love natural language</b> (the output is P(* | natural language), for the top-10 tokens *)</p>')
188
- with gr.Row():
189
- with gr.Column(scale=1):
190
- infntd_input = gr.Textbox(placeholder='Enter a string here', label='Query', interactive=True)
191
  with gr.Row():
192
- infntd_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
193
- infntd_submit = gr.Button(value='Submit', variant='primary', visible=True)
194
- infntd_output_tokens = gr.Textbox(label='Tokenized', lines=2, interactive=False)
195
- infntd_longest_suffix = gr.Textbox(label='Longest Found Suffix', interactive=False)
196
- with gr.Column(scale=1):
197
- infntd_output = gr.Label(label='Distribution', num_top_classes=10)
 
 
 
 
 
 
198
 
199
- # with gr.Row(visible=False) as row_6:
200
- # with gr.Column():
201
- # gr.HTML(f'''<h2>6. Searching for document containing n-gram(s)</h2>
202
- # <p style="font-size: 16px;">This displays a random document in the corpus that satisfies your query. You can simply enter an n-gram, in which case the document displayed would contain your n-gram. You can also connect multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>, in which case the displayed document contains n-grams such that it satisfies this logical constraint.</p>
203
- # <p style="font-size: 16px;">Example queries:</p>
204
- # <ul style="font-size: 16px;">
205
- # <li><b>natural language processing</b> (the displayed document would contain "natural language processing")</li>
206
- # <li><b>natural language processing AND deep learning</b> (the displayed document would contain both "natural language processing" and "deep learning")</li>
207
- # <li><b>natural language processing OR artificial intelligence AND deep learning OR machine learning</b> (the displayed document would contain at least one of "natural language processing" / "artificial intelligence", and also at least one of "deep learning" / "machine learning")</li>
208
- # </ul>
209
- # <p style="font-size: 16px;">If you want another random document, simply hit the Submit button again :)</p>
210
- # <p style="font-size: 16px;">A few notes:</p>
211
- # <ul style="font-size: 16px;">
212
- # <li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
213
- # <li>If the document is too long, it will be truncated to {MAX_OUTPUT_DOC_TOKENS} tokens.</li>
214
- # <li>We can only include documents where all terms (or clauses) are separated by no more than {MAX_DIFF_TOKENS} tokens.</li>
215
- # <li>If you query for two or more clauses, and a clause has more than {MAX_CLAUSE_FREQ_FAST_APPROX_PER_SHARD} matches (per shard), we will estimate the count from a random subset of all documents containing that clause. This might cause a zero count on conjuction of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
216
- # <li>The number of found documents may contain duplicates (e.g., if a document contains your query term twice, it may be counted twice).</li>
217
- # </ul>
218
- # <p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>
219
- # ''')
220
- # with gr.Row():
221
- # with gr.Column(scale=1):
222
- # ard_cnf_input = gr.Textbox(placeholder='Enter a query here', label='Query', interactive=True)
223
- # with gr.Row():
224
- # ard_cnf_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
225
- # ard_cnf_submit = gr.Button(value='Submit', variant='primary', visible=True)
226
- # ard_cnf_output_tokens = gr.Textbox(label='Tokenized', lines=2, interactive=False)
227
- # with gr.Column(scale=1):
228
- # ard_cnf_output_message = gr.Label(label='Message', num_top_classes=0)
229
- # ard_cnf_output = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
230
-
231
- with gr.Row(visible=False) as row_6a:
232
- with gr.Column():
233
- gr.HTML(f'''<h2>6. Searching for documents containing n-gram(s)</h2>
234
- <p style="font-size: 16px;">This displays a few random documents in the corpus that satisfies your query. You can simply enter an n-gram, in which case the document displayed would contain your n-gram. You can also connect multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>, in which case the displayed document contains n-grams such that it satisfies this logical constraint.</p>
235
- <p style="font-size: 16px;">Example queries:</p>
236
- <ul style="font-size: 16px;">
237
- <li><b>natural language processing</b> (the displayed document would contain "natural language processing")</li>
238
- <li><b>natural language processing AND deep learning</b> (the displayed document would contain both "natural language processing" and "deep learning")</li>
239
- <li><b>natural language processing OR artificial intelligence AND deep learning OR machine learning</b> (the displayed document would contain at least one of "natural language processing" / "artificial intelligence", and also at least one of "deep learning" / "machine learning")</li>
240
- </ul>
241
- <p style="font-size: 16px;">If you want another batch of random documents, simply hit the Submit button again :)</p>
242
- <p style="font-size: 16px;">A few notes:</p>
243
- <ul style="font-size: 16px;">
244
- <li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
245
- <li>If the document is too long, it will be truncated to {MAX_OUTPUT_DOC_TOKENS} tokens.</li>
246
- <li>We can only include documents where all terms (or clauses) are separated by no more than {MAX_DIFF_TOKENS} tokens.</li>
247
- <li>If you query for two or more clauses, and a clause has more than {MAX_CLAUSE_FREQ_FAST_APPROX_PER_SHARD} matches (per shard), we will estimate the count from a random subset of all documents containing that clause. This might cause a zero count on conjuction of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
248
- <li>The number of found documents may contain duplicates (e.g., if a document contains your query term twice, it may be counted twice).</li>
249
- </ul>
250
- <p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>
251
- ''')
252
- with gr.Row():
253
- with gr.Column(scale=1):
254
- ard_cnf_multi_input = gr.Textbox(placeholder='Enter a query here', label='Query', interactive=True)
255
- ard_cnf_multi_maxnum = gr.Slider(minimum=1, maximum=10, value=1, step=1, label='Number of documents to Display')
256
  with gr.Row():
257
- ard_cnf_multi_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
258
- ard_cnf_multi_submit = gr.Button(value='Submit', variant='primary', visible=True)
259
- ard_cnf_multi_output_tokens = gr.Textbox(label='Tokenized', lines=2, interactive=False)
260
- with gr.Column(scale=1):
261
- ard_cnf_multi_output_message = gr.Label(label='Message', num_top_classes=0)
262
- with gr.Tab(label='1'):
263
- ard_cnf_multi_output_0 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
264
- with gr.Tab(label='2'):
265
- ard_cnf_multi_output_1 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
266
- with gr.Tab(label='3'):
267
- ard_cnf_multi_output_2 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
268
- with gr.Tab(label='4'):
269
- ard_cnf_multi_output_3 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
270
- with gr.Tab(label='5'):
271
- ard_cnf_multi_output_4 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
272
- with gr.Tab(label='6'):
273
- ard_cnf_multi_output_5 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
274
- with gr.Tab(label='7'):
275
- ard_cnf_multi_output_6 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
276
- with gr.Tab(label='8'):
277
- ard_cnf_multi_output_7 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
278
- with gr.Tab(label='9'):
279
- ard_cnf_multi_output_8 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
280
- with gr.Tab(label='10'):
281
- ard_cnf_multi_output_9 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
 
 
 
 
 
 
 
282
 
283
- with gr.Row(visible=False) as row_7:
284
- with gr.Column():
285
- gr.HTML('<h2>7. Analyze an (AI-generated) document using ∞-gram</h2>')
286
- gr.HTML('<p style="font-size: 16px;">This analyzes the document you entered using the ∞-gram. Each token is highlighted where (1) the color represents its ∞-gram probability (red is 0.0, blue is 1.0), and (2) the alpha represents the effective n (higher alpha means higher n).</p>')
287
- gr.HTML('<p style="font-size: 16px;">If you hover over a token, the tokens preceding it are each highlighted where (1) the color represents the n-gram probability of your selected token, with the n-gram starting from that highlighted token (red is 0.0, blue is 1.0), and (2) the alpha represents the count of the (n-1)-gram starting from that highlighted token (and up to but excluding your selected token) (higher alpha means higher count).</p>')
288
- with gr.Row():
289
- with gr.Column(scale=1):
290
- doc_analysis_input = gr.Textbox(placeholder='Enter a document here', label='Query', interactive=True, lines=10)
291
  with gr.Row():
292
- doc_analysis_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
293
- doc_analysis_submit = gr.Button(value='Submit', variant='primary', visible=True)
294
- with gr.Column(scale=1):
295
- doc_analysis_output = gr.HTML(value='', label='Analysis')
 
 
 
 
 
296
 
297
  with gr.Row():
298
  gr.Markdown('''
@@ -307,65 +250,25 @@ If you find this tool useful, please kindly cite our paper:
307
  ```
308
  ''')
309
 
310
- count_clear.add([count_input, count_output, count_output_tokens])
311
- ngram_clear.add([ngram_input, ngram_output, ngram_output_tokens])
312
- ntd_clear.add([ntd_input, ntd_output, ntd_output_tokens])
313
- infgram_clear.add([infgram_input, infgram_output, infgram_output_tokens])
314
- infntd_clear.add([infntd_input, infntd_output, infntd_output_tokens, infntd_longest_suffix])
315
- # ard_cnf_clear.add([ard_cnf_input, ard_cnf_output, ard_cnf_output_tokens, ard_cnf_output_message])
316
- ard_cnf_multi_clear.add([ard_cnf_multi_input, ard_cnf_multi_output_tokens, ard_cnf_multi_output_message, ard_cnf_multi_output_0, ard_cnf_multi_output_1, ard_cnf_multi_output_2, ard_cnf_multi_output_3, ard_cnf_multi_output_4, ard_cnf_multi_output_5, ard_cnf_multi_output_6, ard_cnf_multi_output_7, ard_cnf_multi_output_8, ard_cnf_multi_output_9])
317
- doc_analysis_clear.add([doc_analysis_input, doc_analysis_output])
318
-
319
- count_submit.click(process_2, inputs=[corpus_desc, query_desc, count_input], outputs=[count_output, count_output_tokens], api_name=False)
320
- ngram_submit.click(process_2, inputs=[corpus_desc, query_desc, ngram_input], outputs=[ngram_output, ngram_output_tokens], api_name=False)
321
- ntd_submit.click(process_2, inputs=[corpus_desc, query_desc, ntd_input], outputs=[ntd_output, ntd_output_tokens], api_name=False)
322
- infgram_submit.click(process_3, inputs=[corpus_desc, query_desc, infgram_input], outputs=[infgram_output, infgram_output_tokens, infgram_longest_suffix], api_name=False)
323
- infntd_submit.click(process_3, inputs=[corpus_desc, query_desc, infntd_input], outputs=[infntd_output, infntd_output_tokens, infntd_longest_suffix], api_name=False)
324
- # ard_cnf_submit.click(process, inputs=[corpus_desc, query_desc, ard_cnf_input], outputs=[ard_cnf_output, ard_cnf_output_tokens, ard_cnf_output_message], api_name=False)
325
- ard_cnf_multi_submit.click(process_ard_cnf_multi, inputs=[corpus_desc, query_desc, ard_cnf_multi_input, ard_cnf_multi_maxnum], outputs=[ard_cnf_multi_output_message, ard_cnf_multi_output_tokens, ard_cnf_multi_output_0, ard_cnf_multi_output_1, ard_cnf_multi_output_2, ard_cnf_multi_output_3, ard_cnf_multi_output_4, ard_cnf_multi_output_5, ard_cnf_multi_output_6, ard_cnf_multi_output_7, ard_cnf_multi_output_8, ard_cnf_multi_output_9], api_name=False)
326
- doc_analysis_submit.click(process_1, inputs=[corpus_desc, query_desc, doc_analysis_input], outputs=[doc_analysis_output], api_name=False)
327
-
328
- def update_query_desc(selection):
329
- return {
330
- row_1: gr.Row(visible=(selection == QUERY_DESC_BY_TYPE['count'])),
331
- row_2: gr.Row(visible=(selection == QUERY_DESC_BY_TYPE['compute_prob'])),
332
- row_3: gr.Row(visible=(selection == QUERY_DESC_BY_TYPE['get_next_token_distribution_approx'])),
333
- row_4: gr.Row(visible=(selection == QUERY_DESC_BY_TYPE['compute_infgram_prob'])),
334
- row_5: gr.Row(visible=(selection == QUERY_DESC_BY_TYPE['get_infgram_next_token_distribution_approx'])),
335
- # row_6: gr.Row(visible=(selection == QUERY_DESC_BY_TYPE['get_a_random_document_from_cnf_query_fast_approx'])),
336
- row_6a: gr.Row(visible=(selection == QUERY_DESC_BY_TYPE['get_random_documents_from_cnf_query_fast_approx'])),
337
- # row_7: gr.Row(visible=(selection == QUERY_DESC_BY_TYPE['analyze_document'])),
338
- }
339
- query_desc.change(fn=update_query_desc, inputs=query_desc, outputs=[
340
- row_1,
341
- row_2,
342
- row_3,
343
- row_4,
344
- row_5,
345
- # row_6,
346
- row_6a,
347
- # row_7,
348
- ])
349
-
350
  for d in demo.dependencies:
351
  d['api_name'] = False
352
  for d in demo.config['dependencies']:
353
  d['api_name'] = False
354
- if debug:
355
- print(demo.dependencies)
356
- print(demo.config['dependencies'])
357
 
358
  demo.queue(
359
- default_concurrency_limit=default_concurrency_limit,
360
- max_size=max_size,
361
  api_open=False,
362
  ).launch(
363
- max_threads=max_threads,
364
- debug=debug,
365
  show_api=False,
366
  )
367
 
368
- for d in gr.context.Context.root_block.dependencies:
369
- d['api_name'] = False
370
- if debug:
371
- print(gr.context.Context.root_block.dependencies)
 
1
  import gradio as gr
2
  import datetime
3
  import json
 
4
  import requests
 
5
  from constants import *
6
 
7
def process(query_type, corpus_desc, engine_desc, query, maxnum, request: gr.Request):
    """Send one query to the backend API and return its decoded JSON reply.

    Args:
        query_type: Query name, forwarded unchanged under the 'query_type' key.
        corpus_desc: Corpus description selected in the UI; translated through
            CORPUS_BY_DESC before being sent.
        engine_desc: Engine description selected in the UI; translated through
            ENGINE_BY_DESC before being sent.
        query: Query text, forwarded unchanged.
        maxnum: When not None, forwarded under the 'maxnum' key; otherwise the
            key is omitted from the payload.
        request: Gradio request object; accepted for the event-handler
            signature but not read here.

    Returns:
        The decoded JSON body of a 200 response.

    Raises:
        ValueError: If API_IPADDR is None, or if the backend answers with a
            non-200 status.
    """
    # NOTE(review): CORPUS_BY_DESC / ENGINE_BY_DESC / API_IPADDR / DEBUG are
    # presumably provided by `from constants import *` -- confirm.
    timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    corpus = CORPUS_BY_DESC[corpus_desc]
    engine = ENGINE_BY_DESC[engine_desc]
    data = {
        'timestamp': timestamp,
        'query_type': query_type,
        'corpus': corpus,
        'engine': engine,
        'query': query,
    }
    if maxnum is not None:
        data['maxnum'] = maxnum
    # Every outgoing payload is logged to stdout.
    print(json.dumps(data))
    if API_IPADDR is None:
        raise ValueError('API_IPADDR envvar is not set!')
    response = requests.post(f'http://{API_IPADDR}:5000/', json=data)
    if response.status_code != 200:
        # An error reply is not guaranteed to carry a JSON body (e.g. an HTML
        # error page from a proxy). Decoding failures are ValueError
        # subclasses; fall back to the raw text so the HTTP status is still
        # reported instead of a confusing JSON decode error.
        try:
            detail = response.json()
        except ValueError:
            detail = response.text
        raise ValueError(f'HTTP error {response.status_code}: {detail}')
    result = response.json()
    if DEBUG:
        print(result)
    return result
31
 
32
def count(corpus_desc, engine_desc, query, request: gr.Request):
    """Run a 'count' query and return the fields displayed for it.

    Delegates to process() with no maxnum, then returns the tuple
    (latency, tokenized, count). Any key missing from the reply is replaced
    by an empty string.
    """
    # assumes the reply is dict-like (supports .get) -- TODO confirm against the API
    reply = process('count', corpus_desc, engine_desc, query, None, request)
    wanted = ('latency', 'tokenized', 'count')
    return tuple(reply.get(key, '') for key in wanted)
35
def prob(corpus_desc, engine_desc, query, request: gr.Request):
    """Run a 'prob' query and return the fields displayed for it.

    Delegates to process() with no maxnum, then returns the tuple
    (latency, tokenized, probability). Any key missing from the reply is
    replaced by an empty string.
    """
    # assumes the reply is dict-like (supports .get) -- TODO confirm against the API
    reply = process('prob', corpus_desc, engine_desc, query, None, request)
    wanted = ('latency', 'tokenized', 'probability')
    return tuple(reply.get(key, '') for key in wanted)
38
def ntd(corpus_desc, engine_desc, query, request: gr.Request):
    """Run an 'ntd' query and return the fields displayed for it.

    Delegates to process() with no maxnum, then returns the tuple
    (latency, tokenized, distribution). Any key missing from the reply is
    replaced by an empty string.
    """
    # assumes the reply is dict-like (supports .get) -- TODO confirm against the API
    reply = process('ntd', corpus_desc, engine_desc, query, None, request)
    wanted = ('latency', 'tokenized', 'distribution')
    return tuple(reply.get(key, '') for key in wanted)
41
def infgram_prob(corpus_desc, engine_desc, query, request: gr.Request):
    """Run an 'infgram_prob' query and return the fields displayed for it.

    Delegates to process() with no maxnum, then returns the tuple
    (latency, tokenized, longest_suffix, probability). Any key missing from
    the reply is replaced by an empty string.
    """
    # assumes the reply is dict-like (supports .get) -- TODO confirm against the API
    reply = process('infgram_prob', corpus_desc, engine_desc, query, None, request)
    wanted = ('latency', 'tokenized', 'longest_suffix', 'probability')
    return tuple(reply.get(key, '') for key in wanted)
44
def infgram_ntd(corpus_desc, engine_desc, query, request: gr.Request):
    """Run an 'infgram_ntd' query and return the fields displayed for it.

    Delegates to process() with no maxnum, then returns the tuple
    (latency, tokenized, longest_suffix, distribution). Any key missing from
    the reply is replaced by an empty string.
    """
    # assumes the reply is dict-like (supports .get) -- TODO confirm against the API
    reply = process('infgram_ntd', corpus_desc, engine_desc, query, None, request)
    wanted = ('latency', 'tokenized', 'longest_suffix', 'distribution')
    return tuple(reply.get(key, '') for key in wanted)
47
def search_docs(corpus_desc, engine_desc, query, maxnum, request: gr.Request):
    """Run a 'search_docs' query and return a fixed-width tuple of results.

    Delegates to process(), forwarding maxnum. The reply's 'outputs' list
    (empty when absent) is cut to at most maxnum entries and then padded with
    empty lists up to ten entries.

    Returns:
        A 13-tuple: (latency, tokenized, message) followed by the first ten
        document slots. Missing scalar fields default to an empty string.
    """
    # assumes the reply is dict-like and 'outputs' is a list -- TODO confirm against the API
    reply = process('search_docs', corpus_desc, engine_desc, query, maxnum, request)
    docs = reply.get('outputs', [])[:maxnum]
    # Pad to exactly ten slots; no-op when ten or more are already present.
    for _ in range(len(docs), 10):
        docs.append([])
    header = tuple(reply.get(key, '') for key in ('latency', 'tokenized', 'message'))
    return header + tuple(docs[:10])
54
def analyze_document(corpus_desc, engine_desc, query, request: gr.Request):
    """Run an 'analyze_document' query and return the fields displayed for it.

    Delegates to process() with no maxnum, then returns the tuple
    (latency, html). Any key missing from the reply is replaced by an empty
    string.
    """
    # assumes the reply is dict-like (supports .get) -- TODO confirm against the API
    reply = process('analyze_document', corpus_desc, engine_desc, query, None, request)
    wanted = ('latency', 'html')
    return tuple(reply.get(key, '') for key in wanted)
57
 
58
  with gr.Blocks() as demo:
59
  with gr.Column():
 
61
  '''<h1 text-align="center">Infini-gram: An Engine for n-gram / ∞-gram Language Models with Trillion-Token Corpora</h1>
62
 
63
  <p style='font-size: 16px;'>This is an engine that processes n-gram / ∞-gram queries on a text corpus. Please first select the corpus and the type of query, then enter your query and submit.</p>
64
+ <p style='font-size: 16px;'>The engine is developed by <a href="https://liujch1998.github.io">Jiacheng (Gary) Liu</a> and documented in our paper: <a href="https://arxiv.org/abs/2401.17377">Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens</a>. HF Paper Page: <a href="https://huggingface.co/papers/2401.17377">https://huggingface.co/papers/2401.17377</a></p>
 
65
  <p style='font-size: 16px;'><b>Note: We kindly ask you not to programmatically submit queries to the API at the moment. We will release a more stable API soon. Thank you :)</b></p>
66
  '''
67
  )
68
  with gr.Row():
69
  with gr.Column(scale=1):
70
  corpus_desc = gr.Radio(choices=CORPUS_DESCS, label='Corpus', value=CORPUS_DESCS[0])
71
+ engine_desc = gr.Radio(choices=ENGINE_DESCS, label='Engine', value=ENGINE_DESCS[0])
 
 
 
72
 
73
+ with gr.Column(scale=5):
74
+ with gr.Tab('1. Count an n-gram'):
75
+ with gr.Column():
76
+ gr.HTML('<h2>1. Count an n-gram</h2>')
77
+ gr.HTML('<p style="font-size: 16px;">This counts the number of times an n-gram appears in the corpus. If you submit an empty input, it will return the total number of tokens in the corpus.</p>')
78
+ gr.HTML('<p style="font-size: 16px;">Example query: <b>natural language processing</b> (the output is Cnt(natural language processing))</p>')
 
 
79
  with gr.Row():
80
+ with gr.Column(scale=1):
81
+ count_query = gr.Textbox(placeholder='Enter a string (an n-gram) here', label='Query', interactive=True)
82
+ with gr.Row():
83
+ count_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
84
+ count_submit = gr.Button(value='Submit', variant='primary', visible=True)
85
+ count_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
86
+ count_tokenized = gr.Textbox(label='Tokenized', lines=2, interactive=False)
87
+ with gr.Column(scale=1):
88
+ count_count = gr.Label(label='Count', num_top_classes=0)
89
+ count_clear.add([count_query, count_latency, count_tokenized, count_count])
90
+ count_submit.click(count, inputs=[corpus_desc, engine_desc, count_query], outputs=[count_latency, count_tokenized, count_count], api_name=False)
91
 
92
+ with gr.Tab('2. Prob of the last token'):
93
+ with gr.Column():
94
+ gr.HTML('<h2>2. Compute the probability of the last token in an n-gram</h2>')
95
+ gr.HTML('<p style="font-size: 16px;">This computes the n-gram probability of the last token conditioned on the previous tokens (i.e. (n-1)-gram)).</p>')
96
+ gr.HTML('<p style="font-size: 16px;">Example query: <b>natural language processing</b> (the output is P(processing | natural language), by counting the appearance of the 3-gram "natural language processing" and the 2-gram "natural language", and take the division between the two)</p>')
97
+ gr.HTML('<p style="font-size: 16px;">Note: The (n-1)-gram needs to exist in the corpus. If the (n-1)-gram is not found in the corpus, an error message will appear.</p>')
 
 
 
98
  with gr.Row():
99
+ with gr.Column(scale=1):
100
+ prob_query = gr.Textbox(placeholder='Enter a string (an n-gram) here', label='Query', interactive=True)
101
+ with gr.Row():
102
+ prob_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
103
+ prob_submit = gr.Button(value='Submit', variant='primary', visible=True)
104
+ prob_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
105
+ prob_tokenized = gr.Textbox(label='Tokenized', lines=2, interactive=False)
106
+ with gr.Column(scale=1):
107
+ prob_probability = gr.Label(label='Probability', num_top_classes=0)
108
+ prob_clear.add([prob_query, prob_latency, prob_tokenized, prob_probability])
109
+ prob_submit.click(prob, inputs=[corpus_desc, engine_desc, prob_query], outputs=[prob_latency, prob_tokenized, prob_probability], api_name=False)
110
 
111
+ with gr.Tab('3. Next-token distribution'):
112
+ with gr.Column():
113
+ gr.HTML('<h2>3. Compute the next-token distribution of an (n-1)-gram</h2>')
114
+ gr.HTML('<p style="font-size: 16px;">This is an extension of the Query 2: It interprets your input as the (n-1)-gram and gives you the full next-token distribution.</p>')
115
+ gr.HTML('<p style="font-size: 16px;">Example query: <b>natural language</b> (the output is P(* | natural language), for the top-10 tokens *)</p>')
116
+ gr.HTML(f'<p style="font-size: 16px;">Note: The (n-1)-gram needs to exist in the corpus. If the (n-1)-gram is not found in the corpus, an error message will appear. If the (n-1)-gram appears more than {MAX_CNT_FOR_NTD} times in the corpus, the result will be approximate.</p>')
 
 
 
117
  with gr.Row():
118
+ with gr.Column(scale=1):
119
+ ntd_query = gr.Textbox(placeholder='Enter a string (an (n-1)-gram) here', label='Query', interactive=True)
120
+ with gr.Row():
121
+ ntd_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
122
+ ntd_submit = gr.Button(value='Submit', variant='primary', visible=True)
123
+ ntd_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
124
+ ntd_tokenized = gr.Textbox(label='Tokenized', lines=2, interactive=False)
125
+ with gr.Column(scale=1):
126
+ ntd_distribution = gr.Label(label='Distribution', num_top_classes=10)
127
+ ntd_clear.add([ntd_query, ntd_latency, ntd_tokenized, ntd_distribution])
128
+ ntd_submit.click(ntd, inputs=[corpus_desc, engine_desc, ntd_query], outputs=[ntd_latency, ntd_tokenized, ntd_distribution], api_name=False)
129
 
130
+ with gr.Tab('4. ∞-gram prob'):
131
+ with gr.Column():
132
+ gr.HTML('<h2>4. Compute the ∞-gram probability of the last token</h2>')
133
+ gr.HTML('<p style="font-size: 16px;">This computes the ∞-gram probability of the last token conditioned on the previous tokens. Compared to Query 2 (which uses your entire input for n-gram modeling), here we take the longest suffix that we can find in the corpus.</p>')
134
+ gr.HTML('<p style="font-size: 16px;">Example query: <b>I love natural language processing</b> (the output is P(processing | natural language), because "natural language" appears in the corpus but "love natural language" doesn\'t; in this case the effective n = 3)</p>')
135
+ gr.HTML('<p style="font-size: 16px;">Note: It may be possible that the effective n = 1, in which case it reduces to the uni-gram probability of the last token.</p>')
 
 
 
136
  with gr.Row():
137
+ with gr.Column(scale=1):
138
+ infgram_prob_query = gr.Textbox(placeholder='Enter a string here', label='Query', interactive=True)
139
+ with gr.Row():
140
+ infgram_prob_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
141
+ infgram_prob_submit = gr.Button(value='Submit', variant='primary', visible=True)
142
+ infgram_prob_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
143
+ infgram_prob_tokenized = gr.Textbox(label='Tokenized', lines=2, interactive=False)
144
+ infgram_prob_longest_suffix = gr.Textbox(label='Longest Found Suffix', interactive=False)
145
+ with gr.Column(scale=1):
146
+ infgram_prob_probability = gr.Label(label='Probability', num_top_classes=0)
147
+ infgram_prob_clear.add([infgram_prob_query, infgram_prob_latency, infgram_prob_tokenized, infgram_prob_longest_suffix, infgram_prob_probability])
148
+ infgram_prob_submit.click(infgram_prob, inputs=[corpus_desc, engine_desc, infgram_prob_query], outputs=[infgram_prob_latency, infgram_prob_tokenized, infgram_prob_longest_suffix, infgram_prob_probability], api_name=False)
149
 
150
+ with gr.Tab('5. ∞-gram next-token distribution'):
151
+ with gr.Column():
152
+ gr.HTML('<h2>5. Compute the ∞-gram next-token distribution</h2>')
153
+ gr.HTML('<p style="font-size: 16px;">This is similar to Query 3, but with ∞-gram instead of n-gram.</p>')
154
+ gr.HTML('<p style="font-size: 16px;">Example query: <b>I love natural language</b> (the output is P(* | natural language), for the top-10 tokens *)</p>')
 
 
 
155
  with gr.Row():
156
+ with gr.Column(scale=1):
157
+ infgram_ntd_query = gr.Textbox(placeholder='Enter a string here', label='Query', interactive=True)
158
+ with gr.Row():
159
+ infgram_ntd_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
160
+ infgram_ntd_submit = gr.Button(value='Submit', variant='primary', visible=True)
161
+ infgram_ntd_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
162
+ infgram_ntd_tokenized = gr.Textbox(label='Tokenized', lines=2, interactive=False)
163
+ infgram_ntd_longest_suffix = gr.Textbox(label='Longest Found Suffix', interactive=False)
164
+ with gr.Column(scale=1):
165
+ infgram_ntd_distribution = gr.Label(label='Distribution', num_top_classes=10)
166
+ infgram_ntd_clear.add([infgram_ntd_query, infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution])
167
+ infgram_ntd_submit.click(infgram_ntd, inputs=[corpus_desc, engine_desc, infgram_ntd_query], outputs=[infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution], api_name=False)
168
 
169
+ with gr.Tab('6. Search documents'):
170
+ with gr.Column():
171
+ gr.HTML(f'''<h2>6. Search for documents containing n-gram(s)</h2>
172
+ <p style="font-size: 16px;">This displays a few random documents in the corpus that satisfies your query. You can simply enter an n-gram, in which case the document displayed would contain your n-gram. You can also connect multiple n-gram terms with the AND/OR operators, in the <a href="https://en.wikipedia.org/wiki/Conjunctive_normal_form">CNF format</a>, in which case the displayed document contains n-grams such that it satisfies this logical constraint.</p>
173
+ <p style="font-size: 16px;">Example queries:</p>
174
+ <ul style="font-size: 16px;">
175
+ <li><b>natural language processing</b> (the displayed document would contain "natural language processing")</li>
176
+ <li><b>natural language processing AND deep learning</b> (the displayed document would contain both "natural language processing" and "deep learning")</li>
177
+ <li><b>natural language processing OR artificial intelligence AND deep learning OR machine learning</b> (the displayed document would contain at least one of "natural language processing" / "artificial intelligence", and also at least one of "deep learning" / "machine learning")</li>
178
+ </ul>
179
+ <p style="font-size: 16px;">If you want another batch of random documents, simply hit the Submit button again :)</p>
180
+ <p style="font-size: 16px;">A few notes:</p>
181
+ <ul style="font-size: 16px;">
182
+ <li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
183
+ <li>If the document is too long, it will be truncated to {MAX_OUTPUT_DOC_TOKENS} tokens.</li>
184
+ <li>We can only include documents where all terms (or clauses) are separated by no more than {MAX_DIFF_TOKENS} tokens.</li>
185
+ <li>If you query for two or more clauses, and a clause has more than {MAX_CLAUSE_FREQ_FAST_APPROX_PER_SHARD} matches (per shard), we will estimate the count from a random subset of all documents containing that clause. This might cause a zero count on conjuction of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
186
+ <li>The number of found documents may contain duplicates (e.g., if a document contains your query term twice, it may be counted twice).</li>
187
+ </ul>
188
+ <p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>
189
+ ''')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  with gr.Row():
191
+ with gr.Column(scale=2):
192
+ search_docs_query = gr.Textbox(placeholder='Enter a query here', label='Query', interactive=True)
193
+ search_docs_maxnum = gr.Slider(minimum=1, maximum=10, value=1, step=1, label='Number of documents to Display')
194
+ with gr.Row():
195
+ search_docs_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
196
+ search_docs_submit = gr.Button(value='Submit', variant='primary', visible=True)
197
+ search_docs_latency = gr.Textbox(label='Latency (milliseconds)', interactive=False, lines=1)
198
+ search_docs_tokenized = gr.Textbox(label='Tokenized', lines=2, interactive=False)
199
+ with gr.Column(scale=3):
200
+ search_docs_message = gr.Label(label='Message', num_top_classes=0)
201
+ with gr.Tab(label='1'):
202
+ search_docs_output_0 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
203
+ with gr.Tab(label='2'):
204
+ search_docs_output_1 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
205
+ with gr.Tab(label='3'):
206
+ search_docs_output_2 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
207
+ with gr.Tab(label='4'):
208
+ search_docs_output_3 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
209
+ with gr.Tab(label='5'):
210
+ search_docs_output_4 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
211
+ with gr.Tab(label='6'):
212
+ search_docs_output_5 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
213
+ with gr.Tab(label='7'):
214
+ search_docs_output_6 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
215
+ with gr.Tab(label='8'):
216
+ search_docs_output_7 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
217
+ with gr.Tab(label='9'):
218
+ search_docs_output_8 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
219
+ with gr.Tab(label='10'):
220
+ search_docs_output_9 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
221
+ search_docs_clear.add([search_docs_query, search_docs_latency, search_docs_tokenized, search_docs_message, search_docs_output_0, search_docs_output_1, search_docs_output_2, search_docs_output_3, search_docs_output_4, search_docs_output_5, search_docs_output_6, search_docs_output_7, search_docs_output_8, search_docs_output_9])
222
+ search_docs_submit.click(search_docs, inputs=[corpus_desc, engine_desc, search_docs_query, search_docs_maxnum], outputs=[search_docs_latency, search_docs_tokenized, search_docs_message, search_docs_output_0, search_docs_output_1, search_docs_output_2, search_docs_output_3, search_docs_output_4, search_docs_output_5, search_docs_output_6, search_docs_output_7, search_docs_output_8, search_docs_output_9], api_name=False)
223
 
224
+ with gr.Tab('7. Analyze an (AI-generated) document using ∞-gram', visible=False):
225
+ with gr.Column():
226
+ gr.HTML('<h2>7. Analyze an (AI-generated) document using ∞-gram</h2>')
227
+ gr.HTML('<p style="font-size: 16px;">This analyzes the document you entered using the ∞-gram. Each token is highlighted where (1) the color represents its ∞-gram probability (red is 0.0, blue is 1.0), and (2) the alpha represents the effective n (higher alpha means higher n).</p>')
228
+ gr.HTML('<p style="font-size: 16px;">If you hover over a token, the tokens preceding it are each highlighted where (1) the color represents the n-gram probability of your selected token, with the n-gram starting from that highlighted token (red is 0.0, blue is 1.0), and (2) the alpha represents the count of the (n-1)-gram starting from that highlighted token (and up to but excluding your selected token) (higher alpha means higher count).</p>')
 
 
 
229
  with gr.Row():
230
+ with gr.Column(scale=1):
231
+ analyze_document_query = gr.Textbox(placeholder='Enter a document here', label='Query', interactive=True, lines=10)
232
+ with gr.Row():
233
+ analyze_document_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
234
+ analyze_document_submit = gr.Button(value='Submit', variant='primary', visible=True)
235
+ with gr.Column(scale=1):
236
+ analyze_document_html = gr.HTML(value='', label='Analysis')
237
+ analyze_document_clear.add([analyze_document_query, analyze_document_html])
238
+ analyze_document_submit.click(analyze_document, inputs=[corpus_desc, engine_desc, analyze_document_query], outputs=[analyze_document_html], api_name=False)
239
 
240
  with gr.Row():
241
  gr.Markdown('''
 
250
  ```
251
  ''')
252
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  for d in demo.dependencies:
254
  d['api_name'] = False
255
  for d in demo.config['dependencies']:
256
  d['api_name'] = False
257
+ # if DEBUG:
258
+ # print(demo.dependencies)
259
+ # print(demo.config['dependencies'])
260
 
261
  demo.queue(
262
+ default_concurrency_limit=DEFAULT_CONCURRENCY_LIMIT,
263
+ max_size=MAX_SIZE,
264
  api_open=False,
265
  ).launch(
266
+ max_threads=MAX_THREADS,
267
+ debug=DEBUG,
268
  show_api=False,
269
  )
270
 
271
+ # for d in gr.context.Context.root_block.dependencies:
272
+ # d['api_name'] = False
273
+ # if DEBUG:
274
+ # print(gr.context.Context.root_block.dependencies)
constants.py CHANGED
@@ -1,27 +1,24 @@
1
  import os
2
 
 
3
  CORPUS_BY_DESC = {
4
  'RedPajama (LLaMA tokenizer), 1.4T tokens': 'v3_rpj_llama_c4',
5
  'Pile-val (LLaMA tokenizer), 390M tokens': 'v3_pileval_llama',
6
- 'Pile-val (GPT-2 tokenizer), 380M tokens': 'v3_pileval',
7
  }
8
  CORPUS_DESCS = list(CORPUS_BY_DESC.keys())
9
- QUERY_TYPE_BY_DESC = {
10
- '1. Count an n-gram': 'count',
11
- '2. Compute the probability of the last token in an n-gram': 'compute_prob',
12
- '3. Compute the next-token distribution of an (n-1)-gram': 'get_next_token_distribution_approx',
13
- '4. Compute the ∞-gram probability of the last token': 'compute_infgram_prob',
14
- '5. Compute the ∞-gram next-token distribution': 'get_infgram_next_token_distribution_approx',
15
- # '6. Searching for document containing n-gram(s)': 'get_a_random_document_from_cnf_query_fast_approx',
16
- '6. Searching for documents containing n-gram(s)': 'get_random_documents_from_cnf_query_fast_approx',
17
- # '7. Analyze an (AI-generated) document using ∞-gram': 'analyze_document',
18
  }
19
- QUERY_DESC_BY_TYPE = {v: k for k, v in QUERY_TYPE_BY_DESC.items()}
20
- QUERY_DESCS = list(QUERY_TYPE_BY_DESC.keys())
21
 
 
22
  MAX_QUERY_CHARS = int(os.environ.get('MAX_QUERY_CHARS', 1000))
23
  MAX_INPUT_DOC_TOKENS = int(os.environ.get('MAX_INPUT_DOC_TOKENS', 1000))
24
  MAX_OUTPUT_DOC_TOKENS = int(os.environ.get('MAX_OUTPUT_DOC_TOKENS', 5000))
 
25
  MAX_CNT_FOR_NTD = int(os.environ.get('MAX_CNT_FOR_NTD', 1000))
26
  MAX_CLAUSE_FREQ = int(os.environ.get('MAX_CLAUSE_FREQ', 10000))
27
  MAX_CLAUSE_FREQ_FAST = int(os.environ.get('MAX_CLAUSE_FREQ_FAST', 1000000))
@@ -31,4 +28,14 @@ MAX_DIFF_BYTES = 2 * MAX_DIFF_TOKENS
31
  MAX_CLAUSES_IN_CNF = int(os.environ.get('MAX_CLAUSES_IN_CNF', 4))
32
  MAX_TERMS_IN_DISJ_CLAUSE = int(os.environ.get('MAX_TERMS_IN_DISJ_CLAUSE', 4))
33
 
 
 
 
 
 
 
34
  MIN_QUERY_INTERVAL_SECONDS = int(os.environ.get('MIN_QUERY_INTERVAL_SECONDS', 5))
 
 
 
 
 
1
  import os
2
 
3
+ # options
4
  CORPUS_BY_DESC = {
5
  'RedPajama (LLaMA tokenizer), 1.4T tokens': 'v3_rpj_llama_c4',
6
  'Pile-val (LLaMA tokenizer), 390M tokens': 'v3_pileval_llama',
7
+ 'Pile-val (GPT-2 tokenizer), 380M tokens': 'v3_pileval_gpt2',
8
  }
9
  CORPUS_DESCS = list(CORPUS_BY_DESC.keys())
10
+ ENGINE_BY_DESC = {
11
+ 'Python': 'python',
12
+ 'C++ (🚀 Fast, Experimental)': 'c++',
 
 
 
 
 
 
13
  }
14
+ ENGINE_DESCS = list(ENGINE_BY_DESC.keys())
15
+ ENGINES = list(ENGINE_BY_DESC.values())
16
 
17
+ # engine
18
  MAX_QUERY_CHARS = int(os.environ.get('MAX_QUERY_CHARS', 1000))
19
  MAX_INPUT_DOC_TOKENS = int(os.environ.get('MAX_INPUT_DOC_TOKENS', 1000))
20
  MAX_OUTPUT_DOC_TOKENS = int(os.environ.get('MAX_OUTPUT_DOC_TOKENS', 5000))
21
+ MAX_OUTPUT_NUM_DOCS = int(os.environ.get('MAX_OUTPUT_NUM_DOCS', 10))
22
  MAX_CNT_FOR_NTD = int(os.environ.get('MAX_CNT_FOR_NTD', 1000))
23
  MAX_CLAUSE_FREQ = int(os.environ.get('MAX_CLAUSE_FREQ', 10000))
24
  MAX_CLAUSE_FREQ_FAST = int(os.environ.get('MAX_CLAUSE_FREQ_FAST', 1000000))
 
28
  MAX_CLAUSES_IN_CNF = int(os.environ.get('MAX_CLAUSES_IN_CNF', 4))
29
  MAX_TERMS_IN_DISJ_CLAUSE = int(os.environ.get('MAX_TERMS_IN_DISJ_CLAUSE', 4))
30
 
31
+ # HF demo
32
+ API_IPADDR = os.environ.get('API_IPADDR', None)
33
+ DEFAULT_CONCURRENCY_LIMIT = os.environ.get('DEFAULT_CONCURRENCY_LIMIT', 10)
34
+ MAX_SIZE = os.environ.get('MAX_SIZE', 100)
35
+ MAX_THREADS = os.environ.get('MAX_THREADS', 40)
36
+ DEBUG = (os.environ.get('DEBUG', 'False') != 'False')
37
  MIN_QUERY_INTERVAL_SECONDS = int(os.environ.get('MIN_QUERY_INTERVAL_SECONDS', 5))
38
+
39
+ # C++ engine
40
+ CPP_PORT = int(os.environ.get('CPP_PORT', 3786))
41
+ SOCKET_OUT_BUFFER_SIZE = 65536