liujch1998 committed on
Commit
555cd42
·
1 Parent(s): 2195005

Sync changes

Browse files
Files changed (3) hide show
  1. README.md +1 -1
  2. app.py +38 -91
  3. constants.py +10 -18
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 📖
4
  colorFrom: blue
5
  colorTo: green
6
  sdk: gradio
7
- sdk_version: 4.15.0
8
  app_file: app.py
9
  pinned: false
10
  license: cc-by-nc-sa-4.0
 
4
  colorFrom: blue
5
  colorTo: green
6
  sdk: gradio
7
+ sdk_version: 4.36.0
8
  app_file: app.py
9
  pinned: false
10
  license: cc-by-nc-sa-4.0
app.py CHANGED
@@ -4,20 +4,16 @@ import json
4
  import requests
5
  from constants import *
6
 
7
- def process(query_type, corpus_desc, engine_desc, query, maxnum, request: gr.Request):
8
  timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
9
- corpus = CORPUS_BY_DESC[corpus_desc]
10
- engine = ENGINE_BY_DESC[engine_desc]
11
  data = {
12
  'source': 'hf' if not DEBUG else 'hf-dev',
13
  'timestamp': timestamp,
14
  'query_type': query_type,
15
- 'corpus': corpus,
16
- 'engine': engine,
17
- 'query': query,
18
  }
19
- if maxnum is not None:
20
- data['maxnum'] = maxnum
21
  print(json.dumps(data))
22
  if API_URL is None:
23
  raise ValueError(f'API_URL envvar is not set!')
@@ -63,8 +59,8 @@ def format_doc(doc):
63
  formatted += doc['spans']
64
  return formatted
65
 
66
- def count(corpus_desc, engine_desc, query, request: gr.Request):
67
- result = process('count', corpus_desc, engine_desc, query, None, request)
68
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
69
  tokenization_info = format_tokenization_info(result)
70
  if 'error' in result:
@@ -73,8 +69,8 @@ def count(corpus_desc, engine_desc, query, request: gr.Request):
73
  count = f'{result["count"]:,}'
74
  return latency, tokenization_info, count
75
 
76
- def prob(corpus_desc, engine_desc, query, request: gr.Request):
77
- result = process('prob', corpus_desc, engine_desc, query, None, request)
78
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
79
  tokenization_info = format_tokenization_info(result)
80
  if 'error' in result:
@@ -85,8 +81,8 @@ def prob(corpus_desc, engine_desc, query, request: gr.Request):
85
  prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
86
  return latency, tokenization_info, prob
87
 
88
- def ntd(corpus_desc, engine_desc, query, request: gr.Request):
89
- result = process('ntd', corpus_desc, engine_desc, query, None, request)
90
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
91
  tokenization_info = format_tokenization_info(result)
92
  if 'error' in result:
@@ -100,8 +96,8 @@ def ntd(corpus_desc, engine_desc, query, request: gr.Request):
100
  ntd = '(n-1)-gram is not found in the corpus'
101
  return latency, tokenization_info, ntd
102
 
103
- def infgram_prob(corpus_desc, engine_desc, query, request: gr.Request):
104
- result = process('infgram_prob', corpus_desc, engine_desc, query, None, request)
105
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
106
  tokenization_info = format_tokenization_info(result)
107
  if 'error' in result:
@@ -112,8 +108,8 @@ def infgram_prob(corpus_desc, engine_desc, query, request: gr.Request):
112
  prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
113
  return latency, tokenization_info, longest_suffix, prob
114
 
115
- def infgram_ntd(corpus_desc, engine_desc, query, request: gr.Request):
116
- result = process('infgram_ntd', corpus_desc, engine_desc, query, None, request)
117
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
118
  tokenization_info = format_tokenization_info(result)
119
  if 'error' in result:
@@ -127,25 +123,21 @@ def infgram_ntd(corpus_desc, engine_desc, query, request: gr.Request):
127
  ntd[f'{r["token"]} ({r["cont_cnt"]} / {result["prompt_cnt"]})'] = r['prob']
128
  return latency, tokenization_info, longest_suffix, ntd
129
 
130
- def search_docs(corpus_desc, engine_desc, query, maxnum, request: gr.Request):
131
- result = process('search_docs', corpus_desc, engine_desc, query, maxnum, request)
132
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
133
  tokenization_info = format_tokenization_info(result)
134
  if 'error' in result:
135
  message = result['error']
136
- docs = [[] for _ in range(10)]
137
  else:
138
  message = result['message']
139
  docs = result['documents']
140
  docs = [format_doc(doc) for doc in docs]
141
  docs = docs[:maxnum]
142
- while len(docs) < 10:
143
  docs.append([])
144
- return latency, tokenization_info, message, docs[0], docs[1], docs[2], docs[3], docs[4], docs[5], docs[6], docs[7], docs[8], docs[9]
145
-
146
- def analyze_document(corpus_desc, engine_desc, query, request: gr.Request):
147
- result = process('analyze_document', corpus_desc, engine_desc, query, None, request)
148
- return result.get('latency', ''), result.get('html', '')
149
 
150
  with gr.Blocks() as demo:
151
  with gr.Column():
@@ -160,10 +152,9 @@ with gr.Blocks() as demo:
160
  )
161
  with gr.Row():
162
  with gr.Column(scale=1):
163
- corpus_desc = gr.Radio(choices=CORPUS_DESCS, label='Corpus', value=CORPUS_DESCS[0])
164
- engine_desc = gr.Radio(choices=ENGINE_DESCS, label='Engine', value=ENGINE_DESCS[0])
165
 
166
- with gr.Column(scale=5):
167
  with gr.Tab('1. Count an n-gram'):
168
  with gr.Column():
169
  gr.HTML('<h2>1. Count an n-gram</h2>')
@@ -180,7 +171,7 @@ with gr.Blocks() as demo:
180
  with gr.Column(scale=1):
181
  count_count = gr.Label(label='Count', num_top_classes=0)
182
  count_clear.add([count_query, count_latency, count_tokenized, count_count])
183
- count_submit.click(count, inputs=[corpus_desc, engine_desc, count_query], outputs=[count_latency, count_tokenized, count_count], api_name=False)
184
 
185
  with gr.Tab('2. Prob of the last token'):
186
  with gr.Column():
@@ -199,14 +190,14 @@ with gr.Blocks() as demo:
199
  with gr.Column(scale=1):
200
  prob_probability = gr.Label(label='Probability', num_top_classes=0)
201
  prob_clear.add([prob_query, prob_latency, prob_tokenized, prob_probability])
202
- prob_submit.click(prob, inputs=[corpus_desc, engine_desc, prob_query], outputs=[prob_latency, prob_tokenized, prob_probability], api_name=False)
203
 
204
  with gr.Tab('3. Next-token distribution'):
205
  with gr.Column():
206
  gr.HTML('<h2>3. Compute the next-token distribution of an (n-1)-gram</h2>')
207
  gr.HTML('<p style="font-size: 16px;">This is an extension of the Query 2: It interprets your input as the (n-1)-gram and gives you the full next-token distribution.</p>')
208
  gr.HTML('<p style="font-size: 16px;">Example query: <b>natural language</b> (the output is P(* | natural language), for the top-10 tokens *)</p>')
209
- gr.HTML(f'<p style="font-size: 16px;">Note: The (n-1)-gram needs to exist in the corpus. If the (n-1)-gram is not found in the corpus, an error message will appear. If the (n-1)-gram appears more than {MAX_CNT_FOR_NTD} times in the corpus, the result will be approximate.</p>')
210
  with gr.Row():
211
  with gr.Column(scale=1):
212
  ntd_query = gr.Textbox(placeholder='Enter a string (an (n-1)-gram) here', label='Query', interactive=True)
@@ -218,7 +209,7 @@ with gr.Blocks() as demo:
218
  with gr.Column(scale=1):
219
  ntd_distribution = gr.Label(label='Distribution', num_top_classes=10)
220
  ntd_clear.add([ntd_query, ntd_latency, ntd_tokenized, ntd_distribution])
221
- ntd_submit.click(ntd, inputs=[corpus_desc, engine_desc, ntd_query], outputs=[ntd_latency, ntd_tokenized, ntd_distribution], api_name=False)
222
 
223
  with gr.Tab('4. ∞-gram prob'):
224
  with gr.Column():
@@ -238,7 +229,7 @@ with gr.Blocks() as demo:
238
  with gr.Column(scale=1):
239
  infgram_prob_probability = gr.Label(label='Probability', num_top_classes=0)
240
  infgram_prob_clear.add([infgram_prob_query, infgram_prob_latency, infgram_prob_tokenized, infgram_prob_longest_suffix, infgram_prob_probability])
241
- infgram_prob_submit.click(infgram_prob, inputs=[corpus_desc, engine_desc, infgram_prob_query], outputs=[infgram_prob_latency, infgram_prob_tokenized, infgram_prob_longest_suffix, infgram_prob_probability], api_name=False)
242
 
243
  with gr.Tab('5. ∞-gram next-token distribution'):
244
  with gr.Column():
@@ -257,7 +248,7 @@ with gr.Blocks() as demo:
257
  with gr.Column(scale=1):
258
  infgram_ntd_distribution = gr.Label(label='Distribution', num_top_classes=10)
259
  infgram_ntd_clear.add([infgram_ntd_query, infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution])
260
- infgram_ntd_submit.click(infgram_ntd, inputs=[corpus_desc, engine_desc, infgram_ntd_query], outputs=[infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution], api_name=False)
261
 
262
  with gr.Tab('6. Search documents'):
263
  with gr.Column():
@@ -272,18 +263,19 @@ with gr.Blocks() as demo:
272
  <p style="font-size: 16px;">If you want another batch of random documents, simply hit the Submit button again :)</p>
273
  <p style="font-size: 16px;">A few notes:</p>
274
  <ul style="font-size: 16px;">
 
 
 
275
  <li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
276
- <li>If the document is too long, it will be truncated to {MAX_OUTPUT_DOC_TOKENS} tokens.</li>
277
  <li>We can only include documents where all terms (or clauses) are separated by no more than {MAX_DIFF_TOKENS} tokens.</li>
278
- <li>If you query for two or more clauses, and a clause has more than {MAX_CLAUSE_FREQ_PER_SHARD} matches (per shard), we will estimate the count from a random subset of all documents containing that clause. This might cause a zero count on conjuction of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
279
- <li>The number of found documents may contain duplicates (e.g., if a document contains your query term twice, it may be counted twice).</li>
280
  </ul>
281
  <p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>
282
  ''')
283
  with gr.Row():
284
  with gr.Column(scale=2):
285
  search_docs_query = gr.Textbox(placeholder='Enter a query here', label='Query', interactive=True)
286
- search_docs_maxnum = gr.Slider(minimum=1, maximum=10, value=1, step=1, label='Number of documents to Display')
287
  with gr.Row():
288
  search_docs_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
289
  search_docs_submit = gr.Button(value='Submit', variant='primary', visible=True)
@@ -291,44 +283,12 @@ with gr.Blocks() as demo:
291
  search_docs_tokenized = gr.Textbox(label='Tokenized', lines=2, interactive=False)
292
  with gr.Column(scale=3):
293
  search_docs_message = gr.Label(label='Message', num_top_classes=0)
294
- with gr.Tab(label='1'):
295
- search_docs_output_0 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
296
- with gr.Tab(label='2'):
297
- search_docs_output_1 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
298
- with gr.Tab(label='3'):
299
- search_docs_output_2 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
300
- with gr.Tab(label='4'):
301
- search_docs_output_3 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
302
- with gr.Tab(label='5'):
303
- search_docs_output_4 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
304
- with gr.Tab(label='6'):
305
- search_docs_output_5 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
306
- with gr.Tab(label='7'):
307
- search_docs_output_6 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
308
- with gr.Tab(label='8'):
309
- search_docs_output_7 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
310
- with gr.Tab(label='9'):
311
- search_docs_output_8 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
312
- with gr.Tab(label='10'):
313
- search_docs_output_9 = gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"})
314
- search_docs_clear.add([search_docs_query, search_docs_latency, search_docs_tokenized, search_docs_message, search_docs_output_0, search_docs_output_1, search_docs_output_2, search_docs_output_3, search_docs_output_4, search_docs_output_5, search_docs_output_6, search_docs_output_7, search_docs_output_8, search_docs_output_9])
315
- search_docs_submit.click(search_docs, inputs=[corpus_desc, engine_desc, search_docs_query, search_docs_maxnum], outputs=[search_docs_latency, search_docs_tokenized, search_docs_message, search_docs_output_0, search_docs_output_1, search_docs_output_2, search_docs_output_3, search_docs_output_4, search_docs_output_5, search_docs_output_6, search_docs_output_7, search_docs_output_8, search_docs_output_9], api_name=False)
316
-
317
- with gr.Tab('7. Analyze an (AI-generated) document using ∞-gram', visible=False):
318
- with gr.Column():
319
- gr.HTML('<h2>7. Analyze an (AI-generated) document using ∞-gram</h2>')
320
- gr.HTML('<p style="font-size: 16px;">This analyzes the document you entered using the ∞-gram. Each token is highlighted where (1) the color represents its ∞-gram probability (red is 0.0, blue is 1.0), and (2) the alpha represents the effective n (higher alpha means higher n).</p>')
321
- gr.HTML('<p style="font-size: 16px;">If you hover over a token, the tokens preceding it are each highlighted where (1) the color represents the n-gram probability of your selected token, with the n-gram starting from that highlighted token (red is 0.0, blue is 1.0), and (2) the alpha represents the count of the (n-1)-gram starting from that highlighted token (and up to but excluding your selected token) (higher alpha means higher count).</p>')
322
- with gr.Row():
323
- with gr.Column(scale=1):
324
- analyze_document_query = gr.Textbox(placeholder='Enter a document here', label='Query', interactive=True, lines=10)
325
- with gr.Row():
326
- analyze_document_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
327
- analyze_document_submit = gr.Button(value='Submit', variant='primary', visible=True)
328
- with gr.Column(scale=1):
329
- analyze_document_html = gr.HTML(value='', label='Analysis')
330
- analyze_document_clear.add([analyze_document_query, analyze_document_html])
331
- analyze_document_submit.click(analyze_document, inputs=[corpus_desc, engine_desc, analyze_document_query], outputs=[analyze_document_html], api_name=False)
332
 
333
  with gr.Row():
334
  gr.Markdown('''
@@ -343,14 +303,6 @@ If you find this tool useful, please kindly cite our paper:
343
  ```
344
  ''')
345
 
346
- for d in demo.dependencies:
347
- d['api_name'] = False
348
- for d in demo.config['dependencies']:
349
- d['api_name'] = False
350
- # if DEBUG:
351
- # print(demo.dependencies)
352
- # print(demo.config['dependencies'])
353
-
354
  demo.queue(
355
  default_concurrency_limit=DEFAULT_CONCURRENCY_LIMIT,
356
  max_size=MAX_SIZE,
@@ -360,8 +312,3 @@ demo.queue(
360
  debug=DEBUG,
361
  show_api=False,
362
  )
363
-
364
- # for d in gr.context.Context.root_block.dependencies:
365
- # d['api_name'] = False
366
- # if DEBUG:
367
- # print(gr.context.Context.root_block.dependencies)
 
4
  import requests
5
  from constants import *
6
 
7
+ def process(query_type, index_desc, **kwargs):
8
  timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
9
+ index = INDEX_BY_DESC[index_desc]
 
10
  data = {
11
  'source': 'hf' if not DEBUG else 'hf-dev',
12
  'timestamp': timestamp,
13
  'query_type': query_type,
14
+ 'index': index,
 
 
15
  }
16
+ data.update(kwargs)
 
17
  print(json.dumps(data))
18
  if API_URL is None:
19
  raise ValueError(f'API_URL envvar is not set!')
 
59
  formatted += doc['spans']
60
  return formatted
61
 
62
+ def count(index_desc, query):
63
+ result = process('count', index_desc, query=query)
64
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
65
  tokenization_info = format_tokenization_info(result)
66
  if 'error' in result:
 
69
  count = f'{result["count"]:,}'
70
  return latency, tokenization_info, count
71
 
72
+ def prob(index_desc, query):
73
+ result = process('prob', index_desc, query=query)
74
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
75
  tokenization_info = format_tokenization_info(result)
76
  if 'error' in result:
 
81
  prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
82
  return latency, tokenization_info, prob
83
 
84
+ def ntd(index_desc, query):
85
+ result = process('ntd', index_desc, query=query)
86
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
87
  tokenization_info = format_tokenization_info(result)
88
  if 'error' in result:
 
96
  ntd = '(n-1)-gram is not found in the corpus'
97
  return latency, tokenization_info, ntd
98
 
99
+ def infgram_prob(index_desc, query):
100
+ result = process('infgram_prob', index_desc, query=query)
101
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
102
  tokenization_info = format_tokenization_info(result)
103
  if 'error' in result:
 
108
  prob = f'{result["prob"]:.4f} ({result["cont_cnt"]:,} / {result["prompt_cnt"]:,})'
109
  return latency, tokenization_info, longest_suffix, prob
110
 
111
+ def infgram_ntd(index_desc, query):
112
+ result = process('infgram_ntd', index_desc, query=query)
113
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
114
  tokenization_info = format_tokenization_info(result)
115
  if 'error' in result:
 
123
  ntd[f'{r["token"]} ({r["cont_cnt"]} / {result["prompt_cnt"]})'] = r['prob']
124
  return latency, tokenization_info, longest_suffix, ntd
125
 
126
+ def search_docs(index_desc, query, maxnum):
127
+ result = process('search_docs', index_desc, query=query, maxnum=maxnum)
128
  latency = '' if 'latency' not in result else f'{result["latency"]:.3f}'
129
  tokenization_info = format_tokenization_info(result)
130
  if 'error' in result:
131
  message = result['error']
132
+ docs = [[] for _ in range(MAXNUM)]
133
  else:
134
  message = result['message']
135
  docs = result['documents']
136
  docs = [format_doc(doc) for doc in docs]
137
  docs = docs[:maxnum]
138
+ while len(docs) < MAXNUM:
139
  docs.append([])
140
+ return tuple([latency, tokenization_info, message] + docs)
 
 
 
 
141
 
142
  with gr.Blocks() as demo:
143
  with gr.Column():
 
152
  )
153
  with gr.Row():
154
  with gr.Column(scale=1):
155
+ index_desc = gr.Radio(choices=INDEX_DESCS, label='Corpus', value=INDEX_DESCS[0])
 
156
 
157
+ with gr.Column(scale=7):
158
  with gr.Tab('1. Count an n-gram'):
159
  with gr.Column():
160
  gr.HTML('<h2>1. Count an n-gram</h2>')
 
171
  with gr.Column(scale=1):
172
  count_count = gr.Label(label='Count', num_top_classes=0)
173
  count_clear.add([count_query, count_latency, count_tokenized, count_count])
174
+ count_submit.click(count, inputs=[index_desc, count_query], outputs=[count_latency, count_tokenized, count_count], api_name=False)
175
 
176
  with gr.Tab('2. Prob of the last token'):
177
  with gr.Column():
 
190
  with gr.Column(scale=1):
191
  prob_probability = gr.Label(label='Probability', num_top_classes=0)
192
  prob_clear.add([prob_query, prob_latency, prob_tokenized, prob_probability])
193
+ prob_submit.click(prob, inputs=[index_desc, prob_query], outputs=[prob_latency, prob_tokenized, prob_probability], api_name=False)
194
 
195
  with gr.Tab('3. Next-token distribution'):
196
  with gr.Column():
197
  gr.HTML('<h2>3. Compute the next-token distribution of an (n-1)-gram</h2>')
198
  gr.HTML('<p style="font-size: 16px;">This is an extension of the Query 2: It interprets your input as the (n-1)-gram and gives you the full next-token distribution.</p>')
199
  gr.HTML('<p style="font-size: 16px;">Example query: <b>natural language</b> (the output is P(* | natural language), for the top-10 tokens *)</p>')
200
+ gr.HTML(f'<p style="font-size: 16px;">Note: The (n-1)-gram needs to exist in the corpus. If the (n-1)-gram is not found in the corpus, an error message will appear. If the (n-1)-gram appears more than {MAX_SUPPORT} times in the corpus, the result will be approximate.</p>')
201
  with gr.Row():
202
  with gr.Column(scale=1):
203
  ntd_query = gr.Textbox(placeholder='Enter a string (an (n-1)-gram) here', label='Query', interactive=True)
 
209
  with gr.Column(scale=1):
210
  ntd_distribution = gr.Label(label='Distribution', num_top_classes=10)
211
  ntd_clear.add([ntd_query, ntd_latency, ntd_tokenized, ntd_distribution])
212
+ ntd_submit.click(ntd, inputs=[index_desc, ntd_query], outputs=[ntd_latency, ntd_tokenized, ntd_distribution], api_name=False)
213
 
214
  with gr.Tab('4. ∞-gram prob'):
215
  with gr.Column():
 
229
  with gr.Column(scale=1):
230
  infgram_prob_probability = gr.Label(label='Probability', num_top_classes=0)
231
  infgram_prob_clear.add([infgram_prob_query, infgram_prob_latency, infgram_prob_tokenized, infgram_prob_longest_suffix, infgram_prob_probability])
232
+ infgram_prob_submit.click(infgram_prob, inputs=[index_desc, infgram_prob_query], outputs=[infgram_prob_latency, infgram_prob_tokenized, infgram_prob_longest_suffix, infgram_prob_probability], api_name=False)
233
 
234
  with gr.Tab('5. ∞-gram next-token distribution'):
235
  with gr.Column():
 
248
  with gr.Column(scale=1):
249
  infgram_ntd_distribution = gr.Label(label='Distribution', num_top_classes=10)
250
  infgram_ntd_clear.add([infgram_ntd_query, infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution])
251
+ infgram_ntd_submit.click(infgram_ntd, inputs=[index_desc, infgram_ntd_query], outputs=[infgram_ntd_latency, infgram_ntd_tokenized, infgram_ntd_longest_suffix, infgram_ntd_distribution], api_name=False)
252
 
253
  with gr.Tab('6. Search documents'):
254
  with gr.Column():
 
263
  <p style="font-size: 16px;">If you want another batch of random documents, simply hit the Submit button again :)</p>
264
  <p style="font-size: 16px;">A few notes:</p>
265
  <ul style="font-size: 16px;">
266
+ <li>If the document is too long, it will be truncated to {MAX_DISP_LEN} tokens.</li>
267
+ <li>The number of found documents may contain duplicates (e.g., if a document contains your query term twice, it may be counted twice).</li>
268
+ <li>A CNF query may contain up to {MAX_CLAUSES_PER_CNF} clauses, and each clause may contain up to {MAX_TERMS_PER_CLAUSE} terms.</li>
269
  <li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
 
270
  <li>We can only include documents where all terms (or clauses) are separated by no more than {MAX_DIFF_TOKENS} tokens.</li>
271
+ <li>If you query for two or more clauses, and a clause has more than {MAX_CLAUSE_FREQ} matches, we will estimate the count from a random subset of all documents containing that clause. This might cause a zero count on conjuction of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
 
272
  </ul>
273
  <p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>
274
  ''')
275
  with gr.Row():
276
  with gr.Column(scale=2):
277
  search_docs_query = gr.Textbox(placeholder='Enter a query here', label='Query', interactive=True)
278
+ search_docs_maxnum = gr.Slider(minimum=1, maximum=MAXNUM, value=1, step=1, label='Number of documents to display')
279
  with gr.Row():
280
  search_docs_clear = gr.ClearButton(value='Clear', variant='secondary', visible=True)
281
  search_docs_submit = gr.Button(value='Submit', variant='primary', visible=True)
 
283
  search_docs_tokenized = gr.Textbox(label='Tokenized', lines=2, interactive=False)
284
  with gr.Column(scale=3):
285
  search_docs_message = gr.Label(label='Message', num_top_classes=0)
286
+ search_docs_outputs = []
287
+ for i in range(MAXNUM):
288
+ with gr.Tab(label=str(i+1)):
289
+ search_docs_outputs.append(gr.HighlightedText(label='Document', show_legend=False, color_map={"-": "red", "0": "green", "1": "cyan", "2": "blue", "3": "magenta"}))
290
+ search_docs_clear.add([search_docs_query, search_docs_latency, search_docs_tokenized, search_docs_message] + search_docs_outputs)
291
+ search_docs_submit.click(search_docs, inputs=[index_desc, search_docs_query, search_docs_maxnum], outputs=[search_docs_latency, search_docs_tokenized, search_docs_message] + search_docs_outputs, api_name=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
 
293
  with gr.Row():
294
  gr.Markdown('''
 
303
  ```
304
  ''')
305
 
 
 
 
 
 
 
 
 
306
  demo.queue(
307
  default_concurrency_limit=DEFAULT_CONCURRENCY_LIMIT,
308
  max_size=MAX_SIZE,
 
312
  debug=DEBUG,
313
  show_api=False,
314
  )
 
 
 
 
 
constants.py CHANGED
@@ -1,8 +1,8 @@
1
  import os
2
 
3
  # options
4
- CORPUS_BY_DESC = {
5
- 'Dolma (3.1T tokens)': 'v4_dolma-v1_6_llama',
6
  'RedPajama (1.4T tokens)': 'v4_rpj_llama_s4',
7
  'Pile-train (380B tokens)': 'v4_piletrain_llama',
8
  'C4-train (200B tokens)': 'v4_c4train_llama',
@@ -20,25 +20,17 @@ CORPUS_BY_DESC = {
20
  # 'Dolma-v1.6-cc_en_middle (650B tokens)': 'v4_dolma-v1_6-cc_en_middle_llama',
21
  # 'Dolma-v1.6-cc_en_tail (970B tokens)': 'v4_dolma-v1_6-cc_en_tail_llama',
22
  }
23
- CORPUS_DESCS = list(CORPUS_BY_DESC.keys())
24
- ENGINE_BY_DESC = {
25
- 'C++ (🚀🚀 Fast)': 'c++',
26
- 'Python': 'python',
27
- }
28
- ENGINE_DESCS = list(ENGINE_BY_DESC.keys())
29
- ENGINES = list(ENGINE_BY_DESC.values())
30
 
31
- # engine
32
  MAX_QUERY_CHARS = int(os.environ.get('MAX_QUERY_CHARS', 1000))
33
- MAX_INPUT_DOC_TOKENS = int(os.environ.get('MAX_INPUT_DOC_TOKENS', 1000))
34
- MAX_OUTPUT_DOC_TOKENS = int(os.environ.get('MAX_OUTPUT_DOC_TOKENS', 5000))
35
- MAX_OUTPUT_NUM_DOCS = int(os.environ.get('MAX_OUTPUT_NUM_DOCS', 10)) # This number is also hard-coded in app.py
36
- MAX_CNT_FOR_NTD = int(os.environ.get('MAX_CNT_FOR_NTD', 1000))
37
- MAX_CLAUSE_FREQ_PER_SHARD = int(os.environ.get('MAX_CLAUSE_FREQ_FAST_APPROX_PER_SHARD', 50000))
38
  MAX_DIFF_TOKENS = int(os.environ.get('MAX_DIFF_TOKENS', 100))
39
- MAX_DIFF_BYTES = 2 * MAX_DIFF_TOKENS
40
- MAX_CLAUSES_IN_CNF = int(os.environ.get('MAX_CLAUSES_IN_CNF', 4))
41
- MAX_TERMS_IN_DISJ_CLAUSE = int(os.environ.get('MAX_TERMS_IN_DISJ_CLAUSE', 4))
42
 
43
  # HF demo
44
  API_URL = os.environ.get('API_URL', None)
 
1
  import os
2
 
3
  # options
4
+ INDEX_BY_DESC = {
5
+ 'Dolma-v1.6 (3.1T tokens)': 'v4_dolma-v1_6_llama',
6
  'RedPajama (1.4T tokens)': 'v4_rpj_llama_s4',
7
  'Pile-train (380B tokens)': 'v4_piletrain_llama',
8
  'C4-train (200B tokens)': 'v4_c4train_llama',
 
20
  # 'Dolma-v1.6-cc_en_middle (650B tokens)': 'v4_dolma-v1_6-cc_en_middle_llama',
21
  # 'Dolma-v1.6-cc_en_tail (970B tokens)': 'v4_dolma-v1_6-cc_en_tail_llama',
22
  }
23
+ INDEX_DESCS = list(INDEX_BY_DESC.keys())
 
 
 
 
 
 
24
 
25
+ # API limits
26
  MAX_QUERY_CHARS = int(os.environ.get('MAX_QUERY_CHARS', 1000))
27
+ MAX_CLAUSES_PER_CNF = int(os.environ.get('MAX_CLAUSES_PER_CNF', 4))
28
+ MAX_TERMS_PER_CLAUSE = int(os.environ.get('MAX_TERMS_PER_CLAUSE', 4))
29
+ MAX_SUPPORT = int(os.environ.get('MAX_SUPPORT', 1000))
30
+ MAX_CLAUSE_FREQ = int(os.environ.get('MAX_CLAUSE_FREQ', 50000))
 
31
  MAX_DIFF_TOKENS = int(os.environ.get('MAX_DIFF_TOKENS', 100))
32
+ MAXNUM = int(os.environ.get('MAXNUM', 10))
33
+ MAX_DISP_LEN = int(os.environ.get('MAX_DISP_LEN', 5000))
 
34
 
35
  # HF demo
36
  API_URL = os.environ.get('API_URL', None)