liujch1998 committed on
Commit
cb08e07
·
1 Parent(s): 7474206

Sync changes

Browse files
Files changed (2) hide show
  1. app.py +5 -4
  2. constants.py +2 -9
app.py CHANGED
@@ -9,6 +9,7 @@ def process(query_type, corpus_desc, engine_desc, query, maxnum, request: gr.Req
9
  corpus = CORPUS_BY_DESC[corpus_desc]
10
  engine = ENGINE_BY_DESC[engine_desc]
11
  data = {
 
12
  'timestamp': timestamp,
13
  'query_type': query_type,
14
  'corpus': corpus,
@@ -18,9 +19,9 @@ def process(query_type, corpus_desc, engine_desc, query, maxnum, request: gr.Req
18
  if maxnum is not None:
19
  data['maxnum'] = maxnum
20
  print(json.dumps(data))
21
- if API_IPADDR is None:
22
- raise ValueError(f'API_IPADDR envvar is not set!')
23
- response = requests.post(f'http://{API_IPADDR}:5000/', json=data)
24
  if response.status_code == 200:
25
  result = response.json()
26
  else:
@@ -230,7 +231,7 @@ with gr.Blocks() as demo:
230
  <li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
231
  <li>If the document is too long, it will be truncated to {MAX_OUTPUT_DOC_TOKENS} tokens.</li>
232
  <li>We can only include documents where all terms (or clauses) are separated by no more than {MAX_DIFF_TOKENS} tokens.</li>
233
- <li>If you query for two or more clauses, and a clause has more than {MAX_CLAUSE_FREQ_FAST_APPROX_PER_SHARD} matches (per shard), we will estimate the count from a random subset of all documents containing that clause. This might cause a zero count on conjuction of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
234
  <li>The number of found documents may contain duplicates (e.g., if a document contains your query term twice, it may be counted twice).</li>
235
  </ul>
236
  <p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>
 
9
  corpus = CORPUS_BY_DESC[corpus_desc]
10
  engine = ENGINE_BY_DESC[engine_desc]
11
  data = {
12
+ 'source': 'hf' if not DEBUG else 'hf-dev',
13
  'timestamp': timestamp,
14
  'query_type': query_type,
15
  'corpus': corpus,
 
19
  if maxnum is not None:
20
  data['maxnum'] = maxnum
21
  print(json.dumps(data))
22
+ if API_URL is None:
23
+ raise ValueError(f'API_URL envvar is not set!')
24
+ response = requests.post(API_URL, json=data)
25
  if response.status_code == 200:
26
  result = response.json()
27
  else:
 
231
  <li>When you write a query in CNF, note that <b>OR has higher precedence than AND</b> (which is contrary to conventions in boolean algebra).</li>
232
  <li>If the document is too long, it will be truncated to {MAX_OUTPUT_DOC_TOKENS} tokens.</li>
233
  <li>We can only include documents where all terms (or clauses) are separated by no more than {MAX_DIFF_TOKENS} tokens.</li>
234
+ <li>If you query for two or more clauses, and a clause has more than {MAX_CLAUSE_FREQ_PER_SHARD} matches (per shard), we will estimate the count from a random subset of all documents containing that clause. This might cause a zero count on conjuction of some simple n-grams (e.g., <b>birds AND oil</b>).</li>
235
  <li>The number of found documents may contain duplicates (e.g., if a document contains your query term twice, it may be counted twice).</li>
236
  </ul>
237
  <p style="font-size: 16px;">❗️WARNING: Corpus may contain problematic contents such as PII, toxicity, hate speech, and NSFW text. This tool is merely presenting selected text from the corpus, without any post-hoc safety filtering. It is NOT creating new text. This is a research prototype through which we can expose and examine existing problems with massive text corpora. Please use with caution. Don't be evil :)</p>
constants.py CHANGED
@@ -22,22 +22,15 @@ MAX_INPUT_DOC_TOKENS = int(os.environ.get('MAX_INPUT_DOC_TOKENS', 1000))
22
  MAX_OUTPUT_DOC_TOKENS = int(os.environ.get('MAX_OUTPUT_DOC_TOKENS', 5000))
23
  MAX_OUTPUT_NUM_DOCS = int(os.environ.get('MAX_OUTPUT_NUM_DOCS', 10)) # This number is also hard-coded in app.py
24
  MAX_CNT_FOR_NTD = int(os.environ.get('MAX_CNT_FOR_NTD', 1000))
25
- MAX_CLAUSE_FREQ = int(os.environ.get('MAX_CLAUSE_FREQ', 10000))
26
- MAX_CLAUSE_FREQ_FAST = int(os.environ.get('MAX_CLAUSE_FREQ_FAST', 1000000))
27
- MAX_CLAUSE_FREQ_FAST_APPROX_PER_SHARD = int(os.environ.get('MAX_CLAUSE_FREQ_FAST_APPROX_PER_SHARD', 50000))
28
  MAX_DIFF_TOKENS = int(os.environ.get('MAX_DIFF_TOKENS', 100))
29
  MAX_DIFF_BYTES = 2 * MAX_DIFF_TOKENS
30
  MAX_CLAUSES_IN_CNF = int(os.environ.get('MAX_CLAUSES_IN_CNF', 4))
31
  MAX_TERMS_IN_DISJ_CLAUSE = int(os.environ.get('MAX_TERMS_IN_DISJ_CLAUSE', 4))
32
 
33
  # HF demo
34
- API_IPADDR = os.environ.get('API_IPADDR', None)
35
  DEFAULT_CONCURRENCY_LIMIT = os.environ.get('DEFAULT_CONCURRENCY_LIMIT', 10)
36
  MAX_SIZE = os.environ.get('MAX_SIZE', 100)
37
  MAX_THREADS = os.environ.get('MAX_THREADS', 40)
38
  DEBUG = (os.environ.get('DEBUG', 'False') != 'False')
39
-
40
- # C++ engine
41
- CPP_PORT = int(os.environ.get('CPP_PORT', 3786))
42
- SOCKET_IN_BUFFER_SIZE = 2048
43
- SOCKET_OUT_BUFFER_SIZE = 65536
 
22
  MAX_OUTPUT_DOC_TOKENS = int(os.environ.get('MAX_OUTPUT_DOC_TOKENS', 5000))
23
  MAX_OUTPUT_NUM_DOCS = int(os.environ.get('MAX_OUTPUT_NUM_DOCS', 10)) # This number is also hard-coded in app.py
24
  MAX_CNT_FOR_NTD = int(os.environ.get('MAX_CNT_FOR_NTD', 1000))
25
+ MAX_CLAUSE_FREQ_PER_SHARD = int(os.environ.get('MAX_CLAUSE_FREQ_FAST_APPROX_PER_SHARD', 50000))
 
 
26
  MAX_DIFF_TOKENS = int(os.environ.get('MAX_DIFF_TOKENS', 100))
27
  MAX_DIFF_BYTES = 2 * MAX_DIFF_TOKENS
28
  MAX_CLAUSES_IN_CNF = int(os.environ.get('MAX_CLAUSES_IN_CNF', 4))
29
  MAX_TERMS_IN_DISJ_CLAUSE = int(os.environ.get('MAX_TERMS_IN_DISJ_CLAUSE', 4))
30
 
31
  # HF demo
32
+ API_URL = os.environ.get('API_URL', None)
33
  DEFAULT_CONCURRENCY_LIMIT = os.environ.get('DEFAULT_CONCURRENCY_LIMIT', 10)
34
  MAX_SIZE = os.environ.get('MAX_SIZE', 100)
35
  MAX_THREADS = os.environ.get('MAX_THREADS', 40)
36
  DEBUG = (os.environ.get('DEBUG', 'False') != 'False')