Spaces:
Running
Running
liujch1998
committed on
Commit
·
9f036ec
1
Parent(s):
5f2c7e6
Sync changes
Browse files- app.py +1 -0
- constants.py +2 -0
app.py
CHANGED
@@ -109,6 +109,7 @@ with gr.Blocks() as demo:
|
|
109 |
|
110 |
<p style='font-size: 16px;'>This is an engine that processes n-gram / ∞-gram queries on a text corpus. Please first select the corpus and the type of query, then enter your query and submit.</p>
|
111 |
<p style='font-size: 16px;'>The engine is developed by <a href="https://liujch1998.github.io">Jiacheng (Gary) Liu</a> and documented in our paper: <a href="https://arxiv.org/abs/2401.17377">Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens</a>. HF Paper Page: <a href="https://huggingface.co/papers/2401.17377">https://huggingface.co/papers/2401.17377</a></p>
|
|
|
112 |
<p style='font-size: 16px;'><b>Note: We kindly ask you not to programmatically submit queries to the API at the moment. We will release a more stable API soon. Thank you :)</b></p>
|
113 |
'''
|
114 |
)
|
|
|
109 |
|
110 |
<p style='font-size: 16px;'>This is an engine that processes n-gram / ∞-gram queries on a text corpus. Please first select the corpus and the type of query, then enter your query and submit.</p>
|
111 |
<p style='font-size: 16px;'>The engine is developed by <a href="https://liujch1998.github.io">Jiacheng (Gary) Liu</a> and documented in our paper: <a href="https://arxiv.org/abs/2401.17377">Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens</a>. HF Paper Page: <a href="https://huggingface.co/papers/2401.17377">https://huggingface.co/papers/2401.17377</a></p>
|
112 |
+
<p style='font-size: 16px;'>All inputs are <b>case-sensitive</b>.</p>
|
113 |
<p style='font-size: 16px;'><b>Note: We kindly ask you not to programmatically submit queries to the API at the moment. We will release a more stable API soon. Thank you :)</b></p>
|
114 |
'''
|
115 |
)
|
constants.py
CHANGED
@@ -5,6 +5,7 @@ CORPUS_BY_DESC = {
|
|
5 |
'RedPajama (LLaMA tokenizer), 1.4T tokens': 'v3_rpj_llama_c4',
|
6 |
'Pile-val (LLaMA tokenizer), 390M tokens': 'v3_pileval_llama',
|
7 |
'Pile-val (GPT-2 tokenizer), 380M tokens': 'v3_pileval_gpt2',
|
|
|
8 |
}
|
9 |
CORPUS_DESCS = list(CORPUS_BY_DESC.keys())
|
10 |
ENGINE_BY_DESC = {
|
@@ -37,4 +38,5 @@ DEBUG = (os.environ.get('DEBUG', 'False') != 'False')
|
|
37 |
|
38 |
# C++ engine
|
39 |
CPP_PORT = int(os.environ.get('CPP_PORT', 3786))
|
|
|
40 |
SOCKET_OUT_BUFFER_SIZE = 65536
|
|
|
5 |
'RedPajama (LLaMA tokenizer), 1.4T tokens': 'v3_rpj_llama_c4',
|
6 |
'Pile-val (LLaMA tokenizer), 390M tokens': 'v3_pileval_llama',
|
7 |
'Pile-val (GPT-2 tokenizer), 380M tokens': 'v3_pileval_gpt2',
|
8 |
+
'Dolma-sample (OLMo tokenizer), 8.0B tokens': 'v4_dolmasample_olmo',
|
9 |
}
|
10 |
CORPUS_DESCS = list(CORPUS_BY_DESC.keys())
|
11 |
ENGINE_BY_DESC = {
|
|
|
38 |
|
39 |
# C++ engine
|
40 |
CPP_PORT = int(os.environ.get('CPP_PORT', 3786))
|
41 |
+
SOCKET_IN_BUFFER_SIZE = 2048
|
42 |
SOCKET_OUT_BUFFER_SIZE = 65536
|