import ast
import json
import os
import sys

import pytest

from client_test import get_client, run_client_chat, run_client, get_args, run_client_gen
from tests.utils import wrap_test_forked, make_user_path_test, get_llama
from utils import get_githash


@wrap_test_forked
def test_client1():
    os.environ['TEST_LANGCHAIN_IMPORT'] = "1"
    sys.modules.pop('gpt_langchain', None)
    sys.modules.pop('langchain', None)

    from generate import main
    main(base_model='h2oai/h2ogpt-oig-oasst1-512-6_9b', prompt_type='human_bot', chat=False,
         stream_output=False, gradio=True, num_beams=1, block_gradio_exit=False)

    from client_test import test_client_basic
    res_dict, _ = test_client_basic()
    assert res_dict['prompt'] == 'Who are you?'
    assert res_dict['iinput'] == ''
    assert 'I am h2oGPT' in res_dict['response'] or "I'm h2oGPT" in res_dict['response'] or \
           'I’m h2oGPT' in res_dict['response']


@wrap_test_forked
def test_client1api():
    os.environ['TEST_LANGCHAIN_IMPORT'] = "1"
    sys.modules.pop('gpt_langchain', None)
    sys.modules.pop('langchain', None)

    from generate import main
    main(base_model='h2oai/h2ogpt-oig-oasst1-512-6_9b', prompt_type='human_bot', chat=False,
         stream_output=False, gradio=True, num_beams=1, block_gradio_exit=False)

    from client_test import test_client_basic_api
    res_dict, _ = test_client_basic_api()
    assert res_dict['prompt'] == 'Who are you?'
    assert res_dict['iinput'] == ''
    assert 'I am h2oGPT' in res_dict['response'] or "I'm h2oGPT" in res_dict['response'] or \
           'I’m h2oGPT' in res_dict['response']


@pytest.mark.parametrize("admin_pass", ['', 'foodoo1234'])
@wrap_test_forked
def test_client1api_lean(admin_pass):
    from generate import main
    base_model = 'h2oai/h2ogpt-oig-oasst1-512-6_9b'
    os.environ['ADMIN_PASS'] = admin_pass
    inf_port = os.environ['GRADIO_SERVER_PORT'] = "9999"
    main(base_model=base_model, prompt_type='human_bot', chat=False,
         stream_output=False, gradio=True, num_beams=1, block_gradio_exit=False)

    os.environ['HOST'] = "http://127.0.0.1:%s" % inf_port
    client1 = get_client(serialize=True)

    from gradio_utils.grclient import GradioClient
    client2 = GradioClient(os.environ['HOST'])
    client2.refresh_client()  # test refresh

    for client in [client1, client2]:
        api_name = '/submit_nochat_api'  # NOTE: like submit_nochat but stable API for string dict passing
        prompt = 'Who are you?'
        kwargs = dict(instruction_nochat=prompt)
        # pass string of dict.  All entries are optional, but expect at least instruction_nochat to be filled
        res = client.predict(str(dict(kwargs)), api_name=api_name)
        print("Raw client result: %s" % res, flush=True)
        response = ast.literal_eval(res)['response']
        assert 'I am h2oGPT' in response or "I'm h2oGPT" in response or 'I’m h2oGPT' in response

        api_name = '/system_info_dict'
        # pass the admin password as a plain string; returns a JSON dict of system info
        ADMIN_PASS = os.getenv('ADMIN_PASS', admin_pass)
        res = client.predict(ADMIN_PASS, api_name=api_name)
        res = json.loads(res)
        assert isinstance(res, dict)
        assert res['base_model'] == base_model, "Problem with res=%s" % res
        assert 'device' in res
        assert res['hash'] == get_githash()

        api_name = '/system_hash'
        res = client.predict(api_name=api_name)
        assert res == get_githash()

        # call again to confirm hash is stable across requests
        res = client.predict(api_name=api_name)
        assert res == get_githash()

    client2.refresh_client()  # test refresh
    res = client.predict(api_name=api_name)
    assert res == get_githash()

    res = client2.get_server_hash()
    assert res == get_githash()

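# Illustrative sketch (not collected by pytest): the minimal standalone use of the
# stable string-dict API exercised above.  Assumes a server is already running; the
# host value here is hypothetical, and only gradio_client plus the stdlib is needed.
def _example_submit_nochat_api(host='http://127.0.0.1:7860', prompt='Who are you?'):
    from gradio_client import Client
    client = Client(host)
    # all dict entries are optional, but instruction_nochat should be filled
    res = client.predict(str(dict(instruction_nochat=prompt)), api_name='/submit_nochat_api')
    # the server returns a stringified dict; parse it back to get the response text
    return ast.literal_eval(res)['response']
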
@wrap_test_forked
def test_client1api_lean_chat_server():
    from generate import main
    main(base_model='h2oai/h2ogpt-oig-oasst1-512-6_9b', prompt_type='human_bot',
         chat=True, stream_output=True, gradio=True, num_beams=1, block_gradio_exit=False)

    api_name = '/submit_nochat_api'  # NOTE: like submit_nochat but stable API for string dict passing
    prompt = 'Who are you?'
    kwargs = dict(instruction_nochat=prompt)

    client = get_client(serialize=True)
    # pass string of dict.  All entries are optional, but expect at least instruction_nochat to be filled
    res = client.predict(str(dict(kwargs)), api_name=api_name)
    print("Raw client result: %s" % res, flush=True)
    response = ast.literal_eval(res)['response']
    assert 'I am h2oGPT' in response or "I'm h2oGPT" in response or 'I’m h2oGPT' in response


@wrap_test_forked
def test_client_chat_nostream():
    res_dict, client = run_client_chat_with_server(stream_output=False)
    assert 'I am h2oGPT' in res_dict['response'] or "I'm h2oGPT" in res_dict['response'] or \
           'I’m h2oGPT' in res_dict['response']


@wrap_test_forked
def test_client_chat_nostream_gpt4all():
    res_dict, client = run_client_chat_with_server(stream_output=False, base_model='gptj', prompt_type='gptj')
    assert 'I am a computer program designed to assist' in res_dict['response'] or \
           'I am a person who enjoys' in res_dict['response'] or \
           'I am a student at' in res_dict['response'] or \
           'I am a person who' in res_dict['response']


@wrap_test_forked
def test_client_chat_nostream_gpt4all_llama():
    res_dict, client = run_client_chat_with_server(stream_output=False, base_model='gpt4all_llama',
                                                   prompt_type='gptj')
    assert 'What do you want from me?' in res_dict['response'] or \
           'What do you want?' in res_dict['response'] or \
           'What is your name and title?' in res_dict['response'] or \
           'I can assist you with any information' in res_dict['response'] or \
           'I can provide information or assistance' in res_dict['response'] or \
           'am a student' in res_dict['response']

@pytest.mark.need_tokens
@wrap_test_forked
def test_client_chat_nostream_llama7b():
    prompt_type = get_llama()
    res_dict, client = run_client_chat_with_server(stream_output=False, base_model='llama', prompt_type=prompt_type)
    assert "am a virtual assistant" in res_dict['response'] or \
           'am a student' in res_dict['response']


def run_client_chat_with_server(prompt='Who are you?', stream_output=False, max_new_tokens=256,
                                base_model='h2oai/h2ogpt-oig-oasst1-512-6_9b', prompt_type='human_bot',
                                langchain_mode='Disabled', user_path=None,
                                visible_langchain_modes=['UserData', 'MyData'],
                                reverse_docs=True):
    if langchain_mode == 'Disabled':
        os.environ['TEST_LANGCHAIN_IMPORT'] = "1"
        sys.modules.pop('gpt_langchain', None)
        sys.modules.pop('langchain', None)

    from generate import main
    main(base_model=base_model, prompt_type=prompt_type, chat=True,
         stream_output=stream_output, gradio=True, num_beams=1, block_gradio_exit=False,
         max_new_tokens=max_new_tokens,
         langchain_mode=langchain_mode, user_path=user_path,
         visible_langchain_modes=visible_langchain_modes,
         reverse_docs=reverse_docs)

    from client_test import run_client_chat
    res_dict, client = run_client_chat(prompt=prompt, prompt_type=prompt_type, stream_output=stream_output,
                                       max_new_tokens=max_new_tokens, langchain_mode=langchain_mode)
    assert res_dict['prompt'] == prompt
    assert res_dict['iinput'] == ''
    return res_dict, client


@wrap_test_forked
def test_client_chat_stream():
    run_client_chat_with_server(stream_output=True)


def run_client_nochat_with_server(prompt='Who are you?', stream_output=False, max_new_tokens=256,
                                  base_model='h2oai/h2ogpt-oig-oasst1-512-6_9b', prompt_type='human_bot',
                                  langchain_mode='Disabled', user_path=None,
                                  visible_langchain_modes=['UserData', 'MyData'],
                                  reverse_docs=True):
    if langchain_mode == 'Disabled':
        os.environ['TEST_LANGCHAIN_IMPORT'] = "1"
        sys.modules.pop('gpt_langchain', None)
        sys.modules.pop('langchain', None)

    from generate import main
    main(base_model=base_model, prompt_type=prompt_type, chat=True,
         stream_output=stream_output, gradio=True, num_beams=1, block_gradio_exit=False,
         max_new_tokens=max_new_tokens,
         langchain_mode=langchain_mode, user_path=user_path,
         visible_langchain_modes=visible_langchain_modes,
         reverse_docs=reverse_docs)

    from client_test import run_client_nochat_gen
    res_dict, client = run_client_nochat_gen(prompt=prompt, prompt_type=prompt_type,
                                             stream_output=stream_output,
                                             max_new_tokens=max_new_tokens, langchain_mode=langchain_mode)
    assert 'Birds' in res_dict['response'] or \
           'and can learn new things' in res_dict['response'] or \
           'Once upon a time' in res_dict['response']
    return res_dict, client


@wrap_test_forked
def test_client_nochat_stream():
    run_client_nochat_with_server(stream_output=True, prompt="Tell a very long kid's story about birds.")

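# Illustrative sketch (not collected by pytest): consuming streamed partial results with
# gradio_client's Job API instead of the blocking predict() used by these tests.  Assumes
# a server launched with stream_output=True is already running; the host value and the
# polling interval are hypothetical, and whether '/submit_nochat_api' yields partial
# outputs depends on the server configuration.
def _example_stream_nochat(host='http://127.0.0.1:7860', prompt='Who are you?'):
    import time
    from gradio_client import Client
    job = Client(host).submit(str(dict(instruction_nochat=prompt)), api_name='/submit_nochat_api')
    text = ''
    while not job.done():
        outputs = job.outputs()  # stringified dicts produced so far
        if outputs:
            text = ast.literal_eval(outputs[-1])['response']
        time.sleep(0.1)
    # pick up the final output once generation has finished
    if job.outputs():
        text = ast.literal_eval(job.outputs()[-1])['response']
    return text
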
@wrap_test_forked
def test_client_chat_stream_langchain():
    user_path = make_user_path_test()
    prompt = "What is h2oGPT?"
    res_dict, client = run_client_chat_with_server(prompt=prompt, stream_output=True, langchain_mode="UserData",
                                                   user_path=user_path,
                                                   visible_langchain_modes=['UserData', 'MyData'],
                                                   reverse_docs=False,  # for 6_9 dumb model for testing
                                                   )
    # The checks below wouldn't pass without LangChain use of README.md: the raw LLM tends to
    # ramble about H2O.ai and what it does regardless of the question.  A bad answer about
    # h2o.ai just reflects the dumb model (why reverse_docs is flipped above), and the exact
    # wording is not stable across systems.
    assert 'h2oGPT is a large language model' in res_dict['response'] or \
           'H2O.ai is a technology company' in res_dict['response']


@pytest.mark.parametrize("max_new_tokens", [256, 2048])
@pytest.mark.parametrize("top_k_docs", [3, 100])
@wrap_test_forked
def test_client_chat_stream_langchain_steps(max_new_tokens, top_k_docs):
    os.environ['VERBOSE_PIPELINE'] = '1'
    user_path = make_user_path_test()

    stream_output = True
    base_model = 'h2oai/h2ogpt-oig-oasst1-512-6_9b'
    prompt_type = 'human_bot'
    langchain_mode = 'UserData'
    visible_langchain_modes = ['UserData', 'MyData']

    from generate import main
    main(base_model=base_model, prompt_type=prompt_type, chat=True,
         stream_output=stream_output, gradio=True, num_beams=1, block_gradio_exit=False,
         max_new_tokens=max_new_tokens,
         top_k_docs=top_k_docs,
         langchain_mode=langchain_mode, user_path=user_path,
         visible_langchain_modes=visible_langchain_modes,
         reverse_docs=False,  # for 6_9
         )

    from client_test import get_client, get_args, run_client
    client = get_client(serialize=False)

    # QUERY1
    prompt = "What is h2oGPT?"
    langchain_mode = 'UserData'
    kwargs, args = get_args(prompt, prompt_type, chat=True, stream_output=stream_output,
                            max_new_tokens=max_new_tokens, top_k_docs=top_k_docs,
                            langchain_mode=langchain_mode)

    res_dict, client = run_client(client, prompt, args, kwargs)
    assert ('a large language model' in res_dict['response'] or
            'language model trained' in res_dict['response'] or
            'H2O GPT is a language model' in res_dict['response'] or
            'H2O GPT is a chatbot framework' in res_dict['response'] or
            'H2O GPT is a chatbot that can be trained' in res_dict['response'] or
            'A large language model (LLM)' in res_dict['response'] or
            'GPT-based language model' in res_dict['response'] or
            'H2O.ai is a technology company' in res_dict['response']
            ) \
           and ('FAQ.md' in res_dict['response'] or 'README.md' in res_dict['response'])

    # QUERY2
    prompt = "What is Whisper?"
    langchain_mode = 'UserData'
    kwargs, args = get_args(prompt, prompt_type, chat=True, stream_output=stream_output,
                            max_new_tokens=max_new_tokens, top_k_docs=top_k_docs,
                            langchain_mode=langchain_mode)

    res_dict, client = run_client(client, prompt, args, kwargs)
    # wrong answer expected, given wrong docs
    assert ('A secure chatbot that uses a large language' in res_dict['response'] or
            'Whisper is a chatbot' in res_dict['response'] or
            'Whisper is a privacy-focused chatbot platform' in res_dict['response'] or
            'h2oGPT' in res_dict['response'] or
            'A secure, private, and anonymous chat platform' in res_dict['response'] or
            'Whisper is a privacy-preserving' in res_dict['response'] or
            'A chatbot that uses a large language model' in res_dict['response'] or
            'This is a config file for Whisper' in res_dict['response'] or
            'Whisper is a secure messaging app' in res_dict['response'] or
            'secure, private, and anonymous chatbot' in res_dict['response'] or
            'Whisper is a secure, anonymous, and encrypted' in res_dict['response']
            ) \
           and ('FAQ.md' in res_dict['response'] or 'README.md' in res_dict['response'])

    # QUERY3
    prompt = "What is h2oGPT?"
    langchain_mode = 'ChatLLM'
    kwargs, args = get_args(prompt, prompt_type, chat=True, stream_output=stream_output,
                            max_new_tokens=max_new_tokens, top_k_docs=top_k_docs,
                            langchain_mode=langchain_mode)

    res_dict, client = run_client(client, prompt, args, kwargs)
    # i.e. the dumb model answers wrongly without data, and no docs appear at all since docs are cut off entirely
    assert 'H2O.ai is a technology company' in res_dict['response'] and '.md' not in res_dict['response']

    # QUERY4
    prompt = "What is whisper?"
    langchain_mode = 'UserData'
    kwargs, args = get_args(prompt, prompt_type, chat=True, stream_output=stream_output,
                            max_new_tokens=max_new_tokens, top_k_docs=top_k_docs,
                            langchain_mode=langchain_mode)

    res_dict, client = run_client(client, prompt, args, kwargs)
    # odd answer since no whisper docs, but still shows some docs at very low score
    assert ('h2oGPT' in res_dict['response'] or
            'A chatbot that can whisper to you' in res_dict['response'] or
            'whisper is a simple' in res_dict['response'] or
            'Whisper is a tool for generating text from a model' in res_dict['response'] or
            'Whisper is a chatbot platform' in res_dict['response'] or
            'whisper is a chatbot framework' in res_dict['response'] or
            'whisper is a tool for training language models' in res_dict['response'] or
            'whisper is a secure messaging app' in res_dict['response'] or
            'LLaMa-based models are not commercially viable' in res_dict['response'] or
            'A text-based chatbot that' in res_dict['response'] or
            'A secure, private, and anonymous chat service' in res_dict['response'] or
            'LLaMa is a language' in res_dict['response'] or
            'chatbot that can' in res_dict['response'] or
            'A secure, private, and anonymous chatbot' in res_dict['response'] or
            'A secure, encrypted chat service that allows' in res_dict['response']
            ) \
           and '.md' in res_dict['response']


@pytest.mark.need_tokens
@pytest.mark.parametrize("max_new_tokens", [256, 2048])
@pytest.mark.parametrize("top_k_docs", [3, 100])
@wrap_test_forked
def test_client_chat_stream_langchain_steps2(max_new_tokens, top_k_docs):
    os.environ['VERBOSE_PIPELINE'] = '1'
    # full user data
    from make_db import make_db_main
    make_db_main(download_some=True)
    user_path = None  # shouldn't be necessary, db already made

    stream_output = True
    max_new_tokens = 256  # fixed here regardless of the parametrized value
    base_model = 'h2oai/h2ogpt-oig-oasst1-512-6_9b'
    prompt_type = 'human_bot'
    langchain_mode = 'UserData'
    visible_langchain_modes = ['UserData', 'MyData', 'github h2oGPT']

    from generate import main
    main(base_model=base_model, prompt_type=prompt_type, chat=True,
         stream_output=stream_output, gradio=True, num_beams=1, block_gradio_exit=False,
         max_new_tokens=max_new_tokens,
         langchain_mode=langchain_mode, user_path=user_path,
         visible_langchain_modes=visible_langchain_modes,
         verbose=True)

    from client_test import get_client, get_args, run_client
    client = get_client(serialize=False)

    # QUERY1
    prompt = "Who are you?"
    langchain_mode = 'ChatLLM'
    kwargs, args = get_args(prompt, prompt_type, chat=True, stream_output=stream_output,
                            max_new_tokens=max_new_tokens, langchain_mode=langchain_mode)

    res_dict, client = run_client(client, prompt, args, kwargs)
    assert 'a large language model' in res_dict['response'] and 'FAQ.md' not in res_dict['response']

    # QUERY2
    prompt = "What is whisper?"
    langchain_mode = 'UserData'
    kwargs, args = get_args(prompt, prompt_type, chat=True, stream_output=stream_output,
                            max_new_tokens=max_new_tokens, langchain_mode=langchain_mode)

    res_dict, client = run_client(client, prompt, args, kwargs)
    assert 'large-scale speech recognition model' in res_dict['response'] and 'whisper.pdf' in res_dict['response']

    # QUERY3
    prompt = "What is h2oGPT"
    langchain_mode = 'github h2oGPT'
    kwargs, args = get_args(prompt, prompt_type, chat=True, stream_output=stream_output,
                            max_new_tokens=max_new_tokens, langchain_mode=langchain_mode)

    res_dict, client = run_client(client, prompt, args, kwargs)
    assert ('h2oGPT is an open-source, fully permissive, commercially usable, and fully trained language model'
            in res_dict['response'] or
            'A new open-source language model that is fully permissive' in res_dict['response'] or
            'h2oGPT is an open-source language model' in res_dict['response'] or
            'h2oGPT is an open-source, fully permissive, commercially usable' in res_dict['response']
            ) and \
           'README.md' in res_dict['response']


@wrap_test_forked
def test_client_chat_stream_long():
    prompt = 'Tell a very long story about cute birds for kids.'
    res_dict, client = run_client_chat_with_server(prompt=prompt, stream_output=True, max_new_tokens=1024)
    assert 'Once upon a time' in res_dict['response']


@pytest.mark.skip(reason="Local file required")
@wrap_test_forked
def test_client_long():
    os.environ['TEST_LANGCHAIN_IMPORT'] = "1"
    sys.modules.pop('gpt_langchain', None)
    sys.modules.pop('langchain', None)

    from generate import main
    main(base_model='mosaicml/mpt-7b-storywriter', prompt_type='plain', chat=False,
         stream_output=False, gradio=True, num_beams=1, block_gradio_exit=False)

    with open("/home/jon/Downloads/Gatsby_PDF_FullText.txt") as f:
        prompt = f.readlines()

    from client_test import run_client_nochat
    res_dict, _ = run_client_nochat(prompt=prompt, prompt_type='plain', max_new_tokens=86000)
    print(res_dict['response'])


@wrap_test_forked
def test_fast_up():
    from generate import main
    main(gradio=True, block_gradio_exit=False)


@pytest.mark.skipif(not os.getenv('STRESS'), reason="Only for stress testing already-running server")
@pytest.mark.parametrize("repeat", list(range(0, 100)))
@wrap_test_forked
def test_client_stress(repeat):
    # pip install pytest-repeat  # license issues, don't put with requirements
    # pip install pytest-timeout  # license issues, don't put with requirements
    #
    # CUDA_VISIBLE_DEVICES=0 SCORE_MODEL=None python generate.py --base_model=h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v2 --langchain_mode=UserData --user_path=user_path --debug=True --concurrency_count=8
    #
    # timeout mimics a client disconnecting while generation continues; otherwise the run is too clean and doesn't fail
    # STRESS=1 pytest -s -v -n 8 --timeout=30 tests/test_client_calls.py::test_client_stress 2> stress1.log
    # HOST=http://192.168.1.46:9999 STRESS=1 pytest -s -v -n 8 --timeout=1000 tests/test_client_calls.py::test_client_stress 2> stress1.log

    prompt = "Tell a very long kid's story about birds."
    # prompt = "Say exactly only one word."
    client = get_client(serialize=True)

    kwargs = dict(
        instruction='',
        max_new_tokens=200,
        min_new_tokens=1,
        max_time=300,
        do_sample=False,
        instruction_nochat=prompt,
    )

    api_name = '/submit_nochat_api'  # NOTE: like submit_nochat but stable API for string dict passing
    res = client.predict(
        str(dict(kwargs)),
        api_name=api_name,
    )
    print("Raw client result: %s" % res, flush=True)
    assert isinstance(res, str)
    res_dict = ast.literal_eval(res)
    assert 'response' in res_dict and res_dict['response']


@pytest.mark.skipif(not os.getenv('STRESS'), reason="Only for stress testing already-running server")
@pytest.mark.parametrize("repeat", list(range(0, 100)))
@wrap_test_forked
def test_client_stress_stream(repeat):
    prompt = "Tell a very long kid's story about birds."
    max_new_tokens = 200
    prompt_type = None
    langchain_mode = 'Disabled'
    stream_output = True
    chat = False

    client = get_client(serialize=True)
    kwargs, args = get_args(prompt, prompt_type, chat=chat, stream_output=stream_output,
                            max_new_tokens=max_new_tokens, langchain_mode=langchain_mode)
    res_dict, client = run_client_gen(client, prompt, args, kwargs, do_md_to_text=False, verbose=False)

    assert 'response' in res_dict and res_dict['response']


@pytest.mark.skipif(not os.getenv('SERVER'), reason="For testing text-generation-inference server")
@wrap_test_forked
def test_text_generation_inference_server1():
    """
    e.g.
    SERVER on 192.168.1.46:
    (alpaca) jon@gpu:/data/jon/h2o-llm$ CUDA_VISIBLE_DEVICES=0,1 docker run --gpus all --shm-size 2g -e NCCL_SHM_DISABLE=1 -e TRANSFORMERS_CACHE="/.cache/" -p 6112:80 -v $HOME/.cache:/.cache/ -v $HOME/.cache/huggingface/hub/:/data ghcr.io/huggingface/text-generation-inference:0.8.2 --model-id h2oai/h2ogpt-oasst1-512-12b --max-input-length 2048 --max-total-tokens 4096 --sharded=true --num-shard=2 --disable-custom-kernels --quantize bitsandbytes --trust-remote-code --max-stop-sequences=6

    CLIENT on separate system:
    HOST=http://192.168.1.46:6112 SERVER=1 pytest -s -v tests/test_client_calls.py::test_text_generation_inference_server1

    :return:
    """
    # Python client test:
    from text_generation import Client

    host = os.getenv("HOST", "http://127.0.0.1:6112")
    client = Client(host)
    print(client.generate("What is Deep Learning?", max_new_tokens=17).generated_text)

    text = ""
    for response in client.generate_stream("What is Deep Learning?", max_new_tokens=17):
        if not response.token.special:
            text += response.token.text
    assert 'Deep learning is a subfield of machine learning' in text

    # Curl test (not really pass/fail yet)
    import subprocess
    output = subprocess.run(['curl', '%s/generate' % host, '-X', 'POST', '-d',
                             '{"inputs":"<|prompt|>What is Deep Learning?<|endoftext|><|answer|>","parameters":{"max_new_tokens": 20, "truncate": 1024, "do_sample": false, "temperature": 0.1, "repetition_penalty": 1.2}}',
                             '-H', 'Content-Type: application/json',
                             '--user', 'user:bhx5xmu6UVX4'],
                            check=True, capture_output=True).stdout.decode()
    text = ast.literal_eval(output)['generated_text']
    assert 'Deep learning is a subfield of machine learning' in text or \
           'Deep learning refers to a class of machine learning' in text
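
# Illustrative sketch (not collected by pytest): the same /generate request as the curl
# test above, issued via requests instead of a curl subprocess.  The payload mirrors the
# curl body, and the basic-auth credentials are the ones hardcoded in that test.
def _example_tgi_generate(host='http://127.0.0.1:6112'):
    import requests
    payload = {
        "inputs": "<|prompt|>What is Deep Learning?<|endoftext|><|answer|>",
        "parameters": {"max_new_tokens": 20, "truncate": 1024, "do_sample": False,
                       "temperature": 0.1, "repetition_penalty": 1.2},
    }
    res = requests.post('%s/generate' % host, json=payload, auth=('user', 'bhx5xmu6UVX4'))
    res.raise_for_status()
    return res.json()['generated_text']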