import pytest from tests.utils import wrap_test_forked @pytest.mark.need_tokens @wrap_test_forked def test_langchain_simple_h2ogpt(): run_langchain_simple(base_model='h2oai/h2ogpt-oasst1-512-12b', prompt_type='human_bot') @pytest.mark.need_tokens @wrap_test_forked def test_langchain_simple_vicuna(): run_langchain_simple(base_model='junelee/wizard-vicuna-13b', prompt_type='instruct_vicuna') def run_langchain_simple(base_model='h2oai/h2ogpt-oasst1-512-12b', prompt_type='human_bot'): """ :param base_model: :param prompt_type: prompt_type required for stopping support and correct handling of instruction prompting :return: """ import torch from transformers import AutoModelForCausalLM, AutoTokenizer from h2oai_pipeline import H2OTextGenerationPipeline model_name = base_model from transformers import AutoConfig config = AutoConfig.from_pretrained(base_model, use_auth_token=True, trust_remote_code=True, offload_folder="./") llama_type_from_config = 'llama' in str(config).lower() llama_type_from_name = "llama" in base_model.lower() llama_type = llama_type_from_config or llama_type_from_name if llama_type: from transformers import LlamaForCausalLM, LlamaTokenizer model_loader = LlamaForCausalLM tokenizer_loader = LlamaTokenizer else: model_loader = AutoModelForCausalLM tokenizer_loader = AutoTokenizer load_in_8bit = True n_gpus = torch.cuda.device_count() if torch.cuda.is_available else 0 device = 'cpu' if n_gpus == 0 else 'cuda' device_map = {"": 0} if device == 'cuda' else "auto" tokenizer = tokenizer_loader.from_pretrained(model_name, padding_side="left") model = model_loader.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map=device_map, load_in_8bit=load_in_8bit) gen_kwargs = dict(max_new_tokens=512, return_full_text=True, early_stopping=False) pipe = H2OTextGenerationPipeline(model=model, tokenizer=tokenizer, prompt_type=prompt_type, **gen_kwargs) # below makes it listen only to our prompt removal, # not built in prompt removal that is less general and not specific for our model pipe.task = "text2text-generation" # create llm for LangChain from langchain.llms import HuggingFacePipeline llm = HuggingFacePipeline(pipeline=pipe) # Setup QA from langchain import PromptTemplate from langchain.chains.question_answering import load_qa_chain # NOTE: Instruct-tuned models don't need excessive many-shot examples that waste context space template = """ == {context} == {question}""" prompt = PromptTemplate( input_variables=["context", "question"], template=template, ) chain = load_qa_chain(llm, prompt=prompt) docs = [] # could have been some Documents from LangChain inputted from some sources query = "Give detailed list of reasons for who is smarter, Einstein or Newton." chain_kwargs = dict(input_documents=docs, question=query) answer = chain(chain_kwargs) print(answer) if 'vicuna' in base_model: res1 = 'Both Albert Einstein and Sir Isaac Newton were brilliant scientists' in answer[ 'output_text'] and "Newton" in answer['output_text'] res2 = 'Both Albert Einstein and Sir Isaac Newton are considered two' in answer[ 'output_text'] and "Newton" in answer['output_text'] else: res1 = 'Einstein was a genius who revolutionized physics' in answer['output_text'] and "Newton" in answer[ 'output_text'] res2 = 'Einstein and Newton are two of the most famous scientists in history' in answer[ 'output_text'] and "Newton" in answer['output_text'] assert res1 or res2