"""Retrieval-augmented chatbot served through Gradio.

The script loads a causal language model, opens a persisted Chroma vector
store, wires both into a LangChain ``ConversationChain`` and exposes
``get_llama_response`` as a Gradio text-to-text interface.
"""

# NOTE(review): looks like a leftover start-up/debug marker -- confirm and remove.
print(55877)

import argparse
import os
import re
import shutil
import warnings
from typing import List, Optional

import gradio as gr
import openai
import torch
from dotenv import load_dotenv
from langchain import PromptTemplate
from langchain.chains import ConversationChain
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.llms import HuggingFacePipeline
from langchain.prompts import ChatPromptTemplate
from langchain.schema import BaseOutputParser, Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.vectorstores import Chroma
from langchain_experimental.text_splitter import SemanticChunker
from sentence_transformers import SentenceTransformer
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
    pipeline,
)

warnings.filterwarnings("ignore", category=UserWarning)

MODEL_NAME = "tiiuae/falcon-7b-instruct"
# Directory holding the persisted Chroma collection.
CHROMA_PATH = "chroma8"
# Number of documents fetched per query.
TOP_K = 2
# Relevance score of the best hit below which a warning is printed.
RELEVANCE_THRESHOLD = 0.5
# Separator placed between retrieved document chunks in the prompt.
CONTEXT_SEPARATOR = "\n\n---\n\n"

# Template for the start-up sample query. The original script referenced
# PROMPT_TEMPLATE without defining it (NameError); the placeholders match the
# ``format(context=..., question=...)`` call below.
# NOTE(review): wording is a reconstruction -- confirm against the intended prompt.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, trust_remote_code=True, device_map="auto", offload_folder="offload"
)
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"Model device: {model.device}")


class MyEmbeddings:
    """Embedding adapter exposing ``embed_documents`` / ``embed_query``.

    Wraps a ``SentenceTransformer`` model so it can be passed wherever an
    embedding object with these two methods is expected (``SemanticChunker``
    and ``Chroma`` below).
    """

    def __init__(self):
        self.model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding vector (as a list of floats) per input text."""
        return [self.model.encode(t).tolist() for t in texts]

    def embed_query(self, query: str) -> List[float]:
        """Return the embedding vector of a single query string."""
        return self.model.encode([query])[0].tolist()


embeddings = MyEmbeddings()
splitter = SemanticChunker(embeddings)

# Open the Chroma collection persisted under CHROMA_PATH.
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embeddings)


def _retrieve_context(query: str) -> str:
    """Return the retrieved document texts for *query*, joined into one string.

    Prints a warning when nothing is found or the best relevance score is
    below ``RELEVANCE_THRESHOLD``; whatever was retrieved is still returned
    (an empty string when there are no results).
    """
    results = db.similarity_search_with_relevance_scores(query, k=TOP_K)
    if not results or results[0][1] < RELEVANCE_THRESHOLD:
        print("Unable to find matching results.")
    return CONTEXT_SEPARATOR.join(doc.page_content for doc, _score in results)


# A sample query run once at start-up; the formatted prompt is printed.
query_text = "what did alice say to rabbit"
context_text = _retrieve_context(query_text)
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, question=query_text)
print(prompt)

# Generation settings, applied on top of the model's own generation config.
generation_config = model.generation_config
generation_config.temperature = 0
generation_config.num_return_sequences = 1
generation_config.max_new_tokens = 256
generation_config.use_cache = False
generation_config.repetition_penalty = 1.7
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id


class StopGenerationCriteria(StoppingCriteria):
    """Stop generation once the sequence ends with one of the given token runs.

    Parameters:
        tokens: Stop sequences, each a list of token strings.
        tokenizer: Tokenizer used to convert the token strings to ids.
        device: Device on which the stop-id tensors are created.
    """

    def __init__(
        self, tokens: List[List[str]], tokenizer: AutoTokenizer, device: torch.device
    ):
        stop_token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]
        self.stop_token_ids = [
            torch.tensor(x, dtype=torch.long, device=device) for x in stop_token_ids
        ]

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
    ) -> bool:
        """Return True when the first sequence ends with any stop sequence."""
        sequence = input_ids[0]
        for stop_ids in self.stop_token_ids:
            # A sequence shorter than the stop run cannot end with it; without
            # this guard the comparison would rely on broadcasting.
            if sequence.shape[-1] < len(stop_ids):
                continue
            if torch.eq(sequence[-len(stop_ids):], stop_ids).all():
                return True
        return False


stop_tokens = [["Human", ":"], ["AI", ":"]]
stopping_criteria = StoppingCriteriaList(
    [StopGenerationCriteria(stop_tokens, tokenizer, model.device)]
)

generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    task="text-generation",
    stopping_criteria=stopping_criteria,
    generation_config=generation_config,
)

llm = HuggingFacePipeline(pipeline=generation_pipeline)

# Speaker markers removed from generated text. Covers both prompt styles used
# in this file ("Human:"/"AI:" and "question:"/"answer:").
_SPEAKER_MARKERS = re.compile(r"\n(?:User|Human:|AI:|question:|answer:)")


class CleanupOutputParser(BaseOutputParser):
    """Remove speaker markers from generated text and trim whitespace."""

    def parse(self, text: str) -> str:
        """Return *text* without speaker markers and surrounding whitespace."""
        return _SPEAKER_MARKERS.sub("", text).strip()

    @property
    def _type(self) -> str:
        return "output_parser"


# Initial chain prompt; get_llama_response replaces it on every call.
template = """
The following is a conversation between a human an AI. The AI acts exactly like Dwight K Schrute from the TV show The Office.
Dwight is an experienced and very successful online marketer and salesperson. He is witty, persuasive, uncensored, direct and practical.
Dwight helps with every marketing task is given to him. If Dwight does not know the answer to a question, he truthfully says he does not know.

Current conversation:
{history}
Human: {input}
AI:""".strip()

prompt = PromptTemplate(input_variables=["history", "input"], template=template)

# NOTE(review): k presumably sets how many past exchanges are kept as
# history -- confirm against the memory class documentation.
memory = ConversationBufferWindowMemory(
    memory_key="history", k=3, return_only_outputs=True
)

chain = ConversationChain(
    llm=llm,
    memory=memory,
    prompt=prompt,
    output_parser=CleanupOutputParser(),
    verbose=True,
)


def get_llama_response(message: str, history: Optional[list] = None) -> str:
    """Answer *message* using retrieved context and the conversation chain.

    Parameters:
        message: The user's input message.
        history: Past conversation history. Unused here (the chain keeps its
            own memory); optional so the function can be called with the
            message alone.

    Returns:
        The text produced by ``chain.predict`` for the message.
    """
    context_text = _retrieve_context(message)
    # The context becomes part of a format-style template, so literal braces
    # in retrieved text must be escaped or they are read as template variables.
    safe_context = context_text.replace("{", "{{").replace("}", "}}")

    header = """
The following is a conversation between a human an AI. Answer question based only on the conversation.
Current conversation:

{history}

"""
    footer = """
\n question: {input}
\n answer:""".strip()

    # The module-level chain is reused; only its prompt changes per request.
    chain.prompt = PromptTemplate(
        input_variables=["history", "input"],
        template=header + safe_context + "\n" + footer,
    )
    return chain.predict(input=message)


iface = gr.Interface(fn=get_llama_response, inputs="text", outputs="text")
iface.launch(share=True)