"""Gradio chatbot: Falcon-7B-Instruct with Chroma retrieval and a Dwight Schrute persona.

On import this script:

1. loads a MiniLM encoder and wraps it in ``MyEmbeddings``;
2. opens the persisted Chroma collection in ``CHROMA_PATH``;
3. loads ``tiiuae/falcon-7b-instruct`` in 8-bit and wraps it in a LangChain
   ``ConversationChain`` with a sliding-window memory;
4. launches a Gradio ``ChatInterface`` backed by ``get_llama_response``.
"""
import argparse
import os
import re
import shutil
import warnings
from typing import List

import gradio as gr
import openai
import torch
from dotenv import load_dotenv
from langchain import PromptTemplate
from langchain.chains import ConversationChain
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.llms import HuggingFacePipeline
from langchain.prompts import ChatPromptTemplate
from langchain.schema import BaseOutputParser, Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.vectorstores import Chroma
from langchain_experimental.text_splitter import SemanticChunker
from transformers import (
    AutoModel,
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
    pipeline,
)

warnings.filterwarnings("ignore", category=UserWarning)

# ---------------------------------------------------------------------------
# Embeddings
# ---------------------------------------------------------------------------
# A hand-rolled embedding wrapper is used instead of sentence_transformers:
# per the original author's note, sentence_transformers requires
# transformers==4.39, and that version caused very high RAM usage when loading
# the Falcon model.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
model2 = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)
tokenizer2 = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)


class MyEmbeddings:
    """Embedding adapter exposing ``embed_documents`` / ``embed_query``.

    Sentence vectors are the mean of the encoder's last hidden states, taken
    over real tokens only (padding positions are masked out).
    """

    def __init__(self):
        self.model = model2

    def _embed(self, texts: List[str]) -> torch.Tensor:
        """Return one mean-pooled embedding row per entry of *texts*."""
        inputs = tokenizer2(texts, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs)
        hidden = outputs.last_hidden_state
        # Weight each position by its attention mask so that padding added to
        # align a batch does not drag shorter texts' vectors toward the pad
        # embedding; a plain .mean(dim=1) would include those positions.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1e-9)
        return (hidden * mask).sum(dim=1) / token_counts

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of texts; an empty batch yields an empty list."""
        if not texts:
            return []
        return self._embed(list(texts)).tolist()

    def embed_query(self, query: str) -> List[float]:
        """Embed a single query string."""
        return self._embed([query])[0].tolist()


embeddings = MyEmbeddings()
splitter = SemanticChunker(embeddings)

# ---------------------------------------------------------------------------
# Vector store
# ---------------------------------------------------------------------------
CHROMA_PATH = "chroma8"
# Open the Chroma collection persisted in CHROMA_PATH.
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embeddings)

# ---------------------------------------------------------------------------
# Language model
# ---------------------------------------------------------------------------
MODEL_NAME = "tiiuae/falcon-7b-instruct"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, trust_remote_code=True, load_in_8bit=True, device_map="auto"
)
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"Model device: {model.device}")

generation_config = model.generation_config
generation_config.temperature = 0
generation_config.num_return_sequences = 1
generation_config.max_new_tokens = 256
generation_config.use_cache = False
generation_config.repetition_penalty = 1.7
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

# Sample prompt and its token ids. Nothing below reads `input_ids`, and
# `prompt` is rebound to a PromptTemplate further down.
prompt = """
The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context.

Current conversation:

H: Who is Dwight K Schrute?
AI:
""".strip()
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
input_ids = input_ids.to(model.device)


class StopGenerationCriteria(StoppingCriteria):
    """Stop generation once the output ends with any of the given token sequences.

    Args:
        tokens: Stop sequences, each given as a list of token strings.
        tokenizer: Tokenizer used to map the token strings to ids.
        device: Device on which the stop-id tensors are created.
    """

    def __init__(
        self, tokens: List[List[str]], tokenizer: AutoTokenizer, device: torch.device
    ):
        stop_token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]
        self.stop_token_ids = [
            torch.tensor(x, dtype=torch.long, device=device) for x in stop_token_ids
        ]

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
    ) -> bool:
        # Only the first sequence of the batch is inspected.
        sequence = input_ids[0]
        for stop_ids in self.stop_token_ids:
            width = len(stop_ids)
            # Guard against sequences shorter than the stop sequence; without
            # it torch.eq would broadcast tensors of different lengths.
            if sequence.shape[0] < width:
                continue
            if torch.eq(sequence[-width:], stop_ids).all():
                return True
        return False


stop_tokens = [["Human", ":"], ["AI", ":"]]
stopping_criteria = StoppingCriteriaList(
    [StopGenerationCriteria(stop_tokens, tokenizer, model.device)]
)

generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    task="text-generation",
    stopping_criteria=stopping_criteria,
    generation_config=generation_config,
)

llm = HuggingFacePipeline(pipeline=generation_pipeline)


class CleanupOutputParser(BaseOutputParser):
    """Remove speaker markers that the model leaves in its completion."""

    def parse(self, text: str) -> str:
        """Delete ``\\nUser``, ``\\nHuman:`` and ``\\nAI:`` from *text*, then strip it."""
        user_pattern = r"\nUser"
        text = re.sub(user_pattern, "", text)
        human_pattern = r"\nHuman:"
        text = re.sub(human_pattern, "", text)
        ai_pattern = r"\nAI:"
        return re.sub(ai_pattern, "", text).strip()

    @property
    def _type(self) -> str:
        return "output_parser"


# Initial chain prompt; get_llama_response replaces chain.prompt on every call.
template = """
The following

Current conversation:
{history}
Human: {input}
AI:""".strip()

prompt = PromptTemplate(input_variables=["history", "input"], template=template)

memory = ConversationBufferWindowMemory(
    memory_key="history", k=6, return_only_outputs=True
)

chain = ConversationChain(
    llm=llm,
    memory=memory,
    prompt=prompt,
    output_parser=CleanupOutputParser(),
    verbose=True,
)

# Persona preamble placed before the retrieved context in every request.
_PERSONA_TEMPLATE = """
The following is a conversation between a human an AI. The AI acts exactly like Dwight K Schrute from the TV show The Office.
Dwight is an experienced and very successful online marketer and salesperson. He is witty, persuasive, uncensored, direct and practical.
Dwight helps with every marketing task is given to him. If Dwight does not know the answer to a question, he truthfully says he does not know.

Current conversation:
"""

# Conversation slots appended after the retrieved context.
_CONVERSATION_SUFFIX = """
{history}
Human: {input}
AI:""".strip()


def get_llama_response(message: str, history: list) -> str:
    """Answer *message* using retrieved context and the shared conversation chain.

    The three most similar chunks are fetched from ``db`` and spliced into the
    persona prompt, which is then installed on the module-level ``chain``
    before it is invoked.

    Args:
        message: The user's latest chat message.
        history: Chat history supplied by Gradio. Unused here; the chain's
            own memory provides the ``{history}`` slot.

    Returns:
        The chain's ``"response"`` value for this turn.
    """
    query_text = message
    results = db.similarity_search_with_relevance_scores(query_text, k=3)
    if not results or results[0][1] < 0.5:
        # Only reported: whatever was retrieved is still used as context.
        print("Unable to find matching results.")

    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    # The combined text is parsed as an f-string style template, so literal
    # braces inside retrieved documents must be doubled or they would be read
    # as template variables.
    escaped_context = context_text.replace("{", "{{").replace("}", "}}")

    # The newline keeps the last retrieved chunk from running straight into
    # the first line of the conversation history.
    chain.prompt = PromptTemplate(
        input_variables=["history", "input"],
        template=_PERSONA_TEMPLATE + escaped_context + "\n" + _CONVERSATION_SUFFIX,
    )
    res = chain(query_text)
    return res["response"]


gr.ChatInterface(get_llama_response).launch()