# know-my-doc / app2.py
# Commit cb35b85 (sjain15): "feat: Added know-my-doc-code"
import os
import logging
from flask import Flask, request, jsonify, render_template
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import DirectoryLoader
from langchain.llms import OpenAIChat
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.document_loaders import WebBaseLoader
import yaml
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
import nltk
# Download the NLTK "punkt" tokenizer data at import time.
# NOTE(review): presumably required by the document loaders used below — confirm.
nltk.download("punkt")

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load configuration from YAML file.
# The encoding is pinned so parsing does not depend on the platform default,
# and an empty file (which safe_load parses to None) becomes an empty mapping
# so that a missing key surfaces as a clear KeyError below.
with open("config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f) or {}

# Export the API key through the environment as well as keeping it in config.
os.environ["OPENAI_API_KEY"] = config["openai_api_key"]

# The template folder is resolved against the current working directory.
template_dir = os.path.abspath("templates")
app = Flask(__name__, template_folder=template_dir, static_folder="static")
# Load the files matched by the configured directory and glob pattern.
loader = DirectoryLoader(config["data_directory"], glob=config["data_files_glob"])
docs = loader.load()

# "webpages" is optional. `or []` also covers a key that is present but left
# empty in the YAML (parsed as None), which would otherwise break the loop.
webpages = config.get("webpages") or []
web_docs = []
for webpage in webpages:
    logger.info("Loading data from webpage %s", webpage)
    loader = WebBaseLoader(webpage)
    web_docs.extend(loader.load())

# Local files first, then web pages.
result = docs + web_docs

# Values substituted into the prompt's {tone} and {persona} placeholders.
tone = config.get("tone", "default")
persona = config.get("persona", "default")

# Split the documents into chunks (chunk_size=1000, no overlap), then embed
# them and build the Chroma store that is queried for each question.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(result)
embeddings = OpenAIEmbeddings(openai_api_key=config["openai_api_key"])
docsearch = Chroma.from_documents(texts, embeddings)
# Initialize the QA chain
logger.info("Initializing QA chain...")

# Prompt text for the "stuff" chain. The placeholders are filled from the
# retrieved documents ({context}), the memory buffer ({chat_history}) and the
# per-request inputs ({human_input}, {tone}, {persona}).
_QA_TEMPLATE = """You are a chatbot who acts like {persona}, having a conversation with a human.
Given the following extracted parts of a long document and a question, Create a final answer with references ("SOURCES") in the tone {tone}.
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.
SOURCES should only be hyperlink URLs which are genuine and not made up.
{context}
{chat_history}
Human: {human_input}
Chatbot:"""

# Build the pieces in the same order the original call evaluated them.
_qa_llm = OpenAIChat()
_qa_memory = ConversationBufferMemory(
    memory_key="chat_history",
    input_key="human_input",
)
_qa_prompt = PromptTemplate(
    input_variables=["chat_history", "human_input", "context", "tone", "persona"],
    template=_QA_TEMPLATE,
)

chain = load_qa_chain(
    _qa_llm,
    chain_type="stuff",
    memory=_qa_memory,
    prompt=_qa_prompt,
    verbose=False,
)
@app.route("/")
def index():
    """Render and return the ``index.html`` template for the root URL."""
    page = render_template("index.html")
    return page
@app.route("/api/chat", methods=["POST"])
def chat():
    """Answer one chat question using the indexed documents.

    Expects a JSON body of the form ``{"question": "<text>"}``. The question
    is used to retrieve similar chunks from ``docsearch``; those chunks are
    passed to the QA ``chain`` together with the configured ``tone`` and
    ``persona``.

    Returns:
        JSON ``{"response": <answer>}`` plus an updated ``session_counter``
        cookie on success; JSON ``{"error": ...}`` with status 400 when no
        usable question was supplied, or status 500 when processing fails.
    """
    # silent=True returns None for a missing or malformed JSON body instead of
    # raising, so bad client input is reported as 400 rather than 500.
    payload = request.get_json(silent=True)
    question = payload.get("question") if isinstance(payload, dict) else None
    if not isinstance(question, str) or not question.strip():
        return jsonify({"error": "A non-empty 'question' is required."}), 400

    try:
        documents = docsearch.similarity_search(question, include_metadata=True)

        # Get the bot's response
        response = chain(
            {
                "input_documents": documents,
                "human_input": question,
                "tone": tone,
                "persona": persona,
            },
            return_only_outputs=True,
        )["output_text"]

        # Increment the message counter. The first message counts as 1, so the
        # memory is not flushed right after the very first exchange, and an
        # unparsable cookie restarts the count instead of failing the request.
        previous = request.cookies.get("session_counter")
        try:
            session_counter = int(previous) + 1 if previous is not None else 1
        except ValueError:
            session_counter = 1

        # Flush the conversation memory every 10th message.
        # NOTE(review): chain.memory is a single module-level buffer, while the
        # counter lives in a per-client cookie, so this clears the history seen
        # by every client — confirm whether per-session memory is intended.
        if session_counter % 10 == 0:
            chain.memory.clear()

        # Return the response as JSON with the session counter cookie
        resp = jsonify({"response": response})
        resp.set_cookie("session_counter", str(session_counter))
        return resp
    except Exception as e:
        # Boundary handler: log with traceback and hide details from the client.
        logger.exception("Error while processing request: %s", e)
        return jsonify({"error": "Unable to process the request."}), 500
if __name__ == "__main__":
    # Start Flask's built-in server when the file is executed directly.
    # SECURITY: debug mode enables the interactive Werkzeug debugger, which
    # allows arbitrary code execution and must not be exposed in production.
    # It stays on by default (unchanged behaviour) but can now be disabled by
    # setting `debug: false` in config.yaml.
    app.run(debug=config.get("debug", True))