Spaces:
Sleeping
Sleeping
File size: 6,344 Bytes
5d16466 0c2a681 7f93b89 afb7b3e 7f93b89 afb7b3e 5d16466 0c2a681 5d16466 afb7b3e 7f93b89 afb7b3e 7f93b89 5d16466 7f93b89 5d16466 afb7b3e 5d16466 afb7b3e 5d16466 afb7b3e 5d16466 afb7b3e 5d16466 afb7b3e 5d16466 afb7b3e 5d16466 afb7b3e 5d16466 afb7b3e 7f93b89 5d16466 afb7b3e 7f93b89 afb7b3e 7f93b89 afb7b3e f7ca342 20fe8a3 7f93b89 0c2a681 5d16466 7f93b89 afb7b3e 7f93b89 0c2a681 5d16466 0c2a681 7f93b89 0c2a681 7f93b89 0c2a681 5d16466 0c2a681 5d16466 afb7b3e 5d16466 e161fc4 5d16466 afb7b3e 5d16466 afb7b3e 5d16466 afb7b3e 5d16466 afb7b3e 5d16466 afb7b3e 5d16466 afb7b3e 7f93b89 afb7b3e 5d16466 afb7b3e 5d16466 afb7b3e 5d16466 afb7b3e 7f93b89 5d16466 afb7b3e 5d16466 afb7b3e 5d16466 7f93b89 5d16466 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 |
import streamlit as st
import requests
import subprocess
import atexit
import os
import signal
import os
import PyPDF2
from docx import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import pickle
from datetime import datetime
import io
from dotenv import load_dotenv
from groq import Groq
# In-memory list of formatted log lines; _log() appends one entry per answered question.
log_data = []

# Load variables from a .env file (if any) before the API key is read below.
load_dotenv()

# Shared Groq client used by get_completion(); the key comes from GROQ_API_KEY.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
class User:
    """Session user together with the model names used on their behalf.

    Attributes are plain instance attributes:
      username -- free-form name supplied by the person using the app.
      llm      -- identifier of the chat model used for answering.
      embedder -- identifier of the sentence-embedding model used for indexing.
    """

    def __init__(self, username):
        # Model identifiers are fixed for every user; only the name varies.
        self.embedder = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
        self.llm = "llama3-8b-8192"
        self.username = username
def upload_documents(user: User, files) -> tuple[str, int, FAISS]:
    """Extract, chunk and index the uploaded files for ``user``.

    Args:
        user: The user the documents belong to (supplies the embedder name and
            the prefix used in chunk metadata).
        files: Iterable of uploaded file objects exposing ``name`` and ``read()``.

    Returns:
        A ``(message, status_code, vector_store)`` triple. On success the status
        is 200 and ``vector_store`` is the freshly built index; on failure the
        status is 500 and ``vector_store`` is ``None``. The tuple always has
        three elements so callers can unpack it unconditionally.
    """
    text = _extract_text_from_document(files)
    chunks = _chunk_text(text)
    if not chunks:
        # Nothing could be extracted from the files, so there is nothing to embed.
        return "Failed to upload document.", 500, None

    status_code, vector_store = _create_embeddings_and_save(user, chunks)
    if status_code == 200:
        return "Document uploaded successfully.", 200, vector_store
    # Keep the arity identical to the success path (the original returned only
    # two values here, which broke three-way unpacking in the caller).
    return "Failed to upload document.", 500, None
def _extract_text_from_document(files) -> str:
text = ""
for file in files:
file_name = file.name
file_extension = os.path.splitext(file_name)[1]
if file_extension == '.txt':
text += file.read().decode('utf-8')
elif file_extension == '.pdf':
pdf_reader = PyPDF2.PdfReader(io.BytesIO(file.read()))
for page_number in range(len(pdf_reader.pages)):
page = pdf_reader.pages[page_number]
text += page.extract_text()
elif file_extension == '.docx':
doc = Document(io.BytesIO(file.read()))
for paragraph in doc.paragraphs:
text += paragraph.text + "\n"
return text
def _chunk_text(text: str) -> list[str]:
    """Split ``text`` into chunks with a newline-based ``CharacterTextSplitter``.

    The splitter is configured with a chunk size of 512 and an overlap of 10,
    both measured with ``len`` (i.e. in characters).
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 512,
        "chunk_overlap": 10,
        "length_function": len,
    }
    splitter = CharacterTextSplitter(**splitter_options)
    return splitter.split_text(text)
def _create_embeddings_and_save(user: User, chunks: list[str]) -> tuple[int, FAISS]:
    """Embed ``chunks`` with the user's embedder and build a FAISS index.

    Each chunk is tagged with a ``source`` metadata entry of the form
    ``"<username>:<chunk index>"``.

    Args:
        user: Supplies ``embedder`` (model name) and ``username`` (metadata prefix).
        chunks: Text chunks to index.

    Returns:
        ``(200, vector_store)``. The status is always 200 here; any failure in
        the embedding or indexing libraries propagates as an exception.
    """
    embeddings = HuggingFaceEmbeddings(model_name=user.embedder)
    metadatas = [{"source": f"{user.username}:{i}"} for i in range(len(chunks))]
    vector_store = FAISS.from_texts(chunks, embeddings, metadatas=metadatas)
    return 200, vector_store
def ask_question(user: User, question: str, vector_store : FAISS) -> tuple[str, int]:
    """Answer ``question`` from the chunks most similar to it in ``vector_store``.

    Args:
        user: Supplies the chat model name (``user.llm``) and is recorded in the log.
        question: The question text typed by the user.
        vector_store: Index to search; must provide ``similarity_search``.

    Returns:
        ``(answer, 200)`` on success, where ``answer`` is the model response
        followed by the retrieved chunks, or ``(error_message, 400)`` when the
        model call raises.
    """
    docs = vector_store.similarity_search(question, k=3)
    # Join with a newline so the last word of one chunk is not fused with the
    # first word of the next one; the slice caps the context at three chunks.
    retrieved_chunks = "\n".join(doc.page_content for doc in docs[:3])
    prompt = f'Question: "{question}"\nContext: "{retrieved_chunks}"'
    try:
        # Use the model configured on the user so that the model actually
        # queried matches the one recorded by _log.
        response = get_completion(prompt, model=user.llm)
    except Exception as e:
        # UI boundary: report the failure to the caller instead of crashing the app.
        return f"LLM connection failed.{e}", 400
    answer = f'{response}\n\n**<Most Related Chunk>**\n\n{retrieved_chunks}'
    _log(user, question, retrieved_chunks, response)
    return answer, 200
def get_completion(prompt, model="llama3-8b-8192"):
    """Send ``prompt`` to the chat model and return the stripped reply text.

    The request consists of a fixed system instruction followed by ``prompt``
    as the user message, sent through the module-level ``client`` with
    temperature 0. Only the first choice of the response is used.
    """
    system_message = {
        "role": "system",
        "content": "Based on the context provided, answer the question as an easy-to-understand assistant. Ensure that the answer is concise, directly addresses the question, and is in the same language as the question.",
    }
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=model,
        messages=[system_message, user_message],
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()
def _log(user: User, question: str, retrieved_chunks: str, answer: str):
    """Append one timestamped, comma-separated record of a Q&A turn to ``log_data``.

    The record holds the local timestamp, username, question, model names,
    retrieved text and answer, and ends with a newline.
    """
    fields = [
        datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        f"Username: {user.username}",
        f"Question: {question}",
        f"LLM: {user.llm}",
        f"Embedder: {user.embedder}",
        f"Retrieved Texts: {retrieved_chunks}",
        f"Answer: {answer}",
    ]
    log_data.append(", ".join(fields) + "\n")
def main():
    """Render the page header and dispatch to the tab chosen by the user."""
    st.title("Free Multilingual RAG")
    repo_url = "https://github.com/mehmet-emin-aydin/RAG_CHATBOT"
    st.write(f"[Review in Github]({repo_url})")

    upload_label, ask_label = "Upload Document", "Ask Question"
    selected = st.radio(
        "Upload documents first, ask questions later:",
        [upload_label, ask_label],
    )

    if selected == upload_label:
        upload_document()
    elif selected == ask_label:
        # The vector store is read from the session only when this tab is active.
        ask_question_ui(st.session_state.vector_store)
def upload_document():
    """Render the upload tab and index the uploaded files for the entered user.

    Nothing is processed until both a username and at least one file are
    provided. On success the new vector store replaces the one kept in
    ``st.session_state.vector_store``; on failure the previous store is kept
    and an error message is shown.
    """
    st.write("Several files can be uploaded, each upload crushes the old one. Depending on the number and size of files, the upload process may take a long time.")
    username = st.text_input("Enter a username (just something that represents you):")
    uploaded_files = st.file_uploader("Upload your documents (for now it only works with files that have .txt, .pdf or .docx extension):", accept_multiple_files=True)
    if not (uploaded_files and username):
        return

    st.write("Number of uploaded files:", len(uploaded_files))
    for uploaded_file in uploaded_files:
        file_details = {"FileName": uploaded_file.name, "FileType": uploaded_file.type, "FileSize": uploaded_file.size}
        st.write(file_details)

    user = User(username=username)
    result = upload_documents(user, uploaded_files)
    # Unpack defensively: a failure result may carry only (message, status).
    response, status_code = result[0], result[1]
    vector_store = result[2] if len(result) > 2 else None

    if status_code == 200:
        # Only replace the session's store once indexing actually succeeded.
        st.session_state.vector_store = vector_store
        st.success(response)
    else:
        # st.error takes a single message body, so build one string.
        st.error(f"Error: {response}")
def ask_question_ui(vector_store : FAISS):
    """Render the question tab and show the answer for the submitted question.

    Args:
        vector_store: The index to query. Before any upload this may be an
            empty placeholder without ``similarity_search``; in that case the
            user is asked to upload a document instead of the app crashing.
    """
    username = st.text_input("Enter a username (just something that represents you):")
    question = st.text_area("Enter the question you want to ask in your document (the more detailed your question, the more accurate an answer you will get):")
    if not st.button("Ask"):
        return

    if not question:
        st.warning("Please enter a question.")
    elif not username:
        st.warning("Please enter a username.")
    elif not hasattr(vector_store, "similarity_search"):
        # No index has been built yet, so there is nothing to search.
        st.warning("Please upload a document first.")
    else:
        user = User(username=username)
        answer, status_code = ask_question(user, question, vector_store)
        if status_code == 200:
            st.success("Answer: " + answer)
        else:
            st.error("Error: " + answer)
if __name__ == "__main__":
    # Seed the session with an empty placeholder so main() can always read
    # st.session_state.vector_store, even before any document is uploaded.
    if "vector_store" not in st.session_state:
        st.session_state["vector_store"] = {}
    main()
|