import streamlit as st | |
from langchain.embeddings import HuggingFaceInstructEmbeddings | |
from langchain.vectorstores import FAISS | |
from langchain.text_splitter import CharacterTextSplitter | |
from langchain.document_loaders import DirectoryLoader, PyPDFLoader | |
import os | |
from PyPDF2 import PdfReader | |
from transformers import pipeline | |
from transformers import AutoModel | |
#from googletrans import Translator | |
#from transformers import * | |
########### | |
#pip install faiss-cpu | |
#pip install langchain | |
#pip install pypdf | |
#pip install tiktoken
#pip install InstructorEmbedding | |
############### | |
# PDF in String umwandeln | |
def get_pdf_text(folder_path):
    """Extract and concatenate the text of every PDF directly inside *folder_path*.

    Files are visited in sorted name order so the resulting text is
    reproducible (``os.listdir`` gives no ordering guarantee).
    Sub-directories and files without a ``.pdf`` extension are skipped.
    After extraction, every newline is replaced by a space and every
    ``"- "`` sequence is removed (presumably to re-join words that were
    hyphenated at a line break). The cleaned text is also rendered on the
    Streamlit page via ``st.text``.

    Args:
        folder_path: Directory to scan for PDF files (not recursive).

    Returns:
        The cleaned text of all pages as a single string; ``""`` when no
        PDF text was found.
    """
    page_texts = []
    for filename in sorted(os.listdir(folder_path)):
        filepath = os.path.join(folder_path, filename)
        # Only regular files with a (case-insensitive) ".pdf" extension.
        if not (os.path.isfile(filepath) and filename.lower().endswith(".pdf")):
            continue
        for page in PdfReader(filepath).pages:
            # A page without extractable text may yield an empty/None
            # result; normalise it to "" so joining never fails.
            page_texts.append(page.extract_text() or "")

    # Join once instead of growing a string with += inside the loops.
    text = "".join(page_texts)
    text = text.replace("\n", " ")
    # NOTE: this also strips any "- " that is not a line-break hyphen.
    text = text.replace("- ", "")
    st.text(text)
    return text
#Chunks erstellen | |
def get_text_chunks(text):
    """Split *text* into overlapping chunks suitable for embedding.

    The splitter breaks on single spaces. ``get_pdf_text`` in this file
    replaces every newline with a space before the text reaches this
    function, so a ``"\\n"`` separator would never match and the whole
    corpus would come back as one oversized chunk.

    Args:
        text: The full document text.

    Returns:
        A list of strings of at most about 1000 characters each, with up
        to 200 characters of overlap between neighbouring chunks.
    """
    text_splitter = CharacterTextSplitter(
        separator=" ",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return text_splitter.split_text(text)
# nur zum Anlegen des lokalen Verzeichnisses "Store" und speichern der Vektor-Datenbank | |
def create_vectorstore_and_store(
    folder_path='./files',
    save_directory="Store",
    model_name="hkunlp/instructor-base",
):
    """Build a FAISS index from the PDFs in *folder_path* and save it locally.

    Only needed to (re)create the on-disk vector database; answering
    questions loads the saved index instead.

    Args:
        folder_path: Directory containing the source PDFs.
        save_directory: Directory the FAISS index is written to.
        model_name: Hugging Face Instructor embedding model to use. It
            must match the model used when the index is loaded again.

    Returns:
        None.

    Raises:
        ValueError: If no text chunks could be produced from the PDFs,
            because an index cannot be built from an empty list of texts.
    """
    pdf_text = get_pdf_text(folder_path)
    text_chunks = get_text_chunks(pdf_text)
    if not text_chunks:
        # Fail with a clear message instead of an obscure error raised
        # from inside the FAISS index construction.
        raise ValueError(
            f"No text could be extracted from PDFs in {folder_path!r}; "
            "cannot build the vector store."
        )

    embeddings = HuggingFaceInstructEmbeddings(model_name=model_name)
    vectorstore_db = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    # Persist the index so it can be reloaded without re-embedding.
    vectorstore_db.save_local(save_directory)
    return None
######## | |
def get_vectorstore():
    """Load the FAISS index stored in the local "Store" directory.

    The index is opened with the "hkunlp/instructor-base" Instructor
    embeddings, the same model name used in this file when the index is
    created.

    Returns:
        The loaded FAISS vector store.
    """
    # NOTE(review): FAISS.load_local is understood to unpickle the stored
    # docstore - only load a "Store" directory you created yourself.
    instructor = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base")
    return FAISS.load_local("Store", instructor)
def get_llm_answer(user_question):
    """Return the retrieved context passages for *user_question*.

    The question is run through the retriever of the local FAISS vector
    store, and the text of up to the first three hits is concatenated.

    NOTE: despite the name, no language model is called here. The
    question-answering pipeline and translation steps that used to
    follow are disabled, so the raw retrieved context is returned.

    Args:
        user_question: The question to search the indexed PDFs for.

    Returns:
        The concatenated ``page_content`` of up to three retrieved
        documents; ``""`` when nothing was retrieved.
    """
    retriever = get_vectorstore().as_retriever()
    retrieved_docs = retriever.invoke(user_question)
    # Slice instead of indexing [0], [1], [2] so fewer than three hits
    # do not raise IndexError.
    context = "".join(doc.page_content for doc in retrieved_docs[:3])
    return context
def main():
    """Configure the Streamlit page and show the static info message."""
    page_options = {
        "page_title": "Chatbot",
        "layout": "wide",
        "initial_sidebar_state": "expanded",
    }
    st.set_page_config(**page_options)
    st.text("Chatbot Rene ist über Telegram erreichbar!")


# Run the page only when this file is executed as a script.
if __name__ == "__main__":
    main()