File size: 4,188 Bytes
ff1f92b 5585965 f9919ad 883c8fa 8848678 f9919ad cf0475c 5585965 ff1f92b f5dd29d 5585965 883c8fa 5585965 ff1f92b 5585965 e6fe9fb 883c8fa 1f3f1ca ff1f92b 5585965 7446d35 cc21256 5585965 ff1f92b 5585965 cc21256 5585965 ff1f92b a353273 f1e2b8d fd18087 87177f6 fd18087 883c8fa f1e2b8d fd18087 ebec180 25ba8c5 fd18087 ebec180 fd18087 ebec180 883c8fa ebec180 a353273 8b94f40 62b5d38 ff1f92b 8b94f40 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 |
import streamlit as st
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
import os
from PyPDF2 import PdfReader
from transformers import pipeline
from transformers import AutoModel
#from googletrans import Translator
#from transformers import *
###########
#pip install faiss-cpu
#pip install langchain
#pip install pypdf
#pip install tiktoken
#pip install InstructorEmbedding
###############
# Convert PDFs into a single string
def get_pdf_text(folder_path):
    """Extract and concatenate the text of every PDF directly inside a folder.

    Files are visited in sorted name order so that the combined text (and
    anything built from it) is reproducible; ``os.listdir`` on its own
    returns entries in arbitrary order. Sub-directories are not searched.

    After extraction every newline is replaced by a space and every
    occurrence of ``"- "`` is removed (presumably to rejoin words that were
    hyphenated across line breaks; note this also drops free-standing
    dashes followed by a space). The cleaned text is additionally rendered
    through ``st.text``.

    Args:
        folder_path: Directory to scan for files whose name ends in
            ``.pdf`` (case-insensitive).

    Returns:
        The cleaned text of all pages of all PDFs as one string; an empty
        string when the folder contains no PDF.
    """
    page_texts = []
    for filename in sorted(os.listdir(folder_path)):
        filepath = os.path.join(folder_path, filename)
        # Only regular files with a ".pdf" extension are read.
        if not (os.path.isfile(filepath) and filename.lower().endswith(".pdf")):
            continue
        pdf_reader = PdfReader(filepath)
        # A page without extractable text contributes nothing instead of
        # breaking the concatenation.
        page_texts.extend(page.extract_text() or "" for page in pdf_reader.pages)

    text = "".join(page_texts)
    text = text.replace("\n", " ")
    text = text.replace("- ", "")
    st.text(text)
    return text
# Create chunks
def get_text_chunks(text):
    """Split text into overlapping chunks of roughly 1000 characters.

    Args:
        text: The text to split.

    Returns:
        The list of chunk strings produced by ``CharacterTextSplitter``,
        configured with a chunk size of 1000 and an overlap of 200, both
        measured with ``len``.
    """
    # Split on spaces: get_pdf_text replaces every newline with a space, so
    # a "\n" separator would never match and the whole text would come back
    # as one oversized chunk.
    text_splitter = CharacterTextSplitter(
        separator=" ",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return text_splitter.split_text(text)
# Only used to create the local "Store" directory and persist the vector database
def create_vectorstore_and_store():
    """Build a FAISS index from the PDFs in ``./files`` and save it to ``Store``.

    The PDF text is read with ``get_pdf_text``, split with
    ``get_text_chunks``, embedded with the ``hkunlp/instructor-base``
    model and written to the local directory ``Store``.

    Returns:
        None.
    """
    chunks = get_text_chunks(get_pdf_text('./files'))
    # Alternative model noted in the original source:
    # "aari1995/German_Semantic_STS_V2"
    instructor = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base")
    # Build the FAISS database and persist it locally in one step.
    FAISS.from_texts(texts=chunks, embedding=instructor).save_local("Store")
########
def get_vectorstore():
    """Load the FAISS vector database stored in the local ``Store`` directory.

    The index is loaded with ``hkunlp/instructor-base`` embeddings, the
    same model name that ``create_vectorstore_and_store`` uses to build it.

    Returns:
        The object returned by ``FAISS.load_local``.
    """
    # Alternative model noted in the original source:
    # "aari1995/German_Semantic_STS_V2"
    return FAISS.load_local(
        "Store",
        HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base"),
    )
def get_llm_answer(user_question):
    """Return the stored PDF passages most relevant to a question.

    Despite the name, no language model is called here: the
    question-answering step is disabled, so the concatenated text of the
    retrieved passages is returned as the answer.

    Args:
        user_question: The question to look up in the vector store.

    Returns:
        The ``page_content`` of up to the first three retrieved documents,
        concatenated without a separator; an empty string if nothing was
        retrieved.
    """
    # The retriever looks up matching (unformatted) text passages from the PDFs.
    retriever = get_vectorstore().as_retriever()
    retrieved_docs = retriever.invoke(user_question)

    # Keep at most the top three hits. Slicing instead of indexing [0..2]
    # avoids an IndexError when fewer than three documents are returned.
    context = "".join(doc.page_content for doc in retrieved_docs[:3])

    # NOTE: a German/English question-answering pipeline
    # ("deutsche-telekom/bert-multi-english-german-squad2") and a translation
    # step were sketched here in the original source but are disabled.
    return context
def main():
    """Configure the Streamlit page and display a one-line notice."""
    page_options = {
        "page_title": "Chatbot",
        "layout": "wide",
        "initial_sidebar_state": "expanded",
    }
    st.set_page_config(**page_options)
    st.text("Chatbot Rene ist über Telegram erreichbar!")
# Script entry point: call main() only when this file is executed directly.
if __name__ == '__main__':
    main()