Spaces:
Sleeping
Sleeping
MefhigosetH
committed on
Commit
·
7ffe358
1
Parent(s):
b2f16d4
Implementamos modulo LLM y VectorStore.
Browse files- .gitignore +3 -1
- Pipfile +8 -0
- app.py +9 -1
- chatbot/embeddings.py +20 -0
- chatbot/llm.py +36 -0
- chatbot/vectorstore.py +25 -0
- ingest.py +25 -0
- requirements.txt +2 -1
.gitignore
CHANGED
@@ -1,2 +1,4 @@
|
|
1 |
Pipfile.lock
|
2 |
-
*.pdf
|
|
|
|
|
|
1 |
Pipfile.lock
|
2 |
+
*.pdf
|
3 |
+
.env
|
4 |
+
chroma_db/
|
Pipfile
CHANGED
@@ -5,6 +5,14 @@ name = "pypi"
|
|
5 |
|
6 |
[packages]
|
7 |
huggingface-hub = "==0.25.2"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
[dev-packages]
|
10 |
gradio = "==5.5.0"
|
|
|
|
5 |
|
6 |
[packages]
|
7 |
huggingface-hub = "==0.25.2"
|
8 |
+
langchain = "*"
|
9 |
+
langchain-community = "*"
|
10 |
+
langchain-huggingface = "*"
|
11 |
+
langchain-chroma = "*"
|
12 |
+
einops = "*"
|
13 |
+
langchain-google-genai = "*"
|
14 |
+
langchain-core = "*"
|
15 |
|
16 |
[dev-packages]
|
17 |
gradio = "==5.5.0"
|
18 |
+
pypdf = "==5.1.0"
|
app.py
CHANGED
@@ -2,13 +2,21 @@
|
|
2 |
Chatbot Nuevo Régimen Académico
|
3 |
"""
|
4 |
from chatbot.ui import ChatbotInterface
|
|
|
|
|
5 |
|
6 |
|
7 |
def respond(message, history):
|
8 |
-
|
|
|
|
|
|
|
9 |
|
10 |
|
11 |
if __name__ == "__main__":
|
|
|
|
|
|
|
12 |
ui = ChatbotInterface(respond)
|
13 |
|
14 |
ui.app.launch()
|
|
|
2 |
Chatbot Nuevo Régimen Académico
|
3 |
"""
|
4 |
from chatbot.ui import ChatbotInterface
|
5 |
+
from chatbot.llm import GeminiAI
|
6 |
+
from langchain.globals import set_verbose, set_debug
|
7 |
|
8 |
|
9 |
def respond(message, history):
|
10 |
+
prompt = llm.getMainTemplate()
|
11 |
+
chain = prompt | llm.llm
|
12 |
+
response = chain.invoke({"message": message, "history": history})
|
13 |
+
return response.content
|
14 |
|
15 |
|
16 |
if __name__ == "__main__":
|
17 |
+
set_verbose(True)
|
18 |
+
set_debug(True)
|
19 |
+
llm = GeminiAI("gemini-1.5-flash")
|
20 |
ui = ChatbotInterface(respond)
|
21 |
|
22 |
ui.app.launch()
|
chatbot/embeddings.py
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Modulo embeddings
|
3 |
+
"""
|
4 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
5 |
+
|
6 |
+
def init_embeddings( embeddings_model_name="jinaai/jina-embeddings-v3" ):
|
7 |
+
"""
|
8 |
+
Inicializa y devuelve un modelo para embeddings.
|
9 |
+
"""
|
10 |
+
|
11 |
+
model_kwargs = {"trust_remote_code":True}
|
12 |
+
encode_kwargs = {'normalize_embeddings': False}
|
13 |
+
embeddings = HuggingFaceEmbeddings(
|
14 |
+
model_name=embeddings_model_name,
|
15 |
+
model_kwargs=model_kwargs,
|
16 |
+
encode_kwargs=encode_kwargs,
|
17 |
+
show_progress=True
|
18 |
+
)
|
19 |
+
|
20 |
+
return embeddings
|
chatbot/llm.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
"""
|
3 |
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
4 |
+
from langchain_core.prompts import ChatPromptTemplate
|
5 |
+
|
6 |
+
class GeminiAI:
|
7 |
+
"""
|
8 |
+
Google Gemini AI class.
|
9 |
+
"""
|
10 |
+
|
11 |
+
def __init__(self, llm_model_name: str) -> None:
|
12 |
+
self.llm = ChatGoogleGenerativeAI(model=llm_model_name)
|
13 |
+
|
14 |
+
def getMainTemplate(self) -> ChatPromptTemplate:
|
15 |
+
"""
|
16 |
+
Devuelve el system prompt principal.
|
17 |
+
"""
|
18 |
+
|
19 |
+
prompt = ChatPromptTemplate.from_messages(
|
20 |
+
[
|
21 |
+
("system",
|
22 |
+
"Eres un asesor experto en la Resolucion 1650/24 de la DGCyE de la Provincia de Buenos Aires.\n"
|
23 |
+
"Tu tarea es utiliza la información de la conversación y el contexto disponible para responder las consultas del usuario.\n"),
|
24 |
+
("placeholder", "{history}"),
|
25 |
+
("human", "{message}"),
|
26 |
+
]
|
27 |
+
)
|
28 |
+
|
29 |
+
return prompt
|
30 |
+
|
31 |
+
|
32 |
+
|
33 |
+
if __name__ == "__main__":
|
34 |
+
llm = GeminiAI("gemini-1.5-flash")
|
35 |
+
response = llm.llm.invoke("Hola")
|
36 |
+
print(response)
|
chatbot/vectorstore.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Modulo que permite gestionar la vector store.
|
3 |
+
"""
|
4 |
+
from langchain_chroma import Chroma
|
5 |
+
import requests, zipfile, io, os
|
6 |
+
|
7 |
+
class ChromaDB:
|
8 |
+
"""
|
9 |
+
Clase para gestionar una base ChromaDB
|
10 |
+
"""
|
11 |
+
|
12 |
+
def __init__(self, embedding_model) -> None:
|
13 |
+
if not os.path.exists("chroma_db"):
|
14 |
+
print("Descargando base de conocimiento...")
|
15 |
+
zip_file_url = "https://drive.google.com/uc?export=download&id=" + os.environ["GDRIVE_ID"]
|
16 |
+
r = requests.get(zip_file_url)
|
17 |
+
z = zipfile.ZipFile(io.BytesIO(r.content))
|
18 |
+
z.extractall()
|
19 |
+
print("OK")
|
20 |
+
|
21 |
+
self.db = Chroma(
|
22 |
+
collection_name="res_1650",
|
23 |
+
embedding_function=embedding_model,
|
24 |
+
persist_directory="./chroma_db",
|
25 |
+
)
|
ingest.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Modulo para procesar el PDF de la resolucion e indexar su contenido en la DB, para su posterior utilización por parte del chatbot.
|
3 |
+
|
4 |
+
Por simplicidad, se indexo un documento por cada página completa del documento. TODO: Implementar estrategia ParentDocumentRetriever.
|
5 |
+
"""
|
6 |
+
#from langchain_community.document_loaders import PyPDFLoader
|
7 |
+
from chatbot.embeddings import init_embeddings
|
8 |
+
from chatbot.vectorstore import ChromaDB
|
9 |
+
|
10 |
+
if __name__ == "__main__":
|
11 |
+
#loader = PyPDFLoader("2024_DP_134.pdf")
|
12 |
+
embedding_model = init_embeddings()
|
13 |
+
|
14 |
+
vector_store = ChromaDB(embedding_model)
|
15 |
+
|
16 |
+
#for page in loader.lazy_load():
|
17 |
+
#print(f"Procesando pagina {page.metadata['page']} - len: {len(page.page_content)}")
|
18 |
+
#vector_store.add_documents([page])
|
19 |
+
|
20 |
+
results = vector_store.db.similarity_search(
|
21 |
+
"Cuantos anexos contiene la resolucion?",
|
22 |
+
k=2,
|
23 |
+
)
|
24 |
+
|
25 |
+
print(results)
|
requirements.txt
CHANGED
@@ -1 +1,2 @@
|
|
1 |
-
gradio==5.5.0
|
|
|
|
1 |
+
gradio==5.5.0
|
2 |
+
pypdf==5.1.0
|