leandroaraujodev committed
Commit 550c464
1 Parent(s): a168116

Gabriel integration

Files changed (1):
  1. app.py (+47 -34)
app.py CHANGED
@@ -25,42 +25,28 @@ from typing import List, Optional
  from llama_index.core import PromptTemplate
  import torch
  from llama_index.embeddings.huggingface import HuggingFaceEmbedding
- import huggingface_hub
+
 
  import logging
  import sys
  from PIL import Image
- import gc
-
- def flush():
-     gc.collect()
-     torch.cuda.empty_cache()
-     torch.cuda.reset_peak_memory_stats()
 
- # Hugging Face token
- HF_TOKEN: Optional[str] = os.getenv("HF_TOKEN")
- huggingface_hub.login(HF_TOKEN)
  # Browser tab icon configuration
 
- im = Image.open("./pngegg.png")
+ im = Image.open("pngegg.png")
  st.set_page_config(page_title = "Chatbot Carômetro", page_icon=im, layout = "wide")
 
- # Folders that need to be created
- pastas = ["bm25_retriever", "chat_store", "chroma_db", "documentos"]
-
- # Create each folder if it does not exist
- for pasta in pastas:
-     if not os.path.exists(pasta):
-         os.makedirs(pasta)
-         print(f"Pasta '{pasta}' criada com sucesso.")
-     else:
-         print(f"Pasta '{pasta}' já existe.")
-
-
+ # Removed the loop; using os.makedirs instead
+ os.makedirs("bm25_retriever", exist_ok=True)
+ os.makedirs("chat_store", exist_ok=True)
+ os.makedirs("chroma_db", exist_ok=True)
+ os.makedirs("documentos", exist_ok=True)
+ os.makedirs("curadoria", exist_ok=True)
+ os.makedirs("chroma_db_curadoria", exist_ok=True)
 
  # Streamlit configuration
  st.sidebar.title("Configuração de LLM")
- sidebar_option = st.sidebar.radio("Selecione o LLM", ["OpenAI", "HF Local"])
+ sidebar_option = st.sidebar.radio("Selecione o LLM", ["gpt-3.5-turbo", "NuExtract-1.5"])
  # logo_url = 'app\logos\logo-sicoob.jpg'
  # st.sidebar.image(logo_url)
  import base64
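
The folder-setup change above swaps an exists-check loop for os.makedirs(..., exist_ok=True), which is idempotent and avoids the race between checking for a directory and creating it. A minimal equivalent loop form, using the same directory names as the diff:

    import os

    # exist_ok=True makes the call a no-op when the directory already exists,
    # so no prior os.path.exists() check is needed.
    for pasta in ["bm25_retriever", "chat_store", "chroma_db",
                  "documentos", "curadoria", "chroma_db_curadoria"]:
        os.makedirs(pasta, exist_ok=True)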
@@ -82,22 +68,16 @@ with open("sicoob-logo.png", "rb") as f:
  #if sidebar_option == "Ollama":
  #    Settings.llm = Ollama(model="llama3.2:latest", request_timeout=500.0, num_gpu=1)
  #    Settings.embed_model = OllamaEmbedding(model_name="nomic-embed-text:latest")
- if sidebar_option == "gpt-3.5":
+ if sidebar_option == "gpt-3.5-turbo":
      from llama_index.llms.openai import OpenAI
      from llama_index.embeddings.openai import OpenAIEmbedding
-     os.environ["OPENAI_API_KEY"] = "sk-proj-[REDACTED]"
      Settings.llm = OpenAI(model="gpt-3.5-turbo")
      Settings.embed_model = OpenAIEmbedding(model_name="text-embedding-ada-002")
- elif sidebar_option == 'HF Local':
+ elif sidebar_option == 'NuExtract-1.5':
 
      logging.basicConfig(stream=sys.stdout, level=logging.INFO)
      logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
 
-     #query_wrapper_prompt = PromptTemplate(
-     #    "Below are several documents about a company "
-     #    "Write a response that appropriately completes the request.\n\n"
-     #    "### Instruction:\n{query_str}\n\n### Response:"
-     #)
      # Hugging Face embedding
      Settings.embed_model = HuggingFaceEmbedding(
          model_name="BAAI/bge-small-en-v1.5"
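
With the hardcoded key removed (redacted above), the OpenAI branch relies on OPENAI_API_KEY already being set in the process environment, which llama_index's OpenAI wrapper picks up by default. A minimal sketch, with an illustrative error message, of failing fast when the variable is missing:

    import os

    # The OpenAI client resolves OPENAI_API_KEY from the environment;
    # check it up front instead of failing on the first request.
    if not os.environ.get("OPENAI_API_KEY"):
        raise RuntimeError("Set OPENAI_API_KEY before launching the app.")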
@@ -139,6 +119,7 @@ elif sidebar_option == 'HF Local':
 
      tokenizer.apply_chat_template(chat, tokenize=False)
 
+
      Settings.chunk_size = 512
      Settings.llm = llm
 
@@ -149,7 +130,10 @@ else:
  chat_store_path = os.path.join("chat_store", "chat_store.json")
  documents_path = os.path.join("documentos")
  chroma_storage_path = os.path.join("chroma_db")  # Directory for Chroma persistence
+ chroma_storage_path_curadoria = os.path.join("chroma_db_curadoria")  # Directory for 'curadoria'
  bm25_persist_path = os.path.join("bm25_retriever")
+ curadoria_path = os.path.join("curadoria")
+
 
  # Document loading configuration
  documents = SimpleDirectoryReader(input_dir=documents_path).load_data()
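
The two new paths feed the second Chroma store built in the next hunk; chromadb.PersistentClient persists collections on disk under its path argument, so they survive process restarts. A minimal sketch of that pattern in isolation (the sample document is made up):

    import chromadb

    # A persistent client stores collections under `path`, so
    # get_or_create_collection() finds existing data after a restart.
    client = chromadb.PersistentClient(path="chroma_db_curadoria")
    collection = client.get_or_create_collection("dense_vectors_curadoria")
    collection.add(ids=["doc-1"], documents=["texto de exemplo"])
    print(collection.count())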
@@ -191,10 +175,39 @@ else:
  os.makedirs(bm25_persist_path, exist_ok=True)
  bm25_retriever.persist(bm25_persist_path)
 
+ # Documents added under 'curadoria'; chunk size set to 1200 to fit question, context, and answer
+ curadoria_documents = SimpleDirectoryReader(input_dir=curadoria_path).load_data()
+
+ curadoria_docstore = SimpleDocumentStore()
+ curadoria_docstore.add_documents(curadoria_documents)
+
+ db_curadoria = chromadb.PersistentClient(path=chroma_storage_path_curadoria)
+ chroma_collection_curadoria = db_curadoria.get_or_create_collection("dense_vectors_curadoria")
+ vector_store_curadoria = ChromaVectorStore(chroma_collection=chroma_collection_curadoria)
+
+ # StorageContext configuration for 'curadoria'
+ storage_context_curadoria = StorageContext.from_defaults(
+     docstore=curadoria_docstore, vector_store=vector_store_curadoria
+ )
+
+ # Create or reload the embeddings index for 'curadoria'
+ if os.path.exists(chroma_storage_path_curadoria):
+     curadoria_index = VectorStoreIndex.from_vector_store(vector_store_curadoria)
+ else:
+     curadoria_splitter = LangchainNodeParser(
+         RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=100)
+     )
+     curadoria_index = VectorStoreIndex.from_documents(
+         curadoria_documents, storage_context=storage_context_curadoria, transformations=[curadoria_splitter]
+     )
+     vector_store_curadoria.persist()
+
+ curadoria_retriever = curadoria_index.as_retriever(similarity_top_k=2)
+
  # Retriever combination (embeddings + BM25)
  vector_retriever = index.as_retriever(similarity_top_k=2)
  retriever = QueryFusionRetriever(
-     [vector_retriever, bm25_retriever],
+     [vector_retriever, bm25_retriever, curadoria_retriever],
      similarity_top_k=2,
      num_queries=4,
      mode="reciprocal_rerank",
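
mode="reciprocal_rerank" merges the three retrievers' ranked lists with reciprocal rank fusion: each node's fused score is the sum of 1 / (k + rank) over every list it appears in, so documents ranked well by several retrievers rise to the top. A standalone sketch of that scoring (k=60 is the conventional constant, not a value taken from this code):

    from collections import defaultdict

    def reciprocal_rank_fusion(result_lists, k=60):
        """Fuse ranked lists of node ids; higher fused score is better."""
        scores = defaultdict(float)
        for results in result_lists:
            for rank, node_id in enumerate(results, start=1):
                scores[node_id] += 1.0 / (k + rank)
        return sorted(scores, key=scores.get, reverse=True)

    # ids as ranked by the vector, BM25, and curadoria retrievers
    print(reciprocal_rank_fusion([["a", "b", "c"], ["b", "a"], ["c", "b"]]))
    # ['b', 'a', 'c'] -- "b" wins by appearing high in all three lists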
@@ -248,4 +261,4 @@ if user_input:
  for message in st.session_state.chat_history:
      role, text = message.split(":", 1)
      with st.chat_message(role.strip().lower()):
-         st.write(text.strip())
+         st.write(text.strip())
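
The rendering loop at the end assumes each chat_history entry is a single "role: text" string; split(":", 1) splits only on the first colon, so colons inside the message text survive. A small illustration with made-up entries:

    chat_history = ["user: Qual o horário de atendimento?",
                    "assistant: Das 8h às 17h, de segunda a sexta."]
    for message in chat_history:
        role, text = message.split(":", 1)  # split at the first colon only
        print(role.strip().lower(), "->", text.strip())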
 