jarif committed on
Commit
2354330
·
verified ·
1 Parent(s): cefd1c0

Update ingest.py

Browse files
Files changed (1) hide show
  1. ingest.py +39 -73
ingest.py CHANGED
@@ -1,73 +1,39 @@
1
- import os
2
- import logging
3
- import streamlit as st
4
- from langchain_community.document_loaders import PDFMinerLoader
5
- from langchain.text_splitter import RecursiveCharacterTextSplitter
6
- from langchain_community.embeddings import HuggingFaceEmbeddings
7
- from langchain_community.vectorstores import FAISS
8
-
9
- logging.basicConfig(level=logging.INFO)
10
- logger = logging.getLogger(__name__)
11
-
12
- def create_faiss_index():
13
- documents = []
14
- docs_dir = "docs"
15
-
16
- if not os.path.exists(docs_dir):
17
- st.error(f"The directory '{docs_dir}' does not exist.")
18
- return
19
-
20
- for root, dirs, files in os.walk(docs_dir):
21
- for file in files:
22
- if file.endswith(".pdf"):
23
- file_path = os.path.join(root, file)
24
- st.info(f"Loading document: {file_path}")
25
- try:
26
- loader = PDFMinerLoader(file_path)
27
- documents.extend(loader.load())
28
- except Exception as e:
29
- st.error(f"Error loading {file_path}: {e}")
30
-
31
- if not documents:
32
- st.error("No documents were loaded. Check the 'docs' directory and file paths.")
33
- return
34
-
35
- st.info(f"Loaded {len(documents)} documents.")
36
-
37
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
38
- texts = text_splitter.split_documents(documents)
39
-
40
- if not texts:
41
- st.error("No text chunks were created. Check the text splitting process.")
42
- return
43
-
44
- st.info(f"Created {len(texts)} text chunks.")
45
-
46
- try:
47
- embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
48
- except Exception as e:
49
- st.error(f"Failed to initialize embeddings: {e}")
50
- return
51
-
52
- try:
53
- db = FAISS.from_documents(texts, embeddings)
54
- st.info(f"Created FAISS index with {len(texts)} vectors")
55
- except Exception as e:
56
- st.error(f"Failed to create FAISS index: {e}")
57
- return
58
-
59
- index_dir = "faiss_index"
60
- if not os.path.exists(index_dir):
61
- os.makedirs(index_dir)
62
-
63
- try:
64
- db.save_local(index_dir)
65
- st.success(f"FAISS index successfully saved to {index_dir}")
66
- index_path = os.path.join(index_dir, "index.faiss")
67
- st.info(f"Index file size: {os.path.getsize(index_path)} bytes")
68
- st.info(f"Index file permissions: {oct(os.stat(index_path).st_mode)[-3:]}")
69
- except Exception as e:
70
- st.error(f"Failed to save FAISS index: {e}")
71
-
72
- if __name__ == "__main__":
73
- create_faiss_index()
 
1
+ import os
2
+ from langchain.document_loaders import PyPDFLoader
3
+ from langchain_community.embeddings import HuggingFaceEmbeddings
4
+ from langchain_community.vectorstores import FAISS
5
+
6
def _list_pdf_paths(docs_directory):
    """Return the paths of the PDF files directly inside *docs_directory*, sorted."""
    candidates = (
        os.path.join(docs_directory, name) for name in os.listdir(docs_directory)
    )
    # The extension is matched case-insensitively ('.PDF' counts) and directories
    # that merely have a PDF-like name are ignored. Sorting makes the ingestion
    # order reproducible, because os.listdir() returns entries in arbitrary order.
    return sorted(
        path
        for path in candidates
        if path.lower().endswith('.pdf') and os.path.isfile(path)
    )


def create_faiss_index() -> bool:
    """Build a FAISS index from the PDFs in ``docs`` and save it to ``faiss_index``.

    Each PDF found directly inside the ``docs`` directory is loaded with
    ``PyPDFLoader``; the loaded documents are embedded with the
    ``sentence-transformers/all-MiniLM-L6-v2`` model, indexed with
    ``FAISS.from_documents`` and written with ``save_local``.

    Errors are reported with ``print`` and never propagate to the caller.

    Returns:
        bool: ``True`` when the index was created and saved, ``False`` when any
        step failed (missing or empty directory, no PDFs, loader, embedding or
        save error).
    """
    docs_directory = 'docs'
    index_path = "faiss_index"

    try:
        # isdir (rather than exists) also rejects a regular file named 'docs'.
        if not os.path.isdir(docs_directory) or not os.listdir(docs_directory):
            raise ValueError(f"Directory '{docs_directory}' is empty or does not exist.")

        documents = []
        for pdf_path in _list_pdf_paths(docs_directory):
            try:
                documents.extend(PyPDFLoader(pdf_path).load())
            except Exception as e:
                # Re-raise with the file name so the report says which PDF failed.
                raise RuntimeError(f"Failed to load '{pdf_path}': {e}") from e

        if not documents:
            raise ValueError("No valid documents found in the 'docs' directory.")

        # NOTE(review): documents are embedded exactly as the loader returns
        # them, with no text-splitting step -- confirm this is intended, since
        # long texts may exceed the embedding model's input limit.
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        faiss_index = FAISS.from_documents(documents, embeddings)

        os.makedirs(index_path, exist_ok=True)
        faiss_index.save_local(index_path)
    except Exception as e:
        # Top-level boundary of the script: report the failure and signal it
        # through the return value instead of raising.
        print(f"An error occurred during FAISS index creation: {e}")
        return False

    print("FAISS index created and saved successfully.")
    return True


if __name__ == "__main__":
    # Exit non-zero on failure so shells and CI can detect a failed ingestion.
    raise SystemExit(0 if create_faiss_index() else 1)