Spaces:
Sleeping
Sleeping
CordwainerSmith
committed on
Add project files and Docker setup
Browse files- Dockerfile +48 -0
- README.md +10 -10
- app.py +305 -0
- auth.py +39 -0
- knowledge_graph.html +0 -0
- query_config.yaml +23 -0
- requirements.txt +3 -0
- search_handlers.py +285 -0
- settings.yaml +196 -0
- styles.css +112 -0
- wiki.py +965 -0
Dockerfile
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Step 1: Use an official Python slim base image
FROM python:3.10-slim

# Step 2: Install system dependencies.
# The apt package lists are deleted in the same layer so they are not
# carried along in the final image.
RUN apt-get update && apt-get install -y \
    wget \
    tar \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Step 3: Add a non-root user (required by Hugging Face Spaces)
RUN useradd -m -u 1000 user

# Step 4: Switch to the "user" user
USER user

# Step 5: Set home and working directory
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH
WORKDIR $HOME/app

# Step 6: Copy requirements into the container
COPY --chown=user requirements.txt ./requirements.txt

# Step 7: Install Python dependencies
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Step 8: Copy all necessary files and folders into the container
COPY --chown=user .output ./.output
COPY --chown=user cache ./cache
COPY --chown=user input ./input
COPY --chown=user output ./output
COPY --chown=user prompts ./prompts
COPY --chown=user reports ./reports
COPY --chown=user auth.py ./auth.py
COPY --chown=user knowledge_graph.html ./knowledge_graph.html
COPY --chown=user query_config.yaml ./query_config.yaml
COPY --chown=user app.py ./app.py
COPY --chown=user search_handlers.py ./search_handlers.py
COPY --chown=user settings.yaml ./settings.yaml
COPY --chown=user styles.css ./styles.css
COPY --chown=user wiki.py ./wiki.py

# Step 10: Expose the port Streamlit is started on (set via --server.port in CMD below)
EXPOSE 7860

# Step 11: Define the entrypoint command
CMD ["streamlit", "run", "app.py", "--server.port", "7860", "--server.address", "0.0.0.0"]
|
README.md
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
-
---
|
2 |
-
title: PwcGraphRAG
|
3 |
-
emoji: ⚡
|
4 |
-
colorFrom: purple
|
5 |
-
colorTo: yellow
|
6 |
-
sdk: docker
|
7 |
-
pinned: false
|
8 |
-
---
|
9 |
-
|
10 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
+
---
|
2 |
+
title: PwcGraphRAG
|
3 |
+
emoji: ⚡
|
4 |
+
colorFrom: purple
|
5 |
+
colorTo: yellow
|
6 |
+
sdk: docker
|
7 |
+
pinned: false
|
8 |
+
---
|
9 |
+
|
10 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,305 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import asyncio
|
3 |
+
import sys
|
4 |
+
from pathlib import Path
|
5 |
+
import base64
|
6 |
+
import pandas as pd
|
7 |
+
from typing import Literal, Tuple, Optional
|
8 |
+
from wiki import render_wiki_tab
|
9 |
+
from search_handlers import run_global_search, run_local_search, run_drift_search
|
10 |
+
import auth
|
11 |
+
|
12 |
+
|
13 |
+
import graphrag.api as api
|
14 |
+
from graphrag.config import GraphRagConfig, load_config, resolve_paths
|
15 |
+
from graphrag.index.create_pipeline_config import create_pipeline_config
|
16 |
+
from graphrag.logging import PrintProgressReporter
|
17 |
+
from graphrag.utils.storage import _create_storage, _load_table_from_storage
|
18 |
+
|
19 |
+
|
20 |
+
# Page-level Streamlit settings (title, icon, wide layout).
st.set_page_config(
    page_title="GraphRAG Chat Interface",
    page_icon="🔍",
    layout="wide",
)

# Emoji used as avatars when the session has none stored yet.
DEFAULT_USER_AVATAR = "👤"
DEFAULT_BOT_AVATAR = "🤖"

# Seed the per-session avatars once; existing values are left alone.
if "user_avatar" not in st.session_state:
    st.session_state["user_avatar"] = DEFAULT_USER_AVATAR
if "bot_avatar" not in st.session_state:
    st.session_state["bot_avatar"] = DEFAULT_BOT_AVATAR

# Module-level avatar constants (same emoji as the defaults above).
USER_AVATAR = "👤"
BOT_AVATAR = "🤖"
|
35 |
+
|
36 |
+
|
37 |
+
class StreamlitProgressReporter(PrintProgressReporter):
    """Progress reporter that forwards success messages to a Streamlit placeholder."""

    def __init__(self, placeholder) -> None:
        """Initialise the base reporter with an empty string and keep *placeholder*."""
        super().__init__("")
        self.placeholder = placeholder

    def success(self, message: str) -> None:
        """Show *message* through the placeholder's ``success`` method."""
        target = self.placeholder
        target.success(message)
|
44 |
+
|
45 |
+
|
46 |
+
def _chat_entry(role, content):
    """Build a chat-history record stamped with the current time as HH:MM."""
    return {
        "role": role,
        "content": content,
        "timestamp": pd.Timestamp.now().strftime("%H:%M"),
    }


def render_chat_tab():
    """Render the Chat tab: history, last search context, and the query box.

    When a query is submitted it is appended to ``st.session_state.messages``,
    dispatched to the search function matching
    ``st.session_state.search_type`` ("global", "drift", anything else falls
    back to local search), and the answer or an error text is appended as an
    assistant message. The script is then rerun so the updated history is
    drawn.
    """
    format_message_history()

    # The context of the latest successful search is kept in session state and
    # drawn here. Rendering it only in the run that produced it would lose it,
    # because that run ends with st.rerun().
    last_context = st.session_state.get("last_context")
    if last_context is not None:
        with st.expander("View Search Context"):
            st.json(last_context)

    prompt = st.chat_input("Enter your query...")
    if not prompt:
        return

    st.session_state.messages.append(_chat_entry("user", prompt))

    # Every search function takes the same keyword arguments, so only the
    # callable differs between search types.
    search_runners = {
        "global": run_global_search,
        "drift": run_drift_search,
    }

    with st.spinner("Processing your query..."):
        response_placeholder = st.empty()
        try:
            run_search = search_runners.get(
                st.session_state.search_type, run_local_search
            )
            response, context = run_search(
                config_filepath=st.session_state.config_filepath,
                data_dir=st.session_state.data_dir,
                root_dir=st.session_state.root_dir,
                community_level=st.session_state.community_level,
                response_type=st.session_state.response_type,
                streaming=st.session_state.streaming,
                query=prompt,
                progress_placeholder=response_placeholder,
            )

            # Clear the placeholder before adding the final response
            response_placeholder.empty()

            st.session_state.messages.append(_chat_entry("assistant", response))
            st.session_state.last_context = context
        except Exception as e:
            # Top-level UI boundary: report the failure in the chat instead
            # of crashing the page.
            error_message = f"Error processing query: {str(e)}"
            st.session_state.messages.append(
                _chat_entry("assistant", error_message)
            )
            st.session_state.last_context = None

    st.rerun()
|
127 |
+
|
128 |
+
|
129 |
+
def display_message(msg: str, is_user: bool = False, timestamp: str = "") -> None:
    """Display a chat message with avatar and consistent formatting.

    Args:
        msg: Message text to show inside the bubble.
        is_user: True for a user message, False for an assistant message;
            selects the CSS class and the avatar taken from session state.
        timestamp: Text shown under the bubble (may be empty).

    The markup is rendered with ``unsafe_allow_html=True``. Text typed by
    the user is therefore HTML-escaped first, so characters such as ``<`` or
    ``&`` are shown literally instead of being interpreted as markup.
    Assistant content is inserted unchanged, as before.
    """
    import html  # function-scope import: only needed here

    if is_user:
        message_class = "user-message"
        avatar = st.session_state.user_avatar
        body = html.escape(msg)
    else:
        message_class = "assistant-message"
        avatar = st.session_state.bot_avatar
        body = msg

    # Built line by line so the emitted markup does not depend on how this
    # function happens to be indented.
    message_container = (
        "\n"
        f'<div class="chat-message {message_class}">\n'
        '<div class="avatar">\n'
        f'<div style="font-size: 25px; text-align: center;">{avatar}</div>\n'
        "</div>\n"
        '<div class="message-content-wrapper">\n'
        '<div class="message-bubble">\n'
        '<div class="message-content">\n'
        f"{body}\n"
        "</div>\n"
        "</div>\n"
        f'<div class="timestamp">{timestamp}</div>\n'
        "</div>\n"
        "</div>\n"
    )
    st.markdown(message_container, unsafe_allow_html=True)
|
151 |
+
|
152 |
+
|
153 |
+
def format_message_history() -> None:
    """Render every message stored in the session's chat history.

    Emits the opening chat-container markup, one ``display_message`` call
    per stored message (a missing timestamp is shown as empty), and then the
    closing tag.
    """
    st.markdown('<div class="chat-container">', unsafe_allow_html=True)
    for entry in st.session_state.messages:
        shown_time = entry.get("timestamp", "")
        from_user = entry["role"] == "user"
        display_message(msg=entry["content"], is_user=from_user, timestamp=shown_time)
    st.markdown("</div>", unsafe_allow_html=True)
|
164 |
+
|
165 |
+
|
166 |
+
@st.cache_resource
def load_css():
    """Return the text of ``styles.css`` from the current working directory.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default encoding. The value is cached through
    ``st.cache_resource``.

    Raises:
        OSError: If ``styles.css`` cannot be opened.
    """
    with open("styles.css", "r", encoding="utf-8") as f:
        return f.read()
|
170 |
+
|
171 |
+
|
172 |
+
def initialize_session_state():
    """Fill in any session-state keys the app relies on that are still missing.

    Keys that already exist keep their current value.
    """
    defaults = {
        "messages": [],
        "response_placeholder": None,
        "config_filepath": None,
        "data_dir": None,
        "root_dir": ".",
        "community_level": 2,
        "response_type": "concise",
        "search_type": "global",
        "streaming": True,
        "authenticated": False,
    }
    for key, initial in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = initial
|
194 |
+
|
195 |
+
|
196 |
+
def main():
    """Entry point of the Streamlit app.

    Initialises session state, gates the page behind
    ``auth.check_credentials()``, then draws the sidebar configuration
    (community level, response type, search type, streaming, logout), the
    documentation and chat tabs, and the sidebar footer.

    NOTE(review): this listing was flattened, so block nesting is
    reconstructed. In particular the Logout button is assumed to sit inside
    the sidebar. Confirm against the original file.
    """
    initialize_session_state()

    # Authentication check
    if not st.session_state.authenticated:
        if auth.check_credentials():
            st.session_state.authenticated = True
            st.rerun()  # Rerun to reflect the authentication state
        else:
            st.stop()  # Stop further execution if authentication fails

    # If authenticated, proceed with the main app
    if st.session_state.authenticated:
        st.title("PWC Home Assigment #1, Graphrag")

        css = load_css()
        st.markdown(f"<style>{css}</style>", unsafe_allow_html=True)

        # Sidebar configuration
        with st.sidebar:
            # Display logos side by side at the top of the sidebar
            col1, col2 = st.columns(2)
            with col1:
                st.markdown(
                    '<div class="logo-container"><img class="logo-image" src="https://nexttech.pwc.co.il/wp-content/uploads/2023/12/image-2.png"></div>',
                    unsafe_allow_html=True,
                )
            with col2:
                st.markdown(
                    '<div class="logo-container"><img class="logo-image" src="https://nexttech.pwc.co.il/wp-content/uploads/2023/12/Frame.png"></div>',
                    unsafe_allow_html=True,
                )

            st.header("Configuration")
            st.session_state.community_level = st.number_input(
                "Community Level",
                min_value=0,
                max_value=10,
                value=st.session_state.community_level,
                help="Controls the granularity of the search...",
            )

            # Only show response type for global and local search
            if st.session_state.search_type != "drift":
                st.session_state.response_type = st.selectbox(
                    "Response Type",
                    options=["concise", "detailed"],
                    index=0 if st.session_state.response_type == "concise" else 1,
                    help="Style of response generation",
                )

            st.session_state.search_type = st.selectbox(
                "Search Type",
                options=["global", "local", "drift"],
                index=(
                    0
                    if st.session_state.search_type == "global"
                    else 1 if st.session_state.search_type == "local" else 2
                ),
                # Assembled from single-line literals so the tooltip text
                # does not pick up source indentation.
                help=(
                    "Search Types:\n"
                    '- Local Search: "Focuses on finding specific information by searching through direct connections in the knowledge graph. Best for precise, fact-based queries."\n'
                    '- Global Search: "Analyzes the entire document collection at a high level using community summaries. Best for understanding broad themes and general policies."\n'
                    '- DRIFT Search: "Combines local and global search capabilities, dynamically exploring connections while gathering detailed information. Best for complex queries requiring both specific details and broader context."\n'
                ),
            )

            # Show streaming option only for supported search types
            if st.session_state.search_type != "drift":
                st.session_state.streaming = st.checkbox(
                    "Enable Streaming",
                    value=st.session_state.streaming,
                    help="Stream response tokens as they're generated",
                )
            else:
                st.session_state.streaming = False
                st.info("Streaming is not available for DRIFT search")

            # logout button
            if st.button("Logout"):
                st.session_state.clear()  # Clear all session state data
                initialize_session_state()  # Reinitialize the session state
                # Set the parameter through the query-params mapping.
                # Assigning a dict to st.query_params would only rebind the
                # attribute on the streamlit module and never reach the URL.
                st.query_params["restart"] = "true"
                st.rerun()

        # Create tabs
        tab1, tab2 = st.tabs(["Assignment Documentation", "Chat"])

        # readme tab content
        with tab1:
            render_wiki_tab()

        # Chat tab content
        with tab2:
            render_chat_tab()

        st.sidebar.markdown(
            (
                "\n"
                '<div style="position: absolute; bottom: 0; width: 100%; text-align: center; font-size: 14px; margin-bottom: -200px;">\n'
                "Liran Baba |\n"
                '<a href="https://linkedin.com/in/liranba" target="_blank">LinkedIn</a> |\n'
                '<a href="https://huggingface.co/CordwainerSmith" target="_blank">HuggingFace</a>\n'
                "</div>\n"
            ),
            unsafe_allow_html=True,
        )


if __name__ == "__main__":
    main()
|
auth.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import streamlit as st
|
3 |
+
|
4 |
+
|
5 |
+
def check_credentials():
    """Show the login form and report whether the session is authenticated.

    Expected credentials are read from the ``APP_USERNAME`` and
    ``APP_PASSWORD`` environment variables. If either is missing or empty,
    an error is shown and nobody can log in.

    Returns:
        True if the session was already authenticated or the submitted
        credentials match; False otherwise (form not submitted, wrong
        credentials, or missing server configuration).
    """
    import hmac  # function-scope import: only needed for the comparison below

    # Check if already authenticated
    if st.session_state.get("authenticated", False):
        return True

    # Retrieve credentials from environment variables (set via Hugging Face Secrets)
    expected_username = os.environ.get("APP_USERNAME")
    expected_password = os.environ.get("APP_PASSWORD")

    if not expected_username or not expected_password:
        st.error("Server is misconfigured: missing credentials.")
        return False

    # Show the login form only if not authenticated
    with st.form("login_form", clear_on_submit=True):
        st.text_input("Username", key="username")
        st.text_input("Password", type="password", key="password")
        submit_button = st.form_submit_button("Login")

    if not submit_button:
        return False

    # Compare in constant time, and always compare both fields, so response
    # timing does not reveal which part of the credentials was wrong.
    # Values are encoded because compare_digest rejects non-ASCII str input.
    username_ok = hmac.compare_digest(
        st.session_state["username"].encode("utf-8"),
        expected_username.encode("utf-8"),
    )
    password_ok = hmac.compare_digest(
        st.session_state["password"].encode("utf-8"),
        expected_password.encode("utf-8"),
    )

    if username_ok and password_ok:
        st.session_state["authenticated"] = True  # Mark user as authenticated
        return True

    st.error("😕 Incorrect username or password")
    return False
|
knowledge_graph.html
ADDED
The diff for this file is too large to render.
See raw diff
|
|
query_config.yaml
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
drift_search:
|
2 |
+
max_tokens: 4000
|
3 |
+
drift_k_followups: 3
|
4 |
+
n_depth: 2
|
5 |
+
local_search_text_unit_prop: 0.6
|
6 |
+
local_search_community_prop: 0.4
|
7 |
+
local_search_top_k_mapped_entities: 10
|
8 |
+
local_search_top_k_relationships: 10
|
9 |
+
|
10 |
+
local_search:
|
11 |
+
text_unit_prop: 0.5
|
12 |
+
community_prop: 0.3
|
13 |
+
conversation_history_max_turns: 5
|
14 |
+
top_k_mapped_entities: 10
|
15 |
+
top_k_relationships: 10
|
16 |
+
max_tokens: 8000
|
17 |
+
|
18 |
+
global_search:
|
19 |
+
max_tokens: 8000
|
20 |
+
data_max_tokens: 8000
|
21 |
+
map_max_tokens: 1000
|
22 |
+
reduce_max_tokens: 2000
|
23 |
+
concurrency: 16
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
streamlit==1.40.1
|
2 |
+
pandas
|
3 |
+
graphrag==0.4.1
|
search_handlers.py
ADDED
@@ -0,0 +1,285 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import asyncio
|
2 |
+
from pathlib import Path
|
3 |
+
import pandas as pd
|
4 |
+
from typing import Tuple, Optional
|
5 |
+
from graphrag.config import GraphRagConfig, load_config, resolve_paths
|
6 |
+
from graphrag.index.create_pipeline_config import create_pipeline_config
|
7 |
+
from graphrag.logging import PrintProgressReporter
|
8 |
+
from graphrag.utils.storage import _create_storage, _load_table_from_storage
|
9 |
+
import graphrag.api as api
|
10 |
+
|
11 |
+
|
12 |
+
class StreamlitProgressReporter(PrintProgressReporter):
    """Progress reporter that forwards success messages to a Streamlit placeholder."""

    def __init__(self, placeholder) -> None:
        """Initialise the base reporter with an empty string and keep *placeholder*."""
        super().__init__("")
        self.placeholder = placeholder

    def success(self, message: str) -> None:
        """Show *message* through the placeholder's ``success`` method."""
        target = self.placeholder
        target.success(message)
|
19 |
+
|
20 |
+
|
21 |
+
def _resolve_parquet_files(
    root_dir: str,
    config: GraphRagConfig,
    parquet_list: list[str],
    optional_list: list[str],
) -> dict[str, pd.DataFrame]:
    """Load parquet tables from the configured storage into a dict.

    Keys are the file names up to the first dot. Every name in
    *parquet_list* is loaded unconditionally. Names in *optional_list* are
    loaded only if the storage reports them present, otherwise their key
    maps to None.
    """
    pipeline_config = create_pipeline_config(config)
    storage = _create_storage(root_dir=root_dir, config=pipeline_config.storage)

    def _load(name):
        # Each table is loaded with its own asyncio.run call.
        return asyncio.run(_load_table_from_storage(name=name, storage=storage))

    tables = {name.partition(".")[0]: _load(name) for name in parquet_list}

    for name in optional_list:
        present = asyncio.run(storage.has(name))
        tables[name.partition(".")[0]] = _load(name) if present else None

    return tables
|
51 |
+
|
52 |
+
|
53 |
+
def run_global_search(
    config_filepath: Optional[str],
    data_dir: Optional[str],
    root_dir: str,
    community_level: int,
    response_type: str,
    streaming: bool,
    query: str,
    progress_placeholder,
) -> Tuple[str, dict]:
    """Perform a global search with a given query.

    Args:
        config_filepath: Optional config file passed to ``load_config``.
        data_dir: Optional override for the configured storage base dir.
        root_dir: Project root used to load config and storage.
        community_level: Forwarded to the graphrag search API.
        response_type: Forwarded to the graphrag search API.
        streaming: If True, chunks are rendered into *progress_placeholder*
            as they arrive.
        query: The user's query text.
        progress_placeholder: Streamlit placeholder used for incremental
            output and error display.

    Returns:
        ``(response, context_data)``. If the search itself raises, the error
        is shown in the placeholder and ``("", {})`` is returned.
    """
    root = Path(root_dir).resolve()
    config = load_config(root, config_filepath)
    reporter = StreamlitProgressReporter(progress_placeholder)

    config.storage.base_dir = data_dir or config.storage.base_dir
    resolve_paths(config)

    dataframe_dict = _resolve_parquet_files(
        root_dir=root_dir,
        config=config,
        parquet_list=[
            "create_final_nodes.parquet",
            "create_final_entities.parquet",
            "create_final_community_reports.parquet",
        ],
        optional_list=[],
    )

    final_nodes: pd.DataFrame = dataframe_dict["create_final_nodes"]
    final_entities: pd.DataFrame = dataframe_dict["create_final_entities"]
    final_community_reports: pd.DataFrame = dataframe_dict[
        "create_final_community_reports"
    ]

    if streaming:

        async def run_streaming_search():
            """Consume the stream; return (text, context) or None on error."""
            full_response = ""
            context_data = None
            get_context_data = True
            try:
                async for stream_chunk in api.global_search_streaming(
                    config=config,
                    nodes=final_nodes,
                    entities=final_entities,
                    community_reports=final_community_reports,
                    community_level=community_level,
                    response_type=response_type,
                    query=query,
                ):
                    # The first chunk is stored as the context data; every
                    # later chunk is appended to the response text.
                    if get_context_data:
                        context_data = stream_chunk
                        get_context_data = False
                    else:
                        full_response += stream_chunk
                        progress_placeholder.markdown(full_response)
            except Exception as e:
                progress_placeholder.error(f"Error during streaming search: {e}")
                # A single None, not a (None, None) tuple, so the fallback
                # check below actually triggers.
                return None

            return full_response, context_data

        result = asyncio.run(run_streaming_search())
        if result is None:
            return "", {}  # Graceful fallback
        return result

    # Non-streaming logic
    try:
        response, context_data = asyncio.run(
            api.global_search(
                config=config,
                nodes=final_nodes,
                entities=final_entities,
                community_reports=final_community_reports,
                community_level=community_level,
                response_type=response_type,
                query=query,
            )
        )
        reporter.success(f"Global Search Response:\n{response}")
        return response, context_data
    except Exception as e:
        progress_placeholder.error(f"Error during global search: {e}")
        return "", {}  # Graceful fallback
|
139 |
+
|
140 |
+
|
141 |
+
def run_local_search(
    config_filepath: Optional[str],
    data_dir: Optional[str],
    root_dir: str,
    community_level: int,
    response_type: str,
    streaming: bool,
    query: str,
    progress_placeholder,
) -> Tuple[str, dict]:
    """Run a local search for *query* and return ``(response, context_data)``.

    With *streaming* enabled, the response text is rendered into
    *progress_placeholder* as chunks arrive. Otherwise the finished response
    is reported through the progress reporter. Exceptions from the search
    are not caught here.
    """
    root = Path(root_dir).resolve()
    config = load_config(root, config_filepath)
    reporter = StreamlitProgressReporter(progress_placeholder)

    config.storage.base_dir = data_dir or config.storage.base_dir
    resolve_paths(config)

    tables = _resolve_parquet_files(
        root_dir=root_dir,
        config=config,
        parquet_list=[
            "create_final_nodes.parquet",
            "create_final_community_reports.parquet",
            "create_final_text_units.parquet",
            "create_final_relationships.parquet",
            "create_final_entities.parquet",
        ],
        optional_list=["create_final_covariates.parquet"],
    )

    # Both API entry points take the same keyword arguments.
    search_kwargs = {
        "config": config,
        "nodes": tables["create_final_nodes"],
        "entities": tables["create_final_entities"],
        "community_reports": tables["create_final_community_reports"],
        "text_units": tables["create_final_text_units"],
        "relationships": tables["create_final_relationships"],
        # None when the optional covariates table is absent.
        "covariates": tables["create_final_covariates"],
        "community_level": community_level,
        "response_type": response_type,
        "query": query,
    }

    if not streaming:
        response, context_data = asyncio.run(api.local_search(**search_kwargs))
        reporter.success(f"Local Search Response:\n{response}")
        return response, context_data

    async def _collect_stream():
        """Consume the stream, rendering text as it grows."""
        context_data = None
        text_so_far = ""
        is_first_chunk = True
        async for chunk in api.local_search_streaming(**search_kwargs):
            if is_first_chunk:
                # The first chunk is stored as the context data.
                context_data = chunk
                is_first_chunk = False
                continue
            text_so_far += chunk
            progress_placeholder.markdown(text_so_far)
        return text_so_far, context_data

    return asyncio.run(_collect_stream())
|
225 |
+
|
226 |
+
|
227 |
+
def run_drift_search(
    config_filepath: Optional[str],
    data_dir: Optional[str],
    root_dir: str,
    community_level: int,
    response_type: str,
    streaming: bool,
    query: str,
    progress_placeholder,
) -> Tuple[str, dict]:
    """Run a DRIFT search for *query* and return ``(response, context_data)``.

    *response_type* is accepted for signature parity with the other search
    functions but is not passed on. If *streaming* is requested, a warning
    is shown and the standard (non-streaming) search is used.
    """
    config = load_config(Path(root_dir).resolve(), config_filepath)
    reporter = StreamlitProgressReporter(progress_placeholder)

    config.storage.base_dir = data_dir or config.storage.base_dir
    resolve_paths(config)

    required_tables = [
        "create_final_nodes.parquet",
        "create_final_entities.parquet",
        "create_final_community_reports.parquet",
        "create_final_text_units.parquet",
        "create_final_relationships.parquet",
    ]
    tables = _resolve_parquet_files(
        root_dir=root_dir,
        config=config,
        parquet_list=required_tables,
        optional_list=[],  # Remove covariates as it's not supported
    )

    nodes_df: pd.DataFrame = tables["create_final_nodes"]
    entities_df: pd.DataFrame = tables["create_final_entities"]
    reports_df: pd.DataFrame = tables["create_final_community_reports"]
    text_units_df: pd.DataFrame = tables["create_final_text_units"]
    relationships_df: pd.DataFrame = tables["create_final_relationships"]

    # Note: DRIFT search doesn't support streaming
    if streaming:
        progress_placeholder.warning(
            "Streaming is not supported for DRIFT search. Using standard search instead."
        )

    search = api.drift_search(
        config=config,
        nodes=nodes_df,
        entities=entities_df,
        community_reports=reports_df,
        text_units=text_units_df,
        relationships=relationships_df,
        community_level=community_level,
        query=query,
    )
    response, context_data = asyncio.run(search)
    reporter.success(f"DRIFT Search Response:\n{response}")
    return response, context_data
|
settings.yaml
ADDED
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
encoding_model: cl100k_base
|
2 |
+
skip_workflows: []
|
3 |
+
llm:
|
4 |
+
api_key: ${GRAPHRAG_API_KEY}
|
5 |
+
type: openai_chat
|
6 |
+
model: gpt-4o-mini
|
7 |
+
model_supports_json: true
|
8 |
+
max_tokens: 4000
|
9 |
+
temperature: 0
|
10 |
+
|
11 |
+
embeddings:
|
12 |
+
async_mode: threaded
|
13 |
+
batch_size: 16
|
14 |
+
vector_store:
|
15 |
+
type: lancedb
|
16 |
+
db_uri: 'output/lancedb'
|
17 |
+
container_name: default
|
18 |
+
overwrite: true
|
19 |
+
llm:
|
20 |
+
api_key: ${GRAPHRAG_API_KEY}
|
21 |
+
type: openai_embedding
|
22 |
+
model: text-embedding-3-small
|
23 |
+
|
24 |
+
chunks:
|
25 |
+
size: 500
|
26 |
+
overlap: 50
|
27 |
+
group_by_columns: [id]
|
28 |
+
|
29 |
+
input:
|
30 |
+
type: file
|
31 |
+
file_type: text
|
32 |
+
base_dir: "input"
|
33 |
+
file_pattern: ".*\\.txt$"
|
34 |
+
recursive: true
|
35 |
+
source_tracking: true
|
36 |
+
processing_order:
|
37 |
+
- path: "first_paragraphs"
|
38 |
+
priority: 1
|
39 |
+
purpose: "graph_building"
|
40 |
+
- path: "full_documents"
|
41 |
+
priority: 2
|
42 |
+
purpose: "retrieval"
|
43 |
+
|
44 |
+
entity_extraction:
|
45 |
+
prompt: "prompts/entity_extraction.txt"
|
46 |
+
entity_types:
|
47 |
+
- "Baggage Type"
|
48 |
+
- "Dimension"
|
49 |
+
- "Linear Dimension"
|
50 |
+
- "Weight"
|
51 |
+
- "Material Type"
|
52 |
+
- "Wheel Configuration"
|
53 |
+
- "Measurement Unit"
|
54 |
+
- "Size Category"
|
55 |
+
- "Weight Category"
|
56 |
+
- "Airline"
|
57 |
+
- "Alliance"
|
58 |
+
- "Airport"
|
59 |
+
- "Route Type"
|
60 |
+
- "Travel Class"
|
61 |
+
- "Cabin Section"
|
62 |
+
- "Aircraft Type"
|
63 |
+
- "Restriction"
|
64 |
+
- "Exemption"
|
65 |
+
- "Policy"
|
66 |
+
- "Fee Structure"
|
67 |
+
- "Currency"
|
68 |
+
- "Allowance"
|
69 |
+
- "Special Item"
|
70 |
+
- "Prohibited Item"
|
71 |
+
- "Restricted Item"
|
72 |
+
- "Dangerous Good"
|
73 |
+
- "Fragile Item"
|
74 |
+
- "Valuable Item"
|
75 |
+
- "Required Document"
|
76 |
+
- "Label Type"
|
77 |
+
- "Tag Category"
|
78 |
+
- "Service Type"
|
79 |
+
- "Handler Role"
|
80 |
+
- "Service Location"
|
81 |
+
- "Time Period"
|
82 |
+
- "Passenger Type"
|
83 |
+
- "Membership Level"
|
84 |
+
- "Group Category"
|
85 |
+
max_gleanings: 2
|
86 |
+
source_filter: "first_paragraphs"
|
87 |
+
|
88 |
+
claim_extraction:
|
89 |
+
enabled: true
|
90 |
+
claim_types:
|
91 |
+
- "Basic Size Restriction"
|
92 |
+
- "Oversize Condition"
|
93 |
+
- "Weight Limit Standard"
|
94 |
+
- "Overweight Condition"
|
95 |
+
- "Combined Dimension Limit"
|
96 |
+
- "Cabin Storage Requirement"
|
97 |
+
- "Standard Fee"
|
98 |
+
- "Excess Fee"
|
99 |
+
- "Oversize Fee"
|
100 |
+
- "Overweight Fee"
|
101 |
+
- "Special Handling Fee"
|
102 |
+
- "Season Surcharge"
|
103 |
+
- "Route-Specific Fee"
|
104 |
+
- "Multi-Piece Pricing"
|
105 |
+
- "Fee Waiver Condition"
|
106 |
+
- "Basic Allowance"
|
107 |
+
- "Class-Based Allowance"
|
108 |
+
- "Status-Based Allowance"
|
109 |
+
- "Route-Based Allowance"
|
110 |
+
- "Special Group Allowance"
|
111 |
+
- "Seasonal Allowance"
|
112 |
+
- "Equipment Allowance"
|
113 |
+
- "Prohibited Item Policy"
|
114 |
+
- "Restricted Item Condition"
|
115 |
+
- "Dangerous Goods Policy"
|
116 |
+
- "Special Item Restriction"
|
117 |
+
- "Packaging Requirement"
|
118 |
+
- "Declaration Requirement"
|
119 |
+
- "Check-in Deadline"
|
120 |
+
- "Special Handling Procedure"
|
121 |
+
- "Priority Handling Rule"
|
122 |
+
- "Transfer Handling Policy"
|
123 |
+
- "Delivery Service Policy"
|
124 |
+
- "Storage Policy"
|
125 |
+
- "Liability Limit"
|
126 |
+
- "Insurance Requirement"
|
127 |
+
- "Claim Procedure"
|
128 |
+
- "Compensation Policy"
|
129 |
+
- "Time Limit Policy"
|
130 |
+
- "Weather Restriction"
|
131 |
+
- "Seasonal Restriction"
|
132 |
+
- "Aircraft Limitation"
|
133 |
+
- "Route Restriction"
|
134 |
+
- "Connection Impact"
|
135 |
+
- "Tag Requirement"
|
136 |
+
- "Label Requirement"
|
137 |
+
- "Documentation Requirement"
|
138 |
+
- "Declaration Policy"
|
139 |
+
- "Handling Standard"
|
140 |
+
- "Service Level Agreement"
|
141 |
+
- "Priority Service Standard"
|
142 |
+
- "Delivery Time Standard"
|
143 |
+
- "Medical Exception"
|
144 |
+
- "Military Exception"
|
145 |
+
- "Diplomatic Exception"
|
146 |
+
- "Event Exception"
|
147 |
+
- "Emergency Exception"
|
148 |
+
prompt: "prompts/claim_extraction.txt"
|
149 |
+
description: "Extract baggage measurements, weight limits, and restrictions from airline documentation."
|
150 |
+
max_gleanings: 2
|
151 |
+
source_filter: "first_paragraphs"
|
152 |
+
|
153 |
+
local_search:
|
154 |
+
text_unit_prop: 0.7
|
155 |
+
community_prop: 0.3
|
156 |
+
top_k_mapped_entities: 15
|
157 |
+
top_k_relationships: 15
|
158 |
+
max_tokens: 4000
|
159 |
+
source_priority:
|
160 |
+
graph_search: "first_paragraphs"
|
161 |
+
answer_retrieval: "full_documents"
|
162 |
+
|
163 |
+
global_search:
|
164 |
+
max_tokens: 4000
|
165 |
+
data_max_tokens: 4000
|
166 |
+
map_max_tokens: 1000
|
167 |
+
reduce_max_tokens: 2000
|
168 |
+
allow_general_knowledge: false
|
169 |
+
min_score_threshold: 0.1
|
170 |
+
concurrency: 10
|
171 |
+
|
172 |
+
embed_graph:
|
173 |
+
enabled: true
|
174 |
+
num_walks: 100
|
175 |
+
walk_length: 10
|
176 |
+
window_size: 5
|
177 |
+
iterations: 10
|
178 |
+
|
179 |
+
umap:
|
180 |
+
enabled: true
|
181 |
+
n_neighbors: 15
|
182 |
+
min_dist: 0.1
|
183 |
+
n_components: 2
|
184 |
+
|
185 |
+
storage:
|
186 |
+
type: file
|
187 |
+
base_dir: "output"
|
188 |
+
|
189 |
+
cache:
|
190 |
+
type: file
|
191 |
+
base_dir: "cache"
|
192 |
+
|
193 |
+
reporting:
|
194 |
+
type: file
|
195 |
+
base_dir: "reports"
|
196 |
+
include_source_tracking: true
|
styles.css
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/* Container for all messages */
|
2 |
+
.chat-container {
|
3 |
+
display: flex;
|
4 |
+
flex-direction: column;
|
5 |
+
gap: 1rem;
|
6 |
+
padding: 1rem;
|
7 |
+
}
|
8 |
+
|
9 |
+
/* Message wrapper with avatar support */
|
10 |
+
.chat-message {
|
11 |
+
display: flex;
|
12 |
+
align-items: flex-start;
|
13 |
+
gap: 0.5rem;
|
14 |
+
width: 100%;
|
15 |
+
max-width: 900px;
|
16 |
+
margin: 0.5rem 0;
|
17 |
+
}
|
18 |
+
|
19 |
+
/* Avatar container */
|
20 |
+
.avatar {
|
21 |
+
width: 40px;
|
22 |
+
height: 40px;
|
23 |
+
border-radius: 50%;
|
24 |
+
overflow: hidden;
|
25 |
+
flex-shrink: 0;
|
26 |
+
}
|
27 |
+
|
28 |
+
.avatar img {
|
29 |
+
width: 100%;
|
30 |
+
height: 100%;
|
31 |
+
object-fit: cover;
|
32 |
+
}
|
33 |
+
|
34 |
+
/* Message content wrapper */
|
35 |
+
.message-content-wrapper {
|
36 |
+
display: flex;
|
37 |
+
flex-direction: column;
|
38 |
+
max-width: 80%;
|
39 |
+
}
|
40 |
+
|
41 |
+
/* Message bubble */
|
42 |
+
.message-bubble {
|
43 |
+
padding: 1rem;
|
44 |
+
border-radius: 0.5rem;
|
45 |
+
margin: 0.2rem 0;
|
46 |
+
}
|
47 |
+
|
48 |
+
/* User message specific styling */
|
49 |
+
.user-message {
|
50 |
+
flex-direction: row-reverse;
|
51 |
+
}
|
52 |
+
|
53 |
+
.user-message .message-bubble {
|
54 |
+
background-color: #2b313e;
|
55 |
+
border-top-right-radius: 0;
|
56 |
+
color: white;
|
57 |
+
}
|
58 |
+
|
59 |
+
/* Assistant message specific styling */
|
60 |
+
.assistant-message .message-bubble {
|
61 |
+
background-color: #343741;
|
62 |
+
border-top-left-radius: 0;
|
63 |
+
color: white;
|
64 |
+
}
|
65 |
+
|
66 |
+
/* Message content */
|
67 |
+
.message-content {
|
68 |
+
word-wrap: break-word;
|
69 |
+
}
|
70 |
+
|
71 |
+
/* Remove default streamlit margins */
|
72 |
+
.stMarkdown {
|
73 |
+
margin: 0 !important;
|
74 |
+
}
|
75 |
+
|
76 |
+
/* Style for code blocks within messages */
|
77 |
+
.message-content pre {
|
78 |
+
background-color: #1e1e1e;
|
79 |
+
padding: 0.5rem;
|
80 |
+
border-radius: 0.3rem;
|
81 |
+
margin: 0.5rem 0;
|
82 |
+
overflow-x: auto;
|
83 |
+
}
|
84 |
+
|
85 |
+
/* Improved loading spinner visibility */
|
86 |
+
.stSpinner {
|
87 |
+
text-align: center;
|
88 |
+
margin: 1rem 0;
|
89 |
+
}
|
90 |
+
|
91 |
+
/* Time stamp styling */
|
92 |
+
.timestamp {
|
93 |
+
font-size: 0.8em;
|
94 |
+
color: #999;
|
95 |
+
margin: 0.2rem 0;
|
96 |
+
}
|
97 |
+
|
98 |
+
.logo-container {
|
99 |
+
display: flex;
|
100 |
+
/* Enable flexbox layout */
|
101 |
+
align-items: center;
|
102 |
+
/* Vertically center-align items */
|
103 |
+
padding: 10px 0;
|
104 |
+
/* Add padding top/bottom */
|
105 |
+
}
|
106 |
+
|
107 |
+
.logo-image {
|
108 |
+
max-width: 110px;
|
109 |
+
/* Set maximum width */
|
110 |
+
height: auto;
|
111 |
+
/* Maintain aspect ratio */
|
112 |
+
}
|
wiki.py
ADDED
@@ -0,0 +1,965 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# wiki.py
|
2 |
+
# import streamlit_mermaid as stmd
|
3 |
+
import streamlit.components.v1 as components
|
4 |
+
import streamlit as st
|
5 |
+
from streamlit.components.v1 import html
|
6 |
+
|
7 |
+
|
8 |
+
def mermaid(code: str, height: int = 600) -> None:
    """Render a Mermaid diagram through a Streamlit HTML component.

    Args:
        code: Mermaid diagram source; it is inserted verbatim inside a
            ``<pre class="mermaid">`` element.
        height: Pixel height applied to the wrapping ``<div>`` and also
            passed as ``height`` to ``components.html``.
    """
    # Doubled braces in the f-string produce the literal ``{``/``}`` of the
    # JavaScript object handed to ``mermaid.initialize``.
    markup = f"""
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<div style="height: {height}px">
<pre class="mermaid">
{code}
</pre>
</div>
<script type="module">
import mermaid from 'https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.esm.min.mjs';
mermaid.initialize({{ startOnLoad: true }});
</script>
"""
    components.html(markup, height=height)
|
24 |
+
|
25 |
+
|
26 |
+
def render_wiki_tab():
|
27 |
+
"""Render the Wiki tab content."""
|
28 |
+
st.header("Overview")
|
29 |
+
|
30 |
+
st.markdown(
|
31 |
+
"""
|
32 |
+
This documentation details the process I followed to complete the assignment: using GraphRAG to index the first paragraphs of seven documents, embed the full documents, perform the initial search on the graph built from the first paragraphs, and retrieve answers from the full document content.
|
33 |
+
""")
|
34 |
+
st.markdown(
|
35 |
+
"""
|
36 |
+
This project implements a specialized document processing and querying system using GraphRAG for El Al baggage requirements/allowance documentation. The system processes first paragraphs separately from full documents, enabling graph-based search while maintaining comprehensive answer retrieval capabilities.
|
37 |
+
|
38 |
+
"""
|
39 |
+
)
|
40 |
+
|
41 |
+
st.markdown(
|
42 |
+
"""
|
43 |
+
### Implementation Process
|
44 |
+
|
45 |
+
Initially, I attempted to implement this using separate processing paths for first paragraphs and full documents, but I discovered a more elegant solution through GraphRAG's source tracking and processing order capabilities. Instead of maintaining separate indexes, I configured a unified approach where documents were processed together but with clear priorities and purposes.
|
46 |
+
|
47 |
+
I set up the configuration to treat first paragraphs with priority 1 for graph building and full documents with priority 2 for retrieval. This was achieved through careful configuration of source tracking, processing order, and source filters in the `settings.yaml` file, which allowed me to maintain the separation of concerns.
|
48 |
+
"""
|
49 |
+
)
|
50 |
+
|
51 |
+
st.markdown(
|
52 |
+
"""
|
53 |
+
### Final Implementation
|
54 |
+
|
55 |
+
The final implementation proved successful, creating a knowledge graph from the first paragraphs while maintaining access to full document content for comprehensive answers. I used entity types specific to airline baggage policies (like **Baggage Type**, **Dimension**, **Weight**) and configured claim extraction to focus on relevant restrictions and allowances.
|
56 |
+
|
57 |
+
"""
|
58 |
+
)
|
59 |
+
|
60 |
+
st.markdown(
|
61 |
+
"""
|
62 |
+
### Using the Chat Application
|
63 |
+
|
64 |
+
The chat application provides an interactive interface to query the GraphRAG system. Here's how it works:
|
65 |
+
|
66 |
+
##### Getting Started:
|
67 |
+
- **Step 1**: Click on the chat tab.
|
68 |
+
- **Step 2**: Choose the desired search type from the sidebar:
|
69 |
+
- **Local Search**: Focuses on specific text chunks and direct relationships in the graph.
|
70 |
+
- **Global Search**: Analyzes the entire dataset at a high level using community summaries.
|
71 |
+
- **DRIFT Search**: Combines local and global search for complex queries requiring both detailed and contextual answers.
|
72 |
+
|
73 |
+
##### Submitting a Query:
|
74 |
+
- Enter your question in the input field at the bottom of the chat interface.
|
75 |
+
- Depending on the selected search type, the system will:
|
76 |
+
- Use the graph for initial navigation.
|
77 |
+
- Retrieve answers from full documents for comprehensive responses.
|
78 |
+
|
79 |
+
##### Viewing Results:
|
80 |
+
- The assistant's response appears in the chat window, formatted for clarity.
|
81 |
+
|
82 |
+
##### Key Features:
|
83 |
+
- **Streaming Responses**: Responses are displayed in real-time for supported search types.
|
84 |
+
- **Session History**: Previous queries and responses are retained within the session for reference.
|
85 |
+
|
86 |
+
##### Example Queries:
|
87 |
+
- "What are the liquid restrictions for carry-on bags?"
|
88 |
+
- "How do pet carrier size restrictions compare to regular carry-on limits?"
|
89 |
+
"""
|
90 |
+
)
|
91 |
+
|
92 |
+
with st.expander("Architecture", expanded=False):
|
93 |
+
st.markdown(
|
94 |
+
"""
|
95 |
+
The architecture of the system is designed to process data through multiple stages, including input preparation, processing, and search functionalities. Below is a detailed diagram illustrating the workflow of the system:
|
96 |
+
"""
|
97 |
+
)
|
98 |
+
|
99 |
+
mermaid_code = """
|
100 |
+
%%{init: {'theme': 'base', 'themeVariables': {'primaryColor': '#1E90FF', 'edgeLabelBackground': '#FFFFFF', 'secondaryColor': '#F0F8FF', 'tertiaryColor': '#FFFFFF', 'primaryTextColor': '#000000'}}}%%
|
101 |
+
graph TD
|
102 |
+
subgraph Input
|
103 |
+
FP[First Paragraphs] --> P[Processing]
|
104 |
+
FD[Full Documents] --> P
|
105 |
+
end
|
106 |
+
|
107 |
+
subgraph Processing
|
108 |
+
P --> IE[Entity Extraction]
|
109 |
+
P --> CD[Community Detection]
|
110 |
+
P --> E[Embeddings Generation]
|
111 |
+
|
112 |
+
IE --> G[Graph Construction]
|
113 |
+
CD --> G
|
114 |
+
E --> VS[Vector Store]
|
115 |
+
end
|
116 |
+
|
117 |
+
subgraph Search
|
118 |
+
Q[Query] --> DS[DRIFT Search]
|
119 |
+
DS --> GS[Graph Search]
|
120 |
+
DS --> FR[Full Retrieval]
|
121 |
+
GS --> VS
|
122 |
+
FR --> VS
|
123 |
+
GS --> A[Answer Generation]
|
124 |
+
FR --> A
|
125 |
+
end
|
126 |
+
"""
|
127 |
+
mermaid(mermaid_code, height=600)
|
128 |
+
|
129 |
+
with st.expander("Graph Analysis", expanded=False):
|
130 |
+
st.markdown("### System Components Breakdown:")
|
131 |
+
|
132 |
+
mermaid_code = """
|
133 |
+
pie
|
134 |
+
title "System Components"
|
135 |
+
"Documents" : 14
|
136 |
+
"Text Units" : 36
|
137 |
+
"Entities" : 315
|
138 |
+
"Relationships" : 372
|
139 |
+
"Communities" : 66
|
140 |
+
"""
|
141 |
+
mermaid(mermaid_code, height=500)
|
142 |
+
|
143 |
+
# Description and graph statistics
|
144 |
+
st.markdown(
|
145 |
+
"""
|
146 |
+
### Knowledge Graph Visualization
|
147 |
+
|
148 |
+
The graph displayed below represents the relationships between various entities extracted from the input data. Nodes in the graph correspond to entities like "Documents," "Policies," and "Restrictions," while edges represent the relationships or connections between these entities. The graph is constructed using the extracted entities and relationships, processed through NetworkX, and visualized with Pyvis.
|
149 |
+
|
150 |
+
**Process of Creation**:
|
151 |
+
- **Data Preparation**: Entities and relationships are extracted and saved as `create_final_nodes.parquet` and `create_final_relationships.parquet` files, respectively.
|
152 |
+
- **Graph Construction**: Using NetworkX, nodes and edges are added based on the extracted data.
|
153 |
+
- **Visualization**: Pyvis is used to create an interactive visualization with options like physics-based layout, node grouping, and hover effects.
|
154 |
+
|
155 |
+
The resulting graph provides insights into the data's structure, including:
|
156 |
+
- Node type distribution
|
157 |
+
- Community detection levels
|
158 |
+
- Connectivity patterns
|
159 |
+
|
160 |
+
Explore the graph below to understand the relationships between key entities.
|
161 |
+
"""
|
162 |
+
)
|
163 |
+
|
164 |
+
# Load and display the graph visualization (HTML file)
|
165 |
+
with open("knowledge_graph.html", "r") as f:
|
166 |
+
html_content = f.read()
|
167 |
+
st.components.v1.html(html_content, height=800)
|
168 |
+
|
169 |
+
# Graph statistics
|
170 |
+
st.markdown(
|
171 |
+
"""
|
172 |
+
### Graph Statistics:
|
173 |
+
|
174 |
+
* **Number of nodes:** 427
|
175 |
+
* **Number of edges:** 453
|
176 |
+
|
177 |
+
#### Node Type Distribution:
|
178 |
+
|
179 |
+
| Node Type | Distribution |
|
180 |
+
|-----------------------|--------------|
|
181 |
+
| REQUIRED DOCUMENT | 39 |
|
182 |
+
| SERVICE TYPE | 35 |
|
183 |
+
| POLICY | 30 |
|
184 |
+
| RESTRICTION | 27 |
|
185 |
+
| SPECIAL ITEM | 26 |
|
186 |
+
| PROHIBITED ITEM | 23 |
|
187 |
+
| AIRPORT | 22 |
|
188 |
+
| BAGGAGE TYPE | 21 |
|
189 |
+
| SERVICE LOCATION | 18 |
|
190 |
+
| DANGEROUS GOOD | 14 |
|
191 |
+
| ALLOWANCE | 13 |
|
192 |
+
| GEO | 12 |
|
193 |
+
| MEASUREMENT UNIT | 11 |
|
194 |
+
| FEE STRUCTURE | 10 |
|
195 |
+
| LINEAR DIMENSION | 8 |
|
196 |
+
| TIME PERIOD | 8 |
|
197 |
+
| CABIN SECTION | 8 |
|
198 |
+
| WEIGHT | 8 |
|
199 |
+
| WEIGHT CATEGORY | 7 |
|
200 |
+
| AIRLINE | 7 |
|
201 |
+
| CITY | 7 |
|
202 |
+
| DIMENSION | 6 |
|
203 |
+
| VALUABLE ITEM | 5 |
|
204 |
+
| ROUTE TYPE | 5 |
|
205 |
+
| TRAVEL CLASS | 5 |
|
206 |
+
| ORGANIZATION | 5 |
|
207 |
+
| PASSENGER TYPE | 4 |
|
208 |
+
| RESTRICTED ITEM | 3 |
|
209 |
+
| CURRENCY | 2 |
|
210 |
+
| EXEMPTION | 2 |
|
211 |
+
| LABEL TYPE | 2 |
|
212 |
+
| MATERIAL TYPE | 2 |
|
213 |
+
| CARGO | 2 |
|
214 |
+
| MEMBERSHIP LEVEL | 2 |
|
215 |
+
| AIRCRAFT TYPE | 1 |
|
216 |
+
| REGION | 1 |
|
217 |
+
| COUNTRY | 1 |
|
218 |
+
| SIZE CATEGORY | 1 |
|
219 |
+
| WHEEL CONFIGURATION | 1 |
|
220 |
+
| TAG CATEGORY | 1 |
|
221 |
+
| GROUP CATEGORY | 1 |
|
222 |
+
|
223 |
+
#### Most Connected Nodes:
|
224 |
+
|
225 |
+
| Node | Connections |
|
226 |
+
|--------------------|-------------|
|
227 |
+
| EL AL | 49 |
|
228 |
+
| ANIMAL | 29 |
|
229 |
+
| CHECKED BAGGAGE | 25 |
|
230 |
+
| BAGGAGE | 21 |
|
231 |
+
| PET | 19 |
|
232 |
+
"""
|
233 |
+
)
|
234 |
+
|
235 |
+
with st.expander("Implementation Results", expanded=False):
|
236 |
+
st.markdown(
|
237 |
+
"""
|
238 |
+
### Document Processing
|
239 |
+
|
240 |
+
* **Total Documents**: 14 (7 first paragraphs + 7 full documents)
|
241 |
+
* **Text Units**: 36
|
242 |
+
* **Entities**: 315
|
243 |
+
* **Relationships**: 372
|
244 |
+
* **Communities**: 66 across 4 levels
|
245 |
+
|
246 |
+
### Community Structure
|
247 |
+
|
248 |
+
* **Level 0**: 11 communities
|
249 |
+
* **Level 1**: 44 communities
|
250 |
+
* **Level 2**: 9 communities
|
251 |
+
* **Level 3**: 2 communities
|
252 |
+
"""
|
253 |
+
)
|
254 |
+
|
255 |
+
st.markdown("### System Operation Flow")
|
256 |
+
|
257 |
+
mermaid_code = """
|
258 |
+
sequenceDiagram
|
259 |
+
participant U as User
|
260 |
+
participant Q as Query Engine
|
261 |
+
participant G as Graph Search
|
262 |
+
participant V as Vector Store
|
263 |
+
participant D as Document Retrieval
|
264 |
+
|
265 |
+
U->>Q: Submit Query
|
266 |
+
Q->>G: Search in First Paragraph Graph
|
267 |
+
G->>V: Lookup Relevant Entities
|
268 |
+
V->>D: Retrieve Full Content
|
269 |
+
D->>Q: Return Comprehensive Answer
|
270 |
+
Q->>U: Present Response
|
271 |
+
"""
|
272 |
+
|
273 |
+
mermaid(mermaid_code, height=400)
|
274 |
+
|
275 |
+
with st.expander("Implementation Details", expanded=False):
|
276 |
+
st.markdown(
|
277 |
+
"""
|
278 |
+
The implementation of the system follows a processing pipeline that integrates data from the first paragraphs and full documents, creating a unified structure for efficient querying. Below is the pipeline representation:
|
279 |
+
"""
|
280 |
+
)
|
281 |
+
|
282 |
+
mermaid_code = """
|
283 |
+
flowchart TB
|
284 |
+
subgraph First Paragraphs
|
285 |
+
FP[Load First Paragraphs] --> EP[Extract Entities]
|
286 |
+
EP --> RP[Build Relationships]
|
287 |
+
RP --> CP[Create Communities]
|
288 |
+
end
|
289 |
+
|
290 |
+
subgraph Full Documents
|
291 |
+
FD[Load Full Documents] --> CH[Chunk Documents]
|
292 |
+
CH --> EF[Generate Embeddings]
|
293 |
+
end
|
294 |
+
|
295 |
+
subgraph Integration
|
296 |
+
CP --> VS[(Vector Store)]
|
297 |
+
EF --> VS
|
298 |
+
end
|
299 |
+
|
300 |
+
subgraph Search
|
301 |
+
Q[Query] --> GS[Graph Search]
|
302 |
+
GS --> VS
|
303 |
+
VS --> RD[Retrieve Details]
|
304 |
+
RD --> AG[Answer Generation]
|
305 |
+
end
|
306 |
+
"""
|
307 |
+
mermaid(mermaid_code, height=800)
|
308 |
+
with st.expander("Requirements Fulfillment", expanded=False):
|
309 |
+
st.markdown(
|
310 |
+
"""
|
311 |
+
### Requirements Fulfillment
|
312 |
+
|
313 |
+
**First Paragraph Processing**: ✓
|
314 |
+
* Implemented through `source_filter` and `processing_order`
|
315 |
+
* Verified by entity and relationship extraction
|
316 |
+
|
317 |
+
**Full Document Embeddings**: ✓
|
318 |
+
* Stored in LanceDB
|
319 |
+
* Accessible for comprehensive retrieval
|
320 |
+
|
321 |
+
**Graph-Based Search**: ✓
|
322 |
+
* Communities and relationships established
|
323 |
+
* DRIFT search implemented
|
324 |
+
|
325 |
+
**Complete Answer Retrieval**: ✓
|
326 |
+
* Source priority configuration
|
327 |
+
* Full document content available
|
328 |
+
|
329 |
+
### Performance Metrics
|
330 |
+
|
331 |
+
* **Indexing Time**: 212.44 seconds total
|
332 |
+
* **Graph Density**: 372 relationships among 315 entities
|
333 |
+
* **Community Structure**: 4-level hierarchy
|
334 |
+
* **Vector Store Size**: 3 Lance files for different embedding types
|
335 |
+
"""
|
336 |
+
)
|
337 |
+
|
338 |
+
with st.expander("Achieving the Requirement", expanded=False):
|
339 |
+
st.markdown("### Source-Based Processing Control:")
|
340 |
+
|
341 |
+
st.markdown(
|
342 |
+
"""
|
343 |
+
```yaml
|
344 |
+
input:
|
345 |
+
source_tracking: true
|
346 |
+
processing_order:
|
347 |
+
- path: "first_paragraphs"
|
348 |
+
priority: 1
|
349 |
+
purpose: "graph_building"
|
350 |
+
- path: "full_documents"
|
351 |
+
priority: 2
|
352 |
+
purpose: "retrieval"
|
353 |
+
```
|
354 |
+
"""
|
355 |
+
)
|
356 |
+
st.markdown(
|
357 |
+
"""
|
358 |
+
This configuration ensures that GraphRAG knows which content is for graph building (first paragraphs) and which is for retrieval (full documents). The priority system makes sure first paragraphs are processed first and used primarily for the knowledge graph construction.
|
359 |
+
"""
|
360 |
+
)
|
361 |
+
|
362 |
+
st.markdown("### Targeted Entity and Claim Extraction:")
|
363 |
+
|
364 |
+
st.markdown(
|
365 |
+
"""
|
366 |
+
```yaml
|
367 |
+
entity_extraction:
|
368 |
+
source_filter: "first_paragraphs"
|
369 |
+
max_gleanings: 2
|
370 |
+
|
371 |
+
claim_extraction:
|
372 |
+
source_filter: "first_paragraphs"
|
373 |
+
```
|
374 |
+
"""
|
375 |
+
)
|
376 |
+
st.markdown(
|
377 |
+
"""
|
378 |
+
These filters ensure that the knowledge graph (entities, relationships, and claims) is built only from the first paragraphs. This is crucial because it means our initial search will only traverse the graph built from these first paragraphs, matching the requirement. The `max_gleanings: 2` allows for thorough extraction while maintaining precision.
|
379 |
+
"""
|
380 |
+
)
|
381 |
+
|
382 |
+
st.markdown("### Search Priority and Retrieval Control:")
|
383 |
+
|
384 |
+
st.markdown(
|
385 |
+
"""
|
386 |
+
```yaml
|
387 |
+
local_search:
|
388 |
+
source_priority:
|
389 |
+
graph_search: "first_paragraphs"
|
390 |
+
answer_retrieval: "full_documents"
|
391 |
+
text_unit_prop: 0.7
|
392 |
+
community_prop: 0.3
|
393 |
+
```
|
394 |
+
"""
|
395 |
+
)
|
396 |
+
st.markdown(
|
397 |
+
"""
|
398 |
+
This is where the magic happens - when a query is made, the system first searches using the graph built from first paragraphs (`graph_search: "first_paragraphs"`), but when it needs to construct the answer, it pulls the content from the full documents (`answer_retrieval: "full_documents"`).
|
399 |
+
|
400 |
+
The `text_unit_prop` and `community_prop` proportions ensure we're making good use of both the graph structure and the actual content. Looking at the output files we generated (`create_final_entities.parquet`, `create_final_relationships.parquet`, etc.), we can see this two-phase approach in action: the graph structure is built and stored separately from the full content, but they're linked through the unified vector store in LanceDB, allowing seamless transitions between graph search and content retrieval during query processing.
|
401 |
+
"""
|
402 |
+
)
|
403 |
+
|
404 |
+
with st.expander("Improvements to Make the Graph Creation Process Leaner and Faster", expanded=False):
|
405 |
+
st.markdown("### Optimization of Chunk Size and Overlap:")
|
406 |
+
|
407 |
+
st.markdown(
|
408 |
+
"""
|
409 |
+
```yaml
|
410 |
+
chunks:
|
411 |
+
size: 300 # Reduced from 500
|
412 |
+
overlap: 25 # Reduced from 50
|
413 |
+
group_by_columns: [id]
|
414 |
+
```
|
415 |
+
"""
|
416 |
+
)
|
417 |
+
st.markdown(
|
418 |
+
"""
|
419 |
+
**Rationale**:
|
420 |
+
- Smaller chunks with minimal overlap reduce token usage.
|
421 |
+
- Maintains context while processing fewer tokens per API call.
|
422 |
+
- Especially efficient for first-paragraph processing.
|
423 |
+
"""
|
424 |
+
)
|
425 |
+
|
426 |
+
st.markdown("### Streamline Entity Types and Claims:")
|
427 |
+
|
428 |
+
st.markdown(
|
429 |
+
"""
|
430 |
+
```yaml
|
431 |
+
entity_extraction:
|
432 |
+
entity_types:
|
433 |
+
- "Baggage"
|
434 |
+
- "Restriction"
|
435 |
+
- "Item"
|
436 |
+
max_gleanings: 1 # Reduced from 2
|
437 |
+
|
438 |
+
claim_extraction:
|
439 |
+
enabled: false # Disable unless absolutely necessary
|
440 |
+
```
|
441 |
+
"""
|
442 |
+
)
|
443 |
+
st.markdown(
|
444 |
+
"""
|
445 |
+
**Rationale**:
|
446 |
+
- Fewer entity types mean fewer extraction operations.
|
447 |
+
- Single gleaning pass is often sufficient.
|
448 |
+
- Claims processing is expensive and often redundant.
|
449 |
+
"""
|
450 |
+
)
|
451 |
+
|
452 |
+
st.markdown("### Optimize Graph Embeddings:")
|
453 |
+
|
454 |
+
st.markdown(
|
455 |
+
"""
|
456 |
+
```yaml
|
457 |
+
embed_graph:
|
458 |
+
enabled: true
|
459 |
+
num_walks: 50 # Reduced from 100
|
460 |
+
walk_length: 5 # Reduced from 10
|
461 |
+
window_size: 3 # Reduced from 5
|
462 |
+
iterations: 5 # Reduced from 10
|
463 |
+
```
|
464 |
+
"""
|
465 |
+
)
|
466 |
+
st.markdown(
|
467 |
+
"""
|
468 |
+
**Rationale**:
|
469 |
+
- Fewer random walks still capture essential graph structure.
|
470 |
+
- Shorter walks reduce computation time.
|
471 |
+
- Smaller window size focuses on immediate relationships.
|
472 |
+
"""
|
473 |
+
)
|
474 |
+
|
475 |
+
st.markdown("### Batch Processing and Parallelization:")
|
476 |
+
|
477 |
+
st.markdown(
|
478 |
+
"""
|
479 |
+
```yaml
|
480 |
+
embeddings:
|
481 |
+
async_mode: asyncio # Changed from threaded
|
482 |
+
batch_size: 32 # Increased from 16
|
483 |
+
batch_max_tokens: 8191
|
484 |
+
```
|
485 |
+
"""
|
486 |
+
)
|
487 |
+
st.markdown(
|
488 |
+
"""
|
489 |
+
**Rationale**:
|
490 |
+
- Asyncio performs better than threading for I/O-bound operations.
|
491 |
+
- Larger batch size reduces API calls.
|
492 |
+
- Maximizes throughput within token limits.
|
493 |
+
"""
|
494 |
+
)
|
495 |
+
|
496 |
+
st.markdown("### Community Structure Optimization:")
|
497 |
+
|
498 |
+
st.markdown(
|
499 |
+
"""
|
500 |
+
```yaml
|
501 |
+
cluster_graph:
|
502 |
+
max_cluster_size: 15 # Increased slightly
|
503 |
+
min_cluster_size: 3 # Added parameter
|
504 |
+
|
505 |
+
community_reports:
|
506 |
+
max_input_length: 2000 # Reduced from default
|
507 |
+
max_length: 1000 # Reduced summary length
|
508 |
+
```
|
509 |
+
"""
|
510 |
+
)
|
511 |
+
st.markdown(
|
512 |
+
"""
|
513 |
+
**Rationale**:
|
514 |
+
- Balanced cluster sizes reduce processing overhead.
|
515 |
+
- Shorter community reports still maintain essential information.
|
516 |
+
- Fewer tokens per report means faster processing.
|
517 |
+
"""
|
518 |
+
)
|
519 |
+
|
520 |
+
st.markdown("### Caching and Storage:")
|
521 |
+
|
522 |
+
st.markdown(
|
523 |
+
"""
|
524 |
+
```yaml
|
525 |
+
cache:
|
526 |
+
type: file
|
527 |
+
base_dir: "cache"
|
528 |
+
compression: true # Add compression
|
529 |
+
cache_embeddings: true
|
530 |
+
|
531 |
+
storage:
|
532 |
+
type: file
|
533 |
+
base_dir: "output"
|
534 |
+
compression: true # Add compression
|
535 |
+
```
|
536 |
+
"""
|
537 |
+
)
|
538 |
+
st.markdown(
|
539 |
+
"""
|
540 |
+
**Rationale**:
|
541 |
+
- Compression reduces I/O overhead.
|
542 |
+
- Caching embeddings prevents recomputation.
|
543 |
+
- File-based storage is faster than blob storage for local processing.
|
544 |
+
"""
|
545 |
+
)
|
546 |
+
|
547 |
+
st.markdown("### Disable Non-Essential Features:")
|
548 |
+
|
549 |
+
st.markdown(
|
550 |
+
"""
|
551 |
+
```yaml
|
552 |
+
umap:
|
553 |
+
enabled: false # Disable unless visualization needed
|
554 |
+
|
555 |
+
snapshots:
|
556 |
+
graphml: false
|
557 |
+
raw_entities: false
|
558 |
+
top_level_nodes: false
|
559 |
+
```
|
560 |
+
"""
|
561 |
+
)
|
562 |
+
st.markdown(
|
563 |
+
"""
|
564 |
+
**Rationale**:
|
565 |
+
- UMAP calculation is computationally expensive.
|
566 |
+
- Snapshots are useful for debugging but add overhead.
|
567 |
+
"""
|
568 |
+
)
|
569 |
+
|
570 |
+
st.markdown("### LLM Configuration Optimization:")
|
571 |
+
|
572 |
+
st.markdown(
|
573 |
+
"""
|
574 |
+
```yaml
|
575 |
+
llm:
|
576 |
+
concurrent_requests: 25
|
577 |
+
tokens_per_minute: 150000
|
578 |
+
requests_per_minute: 10000
|
579 |
+
max_retries: 5 # Reduced from 10
|
580 |
+
```
|
581 |
+
"""
|
582 |
+
)
|
583 |
+
st.markdown(
|
584 |
+
"""
|
585 |
+
**Rationale**:
|
586 |
+
- Balanced concurrency prevents rate limiting.
|
587 |
+
- Fewer retries reduce waiting time.
|
588 |
+
- Token and request limits prevent throttling.
|
589 |
+
"""
|
590 |
+
)
|
591 |
+
|
592 |
+
with st.expander("Query Types", expanded=False):
|
593 |
+
st.markdown("### Local Search:")
|
594 |
+
|
595 |
+
st.markdown(
|
596 |
+
"""
|
597 |
+
```yaml
|
598 |
+
local_search:
|
599 |
+
text_unit_prop: 0.7 # Focus on specific text chunks
|
600 |
+
community_prop: 0.3 # Some consideration of community context
|
601 |
+
top_k_mapped_entities: 15
|
602 |
+
source_priority:
|
603 |
+
graph_search: "first_paragraphs"
|
604 |
+
answer_retrieval: "full_documents"
|
605 |
+
```
|
606 |
+
"""
|
607 |
+
)
|
608 |
+
st.markdown(
|
609 |
+
"""
|
610 |
+
**Best when**: Looking for specific baggage rules or restrictions
|
611 |
+
**Example Query**: "What are the liquid restrictions for carry-on bags?"
|
612 |
+
|
613 |
+
**How it works with our data**:
|
614 |
+
- Searches for entities in first paragraphs (like "liquid", "carry-on").
|
615 |
+
- Follows direct relationships in the graph.
|
616 |
+
- Retrieves detailed rules from full documents.
|
617 |
+
|
618 |
+
**Meets requirement?** Yes, but in a limited way - focuses on direct connections.
|
619 |
+
"""
|
620 |
+
)
|
621 |
+
|
622 |
+
st.markdown("### Global Search:")
|
623 |
+
|
624 |
+
st.markdown(
|
625 |
+
"""
|
626 |
+
```yaml
|
627 |
+
global_search:
|
628 |
+
max_tokens: 4000
|
629 |
+
data_max_tokens: 4000
|
630 |
+
min_score_threshold: 0.1
|
631 |
+
allow_general_knowledge: false
|
632 |
+
```
|
633 |
+
"""
|
634 |
+
)
|
635 |
+
st.markdown(
|
636 |
+
"""
|
637 |
+
**Best when**: Understanding overall policies or themes
|
638 |
+
**Example Query**: "What are the main types of baggage restrictions?"
|
639 |
+
|
640 |
+
**How it works with our data**:
|
641 |
+
- Looks at community summaries built from first paragraphs.
|
642 |
+
- Provides broader context about baggage policies.
|
643 |
+
- Pulls supporting details from full documents.
|
644 |
+
|
645 |
+
**Meets requirement?** Partially - good for overview but might miss specific connections.
|
646 |
+
"""
|
647 |
+
)
|
648 |
+
|
649 |
+
st.markdown("### DRIFT Search (Dynamic Reasoning and Inference with Flexible Traversal):")
|
650 |
+
|
651 |
+
st.markdown(
|
652 |
+
"""
|
653 |
+
```yaml
|
654 |
+
local_search:
|
655 |
+
source_priority:
|
656 |
+
graph_search: "first_paragraphs"
|
657 |
+
answer_retrieval: "full_documents"
|
658 |
+
```
|
659 |
+
"""
|
660 |
+
)
|
661 |
+
st.markdown(
|
662 |
+
"""
|
663 |
+
**Best when**: Complex queries requiring both specific details and context
|
664 |
+
**Example Query**: "How do pet carrier size restrictions compare to regular carry-on limits?"
|
665 |
+
|
666 |
+
**How it works with our data**:
|
667 |
+
- Starts with the graph built from first paragraphs to understand relationships between:
|
668 |
+
- Pet carriers
|
669 |
+
- Regular carry-on bags
|
670 |
+
- Size restrictions
|
671 |
+
- Uses community understanding to find related policies.
|
672 |
+
- Retrieves specific details from full documents.
|
673 |
+
|
674 |
+
**Meets requirement?** Yes, most comprehensively.
|
675 |
+
"""
|
676 |
+
)
|
677 |
+
|
678 |
+
st.markdown("### Best Choice for Our Requirement:")
|
679 |
+
st.markdown(
|
680 |
+
"""
|
681 |
+
**DRIFT Search** is the most suitable because:
|
682 |
+
- It naturally implements our two-phase requirement:
|
683 |
+
- Initial search on graph (from first paragraphs).
|
684 |
+
- Answer retrieval from full documents.
|
685 |
+
- It can handle complex queries that need:
|
686 |
+
- Understanding of relationships (from graph).
|
687 |
+
- Specific details (from full documents).
|
688 |
+
- It can dynamically adjust between:
|
689 |
+
- Local search when specific rules are needed.
|
690 |
+
- Global search when context is important.
|
691 |
+
"""
|
692 |
+
)
|
693 |
+
with st.expander("Configuration: full `settings.yaml`", expanded=False):
|
694 |
+
|
695 |
+
st.markdown(
|
696 |
+
"""
|
697 |
+
```yaml
|
698 |
+
# Root configuration for GraphRAG, a system leveraging LLMs for advanced Retrieval Augmented Generation.
|
699 |
+
|
700 |
+
encoding_model: cl100k_base
|
701 |
+
# Specifies the model used for token encoding. The default 'cl100k_base' is common for OpenAI's text models,
|
702 |
+
# determining how text is tokenized into machine-readable units.
|
703 |
+
|
704 |
+
skip_workflows: []
|
705 |
+
# A list of workflows to skip during execution. An empty list means all workflows are executed.
|
706 |
+
|
707 |
+
llm:
|
708 |
+
api_key: ${GRAPHRAG_API_KEY}
|
709 |
+
# Placeholder for the API key, replaced dynamically from environment variables.
|
710 |
+
# Ensures secure API access for LLM queries.
|
711 |
+
|
712 |
+
type: openai_chat
|
713 |
+
# Defines the type of LLM interface used. Here, it connects to OpenAI's chat-based API.
|
714 |
+
|
715 |
+
model: gpt-4o-mini
|
716 |
+
# Specifies the model variant to use.
|
717 |
+
|
718 |
+
model_supports_json: true
|
719 |
+
# Indicates whether the LLM natively supports JSON responses, useful for structured outputs.
|
720 |
+
|
721 |
+
max_tokens: 4000
|
722 |
+
# Maximum number of tokens in the output. Balances performance and context length.
|
723 |
+
|
724 |
+
temperature: 0
|
725 |
+
# Controls randomness in outputs. 0 means deterministic responses, often preferred for accuracy.
|
726 |
+
|
727 |
+
embeddings:
|
728 |
+
async_mode: threaded
|
729 |
+
# Asynchronous embedding computation mode. 'threaded' uses multi-threading for better performance.
|
730 |
+
|
731 |
+
batch_size: 16
|
732 |
+
# Number of data points processed per batch during embedding, balancing speed and resource use.
|
733 |
+
|
734 |
+
vector_store:
|
735 |
+
type: lancedb
|
736 |
+
# Database type used for storing vectorized embeddings. 'lancedb' supports efficient vector operations.
|
737 |
+
|
738 |
+
db_uri: 'output/lancedb'
|
739 |
+
# URI pointing to the database location where embeddings are stored.
|
740 |
+
|
741 |
+
container_name: default
|
742 |
+
# Logical name for the container storing vector data.
|
743 |
+
|
744 |
+
overwrite: true
|
745 |
+
# Whether to overwrite existing vectors. True allows updating the database during reruns.
|
746 |
+
|
747 |
+
llm:
|
748 |
+
api_key: ${GRAPHRAG_API_KEY}
|
749 |
+
type: openai_embedding
|
750 |
+
model: text-embedding-3-small
|
751 |
+
# Dedicated model settings for embedding tasks; a smaller, specialized embedding model is used here.
|
752 |
+
|
753 |
+
chunks:
|
754 |
+
size: 500
|
755 |
+
# Number of tokens per chunk of text. Controls granularity for processing long documents.
|
756 |
+
|
757 |
+
overlap: 50
|
758 |
+
# Overlap between adjacent chunks to ensure continuity in analysis.
|
759 |
+
|
760 |
+
group_by_columns: [id]
|
761 |
+
# Groups data by 'id' before chunking, preserving document boundaries.
|
762 |
+
|
763 |
+
input:
|
764 |
+
type: file
|
765 |
+
file_type: text
|
766 |
+
base_dir: "input"
|
767 |
+
file_pattern: ".*\\.txt$"
|
768 |
+
recursive: true
|
769 |
+
source_tracking: true
|
770 |
+
processing_order:
|
771 |
+
- path: "first_paragraphs"
|
772 |
+
priority: 1
|
773 |
+
purpose: "graph_building"
|
774 |
+
- path: "full_documents"
|
775 |
+
priority: 2
|
776 |
+
purpose: "retrieval"
|
777 |
+
# Specifies the data source for ingestion:
|
778 |
+
# - Input is file-based text.
|
779 |
+
# - Recursively reads files matching the '.txt' pattern from the "input" directory.
|
780 |
+
# - Prioritizes "first_paragraphs" for graph building and full documents for retrieval.
|
781 |
+
|
782 |
+
entity_extraction:
|
783 |
+
prompt: "prompts/entity_extraction.txt"
|
784 |
+
# Path to the custom prompt used for entity extraction tasks.
|
785 |
+
|
786 |
+
entity_types:
|
787 |
+
- "Baggage Type"
|
788 |
+
- "Dimension"
|
789 |
+
- "Linear Dimension"
|
790 |
+
- "Weight"
|
791 |
+
- "Material Type"
|
792 |
+
- "Wheel Configuration"
|
793 |
+
- "Measurement Unit"
|
794 |
+
- "Size Category"
|
795 |
+
- "Weight Category"
|
796 |
+
- "Airline"
|
797 |
+
- "Alliance"
|
798 |
+
- "Airport"
|
799 |
+
- "Route Type"
|
800 |
+
- "Travel Class"
|
801 |
+
- "Cabin Section"
|
802 |
+
- "Aircraft Type"
|
803 |
+
- "Restriction"
|
804 |
+
- "Exemption"
|
805 |
+
- "Policy"
|
806 |
+
- "Fee Structure"
|
807 |
+
- "Currency"
|
808 |
+
- "Allowance"
|
809 |
+
- "Special Item"
|
810 |
+
- "Prohibited Item"
|
811 |
+
- "Restricted Item"
|
812 |
+
- "Dangerous Good"
|
813 |
+
- "Fragile Item"
|
814 |
+
- "Valuable Item"
|
815 |
+
- "Required Document"
|
816 |
+
- "Label Type"
|
817 |
+
- "Tag Category"
|
818 |
+
- "Service Type"
|
819 |
+
- "Handler Role"
|
820 |
+
- "Service Location"
|
821 |
+
- "Time Period"
|
822 |
+
- "Passenger Type"
|
823 |
+
- "Membership Level"
|
824 |
+
- "Group Category"
|
825 |
+
# Defines the types of entities the system should extract.
|
826 |
+
|
827 |
+
max_gleanings: 2
|
828 |
+
# Maximum number of re-processing rounds to refine entity detection.
|
829 |
+
|
830 |
+
source_filter: "first_paragraphs"
|
831 |
+
# Restricts extraction to text from "first_paragraphs", keeping the graph focused on that source.
|
832 |
+
|
833 |
+
claim_extraction:
|
834 |
+
enabled: true
|
835 |
+
# Enables claim extraction, capturing specific conditions or assertions from text.
|
836 |
+
|
837 |
+
claim_types:
|
838 |
+
- "Basic Size Restriction"
|
839 |
+
- "Oversize Condition"
|
840 |
+
- "Weight Limit Standard"
|
841 |
+
- "Overweight Condition"
|
842 |
+
- "Combined Dimension Limit"
|
843 |
+
- "Cabin Storage Requirement"
|
844 |
+
- "Standard Fee"
|
845 |
+
- "Excess Fee"
|
846 |
+
- "Oversize Fee"
|
847 |
+
- "Overweight Fee"
|
848 |
+
- "Special Handling Fee"
|
849 |
+
- "Season Surcharge"
|
850 |
+
- "Route-Specific Fee"
|
851 |
+
- "Multi-Piece Pricing"
|
852 |
+
- "Fee Waiver Condition"
|
853 |
+
- "Basic Allowance"
|
854 |
+
- "Class-Based Allowance"
|
855 |
+
- "Status-Based Allowance"
|
856 |
+
- "Route-Based Allowance"
|
857 |
+
- "Special Group Allowance"
|
858 |
+
- "Seasonal Allowance"
|
859 |
+
- "Equipment Allowance"
|
860 |
+
- "Prohibited Item Policy"
|
861 |
+
- "Restricted Item Condition"
|
862 |
+
- "Dangerous Goods Policy"
|
863 |
+
- "Special Item Restriction"
|
864 |
+
- "Packaging Requirement"
|
865 |
+
- "Declaration Requirement"
|
866 |
+
- "Check-in Deadline"
|
867 |
+
- "Special Handling Procedure"
|
868 |
+
- "Priority Handling Rule"
|
869 |
+
- "Transfer Handling Policy"
|
870 |
+
- "Delivery Service Policy"
|
871 |
+
- "Storage Policy"
|
872 |
+
- "Liability Limit"
|
873 |
+
- "Insurance Requirement"
|
874 |
+
- "Claim Procedure"
|
875 |
+
- "Compensation Policy"
|
876 |
+
- "Time Limit Policy"
|
877 |
+
- "Weather Restriction"
|
878 |
+
- "Seasonal Restriction"
|
879 |
+
- "Aircraft Limitation"
|
880 |
+
- "Route Restriction"
|
881 |
+
- "Connection Impact"
|
882 |
+
- "Tag Requirement"
|
883 |
+
- "Label Requirement"
|
884 |
+
- "Documentation Requirement"
|
885 |
+
- "Declaration Policy"
|
886 |
+
- "Handling Standard"
|
887 |
+
- "Service Level Agreement"
|
888 |
+
- "Priority Service Standard"
|
889 |
+
- "Delivery Time Standard"
|
890 |
+
- "Medical Exception"
|
891 |
+
- "Military Exception"
|
892 |
+
- "Diplomatic Exception"
|
893 |
+
- "Event Exception"
|
894 |
+
- "Emergency Exception"
|
895 |
+
# Types of claims to extract, covering diverse scenarios (e.g., fees, allowances).
|
896 |
+
|
897 |
+
prompt: "prompts/claim_extraction.txt"
|
898 |
+
description: "Extract baggage measurements, weight limits, and restrictions from airline documentation."
|
899 |
+
# Customizes the extraction logic for airline baggage policies.
|
900 |
+
|
901 |
+
max_gleanings: 2
|
902 |
+
source_filter: "first_paragraphs"
|
903 |
+
# Restricts claims to "first_paragraphs", mirroring entity extraction.
|
904 |
+
|
905 |
+
local_search:
|
906 |
+
text_unit_prop: 0.7
|
907 |
+
community_prop: 0.3
|
908 |
+
top_k_mapped_entities: 15
|
909 |
+
top_k_relationships: 15
|
910 |
+
max_tokens: 4000
|
911 |
+
source_priority:
|
912 |
+
graph_search: "first_paragraphs"
|
913 |
+
answer_retrieval: "full_documents"
|
914 |
+
# Configures search behavior:
|
915 |
+
# - Balances searches between individual text units and community-level summaries.
|
916 |
+
# - Limits results to top 15 entities and relationships for relevance.
|
917 |
+
|
918 |
+
global_search:
|
919 |
+
max_tokens: 4000
|
920 |
+
data_max_tokens: 4000
|
921 |
+
map_max_tokens: 1000
|
922 |
+
reduce_max_tokens: 2000
|
923 |
+
allow_general_knowledge: false
|
924 |
+
min_score_threshold: 0.1
|
925 |
+
concurrency: 10
|
926 |
+
# Defines query-wide global search capabilities:
|
927 |
+
# - Token limits for different operations.
|
928 |
+
# - Disallows answers drawn from general knowledge outside the indexed data.
|
929 |
+
# - Handles up to 10 parallel queries.
|
930 |
+
|
931 |
+
embed_graph:
|
932 |
+
enabled: true
|
933 |
+
num_walks: 100
|
934 |
+
walk_length: 10
|
935 |
+
window_size: 5
|
936 |
+
iterations: 10
|
937 |
+
# Enables graph embedding (e.g., for node2vec):
|
938 |
+
# - Generates 100 random walks per node to learn embeddings.
|
939 |
+
|
940 |
+
umap:
|
941 |
+
enabled: true
|
942 |
+
n_neighbors: 15
|
943 |
+
min_dist: 0.1
|
944 |
+
n_components: 2
|
945 |
+
# Configures UMAP for dimensionality reduction and visualization.
|
946 |
+
|
947 |
+
storage:
|
948 |
+
type: file
|
949 |
+
base_dir: "output"
|
950 |
+
# Outputs processed data to local "output" directory.
|
951 |
+
|
952 |
+
cache:
|
953 |
+
type: file
|
954 |
+
base_dir: "cache"
|
955 |
+
# Stores cached intermediate results in the local "cache" directory.
|
956 |
+
|
957 |
+
reporting:
|
958 |
+
type: file
|
959 |
+
base_dir: "reports"
|
960 |
+
include_source_tracking: true
|
961 |
+
# Generates reports, including provenance for traceability.
|
962 |
+
|
963 |
+
```
|
964 |
+
"""
|
965 |
+
)
|