CordwainerSmith committed on
Commit
b96b49a
·
verified ·
1 Parent(s): 90c3fb2

Add project files and Docker setup

Browse files
Files changed (11) hide show
  1. Dockerfile +48 -0
  2. README.md +10 -10
  3. app.py +305 -0
  4. auth.py +39 -0
  5. knowledge_graph.html +0 -0
  6. query_config.yaml +23 -0
  7. requirements.txt +3 -0
  8. search_handlers.py +285 -0
  9. settings.yaml +196 -0
  10. styles.css +112 -0
  11. wiki.py +965 -0
Dockerfile ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Step 1: Use an official Python slim base image
FROM python:3.10-slim

# Step 2: Install system dependencies.
# Clean apt caches and package lists in the same layer so they don't
# persist in the image and bloat its size.
RUN apt-get update && apt-get install -y --no-install-recommends \
    wget \
    tar \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Step 3: Add a non-root user (required by Hugging Face Spaces)
RUN useradd -m -u 1000 user

# Step 4: Switch to the "user" user
USER user

# Step 5: Set home and working directory
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH
WORKDIR $HOME/app

# Step 6: Copy requirements into the container first, so dependency
# installation is cached independently of application-code changes.
COPY --chown=user requirements.txt ./requirements.txt

# Step 7: Install Python dependencies
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Step 8: Copy all necessary files and folders into the container
COPY --chown=user .output ./.output
COPY --chown=user cache ./cache
COPY --chown=user input ./input
COPY --chown=user output ./output
COPY --chown=user prompts ./prompts
COPY --chown=user reports ./reports
COPY --chown=user auth.py ./auth.py
COPY --chown=user knowledge_graph.html ./knowledge_graph.html
COPY --chown=user query_config.yaml ./query_config.yaml
COPY --chown=user app.py ./app.py
COPY --chown=user search_handlers.py ./search_handlers.py
COPY --chown=user settings.yaml ./settings.yaml
COPY --chown=user styles.css ./styles.css
COPY --chown=user wiki.py ./wiki.py

# Step 9: Expose the Streamlit default port (Hugging Face Spaces expects 7860)
EXPOSE 7860

# Step 10: Define the entrypoint command
CMD ["streamlit", "run", "app.py", "--server.port", "7860", "--server.address", "0.0.0.0"]
README.md CHANGED
@@ -1,10 +1,10 @@
1
- ---
2
- title: PwcGraphRAG
3
- emoji: ⚡
4
- colorFrom: purple
5
- colorTo: yellow
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: PwcGraphRAG
3
+ emoji: ⚡
4
+ colorFrom: purple
5
+ colorTo: yellow
6
+ sdk: docker
7
+ pinned: false
8
+ ---
9
+
10
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import asyncio
3
+ import sys
4
+ from pathlib import Path
5
+ import base64
6
+ import pandas as pd
7
+ from typing import Literal, Tuple, Optional
8
+ from wiki import render_wiki_tab
9
+ from search_handlers import run_global_search, run_local_search, run_drift_search
10
+ import auth
11
+
12
+
13
+ import graphrag.api as api
14
+ from graphrag.config import GraphRagConfig, load_config, resolve_paths
15
+ from graphrag.index.create_pipeline_config import create_pipeline_config
16
+ from graphrag.logging import PrintProgressReporter
17
+ from graphrag.utils.storage import _create_storage, _load_table_from_storage
18
+
19
+
20
+ st.set_page_config(page_title="GraphRAG Chat Interface", page_icon="🔍", layout="wide")
21
+
22
+ # Define default avatars at the module level
23
+ DEFAULT_USER_AVATAR = "👤"
24
+ DEFAULT_BOT_AVATAR = "🤖"
25
+
26
+ # Initialize session state for avatars
27
+ if "user_avatar" not in st.session_state:
28
+ st.session_state.user_avatar = DEFAULT_USER_AVATAR
29
+ if "bot_avatar" not in st.session_state:
30
+ st.session_state.bot_avatar = DEFAULT_BOT_AVATAR
31
+
32
+ # Define avatar images
33
+ USER_AVATAR = "👤" # Default user emoji
34
+ BOT_AVATAR = "🤖" # Default bot emoji
35
+
36
+
37
class StreamlitProgressReporter(PrintProgressReporter):
    """Progress reporter that routes success messages to a Streamlit placeholder.

    NOTE(review): an identical class is defined in search_handlers.py —
    consider keeping a single definition and importing it.
    """

    def __init__(self, placeholder):
        # Parent is initialized with an empty prefix; we only keep the placeholder.
        super().__init__("")
        self.placeholder = placeholder

    def success(self, message: str):
        # Render as a Streamlit success banner instead of printing to stdout.
        self.placeholder.success(message)
+
45
+
46
def render_chat_tab():
    """Render the Chat tab content.

    Shows the full message history, accepts a new query from the chat input,
    dispatches it to the configured search backend (global / drift / local),
    appends both the user message and the assistant response to
    ``st.session_state.messages``, then reruns the app to refresh the view.
    """
    format_message_history()

    # Chat input — the walrus binds the submitted text (falsy when nothing sent).
    if prompt := st.chat_input("Enter your query..."):
        # Add user message to history with timestamp
        st.session_state.messages.append(
            {
                "role": "user",
                "content": prompt,
                "timestamp": pd.Timestamp.now().strftime("%H:%M"),
            }
        )

        # Process query — dispatch on the search type chosen in the sidebar.
        with st.spinner("Processing your query..."):
            response_placeholder = st.empty()
            try:
                if st.session_state.search_type == "global":
                    response, context = run_global_search(
                        config_filepath=st.session_state.config_filepath,
                        data_dir=st.session_state.data_dir,
                        root_dir=st.session_state.root_dir,
                        community_level=st.session_state.community_level,
                        response_type=st.session_state.response_type,
                        streaming=st.session_state.streaming,
                        query=prompt,
                        progress_placeholder=response_placeholder,
                    )
                elif st.session_state.search_type == "drift":
                    response, context = run_drift_search(
                        config_filepath=st.session_state.config_filepath,
                        data_dir=st.session_state.data_dir,
                        root_dir=st.session_state.root_dir,
                        community_level=st.session_state.community_level,
                        response_type=st.session_state.response_type,
                        streaming=st.session_state.streaming,
                        query=prompt,
                        progress_placeholder=response_placeholder,
                    )
                else:
                    # Default branch covers "local".
                    response, context = run_local_search(
                        config_filepath=st.session_state.config_filepath,
                        data_dir=st.session_state.data_dir,
                        root_dir=st.session_state.root_dir,
                        community_level=st.session_state.community_level,
                        response_type=st.session_state.response_type,
                        streaming=st.session_state.streaming,
                        query=prompt,
                        progress_placeholder=response_placeholder,
                    )

                # Clear the placeholder before adding the final response
                response_placeholder.empty()

                # Add assistant response to history with timestamp
                st.session_state.messages.append(
                    {
                        "role": "assistant",
                        "content": response,
                        "timestamp": pd.Timestamp.now().strftime("%H:%M"),
                    }
                )

                # Show context in expander
                with st.expander("View Search Context"):
                    st.json(context)

            except Exception as e:
                # Surface failures as an assistant message rather than crashing the tab.
                error_message = f"Error processing query: {str(e)}"
                st.session_state.messages.append(
                    {
                        "role": "assistant",
                        "content": error_message,
                        "timestamp": pd.Timestamp.now().strftime("%H:%M"),
                    }
                )

        # Rerun so the newly appended messages are rendered by
        # format_message_history at the top of this function.
        # NOTE(review): placement inferred from the flattened diff — confirm
        # this sits inside the `if prompt := ...` block in the original file.
        st.rerun()
127
+
128
+
129
def display_message(msg: str, is_user: bool = False, timestamp: str = "") -> None:
    """Display a chat message with avatar and consistent formatting.

    Args:
        msg: Message text (may contain markdown/HTML; injected into the bubble).
        is_user: True for user messages, False for assistant messages.
        timestamp: Pre-formatted "HH:MM" string shown under the bubble.
    """
    # Fix: removed the unused local `role` present in the original.
    message_class = "user-message" if is_user else "assistant-message"
    avatar = st.session_state.user_avatar if is_user else st.session_state.bot_avatar

    message_container = f"""
    <div class="chat-message {message_class}">
        <div class="avatar">
            <div style="font-size: 25px; text-align: center;">{avatar}</div>
        </div>
        <div class="message-content-wrapper">
            <div class="message-bubble">
                <div class="message-content">
                    {msg}
                </div>
            </div>
            <div class="timestamp">{timestamp}</div>
        </div>
    </div>
    """
    # unsafe_allow_html is required to render the custom bubble markup.
    st.markdown(message_container, unsafe_allow_html=True)
151
+
152
+
153
def format_message_history() -> None:
    """Render every stored chat message inside the chat-container div."""
    st.markdown('<div class="chat-container">', unsafe_allow_html=True)
    for entry in st.session_state.messages:
        display_message(
            msg=entry["content"],
            is_user=entry["role"] == "user",
            timestamp=entry.get("timestamp", ""),
        )
    st.markdown("</div>", unsafe_allow_html=True)
164
+
165
+
166
@st.cache_resource
def load_css():
    """Read styles.css once and cache its contents for the app's lifetime.

    Returns:
        str: the raw CSS text, later injected via st.markdown in main().
    """
    # Explicit encoding avoids platform-dependent default-codec surprises.
    with open("styles.css", "r", encoding="utf-8") as f:
        return f.read()
170
+
171
+
172
def initialize_session_state():
    """Seed st.session_state with defaults for any key not yet present."""
    defaults = {
        "messages": [],
        "response_placeholder": None,
        "config_filepath": None,
        "data_dir": None,
        "root_dir": ".",
        "community_level": 2,
        "response_type": "concise",
        "search_type": "global",
        "streaming": True,
        "authenticated": False,
    }
    # Only fill in missing keys so existing user choices survive reruns.
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value
194
+
195
+
196
def main():
    """Entry point: authenticate, then render the sidebar config and tabs."""
    initialize_session_state()

    # Authentication check — gate the whole app behind the login form.
    if not st.session_state.authenticated:
        if auth.check_credentials():
            st.session_state.authenticated = True
            st.rerun()  # Rerun to reflect the authentication state
        else:
            st.stop()  # Stop further execution if authentication fails

    # If authenticated, proceed with the main app
    if st.session_state.authenticated:
        # Main application content (typo fix: "Assigment" -> "Assignment")
        st.title("PWC Home Assignment #1, Graphrag")

        css = load_css()
        st.markdown(f"<style>{css}</style>", unsafe_allow_html=True)

        # Sidebar configuration
        with st.sidebar:
            # Display logos side by side at the top of the sidebar
            col1, col2 = st.columns(2)
            with col1:
                st.markdown(
                    '<div class="logo-container"><img class="logo-image" src="https://nexttech.pwc.co.il/wp-content/uploads/2023/12/image-2.png"></div>',
                    unsafe_allow_html=True,
                )
            with col2:
                st.markdown(
                    '<div class="logo-container"><img class="logo-image" src="https://nexttech.pwc.co.il/wp-content/uploads/2023/12/Frame.png"></div>',
                    unsafe_allow_html=True,
                )

            st.header("Configuration")
            st.session_state.community_level = st.number_input(
                "Community Level",
                min_value=0,
                max_value=10,
                value=st.session_state.community_level,
                help="Controls the granularity of the search...",
            )

            # Only show response type for global and local search
            if st.session_state.search_type != "drift":
                st.session_state.response_type = st.selectbox(
                    "Response Type",
                    options=["concise", "detailed"],
                    index=0 if st.session_state.response_type == "concise" else 1,
                    help="Style of response generation",
                )

            st.session_state.search_type = st.selectbox(
                "Search Type",
                options=["global", "local", "drift"],
                index=(
                    0
                    if st.session_state.search_type == "global"
                    else 1 if st.session_state.search_type == "local" else 2
                ),
                help="""Search Types:
- Local Search: "Focuses on finding specific information by searching through direct connections in the knowledge graph. Best for precise, fact-based queries."
- Global Search: "Analyzes the entire document collection at a high level using community summaries. Best for understanding broad themes and general policies."
- DRIFT Search: "Combines local and global search capabilities, dynamically exploring connections while gathering detailed information. Best for complex queries requiring both specific details and broader context."
""",
            )

            # Show streaming option only for supported search types
            if st.session_state.search_type != "drift":
                st.session_state.streaming = st.checkbox(
                    "Enable Streaming",
                    value=st.session_state.streaming,
                    help="Stream response tokens as they're generated",
                )
            else:
                st.session_state.streaming = False
                st.info("Streaming is not available for DRIFT search")

            # Logout button — wipe all state and force a fresh start.
            if st.button("Logout"):
                st.session_state.clear()  # Clear all session state data
                initialize_session_state()  # Reinitialize the session state
                st.query_params = {"restart": "true"}  # Refresh the UI
                st.rerun()

        # Create tabs
        tab1, tab2 = st.tabs(["Assignment Documentation", "Chat"])

        # readme tab content
        with tab1:
            render_wiki_tab()

        # Chat tab content
        with tab2:
            render_chat_tab()

        # Footer pinned to the bottom of the sidebar.
        st.sidebar.markdown(
            """
            <div style="position: absolute; bottom: 0; width: 100%; text-align: center; font-size: 14px; margin-bottom: -200px;">
                Liran Baba |
                <a href="https://linkedin.com/in/liranba" target="_blank">LinkedIn</a> |
                <a href="https://huggingface.co/CordwainerSmith" target="_blank">HuggingFace</a>
            </div>
            """,
            unsafe_allow_html=True,
        )
302
+
303
+
304
# Script entry point: launch the Streamlit app.
if __name__ == "__main__":
    main()
auth.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+
4
+
5
def check_credentials():
    """Handle the login form; return True if authenticated successfully.

    Expected credentials come from the APP_USERNAME / APP_PASSWORD
    environment variables (set via Hugging Face Secrets). Returns True only
    when the user is already authenticated or submits matching credentials.
    """
    import hmac  # stdlib; local import keeps the module's top unchanged

    # Check if already authenticated
    if st.session_state.get("authenticated", False):
        return True  # User is already authenticated

    # Retrieve credentials from environment variables (set via Hugging Face Secrets)
    expected_username = os.environ.get("APP_USERNAME")
    expected_password = os.environ.get("APP_PASSWORD")

    if not expected_username or not expected_password:
        st.error("Server is misconfigured: missing credentials.")
        return False

    # Show the login form only if not authenticated
    with st.form("login_form", clear_on_submit=True):
        st.text_input("Username", key="username")
        st.text_input("Password", type="password", key="password")
        submit_button = st.form_submit_button("Login")

    if submit_button:
        # Security fix: hmac.compare_digest gives a constant-time comparison,
        # preventing the timing side-channel a plain `==` comparison allows.
        # Compare UTF-8 bytes so non-ASCII credentials are handled too.
        username_ok = hmac.compare_digest(
            st.session_state["username"].encode("utf-8"),
            expected_username.encode("utf-8"),
        )
        password_ok = hmac.compare_digest(
            st.session_state["password"].encode("utf-8"),
            expected_password.encode("utf-8"),
        )
        if username_ok and password_ok:
            st.session_state["authenticated"] = True  # Mark user as authenticated
            return True
        else:
            st.error("😕 Incorrect username or password")
            return False  # Indicate failed authentication

    # Return False if login not attempted or failed
    return False
knowledge_graph.html ADDED
The diff for this file is too large to render. See raw diff
 
query_config.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ drift_search:
2
+ max_tokens: 4000
3
+ drift_k_followups: 3
4
+ n_depth: 2
5
+ local_search_text_unit_prop: 0.6
6
+ local_search_community_prop: 0.4
7
+ local_search_top_k_mapped_entities: 10
8
+ local_search_top_k_relationships: 10
9
+
10
+ local_search:
11
+ text_unit_prop: 0.5
12
+ community_prop: 0.3
13
+ conversation_history_max_turns: 5
14
+ top_k_mapped_entities: 10
15
+ top_k_relationships: 10
16
+ max_tokens: 8000
17
+
18
+ global_search:
19
+ max_tokens: 8000
20
+ data_max_tokens: 8000
21
+ map_max_tokens: 1000
22
+ reduce_max_tokens: 2000
23
+ concurrency: 16
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ streamlit==1.40.1
2
+ pandas
3
+ graphrag==0.4.1
search_handlers.py ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ from pathlib import Path
3
+ import pandas as pd
4
+ from typing import Tuple, Optional
5
+ from graphrag.config import GraphRagConfig, load_config, resolve_paths
6
+ from graphrag.index.create_pipeline_config import create_pipeline_config
7
+ from graphrag.logging import PrintProgressReporter
8
+ from graphrag.utils.storage import _create_storage, _load_table_from_storage
9
+ import graphrag.api as api
10
+
11
+
12
class StreamlitProgressReporter(PrintProgressReporter):
    """Progress reporter that routes success messages to a Streamlit placeholder.

    NOTE(review): identical class exists in app.py — consider deduplicating.
    """

    def __init__(self, placeholder):
        # Parent is initialized with an empty prefix; we only keep the placeholder.
        super().__init__("")
        self.placeholder = placeholder

    def success(self, message: str):
        # Render as a Streamlit success banner instead of printing to stdout.
        self.placeholder.success(message)
19
+
20
+
21
def _resolve_parquet_files(
    root_dir: str,
    config: GraphRagConfig,
    parquet_list: list[str],
    optional_list: list[str],
) -> dict[str, pd.DataFrame]:
    """Load the requested parquet tables from pipeline storage into dataframes.

    Required tables are loaded unconditionally; optional tables map to None
    when absent from storage. Keys are the file names minus their extension.
    """
    storage = _create_storage(
        root_dir=root_dir, config=create_pipeline_config(config).storage
    )

    frames: dict[str, pd.DataFrame] = {}

    for name in parquet_list:
        key = name.split(".")[0]
        frames[key] = asyncio.run(
            _load_table_from_storage(name=name, storage=storage)
        )

    for name in optional_list:
        key = name.split(".")[0]
        if asyncio.run(storage.has(name)):
            frames[key] = asyncio.run(
                _load_table_from_storage(name=name, storage=storage)
            )
        else:
            frames[key] = None

    return frames
51
+
52
+
53
def run_global_search(
    config_filepath: Optional[str],
    data_dir: Optional[str],
    root_dir: str,
    community_level: int,
    response_type: str,
    streaming: bool,
    query: str,
    progress_placeholder,
) -> Tuple[str, dict]:
    """Perform a global search with a given query.

    Args:
        config_filepath: Optional explicit path to the GraphRAG config file.
        data_dir: Optional override for the storage base directory.
        root_dir: Project root used to resolve config and storage paths.
        community_level: Granularity of community reports to consult.
        response_type: "concise" or "detailed" response style.
        streaming: When True, stream tokens into progress_placeholder.
        query: The user's natural-language query.
        progress_placeholder: Streamlit placeholder for progress/streamed text.

    Returns:
        Tuple of (response text, context data). Returns ("", {}) on failure.
    """
    root = Path(root_dir).resolve()
    config = load_config(root, config_filepath)
    reporter = StreamlitProgressReporter(progress_placeholder)

    config.storage.base_dir = data_dir or config.storage.base_dir
    resolve_paths(config)

    dataframe_dict = _resolve_parquet_files(
        root_dir=root_dir,
        config=config,
        parquet_list=[
            "create_final_nodes.parquet",
            "create_final_entities.parquet",
            "create_final_community_reports.parquet",
        ],
        optional_list=[],
    )

    final_nodes: pd.DataFrame = dataframe_dict["create_final_nodes"]
    final_entities: pd.DataFrame = dataframe_dict["create_final_entities"]
    final_community_reports: pd.DataFrame = dataframe_dict[
        "create_final_community_reports"
    ]

    if streaming:

        async def run_streaming_search():
            full_response = ""
            context_data = None
            get_context_data = True
            try:
                async for stream_chunk in api.global_search_streaming(
                    config=config,
                    nodes=final_nodes,
                    entities=final_entities,
                    community_reports=final_community_reports,
                    community_level=community_level,
                    response_type=response_type,
                    query=query,
                ):
                    # First chunk carries the context payload; the rest are tokens.
                    if get_context_data:
                        context_data = stream_chunk
                        get_context_data = False
                    else:
                        full_response += stream_chunk
                        progress_placeholder.markdown(full_response)
            except Exception as e:
                progress_placeholder.error(f"Error during streaming search: {e}")
                return None, None

            return full_response, context_data

        result = asyncio.run(run_streaming_search())
        # Bug fix: the coroutine returns the TUPLE (None, None) on error, so
        # the original `if result is None` never fired and callers received
        # None as the response text.
        if result == (None, None):
            return "", {}  # Graceful fallback
        return result

    # Non-streaming logic
    try:
        response, context_data = asyncio.run(
            api.global_search(
                config=config,
                nodes=final_nodes,
                entities=final_entities,
                community_reports=final_community_reports,
                community_level=community_level,
                response_type=response_type,
                query=query,
            )
        )
        reporter.success(f"Global Search Response:\n{response}")
        return response, context_data
    except Exception as e:
        progress_placeholder.error(f"Error during global search: {e}")
        return "", {}  # Graceful fallback
139
+
140
+
141
def run_local_search(
    config_filepath: Optional[str],
    data_dir: Optional[str],
    root_dir: str,
    community_level: int,
    response_type: str,
    streaming: bool,
    query: str,
    progress_placeholder,
) -> Tuple[str, dict]:
    """Perform a local search with a given query.

    Args:
        config_filepath: Optional explicit path to the GraphRAG config file.
        data_dir: Optional override for the storage base directory.
        root_dir: Project root used to resolve config and storage paths.
        community_level: Granularity of community reports to consult.
        response_type: "concise" or "detailed" response style.
        streaming: When True, stream tokens into progress_placeholder.
        query: The user's natural-language query.
        progress_placeholder: Streamlit placeholder for progress/streamed text.

    Returns:
        Tuple of (response text, context data). Returns ("", {}) on failure —
        error handling added for consistency with run_global_search.
    """
    root = Path(root_dir).resolve()
    config = load_config(root, config_filepath)
    reporter = StreamlitProgressReporter(progress_placeholder)

    config.storage.base_dir = data_dir or config.storage.base_dir
    resolve_paths(config)

    dataframe_dict = _resolve_parquet_files(
        root_dir=root_dir,
        config=config,
        parquet_list=[
            "create_final_nodes.parquet",
            "create_final_community_reports.parquet",
            "create_final_text_units.parquet",
            "create_final_relationships.parquet",
            "create_final_entities.parquet",
        ],
        optional_list=["create_final_covariates.parquet"],
    )

    final_nodes: pd.DataFrame = dataframe_dict["create_final_nodes"]
    final_community_reports: pd.DataFrame = dataframe_dict[
        "create_final_community_reports"
    ]
    final_text_units: pd.DataFrame = dataframe_dict["create_final_text_units"]
    final_relationships: pd.DataFrame = dataframe_dict["create_final_relationships"]
    final_entities: pd.DataFrame = dataframe_dict["create_final_entities"]
    # Covariates are optional — None when the parquet is absent.
    final_covariates: Optional[pd.DataFrame] = dataframe_dict["create_final_covariates"]

    if streaming:

        async def run_streaming_search():
            full_response = ""
            context_data = None
            get_context_data = True
            # Robustness fix: the original had no error handling here, unlike
            # run_global_search; a backend failure crashed the chat tab.
            try:
                async for stream_chunk in api.local_search_streaming(
                    config=config,
                    nodes=final_nodes,
                    entities=final_entities,
                    community_reports=final_community_reports,
                    text_units=final_text_units,
                    relationships=final_relationships,
                    covariates=final_covariates,
                    community_level=community_level,
                    response_type=response_type,
                    query=query,
                ):
                    # First chunk carries the context payload; the rest are tokens.
                    if get_context_data:
                        context_data = stream_chunk
                        get_context_data = False
                    else:
                        full_response += stream_chunk
                        progress_placeholder.markdown(full_response)
            except Exception as e:
                progress_placeholder.error(f"Error during streaming search: {e}")
                return None, None

            return full_response, context_data

        result = asyncio.run(run_streaming_search())
        if result == (None, None):
            return "", {}  # Graceful fallback
        return result

    # Non-streaming logic
    try:
        response, context_data = asyncio.run(
            api.local_search(
                config=config,
                nodes=final_nodes,
                entities=final_entities,
                community_reports=final_community_reports,
                text_units=final_text_units,
                relationships=final_relationships,
                covariates=final_covariates,
                community_level=community_level,
                response_type=response_type,
                query=query,
            )
        )
        reporter.success(f"Local Search Response:\n{response}")
        return response, context_data
    except Exception as e:
        progress_placeholder.error(f"Error during local search: {e}")
        return "", {}  # Graceful fallback
225
+
226
+
227
def run_drift_search(
    config_filepath: Optional[str],
    data_dir: Optional[str],
    root_dir: str,
    community_level: int,
    response_type: str,
    streaming: bool,
    query: str,
    progress_placeholder,
) -> Tuple[str, dict]:
    """Perform a DRIFT search with a given query.

    Args:
        config_filepath: Optional explicit path to the GraphRAG config file.
        data_dir: Optional override for the storage base directory.
        root_dir: Project root used to resolve config and storage paths.
        community_level: Granularity of community reports to consult.
        response_type: Accepted for signature parity with the other search
            runners; the DRIFT API call below does not take it.
        streaming: DRIFT does not support streaming; True only triggers a warning.
        query: The user's natural-language query.
        progress_placeholder: Streamlit placeholder for progress messages.

    Returns:
        Tuple of (response text, context data). Returns ("", {}) on failure —
        error handling added for consistency with run_global_search.
    """
    root = Path(root_dir).resolve()
    config = load_config(root, config_filepath)
    reporter = StreamlitProgressReporter(progress_placeholder)

    config.storage.base_dir = data_dir or config.storage.base_dir
    resolve_paths(config)

    dataframe_dict = _resolve_parquet_files(
        root_dir=root_dir,
        config=config,
        parquet_list=[
            "create_final_nodes.parquet",
            "create_final_entities.parquet",
            "create_final_community_reports.parquet",
            "create_final_text_units.parquet",
            "create_final_relationships.parquet",
        ],
        optional_list=[],  # Remove covariates as it's not supported
    )

    final_nodes: pd.DataFrame = dataframe_dict["create_final_nodes"]
    final_entities: pd.DataFrame = dataframe_dict["create_final_entities"]
    final_community_reports: pd.DataFrame = dataframe_dict[
        "create_final_community_reports"
    ]
    final_text_units: pd.DataFrame = dataframe_dict["create_final_text_units"]
    final_relationships: pd.DataFrame = dataframe_dict["create_final_relationships"]

    # Note: DRIFT search doesn't support streaming
    if streaming:
        progress_placeholder.warning(
            "Streaming is not supported for DRIFT search. Using standard search instead."
        )

    # Robustness fix: wrap the API call so failures degrade gracefully,
    # consistent with run_global_search.
    try:
        response, context_data = asyncio.run(
            api.drift_search(
                config=config,
                nodes=final_nodes,
                entities=final_entities,
                community_reports=final_community_reports,
                text_units=final_text_units,
                relationships=final_relationships,
                community_level=community_level,
                query=query,
            )
        )
        reporter.success(f"DRIFT Search Response:\n{response}")
        return response, context_data
    except Exception as e:
        progress_placeholder.error(f"Error during DRIFT search: {e}")
        return "", {}  # Graceful fallback
settings.yaml ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ encoding_model: cl100k_base
2
+ skip_workflows: []
3
+ llm:
4
+ api_key: ${GRAPHRAG_API_KEY}
5
+ type: openai_chat
6
+ model: gpt-4o-mini
7
+ model_supports_json: true
8
+ max_tokens: 4000
9
+ temperature: 0
10
+
11
+ embeddings:
12
+ async_mode: threaded
13
+ batch_size: 16
14
+ vector_store:
15
+ type: lancedb
16
+ db_uri: 'output/lancedb'
17
+ container_name: default
18
+ overwrite: true
19
+ llm:
20
+ api_key: ${GRAPHRAG_API_KEY}
21
+ type: openai_embedding
22
+ model: text-embedding-3-small
23
+
24
+ chunks:
25
+ size: 500
26
+ overlap: 50
27
+ group_by_columns: [id]
28
+
29
+ input:
30
+ type: file
31
+ file_type: text
32
+ base_dir: "input"
33
+ file_pattern: ".*\\.txt$"
34
+ recursive: true
35
+ source_tracking: true
36
+ processing_order:
37
+ - path: "first_paragraphs"
38
+ priority: 1
39
+ purpose: "graph_building"
40
+ - path: "full_documents"
41
+ priority: 2
42
+ purpose: "retrieval"
43
+
44
+ entity_extraction:
45
+ prompt: "prompts/entity_extraction.txt"
46
+ entity_types:
47
+ - "Baggage Type"
48
+ - "Dimension"
49
+ - "Linear Dimension"
50
+ - "Weight"
51
+ - "Material Type"
52
+ - "Wheel Configuration"
53
+ - "Measurement Unit"
54
+ - "Size Category"
55
+ - "Weight Category"
56
+ - "Airline"
57
+ - "Alliance"
58
+ - "Airport"
59
+ - "Route Type"
60
+ - "Travel Class"
61
+ - "Cabin Section"
62
+ - "Aircraft Type"
63
+ - "Restriction"
64
+ - "Exemption"
65
+ - "Policy"
66
+ - "Fee Structure"
67
+ - "Currency"
68
+ - "Allowance"
69
+ - "Special Item"
70
+ - "Prohibited Item"
71
+ - "Restricted Item"
72
+ - "Dangerous Good"
73
+ - "Fragile Item"
74
+ - "Valuable Item"
75
+ - "Required Document"
76
+ - "Label Type"
77
+ - "Tag Category"
78
+ - "Service Type"
79
+ - "Handler Role"
80
+ - "Service Location"
81
+ - "Time Period"
82
+ - "Passenger Type"
83
+ - "Membership Level"
84
+ - "Group Category"
85
+ max_gleanings: 2
86
+ source_filter: "first_paragraphs"
87
+
88
+ claim_extraction:
89
+ enabled: true
90
+ claim_types:
91
+ - "Basic Size Restriction"
92
+ - "Oversize Condition"
93
+ - "Weight Limit Standard"
94
+ - "Overweight Condition"
95
+ - "Combined Dimension Limit"
96
+ - "Cabin Storage Requirement"
97
+ - "Standard Fee"
98
+ - "Excess Fee"
99
+ - "Oversize Fee"
100
+ - "Overweight Fee"
101
+ - "Special Handling Fee"
102
+ - "Season Surcharge"
103
+ - "Route-Specific Fee"
104
+ - "Multi-Piece Pricing"
105
+ - "Fee Waiver Condition"
106
+ - "Basic Allowance"
107
+ - "Class-Based Allowance"
108
+ - "Status-Based Allowance"
109
+ - "Route-Based Allowance"
110
+ - "Special Group Allowance"
111
+ - "Seasonal Allowance"
112
+ - "Equipment Allowance"
113
+ - "Prohibited Item Policy"
114
+ - "Restricted Item Condition"
115
+ - "Dangerous Goods Policy"
116
+ - "Special Item Restriction"
117
+ - "Packaging Requirement"
118
+ - "Declaration Requirement"
119
+ - "Check-in Deadline"
120
+ - "Special Handling Procedure"
121
+ - "Priority Handling Rule"
122
+ - "Transfer Handling Policy"
123
+ - "Delivery Service Policy"
124
+ - "Storage Policy"
125
+ - "Liability Limit"
126
+ - "Insurance Requirement"
127
+ - "Claim Procedure"
128
+ - "Compensation Policy"
129
+ - "Time Limit Policy"
130
+ - "Weather Restriction"
131
+ - "Seasonal Restriction"
132
+ - "Aircraft Limitation"
133
+ - "Route Restriction"
134
+ - "Connection Impact"
135
+ - "Tag Requirement"
136
+ - "Label Requirement"
137
+ - "Documentation Requirement"
138
+ - "Declaration Policy"
139
+ - "Handling Standard"
140
+ - "Service Level Agreement"
141
+ - "Priority Service Standard"
142
+ - "Delivery Time Standard"
143
+ - "Medical Exception"
144
+ - "Military Exception"
145
+ - "Diplomatic Exception"
146
+ - "Event Exception"
147
+ - "Emergency Exception"
148
+ prompt: "prompts/claim_extraction.txt"
149
+ description: "Extract baggage measurements, weight limits, and restrictions from airline documentation."
150
+ max_gleanings: 2
151
+ source_filter: "first_paragraphs"
152
+
153
+ local_search:
154
+ text_unit_prop: 0.7
155
+ community_prop: 0.3
156
+ top_k_mapped_entities: 15
157
+ top_k_relationships: 15
158
+ max_tokens: 4000
159
+ source_priority:
160
+ graph_search: "first_paragraphs"
161
+ answer_retrieval: "full_documents"
162
+
163
+ global_search:
164
+ max_tokens: 4000
165
+ data_max_tokens: 4000
166
+ map_max_tokens: 1000
167
+ reduce_max_tokens: 2000
168
+ allow_general_knowledge: false
169
+ min_score_threshold: 0.1
170
+ concurrency: 10
171
+
172
+ embed_graph:
173
+ enabled: true
174
+ num_walks: 100
175
+ walk_length: 10
176
+ window_size: 5
177
+ iterations: 10
178
+
179
+ umap:
180
+ enabled: true
181
+ n_neighbors: 15
182
+ min_dist: 0.1
183
+ n_components: 2
184
+
185
+ storage:
186
+ type: file
187
+ base_dir: "output"
188
+
189
+ cache:
190
+ type: file
191
+ base_dir: "cache"
192
+
193
+ reporting:
194
+ type: file
195
+ base_dir: "reports"
196
+ include_source_tracking: true
styles.css ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* ==========================================================================
   Chat UI styles for the Streamlit app: message layout, avatars, bubbles,
   code blocks, timestamps, and the sidebar logo.
   ========================================================================== */

/* Container for all messages: vertical column with uniform spacing */
.chat-container {
    display: flex;
    flex-direction: column;
    gap: 1rem;
    padding: 1rem;
}

/* Message wrapper with avatar support; avatar and bubble sit side by side */
.chat-message {
    display: flex;
    align-items: flex-start;
    gap: 0.5rem;
    width: 100%;
    max-width: 900px;
    margin: 0.5rem 0;
}

/* Avatar container: fixed-size circle that never shrinks in the flex row */
.avatar {
    width: 40px;
    height: 40px;
    border-radius: 50%;
    overflow: hidden;
    flex-shrink: 0;
}

/* Avatar image fills the circle without distortion */
.avatar img {
    width: 100%;
    height: 100%;
    object-fit: cover;
}

/* Message content wrapper: caps bubble width so long messages wrap */
.message-content-wrapper {
    display: flex;
    flex-direction: column;
    max-width: 80%;
}

/* Message bubble: shared padding/rounding for both roles */
.message-bubble {
    padding: 1rem;
    border-radius: 0.5rem;
    margin: 0.2rem 0;
}

/* User message specific styling: mirror the row so the avatar sits on the right */
.user-message {
    flex-direction: row-reverse;
}

/* User bubble: dark blue-grey, squared corner pointing at the avatar */
.user-message .message-bubble {
    background-color: #2b313e;
    border-top-right-radius: 0;
    color: white;
}

/* Assistant bubble: grey, squared corner on the left (avatar side) */
.assistant-message .message-bubble {
    background-color: #343741;
    border-top-left-radius: 0;
    color: white;
}

/* Message content: break long words/URLs instead of overflowing the bubble */
.message-content {
    word-wrap: break-word;
}

/* Remove default Streamlit markdown margins inside bubbles */
.stMarkdown {
    margin: 0 !important;
}

/* Style for code blocks within messages: dark panel with horizontal scroll */
.message-content pre {
    background-color: #1e1e1e;
    padding: 0.5rem;
    border-radius: 0.3rem;
    margin: 0.5rem 0;
    overflow-x: auto;
}

/* Improved loading spinner visibility while a response is generated */
.stSpinner {
    text-align: center;
    margin: 1rem 0;
}

/* Timestamp styling: small, muted, tucked under the bubble */
.timestamp {
    font-size: 0.8em;
    color: #999;
    margin: 0.2rem 0;
}

/* Sidebar logo row */
.logo-container {
    display: flex;
    /* Enable flexbox layout */
    align-items: center;
    /* Vertically center-align items */
    padding: 10px 0;
    /* Add padding top/bottom */
}

.logo-image {
    max-width: 110px;
    /* Set maximum width */
    height: auto;
    /* Maintain aspect ratio */
}
wiki.py ADDED
@@ -0,0 +1,965 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # wiki.py
2
+ # import streamlit_mermaid as stmd
3
+ import streamlit.components.v1 as components
4
+ import streamlit as st
5
+ from streamlit.components.v1 import html
6
+
7
+
8
def mermaid(code: str, height: int = 600) -> None:
    """Render a Mermaid diagram inside a Streamlit HTML component.

    The diagram source is placed in a ``<pre class="mermaid">`` element and
    rendered client-side by the Mermaid library, loaded as an ES module from
    the jsDelivr CDN (so the browser needs network access for it to draw).

    Args:
        code: Mermaid diagram definition, e.g. ``graph TD ...``.
        height: Pixel height used for both the wrapping ``<div>`` and the
            Streamlit component iframe, so the diagram is not clipped.
    """
    components.html(
        f"""
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <div style="height: {height}px">
            <pre class="mermaid">
                {code}
            </pre>
        </div>
        <script type="module">
            import mermaid from 'https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.esm.min.mjs';
            mermaid.initialize({{ startOnLoad: true }});
        </script>
        """,
        # Keep the iframe height in sync with the inner div's height.
        height=height,
    )
24
+
25
+
26
+ def render_wiki_tab():
27
+ """Render the Wiki tab content."""
28
+ st.header("Overview")
29
+
30
+ st.markdown(
31
+ """
32
+ This documentation details the process I followed to achieve the assignment of using GraphRAG for indexing the first paragraphs of seven documents, embedding the full documents, performing initial search on the graph built from the first paragraphs, and retrieving answers from the full document content.
33
+ """)
34
+ st.markdown(
35
+ """
36
+ This project implements a specialized document processing and querying system using GraphRAG for El Al baggage requirements\allowance documentation. The system processes first paragraphs separately from full documents, enabling graph-based search while maintaining comprehensive answer retrieval capabilities.
37
+
38
+ """
39
+ )
40
+
41
+ st.markdown(
42
+ """
43
+ ### Implementation Process
44
+
45
+ Initially, I attempted to implement this using separate processing paths for first paragraphs and full documents, but I discovered a more elegant solution through GraphRAG's source tracking and processing order capabilities. Instead of maintaining separate indexes, I configured a unified approach where documents were processed together but with clear priorities and purposes.
46
+
47
+ I set up the configuration to treat first paragraphs with priority 1 for graph building and full documents with priority 2 for retrieval. This was achieved through careful configuration of source tracking, processing order, and source filters in the `settings.yaml` file, which allowed me to maintain the separation of concerns.
48
+ """
49
+ )
50
+
51
+ st.markdown(
52
+ """
53
+ ### Final Implementation
54
+
55
+ The final implementation proved successful, creating a knowledge graph from the first paragraphs while maintaining access to full document content for comprehensive answers. I used entity types specific to airport security (like **Baggage Type**, **Dimension**, **Weight Limit**) and configured claim extraction to focus on relevant restrictions and allowances.
56
+
57
+ """
58
+ )
59
+
60
+ st.markdown(
61
+ """
62
+ ### Using the Chat Application
63
+
64
+ The chat application provides an interactive interface to query the GraphRAG system. Here's how it works:
65
+
66
+ ##### Getting Started:
67
+ - **Step 1**: Click on the chat tab.
68
+ - **Step 2**: Choose the desired search type from the sidebar:
69
+ - **Local Search**: Focuses on specific text chunks and direct relationships in the graph.
70
+ - **Global Search**: Analyzes the entire dataset at a high level using community summaries.
71
+ - **DRIFT Search**: Combines local and global search for complex queries requiring both detailed and contextual answers.
72
+
73
+ ##### Submitting a Query:
74
+ - Enter your question in the input field at the bottom of the chat interface.
75
+ - Depending on the selected search type, the system will:
76
+ - Use the graph for initial navigation.
77
+ - Retrieve answers from full documents for comprehensive responses.
78
+
79
+ ##### Viewing Results:
80
+ - The assistant's response appears in the chat window, formatted for clarity.
81
+
82
+ ##### Key Features:
83
+ - **Streaming Responses**: Responses are displayed in real-time for supported search types.
84
+ - **Session History**: Previous queries and responses are retained within the session for reference.
85
+
86
+ ##### Example Queries:
87
+ - "What are the liquid restrictions for carry-on bags?"
88
+ - "How do pet carrier size restrictions compare to regular carry-on limits?"
89
+ """
90
+ )
91
+
92
+ with st.expander("Architecture", expanded=False):
93
+ st.markdown(
94
+ """
95
+ The architecture of the system is designed to process data through multiple stages, including input preparation, processing, and search functionalities. Below is a detailed diagram illustrating the workflow of the system:
96
+ """
97
+ )
98
+
99
+ mermaid_code = """
100
+ %%{init: {'theme': 'base', 'themeVariables': {'primaryColor': '#1E90FF', 'edgeLabelBackground': '#FFFFFF', 'secondaryColor': '#F0F8FF', 'tertiaryColor': '#FFFFFF', 'primaryTextColor': '#000000'}}}%%
101
+ graph TD
102
+ subgraph Input
103
+ FP[First Paragraphs] --> P[Processing]
104
+ FD[Full Documents] --> P
105
+ end
106
+
107
+ subgraph Processing
108
+ P --> IE[Entity Extraction]
109
+ P --> CD[Community Detection]
110
+ P --> E[Embeddings Generation]
111
+
112
+ IE --> G[Graph Construction]
113
+ CD --> G
114
+ E --> VS[Vector Store]
115
+ end
116
+
117
+ subgraph Search
118
+ Q[Query] --> DS[DRIFT Search]
119
+ DS --> GS[Graph Search]
120
+ DS --> FR[Full Retrieval]
121
+ GS --> VS
122
+ FR --> VS
123
+ GS --> A[Answer Generation]
124
+ FR --> A
125
+ end
126
+ """
127
+ mermaid(mermaid_code, height=600)
128
+
129
+ with st.expander("Graph Analysis", expanded=False):
130
+ st.markdown("### System Components Breakdown:")
131
+
132
+ mermaid_code = """
133
+ pie
134
+ title "System Components"
135
+ "Documents" : 14
136
+ "Text Units" : 36
137
+ "Entities" : 315
138
+ "Relationships" : 372
139
+ "Communities" : 66
140
+ """
141
+ mermaid(mermaid_code, height=500)
142
+
143
+ # Description and graph statistics
144
+ st.markdown(
145
+ """
146
+ ### Knowledge Graph Visualization
147
+
148
+ The graph displayed below represents the relationships between various entities extracted from the input data. Nodes in the graph correspond to entities like "Documents," "Policies," and "Restrictions," while edges represent the relationships or connections between these entities. The graph is constructed using the extracted entities and relationships, processed through NetworkX, and visualized with Pyvis.
149
+
150
+ **Process of Creation**:
151
+ - **Data Preparation**: Entities and relationships are extracted and saved as `create_final_nodes.parquet` and `create_final_relationships.parquet` files, respectively.
152
+ - **Graph Construction**: Using NetworkX, nodes and edges are added based on the extracted data.
153
+ - **Visualization**: Pyvis is used to create an interactive visualization with options like physics-based layout, node grouping, and hover effects.
154
+
155
+ The resulting graph provides insights into the data's structure, including:
156
+ - Node type distribution
157
+ - Community detection levels
158
+ - Connectivity patterns
159
+
160
+ Explore the graph below to understand the relationships between key entities.
161
+ """
162
+ )
163
+
164
+ # Load and display the graph visualization (HTML file)
165
+ with open("knowledge_graph.html", "r") as f:
166
+ html_content = f.read()
167
+ st.components.v1.html(html_content, height=800)
168
+
169
+ # Graph statistics
170
+ st.markdown(
171
+ """
172
+ ### Graph Statistics:
173
+
174
+ * **Number of nodes:** 427
175
+ * **Number of edges:** 453
176
+
177
+ #### Node Type Distribution:
178
+
179
+ | Node Type | Distribution |
180
+ |-----------------------|--------------|
181
+ | REQUIRED DOCUMENT | 39 |
182
+ | SERVICE TYPE | 35 |
183
+ | POLICY | 30 |
184
+ | RESTRICTION | 27 |
185
+ | SPECIAL ITEM | 26 |
186
+ | PROHIBITED ITEM | 23 |
187
+ | AIRPORT | 22 |
188
+ | BAGGAGE TYPE | 21 |
189
+ | SERVICE LOCATION | 18 |
190
+ | DANGEROUS GOOD | 14 |
191
+ | ALLOWANCE | 13 |
192
+ | GEO | 12 |
193
+ | MEASUREMENT UNIT | 11 |
194
+ | FEE STRUCTURE | 10 |
195
+ | LINEAR DIMENSION | 8 |
196
+ | TIME PERIOD | 8 |
197
+ | CABIN SECTION | 8 |
198
+ | WEIGHT | 8 |
199
+ | WEIGHT CATEGORY | 7 |
200
+ | AIRLINE | 7 |
201
+ | CITY | 7 |
202
+ | DIMENSION | 6 |
203
+ | VALUABLE ITEM | 5 |
204
+ | ROUTE TYPE | 5 |
205
+ | TRAVEL CLASS | 5 |
206
+ | ORGANIZATION | 5 |
207
+ | PASSENGER TYPE | 4 |
208
+ | RESTRICTED ITEM | 3 |
209
+ | CURRENCY | 2 |
210
+ | EXEMPTION | 2 |
211
+ | LABEL TYPE | 2 |
212
+ | MATERIAL TYPE | 2 |
213
+ | CARGO | 2 |
214
+ | MEMBERSHIP LEVEL | 2 |
215
+ | AIRCRAFT TYPE | 1 |
216
+ | REGION | 1 |
217
+ | COUNTRY | 1 |
218
+ | SIZE CATEGORY | 1 |
219
+ | WHEEL CONFIGURATION | 1 |
220
+ | TAG CATEGORY | 1 |
221
+ | GROUP CATEGORY | 1 |
222
+
223
+ #### Most Connected Nodes:
224
+
225
+ | Node | Connections |
226
+ |--------------------|-------------|
227
+ | EL AL | 49 |
228
+ | ANIMAL | 29 |
229
+ | CHECKED BAGGAGE | 25 |
230
+ | BAGGAGE | 21 |
231
+ | PET | 19 |
232
+ """
233
+ )
234
+
235
+ with st.expander("Implementation Results", expanded=False):
236
+ st.markdown(
237
+ """
238
+ ### Document Processing
239
+
240
+ * **Total Documents**: 14 (7 first paragraphs + 7 full documents)
241
+ * **Text Units**: 36
242
+ * **Entities**: 315
243
+ * **Relationships**: 372
244
+ * **Communities**: 66 across 4 levels
245
+
246
+ ### Community Structure
247
+
248
+ * **Level 0**: 11 communities
249
+ * **Level 1**: 44 communities
250
+ * **Level 2**: 9 communities
251
+ * **Level 3**: 2 communities
252
+ """
253
+ )
254
+
255
+ st.markdown("### System Operation Flow")
256
+
257
+ mermaid_code = """
258
+ sequenceDiagram
259
+ participant U as User
260
+ participant Q as Query Engine
261
+ participant G as Graph Search
262
+ participant V as Vector Store
263
+ participant D as Document Retrieval
264
+
265
+ U->>Q: Submit Query
266
+ Q->>G: Search in First Paragraph Graph
267
+ G->>V: Lookup Relevant Entities
268
+ V->>D: Retrieve Full Content
269
+ D->>Q: Return Comprehensive Answer
270
+ Q->>U: Present Response
271
+ """
272
+
273
+ mermaid(mermaid_code, height=400)
274
+
275
+ with st.expander("Implementation Details", expanded=False):
276
+ st.markdown(
277
+ """
278
+ The implementation of the system follows a processing pipeline that integrates data from the first paragraphs and full documents, creating a unified structure for efficient querying. Below is the pipeline representation:
279
+ """
280
+ )
281
+
282
+ mermaid_code = """
283
+ flowchart TB
284
+ subgraph First Paragraphs
285
+ FP[Load First Paragraphs] --> EP[Extract Entities]
286
+ EP --> RP[Build Relationships]
287
+ RP --> CP[Create Communities]
288
+ end
289
+
290
+ subgraph Full Documents
291
+ FD[Load Full Documents] --> CH[Chunk Documents]
292
+ CH --> EF[Generate Embeddings]
293
+ end
294
+
295
+ subgraph Integration
296
+ CP --> VS[(Vector Store)]
297
+ EF --> VS
298
+ end
299
+
300
+ subgraph Search
301
+ Q[Query] --> GS[Graph Search]
302
+ GS --> VS
303
+ VS --> RD[Retrieve Details]
304
+ RD --> AG[Answer Generation]
305
+ end
306
+ """
307
+ mermaid(mermaid_code, height=800)
308
+ with st.expander("Requirements Fulfillment", expanded=False):
309
+ st.markdown(
310
+ """
311
+ ### Requirements Fulfillment
312
+
313
+ **First Paragraph Processing**: ✓
314
+ * Implemented through `source_filter` and `processing_order`
315
+ * Verified by entity and relationship extraction
316
+
317
+ **Full Document Embeddings**: ✓
318
+ * Stored in LanceDB
319
+ * Accessible for comprehensive retrieval
320
+
321
+ **Graph-Based Search**: ✓
322
+ * Communities and relationships established
323
+ * DRIFT search implemented
324
+
325
+ **Complete Answer Retrieval**: ✓
326
+ * Source priority configuration
327
+ * Full document content available
328
+
329
+ ### Performance Metrics
330
+
331
+ * **Indexing Speed**: 212.44 seconds total
332
+ * **Graph Density**: 372 relationships among 315 entities
333
+ * **Community Structure**: 4-level hierarchy
334
+ * **Vector Store Size**: 3 Lance files for different embedding types
335
+ """
336
+ )
337
+
338
+ with st.expander("Achieving the Requirement", expanded=False):
339
+ st.markdown("### Source-Based Processing Control:")
340
+
341
+ st.markdown(
342
+ """
343
+ ```yaml
344
+ input:
345
+ source_tracking: true
346
+ processing_order:
347
+ - path: "first_paragraphs"
348
+ priority: 1
349
+ purpose: "graph_building"
350
+ - path: "full_documents"
351
+ priority: 2
352
+ purpose: "retrieval"
353
+ ```
354
+ """
355
+ )
356
+ st.markdown(
357
+ """
358
+ This configuration ensures that GraphRAG knows which content is for graph building (first paragraphs) and which is for retrieval (full documents). The priority system makes sure first paragraphs are processed first and used primarily for the knowledge graph construction.
359
+ """
360
+ )
361
+
362
+ st.markdown("### Targeted Entity and Claim Extraction:")
363
+
364
+ st.markdown(
365
+ """
366
+ ```yaml
367
+ entity_extraction:
368
+ source_filter: "first_paragraphs"
369
+ max_gleanings: 2
370
+
371
+ claim_extraction:
372
+ source_filter: "first_paragraphs"
373
+ ```
374
+ """
375
+ )
376
+ st.markdown(
377
+ """
378
+ These filters ensure that the knowledge graph (entities, relationships, and claims) is built only from the first paragraphs. This is crucial because it means our initial search will only traverse the graph built from these first paragraphs, matching the requirement. The `max_gleanings: 2` allows for thorough extraction while maintaining precision.
379
+ """
380
+ )
381
+
382
+ st.markdown("### Search Priority and Retrieval Control:")
383
+
384
+ st.markdown(
385
+ """
386
+ ```yaml
387
+ local_search:
388
+ source_priority:
389
+ graph_search: "first_paragraphs"
390
+ answer_retrieval: "full_documents"
391
+ text_unit_prop: 0.7
392
+ community_prop: 0.3
393
+ ```
394
+ """
395
+ )
396
+ st.markdown(
397
+ """
398
+ This is where the magic happens - when a query is made, the system first searches using the graph built from first paragraphs (`graph_search: "first_paragraphs"`), but when it needs to construct the answer, it pulls the content from the full documents (`answer_retrieval: "full_documents"`).
399
+
400
+ The text_unit and community proportions ensure we're making good use of both the graph structure and the actual content. Looking at the output files we generated (`create_final_entities.parquet`, `create_final_relationships.parquet`, etc.), we can see this two-phase approach in action: the graph structure is built and stored separately from the full content, but they're linked through the unified vector store in LanceDB, allowing seamless transitions between graph search and content retrieval during query processing.
401
+ """
402
+ )
403
+
404
+ with st.expander("Improvements to Make the Graph Creation Process Leaner and Faster", expanded=False):
405
+ st.markdown("### Optimization of Chunk Size and Overlap:")
406
+
407
+ st.markdown(
408
+ """
409
+ ```yaml
410
+ chunks:
411
+ size: 300 # Reduced from 500
412
+ overlap: 25 # Reduced from 50
413
+ group_by_columns: [id]
414
+ ```
415
+ """
416
+ )
417
+ st.markdown(
418
+ """
419
+ **Rationale**:
420
+ - Smaller chunks with minimal overlap reduce token usage.
421
+ - Maintains context while processing fewer tokens per API call.
422
+ - Especially efficient for first paragraphs processing.
423
+ """
424
+ )
425
+
426
+ st.markdown("### Streamline Entity Types and Claims:")
427
+
428
+ st.markdown(
429
+ """
430
+ ```yaml
431
+ entity_extraction:
432
+ entity_types:
433
+ - "Baggage"
434
+ - "Restriction"
435
+ - "Item"
436
+ max_gleanings: 1 # Reduced from 2
437
+
438
+ claim_extraction:
439
+ enabled: false # Disable unless absolutely necessary
440
+ ```
441
+ """
442
+ )
443
+ st.markdown(
444
+ """
445
+ **Rationale**:
446
+ - Fewer entity types mean fewer extraction operations.
447
+ - Single gleaning pass is often sufficient.
448
+ - Claims processing is expensive and often redundant.
449
+ """
450
+ )
451
+
452
+ st.markdown("### Optimize Graph Embeddings:")
453
+
454
+ st.markdown(
455
+ """
456
+ ```yaml
457
+ embed_graph:
458
+ enabled: true
459
+ num_walks: 50 # Reduced from 100
460
+ walk_length: 5 # Reduced from 10
461
+ window_size: 3 # Reduced from 5
462
+ iterations: 5 # Reduced from 10
463
+ ```
464
+ """
465
+ )
466
+ st.markdown(
467
+ """
468
+ **Rationale**:
469
+ - Fewer random walks still capture essential graph structure.
470
+ - Shorter walks reduce computation time.
471
+ - Smaller window size focuses on immediate relationships.
472
+ """
473
+ )
474
+
475
+ st.markdown("### Batch Processing and Parallelization:")
476
+
477
+ st.markdown(
478
+ """
479
+ ```yaml
480
+ embeddings:
481
+ async_mode: asyncio # Changed from threaded
482
+ batch_size: 32 # Increased from 16
483
+ batch_max_tokens: 8191
484
+ ```
485
+ """
486
+ )
487
+ st.markdown(
488
+ """
489
+ **Rationale**:
490
+ - Asyncio performs better than threading for I/O-bound operations.
491
+ - Larger batch size reduces API calls.
492
+ - Maximizes throughput within token limits.
493
+ """
494
+ )
495
+
496
+ st.markdown("### Community Structure Optimization:")
497
+
498
+ st.markdown(
499
+ """
500
+ ```yaml
501
+ cluster_graph:
502
+ max_cluster_size: 15 # Increased slightly
503
+ min_cluster_size: 3 # Added parameter
504
+
505
+ community_reports:
506
+ max_input_length: 2000 # Reduced from default
507
+ max_length: 1000 # Reduced summary length
508
+ ```
509
+ """
510
+ )
511
+ st.markdown(
512
+ """
513
+ **Rationale**:
514
+ - Balanced cluster sizes reduce processing overhead.
515
+ - Shorter community reports still maintain essential information.
516
+ - Fewer tokens per report means faster processing.
517
+ """
518
+ )
519
+
520
+ st.markdown("### Caching and Storage:")
521
+
522
+ st.markdown(
523
+ """
524
+ ```yaml
525
+ cache:
526
+ type: file
527
+ base_dir: "cache"
528
+ compression: true # Add compression
529
+ cache_embeddings: true
530
+
531
+ storage:
532
+ type: file
533
+ base_dir: "output"
534
+ compression: true # Add compression
535
+ ```
536
+ """
537
+ )
538
+ st.markdown(
539
+ """
540
+ **Rationale**:
541
+ - Compression reduces I/O overhead.
542
+ - Caching embeddings prevents recomputation.
543
+ - File-based storage is faster than blob storage for local processing.
544
+ """
545
+ )
546
+
547
+ st.markdown("### Disable Non-Essential Features:")
548
+
549
+ st.markdown(
550
+ """
551
+ ```yaml
552
+ umap:
553
+ enabled: false # Disable unless visualization needed
554
+
555
+ snapshots:
556
+ graphml: false
557
+ raw_entities: false
558
+ top_level_nodes: false
559
+ ```
560
+ """
561
+ )
562
+ st.markdown(
563
+ """
564
+ **Rationale**:
565
+ - UMAP calculation is computationally expensive.
566
+ - Snapshots are useful for debugging but add overhead.
567
+ """
568
+ )
569
+
570
+ st.markdown("### LLM Configuration Optimization:")
571
+
572
+ st.markdown(
573
+ """
574
+ ```yaml
575
+ llm:
576
+ concurrent_requests: 25
577
+ tokens_per_minute: 150000
578
+ requests_per_minute: 10000
579
+ max_retries: 5 # Reduced from 10
580
+ ```
581
+ """
582
+ )
583
+ st.markdown(
584
+ """
585
+ **Rationale**:
586
+ - Balanced concurrency prevents rate limiting.
587
+ - Fewer retries reduce waiting time.
588
+ - Token and request limits prevent throttling.
589
+ """
590
+ )
591
+
592
+ with st.expander("Query Types", expanded=False):
593
+ st.markdown("### Local Search:")
594
+
595
+ st.markdown(
596
+ """
597
+ ```yaml
598
+ local_search:
599
+ text_unit_prop: 0.7 # Focus on specific text chunks
600
+ community_prop: 0.3 # Some consideration of community context
601
+ top_k_mapped_entities: 15
602
+ source_priority:
603
+ graph_search: "first_paragraphs"
604
+ answer_retrieval: "full_documents"
605
+ ```
606
+ """
607
+ )
608
+ st.markdown(
609
+ """
610
+ **Best when**: Looking for specific baggage rules or restrictions
611
+ **Example Query**: "What are the liquid restrictions for carry-on bags?"
612
+
613
+ **How it works with our data**:
614
+ - Searches for entities in first paragraphs (like "liquid", "carry-on").
615
+ - Follows direct relationships in the graph.
616
+ - Retrieves detailed rules from full documents.
617
+
618
+ **Meets requirement?** Yes, but in a limited way - focuses on direct connections.
619
+ """
620
+ )
621
+
622
+ st.markdown("### Global Search:")
623
+
624
+ st.markdown(
625
+ """
626
+ ```yaml
627
+ global_search:
628
+ max_tokens: 4000
629
+ data_max_tokens: 4000
630
+ min_score_threshold: 0.1
631
+ allow_general_knowledge: false
632
+ ```
633
+ """
634
+ )
635
+ st.markdown(
636
+ """
637
+ **Best when**: Understanding overall policies or themes
638
+ **Example Query**: "What are the main types of baggage restrictions?"
639
+
640
+ **How it works with our data**:
641
+ - Looks at community summaries built from first paragraphs.
642
+ - Provides broader context about baggage policies.
643
+ - Pulls supporting details from full documents.
644
+
645
+ **Meets requirement?** Partially - good for overview but might miss specific connections.
646
+ """
647
+ )
648
+
649
+ st.markdown("### DRIFT Search (Dynamic Reasoning and Inference with Flexible Traversal):")
650
+
651
+ st.markdown(
652
+ """
653
+ ```yaml
654
+ local_search:
655
+ source_priority:
656
+ graph_search: "first_paragraphs"
657
+ answer_retrieval: "full_documents"
658
+ ```
659
+ """
660
+ )
661
+ st.markdown(
662
+ """
663
+ **Best when**: Complex queries requiring both specific details and context
664
+ **Example Query**: "How do pet carrier size restrictions compare to regular carry-on limits?"
665
+
666
+ **How it works with our data**:
667
+ - Starts with first paragraphs graph to understand relationships between:
668
+ - Pet carriers
669
+ - Regular carry-on bags
670
+ - Size restrictions
671
+ - Uses community understanding to find related policies.
672
+ - Retrieves specific details from full documents.
673
+
674
+ **Meets requirement?** Yes, most comprehensively.
675
+ """
676
+ )
677
+
678
+ st.markdown("### Best Choice for Our Requirement:")
679
+ st.markdown(
680
+ """
681
+ **DRIFT Search** is the most suitable because:
682
+ - It naturally implements our two-phase requirement:
683
+ - Initial search on graph (from first paragraphs).
684
+ - Answer retrieval from full documents.
685
+ - It can handle complex queries that need:
686
+ - Understanding of relationships (from graph).
687
+ - Specific details (from full documents).
688
+ - It can dynamically adjust between:
689
+ - Local search when specific rules are needed.
690
+ - Global search when context is important.
691
+ """
692
+ )
693
+ with st.expander("Configuration: full `settings.yaml`", expanded=False):
694
+
695
+ st.markdown(
696
+ """
697
+ ```yaml
698
+ # Root configuration for GraphRAG, a system leveraging LLMs for advanced Retrieval Augmented Generation.
699
+
700
+ encoding_model: cl100k_base
701
+ # Specifies the model used for token encoding. The default 'cl100k_base' is common for OpenAI's text models,
702
+ # determining how text is tokenized into machine-readable units.
703
+
704
+ skip_workflows: []
705
+ # A list of workflows to skip during execution. Empty indicates all workflows are executed.
706
+
707
+ llm:
708
+ api_key: ${GRAPHRAG_API_KEY}
709
+ # Placeholder for the API key, replaced dynamically from environment variables.
710
+ # Ensures secure API access for LLM queries.
711
+
712
+ type: openai_chat
713
+ # Defines the type of LLM interface used. Here, it connects to OpenAI's chat-based API.
714
+
715
+ model: gpt-4o-mini
716
+ # Specifies the model variant to use.
717
+
718
+ model_supports_json: true
719
+ # Indicates whether the LLM natively supports JSON responses, useful for structured outputs.
720
+
721
+ max_tokens: 4000
722
+ # Maximum number of tokens in the output. Balances performance and context length.
723
+
724
+ temperature: 0
725
+ # Controls randomness in outputs. 0 means deterministic responses, often preferred for accuracy.
726
+
727
+ embeddings:
728
+ async_mode: threaded
729
+ # Asynchronous embedding computation mode. 'threaded' uses multi-threading for better performance.
730
+
731
+ batch_size: 16
732
+ # Number of data points processed per batch during embedding, balancing speed and resource use.
733
+
734
+ vector_store:
735
+ type: lancedb
736
+ # Database type used for storing vectorized embeddings. 'lancedb' supports efficient vector operations.
737
+
738
+ db_uri: 'output/lancedb'
739
+ # URI pointing to the database location where embeddings are stored.
740
+
741
+ container_name: default
742
+ # Logical name for the container storing vector data.
743
+
744
+ overwrite: true
745
+ # Whether to overwrite existing vectors. True allows updating the database during reruns.
746
+
747
+ llm:
748
+ api_key: ${GRAPHRAG_API_KEY}
749
+ type: openai_embedding
750
+ model: text-embedding-3-small
751
+ # Dedicated LLM for embedding tasks. A smaller, specialized model is specified for embeddings.
752
+
753
+ chunks:
754
+ size: 500
755
+ # Number of tokens per chunk of text. Controls granularity for processing long documents.
756
+
757
+ overlap: 50
758
+ # Overlap between adjacent chunks to ensure continuity in analysis.
759
+
760
+ group_by_columns: [id]
761
+ # Groups data by 'id' before chunking, preserving document boundaries.
762
+
763
+ input:
764
+ type: file
765
+ file_type: text
766
+ base_dir: "input"
767
+ file_pattern: ".*\\.txt$"
768
+ recursive: true
769
+ source_tracking: true
770
+ processing_order:
771
+ - path: "first_paragraphs"
772
+ priority: 1
773
+ purpose: "graph_building"
774
+ - path: "full_documents"
775
+ priority: 2
776
+ purpose: "retrieval"
777
+ # Specifies the data source for ingestion:
778
+ # - Input is file-based text.
779
+ # - Reads files recursively from "input" directory matching '.txt' files.
780
+ # - Prioritizes "first_paragraphs" for graph building and full documents for retrieval.
781
+
782
+ entity_extraction:
783
+ prompt: "prompts/entity_extraction.txt"
784
+ # Path to the custom prompt used for entity extraction tasks.
785
+
786
+ entity_types:
787
+ - "Baggage Type"
788
+ - "Dimension"
789
+ - "Linear Dimension"
790
+ - "Weight"
791
+ - "Material Type"
792
+ - "Wheel Configuration"
793
+ - "Measurement Unit"
794
+ - "Size Category"
795
+ - "Weight Category"
796
+ - "Airline"
797
+ - "Alliance"
798
+ - "Airport"
799
+ - "Route Type"
800
+ - "Travel Class"
801
+ - "Cabin Section"
802
+ - "Aircraft Type"
803
+ - "Restriction"
804
+ - "Exemption"
805
+ - "Policy"
806
+ - "Fee Structure"
807
+ - "Currency"
808
+ - "Allowance"
809
+ - "Special Item"
810
+ - "Prohibited Item"
811
+ - "Restricted Item"
812
+ - "Dangerous Good"
813
+ - "Fragile Item"
814
+ - "Valuable Item"
815
+ - "Required Document"
816
+ - "Label Type"
817
+ - "Tag Category"
818
+ - "Service Type"
819
+ - "Handler Role"
820
+ - "Service Location"
821
+ - "Time Period"
822
+ - "Passenger Type"
823
+ - "Membership Level"
824
+ - "Group Category"
825
+ # Defines the types of entities the system should extract.
826
+
827
+ max_gleanings: 2
828
+ # Maximum number of re-processing rounds to refine entity detection.
829
+
830
+ source_filter: "first_paragraphs"
831
+ # Restricts extraction to text from "first_paragraphs," optimizing focus.
832
+
833
+ claim_extraction:
834
+ enabled: true
835
+ # Enables claim extraction, capturing specific conditions or assertions from text.
836
+
837
+ claim_types:
838
+ - "Basic Size Restriction"
839
+ - "Oversize Condition"
840
+ - "Weight Limit Standard"
841
+ - "Overweight Condition"
842
+ - "Combined Dimension Limit"
843
+ - "Cabin Storage Requirement"
844
+ - "Standard Fee"
845
+ - "Excess Fee"
846
+ - "Oversize Fee"
847
+ - "Overweight Fee"
848
+ - "Special Handling Fee"
849
+ - "Season Surcharge"
850
+ - "Route-Specific Fee"
851
+ - "Multi-Piece Pricing"
852
+ - "Fee Waiver Condition"
853
+ - "Basic Allowance"
854
+ - "Class-Based Allowance"
855
+ - "Status-Based Allowance"
856
+ - "Route-Based Allowance"
857
+ - "Special Group Allowance"
858
+ - "Seasonal Allowance"
859
+ - "Equipment Allowance"
860
+ - "Prohibited Item Policy"
861
+ - "Restricted Item Condition"
862
+ - "Dangerous Goods Policy"
863
+ - "Special Item Restriction"
864
+ - "Packaging Requirement"
865
+ - "Declaration Requirement"
866
+ - "Check-in Deadline"
867
+ - "Special Handling Procedure"
868
+ - "Priority Handling Rule"
869
+ - "Transfer Handling Policy"
870
+ - "Delivery Service Policy"
871
+ - "Storage Policy"
872
+ - "Liability Limit"
873
+ - "Insurance Requirement"
874
+ - "Claim Procedure"
875
+ - "Compensation Policy"
876
+ - "Time Limit Policy"
877
+ - "Weather Restriction"
878
+ - "Seasonal Restriction"
879
+ - "Aircraft Limitation"
880
+ - "Route Restriction"
881
+ - "Connection Impact"
882
+ - "Tag Requirement"
883
+ - "Label Requirement"
884
+ - "Documentation Requirement"
885
+ - "Declaration Policy"
886
+ - "Handling Standard"
887
+ - "Service Level Agreement"
888
+ - "Priority Service Standard"
889
+ - "Delivery Time Standard"
890
+ - "Medical Exception"
891
+ - "Military Exception"
892
+ - "Diplomatic Exception"
893
+ - "Event Exception"
894
+ - "Emergency Exception"
895
+ # Types of claims to extract, covering diverse scenarios (e.g., fees, allowances).
896
+
897
+ prompt: "prompts/claim_extraction.txt"
898
+ description: "Extract baggage measurements, weight limits, and restrictions from airline documentation."
899
+ # Customizes the extraction logic for airline baggage policies.
900
+
901
+ max_gleanings: 2
902
+ source_filter: "first_paragraphs"
903
+ # Restricts claims to "first_paragraphs," mirroring entity extraction.
904
+
905
+ local_search:
906
+ text_unit_prop: 0.7
907
+ community_prop: 0.3
908
+ top_k_mapped_entities: 15
909
+ top_k_relationships: 15
910
+ max_tokens: 4000
911
+ source_priority:
912
+ graph_search: "first_paragraphs"
913
+ answer_retrieval: "full_documents"
914
+ # Configures search behavior:
915
+ # - Balances searches between individual text units and community-level summaries.
916
+ # - Limits results to top 15 entities and relationships for relevance.
917
+
918
+ global_search:
919
+ max_tokens: 4000
920
+ data_max_tokens: 4000
921
+ map_max_tokens: 1000
922
+ reduce_max_tokens: 2000
923
+ allow_general_knowledge: false
924
+ min_score_threshold: 0.1
925
+ concurrency: 10
926
+ # Defines query-wide global search capabilities:
927
+ # - Token limits for different operations.
928
+ # - Restricts non-specific general knowledge responses.
929
+ # - Handles up to 10 parallel queries.
930
+
931
+ embed_graph:
932
+ enabled: true
933
+ num_walks: 100
934
+ walk_length: 10
935
+ window_size: 5
936
+ iterations: 10
937
+ # Enables graph embedding (e.g., for node2vec):
938
+ # - Generates 100 random walks per node to learn embeddings.
939
+
940
+ umap:
941
+ enabled: true
942
+ n_neighbors: 15
943
+ min_dist: 0.1
944
+ n_components: 2
945
+ # Configures UMAP for dimensionality reduction and visualization.
946
+
947
+ storage:
948
+ type: file
949
+ base_dir: "output"
950
+ # Writes processed data to the local "output" directory.
951
+
952
+ cache:
953
+ type: file
954
+ base_dir: "cache"
955
+ # Stores temporary cache files in the local "cache" directory.
956
+
957
+ reporting:
958
+ type: file
959
+ base_dir: "reports"
960
+ include_source_tracking: true
961
+ # Generates reports, including provenance for traceability.
962
+
963
+ ```
964
+ """
965
+ )