cordwainersmith committed on
Commit 13e0d32 · 2 parents: 05aa63b b96b49a

Add project files and Docker setup

Files changed (10)
  1. Dockerfile +50 -0
  2. README.md +10 -10
  3. app.py +308 -0
  4. auth.py +42 -0
  5. knowledge_graph.html +0 -0
  6. query_config.yaml +25 -0
  7. requirements.txt +5 -0
  8. search_handlers.py +288 -0
  9. styles.css +114 -0
  10. wiki.py +968 -0
Dockerfile CHANGED
@@ -1,3 +1,4 @@
+<<<<<<< HEAD
 # Step 1: Use an official Python slim base image
 FROM python:3.10-slim
 
@@ -45,4 +46,53 @@ COPY --chown=user wiki.py ./wiki.py
 EXPOSE 7860
 
 # Step 11: Define the entrypoint command
+=======
+# Step 1: Use an official Python slim base image
+FROM python:3.10-slim
+
+# Step 2: Install system dependencies
+RUN apt-get update && apt-get install -y \
+    wget \
+    tar \
+    && apt-get clean
+
+# Step 3: Add a non-root user (required by Hugging Face Spaces)
+RUN useradd -m -u 1000 user
+
+# Step 4: Switch to the "user" user
+USER user
+
+# Step 5: Set home and working directory
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+WORKDIR $HOME/app
+
+# Step 6: Copy requirements into the container
+COPY --chown=user requirements.txt ./requirements.txt
+
+# Step 7: Install Python dependencies
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt
+
+# Step 8: Copy all necessary files and folders into the container
+COPY --chown=user .output ./.output
+COPY --chown=user cache ./cache
+COPY --chown=user input ./input
+COPY --chown=user output ./output
+COPY --chown=user prompts ./prompts
+COPY --chown=user reports ./reports
+COPY --chown=user auth.py ./auth.py
+COPY --chown=user knowledge_graph.html ./knowledge_graph.html
+COPY --chown=user query_config.yaml ./query_config.yaml
+COPY --chown=user app.py ./app.py
+COPY --chown=user search_handlers.py ./search_handlers.py
+COPY --chown=user settings.yaml ./settings.yaml
+COPY --chown=user styles.css ./styles.css
+COPY --chown=user wiki.py ./wiki.py
+
+# Step 10: Expose the Streamlit default port
+EXPOSE 7860
+
+# Step 11: Define the entrypoint command
+>>>>>>> b96b49ad31dd18a91ca19d59316e1b3ef2f531ff
 CMD ["streamlit", "run", "app.py", "--server.port", "7860", "--server.address", "0.0.0.0"]
README.md CHANGED
@@ -1,10 +1,10 @@
----
-title: PwcGraphRAG
-emoji: ⚡
-colorFrom: purple
-colorTo: yellow
-sdk: docker
-pinned: false
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+---
+title: PwcGraphRAG
+emoji: ⚡
+colorFrom: purple
+colorTo: yellow
+sdk: docker
+pinned: false
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,3 +1,4 @@
+<<<<<<< HEAD
 import streamlit as st
 import asyncio
 import sys
@@ -303,3 +304,310 @@ def main():
 
 if __name__ == "__main__":
     main()
+=======
+import streamlit as st
+import asyncio
+import sys
+from pathlib import Path
+import base64
+import pandas as pd
+from typing import Literal, Tuple, Optional
+from wiki import render_wiki_tab
+from search_handlers import run_global_search, run_local_search, run_drift_search
+import auth
+
+
+import graphrag.api as api
+from graphrag.config import GraphRagConfig, load_config, resolve_paths
+from graphrag.index.create_pipeline_config import create_pipeline_config
+from graphrag.logging import PrintProgressReporter
+from graphrag.utils.storage import _create_storage, _load_table_from_storage
+
+
+st.set_page_config(page_title="GraphRAG Chat Interface", page_icon="🔍", layout="wide")
+
+# Define default avatars at the module level
+DEFAULT_USER_AVATAR = "👤"
+DEFAULT_BOT_AVATAR = "🤖"
+
+# Initialize session state for avatars
+if "user_avatar" not in st.session_state:
+    st.session_state.user_avatar = DEFAULT_USER_AVATAR
+if "bot_avatar" not in st.session_state:
+    st.session_state.bot_avatar = DEFAULT_BOT_AVATAR
+
+# Define avatar images
+USER_AVATAR = "👤"  # Default user emoji
+BOT_AVATAR = "🤖"  # Default bot emoji
+
+
+class StreamlitProgressReporter(PrintProgressReporter):
+    def __init__(self, placeholder):
+        super().__init__("")
+        self.placeholder = placeholder
+
+    def success(self, message: str):
+        self.placeholder.success(message)
+
+
+def render_chat_tab():
+    """Render the Chat tab content."""
+    format_message_history()
+
+    # Chat input
+    if prompt := st.chat_input("Enter your query..."):
+        # Add user message to history with timestamp
+        st.session_state.messages.append(
+            {
+                "role": "user",
+                "content": prompt,
+                "timestamp": pd.Timestamp.now().strftime("%H:%M"),
+            }
+        )
+
+        # Process query
+        with st.spinner("Processing your query..."):
+            response_placeholder = st.empty()
+            try:
+                if st.session_state.search_type == "global":
+                    response, context = run_global_search(
+                        config_filepath=st.session_state.config_filepath,
+                        data_dir=st.session_state.data_dir,
+                        root_dir=st.session_state.root_dir,
+                        community_level=st.session_state.community_level,
+                        response_type=st.session_state.response_type,
+                        streaming=st.session_state.streaming,
+                        query=prompt,
+                        progress_placeholder=response_placeholder,
+                    )
+                elif st.session_state.search_type == "drift":
+                    response, context = run_drift_search(
+                        config_filepath=st.session_state.config_filepath,
+                        data_dir=st.session_state.data_dir,
+                        root_dir=st.session_state.root_dir,
+                        community_level=st.session_state.community_level,
+                        response_type=st.session_state.response_type,
+                        streaming=st.session_state.streaming,
+                        query=prompt,
+                        progress_placeholder=response_placeholder,
+                    )
+                else:
+                    response, context = run_local_search(
+                        config_filepath=st.session_state.config_filepath,
+                        data_dir=st.session_state.data_dir,
+                        root_dir=st.session_state.root_dir,
+                        community_level=st.session_state.community_level,
+                        response_type=st.session_state.response_type,
+                        streaming=st.session_state.streaming,
+                        query=prompt,
+                        progress_placeholder=response_placeholder,
+                    )
+
+                # Clear the placeholder before adding the final response
+                response_placeholder.empty()
+
+                # Add assistant response to history with timestamp
+                st.session_state.messages.append(
+                    {
+                        "role": "assistant",
+                        "content": response,
+                        "timestamp": pd.Timestamp.now().strftime("%H:%M"),
+                    }
+                )
+
+                # Show context in expander
+                with st.expander("View Search Context"):
+                    st.json(context)
+
+            except Exception as e:
+                error_message = f"Error processing query: {str(e)}"
+                st.session_state.messages.append(
+                    {
+                        "role": "assistant",
+                        "content": error_message,
+                        "timestamp": pd.Timestamp.now().strftime("%H:%M"),
+                    }
+                )
+
+        st.rerun()
+
+
+def display_message(msg: str, is_user: bool = False, timestamp: str = "") -> None:
+    """Display a chat message with avatar and consistent formatting."""
+    role = "user" if is_user else "assistant"
+    message_class = "user-message" if is_user else "assistant-message"
+    avatar = st.session_state.user_avatar if is_user else st.session_state.bot_avatar
+
+    message_container = f"""
+    <div class="chat-message {message_class}">
+        <div class="avatar">
+            <div style="font-size: 25px; text-align: center;">{avatar}</div>
+        </div>
+        <div class="message-content-wrapper">
+            <div class="message-bubble">
+                <div class="message-content">
+                    {msg}
+                </div>
+            </div>
+            <div class="timestamp">{timestamp}</div>
+        </div>
+    </div>
+    """
+    st.markdown(message_container, unsafe_allow_html=True)
+
+
+def format_message_history() -> None:
+    """Display all messages in the chat history with consistent formatting."""
+    st.markdown('<div class="chat-container">', unsafe_allow_html=True)
+    for message in st.session_state.messages:
+        timestamp = message.get("timestamp", "")
+        display_message(
+            msg=message["content"],
+            is_user=(message["role"] == "user"),
+            timestamp=timestamp,
+        )
+    st.markdown("</div>", unsafe_allow_html=True)
+
+
+@st.cache_resource
+def load_css():
+    with open("styles.css", "r") as f:
+        return f.read()
+
+
+def initialize_session_state():
+    """Initialize session state variables if they don't exist."""
+    if "messages" not in st.session_state:
+        st.session_state.messages = []
+    if "response_placeholder" not in st.session_state:
+        st.session_state.response_placeholder = None
+    if "config_filepath" not in st.session_state:
+        st.session_state.config_filepath = None
+    if "data_dir" not in st.session_state:
+        st.session_state.data_dir = None
+    if "root_dir" not in st.session_state:
+        st.session_state.root_dir = "."
+    if "community_level" not in st.session_state:
+        st.session_state.community_level = 2
+    if "response_type" not in st.session_state:
+        st.session_state.response_type = "concise"
+    if "search_type" not in st.session_state:
+        st.session_state.search_type = "global"
+    if "streaming" not in st.session_state:
+        st.session_state.streaming = True
+    if "authenticated" not in st.session_state:
+        st.session_state.authenticated = False
+
+
+def main():
+    initialize_session_state()
+
+    # Authentication check
+    if not st.session_state.authenticated:
+        if auth.check_credentials():
+            st.session_state.authenticated = True
+            st.rerun()  # Rerun to reflect the authentication state
+        else:
+            st.stop()  # Stop further execution if authentication fails
+
+    # If authenticated, proceed with the main app
+    if st.session_state.authenticated:
+        # Main application content
+        st.title("PWC Home Assigment #1, Graphrag")
+
+        css = load_css()
+        st.markdown(f"<style>{css}</style>", unsafe_allow_html=True)
+
+        # Sidebar configuration
+        with st.sidebar:
+            # Display logos side by side at the top of the sidebar
+            col1, col2 = st.columns(2)
+            with col1:
+                st.markdown(
+                    '<div class="logo-container"><img class="logo-image" src="https://nexttech.pwc.co.il/wp-content/uploads/2023/12/image-2.png"></div>',
+                    unsafe_allow_html=True,
+                )
+            with col2:
+                st.markdown(
+                    '<div class="logo-container"><img class="logo-image" src="https://nexttech.pwc.co.il/wp-content/uploads/2023/12/Frame.png"></div>',
+                    unsafe_allow_html=True,
+                )
+
+            st.header("Configuration")
+            st.session_state.community_level = st.number_input(
+                "Community Level",
+                min_value=0,
+                max_value=10,
+                value=st.session_state.community_level,
+                help="Controls the granularity of the search...",
+            )
+
+            # Only show response type for global and local search
+            if st.session_state.search_type != "drift":
+                st.session_state.response_type = st.selectbox(
+                    "Response Type",
+                    options=["concise", "detailed"],
+                    index=0 if st.session_state.response_type == "concise" else 1,
+                    help="Style of response generation",
+                )
+
+            st.session_state.search_type = st.selectbox(
+                "Search Type",
+                options=["global", "local", "drift"],
+                index=(
+                    0
+                    if st.session_state.search_type == "global"
+                    else 1 if st.session_state.search_type == "local" else 2
+                ),
+                help="""Search Types:
+                - Local Search: "Focuses on finding specific information by searching through direct connections in the knowledge graph. Best for precise, fact-based queries."
+                - Global Search: "Analyzes the entire document collection at a high level using community summaries. Best for understanding broad themes and general policies."
+                - DRIFT Search: "Combines local and global search capabilities, dynamically exploring connections while gathering detailed information. Best for complex queries requiring both specific details and broader context."
+                """,
+            )
+
+            # Show streaming option only for supported search types
+            if st.session_state.search_type != "drift":
+                st.session_state.streaming = st.checkbox(
+                    "Enable Streaming",
+                    value=st.session_state.streaming,
+                    help="Stream response tokens as they're generated",
+                )
+            else:
+                st.session_state.streaming = False
+                st.info("Streaming is not available for DRIFT search")
+
+            # logout button
+            if st.button("Logout"):
+                st.session_state.clear()  # Clear all session state data
+                initialize_session_state()  # Reinitialize the session state
+                st.query_params = {"restart": "true"}  # Refresh the UI
+                st.rerun()
+
+        # Create tabs
+        tab1, tab2 = st.tabs(["Assignment Documentation", "Chat"])
+
+        # readme tab content
+        with tab1:
+            render_wiki_tab()
+
+        # Chat tab content
+        with tab2:
+            render_chat_tab()
+
+        st.sidebar.markdown(
+            """
+            <div style="position: absolute; bottom: 0; width: 100%; text-align: center; font-size: 14px; margin-bottom: -200px;">
+                Liran Baba |
+                <a href="https://linkedin.com/in/liranba" target="_blank">LinkedIn</a> |
+                <a href="https://huggingface.co/CordwainerSmith" target="_blank">HuggingFace</a>
+            </div>
+            """,
+            unsafe_allow_html=True,
+        )
+
+
+if __name__ == "__main__":
+    main()
+>>>>>>> b96b49ad31dd18a91ca19d59316e1b3ef2f531ff
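Both branches of app.py keep the chat history in `st.session_state.messages` as a list of plain dicts, appended by `render_chat_tab()` and rendered by `format_message_history()`. A minimal sketch of one entry (the values are illustrative, not from the repo's data) looks like this:

```python
# Shape of a single chat-history entry as used by render_chat_tab() /
# format_message_history(); example values are illustrative only.
import pandas as pd

message = {
    "role": "user",  # "user" or "assistant"
    "content": "What are the liquid restrictions for carry-on bags?",
    "timestamp": pd.Timestamp.now().strftime("%H:%M"),
}
```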
auth.py CHANGED
@@ -1,3 +1,4 @@
+<<<<<<< HEAD
 import os
 import streamlit as st
 
@@ -37,3 +38,44 @@ def check_credentials():
 
     # Return False if login not attempted or failed
     return False
+=======
+import os
+import streamlit as st
+
+
+def check_credentials():
+    """Handles login form and returns True if authenticated successfully."""
+
+    # Check if already authenticated
+    if st.session_state.get("authenticated", False):
+        return True  # User is already authenticated
+
+    # Retrieve credentials from environment variables (set via Hugging Face Secrets)
+    expected_username = os.environ.get("APP_USERNAME")
+    expected_password = os.environ.get("APP_PASSWORD")
+
+    if not expected_username or not expected_password:
+        st.error("Server is misconfigured: missing credentials.")
+        return False
+
+    # Show the login form only if not authenticated
+    with st.form("login_form", clear_on_submit=True):
+        st.text_input("Username", key="username")
+        st.text_input("Password", type="password", key="password")
+        submit_button = st.form_submit_button("Login")
+
+    if submit_button:
+        # Validate credentials
+        if (
+            st.session_state["username"] == expected_username
+            and st.session_state["password"] == expected_password
+        ):
+            st.session_state["authenticated"] = True  # Mark user as authenticated
+            return True
+        else:
+            st.error("😕 Incorrect username or password")
+            return False  # Indicate failed authentication
+
+    # Return False if login not attempted or failed
+    return False
+>>>>>>> b96b49ad31dd18a91ca19d59316e1b3ef2f531ff
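`check_credentials()` reads the expected credentials from the `APP_USERNAME` and `APP_PASSWORD` environment variables; on Hugging Face Spaces these would be configured as repository secrets. A minimal local sketch of wiring that up is shown below; the placeholder values and the idea of seeding the variables from a script are assumptions for illustration, not part of this commit.

```python
# Minimal local sketch (assumed setup, not part of this commit):
# auth.check_credentials() reads APP_USERNAME / APP_PASSWORD, so they must be
# present in the environment before the Streamlit app starts.
import os

os.environ.setdefault("APP_USERNAME", "demo-user")      # placeholder value
os.environ.setdefault("APP_PASSWORD", "demo-password")  # placeholder value

import auth  # the module added in this commit

# Within a running `streamlit run app.py` session this renders the login form
# and returns True only after a matching username/password is submitted.
if auth.check_credentials():
    print("authenticated")
```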
knowledge_graph.html CHANGED
The diff for this file is too large to render. See raw diff
 
query_config.yaml CHANGED
@@ -1,3 +1,4 @@
+<<<<<<< HEAD
 drift_search:
   max_tokens: 4000
   drift_k_followups: 3
@@ -20,4 +21,28 @@ global_search:
   data_max_tokens: 8000
   map_max_tokens: 1000
   reduce_max_tokens: 2000
+=======
+drift_search:
+  max_tokens: 4000
+  drift_k_followups: 3
+  n_depth: 2
+  local_search_text_unit_prop: 0.6
+  local_search_community_prop: 0.4
+  local_search_top_k_mapped_entities: 10
+  local_search_top_k_relationships: 10
+
+local_search:
+  text_unit_prop: 0.5
+  community_prop: 0.3
+  conversation_history_max_turns: 5
+  top_k_mapped_entities: 10
+  top_k_relationships: 10
+  max_tokens: 8000
+
+global_search:
+  max_tokens: 8000
+  data_max_tokens: 8000
+  map_max_tokens: 1000
+  reduce_max_tokens: 2000
+>>>>>>> b96b49ad31dd18a91ca19d59316e1b3ef2f531ff
   concurrency: 16
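As committed, query_config.yaml still contains the conflict markers, so it will not parse until they are removed, and the commit does not show where this file is consumed. Assuming the markers are resolved and PyYAML is available, a minimal sketch of loading the search parameters might look like this:

```python
# Hypothetical loader for query_config.yaml (the commit does not show where the
# file is read); assumes the conflict markers above have been resolved first.
import yaml  # PyYAML, assumed to be installed

with open("query_config.yaml", "r") as f:
    query_config = yaml.safe_load(f)

# Both sides of the conflict define the same global_search keys.
global_cfg = query_config["global_search"]
print(global_cfg["max_tokens"], global_cfg["data_max_tokens"], global_cfg["reduce_max_tokens"])
```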
requirements.txt CHANGED
@@ -1,3 +1,8 @@
+<<<<<<< HEAD
 streamlit==1.40.1
 pandas
+=======
+streamlit==1.40.1
+pandas
+>>>>>>> b96b49ad31dd18a91ca19d59316e1b3ef2f531ff
 graphrag==0.4.1
search_handlers.py CHANGED
@@ -1,3 +1,4 @@
+<<<<<<< HEAD
 import asyncio
 from pathlib import Path
 import pandas as pd
@@ -283,3 +284,290 @@ def run_drift_search(
     )
     reporter.success(f"DRIFT Search Response:\n{response}")
     return response, context_data
+=======
+import asyncio
+from pathlib import Path
+import pandas as pd
+from typing import Tuple, Optional
+from graphrag.config import GraphRagConfig, load_config, resolve_paths
+from graphrag.index.create_pipeline_config import create_pipeline_config
+from graphrag.logging import PrintProgressReporter
+from graphrag.utils.storage import _create_storage, _load_table_from_storage
+import graphrag.api as api
+
+
+class StreamlitProgressReporter(PrintProgressReporter):
+    def __init__(self, placeholder):
+        super().__init__("")
+        self.placeholder = placeholder
+
+    def success(self, message: str):
+        self.placeholder.success(message)
+
+
+def _resolve_parquet_files(
+    root_dir: str,
+    config: GraphRagConfig,
+    parquet_list: list[str],
+    optional_list: list[str],
+) -> dict[str, pd.DataFrame]:
+    """Read parquet files to a dataframe dict."""
+    dataframe_dict = {}
+    pipeline_config = create_pipeline_config(config)
+    storage_obj = _create_storage(root_dir=root_dir, config=pipeline_config.storage)
+
+    for parquet_file in parquet_list:
+        df_key = parquet_file.split(".")[0]
+        df_value = asyncio.run(
+            _load_table_from_storage(name=parquet_file, storage=storage_obj)
+        )
+        dataframe_dict[df_key] = df_value
+
+    for optional_file in optional_list:
+        file_exists = asyncio.run(storage_obj.has(optional_file))
+        df_key = optional_file.split(".")[0]
+        if file_exists:
+            df_value = asyncio.run(
+                _load_table_from_storage(name=optional_file, storage=storage_obj)
+            )
+            dataframe_dict[df_key] = df_value
+        else:
+            dataframe_dict[df_key] = None
+
+    return dataframe_dict
+
+
+def run_global_search(
+    config_filepath: Optional[str],
+    data_dir: Optional[str],
+    root_dir: str,
+    community_level: int,
+    response_type: str,
+    streaming: bool,
+    query: str,
+    progress_placeholder,
+) -> Tuple[str, dict]:
+    """Perform a global search with a given query."""
+    root = Path(root_dir).resolve()
+    config = load_config(root, config_filepath)
+    reporter = StreamlitProgressReporter(progress_placeholder)
+
+    config.storage.base_dir = data_dir or config.storage.base_dir
+    resolve_paths(config)
+
+    dataframe_dict = _resolve_parquet_files(
+        root_dir=root_dir,
+        config=config,
+        parquet_list=[
+            "create_final_nodes.parquet",
+            "create_final_entities.parquet",
+            "create_final_community_reports.parquet",
+        ],
+        optional_list=[],
+    )
+
+    final_nodes: pd.DataFrame = dataframe_dict["create_final_nodes"]
+    final_entities: pd.DataFrame = dataframe_dict["create_final_entities"]
+    final_community_reports: pd.DataFrame = dataframe_dict[
+        "create_final_community_reports"
+    ]
+
+    if streaming:
+
+        async def run_streaming_search():
+            full_response = ""
+            context_data = None
+            get_context_data = True
+            try:
+                async for stream_chunk in api.global_search_streaming(
+                    config=config,
+                    nodes=final_nodes,
+                    entities=final_entities,
+                    community_reports=final_community_reports,
+                    community_level=community_level,
+                    response_type=response_type,
+                    query=query,
+                ):
+                    if get_context_data:
+                        context_data = stream_chunk
+                        get_context_data = False
+                    else:
+                        full_response += stream_chunk
+                        progress_placeholder.markdown(full_response)
+            except Exception as e:
+                progress_placeholder.error(f"Error during streaming search: {e}")
+                return None, None
+
+            return full_response, context_data
+
+        result = asyncio.run(run_streaming_search())
+        if result is None:
+            return "", {}  # Graceful fallback
+        return result
+
+    # Non-streaming logic
+    try:
+        response, context_data = asyncio.run(
+            api.global_search(
+                config=config,
+                nodes=final_nodes,
+                entities=final_entities,
+                community_reports=final_community_reports,
+                community_level=community_level,
+                response_type=response_type,
+                query=query,
+            )
+        )
+        reporter.success(f"Global Search Response:\n{response}")
+        return response, context_data
+    except Exception as e:
+        progress_placeholder.error(f"Error during global search: {e}")
+        return "", {}  # Graceful fallback
+
+
+def run_local_search(
+    config_filepath: Optional[str],
+    data_dir: Optional[str],
+    root_dir: str,
+    community_level: int,
+    response_type: str,
+    streaming: bool,
+    query: str,
+    progress_placeholder,
+) -> Tuple[str, dict]:
+    """Perform a local search with a given query."""
+    root = Path(root_dir).resolve()
+    config = load_config(root, config_filepath)
+    reporter = StreamlitProgressReporter(progress_placeholder)
+
+    config.storage.base_dir = data_dir or config.storage.base_dir
+    resolve_paths(config)
+
+    dataframe_dict = _resolve_parquet_files(
+        root_dir=root_dir,
+        config=config,
+        parquet_list=[
+            "create_final_nodes.parquet",
+            "create_final_community_reports.parquet",
+            "create_final_text_units.parquet",
+            "create_final_relationships.parquet",
+            "create_final_entities.parquet",
+        ],
+        optional_list=["create_final_covariates.parquet"],
+    )
+
+    final_nodes: pd.DataFrame = dataframe_dict["create_final_nodes"]
+    final_community_reports: pd.DataFrame = dataframe_dict[
+        "create_final_community_reports"
+    ]
+    final_text_units: pd.DataFrame = dataframe_dict["create_final_text_units"]
+    final_relationships: pd.DataFrame = dataframe_dict["create_final_relationships"]
+    final_entities: pd.DataFrame = dataframe_dict["create_final_entities"]
+    final_covariates: Optional[pd.DataFrame] = dataframe_dict["create_final_covariates"]
+
+    if streaming:
+
+        async def run_streaming_search():
+            full_response = ""
+            context_data = None
+            get_context_data = True
+            async for stream_chunk in api.local_search_streaming(
+                config=config,
+                nodes=final_nodes,
+                entities=final_entities,
+                community_reports=final_community_reports,
+                text_units=final_text_units,
+                relationships=final_relationships,
+                covariates=final_covariates,
+                community_level=community_level,
+                response_type=response_type,
+                query=query,
+            ):
+                if get_context_data:
+                    context_data = stream_chunk
+                    get_context_data = False
+                else:
+                    full_response += stream_chunk
+                    progress_placeholder.markdown(full_response)
+            return full_response, context_data
+
+        return asyncio.run(run_streaming_search())
+
+    response, context_data = asyncio.run(
+        api.local_search(
+            config=config,
+            nodes=final_nodes,
+            entities=final_entities,
+            community_reports=final_community_reports,
+            text_units=final_text_units,
+            relationships=final_relationships,
+            covariates=final_covariates,
+            community_level=community_level,
+            response_type=response_type,
+            query=query,
+        )
+    )
+    reporter.success(f"Local Search Response:\n{response}")
+    return response, context_data
+
+
+def run_drift_search(
+    config_filepath: Optional[str],
+    data_dir: Optional[str],
+    root_dir: str,
+    community_level: int,
+    response_type: str,
+    streaming: bool,
+    query: str,
+    progress_placeholder,
+) -> Tuple[str, dict]:
+    """Perform a DRIFT search with a given query."""
+    root = Path(root_dir).resolve()
+    config = load_config(root, config_filepath)
+    reporter = StreamlitProgressReporter(progress_placeholder)
+
+    config.storage.base_dir = data_dir or config.storage.base_dir
+    resolve_paths(config)
+
+    dataframe_dict = _resolve_parquet_files(
+        root_dir=root_dir,
+        config=config,
+        parquet_list=[
+            "create_final_nodes.parquet",
+            "create_final_entities.parquet",
+            "create_final_community_reports.parquet",
+            "create_final_text_units.parquet",
+            "create_final_relationships.parquet",
+        ],
+        optional_list=[],  # Remove covariates as it's not supported
+    )
+
+    final_nodes: pd.DataFrame = dataframe_dict["create_final_nodes"]
+    final_entities: pd.DataFrame = dataframe_dict["create_final_entities"]
+    final_community_reports: pd.DataFrame = dataframe_dict[
+        "create_final_community_reports"
+    ]
+    final_text_units: pd.DataFrame = dataframe_dict["create_final_text_units"]
+    final_relationships: pd.DataFrame = dataframe_dict["create_final_relationships"]
+
+    # Note: DRIFT search doesn't support streaming
+    if streaming:
+        progress_placeholder.warning(
+            "Streaming is not supported for DRIFT search. Using standard search instead."
+        )
+
+    response, context_data = asyncio.run(
+        api.drift_search(
+            config=config,
+            nodes=final_nodes,
+            entities=final_entities,
+            community_reports=final_community_reports,
+            text_units=final_text_units,
+            relationships=final_relationships,
+            community_level=community_level,
+            query=query,
+        )
+    )
+    reporter.success(f"DRIFT Search Response:\n{response}")
+    return response, context_data
+>>>>>>> b96b49ad31dd18a91ca19d59316e1b3ef2f531ff
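For reference, these handlers are called from `render_chat_tab()` in app.py with values taken from Streamlit session state. A stripped-down sketch of the same call, using the defaults that `initialize_session_state()` sets (root_dir ".", community_level 2, response_type "concise"), is shown below; invoking it outside the Streamlit app, with `st.empty()` standing in for the chat UI container, is an assumption for illustration rather than a supported entry point.

```python
# Sketch of invoking the global search handler directly, mirroring the defaults
# set in app.py's initialize_session_state(); assumes a Streamlit script context
# and an indexed GraphRAG workspace (settings.yaml plus output parquet files).
import streamlit as st
from search_handlers import run_global_search

placeholder = st.empty()
response, context = run_global_search(
    config_filepath=None,   # fall back to the settings.yaml found under root_dir
    data_dir=None,          # keep the storage.base_dir from the loaded config
    root_dir=".",
    community_level=2,
    response_type="concise",
    streaming=False,
    query="What are the liquid restrictions for carry-on bags?",
    progress_placeholder=placeholder,
)
print(response)
```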
styles.css CHANGED
@@ -1,3 +1,4 @@
+<<<<<<< HEAD
 /* Container for all messages */
 .chat-container {
     display: flex;
@@ -109,4 +110,117 @@
     /* Set maximum width */
     height: auto;
     /* Maintain aspect ratio */
+=======
+/* Container for all messages */
+.chat-container {
+    display: flex;
+    flex-direction: column;
+    gap: 1rem;
+    padding: 1rem;
+}
+
+/* Message wrapper with avatar support */
+.chat-message {
+    display: flex;
+    align-items: flex-start;
+    gap: 0.5rem;
+    width: 100%;
+    max-width: 900px;
+    margin: 0.5rem 0;
+}
+
+/* Avatar container */
+.avatar {
+    width: 40px;
+    height: 40px;
+    border-radius: 50%;
+    overflow: hidden;
+    flex-shrink: 0;
+}
+
+.avatar img {
+    width: 100%;
+    height: 100%;
+    object-fit: cover;
+}
+
+/* Message content wrapper */
+.message-content-wrapper {
+    display: flex;
+    flex-direction: column;
+    max-width: 80%;
+}
+
+/* Message bubble */
+.message-bubble {
+    padding: 1rem;
+    border-radius: 0.5rem;
+    margin: 0.2rem 0;
+}
+
+/* User message specific styling */
+.user-message {
+    flex-direction: row-reverse;
+}
+
+.user-message .message-bubble {
+    background-color: #2b313e;
+    border-top-right-radius: 0;
+    color: white;
+}
+
+/* Assistant message specific styling */
+.assistant-message .message-bubble {
+    background-color: #343741;
+    border-top-left-radius: 0;
+    color: white;
+}
+
+/* Message content */
+.message-content {
+    word-wrap: break-word;
+}
+
+/* Remove default streamlit margins */
+.stMarkdown {
+    margin: 0 !important;
+}
+
+/* Style for code blocks within messages */
+.message-content pre {
+    background-color: #1e1e1e;
+    padding: 0.5rem;
+    border-radius: 0.3rem;
+    margin: 0.5rem 0;
+    overflow-x: auto;
+}
+
+/* Improved loading spinner visibility */
+.stSpinner {
+    text-align: center;
+    margin: 1rem 0;
+}
+
+/* Time stamp styling */
+.timestamp {
+    font-size: 0.8em;
+    color: #999;
+    margin: 0.2rem 0;
+}
+
+.logo-container {
+    display: flex;
+    /* Enable flexbox layout */
+    align-items: center;
+    /* Vertically center-align items */
+    padding: 10px 0;
+    /* Add padding top/bottom */
+}
+
+.logo-image {
+    max-width: 110px;
+    /* Set maximum width */
+    height: auto;
+    /* Maintain aspect ratio */
+>>>>>>> b96b49ad31dd18a91ca19d59316e1b3ef2f531ff
 }
wiki.py CHANGED
@@ -1,3 +1,4 @@
+<<<<<<< HEAD
 # wiki.py
 # import streamlit_mermaid as stmd
 import streamlit.components.v1 as components
@@ -963,3 +964,970 @@ def render_wiki_tab():
     ```
     """
     )
+=======
+ # wiki.py
969
+ # import streamlit_mermaid as stmd
970
+ import streamlit.components.v1 as components
971
+ import streamlit as st
972
+ from streamlit.components.v1 import html
973
+
974
+
975
+ def mermaid(code: str, height: int = 600) -> None:
976
+ components.html(
977
+ f"""
978
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
979
+ <div style="height: {height}px">
980
+ <pre class="mermaid">
981
+ {code}
982
+ </pre>
983
+ </div>
984
+ <script type="module">
985
+ import mermaid from 'https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.esm.min.mjs';
986
+ mermaid.initialize({{ startOnLoad: true }});
987
+ </script>
988
+ """,
989
+ height=height,
990
+ )
991
+
992
+
993
+ def render_wiki_tab():
994
+ """Render the Wiki tab content."""
995
+ st.header("Overview")
996
+
997
+ st.markdown(
998
+ """
999
+ This documentation details the process I followed to achieve the assignment of using GraphRAG for indexing the first paragraphs of seven documents, embedding the full documents, performing initial search on the graph built from the first paragraphs, and retrieving answers from the full document content.
1000
+ """)
1001
+ st.markdown(
1002
+ """
1003
+ This project implements a specialized document processing and querying system using GraphRAG for El Al baggage requirements\allowance documentation. The system processes first paragraphs separately from full documents, enabling graph-based search while maintaining comprehensive answer retrieval capabilities.
1004
+
1005
+ """
1006
+ )
1007
+
1008
+ st.markdown(
1009
+ """
1010
+ ### Implementation Process
1011
+
1012
+ Initially, I attempted to implement this using separate processing paths for first paragraphs and full documents, but I discovered a more elegant solution through GraphRAG's source tracking and processing order capabilities. Instead of maintaining separate indexes, I configured a unified approach where documents were processed together but with clear priorities and purposes.
1013
+
1014
+ I set up the configuration to treat first paragraphs with priority 1 for graph building and full documents with priority 2 for retrieval. This was achieved through careful configuration of source tracking, processing order, and source filters in the `settings.yaml` file, which allowed me to maintain the separation of concerns.
1015
+ """
1016
+ )
1017
+
1018
+ st.markdown(
1019
+ """
1020
+ ### Final Implementation
1021
+
1022
+ The final implementation proved successful, creating a knowledge graph from the first paragraphs while maintaining access to full document content for comprehensive answers. I used entity types specific to airport security (like **Baggage Type**, **Dimension**, **Weight Limit**) and configured claim extraction to focus on relevant restrictions and allowances.
1023
+
1024
+ """
1025
+ )
1026
+
1027
+ st.markdown(
1028
+ """
1029
+ ### Using the Chat Application
1030
+
1031
+ The chat application provides an interactive interface to query the GraphRAG system. Here's how it works:
1032
+
1033
+ ##### Getting Started:
1034
+ - **Step 1**: Click on the chat tab.
1035
+ - **Step 2**: Choose the desired search type from the sidebar:
1036
+ - **Local Search**: Focuses on specific text chunks and direct relationships in the graph.
1037
+ - **Global Search**: Analyzes the entire dataset at a high level using community summaries.
1038
+ - **DRIFT Search**: Combines local and global search for complex queries requiring both detailed and contextual answers.
1039
+
1040
+ ##### Submitting a Query:
1041
+ - Enter your question in the input field at the bottom of the chat interface.
1042
+ - Depending on the selected search type, the system will:
1043
+ - Use the graph for initial navigation.
1044
+ - Retrieve answers from full documents for comprehensive responses.
1045
+
1046
+ ##### Viewing Results:
1047
+ - The assistant's response appears in the chat window, formatted for clarity.
1048
+
1049
+ ##### Key Features:
1050
+ - **Streaming Responses**: Responses are displayed in real-time for supported search types.
1051
+ - **Session History**: Previous queries and responses are retained within the session for reference.
1052
+
1053
+ ##### Example Queries:
1054
+ - "What are the liquid restrictions for carry-on bags?"
1055
+ - "How do pet carrier size restrictions compare to regular carry-on limits?"
1056
+ """
1057
+ )
1058
+
1059
+ with st.expander("Architecture", expanded=False):
1060
+ st.markdown(
1061
+ """
1062
+ The architecture of the system is designed to process data through multiple stages, including input preparation, processing, and search functionalities. Below is a detailed diagram illustrating the workflow of the system:
1063
+ """
1064
+ )
1065
+
1066
+ mermaid_code = """
1067
+ %%{init: {'theme': 'base', 'themeVariables': {'primaryColor': '#1E90FF', 'edgeLabelBackground': '#FFFFFF', 'secondaryColor': '#F0F8FF', 'tertiaryColor': '#FFFFFF', 'primaryTextColor': '#000000'}}}%%
1068
+ graph TD
1069
+ subgraph Input
1070
+ FP[First Paragraphs] --> P[Processing]
1071
+ FD[Full Documents] --> P
1072
+ end
1073
+
1074
+ subgraph Processing
1075
+ P --> IE[Entity Extraction]
1076
+ P --> CD[Community Detection]
1077
+ P --> E[Embeddings Generation]
1078
+
1079
+ IE --> G[Graph Construction]
1080
+ CD --> G
1081
+ E --> VS[Vector Store]
1082
+ end
1083
+
1084
+ subgraph Search
1085
+ Q[Query] --> DS[DRIFT Search]
1086
+ DS --> GS[Graph Search]
1087
+ DS --> FR[Full Retrieval]
1088
+ GS --> VS
1089
+ FR --> VS
1090
+ GS --> A[Answer Generation]
1091
+ FR --> A
1092
+ end
1093
+ """
1094
+ mermaid(mermaid_code, height=600)
1095
+
1096
+ with st.expander("Graph Analysis", expanded=False):
1097
+ st.markdown("### System Components Breakdown:")
1098
+
1099
+ mermaid_code = """
1100
+ pie
1101
+ title "System Components"
1102
+ "Documents" : 14
1103
+ "Text Units" : 36
1104
+ "Entities" : 315
1105
+ "Relationships" : 372
1106
+ "Communities" : 66
1107
+ """
1108
+ mermaid(mermaid_code, height=500)
1109
+
1110
+ # Description and graph statistics
1111
+ st.markdown(
1112
+ """
1113
+ ### Knowledge Graph Visualization
1114
+
1115
+ The graph displayed below represents the relationships between various entities extracted from the input data. Nodes in the graph correspond to entities like "Documents," "Policies," and "Restrictions," while edges represent the relationships or connections between these entities. The graph is constructed using the extracted entities and relationships, processed through NetworkX, and visualized with Pyvis.
1116
+
1117
+ **Process of Creation**:
1118
+ - **Data Preparation**: Entities and relationships are extracted and saved as `create_final_nodes.parquet` and `create_final_relationships.parquet` files, respectively.
1119
+ - **Graph Construction**: Using NetworkX, nodes and edges are added based on the extracted data.
1120
+ - **Visualization**: Pyvis is used to create an interactive visualization with options like physics-based layout, node grouping, and hover effects.
1121
+
1122
+ The resulting graph provides insights into the data's structure, including:
1123
+ - Node type distribution
1124
+ - Community detection levels
1125
+ - Connectivity patterns
1126
+
1127
+ Explore the graph below to understand the relationships between key entities.
1128
+ """
1129
+ )
1130
+
1131
+ # Load and display the graph visualization (HTML file)
1132
+ with open("knowledge_graph.html", "r") as f:
1133
+ html_content = f.read()
1134
+ st.components.v1.html(html_content, height=800)
1135
+
1136
+ # Graph statistics
1137
+ st.markdown(
1138
+ """
1139
+ ### Graph Statistics:
1140
+
1141
+ * **Number of nodes:** 427
1142
+ * **Number of edges:** 453
1143
+
1144
+ #### Node Type Distribution:
1145
+
1146
+ | Node Type | Distribution |
1147
+ |-----------------------|--------------|
1148
+ | REQUIRED DOCUMENT | 39 |
1149
+ | SERVICE TYPE | 35 |
1150
+ | POLICY | 30 |
1151
+ | RESTRICTION | 27 |
1152
+ | SPECIAL ITEM | 26 |
1153
+ | PROHIBITED ITEM | 23 |
1154
+ | AIRPORT | 22 |
1155
+ | BAGGAGE TYPE | 21 |
1156
+ | SERVICE LOCATION | 18 |
1157
+ | DANGEROUS GOOD | 14 |
1158
+ | ALLOWANCE | 13 |
1159
+ | GEO | 12 |
1160
+ | MEASUREMENT UNIT | 11 |
1161
+ | FEE STRUCTURE | 10 |
1162
+ | LINEAR DIMENSION | 8 |
1163
+ | TIME PERIOD | 8 |
1164
+ | CABIN SECTION | 8 |
1165
+ | WEIGHT | 8 |
1166
+ | WEIGHT CATEGORY | 7 |
1167
+ | AIRLINE | 7 |
1168
+ | CITY | 7 |
1169
+ | DIMENSION | 6 |
1170
+ | VALUABLE ITEM | 5 |
1171
+ | ROUTE TYPE | 5 |
1172
+ | TRAVEL CLASS | 5 |
1173
+ | ORGANIZATION | 5 |
1174
+ | PASSENGER TYPE | 4 |
1175
+ | RESTRICTED ITEM | 3 |
1176
+ | CURRENCY | 2 |
1177
+ | EXEMPTION | 2 |
1178
+ | LABEL TYPE | 2 |
1179
+ | MATERIAL TYPE | 2 |
1180
+ | CARGO | 2 |
1181
+ | MEMBERSHIP LEVEL | 2 |
1182
+ | AIRCRAFT TYPE | 1 |
1183
+ | REGION | 1 |
1184
+ | COUNTRY | 1 |
1185
+ | SIZE CATEGORY | 1 |
1186
+ | WHEEL CONFIGURATION | 1 |
1187
+ | TAG CATEGORY | 1 |
1188
+ | GROUP CATEGORY | 1 |
1189
+
1190
+ #### Most Connected Nodes:
1191
+
1192
+ | Node | Connections |
1193
+ |--------------------|-------------|
1194
+ | EL AL | 49 |
1195
+ | ANIMAL | 29 |
1196
+ | CHECKED BAGGAGE | 25 |
1197
+ | BAGGAGE | 21 |
1198
+ | PET | 19 |
1199
+ """
1200
+ )
1201
+
1202
+ with st.expander("Implementation Results", expanded=False):
1203
+ st.markdown(
1204
+ """
1205
+ ### Document Processing
1206
+
1207
+ * **Total Documents**: 14 (7 first paragraphs + 7 full documents)
1208
+ * **Text Units**: 36
1209
+ * **Entities**: 315
1210
+ * **Relationships**: 372
1211
+ * **Communities**: 66 across 4 levels
1212
+
1213
+ ### Community Structure
1214
+
1215
+ * **Level 0**: 11 communities
1216
+ * **Level 1**: 44 communities
1217
+ * **Level 2**: 9 communities
1218
+ * **Level 3**: 2 communities
1219
+ """
1220
+ )
1221
+
1222
+ st.markdown("### System Operation Flow")
1223
+
1224
+ mermaid_code = """
1225
+ sequenceDiagram
1226
+ participant U as User
1227
+ participant Q as Query Engine
1228
+ participant G as Graph Search
1229
+ participant V as Vector Store
1230
+ participant D as Document Retrieval
1231
+
1232
+ U->>Q: Submit Query
1233
+ Q->>G: Search in First Paragraph Graph
1234
+ G->>V: Lookup Relevant Entities
1235
+ V->>D: Retrieve Full Content
1236
+ D->>Q: Return Comprehensive Answer
1237
+ Q->>U: Present Response
1238
+ """
1239
+
1240
+ mermaid(mermaid_code, height=400)
1241
+
1242
+ with st.expander("Implementation Details", expanded=False):
1243
+ st.markdown(
1244
+ """
1245
+ The implementation of the system follows a processing pipeline that integrates data from the first paragraphs and full documents, creating a unified structure for efficient querying. Below is the pipeline representation:
1246
+ """
1247
+ )
1248
+
1249
+ mermaid_code = """
1250
+ flowchart TB
1251
+ subgraph First Paragraphs
1252
+ FP[Load First Paragraphs] --> EP[Extract Entities]
1253
+ EP --> RP[Build Relationships]
1254
+ RP --> CP[Create Communities]
1255
+ end
1256
+
1257
+ subgraph Full Documents
1258
+ FD[Load Full Documents] --> CH[Chunk Documents]
1259
+ CH --> EF[Generate Embeddings]
1260
+ end
1261
+
1262
+ subgraph Integration
1263
+ CP --> VS[(Vector Store)]
1264
+ EF --> VS
1265
+ end
1266
+
1267
+ subgraph Search
1268
+ Q[Query] --> GS[Graph Search]
1269
+ GS --> VS
1270
+ VS --> RD[Retrieve Details]
1271
+ RD --> AG[Answer Generation]
1272
+ end
1273
+ """
1274
+ mermaid(mermaid_code, height=800)
1275
+ with st.expander("Requirements Fulfillment", expanded=False):
1276
+ st.markdown(
1277
+ """
1278
+ ### Requirements Fulfillment
1279
+
1280
+ **First Paragraph Processing**: ✓
1281
+ * Implemented through `source_filter` and `processing_order`
1282
+ * Verified by entity and relationship extraction
1283
+
1284
+ **Full Document Embeddings**: ✓
1285
+ * Stored in LanceDB
1286
+ * Accessible for comprehensive retrieval
1287
+
1288
+ **Graph-Based Search**: ✓
1289
+ * Communities and relationships established
1290
+ * DRIFT search implemented
1291
+
1292
+ **Complete Answer Retrieval**: ✓
1293
+ * Source priority configuration
1294
+ * Full document content available
1295
+
1296
+ ### Performance Metrics
1297
+
1298
+ * **Indexing Speed**: 212.44 seconds total
1299
+ * **Graph Density**: 372 relationships among 315 entities
1300
+ * **Community Structure**: 4-level hierarchy
1301
+ * **Vector Store Size**: 3 Lance files for different embedding types
1302
+ """
1303
+ )
1304
+
1305
+ with st.expander("Achieving the Requirement", expanded=False):
1306
+ st.markdown("### Source-Based Processing Control:")
1307
+
1308
+ st.markdown(
1309
+ """
1310
+ ```yaml
1311
+ input:
1312
+ source_tracking: true
1313
+ processing_order:
1314
+ - path: "first_paragraphs"
1315
+ priority: 1
1316
+ purpose: "graph_building"
1317
+ - path: "full_documents"
1318
+ priority: 2
1319
+ purpose: "retrieval"
1320
+ ```
1321
+ """
1322
+ )
1323
+ st.markdown(
1324
+ """
1325
+ This configuration ensures that GraphRAG knows which content is for graph building (first paragraphs) and which is for retrieval (full documents). The priority system makes sure first paragraphs are processed first and used primarily for the knowledge graph construction.
1326
+ """
1327
+ )
1328
+
1329
+ st.markdown("### Targeted Entity and Claim Extraction:")
1330
+
1331
+ st.markdown(
1332
+ """
1333
+ ```yaml
1334
+ entity_extraction:
1335
+ source_filter: "first_paragraphs"
1336
+ max_gleanings: 2
1337
+
1338
+ claim_extraction:
1339
+ source_filter: "first_paragraphs"
1340
+ ```
1341
+ """
1342
+ )
1343
+ st.markdown(
1344
+ """
1345
+ These filters ensure that the knowledge graph (entities, relationships, and claims) is built only from the first paragraphs. This is crucial because it means our initial search will only traverse the graph built from these first paragraphs, matching the requirement. The `max_gleanings: 2` allows for thorough extraction while maintaining precision.
1346
+ """
1347
+ )
1348
+
1349
+ st.markdown("### Search Priority and Retrieval Control:")
1350
+
1351
+ st.markdown(
1352
+ """
1353
+ ```yaml
1354
+ local_search:
1355
+ source_priority:
1356
+ graph_search: "first_paragraphs"
1357
+ answer_retrieval: "full_documents"
1358
+ text_unit_prop: 0.7
1359
+ community_prop: 0.3
1360
+ ```
1361
+ """
1362
+ )
1363
+ st.markdown(
1364
+ """
1365
+ This is where the magic happens - when a query is made, the system first searches using the graph built from first paragraphs (`graph_search: "first_paragraphs"`), but when it needs to construct the answer, it pulls the content from the full documents (`answer_retrieval: "full_documents"`).
1366
+
1367
+ The text_unit and community proportions ensure we're making good use of both the graph structure and the actual content. Looking at the output files we generated (`create_final_entities.parquet`, `create_final_relationships.parquet`, etc.), we can see this two-phase approach in action: the graph structure is built and stored separately from the full content, but they're linked through the unified vector store in LanceDB, allowing seamless transitions between graph search and content retrieval during query processing.
1368
+ """
1369
+ )
1370
+
1371
+ with st.expander("Improvements to Make the Graph Creation Process Leaner and Faster", expanded=False):
1372
+ st.markdown("### Optimization of Chunk Size and Overlap:")
1373
+
1374
+ st.markdown(
1375
+ """
1376
+ ```yaml
1377
+ chunks:
1378
+ size: 300 # Reduced from 500
1379
+ overlap: 25 # Reduced from 50
1380
+ group_by_columns: [id]
1381
+ ```
1382
+ """
1383
+ )
1384
+ st.markdown(
1385
+ """
1386
+ **Rationale**:
1387
+ - Smaller chunks with minimal overlap reduce token usage.
1388
+ - Maintains context while processing fewer tokens per API call.
1389
+ - Especially efficient for first paragraphs processing.
1390
+ """
1391
+ )
1392
+
1393
+ st.markdown("### Streamline Entity Types and Claims:")
1394
+
1395
+ st.markdown(
1396
+ """
1397
+ ```yaml
1398
+ entity_extraction:
1399
+ entity_types:
1400
+ - "Baggage"
1401
+ - "Restriction"
1402
+ - "Item"
1403
+ max_gleanings: 1 # Reduced from 2
1404
+
1405
+ claim_extraction:
1406
+ enabled: false # Disable unless absolutely necessary
1407
+ ```
1408
+ """
1409
+ )
1410
+ st.markdown(
1411
+ """
1412
+ **Rationale**:
1413
+ - Fewer entity types mean fewer extraction operations.
1414
+ - Single gleaning pass is often sufficient.
1415
+ - Claims processing is expensive and often redundant.
1416
+ """
1417
+ )
1418
+
1419
+ st.markdown("### Optimize Graph Embeddings:")
1420
+
1421
+ st.markdown(
1422
+ """
1423
+ ```yaml
1424
+ embed_graph:
1425
+ enabled: true
1426
+ num_walks: 50 # Reduced from 100
1427
+ walk_length: 5 # Reduced from 10
1428
+ window_size: 3 # Reduced from 5
1429
+ iterations: 5 # Reduced from 10
1430
+ ```
1431
+ """
1432
+ )
1433
+ st.markdown(
1434
+ """
1435
+ **Rationale**:
1436
+ - Fewer random walks still capture essential graph structure.
1437
+ - Shorter walks reduce computation time.
1438
+ - Smaller window size focuses on immediate relationships.
1439
+ """
1440
+ )
1441
+
1442
+ st.markdown("### Batch Processing and Parallelization:")
1443
+
1444
+ st.markdown(
1445
+ """
1446
+ ```yaml
1447
+ embeddings:
1448
+ async_mode: asyncio # Changed from threaded
1449
+ batch_size: 32 # Increased from 16
1450
+ batch_max_tokens: 8191
1451
+ ```
1452
+ """
1453
+ )
1454
+ st.markdown(
1455
+ """
1456
+ **Rationale**:
1457
+ - Asyncio performs better than threading for I/O-bound operations.
1458
+ - Larger batch size reduces API calls.
1459
+ - Maximizes throughput within token limits.
1460
+ """
1461
+ )
1462
+
1463
+ st.markdown("### Community Structure Optimization:")
1464
+
1465
+ st.markdown(
1466
+ """
1467
+ ```yaml
1468
+ cluster_graph:
1469
+ max_cluster_size: 15 # Increased slightly
1470
+ min_cluster_size: 3 # Added parameter
1471
+
1472
+ community_reports:
1473
+ max_input_length: 2000 # Reduced from default
1474
+ max_length: 1000 # Reduced summary length
1475
+ ```
1476
+ """
1477
+ )
1478
+ st.markdown(
1479
+ """
1480
+ **Rationale**:
1481
+ - Balanced cluster sizes reduce processing overhead.
1482
+ - Shorter community reports still maintain essential information.
1483
+ - Fewer tokens per report means faster processing.
1484
+ """
1485
+ )
1486
+
1487
+ st.markdown("### Caching and Storage:")
1488
+
1489
+ st.markdown(
1490
+ """
1491
+ ```yaml
1492
+ cache:
1493
+ type: file
1494
+ base_dir: "cache"
1495
+ compression: true # Add compression
1496
+ cache_embeddings: true
1497
+
1498
+ storage:
1499
+ type: file
1500
+ base_dir: "output"
1501
+ compression: true # Add compression
1502
+ ```
1503
+ """
1504
+ )
1505
+ st.markdown(
1506
+ """
1507
+ **Rationale**:
1508
+ - Compression reduces I/O overhead.
1509
+ - Caching embeddings prevents recomputation.
1510
+ - File-based storage is faster than blob storage for local processing.
1511
+ """
1512
+ )
1513
+
1514
+ st.markdown("### Disable Non-Essential Features:")
1515
+
1516
+ st.markdown(
1517
+ """
1518
+ ```yaml
1519
+ umap:
1520
+ enabled: false # Disable unless visualization needed
1521
+
1522
+ snapshots:
1523
+ graphml: false
1524
+ raw_entities: false
1525
+ top_level_nodes: false
1526
+ ```
1527
+ """
1528
+ )
1529
+ st.markdown(
1530
+ """
1531
+ **Rationale**:
1532
+ - UMAP calculation is computationally expensive.
1533
+ - Snapshots are useful for debugging but add overhead.
1534
+ """
1535
+ )
1536
+
1537
+ st.markdown("### LLM Configuration Optimization:")
1538
+
1539
+ st.markdown(
1540
+ """
1541
+ ```yaml
1542
+ llm:
1543
+ concurrent_requests: 25
1544
+ tokens_per_minute: 150000
1545
+ requests_per_minute: 10000
1546
+ max_retries: 5 # Reduced from 10
1547
+ ```
1548
+ """
1549
+ )
1550
+ st.markdown(
1551
+ """
1552
+ **Rationale**:
1553
+ - Balanced concurrency prevents rate limiting.
1554
+ - Fewer retries reduce waiting time.
1555
+ - Token and request limits prevent throttling.
1556
+ """
1557
+ )
1558
+
1559
+ with st.expander("Query Types", expanded=False):
1560
+ st.markdown("### Local Search:")
1561
+
1562
+ st.markdown(
1563
+ """
1564
+ ```yaml
1565
+ local_search:
1566
+ text_unit_prop: 0.7 # Focus on specific text chunks
1567
+ community_prop: 0.3 # Some consideration of community context
1568
+ top_k_mapped_entities: 15
1569
+ source_priority:
1570
+ graph_search: "first_paragraphs"
1571
+ answer_retrieval: "full_documents"
1572
+ ```
1573
+ """
1574
+ )
1575
+ st.markdown(
1576
+ """
1577
+ **Best when**: Looking for specific baggage rules or restrictions
1578
+ **Example Query**: "What are the liquid restrictions for carry-on bags?"
1579
+
1580
+ **How it works with our data**:
1581
+ - Searches for entities in first paragraphs (like "liquid", "carry-on").
1582
+ - Follows direct relationships in the graph.
1583
+ - Retrieves detailed rules from full documents.
1584
+
1585
+ **Meets requirement?** Yes, but in a limited way - focuses on direct connections.
1586
+ """
1587
+ )
1588
+
1589
+ st.markdown("### Global Search:")
1590
+
1591
+ st.markdown(
1592
+ """
1593
+ ```yaml
1594
+ global_search:
1595
+ max_tokens: 4000
1596
+ data_max_tokens: 4000
1597
+ min_score_threshold: 0.1
1598
+ allow_general_knowledge: false
1599
+ ```
1600
+ """
1601
+ )
1602
+ st.markdown(
1603
+ """
1604
+ **Best when**: Understanding overall policies or themes
1605
+ **Example Query**: "What are the main types of baggage restrictions?"
1606
+
1607
+ **How it works with our data**:
1608
+ - Looks at community summaries built from first paragraphs.
1609
+ - Provides broader context about baggage policies.
1610
+ - Pulls supporting details from full documents.
1611
+
1612
+ **Meets requirement?** Partially - good for overview but might miss specific connections.
1613
+ """
1614
+ )
1615
+
1616
+ st.markdown("### DRIFT Search (Dynamic Reasoning and Inference with Flexible Traversal):")
1617
+
1618
+ st.markdown(
1619
+ """
1620
+ ```yaml
1621
+ local_search:
1622
+ source_priority:
1623
+ graph_search: "first_paragraphs"
1624
+ answer_retrieval: "full_documents"
1625
+ ```
1626
+ """
1627
+ )
1628
+ st.markdown(
1629
+ """
1630
+ **Best when**: Complex queries requiring both specific details and context
1631
+ **Example Query**: "How do pet carrier size restrictions compare to regular carry-on limits?"
1632
+
1633
+ **How it works with our data**:
1634
+ - Starts with the first-paragraphs graph to understand the relationships between:
1635
+   - Pet carriers
1636
+   - Regular carry-on bags
1637
+   - Size restrictions
1638
+ - Uses community understanding to find related policies.
1639
+ - Retrieves specific details from full documents.
1640
+
1641
+ **Meets requirement?** Yes, most comprehensively.
1642
+ """
1643
+ )
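+ st.markdown(
+ """
+ A schematic of that two-phase flow (hypothetical toy data; the real traversal and retrieval are done by the graphrag engine):
+
+ ```python
+ # Phase 1 data: a tiny stand-in for the graph built from first paragraphs.
+ first_paragraph_graph = {
+     "pet carrier": ["size restriction", "carry-on bag"],
+     "carry-on bag": ["size restriction"],
+ }
+ # Phase 2 data: a stand-in for the full documents used for answer retrieval.
+ full_documents = {
+     "size restriction": "Detailed size-limit text from the relevant policy document.",
+ }
+
+ def drift_style_lookup(entity: str) -> list[str]:
+     related = first_paragraph_graph.get(entity, [])  # phase 1: follow graph edges
+     return [full_documents[r] for r in related if r in full_documents]  # phase 2: fetch details
+
+ print(drift_style_lookup("pet carrier"))
+ ```
+ """
+ )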
1644
+
1645
+ st.markdown("### Best Choice for Our Requirement:")
1646
+ st.markdown(
1647
+ """
1648
+ **DRIFT Search** is the most suitable because:
1649
+ - It naturally implements our two-phase requirement:
1650
+   - Initial search on graph (from first paragraphs).
1651
+   - Answer retrieval from full documents.
1652
+ - It can handle complex queries that need:
1653
+   - Understanding of relationships (from graph).
1654
+   - Specific details (from full documents).
1655
+ - It can dynamically adjust between:
1656
+   - Local search when specific rules are needed.
1657
+   - Global search when context is important.
1658
+ """
1659
+ )
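+ st.markdown(
+ """
+ For reference, the same index can also be queried outside this app. The sketch below shells out to the `graphrag.query` module with `subprocess`; the exact flags (and whether `drift` is available as a method) vary between graphrag releases, so verify them against the installed version's `--help` output before relying on this:
+
+ ```python
+ import subprocess
+
+ def graphrag_query(method: str, query: str, root: str = ".") -> str:
+     # method is expected to be "local", "global" or (on newer releases) "drift".
+     result = subprocess.run(
+         ["python", "-m", "graphrag.query", "--root", root, "--method", method, query],
+         capture_output=True,
+         text=True,
+         check=True,
+     )
+     return result.stdout
+
+ # Example:
+ # print(graphrag_query("local", "What are the liquid restrictions for carry-on bags?"))
+ ```
+ """
+ )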
1660
+ with st.expander("Configuration: full `settings.yaml`", expanded=False):
1661
+
1662
+ st.markdown(
1663
+ """
1664
+ ```yaml
1665
+ # Root configuration for GraphRAG, a system leveraging LLMs for advanced Retrieval Augmented Generation.
1666
+
1667
+ encoding_model: cl100k_base
1668
+ # Specifies the model used for token encoding. The default 'cl100k_base' is common for OpenAI's text models,
1669
+ # determining how text is tokenized into machine-readable units.
1670
+
1671
+ skip_workflows: []
1672
+ # A list of workflows to skip during execution. Empty indicates all workflows are executed.
1673
+
1674
+ llm:
1675
+ api_key: ${GRAPHRAG_API_KEY}
1676
+ # Placeholder for the API key, replaced dynamically from environment variables.
1677
+ # Ensures secure API access for LLM queries.
1678
+
1679
+ type: openai_chat
1680
+ # Defines the type of LLM interface used. Here, it connects to OpenAI's chat-based API.
1681
+
1682
+ model: gpt-4o-mini
1683
+ # Specifies the model variant to use.
1684
+
1685
+ model_supports_json: true
1686
+ # Indicates whether the LLM natively supports JSON responses, useful for structured outputs.
1687
+
1688
+ max_tokens: 4000
1689
+ # Maximum number of tokens in the output. Balances performance and context length.
1690
+
1691
+ temperature: 0
1692
+ # Controls randomness in outputs. 0 makes responses as deterministic as possible, preferred for accuracy.
1693
+
1694
+ embeddings:
1695
+ async_mode: threaded
1696
+ # Asynchronous embedding computation mode. 'threaded' uses multi-threading for better performance.
1697
+
1698
+ batch_size: 16
1699
+ # Number of data points processed per batch during embedding, balancing speed and resource use.
1700
+
1701
+ vector_store:
1702
+ type: lancedb
1703
+ # Database type used for storing vectorized embeddings. 'lancedb' supports efficient vector operations.
1704
+
1705
+ db_uri: 'output/lancedb'
1706
+ # URI pointing to the database location where embeddings are stored.
1707
+
1708
+ container_name: default
1709
+ # Logical name for the container storing vector data.
1710
+
1711
+ overwrite: true
1712
+ # Whether to overwrite existing vectors. True allows updating the database during reruns.
1713
+
1714
+ llm:
1715
+ api_key: ${GRAPHRAG_API_KEY}
1716
+ type: openai_embedding
1717
+ model: text-embedding-3-small
1718
+ # Dedicated LLM for embedding tasks. A smaller, specialized model is specified for embeddings.
1719
+
1720
+ chunks:
1721
+ size: 500
1722
+ # Number of tokens per chunk of text. Controls granularity for processing long documents.
1723
+
1724
+ overlap: 50
1725
+ # Overlap between adjacent chunks to ensure continuity in analysis.
1726
+
1727
+ group_by_columns: [id]
1728
+ # Groups data by 'id' before chunking, preserving document boundaries.
1729
+
1730
+ input:
1731
+ type: file
1732
+ file_type: text
1733
+ base_dir: "input"
1734
+ file_pattern: ".*\\.txt$"
1735
+ recursive: true
1736
+ source_tracking: true
1737
+ processing_order:
1738
+ - path: "first_paragraphs"
1739
+ priority: 1
1740
+ purpose: "graph_building"
1741
+ - path: "full_documents"
1742
+ priority: 2
1743
+ purpose: "retrieval"
1744
+ # Specifies the data source for ingestion:
1745
+ # - Input is file-based text.
1746
+ # - Reads files recursively from "input" directory matching '.txt' files.
1747
+ # - Prioritizes "first_paragraphs" for graph building and full documents for retrieval.
1748
+
1749
+ entity_extraction:
1750
+ prompt: "prompts/entity_extraction.txt"
1751
+ # Path to the custom prompt used for entity extraction tasks.
1752
+
1753
+ entity_types:
1754
+ - "Baggage Type"
1755
+ - "Dimension"
1756
+ - "Linear Dimension"
1757
+ - "Weight"
1758
+ - "Material Type"
1759
+ - "Wheel Configuration"
1760
+ - "Measurement Unit"
1761
+ - "Size Category"
1762
+ - "Weight Category"
1763
+ - "Airline"
1764
+ - "Alliance"
1765
+ - "Airport"
1766
+ - "Route Type"
1767
+ - "Travel Class"
1768
+ - "Cabin Section"
1769
+ - "Aircraft Type"
1770
+ - "Restriction"
1771
+ - "Exemption"
1772
+ - "Policy"
1773
+ - "Fee Structure"
1774
+ - "Currency"
1775
+ - "Allowance"
1776
+ - "Special Item"
1777
+ - "Prohibited Item"
1778
+ - "Restricted Item"
1779
+ - "Dangerous Good"
1780
+ - "Fragile Item"
1781
+ - "Valuable Item"
1782
+ - "Required Document"
1783
+ - "Label Type"
1784
+ - "Tag Category"
1785
+ - "Service Type"
1786
+ - "Handler Role"
1787
+ - "Service Location"
1788
+ - "Time Period"
1789
+ - "Passenger Type"
1790
+ - "Membership Level"
1791
+ - "Group Category"
1792
+ # Defines the types of entities the system should extract.
1793
+
1794
+ max_gleanings: 2
1795
+ # Maximum number of re-processing rounds to refine entity detection.
1796
+
1797
+ source_filter: "first_paragraphs"
1798
+ # Restricts extraction to text from "first_paragraphs," optimizing focus.
1799
+
1800
+ claim_extraction:
1801
+ enabled: true
1802
+ # Enables claim extraction, capturing specific conditions or assertions from text.
1803
+
1804
+ claim_types:
1805
+ - "Basic Size Restriction"
1806
+ - "Oversize Condition"
1807
+ - "Weight Limit Standard"
1808
+ - "Overweight Condition"
1809
+ - "Combined Dimension Limit"
1810
+ - "Cabin Storage Requirement"
1811
+ - "Standard Fee"
1812
+ - "Excess Fee"
1813
+ - "Oversize Fee"
1814
+ - "Overweight Fee"
1815
+ - "Special Handling Fee"
1816
+ - "Season Surcharge"
1817
+ - "Route-Specific Fee"
1818
+ - "Multi-Piece Pricing"
1819
+ - "Fee Waiver Condition"
1820
+ - "Basic Allowance"
1821
+ - "Class-Based Allowance"
1822
+ - "Status-Based Allowance"
1823
+ - "Route-Based Allowance"
1824
+ - "Special Group Allowance"
1825
+ - "Seasonal Allowance"
1826
+ - "Equipment Allowance"
1827
+ - "Prohibited Item Policy"
1828
+ - "Restricted Item Condition"
1829
+ - "Dangerous Goods Policy"
1830
+ - "Special Item Restriction"
1831
+ - "Packaging Requirement"
1832
+ - "Declaration Requirement"
1833
+ - "Check-in Deadline"
1834
+ - "Special Handling Procedure"
1835
+ - "Priority Handling Rule"
1836
+ - "Transfer Handling Policy"
1837
+ - "Delivery Service Policy"
1838
+ - "Storage Policy"
1839
+ - "Liability Limit"
1840
+ - "Insurance Requirement"
1841
+ - "Claim Procedure"
1842
+ - "Compensation Policy"
1843
+ - "Time Limit Policy"
1844
+ - "Weather Restriction"
1845
+ - "Seasonal Restriction"
1846
+ - "Aircraft Limitation"
1847
+ - "Route Restriction"
1848
+ - "Connection Impact"
1849
+ - "Tag Requirement"
1850
+ - "Label Requirement"
1851
+ - "Documentation Requirement"
1852
+ - "Declaration Policy"
1853
+ - "Handling Standard"
1854
+ - "Service Level Agreement"
1855
+ - "Priority Service Standard"
1856
+ - "Delivery Time Standard"
1857
+ - "Medical Exception"
1858
+ - "Military Exception"
1859
+ - "Diplomatic Exception"
1860
+ - "Event Exception"
1861
+ - "Emergency Exception"
1862
+ # Types of claims to extract, covering diverse scenarios (e.g., fees, allowances).
1863
+
1864
+ prompt: "prompts/claim_extraction.txt"
1865
+ description: "Extract baggage measurements, weight limits, and restrictions from airline documentation."
1866
+ # Customizes the extraction logic for airline baggage policies.
1867
+
1868
+ max_gleanings: 2
1869
+ source_filter: "first_paragraphs"
1870
+ # Restricts claims to "first_paragraphs," mirroring entity extraction.
1871
+
1872
+ local_search:
1873
+ text_unit_prop: 0.7
1874
+ community_prop: 0.3
1875
+ top_k_mapped_entities: 15
1876
+ top_k_relationships: 15
1877
+ max_tokens: 4000
1878
+ source_priority:
1879
+ graph_search: "first_paragraphs"
1880
+ answer_retrieval: "full_documents"
1881
+ # Configures search behavior:
1882
+ # - Balances searches between individual text units and community-level summaries.
1883
+ # - Limits results to top 15 entities and relationships for relevance.
1884
+
1885
+ global_search:
1886
+ max_tokens: 4000
1887
+ data_max_tokens: 4000
1888
+ map_max_tokens: 1000
1889
+ reduce_max_tokens: 2000
1890
+ allow_general_knowledge: false
1891
+ min_score_threshold: 0.1
1892
+ concurrency: 10
1893
+ # Defines query-wide global search capabilities:
1894
+ # - Token limits for different operations.
1895
+ # - Restricts non-specific general knowledge responses.
1896
+ # - Handles up to 10 parallel queries.
1897
+
1898
+ embed_graph:
1899
+ enabled: true
1900
+ num_walks: 100
1901
+ walk_length: 10
1902
+ window_size: 5
1903
+ iterations: 10
1904
+ # Enables graph embedding (e.g., for node2vec):
1905
+ # - Generates 100 random walks per node to learn embeddings.
1906
+
1907
+ umap:
1908
+ enabled: true
1909
+ n_neighbors: 15
1910
+ min_dist: 0.1
1911
+ n_components: 2
1912
+ # Configures UMAP for dimensionality reduction and visualization.
1913
+
1914
+ storage:
1915
+ type: file
1916
+ base_dir: "output"
1917
+ # Outputs processed data to local "output" directory.
1918
+
1919
+ cache:
1920
+ type: file
1921
+ base_dir: "cache"
1922
+ # Stores temporary files in "cache."
1923
+
1924
+ reporting:
1925
+ type: file
1926
+ base_dir: "reports"
1927
+ include_source_tracking: true
1928
+ # Generates reports, including provenance for traceability.
1929
+
1930
+ ```
1931
+ """
1932
+ )
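+ st.markdown(
+ """
+ A minimal sketch (plain PyYAML, not GraphRAG's own validation) for checking that the file above parses and that its main sections are present:
+
+ ```python
+ import yaml  # PyYAML
+
+ REQUIRED_SECTIONS = [
+     "llm", "embeddings", "chunks", "input", "entity_extraction",
+     "claim_extraction", "local_search", "global_search", "storage",
+ ]
+
+ with open("settings.yaml") as f:
+     cfg = yaml.safe_load(f)
+
+ missing = [section for section in REQUIRED_SECTIONS if section not in cfg]
+ print("missing sections:", missing or "none")
+ ```
+ """
+ )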
1933
+ >>>>>>> b96b49ad31dd18a91ca19d59316e1b3ef2f531ff