# wiki.py # import streamlit_mermaid as stmd import streamlit.components.v1 as components import streamlit as st from streamlit.components.v1 import html def mermaid(code: str, height: int = 600) -> None: components.html( f"""

            {code}

""", height=height, ) def render_wiki_tab(): """Render the Wiki tab content.""" st.header("Overview") st.markdown( """ This documentation details the process I followed to achieve the assignment of using GraphRAG for indexing the first paragraphs of seven documents, embedding the full documents, performing initial search on the graph built from the first paragraphs, and retrieving answers from the full document content. """) st.markdown( """ This project implements a specialized document processing and querying system using GraphRAG for El Al baggage requirements\allowance documentation. The system processes first paragraphs separately from full documents, enabling graph-based search while maintaining comprehensive answer retrieval capabilities. """ ) st.markdown( """ ### Implementation Process Initially, I attempted to implement this using separate processing paths for first paragraphs and full documents, but I discovered a more elegant solution through GraphRAG's source tracking and processing order capabilities. Instead of maintaining separate indexes, I configured a unified approach where documents were processed together but with clear priorities and purposes. I set up the configuration to treat first paragraphs with priority 1 for graph building and full documents with priority 2 for retrieval. This was achieved through careful configuration of source tracking, processing order, and source filters in the `settings.yaml` file, which allowed me to maintain the separation of concerns. """ ) st.markdown( """ ### Final Implementation The final implementation proved successful, creating a knowledge graph from the first paragraphs while maintaining access to full document content for comprehensive answers. I used entity types specific to airport security (like **Baggage Type**, **Dimension**, **Weight Limit**) and configured claim extraction to focus on relevant restrictions and allowances. """ ) st.markdown( """ ### Using the Chat Application The chat application provides an interactive interface to query the GraphRAG system. Here's how it works: ##### Getting Started: - **Step 1**: Click on the chat tab. - **Step 2**: Choose the desired search type from the sidebar: - **Local Search**: Focuses on specific text chunks and direct relationships in the graph. - **Global Search**: Analyzes the entire dataset at a high level using community summaries. - **DRIFT Search**: Combines local and global search for complex queries requiring both detailed and contextual answers. ##### Submitting a Query: - Enter your question in the input field at the bottom of the chat interface. - Depending on the selected search type, the system will: - Use the graph for initial navigation. - Retrieve answers from full documents for comprehensive responses. ##### Viewing Results: - The assistant's response appears in the chat window, formatted for clarity. ##### Key Features: - **Streaming Responses**: Responses are displayed in real-time for supported search types. - **Session History**: Previous queries and responses are retained within the session for reference. ##### Example Queries: - "What are the liquid restrictions for carry-on bags?" - "How do pet carrier size restrictions compare to regular carry-on limits?" """ ) with st.expander("Architecture", expanded=False): st.markdown( """ The architecture of the system is designed to process data through multiple stages, including input preparation, processing, and search functionalities. Below is a detailed diagram illustrating the workflow of the system: """ ) mermaid_code = """ %%{init: {'theme': 'base', 'themeVariables': {'primaryColor': '#1E90FF', 'edgeLabelBackground': '#FFFFFF', 'secondaryColor': '#F0F8FF', 'tertiaryColor': '#FFFFFF', 'primaryTextColor': '#000000'}}}%% graph TD subgraph Input FP[First Paragraphs] --> P[Processing] FD[Full Documents] --> P end subgraph Processing P --> IE[Entity Extraction] P --> CD[Community Detection] P --> E[Embeddings Generation] IE --> G[Graph Construction] CD --> G E --> VS[Vector Store] end subgraph Search Q[Query] --> DS[DRIFT Search] DS --> GS[Graph Search] DS --> FR[Full Retrieval] GS --> VS FR --> VS GS --> A[Answer Generation] FR --> A end """ mermaid(mermaid_code, height=600) with st.expander("Graph Analysis", expanded=False): st.markdown("### System Components Breakdown:") mermaid_code = """ pie title "System Components" "Documents" : 14 "Text Units" : 36 "Entities" : 315 "Relationships" : 372 "Communities" : 66 """ mermaid(mermaid_code, height=500) # Description and graph statistics st.markdown( """ ### Knowledge Graph Visualization The graph displayed below represents the relationships between various entities extracted from the input data. Nodes in the graph correspond to entities like "Documents," "Policies," and "Restrictions," while edges represent the relationships or connections between these entities. The graph is constructed using the extracted entities and relationships, processed through NetworkX, and visualized with Pyvis. **Process of Creation**: - **Data Preparation**: Entities and relationships are extracted and saved as `create_final_nodes.parquet` and `create_final_relationships.parquet` files, respectively. - **Graph Construction**: Using NetworkX, nodes and edges are added based on the extracted data. - **Visualization**: Pyvis is used to create an interactive visualization with options like physics-based layout, node grouping, and hover effects. The resulting graph provides insights into the data's structure, including: - Node type distribution - Community detection levels - Connectivity patterns Explore the graph below to understand the relationships between key entities. """ ) # Load and display the graph visualization (HTML file) with open("knowledge_graph.html", "r") as f: html_content = f.read() st.components.v1.html(html_content, height=800) # Graph statistics st.markdown( """ ### Graph Statistics: * **Number of nodes:** 427 * **Number of edges:** 453 #### Node Type Distribution: | Node Type | Distribution | |-----------------------|--------------| | REQUIRED DOCUMENT | 39 | | SERVICE TYPE | 35 | | POLICY | 30 | | RESTRICTION | 27 | | SPECIAL ITEM | 26 | | PROHIBITED ITEM | 23 | | AIRPORT | 22 | | BAGGAGE TYPE | 21 | | SERVICE LOCATION | 18 | | DANGEROUS GOOD | 14 | | ALLOWANCE | 13 | | GEO | 12 | | MEASUREMENT UNIT | 11 | | FEE STRUCTURE | 10 | | LINEAR DIMENSION | 8 | | TIME PERIOD | 8 | | CABIN SECTION | 8 | | WEIGHT | 8 | | WEIGHT CATEGORY | 7 | | AIRLINE | 7 | | CITY | 7 | | DIMENSION | 6 | | VALUABLE ITEM | 5 | | ROUTE TYPE | 5 | | TRAVEL CLASS | 5 | | ORGANIZATION | 5 | | PASSENGER TYPE | 4 | | RESTRICTED ITEM | 3 | | CURRENCY | 2 | | EXEMPTION | 2 | | LABEL TYPE | 2 | | MATERIAL TYPE | 2 | | CARGO | 2 | | MEMBERSHIP LEVEL | 2 | | AIRCRAFT TYPE | 1 | | REGION | 1 | | COUNTRY | 1 | | SIZE CATEGORY | 1 | | WHEEL CONFIGURATION | 1 | | TAG CATEGORY | 1 | | GROUP CATEGORY | 1 | #### Most Connected Nodes: | Node | Connections | |--------------------|-------------| | EL AL | 49 | | ANIMAL | 29 | | CHECKED BAGGAGE | 25 | | BAGGAGE | 21 | | PET | 19 | """ ) with st.expander("Implementation Results", expanded=False): st.markdown( """ ### Document Processing * **Total Documents**: 14 (7 first paragraphs + 7 full documents) * **Text Units**: 36 * **Entities**: 315 * **Relationships**: 372 * **Communities**: 66 across 4 levels ### Community Structure * **Level 0**: 11 communities * **Level 1**: 44 communities * **Level 2**: 9 communities * **Level 3**: 2 communities """ ) st.markdown("### System Operation Flow") mermaid_code = """ sequenceDiagram participant U as User participant Q as Query Engine participant G as Graph Search participant V as Vector Store participant D as Document Retrieval U->>Q: Submit Query Q->>G: Search in First Paragraph Graph G->>V: Lookup Relevant Entities V->>D: Retrieve Full Content D->>Q: Return Comprehensive Answer Q->>U: Present Response """ mermaid(mermaid_code, height=400) with st.expander("Implementation Details", expanded=False): st.markdown( """ The implementation of the system follows a processing pipeline that integrates data from the first paragraphs and full documents, creating a unified structure for efficient querying. Below is the pipeline representation: """ ) mermaid_code = """ flowchart TB subgraph First Paragraphs FP[Load First Paragraphs] --> EP[Extract Entities] EP --> RP[Build Relationships] RP --> CP[Create Communities] end subgraph Full Documents FD[Load Full Documents] --> CH[Chunk Documents] CH --> EF[Generate Embeddings] end subgraph Integration CP --> VS[(Vector Store)] EF --> VS end subgraph Search Q[Query] --> GS[Graph Search] GS --> VS VS --> RD[Retrieve Details] RD --> AG[Answer Generation] end """ mermaid(mermaid_code, height=800) with st.expander("Requirements Fulfillment", expanded=False): st.markdown( """ ### Requirements Fulfillment **First Paragraph Processing**: ✓ * Implemented through `source_filter` and `processing_order` * Verified by entity and relationship extraction **Full Document Embeddings**: ✓ * Stored in LanceDB * Accessible for comprehensive retrieval **Graph-Based Search**: ✓ * Communities and relationships established * DRIFT search implemented **Complete Answer Retrieval**: ✓ * Source priority configuration * Full document content available ### Performance Metrics * **Indexing Speed**: 212.44 seconds total * **Graph Density**: 372 relationships among 315 entities * **Community Structure**: 4-level hierarchy * **Vector Store Size**: 3 Lance files for different embedding types """ ) with st.expander("Achieving the Requirement", expanded=False): st.markdown("### Source-Based Processing Control:") st.markdown( """ ```yaml input: source_tracking: true processing_order: - path: "first_paragraphs" priority: 1 purpose: "graph_building" - path: "full_documents" priority: 2 purpose: "retrieval" ``` """ ) st.markdown( """ This configuration ensures that GraphRAG knows which content is for graph building (first paragraphs) and which is for retrieval (full documents). The priority system makes sure first paragraphs are processed first and used primarily for the knowledge graph construction. """ ) st.markdown("### Targeted Entity and Claim Extraction:") st.markdown( """ ```yaml entity_extraction: source_filter: "first_paragraphs" max_gleanings: 2 claim_extraction: source_filter: "first_paragraphs" ``` """ ) st.markdown( """ These filters ensure that the knowledge graph (entities, relationships, and claims) is built only from the first paragraphs. This is crucial because it means our initial search will only traverse the graph built from these first paragraphs, matching the requirement. The `max_gleanings: 2` allows for thorough extraction while maintaining precision. """ ) st.markdown("### Search Priority and Retrieval Control:") st.markdown( """ ```yaml local_search: source_priority: graph_search: "first_paragraphs" answer_retrieval: "full_documents" text_unit_prop: 0.7 community_prop: 0.3 ``` """ ) st.markdown( """ This is where the magic happens - when a query is made, the system first searches using the graph built from first paragraphs (`graph_search: "first_paragraphs"`), but when it needs to construct the answer, it pulls the content from the full documents (`answer_retrieval: "full_documents"`). The text_unit and community proportions ensure we're making good use of both the graph structure and the actual content. Looking at the output files we generated (`create_final_entities.parquet`, `create_final_relationships.parquet`, etc.), we can see this two-phase approach in action: the graph structure is built and stored separately from the full content, but they're linked through the unified vector store in LanceDB, allowing seamless transitions between graph search and content retrieval during query processing. """ ) with st.expander("Improvements to Make the Graph Creation Process Leaner and Faster", expanded=False): st.markdown("### Optimization of Chunk Size and Overlap:") st.markdown( """ ```yaml chunks: size: 300 # Reduced from 500 overlap: 25 # Reduced from 50 group_by_columns: [id] ``` """ ) st.markdown( """ **Rationale**: - Smaller chunks with minimal overlap reduce token usage. - Maintains context while processing fewer tokens per API call. - Especially efficient for first paragraphs processing. """ ) st.markdown("### Streamline Entity Types and Claims:") st.markdown( """ ```yaml entity_extraction: entity_types: - "Baggage" - "Restriction" - "Item" max_gleanings: 1 # Reduced from 2 claim_extraction: enabled: false # Disable unless absolutely necessary ``` """ ) st.markdown( """ **Rationale**: - Fewer entity types mean fewer extraction operations. - Single gleaning pass is often sufficient. - Claims processing is expensive and often redundant. """ ) st.markdown("### Optimize Graph Embeddings:") st.markdown( """ ```yaml embed_graph: enabled: true num_walks: 50 # Reduced from 100 walk_length: 5 # Reduced from 10 window_size: 3 # Reduced from 5 iterations: 5 # Reduced from 10 ``` """ ) st.markdown( """ **Rationale**: - Fewer random walks still capture essential graph structure. - Shorter walks reduce computation time. - Smaller window size focuses on immediate relationships. """ ) st.markdown("### Batch Processing and Parallelization:") st.markdown( """ ```yaml embeddings: async_mode: asyncio # Changed from threaded batch_size: 32 # Increased from 16 batch_max_tokens: 8191 ``` """ ) st.markdown( """ **Rationale**: - Asyncio performs better than threading for I/O-bound operations. - Larger batch size reduces API calls. - Maximizes throughput within token limits. """ ) st.markdown("### Community Structure Optimization:") st.markdown( """ ```yaml cluster_graph: max_cluster_size: 15 # Increased slightly min_cluster_size: 3 # Added parameter community_reports: max_input_length: 2000 # Reduced from default max_length: 1000 # Reduced summary length ``` """ ) st.markdown( """ **Rationale**: - Balanced cluster sizes reduce processing overhead. - Shorter community reports still maintain essential information. - Fewer tokens per report means faster processing. """ ) st.markdown("### Caching and Storage:") st.markdown( """ ```yaml cache: type: file base_dir: "cache" compression: true # Add compression cache_embeddings: true storage: type: file base_dir: "output" compression: true # Add compression ``` """ ) st.markdown( """ **Rationale**: - Compression reduces I/O overhead. - Caching embeddings prevents recomputation. - File-based storage is faster than blob storage for local processing. """ ) st.markdown("### Disable Non-Essential Features:") st.markdown( """ ```yaml umap: enabled: false # Disable unless visualization needed snapshots: graphml: false raw_entities: false top_level_nodes: false ``` """ ) st.markdown( """ **Rationale**: - UMAP calculation is computationally expensive. - Snapshots are useful for debugging but add overhead. """ ) st.markdown("### LLM Configuration Optimization:") st.markdown( """ ```yaml llm: concurrent_requests: 25 tokens_per_minute: 150000 requests_per_minute: 10000 max_retries: 5 # Reduced from 10 ``` """ ) st.markdown( """ **Rationale**: - Balanced concurrency prevents rate limiting. - Fewer retries reduce waiting time. - Token and request limits prevent throttling. """ ) with st.expander("Query Types", expanded=False): st.markdown("### Local Search:") st.markdown( """ ```yaml local_search: text_unit_prop: 0.7 # Focus on specific text chunks community_prop: 0.3 # Some consideration of community context top_k_mapped_entities: 15 source_priority: graph_search: "first_paragraphs" answer_retrieval: "full_documents" ``` """ ) st.markdown( """ **Best when**: Looking for specific baggage rules or restrictions **Example Query**: "What are the liquid restrictions for carry-on bags?" **How it works with our data**: - Searches for entities in first paragraphs (like "liquid", "carry-on"). - Follows direct relationships in the graph. - Retrieves detailed rules from full documents. **Meets requirement?** Yes, but in a limited way - focuses on direct connections. """ ) st.markdown("### Global Search:") st.markdown( """ ```yaml global_search: max_tokens: 4000 data_max_tokens: 4000 min_score_threshold: 0.1 allow_general_knowledge: false ``` """ ) st.markdown( """ **Best when**: Understanding overall policies or themes **Example Query**: "What are the main types of baggage restrictions?" **How it works with our data**: - Looks at community summaries built from first paragraphs. - Provides broader context about baggage policies. - Pulls supporting details from full documents. **Meets requirement?** Partially - good for overview but might miss specific connections. """ ) st.markdown("### DRIFT Search (Dynamic Reasoning and Inference with Flexible Traversal):") st.markdown( """ ```yaml local_search: source_priority: graph_search: "first_paragraphs" answer_retrieval: "full_documents" ``` """ ) st.markdown( """ **Best when**: Complex queries requiring both specific details and context **Example Query**: "How do pet carrier size restrictions compare to regular carry-on limits?" **How it works with our data**: - Starts with first paragraphs graph to understand relationships between: - Pet carriers - Regular carry-on bags - Size restrictions - Uses community understanding to find related policies. - Retrieves specific details from full documents. **Meets requirement?** Yes, most comprehensively. """ ) st.markdown("### Best Choice for Our Requirement:") st.markdown( """ **DRIFT Search** is the most suitable because: - It naturally implements our two-phase requirement: - Initial search on graph (from first paragraphs). - Answer retrieval from full documents. - It can handle complex queries that need: - Understanding of relationships (from graph). - Specific details (from full documents). - It can dynamically adjust between: - Local search when specific rules are needed. - Global search when context is important. """ ) with st.expander("Configuration: full `settings.yaml`", expanded=False): st.markdown( """ ```yaml # Root configuration for GraphRAG, a system leveraging LLMs for advanced Retrieval Augmented Generation. encoding_model: cl100k_base # Specifies the model used for token encoding. The default 'cl100k_base' is common for OpenAI's text models, # determining how text is tokenized into machine-readable units. skip_workflows: [] # A list of workflows to skip during execution. Empty indicates all workflows are executed. llm: api_key: ${GRAPHRAG_API_KEY} # Placeholder for the API key, replaced dynamically from environment variables. # Ensures secure API access for LLM queries. type: openai_chat # Defines the type of LLM interface used. Here, it connects to OpenAI's chat-based API. model: gpt-4o-mini # Specifies the model variant to use. model_supports_json: true # Indicates whether the LLM natively supports JSON responses, useful for structured outputs. max_tokens: 4000 # Maximum number of tokens in the output. Balances performance and context length. temperature: 0 # Controls randomness in outputs. 0 means deterministic responses, often preferred for accuracy. embeddings: async_mode: threaded # Asynchronous embedding computation mode. 'threaded' uses multi-threading for better performance. batch_size: 16 # Number of data points processed per batch during embedding, balancing speed and resource use. vector_store: type: lancedb # Database type used for storing vectorized embeddings. 'lancedb' supports efficient vector operations. db_uri: 'output/lancedb' # URI pointing to the database location where embeddings are stored. container_name: default # Logical name for the container storing vector data. overwrite: true # Whether to overwrite existing vectors. True allows updating the database during reruns. llm: api_key: ${GRAPHRAG_API_KEY} type: openai_embedding model: text-embedding-3-small # Dedicated LLM for embedding tasks. A smaller, specialized model is specified for embeddings. chunks: size: 500 # Number of tokens per chunk of text. Controls granularity for processing long documents. overlap: 50 # Overlap between adjacent chunks to ensure continuity in analysis. group_by_columns: [id] # Groups data by 'id' before chunking, preserving document boundaries. input: type: file file_type: text base_dir: "input" file_pattern: ".*\\.txt$" recursive: true source_tracking: true processing_order: - path: "first_paragraphs" priority: 1 purpose: "graph_building" - path: "full_documents" priority: 2 purpose: "retrieval" # Specifies the data source for ingestion: # - Input is file-based text. # - Reads files recursively from "input" directory matching '.txt' files. # - Prioritizes "first_paragraphs" for graph building and full documents for retrieval. entity_extraction: prompt: "prompts/entity_extraction.txt" # Path to the custom prompt used for entity extraction tasks. entity_types: - "Baggage Type" - "Dimension" - "Linear Dimension" - "Weight" - "Material Type" - "Wheel Configuration" - "Measurement Unit" - "Size Category" - "Weight Category" - "Airline" - "Alliance" - "Airport" - "Route Type" - "Travel Class" - "Cabin Section" - "Aircraft Type" - "Restriction" - "Exemption" - "Policy" - "Fee Structure" - "Currency" - "Allowance" - "Special Item" - "Prohibited Item" - "Restricted Item" - "Dangerous Good" - "Fragile Item" - "Valuable Item" - "Required Document" - "Label Type" - "Tag Category" - "Service Type" - "Handler Role" - "Service Location" - "Time Period" - "Passenger Type" - "Membership Level" - "Group Category" # Defines the types of entities the system should extract. max_gleanings: 2 # Maximum number of re-processing rounds to refine entity detection. source_filter: "first_paragraphs" # Restricts extraction to text from "first_paragraphs," optimizing focus. claim_extraction: enabled: true # Enables claim extraction, capturing specific conditions or assertions from text. claim_types: - "Basic Size Restriction" - "Oversize Condition" - "Weight Limit Standard" - "Overweight Condition" - "Combined Dimension Limit" - "Cabin Storage Requirement" - "Standard Fee" - "Excess Fee" - "Oversize Fee" - "Overweight Fee" - "Special Handling Fee" - "Season Surcharge" - "Route-Specific Fee" - "Multi-Piece Pricing" - "Fee Waiver Condition" - "Basic Allowance" - "Class-Based Allowance" - "Status-Based Allowance" - "Route-Based Allowance" - "Special Group Allowance" - "Seasonal Allowance" - "Equipment Allowance" - "Prohibited Item Policy" - "Restricted Item Condition" - "Dangerous Goods Policy" - "Special Item Restriction" - "Packaging Requirement" - "Declaration Requirement" - "Check-in Deadline" - "Special Handling Procedure" - "Priority Handling Rule" - "Transfer Handling Policy" - "Delivery Service Policy" - "Storage Policy" - "Liability Limit" - "Insurance Requirement" - "Claim Procedure" - "Compensation Policy" - "Time Limit Policy" - "Weather Restriction" - "Seasonal Restriction" - "Aircraft Limitation" - "Route Restriction" - "Connection Impact" - "Tag Requirement" - "Label Requirement" - "Documentation Requirement" - "Declaration Policy" - "Handling Standard" - "Service Level Agreement" - "Priority Service Standard" - "Delivery Time Standard" - "Medical Exception" - "Military Exception" - "Diplomatic Exception" - "Event Exception" - "Emergency Exception" # Types of claims to extract, covering diverse scenarios (e.g., fees, allowances). prompt: "prompts/claim_extraction.txt" description: "Extract baggage measurements, weight limits, and restrictions from airline documentation." # Customizes the extraction logic for airline baggage policies. max_gleanings: 2 source_filter: "first_paragraphs" # Restricts claims to "first_paragraphs," mirroring entity extraction. local_search: text_unit_prop: 0.7 community_prop: 0.3 top_k_mapped_entities: 15 top_k_relationships: 15 max_tokens: 4000 source_priority: graph_search: "first_paragraphs" answer_retrieval: "full_documents" # Configures search behavior: # - Balances searches between individual text units and community-level summaries. # - Limits results to top 15 entities and relationships for relevance. global_search: max_tokens: 4000 data_max_tokens: 4000 map_max_tokens: 1000 reduce_max_tokens: 2000 allow_general_knowledge: false min_score_threshold: 0.1 concurrency: 10 # Defines query-wide global search capabilities: # - Token limits for different operations. # - Restricts non-specific general knowledge responses. # - Handles up to 10 parallel queries. embed_graph: enabled: true num_walks: 100 walk_length: 10 window_size: 5 iterations: 10 # Enables graph embedding (e.g., for node2vec): # - Generates 100 random walks per node to learn embeddings. umap: enabled: true n_neighbors: 15 min_dist: 0.1 n_components: 2 # Configures UMAP for dimensionality reduction and visualization. storage: type: file base_dir: "output" # Outputs processed data to local "output" directory. cache: type: file base_dir: "cache" # Stores temporary files in "cache." reporting: type: file base_dir: "reports" include_source_tracking: true # Generates reports, including provenance for traceability. ``` """ )