diff --git a/groundX-doc-pipeline/app.py b/groundX-doc-pipeline/app.py
index c6d75281b..13c73ea2d 100644
--- a/groundX-doc-pipeline/app.py
+++ b/groundX-doc-pipeline/app.py
@@ -1,5 +1,6 @@
 import os
 import tempfile
+import base64
 import requests
 from typing import Any, Dict
@@ -15,10 +16,8 @@
 # Application Configuration
 st.set_page_config(page_title="Ground X - X-Ray", layout="wide")
 
-# Custom CSS for enhanced chat interface layout
 st.markdown("""
 """, unsafe_allow_html=True)
@@ -59,31 +255,50 @@ def reset_analysis():
         if key in st.session_state:
             del st.session_state[key]
 
+def display_pdf(file):
+    """Display PDF preview using embedded iframe"""
+    st.markdown("### PDF Preview")
+    base64_pdf = base64.b64encode(file.read()).decode("utf-8")
+    pdf_display = f"""<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600" type="application/pdf"></iframe>"""
+    st.markdown(pdf_display, unsafe_allow_html=True)
+
 def prepare_chat_context(xray_data, prompt):
     """Prepare context from X-Ray data for the LLM"""
     context_parts = []
 
-    # Add summary first for quick overview
     if xray_data.get('fileSummary'):
         context_parts.append(f"Summary: {xray_data['fileSummary']}")
 
-    # Add limited document content (first 2 pages, first 3 chunks each)
     if 'documentPages' in xray_data and xray_data['documentPages']:
         extracted_texts = []
-        for page in xray_data['documentPages'][:2]:  # Only first 2 pages
+        for page in xray_data['documentPages'][:2]:
             if 'chunks' in page:
-                for chunk in page['chunks'][:3]:  # Only first 3 chunks per page
+                for chunk in page['chunks'][:3]:
                     if 'text' in chunk and chunk['text']:
                         text = chunk['text']
-                        if len(text) > 500:  # Shorter limit
+                        if len(text) > 500:
                             text = text[:500] + "..."
                         extracted_texts.append(text)
 
         if extracted_texts:
             context_parts.append(f"Document Content: {' '.join(extracted_texts)}")
 
-    # Add essential metadata
     if xray_data.get('fileType'):
         context_parts.append(f"File Type: {xray_data['fileType']}")
     if xray_data.get('language'):
@@ -92,8 +307,7 @@ def prepare_chat_context(xray_data, prompt):
     return "\n\n".join(context_parts)
 
 def generate_chat_response(prompt, context):
-    """Generate AI response using Ollama with structured prompt engineering"""
-    # Construct comprehensive prompt for intelligent query handling
+    """Generate AI response using Ollama"""
     full_prompt = f"""You are an AI assistant helping analyze a document. You have access to the following document information:
 
 {context}
@@ -110,7 +324,6 @@ def generate_chat_response(prompt, context):
 
 Response:"""
 
-    # Initialize Ollama API request
     try:
         response = requests.post(
             "http://localhost:11434/api/generate",
@@ -138,22 +351,15 @@ def generate_chat_response(prompt, context):
     except Exception as e:
         return f"I'm having trouble accessing the AI model right now. Error: {str(e)}"
Error: {str(e)}" -# Initialize Streamlit session state -for key in ["xray_data", "uploaded_file_path", "uploaded_file_name", "uploaded_file_type", "processing_complete", "used_existing_file", "auto_loaded_file"]: +for key in ["xray_data", "uploaded_file_path", "uploaded_file_name", "uploaded_file_type", "processing_complete", "used_existing_file", "auto_loaded_file", "active_tab"]: if key not in st.session_state: st.session_state[key] = None if key == "xray_data" else False -# Application Header -st.markdown(""" -# World-class Document Processing Pipeline -""") +st.markdown("# World-class Document Processing Pipeline") -# Load and display GroundX branding -import base64 try: with open("assets/groundx.png", "rb") as img_file: logo_base64 = base64.b64encode(img_file.read()).decode() - st.markdown(f"""
         <div style="display: flex; align-items: center; gap: 8px;">
             <span>powered by</span>
@@ -170,7 +376,6 @@ def generate_chat_response(prompt, context):
             <img src="data:image/png;base64,{logo_base64}" alt="GroundX logo"/>
         </div>
""", unsafe_allow_html=True) -# Document Upload Interface with st.sidebar: st.header("📄 Upload Document") @@ -196,10 +401,54 @@ def generate_chat_response(prompt, context): st.info(f"**File**: {uploaded.name}\n**Size**: {uploaded.size / 1024:.1f} KB\n**Type**: {uploaded.type}") st.session_state.uploaded_file_name = uploaded.name st.session_state.uploaded_file_type = uploaded.type + +<<<<<<< Updated upstream + # Document Preview Section + st.markdown("---") + st.markdown("### 📄 Document Preview") + + # Show preview based on file type + if uploaded.type == "application/pdf": + # For PDF files, show the actual PDF preview using iframe + display_pdf(uploaded) + + elif uploaded.type.startswith("image/"): + # For image files, show the actual image + st.image(uploaded, caption=f"Preview: {uploaded.name}", use_column_width=True) + + elif uploaded.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document": + # For DOCX files + st.info("📝 **Word Document** - Preview will be available after processing") + st.markdown(f"**Content**: Text extraction in progress...") + + else: + # For other file types + st.info(f"📄 **{uploaded.type}** - Preview will be available after processing") + + # Show file metadata + st.markdown("**File Details:**") + st.markdown(f"- **Name**: {uploaded.name}") + st.markdown(f"- **Size**: {uploaded.size / 1024:.1f} KB") + st.markdown(f"- **Type**: {uploaded.type}") + st.markdown(f"- **Status**: Ready for processing") +======= + st.markdown("---") + st.markdown("### 📄 Document Preview") + + if uploaded.type == "application/pdf": + display_pdf(uploaded) + elif uploaded.type.startswith("image/"): + st.image(uploaded, caption=f"Preview: {uploaded.name}", use_column_width=True) + elif uploaded.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document": + st.info("📝 **Word Document** - Preview will be available after processing") + st.markdown(f"**Content**: Text extraction in progress...") + else: + st.info(f"📄 **{uploaded.type}** - Preview will be available after processing") + +>>>>>>> Stashed changes st.button("🔄 Clear Analysis", on_click=reset_analysis) -# Initialize Ground X API client and storage bucket try: gx = create_client() bucket_id = ensure_bucket(gx) @@ -207,9 +456,7 @@ def generate_chat_response(prompt, context): st.error(f"❌ {e}") st.stop() -# Auto-load existing document from bucket if available if not st.session_state.auto_loaded_file and not st.session_state.xray_data: - # Configurable sample file - can be set via environment variable existing_file_name = os.getenv("SAMPLE_FILE_NAME", "tmpivkf8qf8_sample-file.pdf") existing_doc_id = check_file_exists(gx, bucket_id, existing_file_name) @@ -226,7 +473,6 @@ def generate_chat_response(prompt, context): else: st.session_state.auto_loaded_file = True -# Document Processing Logic should_process = False file_to_process = None @@ -255,39 +501,47 @@ def getbuffer(self): ) if should_process and st.session_state.xray_data is None: - with st.status("🔄 Processing document...", expanded=True) as status: - # Determine file path for processing (temporary for new uploads, stored for existing) + # Create half-width container for status + col1, col2 = st.columns([1, 1]) + + with col1: if hasattr(file_to_process, 'path'): file_path = file_to_process.path else: file_path = st.session_state.uploaded_file_path try: + # Step 1: Upload + st.write("📤 **Uploading to Ground X...**") + xray, used_existing = process_document(gx, bucket_id, file_to_process, file_path) + + if used_existing: + st.write("✅ 
+                st.write("📥 **Fetching existing X-Ray data...**")
+            else:
+                st.write("⏳ **Processing document...**")
+                st.write("📥 **Fetching X-Ray data...**")
+
             st.session_state.xray_data = xray
             st.session_state.processing_complete = True
             st.session_state.used_existing_file = used_existing
+            st.session_state.active_tab = "analysis"  # Auto-switch to analysis tab
 
             if used_existing:
-                st.write("✅ **File already exists in bucket**")
-                st.write("📥 **Fetched X-Ray data...**")
                 st.success("✅ Document analysis completed! (Used existing file)")
             else:
-                st.write("📤 **Uploaded to Ground X...**")
-                st.write("⏳ **Processed document...**")
-                st.write("📥 **Fetched X-Ray data...**")
                 st.success("✅ Document parsed successfully! Explore the results below.")
 
             st.write("🎉 **Analysis complete!**")
+
         except Exception as e:
             st.error(f"❌ Error processing document: {str(e)}")
             st.session_state.processing_complete = False
 
-# Analysis Results Display
 if st.session_state.xray_data:
     xray = st.session_state.xray_data
 
-    # Extract and display document metadata metrics
     file_type = xray.get('fileType', 'Unknown')
     language = xray.get('language', 'Unknown')
     pages = len(xray.get("documentPages", []))
@@ -295,13 +549,180 @@ def getbuffer(self):
 
     st.markdown(f"**File Type:** {file_type} &nbsp;&nbsp;&nbsp;&nbsp; **Language:** {language} &nbsp;&nbsp;&nbsp;&nbsp; **Pages:** {pages} &nbsp;&nbsp;&nbsp;&nbsp; **Keywords:** {keywords}")
 
+    # Document Preview Section (after processing)
+    with st.expander("📄 Document Preview", expanded=False):
+        st.markdown("### 📋 Document Summary")
+        file_summary = xray.get("fileSummary")
+        if file_summary:
+            st.markdown(file_summary)
+        else:
+            st.info("No summary available")
+
+        st.markdown("### 📝 Sample Content")
+        # Show first few chunks of extracted text
+        if "documentPages" in xray and xray["documentPages"]:
+            sample_texts = []
+            for page in xray["documentPages"][:2]:  # First 2 pages
+                if "chunks" in page:
+                    for chunk in page["chunks"][:2]:  # First 2 chunks per page
+                        if "text" in chunk and chunk["text"]:
+                            text = chunk["text"]
+                            if len(text) > 200:
+                                text = text[:200] + "..."
+                            sample_texts.append(text)
+
+            if sample_texts:
+                for i, text in enumerate(sample_texts, 1):
+                    st.markdown(f"**Sample {i}:**")
+                    st.markdown(text)
+                    st.markdown("---")
+            else:
+                st.info("No text content available for preview")
+
+        st.markdown("### 🏷️ Key Topics")
+        if xray.get("fileKeywords"):
+            keywords_list = xray["fileKeywords"].split(",")
+            # Show first 10 keywords
+            display_keywords = keywords_list[:10]
+            keyword_tags = " ".join([f"`{kw.strip()}`" for kw in display_keywords])
+            st.markdown(keyword_tags)
+        else:
+            st.info("No keywords available")
+
-    # Primary interface tabs for analysis and interaction
-    main_tabs = st.tabs([
-        "📊 X-Ray Analysis",
-        "💬 Chat"
-    ])
+    # Create a left-aligned container for the tab buttons
+    col1, col2 = st.columns([1, 4])
+
+    with col1:
+        # Create a single container with custom button styling
+        st.markdown("""
+        """, unsafe_allow_html=True)
+
+        # Use uneven column widths to move chat button left
+        col1, col2 = st.columns([0.7, 1.3])
+
+        with col1:
+            if st.button("📊 X-Ray Analysis", key="analysis_tab_btn", type="primary" if st.session_state.active_tab == "analysis" else "secondary"):
+                st.session_state.active_tab = "analysis"
+                st.rerun()
+
+        with col2:
+            if st.button("💬 Chat", key="chat_tab_btn", type="primary" if st.session_state.active_tab == "chat" else "secondary"):
+                st.session_state.active_tab = "chat"
+                st.rerun()
+
+    if st.session_state.active_tab is None:
+        st.session_state.active_tab = "analysis"
 
-    with main_tabs[0]:
+    if st.session_state.active_tab == "analysis":
         st.markdown("### 📊 X-Ray Analysis Results")
         tabs = st.tabs([
             "🔍 JSON Output",
@@ -312,105 +733,115 @@ def getbuffer(self):
             "🏷️ Keywords"
         ])
 
-        with tabs[0]:
-            st.subheader("🔍 Raw JSON Data")
-            st.json(xray)
+        with tabs[0]:
+            st.subheader("🔍 Raw JSON Data")
+            st.json(xray)
 
-        with tabs[1]:
-            st.subheader("📝 Narrative Summary")
-            # Extract and display narrative content from document chunks
-            narratives = []
-            if "documentPages" in xray:
-                for page in xray["documentPages"]:
-                    if "chunks" in page:
-                        for chunk in page["chunks"]:
-                            if "narrative" in chunk and chunk["narrative"]:
-                                narratives.extend(chunk["narrative"])
-
-            if narratives:
-                for i, narrative in enumerate(narratives, 1):
-                    st.markdown(f"**Narrative {i}:**")
-                    st.markdown(narrative)
-                    st.divider()
-            else:
-                st.info("No narrative text found in the X-Ray data")
+        with tabs[1]:
+            st.subheader("📝 Narrative Summary")
+            narratives = []
+            if "documentPages" in xray:
+                for page in xray["documentPages"]:
+                    if "chunks" in page:
+                        for chunk in page["chunks"]:
+                            if "narrative" in chunk and chunk["narrative"]:
+                                narratives.extend(chunk["narrative"])
+
+            if narratives:
+                for i, narrative in enumerate(narratives, 1):
+                    st.markdown(f"**Narrative {i}:**")
+                    st.markdown(narrative)
+                    st.divider()
+            else:
+                st.info("No narrative text found in the X-Ray data")
 
-        with tabs[2]:
-            st.subheader("📋 File Summary")
-            file_summary = xray.get("fileSummary")
-            if file_summary:
-                st.markdown(file_summary)
-            else:
-                st.info("No file summary found in the X-Ray data")
-
-        with tabs[3]:
-            st.subheader("💡 Suggested Text")
-            # Extract and display suggested text content from document chunks
-            suggested_texts = []
-            if "documentPages" in xray:
-                for page in xray["documentPages"]:
-                    if "chunks" in page:
-                        for chunk in page["chunks"]:
-                            if "suggestedText" in chunk and chunk["suggestedText"]:
-                                suggested_texts.append(chunk["suggestedText"])
-
-        if suggested_texts:
-            for i, suggested in enumerate(suggested_texts, 1):
-                st.markdown(f"**Suggested Text {i}:**")
-                st.markdown(suggested)
-                st.divider()
-        else:
-            st.info("No suggested text found in the X-Ray data")
+        with tabs[2]:
+            st.subheader("📋 File Summary")
+            file_summary = xray.get("fileSummary")
+            if file_summary:
+                st.markdown(file_summary)
+            else:
+                st.info("No file summary found in the X-Ray data")
 
-        with tabs[4]:
-            st.subheader("📄 Extracted Text")
-            # Extract and display raw text content from document chunks
-            extracted_texts = []
-            if "documentPages" in xray:
-                for page in xray["documentPages"]:
-                    if "chunks" in page:
-                        for chunk in page["chunks"]:
-                            if "text" in chunk and chunk["text"]:
-                                extracted_texts.append(chunk["text"])
-
-        if extracted_texts:
-            combined_text = "\n\n---\n\n".join(extracted_texts)
-            st.text_area("Extracted Content", combined_text, height=400)
-        else:
-            st.info("No extracted text found in the X-Ray data")
+        with tabs[3]:
+            st.subheader("💡 Suggested Text")
+            suggested_texts = []
+            if "documentPages" in xray:
+                for page in xray["documentPages"]:
+                    if "chunks" in page:
+                        for chunk in page["chunks"]:
+                            if "suggestedText" in chunk and chunk["suggestedText"]:
+                                suggested_texts.append(chunk["suggestedText"])
+
+            if suggested_texts:
+                for i, suggested in enumerate(suggested_texts, 1):
+                    st.markdown(f"**Suggested Text {i}:**")
+                    st.markdown(suggested)
+                    st.divider()
+            else:
+                st.info("No suggested text found in the X-Ray data")
 
-        with tabs[5]:
-            st.subheader("🏷️ Keywords")
-            keywords = xray.get("fileKeywords")
-            if keywords:
-                st.write(keywords)
-            else:
-                st.info("No keywords found in the X-Ray data")
+        with tabs[4]:
+            st.subheader("📄 Extracted Text")
+            extracted_texts = []
+            if "documentPages" in xray:
+                for page in xray["documentPages"]:
+                    if "chunks" in page:
+                        for chunk in page["chunks"]:
+                            if "text" in chunk and chunk["text"]:
+                                extracted_texts.append(chunk["text"])
+
+            if extracted_texts:
+                combined_text = "\n\n---\n\n".join(extracted_texts)
+                st.text_area("Extracted Content", combined_text, height=400)
+            else:
+                st.info("No extracted text found in the X-Ray data")
+
+        with tabs[5]:
+            st.subheader("🏷️ Keywords")
+            keywords = xray.get("fileKeywords")
+            if keywords:
+                st.write(keywords)
+            else:
+                st.info("No keywords found in the X-Ray data")
 
-    # Interactive Chat Interface
-    with main_tabs[1]:
+    elif st.session_state.active_tab == "chat":
         st.markdown("### 💬 Chat with Document")
         st.markdown("Ask questions about your document.")
 
-        # Initialize and display chat conversation history
         if "chat_history" not in st.session_state:
             st.session_state.chat_history = []
 
-        # Render existing chat messages
         for message in st.session_state.chat_history:
            with st.chat_message(message["role"]):
                 st.markdown(message["content"])
 
-        # Process user input and generate responses
         if prompt := st.chat_input("Ask a question about your document..."):
-            # Store user message in conversation history
             st.session_state.chat_history.append({"role": "user", "content": prompt})
+            st.session_state.active_tab = "chat"
 
-            # Display user message in chat interface
             with st.chat_message("user"):
                 st.markdown(prompt)
 
-            # Generate and display AI assistant response
             with st.chat_message("assistant"):
                 with st.spinner("Thinking..."):
                     context = prepare_chat_context(xray, prompt)
@@ -423,4 +854,4 @@ def getbuffer(self):
 elif uploaded is None and st.session_state.uploaded_file_name and st.session_state.xray_data:
     status = "Auto-loaded" if st.session_state.auto_loaded_file else "Analysis"
     st.success(f"✅ **{status} complete for**: {st.session_state.uploaded_file_name}")
-    st.info("💡 **Tip**: You can re-process the file using the button in the sidebar, or upload a new document.")
+    st.info("💡 **Tip**: You can re-process the file using the button in the sidebar, or upload a new document.")
\ No newline at end of file
diff --git a/groundX-doc-pipeline/groundx_utils.py b/groundX-doc-pipeline/groundx_utils.py
index d6b0fb324..4adf4ab45 100644
--- a/groundX-doc-pipeline/groundx_utils.py
+++ b/groundX-doc-pipeline/groundx_utils.py
@@ -52,48 +52,17 @@ def ingest_document(gx: GroundX, bucket_id: str, path: Path, mime: str) -> str:
 def poll_until_complete(gx: GroundX, process_id: str, timeout: int = 600) -> None:
     """Monitor document processing status until completion"""
     start_time = time.time()
-    status_text = st.empty()
-    progress_bar = st.progress(0)
-
-    while True:
-        status = gx.documents.get_processing_status_by_id(process_id=process_id).ingest
-
-        progress_value = 0
-        if hasattr(status, 'percent') and status.percent is not None:
-            try:
-                progress_value = int(status.percent)
-            except (ValueError, TypeError):
-                progress_value = 0
-        elif hasattr(status, 'progress') and status.progress is not None:
-            try:
-                if hasattr(status.progress, 'percent'):
-                    progress_value = int(status.progress.percent)
-                elif hasattr(status.progress, 'value'):
-                    progress_value = int(status.progress.value)
-                elif hasattr(status.progress, 'percentage'):
-                    progress_value = int(status.progress.percentage)
-                else:
-                    progress_value = int(status.progress)
-            except (ValueError, TypeError, AttributeError):
-                progress_value = 0
-        elif hasattr(status, 'percentage') and status.percentage is not None:
-            try:
-                progress_value = int(status.percentage)
-            except (ValueError, TypeError):
-                progress_value = 0
-
-        progress_bar.progress(progress_value)
-
-        status_display = f"**{status.status.capitalize()}**"
-        if progress_value > 0:
-            status_display += f" – {progress_value}%"
-        status_text.write(status_display)
-
-        if status.status in {"complete", "error", "cancelled"}:
-            break
-        if time.time() - start_time > timeout:
-            raise TimeoutError("Ground X ingest timed out.")
-        time.sleep(3)
+
+    # Use a spinner container for better UX
+    with st.spinner("Processing document..."):
+        while True:
+            status = gx.documents.get_processing_status_by_id(process_id=process_id).ingest
+
+            if status.status in {"complete", "error", "cancelled"}:
+                break
+            if time.time() - start_time > timeout:
+                raise TimeoutError("Ground X ingest timed out.")
+            time.sleep(3)
 
     if status.status != "complete":
         raise RuntimeError(f"Ingest finished with status: {status.status!r}")
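
Reviewer note: the Ollama request payload and response parsing in `generate_chat_response` fall between hunks, so this diff does not show them. For context, a minimal standalone sketch of the non-streaming `/api/generate` call the helper relies on — the model name `llama3`, the timeout, and the error handling here are illustrative assumptions, not code from this PR:

```python
import requests

def ollama_generate(prompt: str, model: str = "llama3") -> str:
    """Minimal non-streaming generation request to a local Ollama server."""
    resp = requests.post(
        "http://localhost:11434/api/generate",
        json={
            "model": model,   # assumed model name; the PR does not show the real payload
            "prompt": prompt,
            "stream": False,  # return one JSON object instead of a token stream
        },
        timeout=120,
    )
    resp.raise_for_status()
    # Non-streaming /api/generate responses carry the completion under "response"
    return resp.json()["response"]
```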