diff --git a/groundX-doc-pipeline/app.py b/groundX-doc-pipeline/app.py
index c6d75281b..13c73ea2d 100644
--- a/groundX-doc-pipeline/app.py
+++ b/groundX-doc-pipeline/app.py
@@ -1,5 +1,6 @@
import os
import tempfile
+import base64
import requests
from typing import Any, Dict
@@ -15,10 +16,8 @@
# Application Configuration
st.set_page_config(page_title="Ground X - X-Ray", layout="wide")
-# Custom CSS for enhanced chat interface layout
st.markdown("""
""", unsafe_allow_html=True)
@@ -59,31 +255,50 @@ def reset_analysis():
if key in st.session_state:
del st.session_state[key]
+def display_pdf(file):
+ """Display PDF preview using embedded iframe"""
+ st.markdown("### PDF Preview")
+ base64_pdf = base64.b64encode(file.read()).decode("utf-8")
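+    # Embed the PDF as a base64 data URI in an <iframe>; very large files may exceed browser data-URI size limits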
+    pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600" type="application/pdf"></iframe>'
+    st.markdown(pdf_display, unsafe_allow_html=True)
+
# Chat Interface Functions
def prepare_chat_context(xray_data, prompt):
"""Prepare context from X-Ray data for the LLM"""
context_parts = []
- # Add summary first for quick overview
if xray_data.get('fileSummary'):
context_parts.append(f"Summary: {xray_data['fileSummary']}")
- # Add limited document content (first 2 pages, first 3 chunks each)
if 'documentPages' in xray_data and xray_data['documentPages']:
extracted_texts = []
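+        # Keep the LLM prompt compact: only the first 2 pages, 3 chunks per page, 500 chars per chunk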
- for page in xray_data['documentPages'][:2]: # Only first 2 pages
+ for page in xray_data['documentPages'][:2]:
if 'chunks' in page:
- for chunk in page['chunks'][:3]: # Only first 3 chunks per page
+ for chunk in page['chunks'][:3]:
if 'text' in chunk and chunk['text']:
text = chunk['text']
- if len(text) > 500: # Shorter limit
+ if len(text) > 500:
text = text[:500] + "..."
extracted_texts.append(text)
if extracted_texts:
context_parts.append(f"Document Content: {' '.join(extracted_texts)}")
- # Add essential metadata
if xray_data.get('fileType'):
context_parts.append(f"File Type: {xray_data['fileType']}")
if xray_data.get('language'):
@@ -92,8 +307,7 @@ def prepare_chat_context(xray_data, prompt):
return "\n\n".join(context_parts)
def generate_chat_response(prompt, context):
- """Generate AI response using Ollama with structured prompt engineering"""
- # Construct comprehensive prompt for intelligent query handling
+ """Generate AI response using Ollama"""
full_prompt = f"""You are an AI assistant helping analyze a document. You have access to the following document information:
{context}
@@ -110,7 +324,6 @@ def generate_chat_response(prompt, context):
Response:"""
- # Initialize Ollama API request
try:
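+        # POST to the local Ollama /api/generate endpoint (default port 11434); assumes an Ollama server is running locally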
response = requests.post(
"http://localhost:11434/api/generate",
@@ -138,22 +351,15 @@ def generate_chat_response(prompt, context):
except Exception as e:
return f"I'm having trouble accessing the AI model right now. Error: {str(e)}"
-# Initialize Streamlit session state
-for key in ["xray_data", "uploaded_file_path", "uploaded_file_name", "uploaded_file_type", "processing_complete", "used_existing_file", "auto_loaded_file"]:
+for key in ["xray_data", "uploaded_file_path", "uploaded_file_name", "uploaded_file_type", "processing_complete", "used_existing_file", "auto_loaded_file", "active_tab"]:
if key not in st.session_state:
-        st.session_state[key] = None if key == "xray_data" else False
+        # "active_tab" must also default to None so the "is None" fallback below can select the analysis tab
+        st.session_state[key] = None if key in ("xray_data", "active_tab") else False
-# Application Header
-st.markdown("""
-# World-class Document Processing Pipeline
-""")
+st.markdown("# World-class Document Processing Pipeline")
-# Load and display GroundX branding
-import base64
try:
with open("assets/groundx.png", "rb") as img_file:
logo_base64 = base64.b64encode(img_file.read()).decode()
-
st.markdown(f"""
powered by
@@ -170,7 +376,6 @@ def generate_chat_response(prompt, context):
""", unsafe_allow_html=True)
-# Document Upload Interface
with st.sidebar:
st.header("📄 Upload Document")
@@ -196,10 +401,54 @@ def generate_chat_response(prompt, context):
st.info(f"**File**: {uploaded.name}\n**Size**: {uploaded.size / 1024:.1f} KB\n**Type**: {uploaded.type}")
st.session_state.uploaded_file_name = uploaded.name
st.session_state.uploaded_file_type = uploaded.type
+
+        st.markdown("---")
+        st.markdown("### 📄 Document Preview")
+
+        if uploaded.type == "application/pdf":
+            display_pdf(uploaded)
+        elif uploaded.type.startswith("image/"):
+            st.image(uploaded, caption=f"Preview: {uploaded.name}", use_column_width=True)
+        elif uploaded.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+            st.info("📝 **Word Document** - Preview will be available after processing")
+            st.markdown("**Content**: Text extraction in progress...")
+        else:
+            st.info(f"📄 **{uploaded.type}** - Preview will be available after processing")
+
st.button("🔄 Clear Analysis", on_click=reset_analysis)
-# Initialize Ground X API client and storage bucket
try:
gx = create_client()
bucket_id = ensure_bucket(gx)
@@ -207,9 +456,7 @@ def generate_chat_response(prompt, context):
st.error(f"❌ {e}")
st.stop()
-# Auto-load existing document from bucket if available
if not st.session_state.auto_loaded_file and not st.session_state.xray_data:
- # Configurable sample file - can be set via environment variable
existing_file_name = os.getenv("SAMPLE_FILE_NAME", "tmpivkf8qf8_sample-file.pdf")
existing_doc_id = check_file_exists(gx, bucket_id, existing_file_name)
@@ -226,7 +473,6 @@ def generate_chat_response(prompt, context):
else:
st.session_state.auto_loaded_file = True
-# Document Processing Logic
should_process = False
file_to_process = None
@@ -255,39 +501,47 @@ def getbuffer(self):
)
if should_process and st.session_state.xray_data is None:
- with st.status("🔄 Processing document...", expanded=True) as status:
- # Determine file path for processing (temporary for new uploads, stored for existing)
+ # Create half-width container for status
+ col1, col2 = st.columns([1, 1])
+
+ with col1:
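+        # Previously ingested files carry a .path attribute; fresh uploads fall back to the temp path in session state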
if hasattr(file_to_process, 'path'):
file_path = file_to_process.path
else:
file_path = st.session_state.uploaded_file_path
try:
+ # Step 1: Upload
+ st.write("📤 **Uploading to Ground X...**")
+
xray, used_existing = process_document(gx, bucket_id, file_to_process, file_path)
+
+ if used_existing:
+ st.write("✅ **File already exists in bucket**")
+ st.write("📥 **Fetching existing X-Ray data...**")
+ else:
+ st.write("⏳ **Processing document...**")
+ st.write("📥 **Fetching X-Ray data...**")
+
st.session_state.xray_data = xray
st.session_state.processing_complete = True
st.session_state.used_existing_file = used_existing
+ st.session_state.active_tab = "analysis" # Auto-switch to analysis tab
if used_existing:
- st.write("✅ **File already exists in bucket**")
- st.write("📥 **Fetched X-Ray data...**")
st.success("✅ Document analysis completed! (Used existing file)")
else:
- st.write("📤 **Uploaded to Ground X...**")
- st.write("⏳ **Processed document...**")
- st.write("📥 **Fetched X-Ray data...**")
st.success("✅ Document parsed successfully! Explore the results below.")
st.write("🎉 **Analysis complete!**")
+
except Exception as e:
st.error(f"❌ Error processing document: {str(e)}")
st.session_state.processing_complete = False
-# Analysis Results Display
if st.session_state.xray_data:
xray = st.session_state.xray_data
- # Extract and display document metadata metrics
file_type = xray.get('fileType', 'Unknown')
language = xray.get('language', 'Unknown')
pages = len(xray.get("documentPages", []))
@@ -295,13 +549,180 @@ def getbuffer(self):
st.markdown(f"**File Type:** {file_type} **Language:** {language} **Pages:** {pages} **Keywords:** {keywords}")
-    # Primary interface tabs for analysis and interaction
-    main_tabs = st.tabs([
-        "📊 X-Ray Analysis",
-        "💬 Chat"
-    ])
+    # Create a left-aligned container for the tab buttons
+    col1, col2 = st.columns([1, 4])
+
+ with col1:
+ # Create a single container with custom button styling
+ st.markdown("""
+
+ """, unsafe_allow_html=True)
+
+ # Use uneven column widths to move chat button left
+ col1, col2 = st.columns([0.7, 1.3])
+
+ with col1:
+ if st.button("📊 X-Ray Analysis", key="analysis_tab_btn", type="primary" if st.session_state.active_tab == "analysis" else "secondary"):
+ st.session_state.active_tab = "analysis"
+ st.rerun()
+
+ with col2:
+ if st.button("💬 Chat", key="chat_tab_btn", type="primary" if st.session_state.active_tab == "chat" else "secondary"):
+ st.session_state.active_tab = "chat"
+ st.rerun()
+
+ if st.session_state.active_tab is None:
+ st.session_state.active_tab = "analysis"
- with main_tabs[0]:
+ if st.session_state.active_tab == "analysis":
st.markdown("### 📊 X-Ray Analysis Results")
tabs = st.tabs([
"🔍 JSON Output",
@@ -312,105 +733,115 @@ def getbuffer(self):
"🏷️ Keywords"
])
- with tabs[0]:
- st.subheader("🔍 Raw JSON Data")
- st.json(xray)
+ with tabs[0]:
+ st.subheader("🔍 Raw JSON Data")
+ st.json(xray)
- with tabs[1]:
- st.subheader("📝 Narrative Summary")
- # Extract and display narrative content from document chunks
- narratives = []
- if "documentPages" in xray:
- for page in xray["documentPages"]:
- if "chunks" in page:
- for chunk in page["chunks"]:
- if "narrative" in chunk and chunk["narrative"]:
- narratives.extend(chunk["narrative"])
-
- if narratives:
- for i, narrative in enumerate(narratives, 1):
- st.markdown(f"**Narrative {i}:**")
- st.markdown(narrative)
- st.divider()
- else:
- st.info("No narrative text found in the X-Ray data")
+ with tabs[1]:
+ st.subheader("📝 Narrative Summary")
+ narratives = []
+ if "documentPages" in xray:
+ for page in xray["documentPages"]:
+ if "chunks" in page:
+ for chunk in page["chunks"]:
+ if "narrative" in chunk and chunk["narrative"]:
+ narratives.extend(chunk["narrative"])
+
+ if narratives:
+ for i, narrative in enumerate(narratives, 1):
+ st.markdown(f"**Narrative {i}:**")
+ st.markdown(narrative)
+ st.divider()
+ else:
+ st.info("No narrative text found in the X-Ray data")
- with tabs[2]:
- st.subheader("📋 File Summary")
- file_summary = xray.get("fileSummary")
- if file_summary:
- st.markdown(file_summary)
- else:
- st.info("No file summary found in the X-Ray data")
-
- with tabs[3]:
- st.subheader("💡 Suggested Text")
- # Extract and display suggested text content from document chunks
- suggested_texts = []
- if "documentPages" in xray:
- for page in xray["documentPages"]:
- if "chunks" in page:
- for chunk in page["chunks"]:
- if "suggestedText" in chunk and chunk["suggestedText"]:
- suggested_texts.append(chunk["suggestedText"])
-
- if suggested_texts:
- for i, suggested in enumerate(suggested_texts, 1):
- st.markdown(f"**Suggested Text {i}:**")
- st.markdown(suggested)
- st.divider()
- else:
- st.info("No suggested text found in the X-Ray data")
+ with tabs[2]:
+ st.subheader("📋 File Summary")
+ file_summary = xray.get("fileSummary")
+ if file_summary:
+ st.markdown(file_summary)
+ else:
+ st.info("No file summary found in the X-Ray data")
- with tabs[4]:
- st.subheader("📄 Extracted Text")
- # Extract and display raw text content from document chunks
- extracted_texts = []
- if "documentPages" in xray:
- for page in xray["documentPages"]:
- if "chunks" in page:
- for chunk in page["chunks"]:
- if "text" in chunk and chunk["text"]:
- extracted_texts.append(chunk["text"])
-
- if extracted_texts:
- combined_text = "\n\n---\n\n".join(extracted_texts)
- st.text_area("Extracted Content", combined_text, height=400)
- else:
- st.info("No extracted text found in the X-Ray data")
+ with tabs[3]:
+ st.subheader("💡 Suggested Text")
+ suggested_texts = []
+ if "documentPages" in xray:
+ for page in xray["documentPages"]:
+ if "chunks" in page:
+ for chunk in page["chunks"]:
+ if "suggestedText" in chunk and chunk["suggestedText"]:
+ suggested_texts.append(chunk["suggestedText"])
+
+ if suggested_texts:
+ for i, suggested in enumerate(suggested_texts, 1):
+ st.markdown(f"**Suggested Text {i}:**")
+ st.markdown(suggested)
+ st.divider()
+ else:
+ st.info("No suggested text found in the X-Ray data")
- with tabs[5]:
- st.subheader("🏷️ Keywords")
- keywords = xray.get("fileKeywords")
- if keywords:
- st.write(keywords)
- else:
- st.info("No keywords found in the X-Ray data")
+ with tabs[4]:
+ st.subheader("📄 Extracted Text")
+ extracted_texts = []
+ if "documentPages" in xray:
+ for page in xray["documentPages"]:
+ if "chunks" in page:
+ for chunk in page["chunks"]:
+ if "text" in chunk and chunk["text"]:
+ extracted_texts.append(chunk["text"])
+
+ if extracted_texts:
+ combined_text = "\n\n---\n\n".join(extracted_texts)
+ st.text_area("Extracted Content", combined_text, height=400)
+ else:
+ st.info("No extracted text found in the X-Ray data")
+
+ with tabs[5]:
+ st.subheader("🏷️ Keywords")
+ keywords = xray.get("fileKeywords")
+ if keywords:
+ st.write(keywords)
+ else:
+ st.info("No keywords found in the X-Ray data")
- # Interactive Chat Interface
- with main_tabs[1]:
+ elif st.session_state.active_tab == "chat":
st.markdown("### 💬 Chat with Document")
st.markdown("Ask questions about your document.")
- # Initialize and display chat conversation history
if "chat_history" not in st.session_state:
st.session_state.chat_history = []
- # Render existing chat messages
for message in st.session_state.chat_history:
with st.chat_message(message["role"]):
st.markdown(message["content"])
- # Process user input and generate responses
if prompt := st.chat_input("Ask a question about your document..."):
- # Store user message in conversation history
st.session_state.chat_history.append({"role": "user", "content": prompt})
+ st.session_state.active_tab = "chat"
+
-            # Display user message in chat interface
with st.chat_message("user"):
st.markdown(prompt)
- # Generate and display AI assistant response
with st.chat_message("assistant"):
with st.spinner("Thinking..."):
context = prepare_chat_context(xray, prompt)
@@ -423,4 +854,4 @@ def getbuffer(self):
elif uploaded is None and st.session_state.uploaded_file_name and st.session_state.xray_data:
status = "Auto-loaded" if st.session_state.auto_loaded_file else "Analysis"
st.success(f"✅ **{status} complete for**: {st.session_state.uploaded_file_name}")
- st.info("💡 **Tip**: You can re-process the file using the button in the sidebar, or upload a new document.")
+ st.info("💡 **Tip**: You can re-process the file using the button in the sidebar, or upload a new document.")
\ No newline at end of file
diff --git a/groundX-doc-pipeline/groundx_utils.py b/groundX-doc-pipeline/groundx_utils.py
index d6b0fb324..4adf4ab45 100644
--- a/groundX-doc-pipeline/groundx_utils.py
+++ b/groundX-doc-pipeline/groundx_utils.py
@@ -52,48 +52,17 @@ def ingest_document(gx: GroundX, bucket_id: str, path: Path, mime: str) -> str:
def poll_until_complete(gx: GroundX, process_id: str, timeout: int = 600) -> None:
"""Monitor document processing status until completion"""
start_time = time.time()
- status_text = st.empty()
- progress_bar = st.progress(0)
-
- while True:
- status = gx.documents.get_processing_status_by_id(process_id=process_id).ingest
-
- progress_value = 0
- if hasattr(status, 'percent') and status.percent is not None:
- try:
- progress_value = int(status.percent)
- except (ValueError, TypeError):
- progress_value = 0
- elif hasattr(status, 'progress') and status.progress is not None:
- try:
- if hasattr(status.progress, 'percent'):
- progress_value = int(status.progress.percent)
- elif hasattr(status.progress, 'value'):
- progress_value = int(status.progress.value)
- elif hasattr(status.progress, 'percentage'):
- progress_value = int(status.progress.percentage)
- else:
- progress_value = int(status.progress)
- except (ValueError, TypeError, AttributeError):
- progress_value = 0
- elif hasattr(status, 'percentage') and status.percentage is not None:
- try:
- progress_value = int(status.percentage)
- except (ValueError, TypeError):
- progress_value = 0
-
- progress_bar.progress(progress_value)
-
- status_display = f"**{status.status.capitalize()}**"
- if progress_value > 0:
- status_display += f" – {progress_value}%"
- status_text.write(status_display)
-
- if status.status in {"complete", "error", "cancelled"}:
- break
- if time.time() - start_time > timeout:
- raise TimeoutError("Ground X ingest timed out.")
- time.sleep(3)
+
+ # Use a spinner container for better UX
+ with st.spinner("Processing document..."):
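+        # Poll every 3 seconds until Ground X reports a terminal state, raising if the timeout elapses first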
+ while True:
+ status = gx.documents.get_processing_status_by_id(process_id=process_id).ingest
+
+ if status.status in {"complete", "error", "cancelled"}:
+ break
+ if time.time() - start_time > timeout:
+ raise TimeoutError("Ground X ingest timed out.")
+ time.sleep(3)
if status.status != "complete":
raise RuntimeError(f"Ingest finished with status: {status.status!r}")