Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
242 changes: 222 additions & 20 deletions api/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from pydantic import BaseModel, Field
import google.generativeai as genai
import asyncio
from collections import defaultdict
import fnmatch

# Configure logging
from api.logging_config import setup_logging
Expand All @@ -18,11 +20,167 @@


# Initialize FastAPI app
app = FastAPI(
title="Streaming API",
description="API for streaming chat completions"
app = FastAPI()

# Configure CORS
# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# rejected by browsers under the CORS spec (a wildcard origin cannot be
# credentialed); Starlette copes by echoing the request origin, which
# effectively allows every origin WITH credentials — confirm this is intended
# before production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, specify your frontend domain
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# Pydantic models for wiki pages
class WikiPage(BaseModel):
id: str
title: str
content: str
related_pages: List[str] = []


# ============================================================================
# INTELLIGENT FILE CHUNKING SYSTEM
# ============================================================================

def should_exclude_dir(dir_name: str, excluded_patterns: List[str]) -> bool:
    """Return True if a directory should be skipped during traversal.

    Hidden directories (leading '.') and well-known build/cache/vendor
    directories are always excluded; user-supplied glob patterns are
    checked afterwards.

    Args:
        dir_name: Bare directory name (a single path component, not a path).
        excluded_patterns: Glob patterns, possibly written './name/' style.

    Returns:
        True when the directory must be excluded.
    """
    # Always exclude hidden directories and common build/cache dirs
    if dir_name.startswith('.'):
        return True
    if dir_name in ('__pycache__', 'node_modules', '.venv', 'venv', 'env',
                    'image-cache', 'dist', 'build', 'target', 'out'):
        return True

    # Check against user-defined patterns.
    # BUG FIX: the previous code used pattern.strip('./'), which treats "./"
    # as a *character set* and strips any run of '.' and '/' from both ends,
    # mangling patterns such as 'build.' (-> 'build') or '..tmp' (-> 'tmp').
    # Normalize only a leading './' and a trailing '/' instead.
    for pattern in excluded_patterns:
        cleaned = pattern
        if cleaned.startswith('./'):
            cleaned = cleaned[2:]
        cleaned = cleaned.rstrip('/')
        if fnmatch.fnmatch(dir_name, cleaned):
            return True
    return False


def should_exclude_file(file_name: str, excluded_patterns: List[str]) -> bool:
    """Return True if *file_name* should be skipped.

    Hidden files, ``__init__.py`` and ``.DS_Store`` are unconditionally
    excluded; otherwise the name is matched against the user-supplied
    glob patterns.
    """
    # Unconditional exclusions: dotfiles and well-known noise files.
    if file_name.startswith('.') or file_name in ('__init__.py', '.DS_Store'):
        return True

    # Any matching user pattern excludes the file.
    return any(fnmatch.fnmatch(file_name, pat) for pat in excluded_patterns)


def collect_all_files(path: str, config: Dict) -> List[str]:
    """
    Walk *path* and return every file that survives the exclude filters.

    Args:
        path: Root directory path.
        config: Configuration dict providing 'excluded_dirs' and
            'excluded_files' pattern lists.

    Returns:
        List of file paths relative to *path*.
    """
    excluded_dirs = config.get('excluded_dirs', [])
    excluded_files = config.get('excluded_files', [])

    logger.info(f"Collecting files from {path}")
    logger.info(f"Excluded dirs: {len(excluded_dirs)} patterns")
    logger.info(f"Excluded files: {len(excluded_files)} patterns")

    all_files: List[str] = []
    for root, dirs, files in os.walk(path):
        # Prune excluded directories in-place so os.walk never descends
        # into them.
        dirs[:] = [d for d in dirs if not should_exclude_dir(d, excluded_dirs)]

        rel_dir = os.path.relpath(root, path)
        for name in files:
            if should_exclude_file(name, excluded_files):
                continue
            all_files.append(name if rel_dir == '.' else os.path.join(rel_dir, name))

    logger.info(f"Collected {len(all_files)} files after filtering")
    return all_files


def group_files_by_directory(files: List[str]) -> Dict[str, List[str]]:
    """Bucket file paths by parent directory ('root' for top-level files)."""
    grouped: Dict[str, List[str]] = {}

    for path in files:
        parent = os.path.dirname(path) or "root"
        grouped.setdefault(parent, []).append(path)

    return grouped


def create_file_chunks(files: List[str], max_files_per_chunk: int = 500) -> List[Dict[str, Any]]:
    """
    Pack files into chunks, keeping each directory's files together.

    Directories are visited in sorted order and accumulated into the
    current chunk until adding the next directory would exceed
    *max_files_per_chunk*; a directory is never split across chunks, so a
    single oversized directory can produce a chunk above the limit.

    Args:
        files: List of all file paths.
        max_files_per_chunk: Soft cap on files per chunk.

    Returns:
        List of dicts with 'files', 'directories' and 'file_count' keys.
    """
    by_dir = group_files_by_directory(files)

    chunks: List[Dict[str, Any]] = []
    pending_files: List[str] = []
    pending_dirs: List[str] = []

    def _flush() -> None:
        # Close out the in-progress chunk, if it holds anything.
        if pending_files:
            chunks.append({
                'files': list(pending_files),
                'directories': list(pending_dirs),
                'file_count': len(pending_files),
            })
            pending_files.clear()
            pending_dirs.clear()

    for dir_name, dir_files in sorted(by_dir.items()):
        # Start a fresh chunk when this directory would overflow the current one.
        if pending_files and len(pending_files) + len(dir_files) > max_files_per_chunk:
            _flush()
        pending_files.extend(dir_files)
        pending_dirs.append(dir_name)

    _flush()

    logger.info(f"Created {len(chunks)} chunks from {len(files)} files")
    for i, chunk in enumerate(chunks):
        logger.info(f"  Chunk {i+1}: {chunk['file_count']} files across {len(chunk['directories'])} directories")

    return chunks


def format_chunk_as_tree(chunk: Dict[str, Any]) -> str:
    """Render a chunk as a sorted file listing preceded by a metadata header."""
    files = chunk['files']
    dirs = chunk['directories']

    # Header: counts plus up to five directory names.
    header = [f"# Chunk contains {len(files)} files from {len(dirs)} directories"]
    dir_line = f"# Directories: {', '.join(dirs[:5])}"
    if len(dirs) > 5:
        dir_line += f" ... and {len(dirs) - 5} more"
    header.append(dir_line)

    return '\n'.join(header) + '\n\n' + '\n'.join(sorted(files))

# Configure CORS
app.add_middleware(
CORSMiddleware,
Expand Down Expand Up @@ -273,8 +431,19 @@ async def export_wiki(request: WikiExportRequest):
raise HTTPException(status_code=500, detail=error_msg)

@app.get("/local_repo/structure")
async def get_local_repo_structure(path: str = Query(None, description="Path to local repository")):
"""Return the file tree and README content for a local repository."""
async def get_local_repo_structure(
path: str = Query(None, description="Path to local repository"),
chunk_size: int = Query(500, description="Maximum files per chunk"),
return_chunks: bool = Query(False, description="Return chunked structure for large repos")
):
"""
Return the file tree and README content for a local repository.

Now supports intelligent chunking for large repositories:
- Collects ALL files respecting include/exclude patterns
- Groups files by directory
- Returns chunks if repository is large
"""
if not path:
return JSONResponse(
status_code=400,
Expand All @@ -288,30 +457,63 @@ async def get_local_repo_structure(path: str = Query(None, description="Path to
)

try:
logger.info(f"Processing local repository at: {path}")
file_tree_lines = []
logger.info(f"Processing local repository at: {path} (chunk_size={chunk_size}, return_chunks={return_chunks})")

# Load configuration from repo.json
from api.config import load_repo_config
config_data = load_repo_config()
file_filters = config_data.get('file_filters', {})

# Collect ALL files respecting patterns
all_files = collect_all_files(path, file_filters)

# Find README.md (case-insensitive)
readme_content = ""

for root, dirs, files in os.walk(path):
# Exclude hidden dirs/files and virtual envs
dirs[:] = [d for d in dirs if not d.startswith('.') and d != '__pycache__' and d != 'node_modules' and d != '.venv']
for file in files:
if file.startswith('.') or file == '__init__.py' or file == '.DS_Store':
continue
rel_dir = os.path.relpath(root, path)
rel_file = os.path.join(rel_dir, file) if rel_dir != '.' else file
file_tree_lines.append(rel_file)
# Find README.md (case-insensitive)
if file.lower() == 'readme.md' and not readme_content:
try:
with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
readme_content = f.read()
break
except Exception as e:
logger.warning(f"Could not read README.md: {str(e)}")
readme_content = ""

file_tree_str = '\n'.join(sorted(file_tree_lines))
return {"file_tree": file_tree_str, "readme": readme_content}
if readme_content:
break

# Decide whether to chunk based on repository size
total_files = len(all_files)
logger.info(f"Total files collected: {total_files}")

if return_chunks or total_files > chunk_size:
# Create intelligent chunks
chunks = create_file_chunks(all_files, max_files_per_chunk=chunk_size)

return {
"chunked": True,
"total_files": total_files,
"chunk_count": len(chunks),
"chunks": [
{
"chunk_id": i,
"file_count": chunk['file_count'],
"directories": chunk['directories'],
"file_tree": format_chunk_as_tree(chunk)
}
for i, chunk in enumerate(chunks)
],
"readme": readme_content
}
else:
# Small repo, return as single tree
file_tree_str = '\n'.join(sorted(all_files))
return {
"chunked": False,
"total_files": total_files,
"file_tree": file_tree_str,
"readme": readme_content
}

except Exception as e:
logger.error(f"Error processing local repository: {str(e)}")
return JSONResponse(
Expand Down
51 changes: 51 additions & 0 deletions api/prompts.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,56 @@
"""Module containing all prompts used in the DeepWiki project."""

# System prompt for XML wiki-structure generation. The model is instructed to
# reply with bare <wiki_structure> XML (no markdown fences, no commentary) —
# presumably a downstream parser relies on that exact contract, so edit the
# wording with care. TODO(review): confirm which parser consumes this.
WIKI_STRUCTURE_SYSTEM_PROMPT = r"""
You are an expert code analyst tasked with analyzing a repository and creating a structured wiki outline.

CRITICAL XML FORMATTING INSTRUCTIONS:
- You MUST return ONLY valid XML with NO additional text before or after
- DO NOT wrap the XML in markdown code blocks (no ``` or ```xml)
- DO NOT include any explanation or commentary
- Start directly with <wiki_structure> and end with </wiki_structure>
- Ensure all XML tags are properly closed
- Use proper XML escaping for special characters (&amp; &lt; &gt; &quot; &apos;)

XML STRUCTURE REQUIREMENTS:
- The root element must be <wiki_structure>
- Include a <title> element for the wiki title
- Include a <description> element for the repository description
- For comprehensive mode: Include a <sections> element containing section hierarchies
- Include a <pages> element containing all wiki pages
- Each page must have: id, title, description, importance, relevant_files, related_pages

Example XML structure (comprehensive mode):
<wiki_structure>
<title>Repository Wiki</title>
<description>A comprehensive guide</description>
<sections>
<section id="section-1">
<title>Overview</title>
<pages>
<page_ref>page-1</page_ref>
</pages>
</section>
</sections>
<pages>
<page id="page-1">
<title>Introduction</title>
<description>Overview of the project</description>
<importance>high</importance>
<relevant_files>
<file_path>README.md</file_path>
</relevant_files>
<related_pages>
<related>page-2</related>
</related_pages>
<parent_section>section-1</parent_section>
</page>
</pages>
</wiki_structure>

IMPORTANT: Your entire response must be valid XML. Do not include any text outside the <wiki_structure> tags.
"""

# System prompt for RAG
RAG_SYSTEM_PROMPT = r"""
You are a code assistant which answers user questions on a Github Repo.
Expand Down
Loading