Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
242 changes: 222 additions & 20 deletions api/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from pydantic import BaseModel, Field
import google.generativeai as genai
import asyncio
from collections import defaultdict
import fnmatch

# Configure logging
from api.logging_config import setup_logging
Expand All @@ -18,11 +20,167 @@


# Initialize FastAPI app
app = FastAPI(
title="Streaming API",
description="API for streaming chat completions"
app = FastAPI()

# Configure CORS
# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# rejected by browsers under the CORS spec (a wildcard origin cannot be
# credentialed); Starlette copes by echoing the request origin, which
# effectively allows every origin WITH credentials — confirm this is intended
# before production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, specify your frontend domain
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# Pydantic models for wiki pages
class WikiPage(BaseModel):
id: str
title: str
content: str
related_pages: List[str] = []


# ============================================================================
# INTELLIGENT FILE CHUNKING SYSTEM
# ============================================================================

def should_exclude_dir(dir_name: str, excluded_patterns: List[str]) -> bool:
    """Return True if a directory should be skipped during traversal.

    Hidden directories (leading '.') and well-known build/cache/vendor
    directories are always excluded; user-supplied glob patterns are
    checked afterwards.

    Args:
        dir_name: Bare directory name (a single path component, not a path).
        excluded_patterns: Glob patterns, possibly written './name/' style.

    Returns:
        True when the directory must be excluded.
    """
    # Always exclude hidden directories and common build/cache dirs
    if dir_name.startswith('.'):
        return True
    if dir_name in ('__pycache__', 'node_modules', '.venv', 'venv', 'env',
                    'image-cache', 'dist', 'build', 'target', 'out'):
        return True

    # Check against user-defined patterns.
    # BUG FIX: the previous code used pattern.strip('./'), which treats "./"
    # as a *character set* and strips any run of '.' and '/' from both ends,
    # mangling patterns such as 'build.' (-> 'build') or '..tmp' (-> 'tmp').
    # Normalize only a leading './' and a trailing '/' instead.
    for pattern in excluded_patterns:
        cleaned = pattern
        if cleaned.startswith('./'):
            cleaned = cleaned[2:]
        cleaned = cleaned.rstrip('/')
        if fnmatch.fnmatch(dir_name, cleaned):
            return True
    return False


def should_exclude_file(file_name: str, excluded_patterns: List[str]) -> bool:
    """Return True if *file_name* should be skipped.

    Hidden files, ``__init__.py`` and ``.DS_Store`` are unconditionally
    excluded; otherwise the name is matched against the user-supplied
    glob patterns.
    """
    # Unconditional exclusions: dotfiles and well-known noise files.
    if file_name.startswith('.') or file_name in ('__init__.py', '.DS_Store'):
        return True

    # Any matching user pattern excludes the file.
    return any(fnmatch.fnmatch(file_name, pat) for pat in excluded_patterns)


def collect_all_files(path: str, config: Dict) -> List[str]:
    """
    Walk *path* and return every file that survives the exclude filters.

    Args:
        path: Root directory path.
        config: Configuration dict providing 'excluded_dirs' and
            'excluded_files' pattern lists.

    Returns:
        List of file paths relative to *path*.
    """
    excluded_dirs = config.get('excluded_dirs', [])
    excluded_files = config.get('excluded_files', [])

    logger.info(f"Collecting files from {path}")
    logger.info(f"Excluded dirs: {len(excluded_dirs)} patterns")
    logger.info(f"Excluded files: {len(excluded_files)} patterns")

    all_files: List[str] = []
    for root, dirs, files in os.walk(path):
        # Prune excluded directories in-place so os.walk never descends
        # into them.
        dirs[:] = [d for d in dirs if not should_exclude_dir(d, excluded_dirs)]

        rel_dir = os.path.relpath(root, path)
        for name in files:
            if should_exclude_file(name, excluded_files):
                continue
            all_files.append(name if rel_dir == '.' else os.path.join(rel_dir, name))

    logger.info(f"Collected {len(all_files)} files after filtering")
    return all_files


def group_files_by_directory(files: List[str]) -> Dict[str, List[str]]:
    """Bucket file paths by parent directory ('root' for top-level files)."""
    grouped: Dict[str, List[str]] = {}

    for path in files:
        parent = os.path.dirname(path) or "root"
        grouped.setdefault(parent, []).append(path)

    return grouped


def create_file_chunks(files: List[str], max_files_per_chunk: int = 500) -> List[Dict[str, Any]]:
    """
    Pack files into chunks, keeping each directory's files together.

    Directories are visited in sorted order and accumulated into the
    current chunk until adding the next directory would exceed
    *max_files_per_chunk*; a directory is never split across chunks, so a
    single oversized directory can produce a chunk above the limit.

    Args:
        files: List of all file paths.
        max_files_per_chunk: Soft cap on files per chunk.

    Returns:
        List of dicts with 'files', 'directories' and 'file_count' keys.
    """
    by_dir = group_files_by_directory(files)

    chunks: List[Dict[str, Any]] = []
    pending_files: List[str] = []
    pending_dirs: List[str] = []

    def _flush() -> None:
        # Close out the in-progress chunk, if it holds anything.
        if pending_files:
            chunks.append({
                'files': list(pending_files),
                'directories': list(pending_dirs),
                'file_count': len(pending_files),
            })
            pending_files.clear()
            pending_dirs.clear()

    for dir_name, dir_files in sorted(by_dir.items()):
        # Start a fresh chunk when this directory would overflow the current one.
        if pending_files and len(pending_files) + len(dir_files) > max_files_per_chunk:
            _flush()
        pending_files.extend(dir_files)
        pending_dirs.append(dir_name)

    _flush()

    logger.info(f"Created {len(chunks)} chunks from {len(files)} files")
    for i, chunk in enumerate(chunks):
        logger.info(f"  Chunk {i+1}: {chunk['file_count']} files across {len(chunk['directories'])} directories")

    return chunks


def format_chunk_as_tree(chunk: Dict[str, Any]) -> str:
    """Render a chunk as a sorted file listing preceded by a metadata header."""
    files = chunk['files']
    dirs = chunk['directories']

    # Header: counts plus up to five directory names.
    header = [f"# Chunk contains {len(files)} files from {len(dirs)} directories"]
    dir_line = f"# Directories: {', '.join(dirs[:5])}"
    if len(dirs) > 5:
        dir_line += f" ... and {len(dirs) - 5} more"
    header.append(dir_line)

    return '\n'.join(header) + '\n\n' + '\n'.join(sorted(files))

# Configure CORS
app.add_middleware(
CORSMiddleware,
Expand Down Expand Up @@ -273,8 +431,19 @@ async def export_wiki(request: WikiExportRequest):
raise HTTPException(status_code=500, detail=error_msg)

@app.get("/local_repo/structure")
async def get_local_repo_structure(path: str = Query(None, description="Path to local repository")):
"""Return the file tree and README content for a local repository."""
async def get_local_repo_structure(
path: str = Query(None, description="Path to local repository"),
chunk_size: int = Query(500, description="Maximum files per chunk"),
return_chunks: bool = Query(False, description="Return chunked structure for large repos")
):
"""
Return the file tree and README content for a local repository.

Now supports intelligent chunking for large repositories:
- Collects ALL files respecting include/exclude patterns
- Groups files by directory
- Returns chunks if repository is large
"""
if not path:
return JSONResponse(
status_code=400,
Expand All @@ -288,30 +457,63 @@ async def get_local_repo_structure(path: str = Query(None, description="Path to
)

try:
logger.info(f"Processing local repository at: {path}")
file_tree_lines = []
logger.info(f"Processing local repository at: {path} (chunk_size={chunk_size}, return_chunks={return_chunks})")

# Load configuration from repo.json
from api.config import load_repo_config
config_data = load_repo_config()
file_filters = config_data.get('file_filters', {})

# Collect ALL files respecting patterns
all_files = collect_all_files(path, file_filters)

# Find README.md (case-insensitive)
readme_content = ""

for root, dirs, files in os.walk(path):
# Exclude hidden dirs/files and virtual envs
dirs[:] = [d for d in dirs if not d.startswith('.') and d != '__pycache__' and d != 'node_modules' and d != '.venv']
for file in files:
if file.startswith('.') or file == '__init__.py' or file == '.DS_Store':
continue
rel_dir = os.path.relpath(root, path)
rel_file = os.path.join(rel_dir, file) if rel_dir != '.' else file
file_tree_lines.append(rel_file)
# Find README.md (case-insensitive)
if file.lower() == 'readme.md' and not readme_content:
try:
with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
readme_content = f.read()
break
except Exception as e:
logger.warning(f"Could not read README.md: {str(e)}")
readme_content = ""

file_tree_str = '\n'.join(sorted(file_tree_lines))
return {"file_tree": file_tree_str, "readme": readme_content}
if readme_content:
break

# Decide whether to chunk based on repository size
total_files = len(all_files)
logger.info(f"Total files collected: {total_files}")

if return_chunks or total_files > chunk_size:
# Create intelligent chunks
chunks = create_file_chunks(all_files, max_files_per_chunk=chunk_size)

return {
"chunked": True,
"total_files": total_files,
"chunk_count": len(chunks),
"chunks": [
{
"chunk_id": i,
"file_count": chunk['file_count'],
"directories": chunk['directories'],
"file_tree": format_chunk_as_tree(chunk)
}
for i, chunk in enumerate(chunks)
],
"readme": readme_content
}
else:
# Small repo, return as single tree
file_tree_str = '\n'.join(sorted(all_files))
return {
"chunked": False,
"total_files": total_files,
"file_tree": file_tree_str,
"readme": readme_content
}

except Exception as e:
logger.error(f"Error processing local repository: {str(e)}")
return JSONResponse(
Expand Down
51 changes: 51 additions & 0 deletions api/prompts.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,56 @@
"""Module containing all prompts used in the DeepWiki project."""

# System prompt for XML wiki-structure generation. The model is instructed to
# reply with bare <wiki_structure> XML (no markdown fences, no commentary) —
# presumably a downstream parser relies on that exact contract, so edit the
# wording with care. TODO(review): confirm which parser consumes this.
WIKI_STRUCTURE_SYSTEM_PROMPT = r"""
You are an expert code analyst tasked with analyzing a repository and creating a structured wiki outline.

CRITICAL XML FORMATTING INSTRUCTIONS:
- You MUST return ONLY valid XML with NO additional text before or after
- DO NOT wrap the XML in markdown code blocks (no ``` or ```xml)
- DO NOT include any explanation or commentary
- Start directly with <wiki_structure> and end with </wiki_structure>
- Ensure all XML tags are properly closed
- Use proper XML escaping for special characters (&amp; &lt; &gt; &quot; &apos;)

XML STRUCTURE REQUIREMENTS:
- The root element must be <wiki_structure>
- Include a <title> element for the wiki title
- Include a <description> element for the repository description
- For comprehensive mode: Include a <sections> element containing section hierarchies
- Include a <pages> element containing all wiki pages
- Each page must have: id, title, description, importance, relevant_files, related_pages

Example XML structure (comprehensive mode):
<wiki_structure>
<title>Repository Wiki</title>
<description>A comprehensive guide</description>
<sections>
<section id="section-1">
<title>Overview</title>
<pages>
<page_ref>page-1</page_ref>
</pages>
</section>
</sections>
<pages>
<page id="page-1">
<title>Introduction</title>
<description>Overview of the project</description>
<importance>high</importance>
<relevant_files>
<file_path>README.md</file_path>
</relevant_files>
<related_pages>
<related>page-2</related>
</related_pages>
<parent_section>section-1</parent_section>
</page>
</pages>
</wiki_structure>

IMPORTANT: Your entire response must be valid XML. Do not include any text outside the <wiki_structure> tags.
"""

# System prompt for RAG
RAG_SYSTEM_PROMPT = r"""
You are a code assistant which answers user questions on a Github Repo.
Expand Down
Loading