From 1b300a6c07aa9d0ec3985a00c23aff7f3d9fe4d9 Mon Sep 17 00:00:00 2001 From: ebembi-crdb Date: Mon, 17 Mar 2025 15:18:52 +0530 Subject: [PATCH 1/7] data required for archival --- src/current/_config_cockroachdb_local.yml | 1 - src/current/audit.py | 408 + src/current/audit_report.txt | 76545 ++++++++++++++++++++ src/current/v19.2_audit_report.txt | 76545 ++++++++++++++++++++ 4 files changed, 153498 insertions(+), 1 deletion(-) create mode 100644 src/current/audit.py create mode 100644 src/current/audit_report.txt create mode 100644 src/current/v19.2_audit_report.txt diff --git a/src/current/_config_cockroachdb_local.yml b/src/current/_config_cockroachdb_local.yml index 3440c9a8df7..98579dde61f 100644 --- a/src/current/_config_cockroachdb_local.yml +++ b/src/current/_config_cockroachdb_local.yml @@ -4,7 +4,6 @@ exclude: - "v2.0" - "v2.1" - "v19.1" -- "v19.2" - "v20.1" - "ci" - "scripts" diff --git a/src/current/audit.py b/src/current/audit.py new file mode 100644 index 00000000000..2d2c968735d --- /dev/null +++ b/src/current/audit.py @@ -0,0 +1,408 @@ +#!/usr/bin/env python3 +""" +audit.py + +An audit script that: +1) Finds cross-version links (categorized by location). +2) Finds cockroachlabs.com non-docs links. +3) Finds external (non-cockroachlabs.com) links. +4) Audits image/CSS/JS/font usage, categorizing them as present, missing, or external. + +**This version** uses a "fallback" approach in asset_status() so +we do *not* unconditionally remove "/docs/" from the path. Instead, +we generate multiple candidate paths and see if any match the disk. +""" + +import os +import sys +import re +import argparse +from bs4 import BeautifulSoup +from urllib.parse import urlparse + +def is_cross_version_link(url, current_version): + """ + Return (True, found_version) if `url` is a docs link pointing to a different version. + E.g. /docs/v19.2/... vs current_version v21.1 + """ + match = re.search(r'/docs/(v\d+\.\d+)', url) + if match: + version = match.group(1) + return (version != current_version, version) + return (False, None) + +def categorize_cross_version_link(tag): + """ + For cross-version links, figure out if they're in the sidebar, version-switcher, or body. 
+ """ + if tag.find_parent(id="sidebar"): + return "Sidebar Navigation" + elif tag.find_parent(id="version-switcher"): + return "Version Switcher" + else: + return "Content Body" + +def find_assets(soup): + """ + Return a dict: { "images": set(), "css": set(), "js": set(), "fonts": set() } + by scanning , , + +''' + + html = re.sub(r"", nav_deps + "\n", html, flags=re.IGNORECASE) + + # Add offline styles + offline_styles = f'''''' + + html = re.sub(r"", offline_styles + "\n", html, flags=re.IGNORECASE) + + # Add navigation initialization + nav_init = """""" + + html = re.sub(r"", nav_init + "\n", html, flags=re.IGNORECASE) + + # Write output + dst_path.parent.mkdir(parents=True, exist_ok=True) + dst_path.write_text(html, encoding="utf-8") + + self.processed_files.add(str(rel_path)) + + except Exception as e: + self.log(f"Error processing {src_path}: {e}", "ERROR") + import traceback + traceback.print_exc() + + def fix_css_images(self): + """Fix image paths in CSS files""" + self.log("Fixing CSS image paths...") + + for css_file in (OUTPUT_ROOT / "css").rglob("*.css"): + try: + content = css_file.read_text(encoding="utf-8") + + # Fix various image URL patterns + content = re.sub( + r"url\((['\"]?)/?docs/images/([^)\"']+)\1\)", + r"url(\1../images/\2\1)", + content, + ) + content = re.sub( + r"url\((['\"]?)images/([^)\"']+)\1\)", + r"url(\1../images/\2\1)", + content, + ) + + css_file.write_text(content, encoding="utf-8") + + except Exception as e: + self.log(f"Error fixing CSS {css_file}: {e}", "WARNING") + + def download_google_fonts(self): + """Download and localize Google Fonts""" + self.log("Downloading Google Fonts...") + + fonts_dir = OUTPUT_ROOT / "fonts" + fonts_dir.mkdir(exist_ok=True) + + try: + # Get CSS + headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'} + css_response = requests.get(FONTS_CSS_URL, headers=headers, timeout=10) + css_response.raise_for_status() + css_content = css_response.text + + # Extract and download font files + font_urls = set(re.findall(r"url\((https://fonts\.gstatic\.com/[^\)]+)\)", css_content)) + + for url in font_urls: + try: + # Download font + font_response = requests.get(url, headers=headers, timeout=10) + font_response.raise_for_status() + + # Save font + parsed = urlparse(url) + font_path = parsed.path.lstrip("/") + dst = fonts_dir / font_path + dst.parent.mkdir(parents=True, exist_ok=True) + dst.write_bytes(font_response.content) + + # Update CSS + css_content = css_content.replace(url, f"../fonts/{font_path}") + + except Exception as e: + self.log(f"Failed to download font from {url}: {e}", "WARNING") + + # Save localized CSS + (OUTPUT_ROOT / "css" / "google-fonts.css").write_text(css_content, encoding="utf-8") + self.log("Google Fonts localized", "SUCCESS") + + except Exception as e: + self.log(f"Error downloading fonts: {e}", "ERROR") + # Create fallback + fallback = """/* Fallback fonts */ +body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Arial, sans-serif; } +code, pre { font-family: Consolas, Monaco, "Courier New", monospace; }""" + (OUTPUT_ROOT / "css" / "google-fonts.css").write_text(fallback) + + def create_link_test_page(self): + """Create a test page to verify link processing""" + test_html = f""" + + + Link Test Page + + + +

+    <h1>Link Processing Test Results</h1>
+    <p>This page shows how different link patterns were processed:</p>
+
+    <h2>From pages NOT in version directory:</h2>
+    <div class="test-case">
+        <p>Context: Page at /index.html</p>
+        <p>Original: /docs/insert.html</p>
+        <p>Should be: v19.2/insert.html</p>
+        <a href="v19.2/insert.html">Test Link</a>
+    </div>
+    <div class="test-case">
+        <p>Context: Page at /index.html</p>
+        <p>Original: /docs/v19.2/secure-a-cluster.html</p>
+        <p>Should be: v19.2/secure-a-cluster.html</p>
+        <a href="v19.2/secure-a-cluster.html">Test Link</a>
+    </div>
+
+    <h2>From pages IN version directory:</h2>
+    <div class="test-case">
+        <p>Context: Page at /v19.2/index.html</p>
+        <p>Original: /docs/secure-a-cluster.html</p>
+        <p>Should be: secure-a-cluster.html (same dir)</p>
+        <p>This link would be at: v19.2/secure-a-cluster.html</p>
+    </div>
+    <div class="test-case">
+        <p>Context: Page at /v19.2/index.html</p>
+        <p>Original: /docs/v19.2/secure-a-cluster.html</p>
+        <p>Should be: secure-a-cluster.html (same dir)</p>
+        <p>This link would be at: v19.2/secure-a-cluster.html</p>
+    </div>
+
+    <h2>Special cases:</h2>
+    <div class="test-case">
+        <p>Original: /docs/stable/something.html</p>
+        <p>Should be: v19.2/something.html</p>
+        <a href="v19.2/something.html">Test Link</a>
+    </div>
+    <div class="test-case">
+        <p>Original: /docs/cockroachcloud/quickstart.html</p>
+        <p>Should be: cockroachcloud/quickstart.html</p>
+        <a href="cockroachcloud/quickstart.html">Test Link</a>
+    </div>
+    <div class="test-case">
+        <p>Original: /docs/releases/index.html</p>
+        <p>Should be: releases/index.html</p>
+        <a href="releases/index.html">Test Link</a>
+    </div>
+
+    <p>Note: Click each link to verify it works correctly.</p>
+ +""" + + test_path = OUTPUT_ROOT / "_link_test.html" + test_path.write_text(test_html) + self.log("Created link test page: _link_test.html", "SUCCESS") + + def create_index_page(self): + """Create the index page""" + index_html = f""" + + + + + CockroachDB {TARGET_VERSION} Documentation (Offline) + + + + + +

+    <header>
+        <h1>CockroachDB {TARGET_VERSION}</h1>
+        <p>Offline Documentation Archive</p>
+    </header>
+
+    <div class="card">
+        <h2>ā˜ļø CockroachDB Cloud</h2>
+    </div>
+
+    <div class="note">
+        <h2>šŸ“Œ Offline Archive</h2>
+        <p>This is a complete offline archive of the CockroachDB {TARGET_VERSION} documentation.
+           All internal links have been updated to work offline.</p>
+        <p>Created: {datetime.now().strftime('%B %d, %Y at %I:%M %p')}</p>
+    </div>
+ + + +""" + + (OUTPUT_ROOT / "index.html").write_text(index_html) + self.log("Created index.html", "SUCCESS") + + def build(self): + """Main build process following Code 2's structure""" + print("\n" + "="*60) + print("šŸš€ COCKROACHDB OFFLINE DOCUMENTATION ARCHIVER") + print("="*60) + + # Verify paths + self.log(f"Jekyll Root: {JEKYLL_ROOT}") + self.log(f"Site Root: {SITE_ROOT}") + self.log(f"Docs Root: {DOCS_ROOT}") + self.log(f"Output: {OUTPUT_ROOT}") + + if not SITE_ROOT.exists(): + self.log("Site root not found! Run 'jekyll build' first.", "ERROR") + return False + + # Clean output directory + if OUTPUT_ROOT.exists(): + self.log("Cleaning existing output directory...") + shutil.rmtree(OUTPUT_ROOT) + OUTPUT_ROOT.mkdir(parents=True) + + # CRITICAL: Copy global assets FIRST (from SITE_ROOT, not DOCS_ROOT) + self.log("\n--- Copying Global Assets ---") + for asset_dir in ["css", "js", "img"]: + src = SITE_ROOT / asset_dir + if src.exists(): + dst = OUTPUT_ROOT / asset_dir + shutil.copytree(src, dst, dirs_exist_ok=True) + self.log(f"Copied global {asset_dir}/", "SUCCESS") + + # Copy docs-specific assets + self.log("\n--- Copying Docs Assets ---") + for asset_dir in ["css", "js", "images", "_internal"]: + src = DOCS_ROOT / asset_dir + if src.exists(): + dst = OUTPUT_ROOT / asset_dir + shutil.copytree(src, dst, dirs_exist_ok=True) + self.log(f"Copied docs {asset_dir}/", "SUCCESS") + + # Ensure critical navigation assets + self.log("\n--- Ensuring Navigation Assets ---") + self.ensure_asset( + "jquery.min.js", + [DOCS_ROOT / "js" / "jquery.min.js", SITE_ROOT / "js" / "jquery.min.js"], + "https://code.jquery.com/jquery-3.6.3.min.js", + OUTPUT_ROOT / "js" + ) + self.ensure_asset( + "jquery.cookie.min.js", + [DOCS_ROOT / "js" / "jquery.cookie.min.js", SITE_ROOT / "js" / "jquery.cookie.min.js"], + "https://cdnjs.cloudflare.com/ajax/libs/jquery-cookie/1.4.1/jquery.cookie.min.js", + OUTPUT_ROOT / "js" + ) + self.ensure_asset( + "jquery.navgoco.min.js", + [DOCS_ROOT / "js" / "jquery.navgoco.min.js", SITE_ROOT / "js" / "jquery.navgoco.min.js"], + "https://raw.githubusercontent.com/tefra/navgoco/master/src/jquery.navgoco.js", + OUTPUT_ROOT / "js" + ) + self.ensure_asset( + "jquery.navgoco.css", + [DOCS_ROOT / "css" / "jquery.navgoco.css", SITE_ROOT / "css" / "jquery.navgoco.css"], + "https://raw.githubusercontent.com/tefra/navgoco/master/src/jquery.navgoco.css", + OUTPUT_ROOT / "css" + ) + + # Load sidebar + self.log("\n--- Loading Sidebar ---") + self.load_sidebar() + + # Process HTML files + self.log("\n--- Processing HTML Files ---") + + # Collect files to process + files_to_process = [] + + # Target version files + version_dir = DOCS_ROOT / TARGET_VERSION + if version_dir.exists(): + files_to_process.extend(list(version_dir.rglob("*.html"))) + self.log(f"Found {len(files_to_process)} files in {TARGET_VERSION}/", "SUCCESS") + + # Common pages + for pattern in COMMON_PAGES: + if '*' in pattern: + files_to_process.extend(list(DOCS_ROOT.glob(pattern))) + else: + file_path = DOCS_ROOT / pattern + if file_path.exists(): + files_to_process.append(file_path) + + # Remove duplicates + files_to_process = list(set(files_to_process)) + self.log(f"Total files to process: {len(files_to_process)}") + + # Process each file + for i, file_path in enumerate(files_to_process, 1): + # Skip non-v19.2 version directories + rel_path = file_path.relative_to(DOCS_ROOT) + if rel_path.parts and rel_path.parts[0].startswith('v') and rel_path.parts[0] != TARGET_VERSION: + continue + + if i % 25 == 0: + self.log(f"Progress: 
{i}/{len(files_to_process)} ({i*100//len(files_to_process)}%)") + + self.process_html_file(file_path) + + self.log(f"Processed {len(self.processed_files)} files", "SUCCESS") + + # Final cleanup steps + self.log("\n--- Final Steps ---") + self.fix_css_images() + self.download_google_fonts() + self.create_index_page() + + # Summary + print("\n" + "="*60) + self.log("ARCHIVE COMPLETE!", "SUCCESS") + self.log(f"Output directory: {OUTPUT_ROOT.resolve()}") + self.log(f"Total files: {len(self.processed_files)}") + self.log("āœ… Ask AI widget removed", "SUCCESS") + self.log("āœ… All links converted to relative paths", "SUCCESS") + self.log("āœ… Version directory (v19.2) added where needed", "SUCCESS") + + print(f"\nšŸŽ‰ Offline site built in {OUTPUT_ROOT}") + print(f"\nšŸ“¦ To test: open file://{OUTPUT_ROOT.resolve()}/index.html") + print(f"\nšŸ“Œ Note: Check console output above for link transformation details") + + return True + + +def main(): + """Main entry point""" + try: + archiver = OfflineArchiver() + success = archiver.build() + sys.exit(0 if success else 1) + except KeyboardInterrupt: + print("\n\nArchiving cancelled by user.") + sys.exit(1) + except Exception as e: + print(f"\nāŒ Fatal error: {e}") + import traceback + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() \ No newline at end of file From 1fe7a2492ed23cff90e59223942504c76826c011 Mon Sep 17 00:00:00 2001 From: ebembi-crdb Date: Sun, 20 Jul 2025 20:04:44 +0530 Subject: [PATCH 4/7] working solution --- src/current/snapshot.py | 822 ++++++++-------------------------------- 1 file changed, 166 insertions(+), 656 deletions(-) diff --git a/src/current/snapshot.py b/src/current/snapshot.py index 96c63f40d95..c47d4e36e0c 100644 --- a/src/current/snapshot.py +++ b/src/current/snapshot.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """ Complete Offline Documentation Archiver for Jekyll CockroachDB Documentation -Fixed version that preserves CSS structure from Code 2 +FIXED VERSION with correct JavaScript URL processing """ import re import shutil @@ -159,8 +159,149 @@ def ensure_asset(self, name, local_candidates, url, dest_dir): except Exception as e: self.log(f"Failed to download {name}: {e}", "ERROR") + def fix_sidebar_javascript(self, html): + """Fix the embedded sidebar JavaScript configuration and URL processing""" + + # Fix 1: Replace baseUrl in the embedded sidebar configuration + html = re.sub( + r'baseUrl:\s*["\'][^"\']*["\']', + 'baseUrl: ""', + html + ) + + # Fix 2: Find and replace the URL processing logic + # Look for the specific URL processing pattern in the JavaScript + url_processing_pattern = r'(if \(!/\^https\?:/.test\(url\)\) \{\s*url = sidebar\.baseUrl \+ url\.replace\([^}]+\}\s*return url;)' + + # More robust pattern that captures the entire URL processing block + better_pattern = r'(const urls = \(item\.urls \|\| \[\]\)\.map\(function \(url\) \{[\s\S]*?)(if \(!/\^https\?:/.test\(url\)\) \{[\s\S]*?url = sidebar\.baseUrl \+ url\.replace[\s\S]*?\}[\s\S]*?)(return url;[\s\S]*?\}\);)' + + def replace_url_processing(match): + start_part = match.group(1) + end_part = match.group(3) + + # Inject our custom URL processing logic + new_processing = r'''if (!/^https?:/.test(url)) { + // Remove /docs/ prefix if present + url = url.replace(/^\/docs\//, '').replace(/^docs\//, ''); + + // Better current directory detection for file:// URLs + var currentPath = window.location.pathname; + var currentDir = ''; + + // Extract just the relevant part of the path (handle both web and file:// URLs) + var 
pathMatch = currentPath.match(/(cockroachcloud|v19\.2|releases|advisories)\/[^\/]+$/); + if (pathMatch) { + currentDir = pathMatch[1]; + } else { + // Fallback: check if we're in root or any subdirectory + var pathParts = currentPath.split('/').filter(function(part) { return part; }); + for (var i = pathParts.length - 2; i >= 0; i--) { + if (pathParts[i] === 'cockroachcloud' || pathParts[i] === 'v19.2' || + pathParts[i] === 'releases' || pathParts[i] === 'advisories') { + currentDir = pathParts[i]; + break; + } + } + } + + // Remove leading slash from URL + if (url.startsWith('/')) { + url = url.substring(1); + } + + // Handle stable -> v19.2 conversion + url = url.replace(/^stable\//, 'v19.2/').replace(/\/stable\//, '/v19.2/'); + + // Calculate relative path based on current directory context + if (currentDir) { + // We're in a subdirectory + if (url.startsWith(currentDir + '/')) { + // Same directory - remove the directory prefix + url = url.substring(currentDir.length + 1); + } else if (url.includes('/')) { + // Different directory - need to go up one level + url = '../' + url; + } else if (url !== '' && !url.endsWith('.html') && !url.endsWith('/')) { + // Root level file - go up one level + url = '../' + url; + } + } + + // Clean up any double slashes + url = url.replace(/\/+/g, '/'); + // Note: Keep .html extensions for offline file:// URLs + }''' + + return start_part + new_processing + end_part + + # Try to apply the replacement + new_html = re.sub(better_pattern, replace_url_processing, html, flags=re.DOTALL) + + # If the complex pattern didn't match, try a simpler approach + if new_html == html: + # Simple pattern - just replace the specific problematic line + simple_pattern = r'url = sidebar\.baseUrl \+ url\.replace\([^}]+\}' + + simple_replacement = r'''// Custom offline URL processing + url = url.replace(/^\/docs\//, '').replace(/^docs\//, ''); + + var currentPath = window.location.pathname; + var currentDir = ''; + + var pathMatch = currentPath.match(/(cockroachcloud|v19\.2|releases|advisories)\/[^\/]+$/); + if (pathMatch) { + currentDir = pathMatch[1]; + } else { + var pathParts = currentPath.split('/').filter(function(part) { return part; }); + for (var i = pathParts.length - 2; i >= 0; i--) { + if (pathParts[i] === 'cockroachcloud' || pathParts[i] === 'v19.2' || + pathParts[i] === 'releases' || pathParts[i] === 'advisories') { + currentDir = pathParts[i]; + break; + } + } + } + + if (url.startsWith('/')) { + url = url.substring(1); + } + + url = url.replace(/^stable\//, 'v19.2/').replace(/\/stable\//, '/v19.2/'); + + if (currentDir) { + if (url.startsWith(currentDir + '/')) { + url = url.substring(currentDir.length + 1); + } else if (url.includes('/')) { + url = '../' + url; + } else if (url !== '' && !url.endsWith('.html') && !url.endsWith('/')) { + url = '../' + url; + } + } + + url = url.replace(/\/+/g, '/'); + // Keep .html extensions for offline use + }''' + + new_html = re.sub(simple_pattern, simple_replacement, html, flags=re.DOTALL) + + # Also fix the .html stripping issue + new_html = re.sub( + r'url = url\.replace\("/index\.html", ""\)\.replace\("\.html", ""\);', + 'url = url.replace("/index.html", ""); // Keep .html for offline', + new_html + ) + + # Debug output + if new_html != html: + self.log("Successfully replaced JavaScript URL processing", "SUCCESS") + else: + self.log("Warning: JavaScript URL processing replacement may have failed", "WARNING") + + return new_html + def process_html_file(self, src_path): - """Process a single HTML file using Code 2's 
approach""" + """Process a single HTML file""" try: rel_path = src_path.relative_to(DOCS_ROOT) dst_path = OUTPUT_ROOT / rel_path @@ -177,54 +318,8 @@ def process_html_file(self, src_path): # Read content html = src_path.read_text(encoding="utf-8") - # Inject sidebar HTML if available - if self.sidebar_html: - html = re.sub( - r"(
]*>)(\s*?
)", - rf"\1{self.sidebar_html}\2", - html, - flags=re.IGNORECASE, - ) - - # Parse with BeautifulSoup to fix sidebar links - soup = BeautifulSoup(html, "html.parser") - - # Remove Ask AI widget and other unwanted elements - remove_selectors = [ - # Ask AI widget - more comprehensive selectors - '.ask-ai', '#ask-ai', '[data-ask-ai]', '.ai-widget', '.kapa-widget', - 'script[src*="kapa"]', '#kapa-widget-container', '.kapa-trigger', - '.kapa-ai-button', '[class*="kapa"]', '[id*="kapa"]', - 'div[data-kapa-widget]', 'button[aria-label*="AI"]', - '[class*="ask-ai"]', '[id*="ask-ai"]', - 'iframe[src*="kapa"]', 'iframe[id*="kapa"]', - - # Version switcher - '.version-switcher', '#version-switcher', '.version-dropdown', - - # Feedback widgets - '.feedback-widget', '#feedback-widget', '[id*="feedback"]', - '.helpful-widget', '.page-helpful', - - # Analytics - 'script[src*="googletagmanager"]', 'script[src*="google-analytics"]', - 'script[src*="segment"]', 'script[src*="heap"]', - ] - - for selector in remove_selectors: - for elem in soup.select(selector): - elem.decompose() - - # Also remove any script tags that contain kapa or AI-related code - for script in soup.find_all('script'): - if script.string and any(term in script.string.lower() for term in ['kapa', 'askai', 'ask-ai', 'aiwidget']): - script.decompose() - - # Remove any iframes that might be Ask AI related - for iframe in soup.find_all('iframe'): - src = iframe.get('src', '') - if any(term in src.lower() for term in ['kapa', 'ask', 'ai']): - iframe.decompose() + # CRITICAL: Fix sidebar JavaScript BEFORE other processing + html = self.fix_sidebar_javascript(html) # Inject sidebar HTML if available if self.sidebar_html: @@ -235,27 +330,20 @@ def process_html_file(self, src_path): flags=re.IGNORECASE, ) - # Parse with BeautifulSoup to fix sidebar links + # Parse with BeautifulSoup for additional cleanup soup = BeautifulSoup(html, "html.parser") # Remove Ask AI widget and other unwanted elements remove_selectors = [ - # Ask AI widget - more comprehensive selectors '.ask-ai', '#ask-ai', '[data-ask-ai]', '.ai-widget', '.kapa-widget', 'script[src*="kapa"]', '#kapa-widget-container', '.kapa-trigger', '.kapa-ai-button', '[class*="kapa"]', '[id*="kapa"]', 'div[data-kapa-widget]', 'button[aria-label*="AI"]', '[class*="ask-ai"]', '[id*="ask-ai"]', 'iframe[src*="kapa"]', 'iframe[id*="kapa"]', - - # Version switcher '.version-switcher', '#version-switcher', '.version-dropdown', - - # Feedback widgets '.feedback-widget', '#feedback-widget', '[id*="feedback"]', '.helpful-widget', '.page-helpful', - - # Analytics 'script[src*="googletagmanager"]', 'script[src*="google-analytics"]', 'script[src*="segment"]', 'script[src*="heap"]', ] @@ -264,7 +352,7 @@ def process_html_file(self, src_path): for elem in soup.select(selector): elem.decompose() - # Also remove any script tags that contain kapa or AI-related code + # Remove any script tags that contain kapa or AI-related code for script in soup.find_all('script'): if script.string and any(term in script.string.lower() for term in ['kapa', 'askai', 'ask-ai', 'aiwidget']): script.decompose() @@ -275,246 +363,10 @@ def process_html_file(self, src_path): if any(term in src.lower() for term in ['kapa', 'ask', 'ai']): iframe.decompose() - # Process sidebar links with clearer logic - sidebar_links = soup.select("#sidebar a[href], #sidebarMenu a[href], #mysidebar a[href]") - - for a in sidebar_links: - original_href = a.get("href", "") - - # Skip external links and anchors - if original_href.startswith(('http://', 
'https://', 'mailto:', '#', 'javascript:')): - continue - - # Store original - a['data-original-href'] = original_href - - # Process the href step by step - h = original_href.strip() - - # Check if this was originally a relative link (important for context) - was_relative = not h.startswith('/') - - # Step 1: Handle stable -> v19.2 conversion - h = h.replace('/stable/', f'/{TARGET_VERSION}/') - h = h.replace('stable/', f'{TARGET_VERSION}/') - - # Step 2: Remove domain/localhost if present - if '127.0.0.1:4000/' in h: - h = h.split('127.0.0.1:4000/')[-1] - if 'localhost:4000/' in h: - h = h.split('localhost:4000/')[-1] - - # Step 3: Remove /docs/ prefix - if h.startswith('/docs/'): - h = h[6:] # Remove '/docs/' - elif h.startswith('docs/'): - h = h[5:] # Remove 'docs/' - - # Step 4: Remove any remaining leading slashes - h = h.lstrip('/') - - # Step 5: Determine if we need to add version directory - needs_version = False - if h: # If we have a path - # Check if it already has a version - if not h.startswith(f'{TARGET_VERSION}/'): - # List of paths that should NOT get version prefix - non_versioned = [ - 'cockroachcloud/', 'releases/', 'advisories/', - 'images/', 'css/', 'js/', '_internal/', 'fonts/', - 'img/', 'assets/' - ] - - # Check if it's a special non-versioned path - is_special = any(h.startswith(d) for d in non_versioned) - - # Check if it has a file extension that indicates an asset - is_asset = any(h.endswith(ext) for ext in [ - '.css', '.js', '.png', '.jpg', '.jpeg', '.gif', - '.svg', '.ico', '.woff', '.woff2', '.ttf', '.eot' - ]) - - # CRITICAL FIX: If we're already in a version directory and this is - # a simple doc page (like secure-a-cluster.html), we DON'T need to add version - # because it will be relative to the current directory - if is_in_version_dir and not is_special and not is_asset and '/' not in h: - # This is a simple filename in the same version directory - needs_version = False - if 'secure-a-cluster' in h: - self.log(f"NOT adding version to '{h}' - already in version dir", "WARNING") - elif was_relative and is_in_version_dir: - # Original link was relative AND we're in a version directory - needs_version = False - elif not is_special and not is_asset: - # Otherwise, if it's not special and not an asset, it needs version - needs_version = True - if sidebar_links.index(a) < 5: # Debug first few - self.log(f"Adding version to: {h} (was_relative={was_relative}, in_version={is_in_version_dir})", "DEBUG") - - # Add version directory if needed - if needs_version: - h = f'{TARGET_VERSION}/{h}' - - # Step 6: Add .html if needed - if h and not h.endswith('/') and not h.endswith('.html'): - # Check if it already has an extension - parts = h.split('/') - last_part = parts[-1] - if '.' 
not in last_part: - h += '.html' - - # Step 7: Calculate the correct relative path - # Now that we've been smart about adding version, this is simpler - - # Special debugging for secure-a-cluster.html - if 'secure-a-cluster' in h or sidebar_links.index(a) < 3: - self.log(f" Final path calc: h='{h}' in_v_dir={is_in_version_dir}", "DEBUG") - - if is_in_version_dir: - # We're in a version directory - if h.startswith(f'{TARGET_VERSION}/'): - # This shouldn't happen if we were smart above, but just in case - # Remove redundant version prefix - h = h[len(TARGET_VERSION) + 1:] - final_href = h - self.log(f" WARNING: Had to strip redundant version prefix", "WARNING") - elif any(h.startswith(d) for d in ['cockroachcloud/', 'releases/', 'advisories/', 'images/', 'css/', 'js/']): - # These need to go up a level from version dir - final_href = "../" + h - else: - # Simple filename in same directory - final_href = h - else: - # We're NOT in version dir, use normal prefix - final_href = prefix + h if h else prefix + "index.html" - - a["href"] = final_href - - # Debug output - if sidebar_links.index(a) < 5 or 'secure-a-cluster' in original_href: - self.log(f"Sidebar: '{original_href}' -> '{final_href}'", "INFO") - - # Process ALL other links - all_links = soup.select("a[href]") - content_link_count = 0 - for a in all_links: - if a in sidebar_links: # Skip already processed - continue - - original_href = a.get("href", "") - - # Skip external links and anchors - if original_href.startswith(('http://', 'https://', 'mailto:', '#', 'javascript:')): - continue - - # Store original - a['data-original-href'] = original_href - - # Apply same processing - h = original_href.strip() - - # Check if this was originally relative - was_relative = not h.startswith('/') - - # Handle stable -> v19.2 - h = h.replace('/stable/', f'/{TARGET_VERSION}/') - h = h.replace('stable/', f'{TARGET_VERSION}/') - - # Remove domain - if '127.0.0.1:4000/' in h: - h = h.split('127.0.0.1:4000/')[-1] - if 'localhost:4000/' in h: - h = h.split('localhost:4000/')[-1] - - # Remove /docs/ prefix - if h.startswith('/docs/'): - h = h[6:] - elif h.startswith('docs/'): - h = h[5:] - - # Remove leading slashes - h = h.lstrip('/') - - # Determine if we need to add version directory - needs_version = False - if h: # If we have a path - # Check if it already has a version - if not h.startswith(f'{TARGET_VERSION}/'): - # List of paths that should NOT get version prefix - non_versioned = [ - 'cockroachcloud/', 'releases/', 'advisories/', - 'images/', 'css/', 'js/', '_internal/', 'fonts/', - 'img/', 'assets/' - ] - - # Check if it's a special non-versioned path - is_special = any(h.startswith(d) for d in non_versioned) - - # Check for file extensions that indicate assets - is_asset = any(h.endswith(ext) for ext in [ - '.css', '.js', '.png', '.jpg', '.jpeg', '.gif', - '.svg', '.ico', '.woff', '.woff2', '.ttf', '.eot' - ]) - - # CRITICAL FIX: If we're already in a version directory and this is - # a simple doc page (like secure-a-cluster.html), we DON'T need to add version - if is_in_version_dir and not is_special and not is_asset and '/' not in h: - # This is a simple filename in the same version directory - needs_version = False - if 'secure-a-cluster' in h: - self.log(f"NOT adding version to '{h}' - already in version dir", "WARNING") - elif was_relative and is_in_version_dir: - # Original link was relative AND we're in a version directory - needs_version = False - elif not is_special and not is_asset: - # Otherwise, if it's not special and not an asset, 
it needs version - needs_version = True - - # Add version directory if needed - if needs_version: - h = f'{TARGET_VERSION}/{h}' - - # Add .html if needed - if h and not h.endswith('/') and not h.endswith('.html'): - parts = h.split('/') - last_part = parts[-1] - if '.' not in last_part: - h += '.html' - - # Calculate the correct relative path - # Now that we've been smart about adding version, this is simpler - - if is_in_version_dir: - # We're in a version directory - if h.startswith(f'{TARGET_VERSION}/'): - # This shouldn't happen if we were smart above, but just in case - # Remove redundant version prefix - h = h[len(TARGET_VERSION) + 1:] - final_href = h - elif any(h.startswith(d) for d in ['cockroachcloud/', 'releases/', 'advisories/', 'images/', 'css/', 'js/']): - # These need to go up a level from version dir - final_href = "../" + h - else: - # Simple filename in same directory - final_href = h - else: - # We're NOT in version dir, use normal prefix - final_href = prefix + h if h else prefix + "index.html" - - a["href"] = final_href - - # Debug first few content links - if content_link_count < 3 or 'secure-a-cluster' in original_href: - self.log(f"Content: '{original_href}' -> '{final_href}'", "INFO") - content_link_count += 1 - # Convert back to string html = str(soup) - # Convert back to string - html = str(soup) - - # Clean up query parameters + # Clean up various path patterns html = re.sub( r"(src|href)=\"([^\"?]+)\?[^\" ]+\"", lambda m: f'{m.group(1)}="{m.group(2)}"', @@ -522,24 +374,15 @@ def process_html_file(self, src_path): ) # Fix various path patterns - # Handle stable version references first html = re.sub(r'(href|src)="/docs/stable/', rf'\1="{TARGET_VERSION}/', html) html = re.sub(r'(href|src)="docs/stable/', rf'\1="{TARGET_VERSION}/', html) - - # Remove /docs/ prefix while preserving version - # This regex specifically handles /docs/vXX.X/ patterns html = re.sub(r'(href|src)="/docs/(v\d+\.\d+/[^"]+)"', r'\1="\2"', html) html = re.sub(r'(href|src)="docs/(v\d+\.\d+/[^"]+)"', r'\1="\2"', html) - - # For non-versioned docs paths html = re.sub(r'(href|src)="/docs/([^v][^"]+)"', r'\1="\2"', html) html = re.sub(r'(href|src)="docs/([^v][^"]+)"', r'\1="\2"', html) - - # Remove any remaining leading slashes from local paths - # Skip URLs that start with // (protocol-relative) html = re.sub(r'(href|src)="/(?!/)([^"]+)"', r'\1="\2"', html) - # Fix asset paths - this is critical for CSS + # Fix asset paths for asset in ["css", "js", "images", "_internal"]: html = re.sub( rf"(src|href)=[\"']/{asset}/([^\"']+)[\"']", @@ -547,31 +390,13 @@ def process_html_file(self, src_path): html, ) - # Fix img paths - html = re.sub( - r"(src|href)=[\"']/?img/([^\"']+)[\"']", - r'\1="img/\2"', - html, - ) - - # Fix docs/images paths - html = re.sub( - r"(src|href|xlink:href)=[\"']/?docs/images/([^\"']+)[\"']", - r'\1="images/\2"', - html, - ) + html = re.sub(r"(src|href)=[\"']/?img/([^\"']+)[\"']", r'\1="img/\2"', html) + html = re.sub(r"(src|href|xlink:href)=[\"']/?docs/images/([^\"']+)[\"']", r'\1="images/\2"', html) # Replace Google Fonts html = re.sub( r"]+fonts\.googleapis\.com[^>]+>", - '', - html, - ) - - # Fix CSS imports - html = re.sub( - r"@import\s+url\((['\"]?)/docs/(css/[^)]+)\1\);", - r"@import url(\2);", + f'', html, ) @@ -583,32 +408,7 @@ def process_html_file(self, src_path): html, ) - # # Fix remaining paths that need prefix - # # Only add prefix to paths that don't already have it and aren't external - # html = re.sub( - # 
r'(href|src)="(?!\.\./)(?!https?:)(?!mailto:)(?!#)(?!javascript:)(?!//)([^"]+)"', - # rf'\1="{prefix}\2"', - # html, - # ) - - # Debug: Check if we still have absolute paths - if len(self.processed_files) < 3: # Only for first few files - import re as regex - abs_paths = regex.findall(r'href="/(v19\.2/[^"]+)"', html) - if abs_paths: - self.log(f"Warning: Found absolute paths in {rel_path}: {abs_paths[:3]}", "WARNING") - - # Final cleanup - remove any double slashes or incorrect patterns - html = html.replace('"//', '"/') # Fix double slashes - html = re.sub(r'"\.\./+', '"../', html) # Fix multiple slashes after ../ - - # Fix any paths that might have lost their 'v' prefix - html = re.sub(r'(href|src)="(\.\./)*19\.2/', rf'\1="\2v19.2/', html) - - # Ensure v19.2 paths don't have unnecessary prefixes - html = re.sub(r'(href|src)="(\.\./)+v19\.2/v19\.2/', r'\1="\2v19.2/', html) - - # Inject navigation dependencies - CRITICAL FOR STYLING + # Inject navigation dependencies nav_deps = f''' @@ -626,12 +426,13 @@ def process_html_file(self, src_path): overflow: visible !important; }} -/* Hide online-only elements - comprehensive */ +/* Hide online-only elements */ .ask-ai, #ask-ai, [data-ask-ai], .ai-widget, .kapa-widget, [class*="kapa"], [id*="kapa"], [class*="ask-ai"], [id*="ask-ai"], .version-switcher, #version-switcher, .feedback-widget, button[aria-label*="AI"], div[data-kapa-widget], -.kapa-ai-button, .ai-assistant, .ai-chat {{ +.kapa-ai-button, .ai-assistant, .ai-chat, +.floating-action-button, .fab, [class*="floating-button"] {{ display: none !important; visibility: hidden !important; opacity: 0 !important; @@ -640,23 +441,6 @@ def process_html_file(self, src_path): left: -9999px !important; }} -/* Hide floating action buttons */ -.floating-action-button, .fab, [class*="floating-button"], -button[style*="fixed"], button[style*="absolute"] {{ - display: none !important; -}} - -/* Hide any fixed position elements in bottom right (common for chat widgets) */ -[style*="position: fixed"][style*="bottom"][style*="right"], -[style*="position:fixed"][style*="bottom"][style*="right"] {{ - display: none !important; -}} - -/* Hide iframes that might be chat widgets */ -iframe[src*="kapa"], iframe[id*="kapa"], iframe[class*="chat"] {{ - display: none !important; -}} - /* Navgoco styling */ .navgoco li {{ list-style: none; }} .navgoco li.active > a {{ @@ -673,21 +457,12 @@ def process_html_file(self, src_path): # Add navigation initialization nav_init = """""" html = re.sub(r"", nav_init + "\n", html, flags=re.IGNORECASE) @@ -761,121 +527,39 @@ def download_google_fonts(self): fonts_dir.mkdir(exist_ok=True) try: - # Get CSS headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'} css_response = requests.get(FONTS_CSS_URL, headers=headers, timeout=10) css_response.raise_for_status() css_content = css_response.text - # Extract and download font files font_urls = set(re.findall(r"url\((https://fonts\.gstatic\.com/[^\)]+)\)", css_content)) for url in font_urls: try: - # Download font font_response = requests.get(url, headers=headers, timeout=10) font_response.raise_for_status() - # Save font parsed = urlparse(url) font_path = parsed.path.lstrip("/") dst = fonts_dir / font_path dst.parent.mkdir(parents=True, exist_ok=True) dst.write_bytes(font_response.content) - # Update CSS css_content = css_content.replace(url, f"../fonts/{font_path}") except Exception as e: self.log(f"Failed to download font from {url}: {e}", "WARNING") - # Save localized CSS (OUTPUT_ROOT / "css" / 
"google-fonts.css").write_text(css_content, encoding="utf-8") self.log("Google Fonts localized", "SUCCESS") except Exception as e: self.log(f"Error downloading fonts: {e}", "ERROR") - # Create fallback fallback = """/* Fallback fonts */ body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Arial, sans-serif; } code, pre { font-family: Consolas, Monaco, "Courier New", monospace; }""" (OUTPUT_ROOT / "css" / "google-fonts.css").write_text(fallback) - def create_link_test_page(self): - """Create a test page to verify link processing""" - test_html = f""" - - - Link Test Page - - - -

-    <h1>Link Processing Test Results</h1>
-    <p>This page shows how different link patterns were processed:</p>
-
-    <h2>From pages NOT in version directory:</h2>
-    <div class="test-case">
-        <p>Context: Page at /index.html</p>
-        <p>Original: /docs/insert.html</p>
-        <p>Should be: v19.2/insert.html</p>
-        <a href="v19.2/insert.html">Test Link</a>
-    </div>
-    <div class="test-case">
-        <p>Context: Page at /index.html</p>
-        <p>Original: /docs/v19.2/secure-a-cluster.html</p>
-        <p>Should be: v19.2/secure-a-cluster.html</p>
-        <a href="v19.2/secure-a-cluster.html">Test Link</a>
-    </div>
-
-    <h2>From pages IN version directory:</h2>
-    <div class="test-case">
-        <p>Context: Page at /v19.2/index.html</p>
-        <p>Original: /docs/secure-a-cluster.html</p>
-        <p>Should be: secure-a-cluster.html (same dir)</p>
-        <p>This link would be at: v19.2/secure-a-cluster.html</p>
-    </div>
-    <div class="test-case">
-        <p>Context: Page at /v19.2/index.html</p>
-        <p>Original: /docs/v19.2/secure-a-cluster.html</p>
-        <p>Should be: secure-a-cluster.html (same dir)</p>
-        <p>This link would be at: v19.2/secure-a-cluster.html</p>
-    </div>
-
-    <h2>Special cases:</h2>
-    <div class="test-case">
-        <p>Original: /docs/stable/something.html</p>
-        <p>Should be: v19.2/something.html</p>
-        <a href="v19.2/something.html">Test Link</a>
-    </div>
-    <div class="test-case">
-        <p>Original: /docs/cockroachcloud/quickstart.html</p>
-        <p>Should be: cockroachcloud/quickstart.html</p>
-        <a href="cockroachcloud/quickstart.html">Test Link</a>
-    </div>
-    <div class="test-case">
-        <p>Original: /docs/releases/index.html</p>
-        <p>Should be: releases/index.html</p>
-        <a href="releases/index.html">Test Link</a>
-    </div>
-
-    <p>Note: Click each link to verify it works correctly.</p>
- -""" - - test_path = OUTPUT_ROOT / "_link_test.html" - test_path.write_text(test_html) - self.log("Created link test page: _link_test.html", "SUCCESS") - def create_index_page(self): """Create the index page""" index_html = f""" @@ -887,17 +571,6 @@ def create_index_page(self): - - -

-    <header>
-        <h1>CockroachDB {TARGET_VERSION}</h1>
-        <p>Offline Documentation Archive</p>
-    </header>
-
-    <div class="note">
-        <h2>šŸ“Œ Offline Archive</h2>
-        <p>This is a complete offline archive of the CockroachDB {TARGET_VERSION} documentation.
-           All internal links have been updated to work offline.</p>
-        <p>Created: {datetime.now().strftime('%B %d, %Y at %I:%M %p')}</p>
-    </div>
- - """ @@ -1179,9 +689,9 @@ def create_index_page(self): self.log("Created index.html", "SUCCESS") def build(self): - """Main build process following Code 2's structure""" + """Main build process""" print("\n" + "="*60) - print("šŸš€ COCKROACHDB OFFLINE DOCUMENTATION ARCHIVER") + print("šŸš€ COCKROACHDB OFFLINE DOCUMENTATION ARCHIVER (FIXED)") print("="*60) # Verify paths @@ -1200,7 +710,7 @@ def build(self): shutil.rmtree(OUTPUT_ROOT) OUTPUT_ROOT.mkdir(parents=True) - # CRITICAL: Copy global assets FIRST (from SITE_ROOT, not DOCS_ROOT) + # Copy global assets FIRST self.log("\n--- Copying Global Assets ---") for asset_dir in ["css", "js", "img"]: src = SITE_ROOT / asset_dir @@ -1296,16 +806,16 @@ def build(self): # Summary print("\n" + "="*60) - self.log("ARCHIVE COMPLETE!", "SUCCESS") + self.log("ARCHIVE COMPLETE WITH JAVASCRIPT FIXES!", "SUCCESS") self.log(f"Output directory: {OUTPUT_ROOT.resolve()}") self.log(f"Total files: {len(self.processed_files)}") - self.log("āœ… Ask AI widget removed", "SUCCESS") - self.log("āœ… All links converted to relative paths", "SUCCESS") - self.log("āœ… Version directory (v19.2) added where needed", "SUCCESS") + self.log("āœ… Sidebar JavaScript URL processing FIXED", "SUCCESS") + self.log("āœ… Relative path calculation corrected", "SUCCESS") + self.log("āœ… cockroachcloud/ links should now work correctly", "SUCCESS") - print(f"\nšŸŽ‰ Offline site built in {OUTPUT_ROOT}") + print(f"\nšŸŽ‰ Fixed offline site built in {OUTPUT_ROOT}") print(f"\nšŸ“¦ To test: open file://{OUTPUT_ROOT.resolve()}/index.html") - print(f"\nšŸ“Œ Note: Check console output above for link transformation details") + print(f"\nšŸ”— Test the problematic link: cockroachcloud/quickstart.html → create-an-account.html") return True From 4c4dde1cecc89ce9f7020e3ed32af14304f76215 Mon Sep 17 00:00:00 2001 From: ebembi-crdb Date: Mon, 21 Jul 2025 06:57:12 +0530 Subject: [PATCH 5/7] index page fixed --- src/current/snapshot.py | 151 +++++++++++++++++++++++----------------- 1 file changed, 87 insertions(+), 64 deletions(-) diff --git a/src/current/snapshot.py b/src/current/snapshot.py index c47d4e36e0c..840b76e7297 100644 --- a/src/current/snapshot.py +++ b/src/current/snapshot.py @@ -185,46 +185,58 @@ def replace_url_processing(match): // Remove /docs/ prefix if present url = url.replace(/^\/docs\//, '').replace(/^docs\//, ''); - // Better current directory detection for file:// URLs - var currentPath = window.location.pathname; - var currentDir = ''; - - // Extract just the relevant part of the path (handle both web and file:// URLs) - var pathMatch = currentPath.match(/(cockroachcloud|v19\.2|releases|advisories)\/[^\/]+$/); - if (pathMatch) { - currentDir = pathMatch[1]; + // Handle root/home URLs + if (url === '/' || url === '' || url === 'index' || url === 'index.html') { + // For docs home, determine if we need to go up directories + var currentPath = window.location.pathname; + var pathMatch = currentPath.match(/(cockroachcloud|v19\.2|releases|advisories)\/[^\/]+$/); + if (pathMatch) { + url = '../index.html'; // Go up to main index + } else { + url = 'index.html'; // Stay at current level + } } else { - // Fallback: check if we're in root or any subdirectory - var pathParts = currentPath.split('/').filter(function(part) { return part; }); - for (var i = pathParts.length - 2; i >= 0; i--) { - if (pathParts[i] === 'cockroachcloud' || pathParts[i] === 'v19.2' || - pathParts[i] === 'releases' || pathParts[i] === 'advisories') { - currentDir = pathParts[i]; - break; + // Better current 
directory detection for file:// URLs + var currentPath = window.location.pathname; + var currentDir = ''; + + // Extract just the relevant part of the path (handle both web and file:// URLs) + var pathMatch = currentPath.match(/(cockroachcloud|v19\.2|releases|advisories)\/[^\/]+$/); + if (pathMatch) { + currentDir = pathMatch[1]; + } else { + // Fallback: check if we're in root or any subdirectory + var pathParts = currentPath.split('/').filter(function(part) { return part; }); + for (var i = pathParts.length - 2; i >= 0; i--) { + if (pathParts[i] === 'cockroachcloud' || pathParts[i] === 'v19.2' || + pathParts[i] === 'releases' || pathParts[i] === 'advisories') { + currentDir = pathParts[i]; + break; + } } } - } - - // Remove leading slash from URL - if (url.startsWith('/')) { - url = url.substring(1); - } - - // Handle stable -> v19.2 conversion - url = url.replace(/^stable\//, 'v19.2/').replace(/\/stable\//, '/v19.2/'); - - // Calculate relative path based on current directory context - if (currentDir) { - // We're in a subdirectory - if (url.startsWith(currentDir + '/')) { - // Same directory - remove the directory prefix - url = url.substring(currentDir.length + 1); - } else if (url.includes('/')) { - // Different directory - need to go up one level - url = '../' + url; - } else if (url !== '' && !url.endsWith('.html') && !url.endsWith('/')) { - // Root level file - go up one level - url = '../' + url; + + // Remove leading slash from URL + if (url.startsWith('/')) { + url = url.substring(1); + } + + // Handle stable -> v19.2 conversion + url = url.replace(/^stable\//, 'v19.2/').replace(/\/stable\//, '/v19.2/'); + + // Calculate relative path based on current directory context + if (currentDir) { + // We're in a subdirectory + if (url.startsWith(currentDir + '/')) { + // Same directory - remove the directory prefix + url = url.substring(currentDir.length + 1); + } else if (url.includes('/')) { + // Different directory - need to go up one level + url = '../' + url; + } else if (url !== '' && !url.endsWith('.html') && !url.endsWith('/')) { + // Root level file - go up one level + url = '../' + url; + } } } @@ -246,36 +258,47 @@ def replace_url_processing(match): simple_replacement = r'''// Custom offline URL processing url = url.replace(/^\/docs\//, '').replace(/^docs\//, ''); - var currentPath = window.location.pathname; - var currentDir = ''; - - var pathMatch = currentPath.match(/(cockroachcloud|v19\.2|releases|advisories)\/[^\/]+$/); - if (pathMatch) { - currentDir = pathMatch[1]; + // Handle root/home URLs + if (url === '/' || url === '' || url === 'index' || url === 'index.html') { + var currentPath = window.location.pathname; + var pathMatch = currentPath.match(/(cockroachcloud|v19\.2|releases|advisories)\/[^\/]+$/); + if (pathMatch) { + url = '../index.html'; + } else { + url = 'index.html'; + } } else { - var pathParts = currentPath.split('/').filter(function(part) { return part; }); - for (var i = pathParts.length - 2; i >= 0; i--) { - if (pathParts[i] === 'cockroachcloud' || pathParts[i] === 'v19.2' || - pathParts[i] === 'releases' || pathParts[i] === 'advisories') { - currentDir = pathParts[i]; - break; + var currentPath = window.location.pathname; + var currentDir = ''; + + var pathMatch = currentPath.match(/(cockroachcloud|v19\.2|releases|advisories)\/[^\/]+$/); + if (pathMatch) { + currentDir = pathMatch[1]; + } else { + var pathParts = currentPath.split('/').filter(function(part) { return part; }); + for (var i = pathParts.length - 2; i >= 0; i--) { + if (pathParts[i] === 
'cockroachcloud' || pathParts[i] === 'v19.2' || + pathParts[i] === 'releases' || pathParts[i] === 'advisories') { + currentDir = pathParts[i]; + break; + } } } - } - - if (url.startsWith('/')) { - url = url.substring(1); - } - - url = url.replace(/^stable\//, 'v19.2/').replace(/\/stable\//, '/v19.2/'); - - if (currentDir) { - if (url.startsWith(currentDir + '/')) { - url = url.substring(currentDir.length + 1); - } else if (url.includes('/')) { - url = '../' + url; - } else if (url !== '' && !url.endsWith('.html') && !url.endsWith('/')) { - url = '../' + url; + + if (url.startsWith('/')) { + url = url.substring(1); + } + + url = url.replace(/^stable\//, 'v19.2/').replace(/\/stable\//, '/v19.2/'); + + if (currentDir) { + if (url.startsWith(currentDir + '/')) { + url = url.substring(currentDir.length + 1); + } else if (url.includes('/')) { + url = '../' + url; + } else if (url !== '' && !url.endsWith('.html') && !url.endsWith('/')) { + url = '../' + url; + } } } From 52be96aba91cc91acd9ffeb05ca7b7a4d2587815 Mon Sep 17 00:00:00 2001 From: ebembi-crdb Date: Mon, 21 Jul 2025 07:18:57 +0530 Subject: [PATCH 6/7] Removed dead links of files not in 19.2 version --- src/current/snapshot.py | 121 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) diff --git a/src/current/snapshot.py b/src/current/snapshot.py index 840b76e7297..1bd7dc796ee 100644 --- a/src/current/snapshot.py +++ b/src/current/snapshot.py @@ -63,6 +63,84 @@ def log(self, message, level="INFO"): }.get(level, "") print(f"[{timestamp}] {prefix} {message}") + def clean_sidebar_data(self, sidebar_data): + """Remove broken links from sidebar data""" + def check_file_exists(url): + """Check if a file exists for a given URL""" + if url.startswith(('http://', 'https://', '#', 'mailto:', 'javascript:')): + return True # External links are always valid + + # Normalize URL to file path + file_url = url.strip() + + # Handle root/empty URLs + if file_url in ['/', '', 'index', 'index.html']: + return True # Root index always exists + + # Remove leading slash and docs prefix + if file_url.startswith('/docs/'): + file_url = file_url[6:] + elif file_url.startswith('docs/'): + file_url = file_url[5:] + file_url = file_url.lstrip('/') + + # Handle stable -> v19.2 + file_url = file_url.replace('/stable/', f'/{TARGET_VERSION}/') + file_url = file_url.replace('stable/', f'{TARGET_VERSION}/') + + # Convert ${VERSION} placeholder + file_url = file_url.replace('${VERSION}', TARGET_VERSION) + + # Add .html if needed + if file_url and not file_url.endswith('/') and not file_url.endswith('.html'): + if '.' 
not in file_url.split('/')[-1]: # No extension + file_url += '.html' + + # Check if file exists + file_path = DOCS_ROOT / file_url + exists = file_path.exists() + + if not exists: + self.log(f"Removing broken link: {url} -> {file_path}", "WARNING") + + return exists + + def clean_item(item): + """Recursively clean an item and its children""" + if isinstance(item, dict): + # Clean URLs if present + if 'urls' in item: + item['urls'] = [url for url in item['urls'] if check_file_exists(url)] + # If no valid URLs left, this item is invalid + if not item['urls']: + return None + + # Clean child items if present + if 'items' in item: + cleaned_items = [] + for child in item['items']: + cleaned_child = clean_item(child) + if cleaned_child is not None: + cleaned_items.append(cleaned_child) + item['items'] = cleaned_items + + # If no URLs and no valid children, remove this item + if 'urls' not in item and not item['items']: + return None + + return item + + return item + + # Clean the sidebar data + cleaned_items = [] + for item in sidebar_data: + cleaned_item = clean_item(item) + if cleaned_item is not None: + cleaned_items.append(cleaned_item) + + return cleaned_items + def load_sidebar(self): """Load and prepare the sidebar HTML""" self.log(f"Loading sidebar from: {SIDEBAR_HTML_PATH}") @@ -83,6 +161,49 @@ def load_sidebar(self): break if self.sidebar_html: + # Extract and clean sidebar data + self.log("Cleaning sidebar data (removing broken links)...") + + # Parse the sidebar HTML to extract the JavaScript data + import re + import json + + # Extract the sidebar items from the JavaScript + items_match = re.search(r'items:\s*(\[[\s\S]*?\])\s*};', self.sidebar_html) + if items_match: + try: + # Parse the JavaScript array as JSON (with some cleaning) + items_str = items_match.group(1) + # Clean up JavaScript to make it valid JSON + items_str = re.sub(r'(\w+):', r'"\1":', items_str) # Quote keys + items_str = re.sub(r',\s*}', '}', items_str) # Remove trailing commas + items_str = re.sub(r',\s*]', ']', items_str) # Remove trailing commas in arrays + + sidebar_data = json.loads(items_str) + + # Clean the sidebar data + cleaned_data = self.clean_sidebar_data(sidebar_data) + + # Replace the items in the HTML + cleaned_items_str = json.dumps(cleaned_data, indent=2) + self.sidebar_html = re.sub( + r'items:\s*\[[\s\S]*?\]', + f'items:{cleaned_items_str}', + self.sidebar_html + ) + + self.log(f"Cleaned sidebar data: removed broken links", "SUCCESS") + + except Exception as e: + self.log(f"Could not clean sidebar data: {e}", "WARNING") + + # Simplify isVersionDirectory function for v19.2 only + self.sidebar_html = re.sub( + r'isVersionDirectory:\s*function\s*\([^}]*\{[^}]*\}', + 'isVersionDirectory: function (d) { return d === "v19.2" || d === "stable"; }', + self.sidebar_html + ) + # Clean the sidebar HTML of any Ask AI elements sidebar_soup = BeautifulSoup(self.sidebar_html, "html.parser") From 8d309786fdf452a8db6d544c7bf29c9faad86a38 Mon Sep 17 00:00:00 2001 From: ebembi-crdb Date: Mon, 21 Jul 2025 07:58:11 +0530 Subject: [PATCH 7/7] Updated home page --- src/current/snapshot.py | 479 ++++++++++++++++++++++++++++++++-------- 1 file changed, 390 insertions(+), 89 deletions(-) diff --git a/src/current/snapshot.py b/src/current/snapshot.py index 1bd7dc796ee..1443986f7ef 100644 --- a/src/current/snapshot.py +++ b/src/current/snapshot.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """ Complete Offline Documentation Archiver for Jekyll CockroachDB Documentation -FIXED VERSION with correct JavaScript URL processing 
+FIXED VERSION with proper purple CockroachDB branding """ import re import shutil @@ -371,6 +371,13 @@ def replace_url_processing(match): # Try to apply the replacement new_html = re.sub(better_pattern, replace_url_processing, html, flags=re.DOTALL) + # Also fix the .html stripping issue - replace the line that removes .html extensions + new_html = re.sub( + r'url = url\.replace\("/index\.html", ""\)\.replace\("\.html", ""\);', + 'url = url.replace("/index.html", ""); // Keep .html for offline', + new_html + ) + # If the complex pattern didn't match, try a simpler approach if new_html == html: # Simple pattern - just replace the specific problematic line @@ -705,137 +712,430 @@ def download_google_fonts(self): (OUTPUT_ROOT / "css" / "google-fonts.css").write_text(fallback) def create_index_page(self): - """Create the index page""" + """Create the index page with proper CockroachDB purple branding""" index_html = f""" - CockroachDB {TARGET_VERSION} Documentation (Offline) + CockroachDB Documentation + -

-    <header>
-        <h1>CockroachDB {TARGET_VERSION}</h1>
-        <p>Offline Documentation Archive</p>
-    </header>
-
-        <h2>šŸ“š Getting Started</h2>
+    <div class="offline-banner">
+        šŸ“± Offline Documentation Archive - CockroachDB Version 19.2
+    </div>
-
-        <h2>ā˜ļø CockroachDB Cloud</h2>
-
+    <main>
+        <h1>Documentation</h1>
+        <p>CockroachDB is the SQL database for building global, scalable cloud services that survive disasters.</p>
+
+        <div class="cards">
+            <div class="card">
+                <div class="card-icon">ā˜ļø</div>
+                <h2>Start a cloud cluster</h2>
+                <p>Get started with CockroachDB Cloud, our fully managed service.</p>
+                <a href="cockroachcloud/quickstart.html">Learn more →</a>
+            </div>
+
+            <div class="card">
+                <div class="card-icon">šŸ–„ļø</div>
+                <h2>Start a local cluster</h2>
+                <p>Set up a local CockroachDB cluster for development and testing.</p>
+                <a href="v19.2/start-a-local-cluster.html">Learn more →</a>
+            </div>
+
+            <div class="card">
+                <div class="card-icon">šŸš€</div>
+                <h2>Build a sample app</h2>
+                <p>Build applications using your favorite language and framework.</p>
+                <a href="v19.2/build-an-app-with-cockroachdb.html">Learn more →</a>
+            </div>
+        </div>
+    </main>
+
-    <div class="note">
-        <h2>šŸ“Œ Offline Archive</h2>
-        <p>This is a complete offline archive of the CockroachDB {TARGET_VERSION} documentation.
-           All internal links have been updated to work offline.</p>
-        <p>Created: {datetime.now().strftime('%B %d, %Y at %I:%M %p')}</p>
-    </div>
+ """ (OUTPUT_ROOT / "index.html").write_text(index_html) - self.log("Created index.html", "SUCCESS") + self.log("Created CockroachDB purple-branded index.html", "SUCCESS") def build(self): """Main build process""" print("\n" + "="*60) - print("šŸš€ COCKROACHDB OFFLINE DOCUMENTATION ARCHIVER (FIXED)") + print("šŸš€ COCKROACHDB OFFLINE DOCUMENTATION ARCHIVER (PURPLE BRANDED)") print("="*60) # Verify paths @@ -950,16 +1250,17 @@ def build(self): # Summary print("\n" + "="*60) - self.log("ARCHIVE COMPLETE WITH JAVASCRIPT FIXES!", "SUCCESS") + self.log("ARCHIVE COMPLETE WITH PURPLE BRANDING!", "SUCCESS") self.log(f"Output directory: {OUTPUT_ROOT.resolve()}") self.log(f"Total files: {len(self.processed_files)}") + self.log("🟣 CockroachDB purple branding applied", "SUCCESS") self.log("āœ… Sidebar JavaScript URL processing FIXED", "SUCCESS") - self.log("āœ… Relative path calculation corrected", "SUCCESS") - self.log("āœ… cockroachcloud/ links should now work correctly", "SUCCESS") + self.log("āœ… Broken sidebar links removed", "SUCCESS") + self.log("āœ… Professional index page created", "SUCCESS") - print(f"\nšŸŽ‰ Fixed offline site built in {OUTPUT_ROOT}") + print(f"\nšŸŽ‰ Purple-branded offline site built in {OUTPUT_ROOT}") print(f"\nšŸ“¦ To test: open file://{OUTPUT_ROOT.resolve()}/index.html") - print(f"\nšŸ”— Test the problematic link: cockroachcloud/quickstart.html → create-an-account.html") + print(f"\n🟣 Your site now has proper CockroachDB purple branding!") return True