diff --git a/.github/workflows/data-links-check.yml b/.github/workflows/data-links-check.yml new file mode 100644 index 00000000..42c97646 --- /dev/null +++ b/.github/workflows/data-links-check.yml @@ -0,0 +1,23 @@ +name: Data Links Check + +on: + workflow_dispatch: + schedule: + - cron: "0 13 * * 6" # runs every saturday at 13:00 UTC (1 hour after docs check) + +jobs: + data-links-check: + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - name: Checkout 🛎️ + uses: actions/checkout@v3 + + - name: Setup Node.js + uses: actions/setup-node@v3 + with: + node-version: '16' + + - name: Check Links in Data Files 🔎 + run: node scripts/check-data-links.js diff --git a/.github/workflows/docs-links-check-pr.yml b/.github/workflows/docs-links-check-pr.yml index 2ce5d05c..de6ba862 100644 --- a/.github/workflows/docs-links-check-pr.yml +++ b/.github/workflows/docs-links-check-pr.yml @@ -1,18 +1,22 @@ -name: Check links for modified docs +name: Check links for modified files on: pull_request: paths: - 'docs/**' + - 'blog/**' + - 'README.md' jobs: docs-links-check: runs-on: ubuntu-latest + permissions: + contents: read steps: - name: Checkout 🛎️ uses: actions/checkout@master - - name: Check links for mdx files 🔎 + - name: Check links for docs mdx files 🔎 uses: gaurav-nelson/github-action-markdown-link-check@v1 with: use-quiet-mode: 'yes' # only show errors in output. @@ -23,7 +27,7 @@ jobs: folder-path: 'docs/' # only check the docs/ folder file-extension: '.mdx' - - name: Check links for markdown files 🔎 + - name: Check links for docs markdown files 🔎 uses: gaurav-nelson/github-action-markdown-link-check@v1 with: use-quiet-mode: 'yes' # only show errors in output. @@ -33,3 +37,20 @@ jobs: check-modified-files-only: 'yes' folder-path: 'docs/' # only check the docs/ folder file-extension: '.md' + + - name: Check links for blog files 🔎 + uses: gaurav-nelson/github-action-markdown-link-check@v1 + with: + use-quiet-mode: 'yes' # only show errors in output. 
+ config-file: '.github/workflows/docs.links.check.config.json' + check-modified-files-only: 'yes' + folder-path: 'blog/' + file-extension: '.md' + + - name: Check links for README 🔎 + uses: gaurav-nelson/github-action-markdown-link-check@v1 + with: + use-quiet-mode: 'yes' # only show errors in output. + config-file: '.github/workflows/docs.links.check.config.json' + check-modified-files-only: 'yes' + file-path: './README.md' diff --git a/.github/workflows/docs-links-check.yml b/.github/workflows/docs-links-check.yml index 8c22b251..312fbd6a 100644 --- a/.github/workflows/docs-links-check.yml +++ b/.github/workflows/docs-links-check.yml @@ -1,4 +1,4 @@ -name: Docs Links Check +name: All Links Check on: workflow_dispatch: @@ -8,11 +8,13 @@ on: jobs: docs-links-check: runs-on: ubuntu-latest + permissions: + contents: read steps: - name: Checkout 🛎️ uses: actions/checkout@master - - name: Check Links 🔎 + - name: Check Docs Links 🔎 uses: gaurav-nelson/github-action-markdown-link-check@v1 with: use-quiet-mode: 'yes' # only show errors in output. @@ -21,3 +23,18 @@ jobs: # refer to https://github.com/tcort/markdown-link-check#config-file-format config-file: '.github/workflows/docs.links.check.config.json' file-extension: '.md*' # .md or .mdx + + - name: Check Blog Links 🔎 + uses: gaurav-nelson/github-action-markdown-link-check@v1 + with: + use-quiet-mode: 'yes' # only show errors in output. + folder-path: 'blog/' + config-file: '.github/workflows/docs.links.check.config.json' + file-extension: '.md' + + - name: Check README Links 🔎 + uses: gaurav-nelson/github-action-markdown-link-check@v1 + with: + use-quiet-mode: 'yes' # only show errors in output. 
+ config-file: '.github/workflows/docs.links.check.config.json' + file-path: './README.md' diff --git a/README.md b/README.md index 79d1c22c..57b60cb1 100644 --- a/README.md +++ b/README.md @@ -130,6 +130,21 @@ See [Styling and Layout](https://docusaurus.io/docs/styling-layout) to learn how See [Swizzling](https://docusaurus.io/docs/swizzling) to learn how to modify Docusaurus built-in components. +### Link Checking + +The repository has comprehensive automated link checking for all URLs: + +- **Documentation, blog posts, and README**: Checked via GitHub Actions using [markdown-link-check](https://github.com/tcort/markdown-link-check) +- **Data files** (JSON/JS with GitHub profiles, repos, external links): Checked via a custom Node.js script + +You can run the data links checker locally: + +```bash +yarn check:links +``` + +For more details, see the [Link Checking documentation](./docs/LinkChecking.md). + ## LICENSE [Apache-2.0 license](./LICENSE). diff --git a/docs/LinkChecking.md b/docs/LinkChecking.md new file mode 100644 index 00000000..44cc777d --- /dev/null +++ b/docs/LinkChecking.md @@ -0,0 +1,172 @@ +--- +title: Link Checking +description: Guide to link checking in the Casbin website repository +keywords: [link checking, CI/CD, GitHub Actions, quality assurance] +--- + +# Link Checking + +The Casbin website repository has comprehensive link checking to ensure all URLs (GitHub profiles, repositories, internal links, external links) remain valid and accessible. + +## Overview + +Link checking is performed automatically through GitHub Actions workflows on: + +- **Schedule**: Weekly checks on Saturdays +- **Pull Requests**: Automatic checks when relevant files are modified +- **Manual**: Can be triggered via workflow dispatch + +## What Gets Checked + +### 1. 
Documentation Links (docs/) + +All markdown (`.md`) and MDX (`.mdx`) files in the `docs/` directory are checked for: + +- Internal links to other documentation pages +- External links to websites and resources +- GitHub repository and profile links +- Image references + +**Workflow**: `.github/workflows/docs-links-check.yml` (scheduled) and `.github/workflows/docs-links-check-pr.yml` (on PRs) + +### 2. Blog Posts (blog/) + +All markdown files in the `blog/` directory are checked for the same types of links as documentation. + +**Workflow**: `.github/workflows/docs-links-check.yml` (scheduled) and `.github/workflows/docs-links-check-pr.yml` (on PRs) + +### 3. README.md + +The main README file is checked to ensure all links remain valid. + +**Workflow**: `.github/workflows/docs-links-check.yml` (scheduled) and `.github/workflows/docs-links-check-pr.yml` (on PRs) + +### 4. Data Files + +JavaScript and JSON data files containing ecosystem information are checked for: + +- GitHub repository URLs +- GitHub profile URLs (author links) +- External documentation links +- Project websites + +**Files checked**: +- `static/data/users.json` - User showcase data +- All adapter data files in `src/tableData/AdapterData/` +- All middleware data files in `src/tableData/MiddlewareData/` +- All dispatcher data files in `src/tableData/DispatcherData/` +- All role manager data files in `src/tableData/RoleManagerData/` +- All watcher data files in `src/tableData/WatcherData/` + +**Workflow**: `.github/workflows/data-links-check.yml` + +**Local testing**: You can run the data links checker locally: + +```bash +npm run check:links +# or +yarn check:links +``` + +## Configuration + +### Ignored URL Patterns + +Some URLs are intentionally ignored in link checking because they: +- Block automated checkers +- Have rate limiting that causes false positives +- Are known to be problematic with HEAD requests + +These patterns are configured in `.github/workflows/docs.links.check.config.json`: + 
+```json +{ + "ignorePatterns": [ + { + "pattern": "^(?!https?://)" + }, + { + "pattern": "^https://codeigniter.com/" + }, + { + "pattern": "^https://www.thinkphp.cn/" + } + // ... more patterns + ] +} +``` + +The data links checker script also has its own ignore patterns in `scripts/check-data-links.js`. + +## How It Works + +### Markdown Link Checking + +Uses the [github-action-markdown-link-check](https://github.com/gaurav-nelson/github-action-markdown-link-check) action powered by [markdown-link-check](https://github.com/tcort/markdown-link-check). + +For each link found in markdown files: +1. Resolves the URL (handling relative paths, anchors, etc.) +2. Makes an HTTP HEAD request to check availability +3. Reports broken links as workflow failures + +### Data Links Checking + +Uses a custom Node.js script (`scripts/check-data-links.js`) that: +1. Extracts all URLs from specified data files using regex +2. Filters out ignored patterns +3. Makes HTTP HEAD requests with retry logic +4. Reports all broken links with error details + +## Handling False Positives + +If you encounter false positives (working links reported as broken): + +1. **Verify the link**: Manually check if the link actually works in a browser +2. **Check rate limiting**: Some sites (like LinkedIn) aggressively rate-limit automated checkers +3. **Add to ignore patterns**: If confirmed as false positive, add the pattern to the appropriate config file +4. **Document the reason**: Add a comment explaining why it's ignored + +## Contributing + +When adding new data files with URLs: + +1. Add the file path to the `filesToCheck` array in `scripts/check-data-links.js` +2. Test locally with `npm run check:links` +3. Ensure all links are valid before committing + +When adding new documentation: + +1. Ensure all links work before committing +2. The PR checks will automatically verify links in modified files +3. 
Internal links should use relative paths when possible + +## Troubleshooting + +### Link Checker Failures + +If the link checker fails on your PR: + +1. Check the workflow logs to see which links are broken +2. Fix or remove broken links +3. If it's a false positive, discuss with maintainers about adding to ignore list + +### Rate Limiting + +GitHub Actions may encounter rate limiting when checking many links. This is handled by: +- Retry logic in the data links checker +- Staggered execution of different workflows +- Using quiet mode to reduce noise in logs + +## Manual Workflow Runs + +You can manually trigger link checking workflows: + +1. Go to the [Actions tab](https://github.com/casbin/casbin-website-v2/actions) +2. Select the workflow you want to run +3. Click "Run workflow" +4. Choose the branch and click "Run workflow" + +This is useful for: +- Testing after making many link changes +- Verifying fixes for broken links +- One-off checks outside the regular schedule diff --git a/package.json b/package.json index 5e37d1e3..a8dfc7fe 100644 --- a/package.json +++ b/package.json @@ -15,6 +15,7 @@ "lint:css": "stylelint \"src/**/*.{css,scss}\" --fix", "lint:md": "markdownlint 'docs/**/*.{md,mdx}' --fix", "lint": "yarn lint:js && yarn lint:css && yarn lint:md", + "check:links": "node scripts/check-data-links.js", "write-translations": "docusaurus write-translations", "write-heading-ids": "docusaurus write-heading-ids", "crowdin:sync": "docusaurus write-translations && crowdin upload && crowdin download", diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 00000000..6b028166 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,40 @@ +# Scripts + +This directory contains utility scripts for the Casbin website. + +## check-data-links.js + +Checks the availability of all URLs in data files (JSON and JavaScript files). 
/*
 * Patch text that shares this span with the script: the tail of the
 * scripts/README.md hunk and the header of the scripts/check-data-links.js
 * hunk. It is kept verbatim, one patch line per row, so nothing is lost.
 * The "#!/usr/bin/env node" line is listed here rather than emitted as code
 * because a hashbang is only valid as the very first line of a file.
 *
 *   +
 *   +### What it checks
 *   +
 *   +- GitHub profile URLs (e.g., `@username`)
 *   +- GitHub repository URLs
 *   +- External documentation links
 *   +- Any HTTP/HTTPS URLs in the following files:
 *   +  - `static/data/users.json`
 *   +  - All adapter data files in `src/tableData/AdapterData/`
 *   +  - All middleware data files in `src/tableData/MiddlewareData/`
 *   +  - All dispatcher data files in `src/tableData/DispatcherData/`
 *   +  - All role manager data files in `src/tableData/RoleManagerData/`
 *   +  - All watcher data files in `src/tableData/WatcherData/`
 *   +
 *   +### Usage
 *   +
 *   +```bash
 *   +# Run directly
 *   +node scripts/check-data-links.js
 *   +
 *   +# Or use the npm script
 *   +npm run check:links
 *   +# or
 *   +yarn check:links
 *   +```
 *   +
 *   +### Configuration
 *   +
 *   +The script ignores certain URL patterns that are known to be problematic or block automated checks. These are configured in the `ignorePatterns` array in the script.
 *   +
 *   +### CI/CD Integration
 *   +
 *   +This script is automatically run by GitHub Actions on a schedule (weekly) via the `data-links-check.yml` workflow.
 *   diff --git a/scripts/check-data-links.js b/scripts/check-data-links.js
 *   new file mode 100755
 *   index 00000000..932c8e48
 *   --- /dev/null
 *   +++ b/scripts/check-data-links.js
 *   @@ -0,0 +1,195 @@
 *   +#!/usr/bin/env node
 */

/**
 * Script to check URL availability in data files (JSON and JS files)
 * Checks GitHub profiles, repositories, and external links
 */

const fs = require('fs');
const path = require('path');
const https = require('https');
const http = require('http');

// Files and patterns to check
const filesToCheck = [
  'static/data/users.json',
  'src/tableData/AdapterData/AdapterGoData.js',
  'src/tableData/AdapterData/AdapterJavaData.js',
  'src/tableData/AdapterData/AdapterNodejsData.js',
  'src/tableData/AdapterData/AdapterPhpData.js',
  'src/tableData/AdapterData/AdapterPythonData.js',
  'src/tableData/AdapterData/AdapterRustData.js',
  'src/tableData/AdapterData/AdapterRubyData.js',
  'src/tableData/AdapterData/AdapterSwiftData.js',
  'src/tableData/AdapterData/AdapterDotNETData.js',
  'src/tableData/AdapterData/AdapterLuaData.js',
  'src/tableData/MiddlewareData/MiddlewareData.js',
  'src/tableData/DispatcherData/DispatcherData.js',
  'src/tableData/RoleManagerData/RoleManagerData.js',
  'src/tableData/WatcherData/WatcherData.js',
];

// URL patterns to ignore (known problematic sites)
const ignorePatterns = [
  /^https:\/\/codeigniter\.com\//,
  /^https:\/\/www\.thinkphp\.cn\//,
  /^https:\/\/virgool\.io\//,
  /^https:\/\/blog\.csdn\.net/,
  /^https:\/\/kns\.cnki\.net/,
  /^https:\/\/learnku\.com/,
  /^http:\/\/www\.cnblogs\.com/,
];

// A candidate URL ends at whitespace, a quote, a backtick, an angle bracket,
// a backslash (the start of a JSON/JS escape such as \") or a closing paren.
const URL_PATTERN = /https?:\/\/[^\s"'`<>\\)]+/g;

// Punctuation that belongs to the surrounding text or data structure rather
// than to the URL when it appears at the very end of a match.
const TRAILING_PUNCTUATION = /[.,;:)}\]]+$/;

// Socket inactivity limit for a single request, in milliseconds.
const REQUEST_TIMEOUT_MS = 10000;

// Pause before repeating a failed check, in milliseconds.
const RETRY_DELAY_MS = 1000;

// Pause after each network check to avoid tripping rate limits, in milliseconds.
const REQUEST_GAP_MS = 100;

// Responses to HEAD that often only mean "HEAD is not accepted here"; the URL
// is re-checked with GET before it is reported.
const HEAD_FALLBACK_STATUS = new Set([403, 405, 501]);

// Responses that may succeed on a later attempt. Any other failing status
// (404, 410, ...) is treated as final and is not retried.
const RETRYABLE_STATUS = new Set([408, 429, 500, 502, 503, 504]);

/**
 * Resolve after the given number of milliseconds.
 *
 * @param {number} ms - Time to wait.
 * @returns {Promise<void>}
 */
function delay(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Collect the distinct http/https URLs that appear in a piece of text.
 *
 * Trailing punctuation (including a sentence-ending period) is trimmed from
 * each match, and a match is kept only if the trimmed text still parses as
 * an http(s) URL.
 *
 * @param {string} content - Raw file content to scan.
 * @returns {string[]} Unique URLs in order of first appearance.
 */
function extractUrls(content) {
  const urls = new Set();

  for (const candidate of content.match(URL_PATTERN) || []) {
    const url = candidate.replace(TRAILING_PUNCTUATION, '');

    try {
      const { protocol } = new URL(url);
      // Only allow http and https protocols for security
      if (protocol === 'http:' || protocol === 'https:') {
        urls.add(url);
      }
    } catch (err) {
      // Not a parseable URL once trimmed (for example a bare "https://"):
      // there is nothing to request, so the candidate is skipped on purpose.
    }
  }

  return Array.from(urls);
}

/**
 * Tell whether a URL matches one of the `ignorePatterns`.
 *
 * @param {string} url - URL to test.
 * @returns {boolean} True when the URL must not be checked.
 */
function shouldIgnore(url) {
  return ignorePatterns.some((pattern) => pattern.test(url));
}

/**
 * Send one request and resolve with the response status code.
 *
 * @param {URL} target - Parsed http(s) URL to request.
 * @param {string} method - HTTP method, 'HEAD' or 'GET'.
 * @returns {Promise<number>} Status code of the response.
 *   Rejects with an Error on a network failure or on timeout
 *   (message 'timeout').
 */
function requestStatus(target, method) {
  const transport = target.protocol === 'https:' ? https : http;
  const options = {
    method,
    headers: {
      'User-Agent': 'Mozilla/5.0 (compatible; LinkChecker/1.0)',
    },
    timeout: REQUEST_TIMEOUT_MS,
  };

  return new Promise((resolve, reject) => {
    const req = transport.request(target, options, (res) => {
      // Only the status matters; discard any body so the socket is released.
      res.resume();
      resolve(res.statusCode);
    });

    // Destroying with an error routes the timeout through the 'error' handler.
    req.on('timeout', () => req.destroy(new Error('timeout')));
    req.on('error', reject);
    req.end();
  });
}

/**
 * Check whether a URL is reachable.
 *
 * A HEAD request is tried first. If the server answers with a status in
 * HEAD_FALLBACK_STATUS the URL is requested again with GET. Network errors,
 * timeouts and statuses in RETRYABLE_STATUS are retried up to `retries`
 * times with a pause in between; an unparseable URL or any other failing
 * status is reported immediately.
 *
 * @param {string} url - URL to check.
 * @param {number} [retries=2] - Extra attempts after the first one.
 * @returns {Promise<{url: string, status: string, statusCode: (number|undefined), error: (string|undefined)}>}
 *   `status` is 'ok' for a 2xx/3xx response and 'error' otherwise. The
 *   promise never rejects.
 */
async function checkUrl(url, retries = 2) {
  let target;
  try {
    target = new URL(url);
  } catch (err) {
    // Retrying cannot make an unparseable URL valid.
    return { url, status: 'error', error: err.message, statusCode: undefined };
  }

  let failure = { error: undefined, statusCode: undefined };

  for (let attempt = 0; attempt <= retries; attempt += 1) {
    if (attempt > 0) {
      await delay(RETRY_DELAY_MS);
    }

    try {
      let statusCode = await requestStatus(target, 'HEAD');
      if (HEAD_FALLBACK_STATUS.has(statusCode)) {
        statusCode = await requestStatus(target, 'GET');
      }

      // Accept 2xx, 3xx status codes as success
      if (statusCode >= 200 && statusCode < 400) {
        return { url, status: 'ok', statusCode };
      }

      failure = { error: undefined, statusCode };
      if (!RETRYABLE_STATUS.has(statusCode)) {
        break;
      }
    } catch (err) {
      failure = { error: err.message, statusCode: undefined };
    }
  }

  return { url, status: 'error', ...failure };
}

/**
 * Check every URL found in `filesToCheck` and print a report.
 *
 * Files are resolved against the current working directory; a missing file
 * is reported and skipped. Each distinct URL is requested once per run even
 * when several files contain it; the stored result is counted and reported
 * for every file it appears in. Exits the process with code 1 when at least
 * one URL failed.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('Starting link checking in data files...\n');

  const results = {
    total: 0,
    ok: 0,
    error: 0,
    ignored: 0,
    errors: [],
  };

  // URL -> result of checkUrl, shared by all files in this run.
  const checked = new Map();

  for (const file of filesToCheck) {
    const filePath = path.join(process.cwd(), file);

    if (!fs.existsSync(filePath)) {
      console.log(`⚠️ File not found: ${file}`);
      continue;
    }

    console.log(`Checking: ${file}`);
    const content = fs.readFileSync(filePath, 'utf-8');
    const urls = extractUrls(content);

    console.log(` Found ${urls.length} URLs`);

    for (const url of urls) {
      results.total++;

      if (shouldIgnore(url)) {
        results.ignored++;
        continue;
      }

      let result = checked.get(url);
      if (result === undefined) {
        result = await checkUrl(url);
        checked.set(url, result);

        // Add delay to avoid rate limiting (only needed after a real request)
        await delay(REQUEST_GAP_MS);
      }

      if (result.status === 'ok') {
        results.ok++;
      } else {
        results.error++;
        results.errors.push({ file, ...result });
        console.log(` ❌ ${url} - ${result.error || result.statusCode}`);
      }
    }

    console.log('');
  }

  console.log('\n=== Summary ===');
  console.log(`Total URLs checked: ${results.total}`);
  console.log(`✅ OK: ${results.ok}`);
  console.log(`❌ Errors: ${results.error}`);
  console.log(`⚠️ Ignored: ${results.ignored}`);

  if (results.errors.length > 0) {
    console.log('\n=== Failed URLs ===');
    results.errors.forEach(({ file, url, error, statusCode }) => {
      console.log(`${file}: ${url}`);
      console.log(` Error: ${error || statusCode}\n`);
    });

    // Exit with error code if there are failures
    process.exit(1);
  }
}

// Start the crawl only when this file is executed directly. When it is loaded
// by another module (for example a unit test) the helpers are exposed without
// any network traffic; the typeof guard covers loaders that define no `module`.
if (typeof module !== 'undefined') {
  module.exports = { extractUrls, shouldIgnore, checkUrl, main };

  if (require.main === module) {
    main().catch((err) => {
      console.error('Error:', err);
      process.exit(1);
    });
  }
}