diff --git a/productionized/pulse-fetch/.env.example b/productionized/pulse-fetch/.env.example
index 344fc700..e8260df0 100644
--- a/productionized/pulse-fetch/.env.example
+++ b/productionized/pulse-fetch/.env.example
@@ -5,6 +5,11 @@
 # Get one at: https://www.firecrawl.dev/
 FIRECRAWL_API_KEY=your-firecrawl-api-key-here
 
+# Firecrawl API Base URL (optional)
+# Custom base URL for Firecrawl API (useful for self-hosted instances)
+# Defaults to: https://api.firecrawl.dev
+# FIRECRAWL_API_BASE_URL=https://api.firecrawl.dev
+
 # BrightData API Key (optional)
 # Get one at: https://brightdata.com/ from the Web Unlocker product
 # Just provide the token - 'Bearer ' will be prepended automatically
diff --git a/productionized/pulse-fetch/shared/src/scraping-client/lib/firecrawl-scrape.ts b/productionized/pulse-fetch/shared/src/scraping-client/lib/firecrawl-scrape.ts
index 30bd2858..5f2a5705 100644
--- a/productionized/pulse-fetch/shared/src/scraping-client/lib/firecrawl-scrape.ts
+++ b/productionized/pulse-fetch/shared/src/scraping-client/lib/firecrawl-scrape.ts
@@ -1,3 +1,41 @@
+/**
+ * Resolves the Firecrawl API base URL from FIRECRAWL_API_BASE_URL, falling
+ * back to the hosted API when the variable is unset or blank.
+ *
+ * The value is validated once, at module load, because it is interpolated
+ * into the request URL: it must parse as an absolute http(s) URL and must not
+ * carry credentials (fetch rejects those), a query string or fragment (the
+ * appended path would land inside them), backslashes, whitespace, or '..'.
+ * Trailing slashes are dropped so `${base}/v1/scrape` never contains '//'.
+ *
+ * @returns The base URL without a trailing slash.
+ * @throws Error if FIRECRAWL_API_BASE_URL is set but fails validation.
+ */
+const getBaseUrl = (): string => {
+  const baseUrl =
+    (process.env.FIRECRAWL_API_BASE_URL || '').trim() || 'https://api.firecrawl.dev';
+
+  let parsed: URL;
+  try {
+    parsed = new URL(baseUrl);
+  } catch {
+    throw new Error('Invalid FIRECRAWL_API_BASE_URL');
+  }
+
+  const isHttp = /^https?:\/\//i.test(baseUrl);
+  const hasCredentials = parsed.username !== '' || parsed.password !== '';
+  // Checked on the raw string: the URL parser silently normalizes these away.
+  const hasUnsafeChars = /[\s\\?#]/.test(baseUrl) || baseUrl.includes('..');
+
+  if (!isHttp || hasCredentials || hasUnsafeChars) {
+    throw new Error('Invalid FIRECRAWL_API_BASE_URL');
+  }
+
+  return baseUrl.replace(/\/+$/, '');
+};
+
+const FIRECRAWL_BASE_URL = getBaseUrl();
+
 export async function scrapeWithFirecrawl(
   apiKey: string,
   url: string,
@@ -13,7 +51,7 @@ export async function scrapeWithFirecrawl(
   error?: string;
 }> {
   try {
-    const response = await fetch('https://api.firecrawl.dev/v1/scrape', {
+    const response = await fetch(`${FIRECRAWL_BASE_URL}/v1/scrape`, {
       method: 'POST',
       headers: {
         'Content-Type': 'application/json',