diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index bdb7bfca..5481154a 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -31,6 +31,7 @@ import contextlib from functools import partial + class AsyncCrawlerStrategy(ABC): """ Abstract base class for crawler strategies. @@ -41,6 +42,7 @@ class AsyncCrawlerStrategy(ABC): async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: pass # 4 + 3 + class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): """ Crawler strategy using Playwright. @@ -490,7 +492,7 @@ async def _crawl_web( response_headers = {} execution_result = None status_code = None - redirected_url = url + redirected_url = url # Reset downloaded files list for new crawl self._downloaded_files = [] @@ -519,7 +521,9 @@ async def _crawl_web( await context.add_init_script(load_js_script("navigator_overrider")) # Call hook after page creation - await self.execute_hook("on_page_context_created", page, context=context, config=config) + await self.execute_hook( + "on_page_context_created", page, context=context, config=config + ) # Set up console logging if requested if config.log_console: @@ -547,7 +551,15 @@ def log_consol( # Get SSL certificate information if requested and URL is HTTPS ssl_cert = None if config.fetch_ssl_certificate: - ssl_cert = SSLCertificate.from_url(url) + ssl_cert, err = SSLCertificate.from_url( + url, proxy_config=config.proxy_config + ) + if err: + self.logger.warning( + message="Failed to fetch SSL certificate: {error}", + tag="SSL", + params={"error": err}, + ) # Set up download handling if self.browser_config.accept_downloads: @@ -560,7 +572,9 @@ def log_consol( # Handle page navigation and content loading if not config.js_only: - await self.execute_hook("before_goto", page, context=context, url=url, config=config) + await self.execute_hook( + "before_goto", page, context=context, url=url, config=config + ) try: # Generate a unique nonce for this request @@ -582,7 +596,12 @@ def log_consol( raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}") await self.execute_hook( - "after_goto", page, context=context, url=url, response=response, config=config + "after_goto", + page, + context=context, + url=url, + response=response, + config=config, ) if response is None: @@ -757,8 +776,16 @@ def log_consol( params={"error": execution_result.get("error")}, ) - await self.execute_hook("on_execution_started", page, context=context, config=config) - await self.execute_hook("on_execution_ended", page, context=context, config=config, result=execution_result) + await self.execute_hook( + "on_execution_started", page, context=context, config=config + ) + await self.execute_hook( + "on_execution_ended", + page, + context=context, + config=config, + result=execution_result, + ) # Handle user simulation if config.simulate_user or config.magic: @@ -802,7 +829,9 @@ def log_consol( page = await self.process_iframes(page) # Pre-content retrieval hooks and delay - await self.execute_hook("before_retrieve_html", page, context=context, config=config) + await self.execute_hook( + "before_retrieve_html", page, context=context, config=config + ) if config.delay_before_return_html: await asyncio.sleep(config.delay_before_return_html) @@ -833,7 +862,11 @@ def log_consol( # # Get final HTML content # html = await page.content() await self.execute_hook( - "before_return_html", page=page, html=html, context=context, config=config + "before_return_html", + page=page, + html=html, + context=context, + 
config=config, ) # Handle PDF, MHTML and screenshot generation @@ -918,7 +951,10 @@ async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1): viewport_size = page.viewport_size if viewport_size is None: await page.set_viewport_size( - {"width": self.browser_config.viewport_width, "height": self.browser_config.viewport_height} + { + "width": self.browser_config.viewport_width, + "height": self.browser_config.viewport_height, + } ) viewport_size = page.viewport_size @@ -982,7 +1018,9 @@ async def _handle_download(self, download): """ try: suggested_filename = download.suggested_filename - download_path = os.path.join(self.browser_config.downloads_path, suggested_filename) + download_path = os.path.join( + self.browser_config.downloads_path, suggested_filename + ) self.logger.info( message="Downloading {filename} to {path}", @@ -1514,7 +1552,6 @@ async def execute_user_script( t1 = time.time() await page.wait_for_load_state("domcontentloaded", timeout=5000) - t1 = time.time() await page.wait_for_load_state("networkidle", timeout=5000) @@ -1698,18 +1735,22 @@ async def page_need_scroll(self, page: Page) -> bool: # HTTP Crawler Strategy #################################################################################################### + class HTTPCrawlerError(Exception): """Base error class for HTTP crawler specific exceptions""" + pass class ConnectionTimeoutError(HTTPCrawlerError): """Raised when connection timeout occurs""" + pass class HTTPStatusError(HTTPCrawlerError): """Raised for unexpected status codes""" + def __init__(self, status_code: int, message: str): self.status_code = status_code super().__init__(f"HTTP {status_code}: {message}") @@ -1719,31 +1760,41 @@ class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy): """ Fast, lightweight HTTP-only crawler strategy optimized for memory efficiency. 
""" - - __slots__ = ('logger', 'max_connections', 'dns_cache_ttl', 'chunk_size', '_session', 'hooks', 'browser_config') + + __slots__ = ( + "logger", + "max_connections", + "dns_cache_ttl", + "chunk_size", + "_session", + "hooks", + "browser_config", + ) DEFAULT_TIMEOUT: Final[int] = 30 - DEFAULT_CHUNK_SIZE: Final[int] = 64 * 1024 + DEFAULT_CHUNK_SIZE: Final[int] = 64 * 1024 DEFAULT_MAX_CONNECTIONS: Final[int] = min(32, (os.cpu_count() or 1) * 4) DEFAULT_DNS_CACHE_TTL: Final[int] = 300 - VALID_SCHEMES: Final = frozenset({'http', 'https', 'file', 'raw'}) - - _BASE_HEADERS: Final = MappingProxyType({ - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.5', - 'Accept-Encoding': 'gzip, deflate, br', - 'Connection': 'keep-alive', - 'Upgrade-Insecure-Requests': '1', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' - }) - + VALID_SCHEMES: Final = frozenset({"http", "https", "file", "raw"}) + + _BASE_HEADERS: Final = MappingProxyType( + { + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.5", + "Accept-Encoding": "gzip, deflate, br", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + } + ) + def __init__( - self, + self, browser_config: Optional[HTTPCrawlerConfig] = None, logger: Optional[AsyncLogger] = None, max_connections: int = DEFAULT_MAX_CONNECTIONS, dns_cache_ttl: int = DEFAULT_DNS_CACHE_TTL, - chunk_size: int = DEFAULT_CHUNK_SIZE + chunk_size: int = DEFAULT_CHUNK_SIZE, ): """Initialize the HTTP crawler with config""" self.browser_config = browser_config or HTTPCrawlerConfig() @@ -1752,22 +1803,21 @@ def __init__( self.dns_cache_ttl = dns_cache_ttl self.chunk_size = chunk_size self._session: Optional[aiohttp.ClientSession] = None - + self.hooks = { - k: partial(self._execute_hook, k) - for k in ('before_request', 'after_request', 'on_error') + k: partial(self._execute_hook, k) + for k in ("before_request", "after_request", "on_error") } # Set default hooks - self.set_hook('before_request', lambda *args, **kwargs: None) - self.set_hook('after_request', lambda *args, **kwargs: None) - self.set_hook('on_error', lambda *args, **kwargs: None) - + self.set_hook("before_request", lambda *args, **kwargs: None) + self.set_hook("after_request", lambda *args, **kwargs: None) + self.set_hook("on_error", lambda *args, **kwargs: None) async def __aenter__(self) -> AsyncHTTPCrawlerStrategy: await self.start() return self - + async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: await self.close() @@ -1787,11 +1837,7 @@ def set_hook(self, hook_type: str, hook_func: Callable) -> None: raise ValueError(f"Invalid hook type: {hook_type}") async def _execute_hook( - self, - hook_type: str, - hook_func: Callable, - *args: Any, - **kwargs: Any + self, hook_type: str, hook_func: Callable, *args: Any, **kwargs: Any ) -> Any: if asyncio.iscoroutinefunction(hook_func): return await hook_func(*args, **kwargs) @@ -1803,12 +1849,12 @@ async def start(self) -> None: limit=self.max_connections, ttl_dns_cache=self.dns_cache_ttl, use_dns_cache=True, - force_close=False + force_close=False, ) self._session = aiohttp.ClientSession( headers=dict(self._BASE_HEADERS), connector=connector, - timeout=ClientTimeout(total=self.DEFAULT_TIMEOUT) + timeout=ClientTimeout(total=self.DEFAULT_TIMEOUT), ) async def close(self) -> None: @@ -1818,141 +1864,131 @@ async def close(self) -> 
None: except asyncio.TimeoutError: if self.logger: self.logger.warning( - message="Session cleanup timed out", - tag="CLEANUP" + message="Session cleanup timed out", tag="CLEANUP" ) finally: self._session = None async def _stream_file(self, path: str) -> AsyncGenerator[memoryview, None]: - async with aiofiles.open(path, mode='rb') as f: + async with aiofiles.open(path, mode="rb") as f: while chunk := await f.read(self.chunk_size): yield memoryview(chunk) async def _handle_file(self, path: str) -> AsyncCrawlResponse: if not os.path.exists(path): raise FileNotFoundError(f"Local file not found: {path}") - + chunks = [] async for chunk in self._stream_file(path): - chunks.append(chunk.tobytes().decode('utf-8', errors='replace')) - - return AsyncCrawlResponse( - html=''.join(chunks), - response_headers={}, - status_code=200 - ) + chunks.append(chunk.tobytes().decode("utf-8", errors="replace")) - async def _handle_raw(self, content: str) -> AsyncCrawlResponse: return AsyncCrawlResponse( - html=content, - response_headers={}, - status_code=200 + html="".join(chunks), response_headers={}, status_code=200 ) + async def _handle_raw(self, content: str) -> AsyncCrawlResponse: + return AsyncCrawlResponse(html=content, response_headers={}, status_code=200) async def _handle_http( - self, - url: str, - config: CrawlerRunConfig + self, url: str, config: CrawlerRunConfig ) -> AsyncCrawlResponse: async with self._session_context() as session: timeout = ClientTimeout( total=config.page_timeout or self.DEFAULT_TIMEOUT, connect=10, - sock_read=30 + sock_read=30, ) - + headers = dict(self._BASE_HEADERS) if self.browser_config.headers: headers.update(self.browser_config.headers) request_kwargs = { - 'timeout': timeout, - 'allow_redirects': self.browser_config.follow_redirects, - 'ssl': self.browser_config.verify_ssl, - 'headers': headers + "timeout": timeout, + "allow_redirects": self.browser_config.follow_redirects, + "ssl": self.browser_config.verify_ssl, + "headers": headers, } if self.browser_config.method == "POST": if self.browser_config.data: - request_kwargs['data'] = self.browser_config.data + request_kwargs["data"] = self.browser_config.data if self.browser_config.json: - request_kwargs['json'] = self.browser_config.json + request_kwargs["json"] = self.browser_config.json - await self.hooks['before_request'](url, request_kwargs) + await self.hooks["before_request"](url, request_kwargs) try: - async with session.request(self.browser_config.method, url, **request_kwargs) as response: + async with session.request( + self.browser_config.method, url, **request_kwargs + ) as response: content = memoryview(await response.read()) - + if not (200 <= response.status < 300): raise HTTPStatusError( - response.status, - f"Unexpected status code for {url}" + response.status, f"Unexpected status code for {url}" ) - + encoding = response.charset if not encoding: - encoding = cchardet.detect(content.tobytes())['encoding'] or 'utf-8' - + encoding = ( + cchardet.detect(content.tobytes())["encoding"] or "utf-8" + ) + result = AsyncCrawlResponse( - html=content.tobytes().decode(encoding, errors='replace'), + html=content.tobytes().decode(encoding, errors="replace"), response_headers=dict(response.headers), status_code=response.status, - redirected_url=str(response.url) + redirected_url=str(response.url), ) - - await self.hooks['after_request'](result) + + await self.hooks["after_request"](result) return result except aiohttp.ServerTimeoutError as e: - await self.hooks['on_error'](e) + await self.hooks["on_error"](e) raise 
ConnectionTimeoutError(f"Request timed out: {str(e)}") - + except aiohttp.ClientConnectorError as e: - await self.hooks['on_error'](e) + await self.hooks["on_error"](e) raise ConnectionError(f"Connection failed: {str(e)}") - + except aiohttp.ClientError as e: - await self.hooks['on_error'](e) + await self.hooks["on_error"](e) raise HTTPCrawlerError(f"HTTP client error: {str(e)}") - + except asyncio.exceptions.TimeoutError as e: - await self.hooks['on_error'](e) + await self.hooks["on_error"](e) raise ConnectionTimeoutError(f"Request timed out: {str(e)}") - + except Exception as e: - await self.hooks['on_error'](e) + await self.hooks["on_error"](e) raise HTTPCrawlerError(f"HTTP request failed: {str(e)}") async def crawl( - self, - url: str, - config: Optional[CrawlerRunConfig] = None, - **kwargs + self, url: str, config: Optional[CrawlerRunConfig] = None, **kwargs ) -> AsyncCrawlResponse: config = config or CrawlerRunConfig.from_kwargs(kwargs) - + parsed = urlparse(url) - scheme = parsed.scheme.rstrip('/') - + scheme = parsed.scheme.rstrip("/") + if scheme not in self.VALID_SCHEMES: raise ValueError(f"Unsupported URL scheme: {scheme}") - + try: - if scheme == 'file': + if scheme == "file": return await self._handle_file(parsed.path) - elif scheme == 'raw': + elif scheme == "raw": return await self._handle_raw(parsed.path) else: # http or https return await self._handle_http(url, config) - + except Exception as e: if self.logger: self.logger.error( message="Crawl failed: {error}", tag="CRAWL", - params={"error": str(e), "url": url} + params={"error": str(e), "url": url}, ) - raise \ No newline at end of file + raise diff --git a/crawl4ai/proxy_strategy.py b/crawl4ai/proxy_strategy.py index 6821c566..2dbca603 100644 --- a/crawl4ai/proxy_strategy.py +++ b/crawl4ai/proxy_strategy.py @@ -2,6 +2,7 @@ from abc import ABC, abstractmethod from itertools import cycle import os +from .validators import ProxyValidator class ProxyConfig: @@ -26,6 +27,9 @@ def __init__( # Extract IP from server if not explicitly provided self.ip = ip or self._extract_ip_from_server() + + # Normalize proxy configuration + self._normalize_proxy_config() def _extract_ip_from_server(self) -> Optional[str]: """Extract IP address from server URL.""" @@ -39,7 +43,63 @@ def _extract_ip_from_server(self) -> Optional[str]: return parts[0] except Exception: return None - + + def _normalize_proxy_config(self): + """ + Normalize proxy configuration to ensure consistency. + + Example: + proxy_config = { + "server": "http://user:pass@1.1.1.1:8090", + "username": "", + "password": "", + } -> + normalized_proxy_config = { + "server": "http://1.1.1.1:8090", + "username": "user", + "password": "pass", + } + """ + if not self.server: + return self + + from urllib.parse import urlparse, unquote + + parsed = urlparse(self.server) + + # urlparse("1.1.1.1:8090") -> scheme='', netloc='', path='1.1.1.1:8090' + # urlparse("localhost:8090") -> scheme='localhost', netloc='', path='8090' + # if both of these cases, we need to try re-parse URL with `http://` prefix. + if not parsed.netloc or not parsed.scheme: + parsed = urlparse(f"http://{self.server}") + + + username = self.username + password = self.password + # The server field takes precedence over username and password. 
+ if "@" in parsed.netloc: + auth_part, host_part = parsed.netloc.split("@", 1) + if ":" in auth_part: + username, password = auth_part.split(":", 1) + username = unquote(username) + password = unquote(password) + else: + username = unquote(auth_part) + + password = "" + server = f"{parsed.scheme}://{host_part}" + else: + server = f"{parsed.scheme}://{parsed.netloc}" + + self.server = server + self.username = username + self.password = password + + # Validate the proxy string + ProxyValidator().validate(self.server) + + return self + @staticmethod def from_string(proxy_str: str) -> "ProxyConfig": """Create a ProxyConfig from a string in the format 'ip:port:username:password'.""" @@ -69,7 +129,7 @@ def from_dict(proxy_dict: Dict) -> "ProxyConfig": username=proxy_dict.get("username"), password=proxy_dict.get("password"), ip=proxy_dict.get("ip") - ) + )._normalize_proxy_config() @staticmethod def from_env(env_var: str = "PROXIES") -> List["ProxyConfig"]: @@ -114,7 +174,6 @@ def clone(self, **kwargs) -> "ProxyConfig": config_dict.update(kwargs) return ProxyConfig.from_dict(config_dict) - class ProxyRotationStrategy(ABC): """Base abstract class for proxy rotation strategies""" diff --git a/crawl4ai/ssl_certificate.py b/crawl4ai/ssl_certificate.py index 722bb7f9..10152fc4 100644 --- a/crawl4ai/ssl_certificate.py +++ b/crawl4ai/ssl_certificate.py @@ -1,13 +1,131 @@ """SSL Certificate class for handling certificate operations.""" import ssl +import socks import socket import base64 +from datetime import datetime import json -from typing import Dict, Any, Optional +from typing import Dict, Any, Optional, Protocol, Tuple from urllib.parse import urlparse import OpenSSL.crypto from pathlib import Path +from .proxy_strategy import ProxyConfig +from .validators import SSLURLValidator + + +class ConnectionStrategy(Protocol): + """Strategy interface for creating socket connections.""" + + def create_connection(self, hostname: str, port: int, timeout: int) -> socket.socket: + """ + Create a socket connection to the specified host. + + Args: + hostname: Target hostname to connect to + port: Target port to connect to + timeout: Connection timeout in seconds + + Returns: + Connected socket object + """ + pass + + +class DirectConnectionStrategy: + """Direct connection strategy without using a proxy.""" + + def create_connection(self, hostname: str, port: int, timeout: int) -> socket.socket: + """Create a direct socket connection without proxy.""" + return socket.create_connection((hostname, port), timeout=timeout) + + +class HttpProxyConnectionStrategy: + """HTTP/HTTPS proxy connection strategy.""" + + def __init__(self, proxy_config: ProxyConfig): + """ + Initialize with proxy configuration. + + Args: + proxy_config: Proxy configuration object + """ + self.proxy_config = proxy_config + + def create_connection(self, hostname: str, port: int, timeout: int) -> socket.socket: + """Create a socket connection through HTTP/HTTPS proxy.""" + sock = socks.socksocket() + parsed = urlparse(self.proxy_config.server) + + sock.set_proxy( + socks.HTTP, + parsed.hostname, + parsed.port or 80, + username=self.proxy_config.username, + password=self.proxy_config.password, + ) + sock.settimeout(timeout) + sock.connect((hostname, port)) + return sock + + +class SocksProxyConnectionStrategy: + """SOCKS proxy connection strategy.""" + + def __init__(self, proxy_config: ProxyConfig): + """ + Initialize with proxy configuration. 
+ + Args: + proxy_config: Proxy configuration object + """ + self.proxy_config = proxy_config + + def create_connection(self, hostname: str, port: int, timeout: int) -> socket.socket: + """Create a socket connection through SOCKS proxy.""" + sock = socks.socksocket() + parsed = urlparse(self.proxy_config.server) + protocol = socks.SOCKS5 # socks5 default use socks5 + if parsed.scheme.lower() == "socks4": + protocol = socks.SOCKS4 + + sock.set_proxy( + protocol, + parsed.hostname, + parsed.port or 1080, + username=self.proxy_config.username, + password=self.proxy_config.password, + ) + sock.settimeout(timeout) + sock.connect((hostname, port)) + return sock + + +class ConnectionStrategyFactory: + """Factory for creating appropriate connection strategies.""" + + @staticmethod + def create_strategy(proxy_config: Optional[ProxyConfig]) -> ConnectionStrategy: + """ + Create appropriate connection strategy based on proxy configuration. + + Args: + proxy_config: Optional proxy configuration + + Returns: + A connection strategy instance + """ + if not proxy_config or not proxy_config.server: + return DirectConnectionStrategy() + + proxy_schema = urlparse(proxy_config.server).scheme.lower() + + if proxy_schema.startswith("http"): + return HttpProxyConnectionStrategy(proxy_config) + elif proxy_schema.startswith("socks"): + return SocksProxyConnectionStrategy(proxy_config) + else: + raise ValueError(f"Unsupported proxy scheme: {proxy_schema}") class SSLCertificate: @@ -18,68 +136,120 @@ class SSLCertificate: cert_info (Dict[str, Any]): The certificate information. Methods: - from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']: Create SSLCertificate instance from a URL. + from_url(url: str, timeout: int = 10, proxy_config: Optional[ProxyConfig] = None) -> Optional['SSLCertificate']: Create SSLCertificate instance from a URL. from_file(file_path: str) -> Optional['SSLCertificate']: Create SSLCertificate instance from a file. from_binary(binary_data: bytes) -> Optional['SSLCertificate']: Create SSLCertificate instance from binary data. - export_as_pem() -> str: Export the certificate as PEM format. - export_as_der() -> bytes: Export the certificate as DER format. - export_as_json() -> Dict[str, Any]: Export the certificate as JSON format. - export_as_text() -> str: Export the certificate as text format. + to_pem() -> Optional[str]: Export the certificate as PEM format. + to_der() -> Optional[bytes]: Export the certificate as DER format. + to_json() -> Optional[str]: Export the certificate as JSON format. + to_playwright_format() -> Dict[str, Any]: Export the certificate as Playwright format. """ def __init__(self, cert_info: Dict[str, Any]): self._cert_info = self._decode_cert_data(cert_info) @staticmethod - def from_url(url: str, timeout: int = 10) -> Optional["SSLCertificate"]: + def from_url( + url: str, timeout: int = 10, + proxy_config: Optional[ProxyConfig] = None, + verify_ssl: bool = False + ) -> Tuple[Optional["SSLCertificate"], str]: """ Create SSLCertificate instance from a URL. Args: url (str): URL of the website. timeout (int): Timeout for the connection (default: 10). - + proxy_config (Optional[ProxyConfig]): Proxy configuration (default: None). + verify_ssl (bool): Whether to verify SSL certificate (default: False). Returns: Optional[SSLCertificate]: SSLCertificate instance if successful, None otherwise. + Raises: + ValueError: If the URL is not a valid SSL URL. 
""" + + # Validate the URL + SSLURLValidator().validate(url) + try: + # Extract hostname from URL hostname = urlparse(url).netloc if ":" in hostname: hostname = hostname.split(":")[0] + + # Get appropriate connection strategy using the factory + connection_strategy = ConnectionStrategyFactory.create_strategy(proxy_config) + + # Create connection and extract certificate + sock = None + try: + sock = connection_strategy.create_connection(hostname, 443, timeout) + return SSLCertificate._extract_certificate_from_socket(sock, hostname, verify_ssl), None + finally: + # Ensure socket is closed if it wasn't transferred + if sock: + try: + sock.close() + except Exception: + pass # Ignore any errors during closing - context = ssl.create_default_context() - with socket.create_connection((hostname, 443), timeout=timeout) as sock: - with context.wrap_socket(sock, server_hostname=hostname) as ssock: - cert_binary = ssock.getpeercert(binary_form=True) - x509 = OpenSSL.crypto.load_certificate( - OpenSSL.crypto.FILETYPE_ASN1, cert_binary - ) - - cert_info = { - "subject": dict(x509.get_subject().get_components()), - "issuer": dict(x509.get_issuer().get_components()), - "version": x509.get_version(), - "serial_number": hex(x509.get_serial_number()), - "not_before": x509.get_notBefore(), - "not_after": x509.get_notAfter(), - "fingerprint": x509.digest("sha256").hex(), - "signature_algorithm": x509.get_signature_algorithm(), - "raw_cert": base64.b64encode(cert_binary), - } - - # Add extensions - extensions = [] - for i in range(x509.get_extension_count()): - ext = x509.get_extension(i) - extensions.append( - {"name": ext.get_short_name(), "value": str(ext)} - ) - cert_info["extensions"] = extensions - - return SSLCertificate(cert_info) + except (socket.gaierror, socket.timeout) as e: + return None, f"Network error: {e!s}" + except ssl.SSLCertVerificationError as e: + return None, f"SSL Verify error: {e!s}" + except socks.ProxyError as e: + return None, f"Proxy error: {e!s}" + except Exception as e: + return None, f"Error: {e!s}" - except Exception: - return None + @staticmethod + def _extract_certificate_from_socket(sock: socket.socket, hostname: str, verify_ssl: bool = False) -> "SSLCertificate": + """ + Extract certificate information from an open socket. 
+ + Args: + sock: Connected socket to extract certificate from + hostname: Hostname for SSL verification + verify_ssl: Whether to verify SSL certificate (default: False) + + Returns: + SSLCertificate object with extracted certificate information + """ + context = ssl.create_default_context() + + if not verify_ssl: + context.check_hostname = False + context.verify_mode = ssl.CERT_NONE + + with context.wrap_socket(sock, server_hostname=hostname) as ssock: + # Socket is now managed by the SSL context + cert_binary = ssock.getpeercert(binary_form=True) + x509 = OpenSSL.crypto.load_certificate( + OpenSSL.crypto.FILETYPE_ASN1, cert_binary + ) + + cert_info = { + "subject": dict(x509.get_subject().get_components()), + "issuer": dict(x509.get_issuer().get_components()), + "version": x509.get_version(), + "serial_number": hex(x509.get_serial_number()), + "not_before": x509.get_notBefore(), + "not_after": x509.get_notAfter(), + "fingerprint": x509.digest("sha256").hex(), + "signature_algorithm": x509.get_signature_algorithm(), + "raw_cert": base64.b64encode(cert_binary), + } + + # Add extensions + extensions = [] + for i in range(x509.get_extension_count()): + ext = x509.get_extension(i) + extensions.append( + {"name": ext.get_short_name(), "value": str(ext)} + ) + cert_info["extensions"] = extensions + + return SSLCertificate(cert_info) @staticmethod def _decode_cert_data(data: Any) -> Any: @@ -97,6 +267,15 @@ def _decode_cert_data(data: Any) -> Any: return [SSLCertificate._decode_cert_data(item) for item in data] return data + @staticmethod + def _parse_certificate_time(time_str: str) -> int: + """Parse certificate time.""" + if time_str.endswith("Z"): + time_str = time_str[:-1] + + dt = datetime.strptime(time_str, "%Y%m%d%H%M%S") + return int(dt.timestamp()) + def to_json(self, filepath: Optional[str] = None) -> Optional[str]: """ Export certificate as JSON. @@ -158,6 +337,20 @@ def to_der(self, filepath: Optional[str] = None) -> Optional[bytes]: except Exception: return None + def to_playwright_format(self) -> Dict[str, Any]: + """ + Export certificate as Playwright format. + """ + return { + "issuer": self.issuer.get("CN"), + "subject": self.subject.get("CN"), + "valid_from": self._parse_certificate_time(self.valid_from), + "valid_until": self._parse_certificate_time(self.valid_until), + } + + def __str__(self) -> str: + return self.to_json() + @property def issuer(self) -> Dict[str, str]: """Get certificate issuer information.""" diff --git a/crawl4ai/validators.py b/crawl4ai/validators.py new file mode 100644 index 00000000..e1c37c75 --- /dev/null +++ b/crawl4ai/validators.py @@ -0,0 +1,112 @@ +import re + + +class ValidationException(ValueError): + def __init__(self, input, validator): + ValueError.__init__(self, f"Input failed {validator} validation: {input}") + +class BaseValidator: + """ + Check the input against a regex and raise a ValidationException if it fails. + """ + + def __init__(self, regex, validator=None, flags=0): + if isinstance(regex, str): + self.match_object = re.compile(regex, flags) + else: + self.match_object = regex + self.validator = validator + + def validate(self, value): + """ + Validate the input against the regex. If it fails, raise a ValidationException. + """ + if self.match_object.match(value) is None: + raise ValidationException(value, self.validator) + + +class URLValidator(BaseValidator): + """ + Check if the input is a valid URL. + """ + + def __init__(self): + regex = ( + # {http,ftp}s:// (not required) + r"^((?:http|ftp)s?://)?" 
+ # Domain + r"(?:" + r"(?:[A-Z0-9](?:[_A-Z0-9-]{0,61}[A-Z0-9])?\.)+" + r"(?:[A-Z]{2,6}\.?|[_A-Z0-9-]{2,}\.?)|" + # Localhost + r"localhost|" + # IPv6 address + r"\[[a-f0-9:]+\]|" + # IPv4 address + r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\." + r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)" + r")" + # Optional port + r"(?::\d+)?" + # Path + r"(?:/?|[/?#]\S+)$" + ) + super(URLValidator, self).__init__(regex, "url", flags=re.IGNORECASE) + +class SSLURLValidator(BaseValidator): + """ + Check if the input is a valid SSL URL. + """ + + def __init__(self): + regex = ( + # https:// (required) + r"^(https?://)" + # Domain + r"(?:" + r"(?:[A-Z0-9](?:[_A-Z0-9-]{0,61}[A-Z0-9])?\.)+" + r"(?:[A-Z]{2,6}\.?|[_A-Z0-9-]{2,}\.?)|" + # Localhost + r"localhost|" + # IPv6 address + r"\[[a-f0-9:]+\]|" + # IPv4 address + r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\." + r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)" + r")" + # Optional port + r"(?::\d+)?" + # Path + r"(?:/?|[/?#]\S+)$" + ) + super(SSLURLValidator, self).__init__(regex, "ssl_url(must start with http/https)", flags=re.IGNORECASE) + + + +class ProxyValidator(BaseValidator): + """ + Check if the input is a valid proxy string. + """ + + def __init__(self): + regex = ( + # proxy scheme + r"^((?:https?|socks[45])://)" + # Username and password + r"(?:\S+(?::\S*)?@)?" + # Domain + r"(?:" + r"(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+" + r"(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|" + # Localhost + r"localhost|" + # IPv6 address + r"\[[a-f0-9:]+\]|" + # IPv4 address + r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\." + r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)" + r")" + # port + r"(?::\d+)$" + ) + super(ProxyValidator, self).__init__(regex, "proxy", re.IGNORECASE) diff --git a/docs/examples/proxy_example.py b/docs/examples/proxy_example.py new file mode 100644 index 00000000..e091327c --- /dev/null +++ b/docs/examples/proxy_example.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python +# encoding: utf-8 + +import asyncio +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + CacheMode, + DefaultMarkdownGenerator, + CrawlResult, +) +from crawl4ai.configs import ProxyConfig + + +async def main(): + browser_config = BrowserConfig(headless=True, verbose=True) + async with AsyncWebCrawler(config=browser_config) as crawler: + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + magic=True, + fetch_ssl_certificate=True, + proxy_config=ProxyConfig(server="socks5://127.0.0.1:1088"), + markdown_generator=DefaultMarkdownGenerator( + # content_filter=PruningContentFilter( + # threshold=0.48, threshold_type="fixed", min_word_threshold=0 + # ) + ), + ) + result : CrawlResult = await crawler.arun( + url="https://www.google.com", config=crawler_config + ) + print("ssl:", result.ssl_certificate) + print("markdown: ",result.markdown[:500]) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/pyproject.toml b/pyproject.toml index ad07548d..cf49b410 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,7 @@ dependencies = [ "faust-cchardet>=2.1.19", "aiohttp>=3.11.11", "humanize>=4.10.0", + "PySocks~=1.7.1" ] classifiers = [ "Development Status :: 4 - Beta", diff --git a/requirements.txt b/requirements.txt index c1f36c56..bbcca991 100644 --- a/requirements.txt +++ b/requirements.txt @@ 
-21,4 +21,5 @@ psutil>=6.1.1 nltk>=3.9.1 rich>=13.9.4 cssselect>=1.2.0 -faust-cchardet>=2.1.19 \ No newline at end of file +faust-cchardet>=2.1.19 +PySocks~=1.7.1 diff --git a/tests/validators/test_validators.py b/tests/validators/test_validators.py new file mode 100644 index 00000000..00f2b21d --- /dev/null +++ b/tests/validators/test_validators.py @@ -0,0 +1,58 @@ +import pytest +from crawl4ai.validators import ( + URLValidator, + SSLURLValidator, + ProxyValidator, + ValidationException, +) + +class TestValidators: + + def testURLValueData(self): + url_validator = URLValidator() + url_validator.validate("http://google.com/aaa") + url_validator.validate("google.com") + url_validator.validate("https://google.com") + url_validator.validate("https://google.com:7001") + url_validator.validate( + "long-foo_bar-askjdla1023u01_2u3-62532040b2148.looo0000ngurl.com" + ) + url_validator.validate( + "https://xxxx.example.com/some.php?aksljdlsa/test.html&id=foo@bar.com" + ) + url_validator.validate( + "https://xxxxx.freewebhostmost.com#foo@bar.com" + ) + with pytest.raises(ValidationException): + url_validator.validate("http://g=oogle") + with pytest.raises(ValidationException): + url_validator.validate("http://google.com/abc test/aa") + + def testProxyValueData(self): + proxy_validator = ProxyValidator() + proxy_validator.validate("socks5://127.0.0.1:1080") + proxy_validator.validate("socks4://127.0.0.1:1080") + proxy_validator.validate("http://192.168.1.1:8080") + proxy_validator.validate("https://1.1.1.1:8080") + proxy_validator.validate("https://google.com:8080") + proxy_validator.validate("http://user:pass@google.com:8080") + with pytest.raises(ValidationException): + # incorrect scheme + proxy_validator.validate("ftp://test.com") + with pytest.raises(ValidationException): + # Without port + proxy_validator.validate("http://test.com") + with pytest.raises(ValidationException): + # don't need path + proxy_validator.validate("http://test.com:8008/path") + + def testSSLURLValueData(self): + ssl_url_validator = SSLURLValidator() + ssl_url_validator.validate("https://google.com") + ssl_url_validator.validate("https://google.com:7001") + with pytest.raises(ValidationException): + # without scheme + ssl_url_validator.validate("google.com") + with pytest.raises(ValidationException): + # incorrect scheme + ssl_url_validator.validate("ftp://google.com")
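
Usage sketch (illustrative, not part of the patch): the new validators raise instead of returning a boolean, so callers wrap validate() in try/except. Hostnames below are placeholders; the import path follows the new crawl4ai/validators.py module above.

from crawl4ai.validators import (
    ProxyValidator,
    SSLURLValidator,
    URLValidator,
    ValidationException,
)

# validate() returns None on success and raises ValidationException
# (a ValueError subclass) when the pattern does not match.
URLValidator().validate("https://example.com/some/path?q=1")    # scheme is optional here
SSLURLValidator().validate("https://example.com:7001")          # http/https scheme required
ProxyValidator().validate("socks5://user:pass@127.0.0.1:1080")  # scheme and port required

try:
    ProxyValidator().validate("http://proxy.example.com")  # rejected: missing port
except ValidationException as exc:
    print(exc)  # Input failed proxy validation: http://proxy.example.com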
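
Usage sketch (illustrative, not part of the patch): how _normalize_proxy_config behaves at construction time, following the docstring example above. The proxy addresses are placeholders.

from crawl4ai.proxy_strategy import ProxyConfig

# Credentials embedded in the server URL are split out into username/password.
p = ProxyConfig(server="http://user:pass@1.1.1.1:8090")
assert p.server == "http://1.1.1.1:8090"
assert p.username == "user" and p.password == "pass"

# A scheme-less server string is re-parsed with an http:// prefix.
assert ProxyConfig(server="1.1.1.1:8090").server == "http://1.1.1.1:8090"

# ProxyValidator runs as the last step of normalization, so an unsupported
# scheme (or a missing port) raises ValidationException, i.e. a ValueError.
try:
    ProxyConfig(server="ftp://proxy.example.com:3128")
except ValueError as exc:
    print(exc)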
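
Usage sketch (illustrative, not part of the patch): the connection strategies can also be used on their own. The proxy address and target host are placeholders; without a live proxy the connect call will fail, which is expected for a sketch.

from crawl4ai.proxy_strategy import ProxyConfig
from crawl4ai.ssl_certificate import ConnectionStrategyFactory

# No proxy -> DirectConnectionStrategy; http(s):// -> HttpProxyConnectionStrategy;
# socks4:// or socks5:// -> SocksProxyConnectionStrategy; other schemes raise ValueError.
strategy = ConnectionStrategyFactory.create_strategy(
    ProxyConfig(server="socks5://127.0.0.1:1080")
)

sock = strategy.create_connection("example.com", 443, timeout=10)
try:
    ...  # e.g. wrap the socket in an ssl.SSLContext, as _extract_certificate_from_socket does
finally:
    sock.close()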
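
Usage sketch (illustrative, not part of the patch): the new from_url contract. It returns a (certificate, error) pair instead of Optional[SSLCertificate]; error is None on success and a short message ("Network error: ...", "Proxy error: ...", ...) on failure, which is how the Playwright strategy above consumes it. The URL and proxy are placeholders.

from crawl4ai.proxy_strategy import ProxyConfig
from crawl4ai.ssl_certificate import SSLCertificate

cert, err = SSLCertificate.from_url(
    "https://example.com",
    timeout=10,
    proxy_config=ProxyConfig(server="socks5://127.0.0.1:1080"),  # omit for a direct connection
    verify_ssl=False,  # default: fetch the certificate without verifying it
)
if err:
    print(f"SSL fetch failed: {err}")
else:
    print(cert.issuer.get("CN"), cert.subject.get("CN"))
    # CN strings plus epoch-second valid_from / valid_until, for Playwright
    print(cert.to_playwright_format())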
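
Usage sketch (illustrative, not part of the patch): registering hooks on AsyncHTTPCrawlerStrategy. It assumes the internal calling convention shown above, i.e. before_request receives (url, request_kwargs), after_request receives the AsyncCrawlResponse, and on_error receives the exception before it is re-raised as a crawler-specific error; example.com is a placeholder. The same crawl() entry point also dispatches file:// and raw: URLs.

import asyncio

from crawl4ai import CrawlerRunConfig
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy


async def main():
    async with AsyncHTTPCrawlerStrategy() as http:
        # Mutate the outgoing request headers before the request is sent.
        http.set_hook(
            "before_request",
            lambda url, kwargs: kwargs["headers"].update({"X-Debug": "1"}),
        )
        # Observe any aiohttp/timeout error before it propagates.
        http.set_hook("on_error", lambda exc: print(f"request failed: {exc}"))

        response = await http.crawl("https://example.com", config=CrawlerRunConfig())
        print(response.status_code, len(response.html))


asyncio.run(main())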