From ecc67f64740e250a2cfc6f199225ca3b212db0c9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Stucke?=
Date: Wed, 20 Aug 2025 17:20:17 +0200
Subject: [PATCH] feat: add plugin blacklist check to scheduler

which checks if there are cases where dependencies skip analyses that are
allowed for that plugin (these analyses would always be skipped)
---
Review notes (ignored by git am, not part of the commit message):

* _get_recursive_dependencies() is now iterative. It no longer raises an
  AttributeError when a dependency is not among the loaded plugins (which
  made the "is missing" warning unreachable) and it no longer recurses
  endlessly on cyclic dependencies.
* blacklist.py was edited by hand during review: the blob id in its index
  line still refers to the originally submitted content.
* This patch was re-flowed from a whitespace-collapsed copy. The split of
  the commit message into subject and body, as well as the indentation and
  the blank context lines of the scheduler.py hunks, are reconstructed;
  please verify them against the target tree before applying.

 src/scheduler/analysis/blacklist.py | 87 +++++++++++++++++++++++++++++
 src/scheduler/analysis/scheduler.py |  3 +
 2 files changed, 90 insertions(+)
 create mode 100644 src/scheduler/analysis/blacklist.py

diff --git a/src/scheduler/analysis/blacklist.py b/src/scheduler/analysis/blacklist.py
new file mode 100644
index 000000000..048e1a675
--- /dev/null
+++ b/src/scheduler/analysis/blacklist.py
@@ -0,0 +1,87 @@
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING
+
+from analysis.PluginBase import AnalysisBasePlugin
+
+if TYPE_CHECKING:
+    from analysis.plugin import AnalysisPluginV0
+
+
+def check_plugin_blacklists(analysis_plugins: dict[str, AnalysisPluginV0 | AnalysisBasePlugin]):
+    """
+    Check if there are cases where the analysis of a supported file type is not allowed for a dependency:
+    If plugin A depends on B and allows the analysis of file type X and this type is blacklisted for dependency B,
+    then the analysis of plugin A will always be skipped (which does not make a lot of sense, so we want to find
+    such cases and warn the user).
+
+    Direct and transitive dependencies are checked. Findings (including dependencies that are missing from
+    `analysis_plugins`) are only logged as warnings; nothing is returned.
+    """
+    for plugin_name, plugin in analysis_plugins.items():
+        dependencies = _get_recursive_dependencies(plugin_name, analysis_plugins)
+        blacklist = _get_blacklist(plugin)
+        whitelist = _get_whitelist(plugin)
+        for dependency_name in dependencies:
+            dependency = analysis_plugins.get(dependency_name)
+            if dependency is None:
+                logging.warning(f'Dependency {dependency_name} of plugin {plugin_name} is missing.')
+                continue
+            dependency_blacklist = _get_blacklist(dependency)
+            dependency_whitelist = _get_whitelist(dependency)
+            type_list = set()
+            if difference := dependency_blacklist.difference(blacklist):
+                # the blacklist of the plugin should be at least as restrictive as those of the dependencies
+                type_list.update(difference)
+            if dependency_whitelist and (difference := whitelist.difference(dependency_whitelist)):
+                # the whitelist of the plugin should be at least as restrictive as those of the dependencies
+                type_list.update(difference)
+            if overlap := whitelist.intersection(dependency_blacklist):
+                # whitelisted types should not be blacklisted by dependencies
+                type_list.update(overlap)
+            if type_list:
+                logging.warning(
+                    f'Plugin {plugin_name} allows analysis of types {type_list} which are either blacklisted or '
+                    f'not whitelisted for dependency {dependency_name}. These analyses will always be skipped!'
+                )
+
+
+def _get_recursive_dependencies(
+    plugin_name: str, analysis_plugins: dict[str, AnalysisPluginV0 | AnalysisBasePlugin]
+) -> set[str]:
+    """
+    Collect the names of all direct and transitive dependencies of the plugin `plugin_name`.
+
+    Dependencies that are missing from `analysis_plugins` are part of the result (so that the caller can report
+    them) but cannot be resolved any further. Each plugin is resolved only once, so cyclic dependencies do not
+    lead to endless recursion, and the plugin itself is never part of the result.
+    """
+    dependencies: set[str] = set()
+    unresolved = [plugin_name]
+    while unresolved:
+        plugin = analysis_plugins.get(unresolved.pop())
+        if plugin is None:
+            # a missing plugin has no known dependencies (and _get_dependencies() would fail for None)
+            continue
+        new_dependencies = _get_dependencies(plugin) - dependencies
+        dependencies.update(new_dependencies)
+        unresolved.extend(new_dependencies)
+    dependencies.discard(plugin_name)
+    return dependencies
+
+
+# FIXME: simplify when old base class gets removed
+def _get_dependencies(plugin: AnalysisBasePlugin | AnalysisPluginV0) -> set[str]:
+    """Return the dependencies of a plugin (`DEPENDENCIES` for the old base class, else `metadata.dependencies`)."""
+    return set(plugin.DEPENDENCIES if isinstance(plugin, AnalysisBasePlugin) else plugin.metadata.dependencies)
+
+
+def _get_blacklist(plugin: AnalysisBasePlugin | AnalysisPluginV0) -> set[str]:
+    """Return the blacklist of a plugin (`MIME_BLACKLIST` for the old base class, else `metadata.mime_blacklist`)."""
+    return set(plugin.MIME_BLACKLIST if isinstance(plugin, AnalysisBasePlugin) else plugin.metadata.mime_blacklist)
+
+
+def _get_whitelist(plugin: AnalysisBasePlugin | AnalysisPluginV0) -> set[str]:
+    """Return the whitelist of a plugin (`MIME_WHITELIST` for the old base class, else `metadata.mime_whitelist`)."""
+    return set(plugin.MIME_WHITELIST if isinstance(plugin, AnalysisBasePlugin) else plugin.metadata.mime_whitelist)
diff --git a/src/scheduler/analysis/scheduler.py b/src/scheduler/analysis/scheduler.py
index 918250790..543a2f652 100644
--- a/src/scheduler/analysis/scheduler.py
+++ b/src/scheduler/analysis/scheduler.py
@@ -28,6 +28,7 @@
 from storage.db_interface_view_sync import ViewUpdater
 from storage.fsorganizer import FSOrganizer
 
+from .blacklist import check_plugin_blacklists
 from .plugin import PluginRunner, Worker
 
 if TYPE_CHECKING:
@@ -110,6 +111,8 @@ def __init__(
         self._plugin_runners = {}
         self._load_plugins()
 
+        check_plugin_blacklists(self.analysis_plugins)
+
         self.stop_condition = Value('i', 0)
         self.process_queue = Queue()
         self.unpacking_locks = unpacking_locks