ansible · Oct 21, 2024
diff --git a/‎.flake8
+5 b/‎.flake8
+5
diff --git a/‎.gitignore
+50 b/‎.gitignore
+50
diff --git a/‎Makefile
+16 b/‎Makefile
+16
diff --git a/‎README.md
+39 b/‎README.md
+39
diff --git a/‎dispatcher/__init__.py b/‎dispatcher/__init__.py
diff --git a/‎dispatcher/brokers/pg_notify.py
+84 b/‎dispatcher/brokers/pg_notify.py
+84
diff --git a/‎dispatcher/cli.py
+54 b/‎dispatcher/cli.py
+54
diff --git a/‎dispatcher/main.py
+68 b/‎dispatcher/main.py
+68
diff --git a/‎dispatcher/pool.py
+160 b/‎dispatcher/pool.py
+160
diff --git a/‎dispatcher/producers/brokered.py
+46 b/‎dispatcher/producers/brokered.py
+46
diff --git a/‎dispatcher/producers/scheduled.py
+33 b/‎dispatcher/producers/scheduled.py
+33
diff --git a/‎dispatcher/publish.py
+129 b/‎dispatcher/publish.py
+129
diff --git a/‎dispatcher/utils.py
+29 b/‎dispatcher/utils.py
+29
diff --git a/‎dispatcher/worker/task.py
+208 b/‎dispatcher/worker/task.py
+208
diff --git a/‎docker-compose.yml
+16 b/‎docker-compose.yml
+16
diff --git a/‎docs/design_notes.md
+114 b/‎docs/design_notes.md
+114
diff --git a/‎docs/message_formats.md
+58 b/‎docs/message_formats.md
+58
diff --git a/‎docs/roadmap.md
+194 b/‎docs/roadmap.md
+194
diff --git a/‎pyproject.toml
+48 b/‎pyproject.toml
+48
diff --git a/‎requirements_dev.txt
+3 b/‎requirements_dev.txt
+3
diff --git a/‎tests/benchmark/__init__.py b/‎tests/benchmark/__init__.py
diff --git a/‎tests/conftest.py
+15 b/‎tests/conftest.py
+15
diff --git a/‎tests/integration/test_main.py
+36 b/‎tests/integration/test_main.py
+36
diff --git a/‎tests/unit/test_config.py
+3 b/‎tests/unit/test_config.py
+3
diff --git a/‎tools/write_messages.py
+28 b/‎tools/write_messages.py
+28
@@ -0,0 +1,5 @@
+[flake8]
+max-line-length = 160
+extend-ignore = E203
+exclude = .git,tools
+max-complexity = 10
@@ -0,0 +1,50 @@
+# Copied from django-ansible-base
+
+# User level pre-commit hooks
+pre-commit-user
+
+# Make target touch files
+.docker-compose-built
+
+# Python & setuptools
+__pycache__
+/build
+/deb-build
+/reprepro
+/rpm-build
+/tar-build
+/setup-bundle-build
+/dist
+/*.egg-info
+*.py[c,o]
+/.eggs
+.coverage*
+coverage.xml
+coverage.json
+django-ansible-base-test-results.xml
+htmlcov
+*.tox
+venv/
+.venv/
+
+# Mac OS X
+*.DS_Store
+
+# VSCode
+.vscode/
+
+# Editors
+*.sw[poj]
+*~
+
+# SQLite
+*.sqlite3
+*.sqlite3_gw*
+*.sqlite3-journal
+
+# Container customizations
+container-startup.yml
+tools/generated/*
+
+# Gets created when testing sonar-scanner locally
+.scannerwork
@@ -0,0 +1,16 @@
+DOCKER_COMPOSE ?= docker compose
+
+# None of these targets produce a file with the same name, so always run them
+.PHONY: postgres stop-postgres clean
+
+# Mostly copied from DAB
+## Starts the postgres container, reusing an already created one when possible
+postgres:
+	docker start dispatch_postgres || $(DOCKER_COMPOSE) up -d msg_postgres --quiet-pull
+
+## Stops the postgres container started with 'make postgres'
+stop-postgres:
+	echo "Killing dispatch_postgres container"
+	$(DOCKER_COMPOSE) rm -fsv msg_postgres
+
+## Removes Python bytecode and packaging leftovers
+clean:
+	find . -type f -regex ".*\.py[co]$$" -delete
+	# rm -rf instead of -delete, because -delete fails on a non-empty directory
+	find . -type d -name "__pycache__" -prune -exec rm -rf {} +
+	rm -rf dispatcher.egg-info/
@@ -0,0 +1,39 @@
+# dispatcher
+Working space for dispatcher prototyping
+
+This is firstly intended to be a code split of:
+
+https://github.com/ansible/awx/tree/devel/awx/main/dispatch
+
+As a part of doing the split, we also want to resolve a number of
+long-standing design and sustainability issues, thus, asyncio.
+
+### Manual Demo
+
+You need to have 2 terminal tabs open to run this.
+
+```
+# tab 1
+make postgres
+dispatcher-standalone
+# tab 2
+python tools/write_messages.py
+```
+
+This will run the dispatcher with schedules, and process a burst of messages
+that give instructions to run tasks.
+
+### Running Tests
+
+A structure has been set up for integration tests.
+The word "integration" only means that postgres must be running.
+
+```
+pip install -r requirements_dev.txt
+make postgres
+py.test tests/
+```
+
+This accomplishes the most basic of starting and shutting down.
+With no tasks submitted, it should record running 0 tasks,
+and with a task submitted, it records running 1 task.
@@ -0,0 +1,84 @@
+import logging
+
+import psycopg
+
+logger = logging.getLogger(__name__)
+
+
+"""This module exists under the theory that dispatcher messaging should be swappable
+
+to different message busses eventually.
+That means that the main code should never import psycopg.
+Thus, all psycopg-lib-specific actions must happen here.
+"""
+
+
+# TODO: get database data from settings
+# # As Django settings, may not use
+# DATABASES = {
+#     "default": {
+#         "ENGINE": "django.db.backends.postgresql",
+#         "HOST": os.getenv("DB_HOST", "127.0.0.1"),
+#         "PORT": os.getenv("DB_PORT", 55777),
+#         "USER": os.getenv("DB_USER", "dispatch"),
+#         "PASSWORD": os.getenv("DB_PASSWORD", "dispatching"),
+#         "NAME": os.getenv("DB_NAME", "dispatch_db"),
+#     }
+# }
+
+
+async def aget_connection(config):
+    """Open an autocommit psycopg ``AsyncConnection``.
+
+    ``config`` is unpacked as keyword arguments to ``AsyncConnection.connect``.
+    """
+    connection = await psycopg.AsyncConnection.connect(autocommit=True, **config)
+    return connection
+
+
+def get_connection(config):
+    """Open an autocommit synchronous psycopg ``Connection``.
+
+    ``config`` is unpacked as keyword arguments to ``Connection.connect``.
+    """
+    connection = psycopg.Connection.connect(autocommit=True, **config)
+    return connection
+
+
+async def aprocess_notify(connection, channels):
+    """Async generator yielding ``(channel, payload)`` for every notification.
+
+    Issues a ``LISTEN`` for each entry of ``channels`` on ``connection`` and then
+    iterates ``connection.notifies()`` in an endless loop.
+    """
+    async with connection.cursor() as cursor:
+        for channel_name in channels:
+            # NOTE(review): the channel name is placed into the SQL text as-is;
+            # presumably channels only come from trusted configuration - confirm.
+            await cursor.execute(f"LISTEN {channel_name};")
+            logger.info(f"Set up pg_notify listening on channel '{channel_name}'")
+
+        while True:
+            logger.debug('Starting listening for pg_notify notifications')
+            async for notification in connection.notifies():
+                channel, payload = notification.channel, notification.payload
+                logger.debug(f"Received notification: {channel} - {payload}")
+                yield channel, payload
+
+
+def get_django_connection():
+    """Return the raw connection behind ``django.db.connection``, if usable.
+
+    Returns:
+        The underlying connection object, or ``None`` when Django cannot be
+        imported or raises ``ImproperlyConfigured``, so that the caller can
+        fall back to opening its own connection.
+
+    Raises:
+        RuntimeError: Django is importable but no connection exists even
+            after calling ``connect()``.
+    """
+    try:
+        # ImproperlyConfigured is defined in django.core.exceptions; importing it
+        # from django.conf relied on that module happening to import the name.
+        from django.core.exceptions import ImproperlyConfigured
+        from django.db import connection as pg_connection
+    except ImportError:
+        return None
+
+    try:
+        if pg_connection.connection is None:
+            pg_connection.connect()
+        if pg_connection.connection is None:
+            raise RuntimeError('Unexpectedly could not connect to postgres for pg_notify actions')
+        return pg_connection.connection
+    except ImproperlyConfigured:
+        return None
+
+
+def publish_message(queue, message, config=None, new_connection=False):
+    """Send ``message`` on the pg_notify channel ``queue``.
+
+    Args:
+        queue: channel name passed to ``pg_notify``.
+        message: payload passed to ``pg_notify``.
+        config: psycopg connection keyword arguments, required whenever the
+            Django connection is skipped or unavailable.
+        new_connection: when true, do not try the Django connection and always
+            open a dedicated connection from ``config``.
+
+    Raises:
+        RuntimeError: no Django connection could be used and ``config`` is None.
+    """
+    conn = None
+    if not new_connection:
+        conn = get_django_connection()
+
+    # Only a connection opened here is ours to close
+    owns_connection = False
+    if not conn:
+        if config is None:
+            raise RuntimeError('Could not use Django connection, and no postgres config supplied')
+        conn = get_connection(config)
+        owns_connection = True
+
+    try:
+        with conn.cursor() as cur:
+            cur.execute('SELECT pg_notify(%s, %s);', (queue, message))
+
+        logger.debug(f'Sent pg_notify message to {queue}')
+    finally:
+        # Close on every path: previously the fallback connection
+        # (new_connection=False without Django) was never closed, and an
+        # error in execute() skipped the close of a dedicated connection.
+        if owns_connection:
+            conn.close()
@@ -0,0 +1,54 @@
+import argparse
+import asyncio
+import logging
+import sys
+from datetime import timedelta
+
+from dispatcher.main import DispatcherMain
+
+logger = logging.getLogger(__name__)
+
+
+# TODO: obviously stop hard-coding this
+# Demo schedule passed as the "scheduled" producer config in standalone().
+# Each key is the task name, here a "lambda:" source string, and each value
+# holds the interval between submissions under the 'schedule' key.
+CELERYBEAT_SCHEDULE = {
+    'lambda: __import__("time").sleep(1)': {'schedule': timedelta(seconds=3)},
+    'lambda: __import__("time").sleep(2)': {'schedule': timedelta(seconds=3)},
+}
+
+
+# List of channels to listen on
+# NOTE(review): 'test_channel2' is listed twice; presumably the last entry was
+# meant to be a third, distinct channel - confirm against the message writer.
+CHANNELS = ['test_channel', 'test_channel2', 'test_channel2']
+
+# Database connection details
+# These match the credentials and published port of the msg_postgres service
+# in docker-compose.yml; they are for local development only.
+CONNECTION_STRING = "dbname=dispatch_db user=dispatch password=dispatching host=localhost port=55777"
+
+
+def standalone():
+    """CLI entry point that runs a dispatcher with the hard-coded demo config.
+
+    Parses ``--log-level``, configures logging to standard out, builds a
+    ``DispatcherMain`` and runs its ``main()`` coroutine until it returns or
+    the process receives a keyboard interrupt.
+    """
+    parser = argparse.ArgumentParser(description="CLI entrypoint for dispatcher, mainly intended for testing.")
+    parser.add_argument(
+        '--log-level',
+        type=str,
+        default='DEBUG',
+        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
+        help='Python log level to standard out. If you want to log to file you are in the wrong place.',
+    )
+
+    args = parser.parse_args()
+    logging.basicConfig(level=getattr(logging, args.log_level), stream=sys.stdout)
+
+    # Use the module logger like the rest of this file, not the root logger
+    logger.debug(f"Configured standard out logging at {args.log_level} level")
+
+    config = {
+        "producers": {
+            "brokers": {"pg_notify": {"conninfo": CONNECTION_STRING}, "channels": CHANNELS},
+            "scheduled": CELERYBEAT_SCHEDULE,
+        },
+        "pool": {"max_workers": 3},
+    }
+
+    # asyncio.get_event_loop() is deprecated when no loop is running, so the
+    # loop is created explicitly. It is installed as the current loop before
+    # DispatcherMain is built, because its __init__ creates asyncio objects.
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    dispatcher = DispatcherMain(config)
+    try:
+        loop.run_until_complete(dispatcher.main())
+    except KeyboardInterrupt:
+        logger.info('CLI entry point leaving')
+    finally:
+        loop.close()
@@ -0,0 +1,68 @@
+import asyncio
+import logging
+import signal
+
+from dispatcher.pool import WorkerPool
+from dispatcher.producers.brokered import BrokeredProducer
+from dispatcher.producers.scheduled import ScheduledProducer
+
+logger = logging.getLogger(__name__)
+
+
+class DispatcherMain:
+    """Top-level object that wires the task producers to the worker pool.
+
+    Args:
+        config: dict with the optional keys ``producers`` (holding ``brokers``
+            and/or ``scheduled``) and ``pool`` (holding ``max_workers``).
+    """
+
+    def __init__(self, config):
+        self.exit_event = asyncio.Event()
+        # Reference to the shutdown task started from a signal handler
+        self._shutdown_task = None
+        # Honor the configured pool size; 3 used to be hard-coded here
+        num_workers = config.get('pool', {}).get('max_workers', 3)
+        self.pool = WorkerPool(num_workers)
+
+        # Initialize all the producers, this should not start anything, just establishes objects
+        self.producers = []
+        producer_config = config.get('producers', {})
+        brokers = producer_config.get('brokers', {})
+        for broker_name, broker_config in brokers.items():
+            # TODO: import from the broker module here, some importlib stuff
+            # TODO: make channels specific to broker, probably
+            # Only pg_notify is handled; this also skips the sibling "channels" key
+            if broker_name != 'pg_notify':
+                continue
+            self.producers.append(BrokeredProducer(broker=broker_name, config=broker_config, channels=brokers.get('channels', ())))
+        if 'scheduled' in producer_config:
+            self.producers.append(ScheduledProducer(producer_config['scheduled']))
+
+    async def connect_signals(self):
+        """Install SIGINT and SIGTERM handlers that start a graceful shutdown."""
+        loop = asyncio.get_running_loop()
+        for sig in (signal.SIGINT, signal.SIGTERM):
+            # Hand sig over as a callback argument. A lambda closing over the
+            # loop variable would see the last signal for both handlers.
+            loop.add_signal_handler(sig, self._request_shutdown, sig)
+
+    def _request_shutdown(self, sig):
+        """Signal callback: schedule shutdown() and keep a reference to its task."""
+        self._shutdown_task = asyncio.create_task(self.shutdown(sig))
+
+    async def shutdown(self, sig=None):
+        """Stop the producers, then the worker pool, then release main().
+
+        Args:
+            sig: the signal that triggered the shutdown, only used for logging.
+        """
+        if sig:
+            logger.info(f"Received exit signal {sig.name}...")
+
+        logger.debug("Shutting down, starting with producers.")
+        for producer in self.producers:
+            await producer.shutdown()
+
+        logger.debug('Gracefully shutting down worker pool')
+        await self.pool.shutdown()
+
+        logger.debug('Setting event to exit main loop')
+        self.exit_event.set()
+
+    async def start_working(self):
+        """Start the worker pool, then start every producer against it."""
+        logger.debug('Filling the worker pool')
+        await self.pool.start_working()
+
+        logger.debug('Starting task production')
+        for producer in self.producers:
+            await producer.start_producing(self.pool)
+
+    async def main(self):
+        """Run the dispatcher until shutdown() sets the exit event."""
+        logger.info('Connecting dispatcher signal handling')
+        await self.connect_signals()
+
+        await self.start_working()
+
+        logger.info('Dispatcher running forever, or until shutdown command')
+        await self.exit_event.wait()
+
+        logger.debug('Dispatcher loop fully completed')
@@ -0,0 +1,160 @@
+import asyncio
+import json
+import logging
+import multiprocessing
+import os
+
+from dispatcher.worker.task import work_loop
+
+logger = logging.getLogger(__name__)
+
+
+class PoolWorker:
+    """Parent-side handle for one worker subprocess.
+
+    Args:
+        worker_id: identifier passed to the subprocess and used as pool key.
+        finished_queue: queue shared by all workers to report back events.
+    """
+
+    def __init__(self, worker_id, finished_queue):
+        self.worker_id = worker_id
+        # TODO: rename message_queue to call_queue, because this is what cpython ProcessPoolExecutor calls them
+        self.message_queue = multiprocessing.Queue()
+        self.process = multiprocessing.Process(target=work_loop, args=(self.worker_id, self.message_queue, finished_queue))
+        self.current_task = None
+        self.finished_count = 0
+        self.status = 'initialized'
+
+    def start(self):
+        """Start the subprocess."""
+        self.process.start()
+        self.status = 'starting'
+
+    async def stop(self):
+        """Ask the worker to stop and wait until its process has ended."""
+        self.status = 'stopping'
+        self.message_queue.put("stop")
+        # Process.join() blocks, so it runs in the default executor. Calling
+        # it directly would freeze the event loop until the worker has exited.
+        await asyncio.get_running_loop().run_in_executor(None, self.process.join)
+
+    def mark_finished_task(self):
+        """Record that the current task is done, which marks the worker as free."""
+        self.current_task = None
+        self.finished_count += 1
+
+
+class WorkerPool:
+    """Owns the worker subprocesses, hands them tasks and tracks completions.
+
+    Args:
+        num_workers: number of worker processes created by start_working().
+    """
+
+    def __init__(self, num_workers):
+        self.num_workers = num_workers
+        self.workers = {}
+        self.next_worker_id = 0
+        self.finished_queue = multiprocessing.Queue()
+        self.queued_messages = []  # TODO: use deque, invent new kinds of message anxiety and panic
+        self.read_results_task = None
+        self.shutting_down = False
+        self.finished_count = 0
+        self.shutdown_timeout = 3
+        # TODO: worker management lock
+
+    async def start_working(self):
+        """Spawn the workers and start the task that reads their results."""
+        self._spawn_workers()
+        self.read_results_task = asyncio.create_task(self.read_results_forever())
+
+    def _spawn_workers(self):
+        """Create and start num_workers workers with consecutive ids."""
+        for _ in range(self.num_workers):
+            worker = PoolWorker(worker_id=self.next_worker_id, finished_queue=self.finished_queue)
+            worker.start()
+            self.workers[self.next_worker_id] = worker
+            self.next_worker_id += 1
+
+    async def stop_workers(self):
+        """Stop the workers one after another."""
+        for worker in self.workers.values():
+            await worker.stop()
+
+    async def force_shutdown(self):
+        """Kill every worker still alive and cancel the results reader."""
+        for worker in self.workers.values():
+            if worker.process.is_alive():
+                logger.warning(f'Force killing worker {worker.worker_id} pid={worker.process.pid}')
+                # os.kill(pid) without a signal argument raises TypeError,
+                # so use the kill method of the Process object instead
+                worker.process.kill()
+
+        self.read_results_task.cancel()
+        logger.info('Finished watcher had to be canceled, awaiting it a second time')
+        try:
+            await self.read_results_task
+        except asyncio.CancelledError:
+            pass
+
+    async def shutdown(self):
+        """Stop the workers, then wait up to shutdown_timeout for the results reader."""
+        self.shutting_down = True
+        await self.stop_workers()
+        if self.read_results_task:
+            logger.info('Waiting for the finished watcher to return')
+            try:
+                await asyncio.wait_for(self.read_results_task, timeout=self.shutdown_timeout)
+            except asyncio.TimeoutError:
+                logger.warning(f'The finished task failed to cancel in {self.shutdown_timeout} seconds, will force.')
+                await self.force_shutdown()
+            except asyncio.CancelledError:
+                logger.info('The finished task was canceled, but we are shutting down so that is alright')
+        logger.info('The finished watcher has returned. Pool is shut down')
+
+    async def dispatch_task(self, message):
+        """Give message to the first idle worker, or queue it when all are busy.
+
+        Args:
+            message: a dict, or a string that is parsed as JSON. A string that
+                is not valid JSON is taken to be the task name itself.
+        """
+        # TODO: handle this more elegantly, maybe through the DispatcherMain, or tell clients not to do this
+        if isinstance(message, str):
+            try:
+                message = json.loads(message)
+            except Exception:
+                message = {'task': message}
+
+        for candidate_worker in self.workers.values():
+            if not candidate_worker.current_task:
+                worker = candidate_worker
+                break
+        else:
+            # TODO: under certain conditions scale up workers
+            logger.warning(f'Ran out of available workers, queueing up next task, current queued {len(self.queued_messages)}')
+            self.queued_messages.append(message)
+            return
+
+        logger.debug(f"Dispatching task to worker {worker.process.pid}: {message}")
+
+        # Put the message in the selected worker's queue, NOTE: this marks the worker as busy
+        worker.current_task = message
+
+        # Go ahead and do the put synchronously, because it is just putting it on the queue
+        worker.message_queue.put(message)
+
+    async def process_finished(self, worker, message):
+        """Bookkeeping for a "done" event reported by worker."""
+        result = message["result"]
+        logger.debug(f"Task completed by worker {worker.worker_id}: {result}")
+
+        # Mark the worker as no longer busy
+        worker.mark_finished_task()
+        self.finished_count += 1
+
+    async def read_results_forever(self):
+        """Perpetual task that continuously waits for task completions."""
+        loop = asyncio.get_running_loop()
+        while True:
+            # Wait for a result from the finished queue (blocking)
+            # worker_id, finished_message
+            message = await loop.run_in_executor(None, self.finished_queue.get)
+            worker_id = message["worker"]
+            event = message["event"]
+            worker = self.workers[worker_id]
+
+            if event == 'ready':
+                worker.status = 'ready'
+
+            elif event == 'shutdown':
+                # TODO: remove worker from worker list... but we do not have autoscale pool yet so need that
+                worker.status = 'exited'
+                if self.shutting_down:
+                    if all(worker.status == 'exited' for worker in self.workers.values()):
+                        logger.debug(f"Worker {worker_id} exited and that is all, exiting finished monitoring.")
+                        break
+                    else:
+                        logger.debug(f"Worker {worker_id} exited and that is a good thing because we are trying to shut down.")
+                elif not self.workers:
+                    logger.info('All workers exited, exiting results thread out of abundance of caution')
+                    break
+                else:
+                    logger.debug(f"Worker {worker_id} finished exiting. The rest of this is not yet coded.")
+                    continue
+
+            elif event == 'done':
+                await self.process_finished(worker, message)
+
+            if self.queued_messages and (not self.shutting_down):
+                # Take the oldest message first. pop() without an index took
+                # the newest one, which could leave early messages waiting forever.
+                requeue_message = self.queued_messages.pop(0)
+                await self.dispatch_task(requeue_message)
@@ -0,0 +1,46 @@
+import asyncio
+import logging
+
+from dispatcher.brokers.pg_notify import aget_connection, aprocess_notify
+
+logger = logging.getLogger(__name__)
+
+
+class BrokeredProducer:
+    """Producer that forwards messages received from a broker to the pool.
+
+    Args:
+        broker: name of the broker, used in log messages.
+        config: connection settings handed to ``aget_connection``.
+        channels: channel names to listen on.
+    """
+
+    def __init__(self, broker='pg_notify', config=None, channels=()):
+        self.production_task = None
+        self.broker = broker
+        self.config = config
+        self.channels = channels
+        # Assigned by connect(). Defined here so that shutdown() also works
+        # when the connection was never made; it raised AttributeError before.
+        self.connection = None
+
+    async def start_producing(self, pool):
+        """Start the background task that feeds pool."""
+        self.production_task = asyncio.create_task(self.produce_forever(pool))
+
+    def all_tasks(self):
+        """Return the asyncio tasks of this producer as a list."""
+        if self.production_task:
+            return [self.production_task]
+        return []
+
+    async def connect(self):
+        """Open the broker connection and store it on the instance."""
+        self.connection = await aget_connection(self.config)
+
+    async def produce_forever(self, pool):
+        """Connect, then dispatch every received payload to pool."""
+        await self.connect()
+
+        async with self.connection:
+
+            async for channel, payload in aprocess_notify(self.connection, self.channels):
+                logger.info(f"Received message from channel '{channel}': {payload}, sending to worker")
+                await pool.dispatch_task(payload)
+
+    async def shutdown(self):
+        """Cancel the production task and close the connection, if present."""
+        if self.production_task:
+            self.production_task.cancel()
+            try:
+                await self.production_task
+            except asyncio.CancelledError:
+                logger.info(f'Successfully canceled production from {self.broker}')
+            self.production_task = None
+        if self.connection:
+            await self.connection.close()
+            self.connection = None
@@ -0,0 +1,33 @@
+import asyncio
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class ScheduledProducer:
+    """Producer that submits each configured task to the pool on a fixed interval.
+
+    ``task_schedule`` maps a task name to options whose ``'schedule'`` entry
+    provides ``total_seconds()``.
+    """
+
+    def __init__(self, task_schedule):
+        self.scheduled_tasks = []
+        self.task_schedule = task_schedule
+
+    async def start_producing(self, pool):
+        """Create one perpetual asyncio task per entry of the schedule."""
+        for name, entry in self.task_schedule.items():
+            interval = entry['schedule'].total_seconds()
+            runner = asyncio.create_task(self.run_schedule_forever(name, interval, pool))
+            self.scheduled_tasks.append(runner)
+
+    def all_tasks(self):
+        """Return the list of asyncio tasks created by start_producing()."""
+        return self.scheduled_tasks
+
+    async def run_schedule_forever(self, task_name, per_seconds, pool):
+        """Sleep per_seconds, dispatch task_name to pool, and repeat without end."""
+        logger.info(f"Starting task runner for {task_name} with interval {per_seconds} seconds")
+        while True:
+            await asyncio.sleep(per_seconds)
+            logger.info(f"Sending scheduled task to worker: {task_name}")
+            # TODO: this will be JSON data with more supporting stuff
+            payload = {"task": task_name}
+            await pool.dispatch_task(payload)
+
+    async def shutdown(self):
+        """Cancel all schedule tasks, wait for them to finish and forget them."""
+        logger.info('Stopping scheduled tasks')
+        runners = self.scheduled_tasks
+        for runner in runners:
+            runner.cancel()
+        await asyncio.gather(*runners, return_exceptions=True)
+        self.scheduled_tasks = []
@@ -0,0 +1,129 @@
+import inspect
+import json
+import logging
+import time
+from uuid import uuid4
+
+from django_guid import get_guid
+
+from . import pg_bus_conn
+
+logger = logging.getLogger('awx.main.dispatch')
+
+
+def serialize_task(f):
+    """Return the dotted ``module.name`` path of the callable ``f``."""
+    parts = (f.__module__, f.__name__)
+    return '.'.join(parts)
+
+
+class task:
+    """
+    Used to decorate a function or class so that it can be run asynchronously
+    via the task dispatcher.  Tasks can be simple functions:
+
+    @task()
+    def add(a, b):
+        return a + b
+
+    ...or classes that define a `run` method:
+
+    @task()
+    class Adder:
+        def run(self, a, b):
+            return a + b
+
+    # Tasks can be run synchronously...
+    assert add(1, 1) == 2
+    assert Adder().run(1, 1) == 2
+
+    # ...or published to a queue:
+    add.apply_async([1, 1])
+    Adder.apply_async([1, 1])
+
+    # Tasks can also define a specific target queue or use the special fan-out queue tower_broadcast:
+
+    @task(queue='slow-tasks')
+    def snooze():
+        time.sleep(10)
+
+    @task(queue='tower_broadcast')
+    def announce():
+        print("Run this everywhere!")
+
+    # The special parameter bind_kwargs tells the main dispatcher process to add certain kwargs
+
+    @task(bind_kwargs=['dispatch_time'])
+    def print_time(dispatch_time=None):
+        print(f"Time I was dispatched: {dispatch_time}")
+    """
+
+    def __init__(self, queue=None, bind_kwargs=None):
+        # Both values are only stored here; they are used when the decorator is applied
+        self.queue = queue
+        self.bind_kwargs = bind_kwargs
+
+    def __call__(self, fn=None):
+        """Apply the decorator to ``fn`` and return the publishable object.
+
+        For a class, a new class with the same name is returned that also
+        inherits from PublisherMixin. For a function, the same function is
+        returned with ``name``, ``apply_async``, ``delay`` and
+        ``get_async_body`` attached as attributes.
+        """
+        # Local copies, so that the nested class below can close over them
+        queue = self.queue
+        bind_kwargs = self.bind_kwargs
+
+        class PublisherMixin(object):
+            # Replaced by the 'queue' entry of the namespace passed to type() below
+            queue = None
+
+            @classmethod
+            def delay(cls, *args, **kwargs):
+                """Shortcut for apply_async taking the task arguments directly."""
+                return cls.apply_async(args, kwargs)
+
+            @classmethod
+            def get_async_body(cls, args=None, kwargs=None, uuid=None, **kw):
+                """
+                Get the python dict to become JSON data in the pg_notify message
+                This same message gets passed over the dispatcher IPC queue to workers
+                If a task is submitted to a multiprocessing pool, skipping pg_notify, this might be used directly
+                """
+                # A fresh uuid4 string is generated unless the caller supplied one
+                task_id = uuid or str(uuid4())
+                args = args or []
+                kwargs = kwargs or {}
+                obj = {'uuid': task_id, 'args': args, 'kwargs': kwargs, 'task': cls.name, 'time_pub': time.time()}
+                guid = get_guid()
+                if guid:
+                    obj['guid'] = guid
+                if bind_kwargs:
+                    obj['bind_kwargs'] = bind_kwargs
+                # Extra keyword arguments are merged last, so they can override the keys above
+                obj.update(**kw)
+                return obj
+
+            @classmethod
+            def apply_async(cls, args=None, kwargs=None, queue=None, uuid=None, **kw):
+                """Publish the task and return the tuple (message dict, queue).
+
+                Raises ValueError when no queue is given here or on the decorator.
+                """
+                # NOTE(review): im_func looks like a Python 2 method attribute; on
+                # Python 3 this presumably always falls back to cls.queue - confirm.
+                queue = queue or getattr(cls.queue, 'im_func', cls.queue)
+                if not queue:
+                    msg = f'{cls.name}: Queue value required and may not be None'
+                    logger.error(msg)
+                    raise ValueError(msg)
+                obj = cls.get_async_body(args=args, kwargs=kwargs, uuid=uuid, **kw)
+                # The queue may be given as a callable that returns the queue name
+                if callable(queue):
+                    queue = queue()
+                # TODO: before sending, consult an app-specific callback if configured
+                # NOTE(review): pg_bus_conn is imported from the package root at the
+                # top of this module - confirm that the package really provides it.
+                with pg_bus_conn() as conn:
+                    conn.notify(queue, json.dumps(obj))
+                return (obj, queue)
+
+        # If the object we're wrapping *is* a class (e.g., RunJob), return
+        # a *new* class that inherits from the wrapped class *and* BaseTask
+        # In this way, the new class returned by our decorator is the class
+        # being decorated *plus* PublisherMixin so cls.apply_async() and
+        # cls.delay() work
+        bases = []
+        ns = {'name': serialize_task(fn), 'queue': queue}
+        if inspect.isclass(fn):
+            bases = list(fn.__bases__)
+            ns.update(fn.__dict__)
+        # For a plain function this class only serves as holder of the classmethods
+        cls = type(fn.__name__, tuple(bases + [PublisherMixin]), ns)
+        if inspect.isclass(fn):
+            return cls
+
+        # if the object being decorated is *not* a class (it's a Python
+        # function), make fn.apply_async and fn.delay proxy through to the
+        # PublisherMixin we dynamically created above
+        setattr(fn, 'name', cls.name)
+        setattr(fn, 'apply_async', cls.apply_async)
+        setattr(fn, 'delay', cls.delay)
+        setattr(fn, 'get_async_body', cls.get_async_body)
+        return fn
@@ -0,0 +1,29 @@
+import importlib
+
+
+def resolve_callable(task):
+    """
+    Transform a dotted notation task into an imported, callable function, e.g.,
+
+    awx.main.tasks.system.delete_inventory
+    awx.main.tasks.jobs.RunProjectUpdate
+
+    A string starting with "lambda:" is evaluated as Python source instead.
+
+    In AWX this also did validation that the method was marked as a task.
+    That is out of scope of this method now.
+    This is mainly used by the worker.
+
+    Returns the resolved object, or None if the module lacks the attribute.
+    Raises ValueError if the task has no dot, and whatever the import raises
+    if the module cannot be imported.
+    """
+    if task.startswith('lambda:'):
+        # SECURITY: eval() runs arbitrary code. This is only acceptable as
+        # long as task strings come from trusted sources - review before
+        # messages from an external broker can reach this branch.
+        return eval(task)
+
+    module_name, dot, target = task.rpartition('.')
+    if not dot:
+        # Was an opaque "not enough values to unpack" ValueError before
+        raise ValueError(f'Task {task!r} is not in dotted module.name notation')
+    module = importlib.import_module(module_name)
+    # getattr with a default covers the missing attribute case on its own
+    return getattr(module, target, None)
+
+
+def serialize_task(f) -> str:
+    """Build the dotted notation of callable ``f``, the inverse of resolve_callable."""
+    module_name, attribute = f.__module__, f.__name__
+    return '.'.join((module_name, attribute))
@@ -0,0 +1,208 @@
+import inspect
+import json
+import logging
+import os
+import signal
+import sys
+import time
+import traceback
+from queue import Empty as QueueEmpty
+
+from dispatcher.utils import resolve_callable
+
+logger = logging.getLogger(__name__)
+
+
+"""This module contains code ran by the worker subprocess"""
+
+
+class WorkerSignalHandler:
+    """Remembers in ``kill_now`` that the worker process received SIGINT.
+
+    Creating an instance resets SIGTERM to its default action and registers
+    ``exit_gracefully`` as the SIGINT handler.
+    """
+
+    def __init__(self):
+        self.kill_now = False
+        handlers = ((signal.SIGTERM, signal.SIG_DFL), (signal.SIGINT, self.exit_gracefully))
+        for signum, handler in handlers:
+            signal.signal(signum, handler)
+
+    def exit_gracefully(self, *args, **kwargs):
+        """Signal handler: log the request and raise the kill_now flag."""
+        logger.info('Received worker process exit signal')
+        self.kill_now = True
+
+
+class TaskWorker:
+    """
+    A worker implementation that deserializes task messages and runs native
+    Python code.
+
+    This mainly takes messages from the main process, imports, and calls them.
+
+    Original code existed at:
+    https://github.com/ansible/awx/blob/devel/awx/main/dispatch/worker/task.py
+    https://github.com/ansible/awx/blob/devel/awx/main/dispatch/worker/base.py
+
+    Major change from AWX is adding __init__ which now runs post-fork.
+    Previously this initialized pre-fork, making init logic unusable.
+    """
+
+    def __init__(self, worker_id):
+        self.worker_id = worker_id
+        # Remembered so that should_exit() can notice a changed parent process
+        self.ppid = os.getppid()
+        self.pid = os.getpid()
+        self.signal_handler = WorkerSignalHandler()
+
+    def should_exit(self) -> bool:
+        """Called before continuing the loop, something suspicious, return True, should exit"""
+        if os.getppid() != self.ppid:
+            logger.error('My parent PID changed, this process has been orphaned, like segfault or sigkill, exiting')
+            return True
+        if self.signal_handler.kill_now:
+            logger.error('Exiting main loop of worker process due to interrupt signal')
+            return True
+        return False
+
+    def get_uuid(self, message):
+        """Return the uuid of message, or '<unknown>' if it has none."""
+        return message.get('uuid', '<unknown>')
+
+    def run_callable(self, message):
+        """
+        Given a task message, import the correct Python code and run it.
+
+        Returns whatever the task returns; exceptions of the task propagate.
+        """
+        task = message['task']
+        args = message.get('args', [])
+        kwargs = message.get('kwargs', {})
+        _call = resolve_callable(task)
+        if inspect.isclass(_call):
+            # the callable is a class, e.g., RunJob; instantiate and
+            # return its `run()` method
+            _call = _call().run
+
+        # don't print kwargs, they often contain launch-time secrets
+        logger.debug(f'task {self.get_uuid(message)} starting {task}(*{args}) on worker {self.worker_id}')
+
+        return _call(*args, **kwargs)
+
+    def perform_work(self, message):
+        """
+        Import and run code for a task e.g.,
+
+        body = {
+            'args': [8],
+            'callbacks': [{
+                'args': [],
+                'kwargs': {},
+                'task': u'awx.main.tasks.system.handle_work_success'
+            }],
+            'errbacks': [{
+                'args': [],
+                'kwargs': {},
+                'task': 'awx.main.tasks.system.handle_work_error'
+            }],
+            'kwargs': {},
+            'task': u'awx.main.tasks.jobs.RunProjectUpdate'
+        }
+
+        Returns the return value of the task, or the exception it raised.
+        The 'errbacks' run when the task raised; the 'callbacks' run in
+        either case, after that.
+        """
+        # TODO: callback before starting task, previously ran
+        # settings.__clean_on_fork__()
+        result = None
+        try:
+            result = self.run_callable(message)
+        except Exception as exc:
+            # The exception object becomes the result reported for the task
+            result = exc
+
+            try:
+                if getattr(exc, 'is_awx_task_error', False):
+                    # Error caused by user / tracked in job output
+                    logger.warning("{}".format(exc))
+                else:
+                    task = message['task']
+                    args = message.get('args', [])
+                    # kwargs are left out here as well, they often contain
+                    # launch-time secrets; the message used to print them and
+                    # had unbalanced parentheses
+                    logger.exception('Worker failed to run task {}(*{})'.format(task, args))
+            except Exception:
+                # It's fairly critical that this code _not_ raise exceptions on logging
+                # If you configure external logging in a way that _it_ fails, there's
+                # not a lot we can do here; sys.stderr.write is a final hail mary
+                _, _, tb = sys.exc_info()
+                traceback.print_tb(tb)
+
+            for callback in message.get('errbacks', []) or []:
+                callback['uuid'] = self.get_uuid(message)
+                self.perform_work(callback)
+        finally:
+            # TODO: callback after running a task, previously ran
+            # kube_config._cleanup_temp_files()
+            pass
+
+        for callback in message.get('callbacks', []) or []:
+            callback['uuid'] = self.get_uuid(message)
+            self.perform_work(callback)
+        return result
+
+    # NOTE: on_start and on_stop were intentionally removed
+    # these were used for the consumer classes, but not the worker classes
+
+    # TODO: new WorkerTaskCall class to track timings and such
+    def get_finished_message(self, result, message, time_started):
+        """I finished the task in message, giving result. This is what I send back to traffic control."""
+        return {
+            "worker": self.worker_id,
+            "event": "done",
+            "result": result,
+            "uuid": self.get_uuid(message),
+            "time_started": time_started,
+            "time_finish": time.time(),
+        }
+
+    def get_ready_message(self):
+        """Message for traffic control, saying am entering the main work loop and am HOT TO GO"""
+        return {"worker": self.worker_id, "event": "ready"}
+
+    def get_shutdown_message(self):
+        """Message for traffic control, do not deliver any more mail to this address"""
+        return {"worker": self.worker_id, "event": "shutdown"}
+
+
+def work_loop(worker_id, queue, finished_queue):
+    """
+    Worker function that processes messages from the queue and sends confirmation
+    to the finished_queue once done.
+
+    Args:
+        worker_id: identifier included in every message sent back.
+        queue: source of task messages (dicts or JSON strings) and of the
+            "stop" string that ends the loop.
+        finished_queue: receives the ready, done and shutdown messages.
+    """
+    worker = TaskWorker(worker_id)
+    # TODO: add an app callback here to set connection name and things like that
+
+    finished_queue.put(worker.get_ready_message())
+
+    while True:
+        if worker.should_exit():
+            break
+
+        try:
+            # Wake up every second so that should_exit() is checked again
+            # while idle. A get() without timeout blocks without limit and
+            # never raises QueueEmpty, so the exit checks only ran when a
+            # message arrived.
+            message = queue.get(block=True, timeout=1)
+        except QueueEmpty:
+            continue  # nothing to do yet, go back to the exit checks
+        except Exception as exc:
+            logger.exception(f"Exception on worker {worker_id}, type {type(exc)}, error: {str(exc)}, exiting")
+            break
+
+        if not isinstance(message, dict):
+
+            if isinstance(message, str):
+                if message.lower() == "stop":
+                    logger.warning(f"Worker {worker_id} stopping.")
+                    break
+
+            try:
+                message = json.loads(message)
+            except Exception as e:
+                logger.error(f'Worker {worker.worker_id} could not process message {message}, error: {str(e)}')
+                break
+
+        logger.info(f'message to perform_work on {message}')
+        logger.info(f'the type {type(message)}')
+        time_started = time.time()
+        result = worker.perform_work(message)
+
+        # Indicate that the task is finished by putting a message in the finished_queue
+        finished_queue.put(worker.get_finished_message(result, message, time_started))
+
+    finished_queue.put(worker.get_shutdown_message())
+    logger.debug('Informed the pool manager that we have exited')
@@ -0,0 +1,16 @@
+---
+services:
+  msg_postgres:
+    image: "postgres:15"
+    container_name: dispatch_postgres
+    environment:
+      POSTGRES_DB: dispatch_db
+      POSTGRES_USER: dispatch
+      POSTGRES_PASSWORD: dispatching
+    healthcheck:
+      test: ["CMD", "pg_isready", "-U", "dispatch", "-d", "dispatch_db"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+    ports:
+      - "55777:5432"
@@ -0,0 +1,114 @@
+## Reference Designs
+
+### AWX dispatcher
+
+This is directly taken from the AWX dispatcher.
+
+https://github.com/ansible/awx/tree/devel/awx/main/dispatch
+
+This was introduced in:
+
+https://github.com/ansible/awx/pull/2266
+
+> ...much like the callback receiver implementation in 3.3.0 (on which this code is based), this entry point is a kombu.ConsumerMixin.
+
+### Kombu
+
+Kombu is a sub-package of celery.
+
+https://github.com/celery/kombu
+
+In its messaging module, it has `Producer` and `Consumer` classes.
+In mixins it has a `ConsumerMixin`, but no methods seem to have made it into AWX dispatch.
+
+This doesn't deal with worker pool management. It does have examples with `Worker` classes.
+These follow a similar contract with `process_task` here.
+
+### AMQP
+
+https://www.rabbitmq.com/tutorials/amqp-concepts
+
+This protocol deals with publishers, exchanges, queues, and consumers.
+
+### ProcessPoolExecutor
+
+The python `ProcessPoolExecutor` uses both a single call queue and a single results queue.
+
+https://github.com/python/cpython/blob/f1d33dbddd3496b062e1fbe024fb6d7b023a35f5/Lib/concurrent/futures/process.py#L217
+
+Some things it does are not applicable to the dispatcher here, because it strives to adhere
+to an existing contract around python futures that we do not care about.
+
+The local worker thread has many commonalities to the results thread being used here.
+It is most interesting to see the three-fold criteria for wakeups in that thread:
+
+```python
+result_item, is_broken, cause = self.wait_result_broken_or_wakeup()
+```
+
+By comparison, the results thread used here only has 1 condition.
+In some shutdown or recycle cases, it may be canceled.
+
+An important similarity is that the manager maintains an internal working queue.
+This is also done in this library, and diverges from AWX practice.
+In AWX dispatcher, a full queue may put messages into individual worker IPCs.
+This caused bad results, like delaying tasks due to long-running jobs,
+while the pool had many other workers free up in the mean time.
+
+## Alternative Architectures
+
+These are blue-sky ideas, which may not happen anytime soon,
+but they are described to help structure the app today so it can expand
+into these potential future roles.
+
+### Singleton task queue
+
+A major pivot from the AWX dispatcher is that we do not use 1 result queue per worker,
+but a single result queue for all workers, and each message includes a worker id.
+
+If you continue this pattern, then we would no longer have a call queue for each worker,
+and workers would just grab messages from the queue as they are available.
+
+The problem you encounter is that you will not know what worker started what task.
+If you do any "management" this is a problem. For instance, if you want a task
+to have a timeout, you need to know which worker to kill if it goes over its limit.
+
+There is a way to still consolidate the call queue while not losing these other features.
+When a worker receives a task, it can submit an ACK to the finished queue telling
+the main process that it has started a task, and which task it started.
+
+This isn't ultimately robust, if there is an error between getting the message and ACK,
+but this probably isn't a reasonable concern. As of now, this looks viable.
+
+### Persistent work manager
+
+Years ago, when AWX was having trouble with output processing bottlenecks,
+we stopped using the main dispatcher process to dispatch job events to workers.
+
+Essentially, any performance-sensitive data volumes should not go through the
+pool worker management system where data is passed through IPC queues.
+Doing this causes the main process to be a bottleneck.
+
+The solution was to have workers connect to a socket on their own.
+
+Nothing is wrong with this, it's just weird.
+None of the written facilities for pool management in dispatcher code is useful.
+Because of that, event processing diverged far from the rest of the dispatcher.
+
+Long-term vision here is that:
+ - a `@task` decorator may mark a task as persistent
+ - additional message types will need to be sent into the finished queue for
+   - analytics tracking, like how many messages were processed
+   - whether a particular resource being monitored has been closed
+
+The idea is that this would integrate what was prototyped in:
+
+https://github.com/AlanCoding/receptor-reporter/tree/devel
+
+That idea involved the main process more than the existing callback receiver.
+Each job has its own socket that has to be read from, so these will come and go.
+And a worker may manage more than 1 job at the same time, asynchronously.
+
+This also requires forking from what is now `dispatcher.main`.
+We could keep the pool (and add more features) but this requires
+an entirely different main loop.
@@ -0,0 +1,58 @@
+## Message Formats
+
+There are two different types of message formats.
+
+See the main design diagram for reference.
+
+### Broker Message Format
+
+This is the format when a client submits a task to be run, for example, to pg_notify.
+This contains JSON-serialized data.
+
+Example:
+
+```json
+{
+  "uuid": "9760671a-6261-45aa-881a-f66929ff9725",
+  "args": [4],
+  "kwargs": {},
+  "task": "awx.main.tasks.jobs.RunJob",
+  "time_pub": 1727354869.5126922,
+  "guid": "8f887a0c51f7450db3542c501ba83756"
+}
+```
+
+The `"task"` contains an importable task to run.
+
+If you are doing the control-and-reply for something, then the submitted
+message will also contain a `"reply_to"` key for the channel to send the reply to.
+
+The message sent to the reply channel will have some other purpose-specific information,
+like debug information.
+
+### Internal Worker Pool Format
+
+The main process and workers communicate through conventional IPC queues.
+This contains the messages to start running a job, of course.
+Ideally, this only contains the bare minimum, because tracking
+stats and lifetime are the job of the main process, not the worker.
+
+```json
+{
+  "args": [4],
+  "kwargs": {},
+  "task": "awx.main.tasks.jobs.RunJob"
+}
+```
+
+#### Worker to Main Process
+
+When the worker communicates information back to the main process,
+it must identify itself, and identify the event. For example:
+
+```json
+{
+    "worker": 3,
+    "event": "ready"
+}
+```
@@ -0,0 +1,194 @@
+
+## Roadmap, planning
+
+Here, we will maintain a list of features.
+These may be converted into issues later.
+The main goal is to not forget about them.
+
+I will break this down into 2 categories.
+The first category is pre-alpha.
+I will assume that all commits will be squashed and this will be
+moved to a new repo. At that point it will become public.
+Everything to do before it becomes public is a pre-alpha item.
+
+### Pre-alpha
+
+#### Track more data with new types
+
+The AWX dispatcher had a model where a task could have certain parameters.
+Like, imagine that we throw a generalized task timeout in.
+
+The design of this was a little choppy, because the `@task` decorator
+would declare these parameters.
+
+Yet these parameters would be sent in the pg_notify JSON data.
+
+This doesn't really fit the model that we want.
+We want it to be _impossible_ to run a task with parameters
+other than what are declared on `@task`
+
+Think of it this way, there are
+ - runtime arguments (or parameters)
+ - configuration parameters
+
+Something like a task timeout is a configuration parameter.
+This implies that when we register a method with `@task`
+we have to **save it in a registry**.
+
+When the dispatcher gets a message saying to run a task,
+the most correct thing to do is look that task up in the registry.
+This reduces the JSON data passed, and makes a more consistent
+source-of-truth.
+
+But this means that we need a registry, and way before that,
+we need to introduce a type for methods that will be called.
+
+Additionally, the next big objective is that we want
+detailed tracking timing for every time a task is called.
+This goes with the call, not the task.
+
+So the new types we probably want are:
+ - `Task`
+ - `Call`
+
+This is in addition to the `PoolWorker` which is the worker that
+runs the task.
+
+The `Call` will track the lifecycle of the call.
+The `PoolWorker` will reference the `Call` it is running.
+The `Call` will reference the `Task` it is a call of.
+
+We don't need/want to pass any of this through the IPC queue
+to the worker, we only want it for the main dispatcher.
+This will mainly be useful as we ask the dispatcher to respond
+with stats about what it has been running.
+
+Also... I see this as a mechanism to write integration tests.
+We can submit tasks, wait for a signal they finished,
+and then get the work history from the dispatcher as it ran those.
+
+We should look at the `Call` class as corresponding to a log record.
+This should have the call details, identifiers, and mostly be a
+record of the call lifecycle. This is mostly log-like, and should have
+mostly scalar type data of floats, strings, ids, etc.
+
+#### Finish integrating publisher logic
+
+The content existing in `dispatcher.publish` is mostly not connected.
+
+What's interesting here is that `dispatcher.publish` should import
+from the broker module.
+That gets hard to manage with multiple connections (ala Django).
+But some version of it we should do...
+
+#### Finish integrating the worker loop
+
+Overlapping with the publisher stuff, the `dispatcher.publish` should
+get the method name, and the args, import the method, and run it.
+
+This requires moving more code in from AWX
+
+https://github.com/ansible/awx/blob/devel/awx/main/dispatch/worker/task.py
+
+That has
+ - importing logic
+ - calling logic
+ - supporting stuff to include timings
+ - signal handling
+ - exception handling
+
+### Post-alpha
+
+#### Conditional skipping logic on publishing
+
+AWX uses sqlite3 for unit tests, which would error on async tasks.
+Because of this, it did not publish a message if `is_testing` was True.
+It's not reasonable for us to implement that same thing here, and
+we will likely need some callback approach.
+
+So the ask here is that we have some app-wide configuration,
+which can inspect a message _before publishing_ and take some action,
+or possibly cancel the NOTIFY.
+
+Probably not good coding practice generally, but probably useful.
+
+#### Feature branch to integrate with AWX
+
+Make AWX run using this library, this should be an early goal in this stage.
+
+#### Worker and Broker Self-Checks
+
+A moderate version of this was proposed in:
+
+https://github.com/ansible/awx/pull/14749
+
+In grand conclusion, there is no way to assure that the LISTEN connection
+is not dropped.
+Worse, when it is dropped, we may get no notification.
+Astonishingly, there appears to be no way around this.
+
+Because of this, the ultimate option of last resort must be taken.
+That means that we can only assure health of a connection or a worker
+by experiential means.
+
+To know if a connection works, you must publish a control message and receive it.
+To know if a worker is alive, you must send a message and receive a reply.
+
+Because of this knowledge, the new dispatcher library must jump straight to this eventuality.
+Implement checks for brokers and workers based on send-and-receive.
+This can be done fully with asyncio patterns.
+
+For the issues related to AWX 14749, we also need means to recycle connections
+in cases where we fail to receive check messages.
+
+#### Worker Allocation Cookbook
+
+Several very practical problems are not intended to ever be solved by the dispatcher.
+However, for someone using postgres or any other modern database,
+combined with the dispatcher, they have the ability to solve these problems.
+
+https://github.com/ansible/awx/issues/11997
+
+Breakdown of those problems:
+1. Have a node in the cluster, any node, process a task
+2. Have a periodic task run, anywhere in the cluster, at a certain frequency
+
+The solution for (1) is to add an entry to a table when submitting the task.
+Then depending on the use case, there are 2 decent options:
+ - broadcast a task asking any willing node to run the task, get lock, if lock is taken, bail
+ - run a periodic task that will use `select_for_update` to get entries and mark as received
+
+The solution for (2) in AWX uses the Solo model to track a `datetime`.
+This is self-obviously needed for the feature of _user_ schedules.
+
+#### Task Timeout
+
+When using `@task()` decorator, we add `timeout=5` to timeout in 5 seconds.
+
+A solution was drafted in the branch:
+
+https://github.com/ansible/awx/compare/devel...AlanCoding:awx:dispatcher_timeout
+
+#### Singleton Tasks
+
+AWX commonly used pg locks to prevent multiple workers running the same task,
+but a more efficient alternative is to never start those tasks.
+
+This proposes another argument to `@task()` decorator that makes the task exclusive.
+When another version of the task is already running, there are 2 sub-options we could do:
+ - wait for the existing task to finish before running the new task
+ - discard the new task
+
+The use cases for AWX mainly want the 2nd one.
+Idempotent tasks are used extremely heavily on schedules, meaning that
+when the dispatcher receives too many it should simply discard extras.
+
+#### Triggering Tasks from Tasks
+
+For the solution to (2) in the cookbook to be fully functional,
+it is best that tasks can directly start other tasks via messaging
+internal to the worker pool.
+
+This means passing some kind of object into the task being called
+where this object contains callbacks that can be used to
+trigger methods in the worker pool's finished watcher.
@@ -0,0 +1,48 @@
+[project.urls]
+Repository = "https://github.com/ansible/dispatcher"
+
+[build-system]
+requires = ["setuptools", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "dispatcher"
+version = "0.1.0"
+description = "An asyncio-based dispatcher for tasks in a worker pool."
+readme = "README.md"
+authors = [
+    { name = "Alan Rominger", email = "arominge@redhat.com" }
+]
+license = { text = "MIT" }
+keywords = ["asyncio", "multiprocessing", "dispatcher", "pg_notify", "python"]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+]
+
+[project.scripts]
+dispatcher-standalone = "dispatcher.cli:standalone"
+
+[tool.setuptools.packages.find]
+include = ["dispatcher*"]
+
+# You need psycopg, but this will not help you to install it
+
+# Linters copied from django-ansible-base, exceptions removed
+
+[tool.setuptools_scm]
+version_scheme = "calver-by-date"
+
+[tool.black]
+line-length = 160
+fast = true
+skip-string-normalization = true
+
+[tool.isort]
+profile = "black"
+line_length = 160
@@ -0,0 +1,3 @@
+pytest
+pytest-asyncio
+pytest-benchmark
@@ -0,0 +1,15 @@
+import pytest
+
+from dispatcher.main import DispatcherMain
+
+
+# List of channels to listen on
+# NOTE(review): 'test_channel2' is listed twice — confirm whether a distinct third channel was intended
+CHANNELS = ['test_channel', 'test_channel2', 'test_channel2']
+
+# Database connection details
+CONNECTION_STRING = "dbname=dispatch_db user=dispatch password=dispatching host=localhost port=55777"
+
+
+@pytest.fixture
+def pg_dispatcher():
+    return DispatcherMain({"producers": {"brokers": {"pg_notify": {"conninfo": CONNECTION_STRING}, "channels": CHANNELS}}, "pool": {"max_workers": 3}})
@@ -0,0 +1,36 @@
+import asyncio
+
+import pytest
+
+from dispatcher.brokers.pg_notify import publish_message
+
+
+# List of channels to listen on
+# NOTE(review): 'test_channel2' is listed twice — confirm whether a distinct third channel was intended
+CHANNELS = ['test_channel', 'test_channel2', 'test_channel2']
+
+# Database connection details
+CONNECTION_STRING = "dbname=dispatch_db user=dispatch password=dispatching host=localhost port=55777"
+
+
+@pytest.mark.asyncio
+async def test_run_and_then_shutdown(pg_dispatcher):
+    """Start the dispatcher, wait briefly, shut it down, and check that no task was counted as finished."""
+    await pg_dispatcher.start_working()
+    await asyncio.sleep(2)
+
+    await pg_dispatcher.shutdown()
+
+    # No message is published in this test, so finished_count is expected to remain 0
+    assert pg_dispatcher.pool.finished_count == 0
+
+
+@pytest.mark.asyncio
+async def test_run_lambda_function(pg_dispatcher):
+    """Publish one lambda message on 'test_channel' and check that exactly one task is counted as finished."""
+    await pg_dispatcher.start_working()
+    # presumably gives the dispatcher time to start listening before publishing — TODO confirm
+    await asyncio.sleep(1)
+
+    # TODO: do config better
+    publish_message('test_channel', 'lambda: "This worked!"', config={"conninfo": CONNECTION_STRING})
+    # presumably gives the pool time to run the task before shutdown — TODO confirm
+    await asyncio.sleep(1)
+
+    await pg_dispatcher.shutdown()
+
+    assert pg_dispatcher.pool.finished_count == 1
@@ -0,0 +1,3 @@
+# this is a good place to create config files, load them, and test that we get the params we expected
+# also a good place to take some configs and test initializing dispatcher objects with them
+# None of this has been done, but you could do it
@@ -0,0 +1,28 @@
+# write_messages.py
+import asyncio
+
+from dispatcher.brokers.pg_notify import publish_message
+
+# Database connection details
+CONNECTION_STRING = "dbname=dispatch_db user=dispatch password=dispatching host=localhost port=55777"
+
+
+TEST_MSGS = [
+    ('test_channel', 'lambda: __import__("time").sleep(1)'),
+    ('test_channel2', 'lambda: __import__("time").sleep(1)'),
+    ('test_channel', 'lambda: __import__("time").sleep(1)'),
+]
+
+
+async def main():
+    for channel, message in TEST_MSGS:
+        # Send the notification
+        publish_message(channel, message, config={'conninfo': CONNECTION_STRING})
+        # await send_notification(channel, message)
+    # send more than number of workers quickly
+    for i in range(15):
+        publish_message('test_channel', f'lambda: {i}', config={'conninfo': CONNECTION_STRING})
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+pytest`
	`2`	`+pytest-asyncio`
	`3`	`+pytest-benchmark`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# this is a good place to create config files, load them, and test that we get the params we expected`
	`2`	`+# also a good place to take some configs and test initializing dispatcher objects with them`
	`3`	`+# None of this has been done, but you could do it`