-
Notifications
You must be signed in to change notification settings - Fork 6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[WIP] feat: detect broken connection and reconnect broker #126
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,7 +6,9 @@ | |
from typing import Iterable, Optional, Union | ||
from uuid import uuid4 | ||
|
||
from ..protocols import Producer | ||
from ..brokers import get_broker | ||
from ..producers import BrokeredProducer | ||
from ..protocols import Producer, BrokerSelfCheckResult | ||
from . import control_tasks | ||
from .asyncio_tasks import ensure_fatal | ||
from .next_wakeup_runner import HasWakeup, NextWakeupRunner | ||
|
@@ -234,7 +236,31 @@ async def main(self) -> None: | |
await self.start_working() | ||
|
||
logger.info(f'Dispatcher node_id={self.node_id} running forever, or until shutdown command') | ||
await self.events.exit_event.wait() | ||
|
||
while True: | ||
try: | ||
await asyncio.wait_for(self.events.exit_event.wait(), 5.0) | ||
|
||
# exit_event has fired, process should exit | ||
break | ||
except asyncio.TimeoutError: | ||
logger.info(f'initiating broker self check for node-id {self.node_id}') | ||
for producer in self.producers: | ||
if isinstance(producer, BrokeredProducer): | ||
result = await producer.broker.get_self_check_result(self.node_id) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What's actually super non-obvious is that it might not be okay to run 2 listening tasks at the same time. There are actually a lot of hazards with connection management in async code, and every time I've fought this I've given up and gone back to only ever having a single task interact with psycopg. Separate from that, this wouldn't have a timeout anyway, which is the failure scenario. |
||
|
||
if result == BrokerSelfCheckResult.IN_PROGRESS: | ||
# the last self check still hasn't finished - we treat that as a connection failure | ||
result = BrokerSelfCheckResult.FAILURE | ||
|
||
match result: | ||
case BrokerSelfCheckResult.UNDECIDED | BrokerSelfCheckResult.SUCCESS: | ||
asyncio.create_task(producer.broker.initiate_self_check(self.node_id)) | ||
continue | ||
case BrokerSelfCheckResult.FAILURE: | ||
logger.error(f'broker self check failed for node-id {self.node_id}') | ||
producer.broker.reconnect() | ||
|
||
finally: | ||
await self.shutdown() | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import json | ||
import logging | ||
import sys | ||
|
||
from dispatcher import run_service | ||
from dispatcher.factories import get_publisher_from_settings, get_control_from_settings | ||
from dispatcher.utils import MODULE_METHOD_DELIMITER | ||
from dispatcher.config import setup | ||
|
||
from time import sleep | ||
|
||
from tests.data.methods import sleep_function, sleep_discard, task_has_timeout, hello_world_binder | ||
|
||
|
||
# Setup the global config from the settings file shared with the service | ||
setup(file_path='dispatcher.yml') | ||
|
||
broker = get_publisher_from_settings() | ||
|
||
def main(): | ||
run_service() | ||
|
||
if __name__ == "__main__": | ||
logging.basicConfig(level='DEBUG', stream=sys.stdout) | ||
main() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This would result in all nodes sending their health check messages to all connected nodes. This isn't a good idea for scaling, and it also wouldn't work unless the message carried some other identifying information that a node could use to verify that the message came from itself. This could be addressed by adding a UUID to the message; including an expected UUID might be a good idea anyway, to rule out any possibility of getting wires crossed.