Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

demo of what checkpointing plugins might look like #3535

Draft
wants to merge 15 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions docs/userguide/workflows/checkpoints.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,11 @@ This can save time and computational resources.

This is done in two ways:

* Firstly, *app caching* will allow reuse of results within the same run.
* Firstly, *app caching* will allow reuse of results and exceptions within
the same run.

* Building on top of that, *checkpointing* will store results on the filesystem
and reuse those results in later runs.
* Building on top of that, *checkpointing* will store results (but not
exceptions) on the filesystem and reuse those results in later runs.

.. _label-appcaching:

Expand Down Expand Up @@ -264,8 +265,7 @@ of the ``slow_double`` app.
# Wait for the results
[i.result() for i in d]

cpt_dir = dfk.checkpoint()
print(cpt_dir) # Prints the checkpoint dir
dfk.checkpoint()


Resuming from a checkpoint
Expand Down
3 changes: 3 additions & 0 deletions parsl/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from typing_extensions import Literal

from parsl.dataflow.dependency_resolvers import DependencyResolver
from parsl.dataflow.memoization import Memoizer
from parsl.dataflow.taskrecord import TaskRecord
from parsl.errors import ConfigurationError
from parsl.executors.base import ParslExecutor
Expand Down Expand Up @@ -101,6 +102,7 @@ class Config(RepresentationMixin, UsageInformation):
def __init__(self,
executors: Optional[Iterable[ParslExecutor]] = None,
app_cache: bool = True,
memoizer: Optional[Memoizer] = None,
checkpoint_files: Optional[Sequence[str]] = None,
checkpoint_mode: Union[None,
Literal['task_exit'],
Expand Down Expand Up @@ -131,6 +133,7 @@ def __init__(self,
self._executors: Sequence[ParslExecutor] = executors
self._validate_executors()

self.memoizer = memoizer
self.app_cache = app_cache
self.checkpoint_files = checkpoint_files
self.checkpoint_mode = checkpoint_mode
Expand Down
107 changes: 35 additions & 72 deletions parsl/dataflow/dflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import inspect
import logging
import os
import pickle
import random
import sys
import threading
Expand All @@ -30,7 +29,7 @@
from parsl.dataflow.dependency_resolvers import SHALLOW_DEPENDENCY_RESOLVER
from parsl.dataflow.errors import DependencyError, JoinError
from parsl.dataflow.futures import AppFuture
from parsl.dataflow.memoization import Memoizer
from parsl.dataflow.memoization import BasicMemoizer, Memoizer
from parsl.dataflow.rundirs import make_rundir
from parsl.dataflow.states import FINAL_FAILURE_STATES, FINAL_STATES, States
from parsl.dataflow.taskrecord import TaskRecord
Expand Down Expand Up @@ -96,8 +95,6 @@ def __init__(self, config: Config) -> None:

logger.info("Parsl version: {}".format(get_version()))

self.checkpoint_lock = threading.Lock()

self.usage_tracker = UsageTracker(self)
self.usage_tracker.send_start_message()

Expand Down Expand Up @@ -160,17 +157,29 @@ def __init__(self, config: Config) -> None:
self.monitoring.send((MessageType.WORKFLOW_INFO,
workflow_info))

# TODO: this configuration should become part of the particular memoizer code
# - this is a checkpoint-implementation-specific parameter
if config.checkpoint_files is not None:
checkpoint_files = config.checkpoint_files
elif config.checkpoint_files is None and config.checkpoint_mode is not None:
checkpoint_files = get_all_checkpoints(self.run_dir)
else:
checkpoint_files = []

self.memoizer = Memoizer(self, memoize=config.app_cache, checkpoint_files=checkpoint_files)
self.checkpointed_tasks = 0
# self.memoizer: Memoizer = BasicMemoizer(self, memoize=config.app_cache, checkpoint_files=checkpoint_files)
# the memoize flag might turn into the user choosing different instances
# of the Memoizer interface
self.memoizer: Memoizer
if config.memoizer is not None:
self.memoizer = config.memoizer
else:
self.memoizer = BasicMemoizer()

self.memoizer.start(dfk=self, memoize=config.app_cache, checkpoint_files=checkpoint_files, run_dir=self.run_dir)
self._checkpoint_timer = None
self.checkpoint_mode = config.checkpoint_mode

self._modify_checkpointable_tasks_lock = threading.Lock()
self.checkpointable_tasks: List[TaskRecord] = []

# this must be set before executors are added since add_executors calls
Expand All @@ -186,6 +195,10 @@ def __init__(self, config: Config) -> None:
self.add_executors(config.executors)
self.add_executors([parsl_internal_executor])

# TODO: these checkpoint modes should move into the memoizer implementation
# they're (probably?) checkpointer specific: for example the sqlite3-pure-memoizer
# doesn't have a notion of building up an in-memory checkpoint table that needs to be
# flushed on a separate policy
if self.checkpoint_mode == "periodic":
if config.checkpoint_period is None:
raise ConfigurationError("Checkpoint period must be specified with periodic checkpoint mode")
Expand All @@ -195,7 +208,7 @@ def __init__(self, config: Config) -> None:
except Exception:
raise ConfigurationError("invalid checkpoint_period provided: {0} expected HH:MM:SS".format(config.checkpoint_period))
checkpoint_period = (h * 3600) + (m * 60) + s
self._checkpoint_timer = Timer(self.checkpoint, interval=checkpoint_period, name="Checkpoint")
self._checkpoint_timer = Timer(self.invoke_checkpoint, interval=checkpoint_period, name="Checkpoint")

self.task_count = 0
self.tasks: Dict[int, TaskRecord] = {}
Expand Down Expand Up @@ -558,9 +571,9 @@ def handle_app_update(self, task_record: TaskRecord, future: AppFuture) -> None:
# Do we need to checkpoint now, or queue for later,
# or do nothing?
if self.checkpoint_mode == 'task_exit':
self.checkpoint(tasks=[task_record])
self.memoizer.checkpoint(tasks=[task_record])
elif self.checkpoint_mode in ('manual', 'periodic', 'dfk_exit'):
with self.checkpoint_lock:
with self._modify_checkpointable_tasks_lock:
self.checkpointable_tasks.append(task_record)
elif self.checkpoint_mode is None:
pass
Expand Down Expand Up @@ -1190,15 +1203,23 @@ def cleanup(self) -> None:

self.log_task_states()

# TODO: do this in the basic memoizer
# Checkpointing takes priority over the rest of the tasks
# checkpoint if any valid checkpoint method is specified
if self.checkpoint_mode is not None:
self.checkpoint()

# TODO: accesses to self.checkpointable_tasks should happen
# under a lock?
self.memoizer.checkpoint(self.checkpointable_tasks)

if self._checkpoint_timer:
logger.info("Stopping checkpoint timer")
self._checkpoint_timer.close()

logger.info("Closing memoizer")
self.memoizer.close()
logger.info("Closed memoizer")

# Send final stats
self.usage_tracker.send_end_message()
self.usage_tracker.close()
Expand Down Expand Up @@ -1247,68 +1268,10 @@ def cleanup(self) -> None:

logger.info("DFK cleanup complete")

def checkpoint(self, tasks: Optional[Sequence[TaskRecord]] = None) -> str:
    """Checkpoint the dfk incrementally to a checkpoint file.

    When called, every task that has been completed yet not
    checkpointed is checkpointed to a file.

    Kwargs:
        - tasks (List of task records) : List of task ids to checkpoint. Default=None
          if set to None, we iterate over all tasks held by the DFK.

    .. note::
        Checkpointing only works if memoization is enabled

    Returns:
        Checkpoint dir if checkpoints were written successfully.
        By default the checkpoints are written to the RUNDIR of the current
        run under RUNDIR/checkpoints/tasks.pkl
    """
    # The lock serializes concurrent checkpoint passes (e.g. the periodic
    # timer racing with task_exit checkpointing) and protects
    # self.checkpointable_tasks.
    with self.checkpoint_lock:
        if tasks:
            checkpoint_queue = tasks
        else:
            # No explicit task list given: drain the queue of tasks that
            # completed since the previous checkpoint pass.
            checkpoint_queue = self.checkpointable_tasks
            self.checkpointable_tasks = []

        checkpoint_dir = '{0}/checkpoint'.format(self.run_dir)
        checkpoint_tasks = checkpoint_dir + '/tasks.pkl'

        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir, exist_ok=True)

        count = 0

        # Open in append mode so successive checkpoint passes accumulate
        # records in the same file.
        with open(checkpoint_tasks, 'ab') as f:
            for task_record in checkpoint_queue:
                task_id = task_record['id']

                app_fu = task_record['app_fu']

                # Only completed, non-failed tasks are persisted; tasks
                # without a hashsum (uncacheable) are skipped.
                if app_fu.done() and app_fu.exception() is None:
                    hashsum = task_record['hashsum']
                    if not hashsum:
                        continue
                    t = {'hash': hashsum, 'exception': None, 'result': app_fu.result()}

                    # We are using pickle here since pickle dumps to a file in 'ab'
                    # mode behave like an incremental log.
                    pickle.dump(t, f)
                    count += 1
                    logger.debug("Task {} checkpointed".format(task_id))

        self.checkpointed_tasks += count

        if count == 0:
            if self.checkpointed_tasks == 0:
                # Nothing checkpointed in the whole run so far: likely a
                # configuration problem (caching disabled), so warn.
                logger.warning("No tasks checkpointed so far in this run. Please ensure caching is enabled")
            else:
                logger.debug("No tasks checkpointed in this pass.")
        else:
            logger.info("Done checkpointing {} tasks".format(count))

        return checkpoint_dir
def invoke_checkpoint(self) -> None:
    """Checkpoint every task queued in ``self.checkpointable_tasks``.

    Delegates the actual persistence to the configured memoizer, then
    empties the queue so each task is checkpointed at most once. The
    whole operation runs under the queue lock to avoid racing with
    ``handle_app_update`` appending new tasks.
    """
    with self._modify_checkpointable_tasks_lock:
        pending = self.checkpointable_tasks
        self.memoizer.checkpoint(pending)
        self.checkpointable_tasks = []

@staticmethod
def _log_std_streams(task_record: TaskRecord) -> None:
Expand Down
Loading
Loading