-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathnodes.py
343 lines (280 loc) · 10.3 KB
/
nodes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
"""Contains implementations of tasks and nodes following the node protocols."""
from __future__ import annotations
import hashlib
import inspect
import pickle
from os import stat_result
from pathlib import Path # noqa: TCH003
from typing import Any
from typing import Callable
from typing import TYPE_CHECKING
from _pytask._hashlib import hash_value
from _pytask.node_protocols import PNode
from _pytask.node_protocols import PPathNode
from _pytask.node_protocols import PTask
from _pytask.node_protocols import PTaskWithPath
from _pytask.path import hash_path
from _pytask.typing import no_default
from _pytask.typing import NoDefault
from attrs import define
from attrs import field
if TYPE_CHECKING:
from _pytask.models import NodeInfo
from _pytask.tree_util import PyTree
from _pytask.mark import Mark
# Names exported as the public interface of this module.
__all__ = ["PathNode", "PickleNode", "PythonNode", "Task", "TaskWithoutPath"]
@define(kw_only=True)
class TaskWithoutPath(PTask):
    """The class for tasks without a source file.

    Tasks may have no source file because

    - they are dynamically created in a REPL.
    - they are created in a Jupyter notebook.

    Attributes
    ----------
    name
        The name of the task.
    function
        The task function.
    depends_on
        A dictionary with string keys whose values are pytrees of nodes. It holds
        the dependencies of the task.
    produces
        A dictionary with string keys whose values are pytrees of nodes. It holds
        the products of the task.
    markers
        A list of markers attached to the task function.
    report_sections
        Reports with entries for when, what, and content.
    attributes: dict[Any, Any]
        A dictionary to store additional information of the task.

    """

    name: str
    function: Callable[..., Any]
    depends_on: dict[str, PyTree[PNode]] = field(factory=dict)
    produces: dict[str, PyTree[PNode]] = field(factory=dict)
    markers: list[Mark] = field(factory=list)
    report_sections: list[tuple[str, str, str]] = field(factory=list)
    attributes: dict[Any, Any] = field(factory=dict)

    @property
    def signature(self) -> str:
        """The unique signature of the node.

        It is the SHA-256 hex digest of the stringified hash of the task name.

        """
        raw_key = str(hash_value(self.name))
        return hashlib.sha256(raw_key.encode()).hexdigest()

    def state(self) -> str | None:
        """Return the state of the node.

        Returns
        -------
        str | None
            The SHA-256 hex digest of the source code of the task function or
            ``None`` if the source code cannot be retrieved.

        """
        try:
            source = inspect.getsource(self.function)
        except (OSError, TypeError):
            # ``inspect.getsource`` raises ``OSError`` when the source is not
            # available and ``TypeError`` for callables without Python source such
            # as built-in functions or ``functools.partial`` objects. Both cases
            # mean that no state can be derived from the source.
            return None
        else:
            return hashlib.sha256(source.encode()).hexdigest()

    def execute(self, **kwargs: Any) -> Any:
        """Execute the task by calling the function with the keyword arguments."""
        return self.function(**kwargs)
@define(kw_only=True)
class Task(PTaskWithPath):
    """A task that wraps a Python function defined in a file.

    Attributes
    ----------
    base_name
        The base name of the task.
    path
        Path to the file where the task was defined.
    function
        The task function.
    name
        The name of the task. It cannot be passed to the constructor and is set
        to ``<path>::<base_name>`` after initialization.
    depends_on
        The dependencies of the task stored in a dictionary with string keys.
    produces
        The products of the task stored in a dictionary with string keys.
    markers
        A list of markers attached to the task function.
    report_sections
        Reports with entries for when, what, and content.
    attributes: dict[Any, Any]
        A dictionary to store additional information of the task.

    """

    base_name: str
    path: Path
    function: Callable[..., Any]
    name: str = field(default="", init=False)
    depends_on: dict[str, PyTree[PNode]] = field(factory=dict)
    produces: dict[str, PyTree[PNode]] = field(factory=dict)
    markers: list[Mark] = field(factory=list)
    report_sections: list[tuple[str, str, str]] = field(factory=list)
    attributes: dict[Any, Any] = field(factory=dict)

    def __attrs_post_init__(self: Task) -> None:
        """Build the task name from the path and the base name if it is empty."""
        if self.name:
            return
        self.name = "::".join((self.path.as_posix(), self.base_name))

    @property
    def signature(self) -> str:
        """The unique signature of the node.

        The stringified hashes of the base name and the path are concatenated and
        digested with SHA-256.

        """
        hashed_parts = [str(hash_value(part)) for part in (self.base_name, self.path)]
        joined = "".join(hashed_parts)
        return hashlib.sha256(joined.encode()).hexdigest()

    def state(self) -> str | None:
        """Return the state of the node.

        The state is ``None`` if the task file does not exist. Otherwise, it is
        computed by ``hash_path`` from the path and its modification time.

        """
        if not self.path.exists():
            return None
        last_modified = self.path.stat().st_mtime
        return hash_path(self.path, last_modified)

    def execute(self, **kwargs: Any) -> Any:
        """Call the task function with the keyword arguments and return the result."""
        result = self.function(**kwargs)
        return result
@define(kw_only=True)
class PathNode(PPathNode):
    """The class for a node which is a path.

    Attributes
    ----------
    name
        Name of the node which makes it identifiable in the DAG.
    path
        The path to the file.

    """

    path: Path
    name: str = ""

    @property
    def signature(self) -> str:
        """The unique signature of the node.

        It is the SHA-256 hex digest of the stringified hash of the path.

        """
        raw_key = str(hash_value(self.path))
        return hashlib.sha256(raw_key.encode()).hexdigest()

    @classmethod
    def from_path(cls, path: Path) -> PathNode:
        """Instantiate class from path to file.

        The POSIX representation of the path is used as the name of the node.

        """
        return cls(name=path.as_posix(), path=path)

    def state(self) -> str | None:
        """Calculate the state of the node.

        The state is given by the modification timestamp.

        Returns
        -------
        str | None
            ``None`` if the path does not exist. For an :class:`os.stat_result`,
            the value returned by ``hash_path`` for the path and its modification
            time. For a dictionary, the value of the key ``"ETag"`` or ``"0"`` if
            the key is missing.

        Raises
        ------
        NotImplementedError
            If the object returned by ``path.stat()`` is neither an
            :class:`os.stat_result` nor a dictionary.

        """
        if not self.path.exists():
            return None

        stat = self.path.stat()
        if isinstance(stat, stat_result):
            # Reuse the result from above. A second call to ``stat()`` would cost
            # another filesystem access and could fail if the file is removed in
            # between.
            return hash_path(self.path, stat.st_mtime)
        if isinstance(stat, dict):
            # NOTE(review): presumably returned by path implementations for
            # non-local files - confirm against the supported path types.
            return stat.get("ETag", "0")
        msg = "Unknown stat object."
        raise NotImplementedError(msg)

    def load(self, is_product: bool = False) -> Path:  # noqa: ARG002
        """Load the value, which is the path itself. ``is_product`` is not used."""
        return self.path

    def save(self, value: bytes | str) -> None:
        """Save strings or bytes to file.

        Raises
        ------
        TypeError
            If ``value`` is neither a string nor bytes.

        """
        if isinstance(value, str):
            self.path.write_text(value)
        elif isinstance(value, bytes):
            self.path.write_bytes(value)
        else:
            msg = f"'PathNode' can only save 'str' and 'bytes', not {type(value)}"
            raise TypeError(msg)
@define(kw_only=True)
class PythonNode(PNode):
    """The class for a node which is a Python object.

    Attributes
    ----------
    name
        The name of the node.
    value
        The value of the node.
    hash
        Whether the value should be hashed to determine the state. Use ``True`` for
        objects that are hashable like strings and tuples. For dictionaries and other
        non-hashable objects, you need to provide a function that can hash these
        objects. The return value of the function is converted to a string.
    node_info
        The infos acquired while collecting the node.

    Examples
    --------
    To allow a :class:`PythonNode` to hash a dictionary, you need to pass your
    own hashing function. For example, from the :mod:`deepdiff` library.

    >>> from deepdiff import DeepHash
    >>> node = PythonNode(name="node", value={"a": 1}, hash=lambda x: DeepHash(x)[x])

    .. warning:: Hashing big objects can require some time.

    """

    name: str = ""
    value: Any | NoDefault = no_default
    # A custom hash function returns a hash value and not a boolean. The result is
    # passed to ``str`` in :meth:`state`.
    hash: bool | Callable[[Any], int | str] = False  # noqa: A003
    node_info: NodeInfo | None = None

    @property
    def signature(self) -> str:
        """The unique signature of the node.

        If ``node_info`` is set, the stringified hashes of its attributes
        ``arg_name``, ``path``, ``task_name`` and ``task_path`` are concatenated.
        Otherwise, the stringified hash of ``node_info`` itself is used. The result
        is digested with SHA-256.

        """
        raw_key = (
            "".join(
                str(hash_value(getattr(self.node_info, name)))
                for name in ("arg_name", "path", "task_name", "task_path")
            )
            if self.node_info
            else str(hash_value(self.node_info))
        )
        return hashlib.sha256(raw_key.encode()).hexdigest()

    def load(self, is_product: bool = False) -> Any:
        """Load the value.

        If ``is_product`` is true, the node itself is returned. If the value is
        another :class:`PythonNode`, the loaded value of that node is returned.
        Otherwise, the stored value is returned.

        """
        if is_product:
            return self
        if isinstance(self.value, PythonNode):
            return self.value.load()
        return self.value

    def save(self, value: Any) -> None:
        """Save the value by storing it on the node."""
        self.value = value

    def state(self) -> str | None:
        """Calculate state of the node.

        If ``hash = False``, the function returns ``"0"``, a constant hash value, so the
        :class:`PythonNode` is ignored when checking for a changed state of the task.

        If ``hash`` is a callable, then use this function to calculate a hash.

        If ``hash = True``, the builtin ``hash()`` function (`link
        <https://docs.python.org/3.11/library/functions.html?highlight=hash#hash>`_) is
        used for all types except strings.

        The hash for strings and bytes is calculated using hashlib because
        ``hash("asd")`` returns a different value every invocation since the hash of
        strings is salted with a random integer and it would confuse users. See
        :meth:`object.__hash__` for more information.

        """
        if self.hash:
            value = self.load()
            if callable(self.hash):
                return str(self.hash(value))
            return str(hash_value(value))
        return "0"
@define
class PickleNode(PPathNode):
    """A node for pickle files.

    Attributes
    ----------
    name
        Name of the node which makes it identifiable in the DAG.
    path
        The path to the file.

    """

    path: Path
    name: str = ""

    @property
    def signature(self) -> str:
        """The unique signature of the node.

        It is the SHA-256 hex digest of the stringified hash of the path.

        """
        raw_key = str(hash_value(self.path))
        return hashlib.sha256(raw_key.encode()).hexdigest()

    @classmethod
    def from_path(cls, path: Path) -> PickleNode:
        """Instantiate class from path to file.

        Raises
        ------
        ValueError
            If the path is not absolute.

        """
        if not path.is_absolute():
            msg = "Node must be instantiated from absolute path."
            raise ValueError(msg)
        return cls(name=path.as_posix(), path=path)

    def state(self) -> str | None:
        """Calculate the state of the node.

        Returns
        -------
        str | None
            ``None`` if the path does not exist. For an :class:`os.stat_result`,
            the value returned by ``hash_path`` for the path and its modification
            time. For a dictionary, the value of the key ``"ETag"`` or ``"0"`` if
            the key is missing.

        Raises
        ------
        NotImplementedError
            If the object returned by ``path.stat()`` is neither an
            :class:`os.stat_result` nor a dictionary.

        """
        if not self.path.exists():
            return None

        stat = self.path.stat()
        if isinstance(stat, stat_result):
            # Reuse the result from above. A second call to ``stat()`` would cost
            # another filesystem access and could fail if the file is removed in
            # between.
            return hash_path(self.path, stat.st_mtime)
        if isinstance(stat, dict):
            # NOTE(review): presumably returned by path implementations for
            # non-local files - confirm against the supported path types.
            return stat.get("ETag", "0")
        msg = "Unknown stat object."
        raise NotImplementedError(msg)

    def load(self, is_product: bool = False) -> Any:
        """Load the pickled value from the file.

        If ``is_product`` is true, the node itself is returned and the file is not
        read.

        """
        if is_product:
            return self
        with self.path.open("rb") as f:
            # SECURITY: Unpickling can execute arbitrary code. The file must come
            # from a trusted source.
            return pickle.load(f)  # noqa: S301

    def save(self, value: Any) -> None:
        """Pickle the value and write it to the file."""
        with self.path.open("wb") as f:
            pickle.dump(value, f)