Skip to content

Commit 38b3141

Browse files
authored
feat: RDF4J store (#3316)
* fix: handle graph_name when it's a str
* feat: wip RDF4JStore. Implements:
  - init/open
  - close
  - add
  - addN
  - contexts
  - add_graph
  - remove_graph
  - __len__
* feat: RDF4J Store now supports handling namespaces and prefixes
* feat: RDF4J Store triples and quads querying
* feat: ensure no bnodes are used to cross document/query boundaries
* chore: formatting
* test: improve e2e test speed by reusing the same container and cleaning up the repo between each test
* feat: add RDF4JStore remove
* feat: add RDF4JStore triples_choices tests
* feat: add RDF4JStore SPARQL query and update tests
* chore: fix mypy issues
* test: error handling on client fixture
* test: mark testcontainer tests and put test imports behind the has_httpx flag
* build: remove upper python bound, bump testcontainers, and revert back to stable v7 poetry.lock
* test: put testcontainer tests behind a flag for unsupported python versions
* test: install rdf4j extras for python 3.9 and above
* ci: skip testcontainer tests on non-linux runners
1 parent 5541907 commit 38b3141

File tree

20 files changed

+1442
-637
lines changed

20 files changed

+1442
-637
lines changed

.github/workflows/validate.yaml

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ env:
1212
POETRY_CACHE_DIR: ${{ github.workspace }}/.var/cache/pypoetry
1313
PIP_CACHE_DIR: ${{ github.workspace }}/.var/cache/pip
1414

15-
1615
concurrency:
1716
group: ${{ github.workflow }}-${{ github.ref }}
1817
cancel-in-progress: true
@@ -52,7 +51,7 @@ jobs:
5251
PREPARATION: "sudo apt-get install -y firejail"
5352
extensive-tests: true
5453
TOX_TEST_HARNESS: "firejail --net=none --"
55-
TOX_PYTEST_EXTRA_ARGS: "-m 'not webtest'"
54+
TOX_PYTEST_EXTRA_ARGS: "-m 'not (testcontainer or webtest)'"
5655
steps:
5756
- uses: actions/checkout@v4
5857
- name: Cache XDG_CACHE_HOME
@@ -84,6 +83,13 @@ jobs:
8483
shell: bash
8584
run: |
8685
${{ matrix.PREPARATION }}
86+
- name: Set testcontainer exclusion for non-Linux
87+
if: ${{ matrix.os != 'ubuntu-latest' }}
88+
shell: bash
89+
run: |
90+
if [ -z "${{ matrix.TOX_PYTEST_EXTRA_ARGS }}" ]; then
91+
echo "TOX_PYTEST_EXTRA_ARGS=-m 'not testcontainer'" >> $GITHUB_ENV
92+
fi
8793
- name: Run validation
8894
shell: bash
8995
run: |
@@ -97,7 +103,7 @@ jobs:
97103
gha:validate
98104
env:
99105
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
100-
TOX_PYTEST_EXTRA_ARGS: ${{ matrix.TOX_PYTEST_EXTRA_ARGS }}
106+
TOX_PYTEST_EXTRA_ARGS: ${{ matrix.TOX_PYTEST_EXTRA_ARGS || env.TOX_PYTEST_EXTRA_ARGS }}
101107
TOX_TEST_HARNESS: ${{ matrix.TOX_TEST_HARNESS }}
102108
TOX_EXTRA_COMMAND: ${{ matrix.TOX_EXTRA_COMMAND }}
103109
- uses: actions/upload-artifact@v4

poetry.lock

Lines changed: 489 additions & 537 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,7 @@ rdfs2dot = 'rdflib.tools.rdfs2dot:main'
4141
rdfgraphisomorphism = 'rdflib.tools.graphisomorphism:main'
4242

4343
[tool.poetry.dependencies]
44-
# TODO: temporarily add new python version constraints for testcontainers
45-
# We can remove the upper bound once testcontainers releases a new version
46-
# https://github.com/testcontainers/testcontainers-python/pull/909
47-
python = ">=3.9.2, <4.0"
44+
python = ">=3.8.1"
4845
isodate = {version=">=0.7.2,<1.0.0", python = "<3.11"}
4946
pyparsing = ">=2.1.0,<4"
5047
berkeleydb = {version = "^18.1.0", optional = true}
@@ -67,7 +64,7 @@ coverage = {version = "^7.0.1", extras = ["toml"]}
6764
types-setuptools = ">=68.0.0.3,<72.0.0.0"
6865
setuptools = ">=68,<72"
6966
wheel = ">=0.42,<0.46"
70-
testcontainers = "^4.13.2"
67+
testcontainers = {version = "^4.13.2", python = ">=3.9.2"}
7168

7269
[tool.poetry.group.docs.dependencies]
7370
typing-extensions = "^4.11.0"

rdflib/contrib/rdf4j/client.py

Lines changed: 40 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,16 @@
3030
build_sparql_query_accept_header,
3131
build_spo_param,
3232
rdf_payload_to_stream,
33+
validate_graph_name,
34+
validate_no_bnodes,
3335
)
3436
from rdflib.graph import DATASET_DEFAULT_GRAPH_ID, Dataset, Graph
3537
from rdflib.query import Result
3638
from rdflib.term import IdentifiedNode, Literal, URIRef
3739

38-
SubjectType = t.Union[IdentifiedNode, None]
40+
SubjectType = t.Union[URIRef, None]
3941
PredicateType = t.Union[URIRef, None]
40-
ObjectType = t.Union[IdentifiedNode, Literal, None]
42+
ObjectType = t.Union[URIRef, Literal, None]
4143

4244

4345
@dataclass(frozen=True)
@@ -198,8 +200,14 @@ def identifier(self):
198200
@staticmethod
199201
def _build_graph_name_params(graph_name: URIRef | str):
200202
params = {}
201-
if isinstance(graph_name, URIRef) and graph_name == DATASET_DEFAULT_GRAPH_ID:
202-
# Do nothing; GraphDB does not work with `?default=`, which is the default
203+
if (
204+
isinstance(graph_name, URIRef)
205+
and graph_name == DATASET_DEFAULT_GRAPH_ID
206+
or isinstance(graph_name, str)
207+
and graph_name == str(DATASET_DEFAULT_GRAPH_ID)
208+
):
209+
# Do nothing; GraphDB does not work with `?default=`
210+
# (note the trailing equal character), which is the default
203211
# behavior of httpx when setting the param value to an empty string.
204212
# httpx completely omits query parameters whose values are `None`, so that's
205213
# not an option either.
@@ -231,6 +239,7 @@ def get(self, graph_name: URIRef | str) -> Graph:
231239
"""
232240
if not graph_name:
233241
raise ValueError("Graph name must be provided.")
242+
validate_graph_name(graph_name)
234243
headers = {
235244
"Accept": self._content_type,
236245
}
@@ -260,6 +269,7 @@ def add(self, graph_name: URIRef | str, data: str | bytes | BinaryIO | Graph):
260269
"""
261270
if not graph_name:
262271
raise ValueError("Graph name must be provided.")
272+
validate_graph_name(graph_name)
263273
stream, should_close = rdf_payload_to_stream(data)
264274
headers = {
265275
"Content-Type": self._content_type,
@@ -290,6 +300,7 @@ def overwrite(self, graph_name: URIRef | str, data: str | bytes | BinaryIO | Gra
290300
"""
291301
if not graph_name:
292302
raise ValueError("Graph name must be provided.")
303+
validate_graph_name(graph_name)
293304
stream, should_close = rdf_payload_to_stream(data)
294305
headers = {
295306
"Content-Type": self._content_type,
@@ -318,6 +329,7 @@ def clear(self, graph_name: URIRef | str):
318329
"""
319330
if not graph_name:
320331
raise ValueError("Graph name must be provided.")
332+
validate_graph_name(graph_name)
321333
params = self._build_graph_name_params(graph_name) or None
322334
response = self.http_client.delete(self._build_url(graph_name), params=params)
323335
response.raise_for_status()
@@ -412,9 +424,7 @@ def health(self) -> bool:
412424
f"Repository {self._identifier} is not healthy. {err.response.status_code} - {err.response.text}"
413425
)
414426

415-
def size(
416-
self, graph_name: IdentifiedNode | Iterable[IdentifiedNode] | str | None = None
417-
) -> int:
427+
def size(self, graph_name: URIRef | Iterable[URIRef] | str | None = None) -> int:
418428
"""The number of statements in the repository or in the specified graph name.
419429
420430
Parameters:
@@ -431,6 +441,7 @@ def size(
431441
Raises:
432442
RepositoryFormatError: Fails to parse the repository size.
433443
"""
444+
validate_graph_name(graph_name)
434445
params: dict[str, str] = {}
435446
build_context_param(params, graph_name)
436447
response = self.http_client.get(
@@ -541,12 +552,16 @@ def get(
541552
subj: SubjectType = None,
542553
pred: PredicateType = None,
543554
obj: ObjectType = None,
544-
graph_name: IdentifiedNode | Iterable[IdentifiedNode] | str | None = None,
555+
graph_name: URIRef | Iterable[URIRef] | str | None = None,
545556
infer: bool = True,
546557
content_type: str | None = None,
547558
) -> Graph | Dataset:
548559
"""Get RDF statements from the repository matching the filtering parameters.
549560
561+
!!! Note
562+
The terms for `subj`, `pred`, `obj` or `graph_name` cannot be
563+
[`BNodes`][rdflib.term.BNode].
564+
550565
Parameters:
551566
subj: Subject of the statement to filter by, or `None` to match all.
552567
pred: Predicate of the statement to filter by, or `None` to match all.
@@ -568,6 +583,7 @@ def get(
568583
A [`Graph`][rdflib.graph.Graph] or [`Dataset`][rdflib.graph.Dataset] object
569584
with the repository namespace prefixes bound to it.
570585
"""
586+
validate_no_bnodes(subj, pred, obj, graph_name)
571587
if content_type is None:
572588
content_type = "application/n-quads"
573589
headers = {"Accept": content_type}
@@ -632,7 +648,7 @@ def upload(
632648
def overwrite(
633649
self,
634650
data: str | bytes | BinaryIO | Graph | Dataset,
635-
graph_name: IdentifiedNode | Iterable[IdentifiedNode] | str | None = None,
651+
graph_name: URIRef | Iterable[URIRef] | str | None = None,
636652
base_uri: str | None = None,
637653
content_type: str | None = None,
638654
):
@@ -652,7 +668,7 @@ def overwrite(
652668
`application/n-quads` when the value is `None`.
653669
"""
654670
stream, should_close = rdf_payload_to_stream(data)
655-
671+
validate_graph_name(graph_name)
656672
try:
657673
headers = {"Content-Type": content_type or "application/n-quads"}
658674
params: dict[str, str] = {}
@@ -675,10 +691,14 @@ def delete(
675691
subj: SubjectType = None,
676692
pred: PredicateType = None,
677693
obj: ObjectType = None,
678-
graph_name: IdentifiedNode | Iterable[IdentifiedNode] | str | None = None,
694+
graph_name: URIRef | Iterable[URIRef] | str | None = None,
679695
) -> None:
680696
"""Deletes statements from the repository matching the filtering parameters.
681697
698+
!!! Note
699+
The terms for `subj`, `pred`, `obj` or `graph_name` cannot be
700+
[`BNodes`][rdflib.term.BNode].
701+
682702
Parameters:
683703
subj: Subject of the statement to filter by, or `None` to match all.
684704
pred: Predicate of the statement to filter by, or `None` to match all.
@@ -690,6 +710,7 @@ def delete(
690710
To query just the default graph, use
691711
[`DATASET_DEFAULT_GRAPH_ID`][rdflib.graph.DATASET_DEFAULT_GRAPH_ID].
692712
"""
713+
validate_no_bnodes(subj, pred, obj, graph_name)
693714
params: dict[str, str] = {}
694715
build_context_param(params, graph_name)
695716
build_spo_param(params, subj, pred, obj)
@@ -808,9 +829,7 @@ def ping(self):
808829
f"Transaction ping failed: {response.status_code} - {response.text}"
809830
)
810831

811-
def size(
812-
self, graph_name: IdentifiedNode | Iterable[IdentifiedNode] | str | None = None
813-
):
832+
def size(self, graph_name: URIRef | Iterable[URIRef] | str | None = None):
814833
"""The number of statements in the repository or in the specified graph name.
815834
816835
Parameters:
@@ -828,6 +847,7 @@ def size(
828847
RepositoryFormatError: Fails to parse the repository size.
829848
"""
830849
self._raise_for_closed()
850+
validate_graph_name(graph_name)
831851
params = {"action": "SIZE"}
832852
build_context_param(params, graph_name)
833853
response = self.repo.http_client.put(self.url, params=params)
@@ -913,12 +933,16 @@ def get(
913933
subj: SubjectType = None,
914934
pred: PredicateType = None,
915935
obj: ObjectType = None,
916-
graph_name: IdentifiedNode | Iterable[IdentifiedNode] | str | None = None,
936+
graph_name: URIRef | Iterable[URIRef] | str | None = None,
917937
infer: bool = True,
918938
content_type: str | None = None,
919939
) -> Graph | Dataset:
920940
"""Get RDF statements from the repository matching the filtering parameters.
921941
942+
!!! Note
943+
The terms for `subj`, `pred`, `obj` or `graph_name` cannot be
944+
[`BNodes`][rdflib.term.BNode].
945+
922946
Parameters:
923947
subj: Subject of the statement to filter by, or `None` to match all.
924948
pred: Predicate of the statement to filter by, or `None` to match all.
@@ -940,6 +964,7 @@ def get(
940964
A [`Graph`][rdflib.graph.Graph] or [`Dataset`][rdflib.graph.Dataset] object
941965
with the repository namespace prefixes bound to it.
942966
"""
967+
validate_no_bnodes(subj, pred, obj, graph_name)
943968
if content_type is None:
944969
content_type = "application/n-quads"
945970
headers = {"Accept": content_type}

rdflib/contrib/rdf4j/util.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
from rdflib.graph import DATASET_DEFAULT_GRAPH_ID, Dataset, Graph
99
from rdflib.plugins.sparql.processor import prepareQuery
10-
from rdflib.term import IdentifiedNode, URIRef
10+
from rdflib.term import BNode, IdentifiedNode, URIRef
1111

1212
if t.TYPE_CHECKING:
1313
from rdflib.contrib.rdf4j.client import ObjectType, PredicateType, SubjectType
@@ -151,3 +151,32 @@ def build_sparql_query_accept_header(query: str, headers: dict[str, str]):
151151
headers["Accept"] = "application/n-triples"
152152
else:
153153
raise ValueError(f"Unsupported query type: {prepared_query.algebra.name}")
154+
155+
156+
def validate_graph_name(graph_name: URIRef | t.Iterable[URIRef] | str | None):
    """Reject blank nodes used as a graph name.

    Parameters:
        graph_name: A single graph name, an iterable of graph names, or `None`.

    Raises:
        ValueError: If `graph_name` is a `BNode`, or is an iterable that
            yields at least one `BNode`.
    """
    message = "Graph name must not be a BNode."
    if isinstance(graph_name, BNode):
        raise ValueError(message)
    if not isinstance(graph_name, t.Iterable):
        # `None` and other non-iterables have nothing further to inspect.
        return
    # NOTE: this walks the iterable, so a one-shot iterator passed in is
    # advanced by this check.
    for candidate in graph_name:
        if isinstance(candidate, BNode):
            raise ValueError(message)
163+
164+
165+
def validate_no_bnodes(
    subj: SubjectType,
    pred: PredicateType,
    obj: ObjectType,
    graph_name: URIRef | t.Iterable[URIRef] | str | None,
) -> None:
    """Validate that none of the statement terms or graph names are BNodes.

    Parameters:
        subj: Subject term to check, or `None`.
        pred: Predicate term to check, or `None`.
        obj: Object term to check, or `None`.
        graph_name: A single graph name, an iterable of graph names, or `None`.

    Raises:
        ValueError: If `subj`, `pred` or `obj` is a `BNode`, or if
            `graph_name` is rejected by `validate_graph_name`.
    """
    if any(isinstance(term, BNode) for term in (subj, pred, obj)):
        raise ValueError(
            "Subject, predicate, and object must not be a BNode: "
            f"{subj}, {pred}, {obj}"
        )
    # Graph names (single value or iterable) are checked separately so that a
    # BNode graph name is reported with the graph-name-specific error rather
    # than the subject/predicate/object message, which never showed the
    # offending graph name.
    validate_graph_name(graph_name)

0 commit comments

Comments
 (0)