diff --git a/actions/s2dm-publish/action.yml b/actions/s2dm-publish/action.yml index 0f8eb376..817d12c9 100644 --- a/actions/s2dm-publish/action.yml +++ b/actions/s2dm-publish/action.yml @@ -262,7 +262,7 @@ runs: working-directory: ${{ inputs.repository-path }} shell: bash run: | - CMD="s2dm generate schema-rdf -s ${{ inputs.spec-path }} -o ${{ github.workspace }}/.artifacts/schema-rdf --namespace '${{ inputs.schema-rdf-namespace }}'" + CMD="s2dm generate schema-rdf -s ${{ inputs.spec-path }} -o ${{ github.workspace }}/.artifacts/schema-rdf --namespace '${{ inputs.schema-rdf-namespace }}' --output-formats nt,ttl,jsonld" [ -n "${{ inputs.schema-rdf-prefix }}" ] && CMD="$CMD --prefix '${{ inputs.schema-rdf-prefix }}'" [ -n "${{ inputs.schema-rdf-language }}" ] && CMD="$CMD --language '${{ inputs.schema-rdf-language }}'" eval "$CMD" diff --git a/docs-gen/content/docs/tools/cli.md b/docs-gen/content/docs/tools/cli.md index 72ea15bf..0ba8945c 100644 --- a/docs-gen/content/docs/tools/cli.md +++ b/docs-gen/content/docs/tools/cli.md @@ -1540,6 +1540,234 @@ s2dm export avro schema --help s2dm export avro protocol --help ``` +## Generate Commands + +### Schema RDF + +The `generate schema-rdf` command materializes a GraphQL schema as RDF triples using the s2dm ontology. The generated RDF mirrors the schema's structure (types, fields, enums, unions, interfaces) as a semantic graph that can be queried with SPARQL. + +#### Usage + +```bash +s2dm generate schema-rdf -s -o --namespace +``` + +#### Options + +- `-s, --schema PATH`: GraphQL schema file, directory, or URL (required, can be specified multiple times) +- `-o, --output DIR`: Output directory for RDF artifacts (required) +- `--namespace URI`: Namespace URI for concept URIs (required) +- `--prefix TEXT`: Prefix for concept URIs (default: `ns`) +- `--language TEXT`: BCP 47 language tag for prefLabels (default: `en`) +- `--output-formats TEXT`: Comma-separated output formats (default: `nt,turtle`). 
Supported: `json-ld` (or `jsonld`), `nt`, `turtle` (or `ttl`) + +#### Examples + +Generate sorted n-triples and Turtle (default): + +```bash +s2dm generate schema-rdf \ + -s spec/ \ + -o ./rdf-output \ + --namespace "https://covesa.org/s2dm/mydomain#" +``` + +Generate all formats including JSON-LD (for releases): + +```bash +s2dm generate schema-rdf \ + -s spec/ \ + -o ./rdf-output \ + --namespace "https://covesa.org/s2dm/mydomain#" \ + --output-formats nt,turtle,json-ld +``` + +Generate only n-triples (for git): + +```bash +s2dm generate schema-rdf \ + -s spec/ \ + -o ./rdf-output \ + --namespace "https://covesa.org/s2dm/mydomain#" \ + --output-formats nt +``` + +#### Output Formats + +| Format | Alias | Extension | Description | +|--------|-------|-----------|-------------| +| `nt` | | `.nt` | Sorted n-triples (deterministic, git-friendly diffs) | +| `turtle` | `ttl` | `.ttl` | Turtle (human-readable, for consumption) | +| `json-ld` | `jsonld` | `.jsonld` | JSON-LD (for web and linked data tooling) | + +The `nt` format is special-cased to produce lexicographically sorted lines, ensuring deterministic output suitable for version control. 
+ +#### Ontology Mapping + +The s2dm ontology maps GraphQL SDL elements to RDF as follows: + +- **Object types**: `rdf:type skos:Concept, s2dm:ObjectType` +- **Fields**: `rdf:type skos:Concept, s2dm:Field` with `s2dm:hasOutputType` and `s2dm:usesTypeWrapperPattern` +- **Enum types**: `rdf:type skos:Concept, s2dm:EnumType` with `s2dm:hasEnumValue` +- **Enum values**: `rdf:type skos:Concept, s2dm:EnumValue` +- **Interface types**: `rdf:type skos:Concept, s2dm:InterfaceType` +- **Input object types**: `rdf:type skos:Concept, s2dm:InputObjectType` +- **Union types**: `rdf:type skos:Concept, s2dm:UnionType` with `s2dm:hasUnionMember` +- **Built-in scalars**: `s2dm:Int`, `s2dm:Float`, `s2dm:String`, `s2dm:Boolean`, `s2dm:ID` + +## Query Commands + +The `query` command group provides predefined SPARQL queries for traversing and analysing an RDF-materialized schema. Each command can either load a pre-generated RDF file or materialize on-the-fly from a GraphQL schema. + +### Input Options (shared by all query commands) + +Provide **one** of the following: + +- `--rdf PATH`: Path to a pre-generated `.nt` or `.ttl` file +- `-s, --schema PATH` + `--namespace URI`: Materialize from GraphQL schema on-the-fly + +Additional option: + +- `--json`: Output results as JSON instead of a table + +### fields-outputting-enum + +Find all fields whose output type is an enum type. 
+ +```bash +# From a pre-generated file +s2dm query fields-outputting-enum --rdf schema.nt + +# From a GraphQL schema +s2dm query fields-outputting-enum -s spec/ --namespace "https://example.org/#" + +# JSON output +s2dm query fields-outputting-enum --rdf schema.nt --json +``` + +**Example output:** + +``` + fields-outputting-enum +┏━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┓ +┃ field ┃ enumType ┃ +┡━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━┩ +│ Cabin.kind │ CabinKindEnum │ +│ InCabinArea2x2.column │ TwoColumnsInCabinEnum │ +│ InCabinArea2x2.row │ TwoRowsInCabinEnum │ +└───────────────────────┴───────────────────────┘ +``` + +### object-types-with-fields + +List all object types and their fields. + +```bash +s2dm query object-types-with-fields --rdf schema.nt +``` + +**Example output:** + +``` + object-types-with-fields +┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┓ +┃ objectType ┃ field ┃ +┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━┩ +│ Cabin │ Cabin.doors │ +│ Cabin │ Cabin.kind │ +│ Door │ Door.isOpen │ +│ Door │ Door.window │ +│ Window │ Window.isTinted │ +└────────────────┴───────────────────────┘ +``` + +### list-type-fields + +Find all fields that use a list-like type wrapper pattern (`list`, `nonNullList`, `listOfNonNull`, `nonNullListOfNonNull`). + +```bash +s2dm query list-type-fields --rdf schema.nt --json +``` + +**Example JSON output:** + +```json +[ + { + "field": "https://example.org/my-domain#Cabin.doors", + "pattern": "https://covesa.global/models/s2dm#list" + } +] +``` + +### Example SPARQL Queries + +The CLI queries above are powered by SPARQL. Below are the raw queries for reference, which can also be run against any SPARQL endpoint or rdflib graph loaded with the materialized RDF. + +**Find all fields that output an enum type:** + +```sparql +PREFIX s2dm: + +SELECT ?field ?enumType +WHERE { + ?field a s2dm:Field ; + s2dm:hasOutputType ?enumType . + ?enumType a s2dm:EnumType . 
+} +ORDER BY ?field +``` + +**List all object types and their fields:** + +```sparql +PREFIX s2dm: + +SELECT ?objectType ?field +WHERE { + ?objectType a s2dm:ObjectType ; + s2dm:hasField ?field . +} +ORDER BY ?objectType ?field +``` + +**Find fields using list wrappers:** + +```sparql +PREFIX s2dm: + +SELECT ?field ?pattern +WHERE { + ?field a s2dm:Field ; + s2dm:usesTypeWrapperPattern ?pattern . + FILTER(?pattern IN ( + s2dm:list, + s2dm:nonNullList, + s2dm:listOfNonNull, + s2dm:nonNullListOfNonNull + )) +} +ORDER BY ?field +``` + +**Find fields whose output type has fields of enum type (nested pattern):** + +```sparql +PREFIX s2dm: + +SELECT ?parentType ?field ?nestedField ?enumType +WHERE { + ?parentType a s2dm:ObjectType ; + s2dm:hasField ?field . + ?field s2dm:hasOutputType ?outputType . + ?outputType a s2dm:ObjectType ; + s2dm:hasField ?nestedField . + ?nestedField s2dm:hasOutputType ?enumType . + ?enumType a s2dm:EnumType . +} +ORDER BY ?parentType ?field +``` + ## Common Features ### Selection Query Filtering diff --git a/examples/schema-rdf/README.md b/examples/schema-rdf/README.md index 78e5ff84..db060790 100644 --- a/examples/schema-rdf/README.md +++ b/examples/schema-rdf/README.md @@ -93,6 +93,94 @@ ns:CabinKindEnum a skos:Concept, s2dm:EnumType ; s2dm:hasEnumValue ns:CabinKindEnum.SUV, ns:CabinKindEnum.VAN . ``` +## Querying the Schema with SPARQL + +The generated RDF can be queried using SPARQL -- either via the s2dm CLI or any SPARQL-capable tool (rdflib, Apache Jena, triple stores). 
+ +### Using the CLI + +```bash +# Find all fields that output an enum type +s2dm query fields-outputting-enum --rdf output/schema.nt + +# List all object types with their fields +s2dm query object-types-with-fields --rdf output/schema.nt + +# Find fields using list wrappers (JSON output) +s2dm query list-type-fields --rdf output/schema.nt --json + +# Or materialize on-the-fly from GraphQL +s2dm query fields-outputting-enum -s sample.graphql --namespace "https://example.org/my-domain#" +``` + +### Using rdflib (Python) + +```python +from rdflib import Graph + +g = Graph() +g.parse("output/schema.nt", format="nt") + +results = g.query(""" + PREFIX s2dm: + SELECT ?field ?enumType + WHERE { + ?field a s2dm:Field ; + s2dm:hasOutputType ?enumType . + ?enumType a s2dm:EnumType . + } + ORDER BY ?field +""") + +for row in results: + print(f"{row.field} -> {row.enumType}") +``` + +### Example SPARQL Queries + +**Find all fields that output an enum type:** + +```sparql +PREFIX s2dm: +SELECT ?field ?enumType WHERE { + ?field a s2dm:Field ; s2dm:hasOutputType ?enumType . + ?enumType a s2dm:EnumType . +} +``` + +**List all object types and their fields:** + +```sparql +PREFIX s2dm: +SELECT ?objectType ?field WHERE { + ?objectType a s2dm:ObjectType ; s2dm:hasField ?field . +} +ORDER BY ?objectType +``` + +**Find fields using list wrappers:** + +```sparql +PREFIX s2dm: +SELECT ?field ?pattern WHERE { + ?field a s2dm:Field ; s2dm:usesTypeWrapperPattern ?pattern . + FILTER(?pattern IN (s2dm:list, s2dm:nonNullList, s2dm:listOfNonNull, s2dm:nonNullListOfNonNull)) +} +``` + +**Find nested patterns (fields whose output type has enum fields):** + +```sparql +PREFIX s2dm: +SELECT ?parentType ?field ?nestedField ?enumType WHERE { + ?parentType a s2dm:ObjectType ; s2dm:hasField ?field . + ?field s2dm:hasOutputType ?outputType . + ?outputType a s2dm:ObjectType ; s2dm:hasField ?nestedField . + ?nestedField s2dm:hasOutputType ?enumType . + ?enumType a s2dm:EnumType . 
+} +``` + ## Exclusions - Query, Mutation, and Subscription root types diff --git a/src/s2dm/cli.py b/src/s2dm/cli.py index f2a362be..e8f3fa3e 100644 --- a/src/s2dm/cli.py +++ b/src/s2dm/cli.py @@ -8,6 +8,7 @@ import rich_click as click from graphql import DocumentNode, GraphQLSchema, parse +from rdflib import Graph from rich.traceback import install from s2dm import __version__, log @@ -16,9 +17,22 @@ from s2dm.exporters.id import IDExporter from s2dm.exporters.jsonschema import translate_to_jsonschema from s2dm.exporters.protobuf import translate_to_protobuf -from s2dm.exporters.rdf_materializer import materialize_schema_to_rdf, write_rdf_artifacts +from s2dm.exporters.rdf_materializer import ( + FORMAT_ALIASES, + FORMAT_REGISTRY, + materialize_schema_to_rdf, + write_rdf_artifacts, +) from s2dm.exporters.shacl import translate_to_shacl from s2dm.exporters.skos import generate_skos_skeleton +from s2dm.exporters.sparql_queries import ( + QUERIES as SPARQL_QUERIES, +) +from s2dm.exporters.sparql_queries import ( + format_results_as_table, + load_rdf_graph, + run_query, +) from s2dm.exporters.spec_history import SpecHistoryExporter from s2dm.exporters.utils.extraction import get_all_named_types, get_all_object_types, get_root_level_types_from_query from s2dm.exporters.utils.graphql_type import is_builtin_scalar_type, is_introspection_type @@ -151,6 +165,28 @@ def derive_variant_ids_path(base_dir: Path, version_tag: str) -> Path: return base_dir / filename +def load_diff_changes(diff_file: Path | None) -> list[DiffChange] | None: + """Load and validate a structured diff JSON file. + + Args: + diff_file: Path to the JSON diff file, or None. + + Returns: + List of DiffChange objects, or None if *diff_file* is None. 
+ """ + if diff_file is None: + return None + try: + with open(diff_file, encoding="utf-8") as f: + json_data = json.load(f) + if not isinstance(json_data, list): + raise ValueError("Invalid diff file: expected a JSON array") + return [DiffChange.model_validate(change) for change in json_data] + except (json.JSONDecodeError, OSError, ValueError) as e: + log.error(f"Failed to load diff file from {diff_file}: {e}") + sys.exit(1) + + units_directory_option = click.option( "--directory", "-d", @@ -295,6 +331,60 @@ def registry() -> None: pass +@click.command() +@click.argument("query_name", type=click.Choice(sorted(SPARQL_QUERIES.keys()), case_sensitive=False)) +@click.option( + "--rdf", + type=click.Path(exists=True, dir_okay=False, path_type=Path), + default=None, + help="Pre-generated RDF file (.nt or .ttl) to query", +) +@click.option( + "--schema", + "-s", + "schemas", + type=str, + cls=SchemaResolverOption, + default=None, + multiple=True, + help="GraphQL schema file/dir (used with --namespace for on-the-fly materialization)", +) +@click.option( + "--namespace", + default=None, + help="Namespace URI for on-the-fly materialization (requires --schema)", +) +@click.option( + "--json", + "json_output", + is_flag=True, + default=False, + help="Output results as JSON instead of a table", +) +@optional_output_option +def query( + query_name: str, + rdf: Path | None, + schemas: list[Path] | None, + namespace: str | None, + json_output: bool, + output: Path | None, +) -> None: + """Run a predefined SPARQL query against an RDF-materialized schema. + + Load a pre-generated RDF file (--rdf) or materialize on-the-fly + from a GraphQL schema (-s/--schema + --namespace). 
def schema_rdf(
    schemas: list[Path],
    output: Path,
    namespace: str,
    prefix: str,
    language: str,
    output_formats: str,
) -> None:
    """Materialize GraphQL schema as RDF triples with SKOS and s2dm ontology.

    Produces RDF artifacts in the specified formats in the output directory.
    Default formats: sorted n-triples (schema.nt) and Turtle (schema.ttl).
    """
    # Parse the comma-separated format list, dropping empty entries
    # (e.g. trailing commas) before handing it to the writer.
    requested = [part.strip() for part in output_formats.split(",") if part.strip()]

    graphql_schema = load_schema(schemas)
    graph = materialize_schema_to_rdf(
        schema=graphql_schema,
        namespace=namespace,
        prefix=prefix,
        language=language,
    )

    try:
        written = write_rdf_artifacts(graph, output, base_name="schema", formats=requested)
    except ValueError as e:
        # Unknown format key — surface the writer's message to the user.
        raise click.ClickException(str(e)) from e
    except OSError as e:
        raise click.ClickException(f"Failed to write RDF artifacts: {e}") from e

    file_list = ", ".join(str(p) for p in written)
    log.success(f"RDF artifacts written: {file_list}")
def _output_results(
    results: list[dict[str, str]],
    query_name: str,
    json_output: bool,
    output: Path | None = None,
) -> None:
    """Print query results as a table or write JSON to a file.

    Args:
        results: Query result rows.
        query_name: Name of the query (used as the table title).
        json_output: If True, print JSON to stdout instead of a table.
        output: Optional file path; when given, JSON is written there
            and nothing is printed to stdout.
    """
    if output is not None:
        # File output takes precedence over stdout rendering.
        output.parent.mkdir(parents=True, exist_ok=True)
        output.write_text(json.dumps(results, indent=2), encoding="utf-8")
        log.success(f"Query results written to {output}")
    elif json_output:
        click.echo(json.dumps(results, indent=2))
    else:
        # Shorten URIs for readability before tabulating.
        rows = format_results_as_table(results)
        log.print_table(rows, title=query_name)
        if rows:
            log.info(f"{len(rows)} result(s).")
""" + if not namespace.endswith(("#", "/")): + logging.getLogger(__name__).warning( + "Namespace '%s' does not end with '#' or '/'. " + "Generated URIs may be malformed (e.g. '%sCabin' instead of '%s#Cabin').", + namespace, + namespace, + namespace, + ) + graph = Graph() concept_ns = Namespace(namespace) @@ -403,21 +414,95 @@ def serialize_sorted_ntriples(graph: Graph) -> str: return _sort_ntriples_lines(nt) +# Registry mapping rdflib format names to file extensions. +# The keys are the canonical format identifiers (also accepted by rdflib). +# New formats can be added here without further code changes. +FORMAT_REGISTRY: dict[str, str] = { + "nt": ".nt", + "turtle": ".ttl", + "json-ld": ".jsonld", +} + +# Common short aliases that users may prefer on the CLI. +FORMAT_ALIASES: dict[str, str] = { + "ttl": "turtle", + "jsonld": "json-ld", +} + +# Default formats for day-to-day use (sorted n-triples for git, Turtle for reading). +DEFAULT_FORMATS: list[str] = ["nt", "turtle"] + + +def resolve_format(fmt: str) -> str: + """Resolve a format key to its canonical rdflib name. + + Accepts both canonical names (``"turtle"``, ``"json-ld"``) and common + aliases (``"ttl"``, ``"jsonld"``). + + Args: + fmt: User-supplied format key. + + Returns: + Canonical rdflib format name. + + Raises: + ValueError: If *fmt* is not a known format or alias. + """ + if fmt in FORMAT_REGISTRY: + return fmt + if fmt in FORMAT_ALIASES: + return FORMAT_ALIASES[fmt] + all_accepted = sorted(set(FORMAT_REGISTRY) | set(FORMAT_ALIASES)) + raise ValueError(f"Unknown RDF format: '{fmt}'. Supported: {', '.join(all_accepted)}") + + def write_rdf_artifacts( graph: Graph, output_dir: Path, base_name: str = "schema", -) -> None: - """Write RDF graph to sorted n-triples and Turtle files. + formats: list[str] | None = None, +) -> list[Path]: + """Write RDF graph to one or more serialization formats. + + Uses FORMAT_REGISTRY to resolve format keys to rdflib serializer names and + file extensions. 
The ``"nt"`` format is special-cased to use + ``serialize_sorted_ntriples`` for deterministic, git-friendly output. + + Accepts both canonical rdflib names (e.g. ``"turtle"``) and common + aliases (e.g. ``"ttl"``). Args: graph: The rdflib Graph to write. output_dir: Directory to write files into (created if needed). base_name: Base filename without extension (default: "schema"). + formats: List of format keys (e.g. ``["nt", "ttl", "json-ld"]``). + Defaults to ``["nt", "turtle"]`` when *None*. + + Returns: + List of paths to the written files. + + Raises: + ValueError: If an unknown format key is provided. """ + if formats is None: + formats = list(DEFAULT_FORMATS) + + # Resolve aliases to canonical names + resolved = [resolve_format(f) for f in formats] + output_dir.mkdir(parents=True, exist_ok=True) - nt_path = output_dir / f"{base_name}.nt" - ttl_path = output_dir / f"{base_name}.ttl" + written: list[Path] = [] + + for fmt in resolved: + extension = FORMAT_REGISTRY[fmt] + out_path = output_dir / f"{base_name}{extension}" + + if fmt == "nt": + # Sorted n-triples for deterministic, git-friendly output + out_path.write_text(serialize_sorted_ntriples(graph), encoding="utf-8") + else: + graph.serialize(destination=str(out_path), format=fmt) + + written.append(out_path) - nt_path.write_text(serialize_sorted_ntriples(graph), encoding="utf-8") - graph.serialize(destination=str(ttl_path), format="turtle") + return written diff --git a/src/s2dm/exporters/sparql_queries.py b/src/s2dm/exporters/sparql_queries.py new file mode 100644 index 00000000..878ad16e --- /dev/null +++ b/src/s2dm/exporters/sparql_queries.py @@ -0,0 +1,176 @@ +"""Predefined SPARQL queries for traversing RDF-materialized GraphQL schemas. + +This module provides a set of common SPARQL queries that operate on the s2dm +ontology triples produced by ``rdf_materializer.materialize_schema_to_rdf``. +Queries are stored in a registry and executed via ``run_query``. 
def get_query_names() -> list[str]:
    """Return sorted list of available query names.

    Returns:
        Alphabetically sorted list of query name strings.
    """
    # Iterating a dict yields its keys; no .keys() call needed.
    return sorted(QUERIES)
def format_results_as_table(
    results: list[dict[str, str]],
    compact: bool = True,
) -> list[dict[str, str]]:
    """Optionally shorten URIs in query results for display.

    When *compact* is True, every value is reduced to its local name:
    the text after the last ``#`` if present, otherwise after the last
    ``/``, otherwise the value unchanged.

    Args:
        results: Raw query result rows.
        compact: Whether to shorten URIs (default: True).

    Returns:
        Processed result rows (the input itself when *compact* is False).
    """
    if not compact:
        return results

    def local_name(value: str) -> str:
        # Fragment separator wins over path separator, matching common
        # URI conventions (e.g. "https://ex.org/ns#Cabin.doors").
        if "#" in value:
            return value.rsplit("#", 1)[-1]
        if "/" in value:
            return value.rsplit("/", 1)[-1]
        return value

    shortened: list[dict[str, str]] = []
    for row in results:
        shortened.append({key: local_name(val) for key, val in row.items()})
    return shortened
diff --git a/tests/test_e2e_cli.py b/tests/test_e2e_cli.py index 9dcfc4b7..b8b273f1 100644 --- a/tests/test_e2e_cli.py +++ b/tests/test_e2e_cli.py @@ -301,6 +301,234 @@ def test_generate_schema_rdf(runner: CliRunner, tmp_outputs: Path, spec_director assert "Vehicle" in ttl_content +def test_generate_schema_rdf_all_formats( + runner: CliRunner, tmp_outputs: Path, spec_directory: Path, units_directory: Path +) -> None: + """Generate RDF triples in all formats including JSON-LD.""" + out_dir = tmp_outputs / "schema_rdf_all" + result = runner.invoke( + cli, + [ + "generate", + "schema-rdf", + "-s", + str(spec_directory), + "-s", + str(TSD.SAMPLE1_1), + "-s", + str(TSD.SAMPLE1_2), + "-s", + str(units_directory), + "-o", + str(out_dir), + "--namespace", + "https://example.org/vss#", + "--output-formats", + "nt,ttl,jsonld", + ], + ) + assert result.exit_code == 0, result.output + assert (out_dir / "schema.nt").exists() + assert (out_dir / "schema.ttl").exists() + assert (out_dir / "schema.jsonld").exists() + + import json + + jsonld_content = (out_dir / "schema.jsonld").read_text() + data = json.loads(jsonld_content) + assert isinstance(data, list | dict) + assert "Vehicle" in jsonld_content + + +def test_generate_schema_rdf_invalid_format( + runner: CliRunner, tmp_outputs: Path, spec_directory: Path, units_directory: Path +) -> None: + """Invalid output format shows error.""" + out_dir = tmp_outputs / "schema_rdf_bad" + result = runner.invoke( + cli, + [ + "generate", + "schema-rdf", + "-s", + str(spec_directory), + "-s", + str(units_directory), + "-o", + str(out_dir), + "--namespace", + "https://example.org/vss#", + "--output-formats", + "rdfxml", + ], + ) + assert result.exit_code != 0 + assert "Unknown RDF format" in result.output + + +# --------------------------------------------------------------------------- +# Query command E2E tests +# --------------------------------------------------------------------------- + + +def 
test_query_fields_outputting_enum_from_schema( + runner: CliRunner, tmp_outputs: Path, spec_directory: Path, units_directory: Path +) -> None: + """Query fields-outputting-enum via on-the-fly materialization, output to file.""" + out_file = tmp_outputs / "query_enum_fields.json" + result = runner.invoke( + cli, + [ + "query", + "fields-outputting-enum", + "-s", + str(spec_directory), + "-s", + str(units_directory), + "--namespace", + "https://example.org/vss#", + "-o", + str(out_file), + ], + ) + assert result.exit_code == 0, result.output + assert out_file.exists() + data = json.loads(out_file.read_text()) + assert isinstance(data, list) + assert len(data) > 0 + assert all("field" in row and "enumType" in row for row in data) + + +def test_query_object_types_with_fields_from_rdf( + runner: CliRunner, tmp_outputs: Path, spec_directory: Path, units_directory: Path +) -> None: + """Query object-types-with-fields from a pre-generated .nt file, output to file.""" + # First generate the RDF (include sample schemas that contain Vehicle) + rdf_dir = tmp_outputs / "query_rdf" + runner.invoke( + cli, + [ + "generate", + "schema-rdf", + "-s", + str(spec_directory), + "-s", + str(TSD.SAMPLE1_1), + "-s", + str(TSD.SAMPLE1_2), + "-s", + str(units_directory), + "-o", + str(rdf_dir), + "--namespace", + "https://example.org/vss#", + "--output-formats", + "nt", + ], + ) + nt_file = rdf_dir / "schema.nt" + assert nt_file.exists() + + # Then query it + out_file = tmp_outputs / "query_obj_types.json" + result = runner.invoke( + cli, + [ + "query", + "object-types-with-fields", + "--rdf", + str(nt_file), + "-o", + str(out_file), + ], + ) + assert result.exit_code == 0, result.output + assert out_file.exists() + data = json.loads(out_file.read_text()) + assert isinstance(data, list) + assert len(data) > 0 + type_names = {row["objectType"] for row in data} + assert any("Vehicle" in t for t in type_names) + + +def test_query_list_type_fields_from_schema( + runner: CliRunner, tmp_outputs: 
Path, spec_directory: Path, units_directory: Path +) -> None: + """Query list-type-fields via on-the-fly materialization, output to file.""" + out_file = tmp_outputs / "query_list_fields.json" + result = runner.invoke( + cli, + [ + "query", + "list-type-fields", + "-s", + str(spec_directory), + "-s", + str(units_directory), + "--namespace", + "https://example.org/vss#", + "-o", + str(out_file), + ], + ) + assert result.exit_code == 0, result.output + assert out_file.exists() + data = json.loads(out_file.read_text()) + assert isinstance(data, list) + + +def test_query_no_source_shows_error(runner: CliRunner) -> None: + """Query without --rdf or --schema shows usage error.""" + result = runner.invoke( + cli, + ["query", "fields-outputting-enum"], + ) + assert result.exit_code != 0 + + +def test_query_both_sources_shows_error( + runner: CliRunner, tmp_outputs: Path, spec_directory: Path, units_directory: Path +) -> None: + """Query with both --rdf and --schema shows usage error.""" + # First generate an RDF file to use as the --rdf argument + rdf_dir = tmp_outputs / "query_both_src" + runner.invoke( + cli, + [ + "generate", + "schema-rdf", + "-s", + str(spec_directory), + "-s", + str(units_directory), + "-o", + str(rdf_dir), + "--namespace", + "https://example.org/vss#", + "--output-formats", + "nt", + ], + ) + nt_file = rdf_dir / "schema.nt" + assert nt_file.exists() + + result = runner.invoke( + cli, + [ + "query", + "fields-outputting-enum", + "--rdf", + str(nt_file), + "-s", + str(spec_directory), + "--namespace", + "https://example.org/vss#", + ], + ) + assert result.exit_code != 0 + assert "not both" in result.output.lower() or "Provide either" in result.output + + @pytest.mark.parametrize( "schema_file,previous_file,expected_output", [ diff --git a/tests/test_rdf_materializer.py b/tests/test_rdf_materializer.py index 94c75c0b..ad3dfefa 100644 --- a/tests/test_rdf_materializer.py +++ b/tests/test_rdf_materializer.py @@ -1,14 +1,18 @@ """Tests for RDF 
materialization of GraphQL schemas.""" +import json from pathlib import Path from typing import Any +import pytest from graphql import GraphQLSchema, build_schema from rdflib import Graph from rdflib.namespace import RDF, SKOS from s2dm.exporters.rdf_materializer import ( BUILTIN_SCALARS, + DEFAULT_FORMATS, + FORMAT_REGISTRY, extract_schema_for_rdf, materialize_schema_to_rdf, serialize_sorted_ntriples, @@ -255,18 +259,63 @@ def test_ends_with_newline(self) -> None: assert serialize_sorted_ntriples(graph).endswith("\n") def test_writes_nt_and_ttl(self, tmp_path: Path) -> None: - """write_rdf_artifacts creates .nt and .ttl files.""" + """write_rdf_artifacts creates .nt and .ttl files with default formats.""" graph = materialize_schema_to_rdf(schema=_cabin_door_schema(), namespace=NS, prefix=PREFIX) - write_rdf_artifacts(graph, tmp_path, base_name="schema") + written = write_rdf_artifacts(graph, tmp_path, base_name="schema") assert (tmp_path / "schema.nt").exists() assert (tmp_path / "schema.ttl").exists() + assert len(written) == 2 nt = (tmp_path / "schema.nt").read_text() assert "skos:Concept" in nt or "Concept" in nt # format varies: prefix or full URI assert "hasField" in nt and "hasOutputType" in nt ttl = (tmp_path / "schema.ttl").read_text() assert "@prefix" in ttl and "skos:" in ttl and "s2dm:" in ttl + def test_writes_nt_only(self, tmp_path: Path) -> None: + """write_rdf_artifacts with formats=['nt'] produces only .nt.""" + graph = materialize_schema_to_rdf(schema=_cabin_door_schema(), namespace=NS, prefix=PREFIX) + written = write_rdf_artifacts(graph, tmp_path, base_name="schema", formats=["nt"]) + + assert len(written) == 1 + assert (tmp_path / "schema.nt").exists() + assert not (tmp_path / "schema.ttl").exists() + assert not (tmp_path / "schema.jsonld").exists() + + def test_writes_all_formats(self, tmp_path: Path) -> None: + """write_rdf_artifacts with all formats produces .nt, .ttl, and .jsonld.""" + graph = 
materialize_schema_to_rdf(schema=_cabin_door_schema(), namespace=NS, prefix=PREFIX) + written = write_rdf_artifacts(graph, tmp_path, base_name="schema", formats=["nt", "ttl", "jsonld"]) + + assert len(written) == 3 + assert (tmp_path / "schema.nt").exists() + assert (tmp_path / "schema.ttl").exists() + assert (tmp_path / "schema.jsonld").exists() + + def test_jsonld_output_is_valid(self, tmp_path: Path) -> None: + """JSON-LD output is valid JSON with RDF content.""" + graph = materialize_schema_to_rdf(schema=_cabin_door_schema(), namespace=NS, prefix=PREFIX) + write_rdf_artifacts(graph, tmp_path, base_name="schema", formats=["jsonld"]) + + jsonld_path = tmp_path / "schema.jsonld" + assert jsonld_path.exists() + data = json.loads(jsonld_path.read_text()) + jsonld_str = jsonld_path.read_text() + assert "Cabin" in jsonld_str + assert "hasField" in jsonld_str or "Field" in jsonld_str + assert isinstance(data, list | dict) + + def test_unknown_format_raises(self, tmp_path: Path) -> None: + """write_rdf_artifacts raises ValueError for unknown format keys.""" + graph = materialize_schema_to_rdf(schema=_cabin_door_schema(), namespace=NS, prefix=PREFIX) + with pytest.raises(ValueError, match="Unknown RDF format"): + write_rdf_artifacts(graph, tmp_path, base_name="schema", formats=["rdfxml"]) + + def test_format_registry_has_defaults(self) -> None: + """All default formats are present in FORMAT_REGISTRY.""" + for fmt in DEFAULT_FORMATS: + assert fmt in FORMAT_REGISTRY + def test_real_schema_materializes(self) -> None: """Real schema from files materializes successfully.""" data = Path(__file__).parent / "data" diff --git a/tests/test_sparql_queries.py b/tests/test_sparql_queries.py new file mode 100644 index 00000000..610a4603 --- /dev/null +++ b/tests/test_sparql_queries.py @@ -0,0 +1,195 @@ +"""Tests for SPARQL query module for RDF-materialized GraphQL schemas.""" + +from pathlib import Path + +import pytest +from graphql import build_schema +from rdflib import Graph + 
+from s2dm.exporters.rdf_materializer import ( + materialize_schema_to_rdf, + serialize_sorted_ntriples, +) +from s2dm.exporters.sparql_queries import ( + QUERIES, + format_results_as_table, + get_query_description, + get_query_names, + load_rdf_graph, + run_query, +) + +NS = "https://example.org/test#" +PREFIX = "ns" + + +def _cabin_door_graph() -> Graph: + """Create a materialized RDF graph from a Cabin/Door/Window schema.""" + schema = build_schema(""" + type Query { cabin: Cabin } + + type Cabin { + kind: CabinKindEnum + doors: [Door] + } + + enum CabinKindEnum { + SUV + VAN + } + + type Door { + isOpen: Boolean + window: Window + } + + type Window { + isTinted: Boolean + } + """) + return materialize_schema_to_rdf(schema=schema, namespace=NS, prefix=PREFIX) + + +class TestQueryRegistry: + """Tests for the SPARQL query registry.""" + + def test_has_three_queries(self) -> None: + """Registry contains exactly 3 predefined queries.""" + assert len(QUERIES) == 3 + + def test_get_query_names(self) -> None: + """get_query_names returns sorted list.""" + names = get_query_names() + assert names == sorted(names) + assert "fields-outputting-enum" in names + assert "object-types-with-fields" in names + assert "list-type-fields" in names + + def test_get_query_description(self) -> None: + """get_query_description returns a non-empty string.""" + for name in get_query_names(): + desc = get_query_description(name) + assert isinstance(desc, str) + assert len(desc) > 0 + + def test_unknown_query_raises(self) -> None: + """Accessing unknown query name raises KeyError.""" + with pytest.raises(KeyError): + get_query_description("nonexistent-query") + + +class TestLoadRdfGraph: + """Tests for loading RDF graphs from files.""" + + def test_load_nt_file(self, tmp_path: Path) -> None: + """Load graph from n-triples file.""" + graph = _cabin_door_graph() + nt_path = tmp_path / "test.nt" + nt_path.write_text(serialize_sorted_ntriples(graph), encoding="utf-8") + + loaded = 
load_rdf_graph(nt_path) + assert len(loaded) > 0 + + def test_load_ttl_file(self, tmp_path: Path) -> None: + """Load graph from turtle file.""" + graph = _cabin_door_graph() + ttl_path = tmp_path / "test.ttl" + graph.serialize(destination=str(ttl_path), format="turtle") + + loaded = load_rdf_graph(ttl_path) + assert len(loaded) > 0 + + def test_unsupported_extension_raises(self, tmp_path: Path) -> None: + """Unsupported file extension raises ValueError.""" + bad_path = tmp_path / "test.rdf" + bad_path.write_text("", encoding="utf-8") + + with pytest.raises(ValueError, match="Unsupported RDF file extension"): + load_rdf_graph(bad_path) + + def test_load_jsonld_file(self, tmp_path: Path) -> None: + """Load graph from JSON-LD file.""" + graph = _cabin_door_graph() + jsonld_path = tmp_path / "test.jsonld" + graph.serialize(destination=str(jsonld_path), format="json-ld") + + loaded = load_rdf_graph(jsonld_path) + assert len(loaded) > 0 + + def test_missing_file_raises(self, tmp_path: Path) -> None: + """Non-existent file raises FileNotFoundError.""" + with pytest.raises(FileNotFoundError): + load_rdf_graph(tmp_path / "does_not_exist.nt") + + +class TestRunQuery: + """Tests for executing SPARQL queries.""" + + def test_fields_outputting_enum(self) -> None: + """fields-outputting-enum finds Cabin.kind -> CabinKindEnum.""" + graph = _cabin_door_graph() + results = run_query(graph, "fields-outputting-enum") + + assert len(results) > 0 + field_uris = [r["field"] for r in results] + assert any("Cabin.kind" in uri for uri in field_uris) + enum_uris = [r["enumType"] for r in results] + assert any("CabinKindEnum" in uri for uri in enum_uris) + + def test_object_types_with_fields(self) -> None: + """object-types-with-fields lists Cabin, Door, Window with fields.""" + graph = _cabin_door_graph() + results = run_query(graph, "object-types-with-fields") + + type_names = {r["objectType"] for r in results} + assert any("Cabin" in t for t in type_names) + assert any("Door" in t for t 
in type_names) + assert any("Window" in t for t in type_names) + + def test_list_type_fields(self) -> None: + """list-type-fields finds Cabin.doors (list wrapper).""" + graph = _cabin_door_graph() + results = run_query(graph, "list-type-fields") + + assert len(results) > 0 + field_uris = [r["field"] for r in results] + assert any("Cabin.doors" in uri for uri in field_uris) + + def test_query_empty_graph(self) -> None: + """Query on empty graph returns empty list.""" + empty_graph = Graph() + for query_name in get_query_names(): + results = run_query(empty_graph, query_name) + assert results == [] + + def test_unknown_query_raises(self) -> None: + """Unknown query name raises KeyError.""" + graph = _cabin_door_graph() + with pytest.raises(KeyError): + run_query(graph, "nonexistent") + + +class TestFormatResults: + """Tests for result formatting.""" + + def test_compact_shortens_uris(self) -> None: + """Compact mode shortens URIs to local names.""" + results = [{"field": "https://example.org/test#Cabin.kind"}] + compact = format_results_as_table(results, compact=True) + assert compact[0]["field"] == "Cabin.kind" + + def test_no_compact_preserves_uris(self) -> None: + """Non-compact mode preserves full URIs.""" + results = [{"field": "https://example.org/test#Cabin.kind"}] + non_compact = format_results_as_table(results, compact=False) + assert non_compact[0]["field"] == "https://example.org/test#Cabin.kind" + + def test_compact_shortens_slash_separator_uris(self) -> None: + """Compact mode shortens URIs using '/' separator when '#' is absent.""" + results = [{"type": "https://example.org/ontology/ObjectType"}] + compact = format_results_as_table(results, compact=True) + assert compact[0]["type"] == "ObjectType" + + def test_empty_results(self) -> None: + """Empty results return empty list.""" + assert format_results_as_table([]) == []