diff --git a/.github/workflows/ldtab-tests.yml b/.github/workflows/ldtab-tests.yml index 8537621..e66d6bb 100644 --- a/.github/workflows/ldtab-tests.yml +++ b/.github/workflows/ldtab-tests.yml @@ -14,16 +14,16 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Set up JDK 17 - uses: actions/setup-java@v2 + uses: actions/setup-java@v4 with: - distribution: 'adopt' + distribution: 'temurin' java-version: '17' - name: Install clojure tools - uses: DeLaGuardo/setup-clojure@12.5 + uses: DeLaGuardo/setup-clojure@13.4 with: # Install just one or all simultaneously # The value must indicate a particular version of the tool, or use 'latest' @@ -33,7 +33,7 @@ jobs: cljfmt: 0.10.2 # cljfmt - name: Cache Leiningen dependencies - uses: actions/cache@v2 + uses: actions/cache@v4 with: path: ~/.m2 key: ${{ runner.os }}-m2-${{ hashFiles('**/project.clj') }} diff --git a/src/ldtab/annotation_handling.clj b/src/ldtab/annotation_handling.clj index 7fb4733..bc069e8 100644 --- a/src/ldtab/annotation_handling.clj +++ b/src/ldtab/annotation_handling.clj @@ -126,7 +126,7 @@ The raw thick triple of an OWL annotation - {:subject wiring:blanknode:G__1130, + {:subject ldtab:blanknode:G__1130, :predicate owl:Axiom, :object {obo:IAO_0010000 [{:object obo:050-003}], owl:annotatedTarget [{:object \"literal\"}], diff --git a/src/ldtab/thick_rdf.clj b/src/ldtab/thick_rdf.clj index db3d9e7..f2f9e6c 100644 --- a/src/ldtab/thick_rdf.clj +++ b/src/ldtab/thick_rdf.clj @@ -159,10 +159,10 @@ (parse-json object) object))) -(defn is-wiring-blanknode +(defn is-ldtab-blanknode [input] (and (string? input) - (str/starts-with? input " (count v) 1)) blanknode-2-triples)) triples (remove #(contains? complex-blanknodes (:subject %)) triples) @@ -210,8 +210,8 @@ annotation (parse-json (:annotation thick-triple))] (when annotation (translate-annotation subject predicate object annotation prefix-2-base model)) - (if (is-wiring-blanknode subject-json) - model ;remove generated wiring:blank nodes + (if (is-ldtab-blanknode subject-json) + model ;remove generated ldtab:blank nodes (.add model subject predicate object)))) (defn triples-2-rdf-model-stream diff --git a/src/ldtab/thin2thick.clj b/src/ldtab/thin2thick.clj index c304972..e0c3353 100644 --- a/src/ldtab/thin2thick.clj +++ b/src/ldtab/thin2thick.clj @@ -1,28 +1,33 @@ (ns ldtab.thin2thick (:require [clojure.set :as set] [clojure.string :as str] + [clojure.walk :as walk] [ldtab.annotation-handling :as ann] [ldtab.rdf-list-handling :as rdf-list] [ldtab.gci-handling :as gci] [cheshire.core :as cs]) - (:import [org.apache.jena.graph NodeFactory Triple Node]) + (:import [org.apache.jena.graph NodeFactory Triple Node] + [java.security MessageDigest] + [java.math BigInteger]) ;[org.apache.jena.rdf.model ModelFactory Model StmtIterator Resource Property RDFNode Statement]) (:gen-class)) (declare node-2-thick-map) +(declare sort-json) +(declare sort-string-json) +;(declare expand-curies-in-json) -(defn is-wiring-blanknode +(defn is-ldtab-blanknode [input] (and (string? input) - (str/starts-with? input "")) - triple)) +(defn sha256 + "Calculate a SHA-256 digest for a given UTF-8 string." + [^String input] + (let [md (MessageDigest/getInstance "SHA-256")] + (.update md (.getBytes input "UTF-8")) + (format "%064x" (BigInteger. 1 (.digest md))))) ;TODO: add support for user input prefixes (using prefix table) (defn curify @@ -41,6 +46,72 @@ (str/replace uri (:base found) (str (:prefix found) ":")) (str "<" uri ">")))) +(defn expand-with + "Turn a CURIE into a full IRI using iri2prefix" + [^String curie iri2prefix] + (let [[prefix local] (str/split curie #":" 2) + found (some #(when (= (:prefix %) prefix) %) iri2prefix)] + (if found + (str "<" (:base found) local ">") + curie))) + +(defn expand-curies-in-json + "Walk a (parsed) JSON value and expand any CURIEs into full IRis." + [json iri2prefix] + (walk/postwalk + (fn [x] + (if (string? x) + (expand-with x iri2prefix) + x)) + json)) + +(defn contract-with + "Turn a full IRI (e.g., ) into a CURIE using iri2prefix, + If no base matches, return the original string unchanged. + Prefers the *longest* matching base" + ^String + [^String s iri2prefix] + (let [iri (if (and (str/starts-with? s "<") (str/ends-with? s ">")) + (subs s 1 (dec (count s))) ; strip angle brackets + s) + candidates (seq (filter #(str/starts-with? iri (:base %)) iri2prefix)) + best (when candidates + (apply max-key #(count (:base %)) candidates))] + (if best + (str (:prefix best) ":" (subs iri (count (:base best)))) + s))) + +(defn contract-iris-in-json + "Walk a (parsed) JSON value and contract any string IRIs into CURIEs." + [json iri2prefix] + (walk/postwalk + (fn [x] + (if (string? x) + (contract-with x iri2prefix) + x)) + json)) + + +(defn hash-existential-subject-blanknode + ([triple] + (if (is-ldtab-blanknode (:subject triple)) + (let [string-to-hash (cs/generate-string (sort-string-json (cs/parse-string (cs/generate-string (:object triple)))))] + (assoc triple + :subject + (str "")) + ) + triple)) + ([triple iri2prefix] + (if (is-ldtab-blanknode (:subject triple)) + (let [object (:object triple) + expansion (expand-curies-in-json object iri2prefix) + triple (assoc triple :object expansion) + hash-triple (hash-existential-subject-blanknode triple) + contraction (contract-iris-in-json hash-triple iri2prefix)] + contraction) + triple))) + + (defn map-on-hash-map-vals "Given a hashmap m and a function f, apply f to all values of m. @@ -77,9 +148,9 @@ "Given a set of triples, identify root blank nodes and add triples of the form - [wiring:blanknode:id type _:blankNode] + [ldtab:blanknode:id type _:blankNode] - where 'wiring:blanknode:id' is a newly generated subject, + where 'ldtab:blanknode:id' is a newly generated subject, type is the rdf:type of the identified root _:blankNode, and _:blankNode is the root node. @@ -93,7 +164,7 @@ the following triple would be added: - [wiring:blanknode:1, rdf:type, _:B] + [ldtab:blanknode:1, rdf:type, _:B] Explanation: We collapse blank nodes into JSON maps. @@ -107,11 +178,11 @@ blank-roots (filter (fn [^Node x] (.isBlank x)) root) ;TODO blank-leaves also need to be skolemised: ;for a given blank-leaf [s p _b:leaf] - ;we need to add the triple [_b:leaf rdf:type wiring:blanknode] + ;we need to add the triple [_b:leaf rdf:type ldtab:blanknode] ;so that we collapse the blank node into it's skolem form - additions (map (fn [^Node x] (new Triple (NodeFactory/createURI (str "wiring:blanknode:" (gensym))) - ;(NodeFactory/createURI "wiring:blanknode") + additions (map (fn [^Node x] (new Triple (NodeFactory/createURI (str "ldtab:blanknode:" (gensym))) + ;(NodeFactory/createURI "ldtab:blanknode") (get-type (get subject-to-triples x)) x)) blank-roots)] @@ -141,10 +212,8 @@ datatype)) :else "ERROR"))) - (defn existential-blanknode-2-triples [existential-blanknode] - ;(print "existblanknode: " existential-blanknode) (let [blanknode (:subject existential-blanknode) object (:object existential-blanknode) datatype (:datatype existential-blanknode) @@ -154,17 +223,54 @@ :object (get (first v) "object"), :datatype (get (first v) "datatype")}) object) [existential-blanknode])] - ;(print "translated: " triples) triples)) (defn split-existential-blanknode-encoding [triples] - (let [existential-blanknodes (filter (fn [x] (is-wiring-blanknode (:subject x))) triples) - triples (remove (fn [x] (is-wiring-blanknode (:subject x))) triples) + (let [existential-blanknodes (filter (fn [x] (is-ldtab-blanknode (:subject x))) triples) + triples (remove (fn [x] (is-ldtab-blanknode (:subject x))) triples) existential-blanknode-triples (mapcat existential-blanknode-2-triples existential-blanknodes) triples (concat existential-blanknode-triples triples)] triples)) + +(defn is-subject-object + [triple] + (map? (:subject triple))) + + +(defn subject-json-object-2-triples + [triple iri2prefix] + (let [subject (:subject triple) + object (:object triple) + blank (assoc subject (:predicate triple) + [{:object object + :datatype (:datatype triple)}]) + + expansion (expand-curies-in-json blank iri2prefix) + string-to-hash (cs/generate-string (sort-string-json expansion)) + + blanknode (str "") + triples (map (fn [[k v]] {:subject blanknode, + :predicate k, + :object (get (first v) "object"), + :datatype (get (first v) "datatype")}) subject) + triples (conj triples + {:subject blanknode, + :predicate (:predicate triple), + :object (:object triple), + :datatype (:datatype triple) + :annotation (:annotation triple)})] + triples)) + +(defn split-subject-json-objects + [triples iri2prefix] + (let [subject-objects (filter (fn [x] (is-subject-object x)) triples) + triples (remove (fn [x] (is-subject-object x)) triples) + subject-object-triples (mapcat #(subject-json-object-2-triples % iri2prefix) subject-objects) + triples (concat subject-object-triples triples)] + triples)) + (defn encode-object "Given a triple t = [s p o] and a map from subject nodes to its triples, returns predicate map for the o" @@ -226,6 +332,39 @@ root-triples (filter (fn [^Triple x] (contains? root (.getSubject x))) triples)] root-triples)) + +;this is the same as sort-json but keys of the JSON value are expected to be strings + + +(defn sort-string-json + "Given a JSON value, return a lexicographically ordered representation." + [m] + (cond + ; sort RDF lists + (and (map? m) + (contains? m "datatype") + (= (get m "datatype") "_JSONLIST")) + (let [sorted-list {:datatype "_JSONLIST", :object (map sort-string-json (get m "object"))}] + (if (contains? m "subject") ; top-level RDF list + (into (sorted-map) (merge sorted-list + {:subject (sort-string-json (get m "subject")) + :predicate (:predicate m) + :graph (:graph m) + :assertion (:assertion m) + :retraction (:retraction m) + :annotation (:annotation m)})) + (into (sorted-map) sorted-list))); nested RDF list + + (map? m) + (into (sorted-map) (map-on-hash-map-vals sort-string-json m)) ; sort by key + + (coll? m) + (vec (map cs/parse-string ; sort by string comparison + (sort (map #(cs/generate-string (sort-string-json %)) m)))) + + :else + m)) + ;NB: sorting transfoms keywords to strings (defn sort-json "Given a JSON value, return a lexicographically ordered representation." @@ -312,8 +451,9 @@ rdf-lists (map rdf-list/encode-rdf-list annotations) sorted (map sort-json rdf-lists) hashed (map hash-existential-subject-blanknode sorted) - split (split-existential-blanknode-encoding hashed) - normalised (map #(cs/parse-string (cs/generate-string %)) split)];TODO: stringify keys - this is a (probably an inefficient?) workaround + split-objects (split-existential-blanknode-encoding hashed) + split-subjects (split-subject-json-objects split-objects) + normalised (map #(cs/parse-string (cs/generate-string %)) split-subjects)];TODO: stringify keys - this is a (probably an inefficient?) workaround normalised)) ([triples iri2prefix] (let [raw-thick-triples (thin-2-thick-raw triples iri2prefix) @@ -327,7 +467,8 @@ %) gcis) rdf-lists (map rdf-list/encode-rdf-list annotations) sorted (map sort-json rdf-lists) - hashed (map hash-existential-subject-blanknode sorted) - split (split-existential-blanknode-encoding hashed) - normalised (map #(cs/parse-string (cs/generate-string %)) split)];TODO: stringify keys - this is a (probably an inefficient?) workaround + hashed (map #(hash-existential-subject-blanknode % iri2prefix) sorted) + split-objects (split-existential-blanknode-encoding hashed) + split-subjects (split-subject-json-objects split-objects iri2prefix) + normalised (map #(cs/parse-string (cs/generate-string %)) split-subjects)];TODO: stringify keys - this is a (probably an inefficient?) workaround normalised)))