|
| 1 | +package cz.cvut.spipes.util; |
| 2 | + |
| 3 | +import org.apache.jena.datatypes.xsd.XSDDatatype; |
| 4 | +import org.apache.jena.rdf.model.*; |
| 5 | +import org.apache.jena.vocabulary.RDF; |
| 6 | +import org.jetbrains.annotations.NotNull; |
| 7 | + |
| 8 | +import java.io.OutputStream; |
| 9 | +import java.io.OutputStreamWriter; |
| 10 | +import java.io.PrintWriter; |
| 11 | +import java.nio.charset.StandardCharsets; |
| 12 | +import java.util.*; |
| 13 | +import java.util.stream.Collectors; |
| 14 | + |
| 15 | +public class SPipesFormatter { |
| 16 | + |
| 17 | + private static final Comparator<Property> PREDICATE_ORDER = (p1, p2) -> { |
| 18 | + if (p1.equals(p2)) return 0; |
| 19 | + if (RDF.type.equals(p1)) return -1; |
| 20 | + if (RDF.type.equals(p2)) return 1; |
| 21 | + return p1.getURI().compareTo(p2.getURI()); |
| 22 | + }; |
| 23 | + |
| 24 | + private final Model model; |
| 25 | + private final Map<String, String> ns; |
| 26 | + private final Map<Resource, Map<Property, List<RDFNode>>> subjectMap = new LinkedHashMap<>(); |
| 27 | + |
| 28 | + private final Map<Resource, Integer> inDegree = new HashMap<>(); |
| 29 | + private final Map<Resource, String> bnodeLabels = new LinkedHashMap<>(); |
| 30 | + private int bCounter = 0; |
| 31 | + |
| 32 | + public SPipesFormatter(Model model) { |
| 33 | + this.model = model; |
| 34 | + this.ns = model.getNsPrefixMap(); |
| 35 | + buildSubjectMap(); |
| 36 | + } |
| 37 | + |
| 38 | + public void writeTo(OutputStream outputStream) { |
| 39 | + var writer = new PrintWriter(new OutputStreamWriter(outputStream, StandardCharsets.UTF_8), false); |
| 40 | + writePrefixes(writer); |
| 41 | + writeTriples(writer); |
| 42 | + writer.flush(); |
| 43 | + } |
| 44 | + |
| 45 | + private void buildSubjectMap() { |
| 46 | + StmtIterator stmtIter = model.listStatements(); |
| 47 | + while (stmtIter.hasNext()) { |
| 48 | + Statement stmt = stmtIter.nextStatement(); |
| 49 | + Resource subj = stmt.getSubject(); |
| 50 | + Property pred = stmt.getPredicate(); |
| 51 | + RDFNode obj = stmt.getObject(); |
| 52 | + |
| 53 | + subjectMap |
| 54 | + .computeIfAbsent(subj, k -> new LinkedHashMap<>()) |
| 55 | + .computeIfAbsent(pred, k -> new ArrayList<>()) |
| 56 | + .add(obj); |
| 57 | + |
| 58 | + if (obj.isAnon()) { |
| 59 | + Resource br = obj.asResource(); |
| 60 | + inDegree.merge(br, 1, Integer::sum); |
| 61 | + } |
| 62 | + } |
| 63 | + for (Resource subj : subjectMap.keySet()) { |
| 64 | + if (subj.isAnon() && inDegreeOf(subj) > 1) { |
| 65 | + allocLabel(subj); |
| 66 | + } |
| 67 | + } |
| 68 | + } |
| 69 | + |
| 70 | + private void writePrefixes(PrintWriter writer) { |
| 71 | + List<String> priority = List.of("owl", "rdf", "rdfs", "skos", "sm", "sml", "sp", "spin", "xsd"); |
| 72 | + |
| 73 | + Comparator<Map.Entry<String, String>> prefixComparator = (e1, e2) -> { |
| 74 | + if (e1.getKey().isEmpty() && !e2.getKey().isEmpty()) return -1; |
| 75 | + if (!e1.getKey().isEmpty() && e2.getKey().isEmpty()) return 1; |
| 76 | + int i1 = priority.indexOf(e1.getKey()); |
| 77 | + int i2 = priority.indexOf(e2.getKey()); |
| 78 | + if (i1 != -1 && i2 != -1) return Integer.compare(i1, i2); |
| 79 | + if (i1 != -1) return -1; |
| 80 | + if (i2 != -1) return 1; |
| 81 | + return e1.getKey().compareToIgnoreCase(e2.getKey()); |
| 82 | + }; |
| 83 | + |
| 84 | + ns.entrySet().stream() |
| 85 | + .sorted(prefixComparator) |
| 86 | + .forEach(e -> writer.printf("@prefix %s: <%s> .%n", e.getKey(), e.getValue())); |
| 87 | + |
| 88 | + writer.println(); |
| 89 | + } |
| 90 | + |
| 91 | + |
| 92 | + private void writeTriples(PrintWriter writer) { |
| 93 | + List<Resource> subjects = getSubjects(); |
| 94 | + |
| 95 | + for (Resource subject : subjects) { |
| 96 | + if (subject.isAnon() && !hasLabel(subject) && inDegreeOf(subject) >= 1) { |
| 97 | + continue; |
| 98 | + } |
| 99 | + if (subject.isAnon() && !hasLabel(subject) && inDegreeOf(subject) == 0) { |
| 100 | + writer.println(formatBNodeAsPropertyList(subject, new HashSet<>())); |
| 101 | + continue; |
| 102 | + } |
| 103 | + |
| 104 | + writer.println(formatNode(subject)); |
| 105 | + |
| 106 | + Map<Property, List<RDFNode>> predMap = new TreeMap<>(PREDICATE_ORDER); |
| 107 | + predMap.putAll(subjectMap.get(subject)); |
| 108 | + |
| 109 | + if (predMap.isEmpty()) { |
| 110 | + writer.println(" .\n"); |
| 111 | + continue; |
| 112 | + } |
| 113 | + |
| 114 | + List<Map.Entry<Property, List<RDFNode>>> predEntries = new ArrayList<>(predMap.entrySet()); |
| 115 | + for (Map.Entry<Property, List<RDFNode>> predEntry : predEntries) { |
| 116 | + String predStr = RDF.type.equals(predEntry.getKey()) ? "a" : formatNode(predEntry.getKey()); |
| 117 | + |
| 118 | + List<String> objStrs = predEntry.getValue().stream() |
| 119 | + .map(this::formatNode) |
| 120 | + .toList(); |
| 121 | + |
| 122 | + for (String objStr : objStrs) { |
| 123 | + writer.println(" " + predStr + " " + objStr + " ;"); |
| 124 | + } |
| 125 | + } |
| 126 | + |
| 127 | + writer.println(" ."); |
| 128 | + } |
| 129 | + } |
| 130 | + |
| 131 | + @NotNull |
| 132 | + private List<Resource> getSubjects() { |
| 133 | + List<Resource> subjects = new ArrayList<>(subjectMap.keySet()); |
| 134 | + subjects.sort((a, b) -> { |
| 135 | + int ca = a.isURIResource() ? 0 : (hasLabel(a) ? 1 : 2); |
| 136 | + int cb = b.isURIResource() ? 0 : (hasLabel(b) ? 1 : 2); |
| 137 | + if (ca != cb) return Integer.compare(ca, cb); |
| 138 | + if (a.isURIResource() && b.isURIResource()) return a.getURI().compareTo(b.getURI()); |
| 139 | + if (hasLabel(a) && hasLabel(b)) return getLabel(a).compareTo(getLabel(b)); |
| 140 | + return 0; |
| 141 | + }); |
| 142 | + return subjects; |
| 143 | + } |
| 144 | + |
| 145 | + private String formatNode(RDFNode node) { |
| 146 | + if (node.isLiteral()) { |
| 147 | + return formatLiteral(node.asLiteral()); |
| 148 | + } else if (node.isAnon()) { |
| 149 | + Resource br = node.asResource(); |
| 150 | + if (hasLabel(br)) return getLabel(br); |
| 151 | + return formatBNodeAsPropertyList(br, new HashSet<>()); |
| 152 | + } else if (node.isURIResource()) { |
| 153 | + return formatURI(node.asResource()); |
| 154 | + } else { |
| 155 | + return node.toString(); |
| 156 | + } |
| 157 | + } |
| 158 | + |
| 159 | + private String formatURI(Resource res) { |
| 160 | + String uri = res.getURI(); |
| 161 | + for (var e : ns.entrySet()) { |
| 162 | + if (uri.startsWith(e.getValue())) { |
| 163 | + return e.getKey() + ":" + uri.substring(e.getValue().length()); |
| 164 | + } |
| 165 | + } |
| 166 | + return "<" + uri + ">"; |
| 167 | + } |
| 168 | + |
| 169 | + private String formatLiteral(Literal lit) { |
| 170 | + String value = lit.getString(); |
| 171 | + boolean multiline = value.contains("\n") || value.contains("\r"); |
| 172 | + String escaped = escapeString(value, multiline); |
| 173 | + String lex = multiline ? "\"\"\"" + escaped + "\"\"\"" : "\"" + escaped + "\""; |
| 174 | + |
| 175 | + String lang = lit.getLanguage(); |
| 176 | + if (lang != null && !lang.isEmpty()) { |
| 177 | + return lex + "@" + lang; |
| 178 | + } |
| 179 | + |
| 180 | + String dt = lit.getDatatypeURI(); |
| 181 | + if (dt != null && !dt.equals(XSDDatatype.XSDstring.getURI())) { |
| 182 | + return lex + "^^" + formatURI(ResourceFactory.createResource(dt)); |
| 183 | + } |
| 184 | + |
| 185 | + return lex; |
| 186 | + } |
| 187 | + |
| 188 | + private String escapeString(String s, boolean multiline) { |
| 189 | + StringBuilder b = new StringBuilder(); |
| 190 | + for (int i = 0; i < s.length(); i++) { |
| 191 | + char c = s.charAt(i); |
| 192 | + switch (c) { |
| 193 | + case '\\': b.append("\\\\"); break; |
| 194 | + case '"': |
| 195 | + if (!multiline) { |
| 196 | + b.append("\\\""); |
| 197 | + } else { |
| 198 | + if (i + 2 < s.length() && s.charAt(i+1) == '"' && s.charAt(i+2) == '"') { |
| 199 | + b.append("\\\"\\\"\\\""); |
| 200 | + i += 2; |
| 201 | + } else { |
| 202 | + b.append('"'); |
| 203 | + } |
| 204 | + } |
| 205 | + break; |
| 206 | + case '\n': b.append(multiline ? "\n" : "\\n"); break; |
| 207 | + case '\r': b.append(multiline ? "\r" : "\\r"); break; |
| 208 | + case '\t': b.append("\\t"); break; |
| 209 | + case '\b': b.append("\\b"); break; |
| 210 | + case '\f': b.append("\\f"); break; |
| 211 | + default: |
| 212 | + if (c < 0x20) b.append(String.format("\\u%04X", (int) c)); |
| 213 | + else b.append(c); |
| 214 | + } |
| 215 | + } |
| 216 | + return b.toString(); |
| 217 | + } |
| 218 | + |
| 219 | + private String formatBNodeAsPropertyList(Resource blank, Set<Resource> path) { |
| 220 | + if (hasLabel(blank)) return getLabel(blank); |
| 221 | + if (!path.add(blank)) return allocLabel(blank); |
| 222 | + |
| 223 | + List<Statement> props = model.listStatements(blank, null, (RDFNode) null).toList(); |
| 224 | + if (props.isEmpty()) return "[]"; |
| 225 | + |
| 226 | + props.sort(Comparator |
| 227 | + .comparing(Statement::getPredicate, PREDICATE_ORDER) |
| 228 | + .thenComparing(s -> formatNode(s.getObject()))); |
| 229 | + |
| 230 | + StringBuilder builder = new StringBuilder("[ "); |
| 231 | + for (Statement stmt : props) { |
| 232 | + String predStr = stmt.getPredicate().equals(RDF.type) ? "a" : formatNode(stmt.getPredicate()); |
| 233 | + String objStr = formatNodeWithPath(stmt.getObject(), path); |
| 234 | + builder.append(predStr).append(" ").append(objStr).append(" ; "); |
| 235 | + } |
| 236 | + builder.append("]"); |
| 237 | + path.remove(blank); |
| 238 | + return builder.toString(); |
| 239 | + } |
| 240 | + |
| 241 | + private String formatNodeWithPath(RDFNode node, Set<Resource> path) { |
| 242 | + if (node.isAnon()) { |
| 243 | + Resource br = node.asResource(); |
| 244 | + if (hasLabel(br)) return getLabel(br); |
| 245 | + if (inDegreeOf(br) <= 1) return formatBNodeAsPropertyList(br, path); |
| 246 | + return allocLabel(br); |
| 247 | + } |
| 248 | + return formatNode(node); |
| 249 | + } |
| 250 | + |
| 251 | + private int inDegreeOf(Resource r) { return inDegree.getOrDefault(r, 0); } |
| 252 | + private boolean hasLabel(Resource r) { return bnodeLabels.containsKey(r); } |
| 253 | + private String getLabel(Resource r) { return bnodeLabels.get(r); } |
| 254 | + private String allocLabel(Resource r) { |
| 255 | + return bnodeLabels.computeIfAbsent(r, k -> "_:b" + (bCounter++)); |
| 256 | + } |
| 257 | + |
| 258 | +} |
0 commit comments