-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcsv2skos.py
More file actions
140 lines (115 loc) · 6.34 KB
/
csv2skos.py
File metadata and controls
140 lines (115 loc) · 6.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import pandas as pd
import datetime
import math
import os
from rdflib import Graph, URIRef, BNode, Literal, Namespace, XSD
from rdflib.namespace import SKOS, RDF, DC, PROV, DCTERMS, RDFS
# Column names recognised in the scheme CSV.
# Each entry maps a column name to a 3-tuple:
#   (RDF predicate, constructor used for the object node, isLangString)
# where isLangString marks values that are emitted as language-tagged literals.
schemePropertyDict = {
    "title": (DC.title, Literal, True),
    "description": (SKOS.definition, Literal, True),
    "creator": (DC.creator, Literal, False),
    "publisher": (DCTERMS.publisher, Literal, False),
    "license": (DCTERMS.license, Literal, False),
    "rights": (DCTERMS.rights, Literal, False),
    "contributor": (DCTERMS.contributor, Literal, False),
    "subject": (DCTERMS.subject, Literal, True),
}
# Column names recognised in the concept CSV.
# Each entry maps a column name to a 3-tuple:
#   (RDF predicate, constructor used for the object node, isLangString)
# where isLangString marks values that are emitted as language-tagged literals.
conceptPropertyDict = {
    # identification and labelling
    "notation": (SKOS.notation, Literal, False),
    "prefLabel": (SKOS.prefLabel, Literal, True),
    "altLabel": (SKOS.altLabel, Literal, True),
    "definition": (SKOS.definition, Literal, True),
    # relations between concepts, emitted as URI references
    "broader": (SKOS.broader, URIRef, False),
    "narrower": (SKOS.narrower, URIRef, False),
    "related": (SKOS.related, URIRef, False),
    # mapping relations, emitted as URI references
    "closeMatch": (SKOS.closeMatch, URIRef, False),
    "relatedMatch": (SKOS.relatedMatch, URIRef, False),
    "exactMatch": (SKOS.exactMatch, URIRef, False),
    # plain literals without a language tag
    "source": (DC.source, Literal, False),
    "creator": (DC.creator, Literal, False),
    "seeAlso": (RDFS.seeAlso, Literal, False),
}
def buildTriples(g, df, subject, propertyDict, conceptScheme, conceptPrefix, seperator, baseLanguage):
    """Add the triples described by every row of ``df`` to the graph ``g``.

    Args:
        g: Graph to extend; only its ``add`` method is used, with
            ``(subject, predicate, object)`` tuples.
        df: DataFrame whose column names are matched against the keys of
            ``propertyDict``. Non-missing cells must be strings, because they
            are split with ``seperator``.
        subject: Mode switch. ``"scheme"`` means every row describes
            ``conceptScheme`` itself; any other value means every row
            describes a concept whose URI is ``conceptPrefix`` plus the
            row's ``"notation"`` value.
        propertyDict: Maps a column name to a tuple
            ``(predicate, object constructor, isLangString)``.
        conceptScheme: Node of the concept scheme.
        conceptPrefix: String prepended to notations and to the values of
            broader / narrower / related cells to form concept URIs.
        seperator: Delimiter between the values of a multi-value cell.
        baseLanguage: Language tag used for language strings that carry no
            ``@lang`` suffix.

    Returns:
        The same graph object ``g`` that was passed in.

    Raises:
        KeyError: In concept mode, if a row has no ``"notation"`` column.
    """
    # Decide the mode once. Rebinding ``subject`` inside the loop (as the
    # previous version did) lost the "scheme" marker after the first row, so
    # later scheme rows were handled as concepts.
    describesScheme = subject == "scheme"
    hierarchyProperties = (SKOS.broader, SKOS.narrower, SKOS.related)

    for _, row in df.iterrows():
        rowDict = dict(row.items())

        if describesScheme:
            node = conceptScheme
        else:
            node = URIRef(conceptPrefix + rowDict["notation"])
            g.add((node, RDF.type, SKOS.Concept))
            g.add((node, SKOS.inScheme, conceptScheme))

        for key, (predicate, datatype, isLangString) in propertyDict.items():
            if key not in rowDict:
                continue
            value = rowDict[key]

            # pandas represents an empty cell as float NaN.
            if isinstance(value, float) and math.isnan(value):
                # A row with an empty "broader" cell is a top concept.
                if key == "broader":
                    g.add((conceptScheme, SKOS.hasTopConcept, node))
                    g.add((node, SKOS.topConceptOf, conceptScheme))
                continue

            # Languages already used within this cell; needed to demote
            # surplus prefLabels of the same language to altLabels.
            seenLanguages = set()

            for entry in value.split(seperator):
                if predicate in hierarchyProperties:
                    entry = conceptPrefix + entry

                if isLangString:
                    # Split on the LAST "@" only, so text that itself contains
                    # "@" no longer breaks the two-value unpacking.
                    text, marker, language = entry.rpartition("@")
                    if not marker:
                        text, language = entry, baseLanguage

                    if predicate == SKOS.prefLabel and language in seenLanguages:
                        print(f"Multiple prefLabels for language @{language} at concept {node}. Switching to altLabel.")
                        g.add((node, SKOS.altLabel, Literal(text, lang=language)))
                    else:
                        g.add((node, predicate, Literal(text, lang=language)))
                    seenLanguages.add(language)
                else:
                    g.add((node, predicate, datatype(entry)))
                    # Add the inverse hierarchy link when the table has no
                    # column that would state it explicitly.
                    if predicate == SKOS.broader and "narrower" not in rowDict:
                        g.add((URIRef(entry), SKOS.narrower, node))
                    if predicate == SKOS.narrower and "broader" not in rowDict:
                        g.add((URIRef(entry), SKOS.broader, node))
    return g
def main(conceptCsvPath, schemeCsvPath, scriptRepositoryPath, seperator, baseLanguage, baseUri, outputPath="thesaurus.ttl"):
    """Convert a concept CSV and a scheme CSV into a SKOS thesaurus in Turtle.

    Args:
        conceptCsvPath: Path of the CSV file holding one concept per row.
        schemeCsvPath: Path of the CSV file holding the scheme metadata.
        scriptRepositoryPath: URL used as the URI of the PROV software agent.
        seperator: Delimiter between the values of a multi-value cell.
        baseLanguage: Language tag for values without an ``@lang`` suffix.
        baseUri: URI of the concept scheme; concept URIs are ``baseUri + "/"``
            followed by the concept's notation.
        outputPath: Destination of the serialized Turtle file. Defaults to
            ``"thesaurus.ttl"``, the path that used to be hard-coded.
    """
    # initialization of graph and provenance entities
    g = Graph()
    thesaurusCreation = BNode()
    g.add((thesaurusCreation, RDF.type, PROV.Activity))
    g.add((thesaurusCreation, PROV.startedAtTime, Literal(datetime.datetime.now(), datatype=XSD.dateTime)))
    pythonScript = URIRef(scriptRepositoryPath)
    g.add((pythonScript, RDF.type, PROV.SoftwareAgent))
    g.add((thesaurusCreation, PROV.wasAssociatedWith, pythonScript))

    # Read every column as text. With type inference, numeric-looking
    # notations become numbers: leading zeros are lost and the string
    # operations in buildTriples fail. Empty cells are still read as NaN.
    conceptsDf = pd.read_csv(conceptCsvPath, dtype=str)
    schemeDf = pd.read_csv(schemeCsvPath, dtype=str)

    # create concept scheme and connect to provenance
    conceptScheme = URIRef(baseUri)
    g.add((conceptScheme, RDF.type, SKOS.ConceptScheme))
    g.add((conceptScheme, RDF.type, PROV.Entity))
    g.add((conceptScheme, PROV.wasGeneratedBy, thesaurusCreation))
    g.add((conceptScheme, PROV.wasAttributedTo, pythonScript))
    conceptPrefix = baseUri + "/"

    # enrich concept scheme with metadata
    g = buildTriples(g, schemeDf, "scheme", schemePropertyDict, conceptScheme, conceptPrefix, seperator, baseLanguage)
    # create concepts and connect them to concept scheme
    g = buildTriples(g, conceptsDf, "concept", conceptPropertyDict, conceptScheme, conceptPrefix, seperator, baseLanguage)

    # add end time for thesaurus creation activity
    g.add((thesaurusCreation, PROV.endedAtTime, Literal(datetime.datetime.now(), datatype=XSD.dateTime)))

    # save graph to file
    g.serialize(destination=outputPath, format="turtle")
# Input tables: one CSV with the concepts, one with the scheme metadata.
conceptCsvPath = "concepts.csv"
schemeCsvPath = "scheme.csv"

# URL of this script; main() uses it as the URI of the provenance software agent.
scriptRepositoryPath = "https://github.com/LasseMempel/csv2skos/blob/master/csv2skos.py"

# Delimiter between the individual values of a multi-value CSV cell.
seperator = "|"

# Language tag applied to values that carry no "@lang" suffix of their own.
baseLanguage = "de"

# URI of the thesaurus; concept URIs are built beneath it.
baseUri = "https://www.example.com/terminologies/sausagethesaurus"

main(
    conceptCsvPath=conceptCsvPath,
    schemeCsvPath=schemeCsvPath,
    scriptRepositoryPath=scriptRepositoryPath,
    seperator=seperator,
    baseLanguage=baseLanguage,
    baseUri=baseUri,
)