Skip to content

Helper functions for the generation of CIDOC CRMish RDF

License

Notifications You must be signed in to change notification settings

acdh-oeaw/acdh-cidoc-pyutils

Folders and files

NameName
Last commit message
Last commit date

Latest commit

0c29c9f · Feb 24, 2025
Dec 13, 2024
Jan 18, 2023
Feb 24, 2025
Feb 24, 2025
Dec 13, 2024
Jan 31, 2025
Jan 18, 2023
Jan 31, 2025
Jan 31, 2025
Dec 13, 2024
Feb 24, 2025

Repository files navigation

flake8 Lint Test codecov PyPI version

acdh-cidoc-pyutils

Helper functions for the generation of CIDOC CRMish RDF (from XML/TEI data)

Installation

  • install via pip install acdh-cidoc-pyutils

Examples

extract cidoc:P14i_performed FRBROO:F51_ Pursuit triples from tei:person/tei:occupation nodes

import lxml.etree as ET
from rdflib import URIRef
rom acdh_cidoc_pyutils import make_occupations, NSMAP
sample = """
<TEI xmlns="http://www.tei-c.org/ns/1.0">
    <person xml:id="DWpers0091" sortKey="Gulbransson_Olaf_Leonhard">
        <persName type="pref">Gulbransson, Olaf</persName>
        <occupation notBefore="1900-12" notAfter="2000" key="#hansi" xml:lang="it">Bürgermeister</occupation>
        <occupation from="1233-02-03" key="#sumsi">Tischlermeister/Fleischhauer</occupation>
        <occupation key="franzi">Sängerin</occupation>
        <occupation>Bäckerin</occupation>
    </person>
</TEI>"""
g, uris = make_occupations(subj, x, "https://foo.bar", id_xpath="@key")
print(g.serialize())
# returns
@prefix ns1: <http://www.cidoc-crm.org/cidoc-crm/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<https://foo/bar/DWpers0091> ns1:P14i_performed <https://foo/bar/DWpers0091/occupation/3>,
        <https://foo/bar/DWpers0091/occupation/franzi>,
        <https://foo/bar/DWpers0091/occupation/hansi>,
        <https://foo/bar/DWpers0091/occupation/sumsi> .

<https://foo/bar/DWpers0091/occupation/3> a <http://iflastandards.info/ns/fr/frbr/frbroo#F51> ;
    rdfs:label "Bäckerin"@de .

<https://foo/bar/DWpers0091/occupation/franzi> a <http://iflastandards.info/ns/fr/frbr/frbroo#F51> ;
    rdfs:label "Sängerin"@de .

<https://foo/bar/DWpers0091/occupation/hansi> a <http://iflastandards.info/ns/fr/frbr/frbroo#F51> ;
    rdfs:label "Bürgermeister"@it ;
    ns1:P4_has_time-span <https://foo/bar/DWpers0091/occupation/hansi/time-span> .

<https://foo/bar/DWpers0091/occupation/hansi/time-span> a ns1:E52_Time-Span ;
    rdfs:label "1900-12 - 2000"^^xsd:string ;
    ns1:P82a_begin_of_the_begin "1900-12"^^xsd:gYearMonth ;
    ns1:P82b_end_of_the_end "2000"^^xsd:gYear .

<https://foo/bar/DWpers0091/occupation/sumsi> a <http://iflastandards.info/ns/fr/frbr/frbroo#F51> ;
    rdfs:label "Tischlermeister/Fleischhauer"@de ;
    ns1:P4_has_time-span <https://foo/bar/DWpers0091/occupation/sumsi/time-span> .

<https://foo/bar/DWpers0091/occupation/sumsi/time-span> a ns1:E52_Time-Span ;
    rdfs:label "1233-02-03 - 1233-02-03"^^xsd:string ;
    ns1:P82a_begin_of_the_begin "1233-02-03"^^xsd:date ;
    ns1:P82b_end_of_the_end "1233-02-03"^^xsd:date .

extract birth/death triples from tei:person

import lxml.etree as ET
from rdflib import URIRef
from acdh_cidoc_pyutils import make_birth_death_entities, NSMAP

sample = """
<TEI xmlns="http://www.tei-c.org/ns/1.0">
    <person xml:id="DWpers0091" sortKey="Gulbransson_Olaf_Leonhard">
        <persName type="pref">Gulbransson, Olaf</persName>
        <birth when="1873-05-26">
            26. 5. 1873<placeName key="#DWplace00139">Christiania (Oslo)</placeName>
        </birth>
        <death>
            <date notBefore-iso="1905-07-04" when="1955" to="2000">04.07.1905</date>
            <settlement key="pmb50">
                <placeName type="pref">Wien</placeName>
                <location><geo>48.2066 16.37341</geo></location>
            </settlement>
        </death>
    </person>
</TEI>"""

doc = ET.fromstring(sample)
x = doc.xpath(".//tei:person[1]", namespaces=NSMAP)[0]
xml_id = x.attrib["{http://www.w3.org/XML/1998/namespace}id"].lower()
item_id = f"https://foo/bar/{xml_id}"
subj = URIRef(item_id)
event_graph, birth_uri, birth_timestamp = make_birth_death_entities(
    subj, x, place_id_xpath="//tei:placeName[1]/@key
)
event_graph, birth_uri, birth_timestamp = make_birth_death_entities(
    subj, x, event_type="death", verbose=True, date_node_xpath="/tei:date[1]",
    place_id_xpath="//tei:settlement[1]/@key"
)
event_graph.serialize(format="turtle")
# returns
@prefix ns1: <http://www.cidoc-crm.org/cidoc-crm/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

# birth example

<https://foo/bar/dwpers0091/birth> a ns1:E67_Birth ;
    rdfs:label "Geburt von Gulbransson, Olaf Leonhard"@fr ;
    ns1:P4_has_time-span <https://foo/bar/dwpers0091/birth/time-span> ;
    ns1:P7_took_place_at <https://foo/bar/DWplace00139> ;
    ns1:P98_brought_into_life <https://foo/bar/dwpers0091> .

<https://foo/bar/dwpers0091/birth/time-span> a ns1:E52_Time-Span ;
    rdfs:label "1873-05-26 - 1873-05-26"^^xsd:string ;
    ns1:P82a_begin_of_the_begin "1873-05-26"^^xsd:date ;
    ns1:P82b_end_of_the_end "1873-05-26"^^xsd:date .

# death example

<https://foo/bar/dwpers0091/death> a ns1:E69_Death ;
    rdfs:label "Geburt von Gulbransson, Olaf Leonhard"@fr ;
    ns1:P100_was_death_of <https://foo/bar/dwpers0091> ;
    ns1:P7_took_place_at <https://foo/bar/pmb50>
    ns1:P4_has_time-span <https://foo/bar/dwpers0091/death/time-span> .

<https://foo/bar/dwpers0091/death/time-span> a ns1:E52_Time-Span ;
    rdfs:label "1905-07-04 - 2000"^^xsd:string ;
    ns1:P82a_begin_of_the_begin "1905-07-04"^^xsd:date ;
    ns1:P82b_end_of_the_end "2000"^^xsd:gYear .

create ns1:P168_place_is_defined_by "Point(456 123)"^^<geo:wktLiteral> . from tei:coords

import lxml.etree as ET
from rdflib import Graph, URIRef, RDF
from acdh_cidoc_pyutils import coordinates_to_p168, NSMAP, CIDOC
sample = """
<TEI xmlns="http://www.tei-c.org/ns/1.0">
    <place xml:id="DWplace00092">
        <placeName type="orig_name">Reval (Tallinn)</placeName>
        <location><geo>123 456</geo></location>
    </place>
</TEI>"""

doc = ET.fromstring(sample)
g = Graph()
for x in doc.xpath(".//tei:place", namespaces=NSMAP):
    xml_id = x.attrib["{http://www.w3.org/XML/1998/namespace}id"].lower()
    item_id = f"https://foo/bar/{xml_id}"
    subj = URIRef(item_id)
    g.add((subj, RDF.type, CIDOC["E53_Place"]))
    g += coordinates_to_p168(subj, x)
print(g.serialize())
# returns
...
    ns1:P168_place_is_defined_by "Point(456 123)"^^<geo:wktLiteral> .
...
  • Function parameter verbose prints information in case the given xpath does not return expected results which is a text node with two numbers separated by a given separator (default value is separator=" ")
  • Function parameter inverse (default: inverse=False) changes the order of the coordinates.

date-like-string to casted rdflib.Literal

from acdh_cidoc_pyutils import date_to_literal d
dates = [
    "1900",
    "1900-01",
    "1901-01-01",
    "foo",
]
for x in dates:
    date_literal = date_to_literal(x)
    print((date_literal.datatype))

# returns
# http://www.w3.org/2001/XMLSchema#gYear
# http://www.w3.org/2001/XMLSchema#gYearMonth
# http://www.w3.org/2001/XMLSchema#date
# http://www.w3.org/2001/XMLSchema#string

make some random URI

from acdh_cidoc_pyutils import make_uri

domain = "https://hansi4ever.com/"
version = "1"
prefix = "sumsi"
uri = make_uri(domain=domain, version=version, prefix=prefix)
print(uri)
# https://hansi4ever.com/1/sumsi/6ead32b8-9713-11ed-8065-65787314013c

uri = make_uri(domain=domain)
print(uri)
# https://hansi4ever.com/8b912e66-9713-11ed-8065-65787314013c

create an E52_Time-Span graph

from acdh_cidoc_pyutils import create_e52, make_uri
uri = make_uri()
e52 = create_e52(uri, begin_of_begin="1800-12-12", end_of_end="1900-01")
print(e52.serialize())
# returns
# @prefix ns1: <http://www.cidoc-crm.org/cidoc-crm/> .
# @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
# @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

# <https://hansi4ever.com/387fb457-971b-11ed-8065-65787314013c> a ns1:E52_Time-Span ;
#     rdfs:label "1800-12-12 - 1900-01"^^xsd:string ;
#     ns1:P82a_begin_of_the_begin "1800-12-12"^^xsd:date ;
#     ns1:P82b_end_of_the_end "1900-01"^^xsd:gYearMonth .

creates E42 from tei:org|place|person

takes a tei:person|place|org node, extracts their @xml:id and all tei:idno elements, derives idoc:E42_Identifier triples and relates them to a passed in subject via cidoc:P1_is_identified_by

import lxml.etree as ET
from rdflib import Graph, URIRef, RDF
from acdh_cidoc_pyutils import make_e42_identifiers, NSMAP, CIDOC
sample = """
<TEI xmlns="http://www.tei-c.org/ns/1.0">
    <place xml:id="DWplace00092">
        <placeName type="orig_name">Reval (Tallinn)</placeName>
        <placeName xml:lang="de" type="simple_name">Reval</placeName>
        <placeName xml:lang="und" type="alt_label">Tallinn</placeName>
        <idno type="pmb">https://pmb.acdh.oeaw.ac.at/entity/42085/</idno>
        <idno type="URI" subtype="geonames">https://www.geonames.org/588409</idno>
        <idno subtype="foobarid">12345</idno>
    </place>
</TEI>"""

doc = ET.fromstring(sample)
g = Graph()
for x in doc.xpath(".//tei:place|tei:org|tei:person|tei:bibl", namespaces=NSMAP):
    xml_id = x.attrib["{http://www.w3.org/XML/1998/namespace}id"].lower()
    item_id = f"https://foo/bar/{xml_id}"
    subj = URIRef(item_id)
    g.add((subj, RDF.type, CIDOC["E53_Place"]))
    g += make_e42_identifiers(
        subj, x, type_domain="http://hansi/4/ever", default_lang="it",
    )
print(g.serialize(format="turtle"))
# returns
@prefix ns1: <http://www.cidoc-crm.org/cidoc-crm/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

<https://foo/bar/dwplace00092> a ns1:E53_Place ;
    ns1:P1_is_identified_by <https://foo/bar/dwplace00092/identifier/DWplace00092>,
        <https://foo/bar/dwplace00092/identifier/idno/0>,
        <https://foo/bar/dwplace00092/identifier/idno/1>,
        <https://foo/bar/dwplace00092/identifier/idno/2> ;
    owl:sameAs <https://pmb.acdh.oeaw.ac.at/entity/42085/>,
        <https://www.geonames.org/588409> .

<http://hansi/4/ever/idno/URI/geonames> a ns1:E55_Type .

<http://hansi/4/ever/idno/foobarid> a ns1:E55_Type .

<http://hansi/4/ever/idno/pmb> a ns1:E55_Type .

<http://hansi/4/ever/xml-id> a ns1:E55_Type .

<https://foo/bar/dwplace00092/identifier/DWplace00092> a ns1:E42_Identifier ;
    rdfs:label "Identifier: DWplace00092"@it ;
    rdf:value "DWplace00092";
    ns1:P2_has_type <http://hansi/4/ever/xml-id> .

<https://foo/bar/dwplace00092/identifier/idno/0> a ns1:E42_Identifier ;
    rdfs:label "Identifier: https://pmb.acdh.oeaw.ac.at/entity/42085/"@it ;
    rdf:value "https://pmb.acdh.oeaw.ac.at/entity/42085/";
    ns1:P2_has_type <http://hansi/4/ever/idno/pmb> .

<https://foo/bar/dwplace00092/identifier/idno/1> a ns1:E42_Identifier ;
    rdfs:label "Identifier: https://www.geonames.org/588409"@it ;
    rdf:value "https://www.geonames.org/588409" 
    ns1:P2_has_type <http://hansi/4/ever/idno/URI/geonames> .

<https://foo/bar/dwplace00092/identifier/idno/2> a ns1:E42_Identifier ;
    rdfs:label "Identifier: 12345"@it ;
    rdf:value "12345";
    ns1:P2_has_type <http://hansi/4/ever/idno/foobarid> .

creates appellations from tei:org|place|person

takes a tei:person|place|org node, extracts persName, placeName and orgName texts, @xml:lang and custom type values and returns cidoc:E33_41 and cidoc:E55 nodes linked via cidoc:P1_is_identified_by and cidoc:P2_has_type

import lxml.etree as ET
from rdflib import Graph, URIRef, RDF
from acdh_cidoc_pyutils import make_appellations, NSMAP, CIDOC

sample = """
<TEI xmlns="http://www.tei-c.org/ns/1.0">
    <place xml:id="DWplace00092">
        <placeName type="orig_name">Reval (Tallinn)</placeName>
        <placeName xml:lang="de" type="simple_name">Reval</placeName>
        <placeName xml:lang="und" type="alt_label">Tallinn</placeName>
        <idno type="pmb">https://pmb.acdh.oeaw.ac.at/entity/42085/</idno>
    </place>
</TEI>"""

doc = ET.fromstring(sample)
g = Graph()
for x in doc.xpath(".//tei:place|tei:org|tei:person|tei:bibl", namespaces=NSMAP):
    xml_id = x.attrib["{http://www.w3.org/XML/1998/namespace}id"].lower()
    item_id = f"https://foo/bar/{xml_id}"
    subj = URIRef(item_id)
    g.add((subj, RDF.type, CIDOC["E53_Place"]))
    g += make_appellations(
        subj, x, type_domain="http://hansi/4/ever", default_lang="it"
    )

g.serialize(format="ttl")
# returns
@prefix ns1: <http://www.cidoc-crm.org/cidoc-crm/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

<https://foo/bar/dwplace00092> a ns1:E53_Place ;
    ns1:P1_is_identified_by <https://foo/bar/dwplace00092/appellation/0>,
        <https://foo/bar/dwplace00092/appellation/1>,
        <https://foo/bar/dwplace00092/appellation/2> .

<http://hansi/4/ever/alt-label> a ns1:E55_Type ;
    rdfs:label "alt_label" .

<http://hansi/4/ever/orig-name> a ns1:E55_Type ;
    rdfs:label "orig_name" .

<http://hansi/4/ever/simple-name> a ns1:E55_Type ;
    rdfs:label "simple_name" .

<https://foo/bar/dwplace00092/appellation/0> a ns1:E33_E41_Linguistic_Appellation ;
    rdfs:label "Reval (Tallinn)"@it ;
    ns1:P2_has_type <http://hansi/4/ever/orig-name> .

<https://foo/bar/dwplace00092/appellation/1> a ns1:E33_E41_Linguistic_Appellation ;
    rdfs:label "Reval"@de ;
    ns1:P2_has_type <http://hansi/4/ever/simple-name> .

<https://foo/bar/dwplace00092/appellation/2> a ns1:E33_E41_Linguistic_Appellation ;
    rdfs:label "Tallinn"@und ;
    ns1:P2_has_type <http://hansi/4/ever/alt-label> .

connects to places (E53_Place) with P89_falls_within

domain = "https://foo/bar/"
subj = URIRef(f"{domain}place__237979")
sample = """
<TEI xmlns="http://www.tei-c.org/ns/1.0">
    <place xml:id="place__237979">
        <placeName>Lerchenfelder G&#252;rtel 48</placeName>
        <desc type="entity_type">Wohngeb&#228;ude (K.WHS)</desc>
        <desc type="entity_type_id">36</desc>
        <location type="coords">
            <geo>48,209035 16,339257</geo>
        </location>
        <location>
            <placeName ref="place__50">Wien</placeName>
            <geo>48,208333 16,373056</geo>
        </location>
    </place>
</TEI>"""
doc = ET.fromstring(sample)
node = doc.xpath(".//tei:place[1]", namespaces=NSMAP)[0]
g = p89_falls_within(
    subj, node, domain, location_id_xpath="./tei:location/tei:placeName/@ref"
)
result = g.serialize(format="ttl")

returns

@prefix ns1: <http://www.cidoc-crm.org/cidoc-crm/> .

<https://foo/bar/place__237979> ns1:P89_falls_within <https://foo/bar/place__50> .

creates E66_Formation and E68_Dissolution events

from acdh_cidoc_pyutils import p95i_was_formed_by
from rdflib import Graph, URIRef


g = Graph()
subj = URIRef("https://wienerschnitzler.org")
label = "Wiener Moderne Verein"
g += p95i_was_formed_by(
    subj, start_date="2023-10-14", end_date="2025-12-31", label=f"{label} wurde gegründet", label_lang="de"
)
result = g.serialize(format="ttl")

returns

@prefix ns1: <http://www.cidoc-crm.org/cidoc-crm/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<https://wienerschnitzler.org> ns1:P95i_was_formed_by <https://wienerschnitzler.org/formation-event> .

<https://wienerschnitzler.org/dissolution-event> a ns1:E68_Dissolution ;
    rdfs:label "Institution wurde aufgelöst"@de ;
    ns1:P4_has_time-span <https://wienerschnitzler.org/dissolution-event/dissolution-time-span> .

<https://wienerschnitzler.org/dissolution-event/dissolution-time-span> a ns1:E52_Time-Span ;
    rdfs:label "2025-12-31"^^xsd:string ;
    ns1:P82a_begin_of_the_begin "2025-12-31"^^xsd:date ;
    ns1:P82b_end_of_the_end "2025-12-31"^^xsd:date .

<https://wienerschnitzler.org/formation-event> a ns1:E66_Formation ;
    rdfs:label "Wiener Moderne Verein wurde gegründet"@de ;
    ns1:P4_has_time-span <https://wienerschnitzler.org/formation-event/formation-time-span> .

<https://wienerschnitzler.org/formation-event/formation-time-span> a ns1:E52_Time-Span ;
    rdfs:label "2023-10-14"^^xsd:string ;
    ns1:P82a_begin_of_the_begin "2023-10-14"^^xsd:date ;
    ns1:P82b_end_of_the_end "2023-10-14"^^xsd:date .

normalize_string

from acdh_cidoc_pyutils import normalize_string
string = """\n\nhallo
mein schatz ich liebe    dich
    du bist         die einzige für mich
        """
print(normalize_string(string))
# returns
# hallo mein schatz ich liebe dich du bist die einzige für mich

extract date attributes (begin, end)

expects typical TEI date attributes like @when, @when-iso, @notBefore, @notAfter, @from, @to, ... and returns a tuple containg start- and enddate values. If only @when or @when-iso or only @notBefore or @notAfter are provided, the returned values are the same, unless the default parameter fill_missing is set to False.

from lxml.etree import Element
from acdh_cidoc_pyutils import extract_begin_end

date_string = "1900-12-12"
date_object = Element("{http://www.tei-c.org/ns/1.0}tei")
date_object.attrib["when-iso"] = date_string
print(extract_begin_end(date_object))

# returns
# ('1900-12-12', '1900-12-12')

date_string = "1900-12-12"
date_object = Element("{http://www.tei-c.org/ns/1.0}tei")
date_object.attrib["when-iso"] = date_string
print(extract_begin_end(date_object, fill_missing=False))

# returns
# ('1900-12-12', None)

date_object = Element("{http://www.tei-c.org/ns/1.0}tei")
date_object.attrib["notAfter"] = "1900-12-12"
date_object.attrib["notBefore"] = "1800"
print(extract_begin_end(date_object))

# returns
# ('1800', '1900-12-12')

Convert a TEI document into an RDF graph representing a CIDOC CRM F24 Publication Expression.

from acdh_cidoc_pyutils import teidoc_as_f24_publication_expression

file_path = "L02643.xml"
domain = "https://schnitzler-briefe.acdh.oeaw.ac.at"

uri, g, mentions = teidoc_as_f24_publication_expression(
    file_path, domain, ".//tei:titleStmt/tei:title[@level='a']"
)
g.serialize(file_name.replace(".xml", ".ttl"))

returns

@prefix ns1: <http://www.cidoc-crm.org/cidoc-crm/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

<https://schnitzler-briefe.acdh.oeaw.ac.at/L02643.xml> a <http://iflastandards.info/ns/fr/frbr/frbroo/F24_Publication_Expression> ;
    rdfs:label "Paul Goldmann an Arthur Schnitzler, 6. 8. 1889"@de ;
    ns1:P1_is_identified_by <https://schnitzler-briefe.acdh.oeaw.ac.at/L02643.xml/appellation> ;
    ns1:P67_refers_to <https://schnitzler-briefe.acdh.oeaw.ac.at/#pmb11485>,
        <https://schnitzler-briefe.acdh.oeaw.ac.at/#pmb12698>,
        <https://schnitzler-briefe.acdh.oeaw.ac.at/#pmb169237>,
        <https://schnitzler-briefe.acdh.oeaw.ac.at/#pmb2121>,
        <https://schnitzler-briefe.acdh.oeaw.ac.at/#pmb213>,
        <https://schnitzler-briefe.acdh.oeaw.ac.at/#pmb29698>,
        <https://schnitzler-briefe.acdh.oeaw.ac.at/#pmb50>,
        <https://schnitzler-briefe.acdh.oeaw.ac.at/#pmb52510>,
        <https://schnitzler-briefe.acdh.oeaw.ac.at/#pmb53101>,
        <https://schnitzler-briefe.acdh.oeaw.ac.at/#pmb53104>,
        <https://schnitzler-briefe.acdh.oeaw.ac.at/#pmb88392> .

<https://pfp-schema.acdh.oeaw.ac.at/types/tei-document> a ns1:E55_Type ;
    rdfs:label "A TEI/XML encoded text"@en .

<https://schnitzler-briefe.acdh.oeaw.ac.at/L02643.xml/appellation> a ns1:E33_E41_Linguistic_Appellation ;
    rdfs:label "Paul Goldmann an Arthur Schnitzler, 6. 8. 1889"@de ;
    ns1:P2_has_type <https://pfp-schema.acdh.oeaw.ac.at/types/tei-document> .

development

  • pip install -r requirements_dev.txt
  • flake8 -> linting
  • coverage run -m pytest -> runs tests and creates coverage stats