Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions cds_migrator_kit/rdm/records/transform/models/courier.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,6 @@ class CourierIssueModel(CdsOverdo):
"690C_a",
"690__a", # only CERN value
"700__m",
"773__y",
"773__n",
"773__p",
"773__c",
"773__v",
"0248_q",
"8564_8",
"8564_s",
Expand All @@ -50,7 +45,6 @@ class CourierIssueModel(CdsOverdo):
}

_default_fields = {
"resource_type": {"id": "publication-other"},
"custom_fields": {"journal:journal": {"title": "CERN Courier"}},
"creators": [{"person_or_org": {"type": "organizational", "name": "CERN"}}],
}
Expand Down
6 changes: 5 additions & 1 deletion cds_migrator_kit/rdm/records/transform/models/it.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ class ITModel(CdsOverdo):
"6531_9", # Keyword provenance
"700__m", # Author's email
"710__b", # Group name, TBD https://cds.cern.ch/record/2258345/export/hm?ln=en
"710__9", # Group name, TBD https://cds.cern.ch/record/2258345/export/hm?ln=en
"720__a", # Author's duplicate
"773__a", # Duplicate DOI
"773__o", # Duplicate meeting title
Expand All @@ -86,7 +87,10 @@ class ITModel(CdsOverdo):
"8564_8", # Files system field
"8564_s", # Files system field
"8564_x", # Files system field - Icon
"8564_z", # File comment, handled on files level, not MARC
"8564_q", # Files system field - Link
"856418", # Files system field
"85641q", # Files system field - Link
"8564_y", # Files / URLS label
"85641g",
"85641m",
Expand All @@ -110,7 +114,7 @@ class ITModel(CdsOverdo):
"961__h", # CDS modification tag
"961__l", # CDS modification tag
"961__x", # CDS modification tag
"964__a", # Item usualy 0001?
"964__a", # Item usually 0001?
"981__a", # duplicated record marker
"999C50", # https://cds.cern.ch/record/2284609/export/hm?ln=en CMS contributions
"999C52", # https://cds.cern.ch/record/2640188/export/hm?ln=en
Expand Down
3 changes: 2 additions & 1 deletion cds_migrator_kit/rdm/records/transform/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -754,7 +754,7 @@ def _communities_ids(self, entry, record):
communities = record.get("communities", [])
communities = self.communities_ids + [slug for slug in communities]
if communities:
return {"ids": communities, "default": self.communities_ids}
return {"ids": communities, "default": self.communities_ids[0]}
return {}

def _parent(self, entry, record):
Expand Down Expand Up @@ -903,6 +903,7 @@ def compute_files(file_dump, versions_dict):
"description": file["description"],
"name": file["name"],
"status": file["status"],
"comment": file["comment"],
},
"mimetype": file["mime"],
"checksum": file["checksum"],
Expand Down
59 changes: 46 additions & 13 deletions cds_migrator_kit/rdm/records/transform/xml_processing/rules/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
KEYWORD_SCHEMES_TO_DROP,
PID_SCHEMES_TO_STORE_IN_IDENTIFIERS,
RECOGNISED_KEYWORD_SCHEMES,
udc_pattern, IDENTIFIERS_SCHEMES_TO_DROP,
udc_pattern,
IDENTIFIERS_SCHEMES_TO_DROP,
)
from cds_migrator_kit.rdm.records.transform.models.base_record import (
rdm_base_record_model as model,
Expand Down Expand Up @@ -359,7 +360,6 @@ def identifiers(self, key, value):
if scheme.lower() == "arxiv":
id_value = id_value.replace("oai:arXiv.org:", "arXiv:")
if scheme.lower() == "cern annual report":

additional_descriptions = self.get("additional_descriptions", [])
new_desc = {
"description": f"{scheme} {id_value}",
Expand All @@ -384,6 +384,7 @@ def identifiers(self, key, value):
scheme = "cds"
if scheme.lower() == "inspire":
validate_inspire_identifier(id_value, key)

rel_id = {"scheme": scheme.lower(), "identifier": id_value}
if scheme.lower() == "admbul":
legacy_scheme = scheme
Expand Down Expand Up @@ -639,8 +640,7 @@ def copyrights(self, key, value):
return f"{year} © {holder}. {statement} {url}".strip()


@model.over("related_identifiers", "^8564_")
@model.over("related_identifiers", "^8564_")
@model.over("related_identifiers", "^8564[1_]")
@for_each_value
def urls(self, key, value, subfield="u"):
"""Translates urls field."""
Expand Down Expand Up @@ -770,28 +770,60 @@ def yellow_reports(self, key, value):
@for_each_value
def related_identifiers_787(self, key, value):
"""Translates related identifiers."""
description = value.get("i")
description = value.get("i", "").lower()
recid = value.get("w")
new_id = {}
rel_ids = self.get("related_identifiers", [])
if "https://cds.cern.ch/record/" in recid:

if recid and "https://cds.cern.ch/record/" in recid:
recid = recid.replace("https://cds.cern.ch/record/", "")
new_id = {
"identifier": recid,
"scheme": "cds",
"relation_type": {"id": "references"},

relation_map = {
"issue": {
"relation_type": {"id": "ispublishedin"},
"resource_type": {"id": "publication-periodicalissue"},
},
"slides": {
"relation_type": {"id": "references"},
"resource_type": {"id": "presentation"},
},
"conference paper": {
"relation_type": {"id": "references"},
"resource_type": {"id": "publication-conferencepaper"},
},
}

if recid:
if description:
new_id = {
"identifier": recid,
"scheme": "cds",
**relation_map[description],
}
elif not description or description not in relation_map.keys():
new_id = {
"identifier": recid,
"scheme": "cds",
"relation_type": {"id": "references"},
"resource_type": {"id": "other"},
}
else:
raise UnexpectedValue(
f"Unexpected relation description {description}", field=key, value=value
)

report_number = value.get("r")
if report_number:
if not recid and report_number:
report_id = {
"identifier": report_number,
"scheme": "cdsrn",
"relation_type": {"id": "references"},
"resource_type": {"id": "other"},
}
if report_id not in rel_ids:
rel_ids.append(report_id)
self["related_identifiers"] = rel_ids
if new_id not in rel_ids:
if new_id and new_id not in rel_ids:
return new_id

raise IgnoreKey("related_identifiers")
Expand All @@ -809,8 +841,9 @@ def related_identifiers(self, key, value):
"identifier": recid,
"scheme": "cds",
"relation_type": {"id": "references"},
"resource_type": {"id": "other"},
}
if new_id not in rel_ids:
if recid and new_id not in rel_ids:
return new_id
raise IgnoreKey("related_identifiers")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -392,34 +392,3 @@ def resource_type(self, key, value):
raise UnexpectedValue(
"Unknown resource type (BULLETIN)", field=key, value=value
)


@model.over("related_identifiers", "^787[0_]_", override=True)
@for_each_value
def related_identifiers(self, key, value):
"""Translates related identifiers."""
rel_ids = self.setdefault("related_identifiers", [])

description = value.get("i")
new_ids = []

if description == "issue":
recid = value.get("w")
if recid:
new_ids.append(
{
"identifier": recid,
"scheme": "cds",
"relation_type": {"id": "ispublishedin"},
"resource_type": {"id": "publication-periodicalissue"},
}
)
else:
new_ids.extend(base_related_identifiers(self, key, value))

for new_id in new_ids:
if new_id not in rel_ids:
rel_ids.append(new_id)

self["related_identifiers"] = rel_ids
raise IgnoreKey("bull_related_identifiers")
81 changes: 41 additions & 40 deletions cds_migrator_kit/rdm/records/transform/xml_processing/rules/it.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,14 @@
)
from cds_migrator_kit.transform.xml_processing.quality.parsers import StringValue

from ...config import IGNORED_THESIS_COLLECTIONS
from ...models.it import it_model as model
from .base import additional_titles as base_additional_titles
from .base import custom_fields_693 as base_custom_fields_693
from .base import normalize
from .base import note as base_internal_notes
from .base import subjects as base_subjects
from .base import urls
from .base import yellow_reports as base_yellow_reports
from .publications import imprint_info as base_publication_imprint_info
from .publications import issn as base_publications_issn
from .publications import journal as base_journal
from .publications import related_identifiers as base_publications_related_identifiers

Expand All @@ -48,18 +45,20 @@ def resource_type(self, key, value):

value_b = value.get("b", "")

# first has highest priority
priority = {
v: i
for i, v in enumerate(
[
"note",
"intnotetspubl",
"intnoteitpubl",
"preprint",
"article",
"slides",
"itcerntalk",
"bookchapter",
"conferencepaper",
"itcerntalk",
"article",
"preprint",
"intnoteitpubl",
"intnotetspubl",
"note",
]
)
}
Expand All @@ -83,11 +82,12 @@ def resource_type(self, key, value):
mapping = {
"preprint": {"id": "publication-preprint"},
"conferencepaper": {"id": "publication-conferencepaper"},
"article": {"id": "publication"},
"article": {"id": "publication-article"},
"note": {"id": "publication-technicalnote"},
"brochure": {"id": "publication-brochure"},
"itcerntalk": {"id": "presentation"},
"peri": {"id": "publication-periodicalissue"},
"slides": {"id": "presentation"},
"peri": {"id": "publication-periodical"},
"intnoteitpubl": {"id": "publication-technicalnote"},
"intnotetspubl": {"id": "publication-technicalnote"},
"bookchapter": {"id": "publication-section"},
Expand Down Expand Up @@ -208,6 +208,20 @@ def corporate_author(self, key, value):
@for_each_value
def meeting(self, key, value):
"""Translates additional description."""
published_in = value.get("e", "").strip().lower()

if published_in:
_related_identifiers = self.setdefault("related_identifiers", [])
_related_identifiers.append(
{
"identifier": published_in,
"scheme": "cds",
"relation_type": {"id": "ispublishedin"},
"resource_type": {"id": "publication-periodicalissue"},
}
)
self["related_identifiers"] = _related_identifiers

_custom_fields = self.setdefault("custom_fields", {})
meeting_fields = _custom_fields.get("meeting:meeting", {})
if value.get("t"):
Expand Down Expand Up @@ -235,27 +249,6 @@ def imprint(self, key, value):
raise IgnoreKey("imprint_info")


@model.over("notes", "^8564_", override=True)
@for_each_value
def notes(self, key, value):
"""Translate internal notes"""
url = value.get("u", "")
note = StringValue(value.get("z", "")).parse()
if url:
related_identifiers = self.get("related_identifiers", [])
url_entries = urls(self, key, value)
for entry in url_entries:
if entry not in related_identifiers:
related_identifiers.append(entry)
self["related_identifiers"] = related_identifiers

elif note:
_internal_notes = self.get("internal_notes", [])
_internal_notes.append(note)
self["internal_notes"] = _internal_notes
raise IgnoreKey("notes")


@model.over(
"subjects",
"(^6931_)|(^650[12_][7_])|(^653[12_]_)|(^695__)|(^694__)|(^69531_)",
Expand Down Expand Up @@ -422,10 +415,6 @@ def imprint_info(self, key, value):
if key.startswith("260"):
base_publication_imprint_info(self, key, value)
else:
from cds_migrator_kit.rdm.migration_config import CDS_RECORDS_TO_UNMERGE

if self["recid"] in CDS_RECORDS_TO_UNMERGE:
raise IgnoreKey("publication_date")
publication_date_str = value.get("a")
if publication_date_str:
try:
Expand Down Expand Up @@ -460,17 +449,25 @@ def related_works(self, key, value):
"identifier": recid,
"scheme": "cds",
"relation_type": {"id": relation_type},
"resource_type": {"id": "other"},
}
if new_id not in rel_ids:
return new_id

raise IgnoreKey("related_identifiers")


@model.over("additional_descriptions", "(^85641)")
@model.over("additional_descriptions_it", "^8564[1_]", override=True)
@for_each_value
def series(self, key, value):
"""Translates additional descriptinn and url."""
"""Translates additional descriptions and url."""
content_type = value.get("x", "")
if content_type == "icon":
# ignore icon urls (conditionally ignoring by accessing the value)
url_q = value.get("q", "")
url_u = value.get("u", "")
raise IgnoreKey("url_identifiers")

description = StringValue(value.get("3")).parse()
url = value.get("u", "")
if url:
Expand All @@ -481,8 +478,12 @@ def series(self, key, value):
related_identifiers.append(entry)
self["related_identifiers"] = related_identifiers
if description:
return {"description": description, "type": {"id": "series-information"}}
raise IgnoreKey("additional_descriptions")
_additional_descriptions = self.setdefault("additional_descriptions", [])
_additional_descriptions.append(
{"description": description, "type": {"id": "series-information"}}
)
self["additional_descriptions"] = _additional_descriptions
raise IgnoreKey("additional_descriptions_it")


@model.over("additional_titles", "^246_[3]")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,11 @@ def resource_type(self, key, value):
if value == "contributionsfromindico":
_additional_descriptions = self.get("additional_descriptions", [])
_additional_descriptions.append(
{"description": "Indico event contribution", "type": {"id": "technical-info"}})
{
"description": "Indico event contribution",
"type": {"id": "technical-info"},
}
)
self["additional_descriptions"] = _additional_descriptions

_subjects = self.get("subjects", [])
Expand Down
Loading
Loading