diff --git a/cap/config.py b/cap/config.py index b5768d10c6..232ce647a0 100644 --- a/cap/config.py +++ b/cap/config.py @@ -642,3 +642,6 @@ def _(x): # ================ REANA_ACCESS_TOKEN = os.environ.get( 'APP_REANA_ACCESS_TOKEN', None) + +SIPSTORE_DEFAULT_AGENT_JSONSCHEMA = 'sipstore/agent-v0.0.1.json' +SIPSTORE_DEFAULT_BAGIT_JSONSCHEMA = 'sipstore/bagit-v0.0.1.json' diff --git a/cap/jsonschemas/sipstore/agent-v0.0.1.json b/cap/jsonschemas/sipstore/agent-v0.0.1.json new file mode 100644 index 0000000000..7f4d2b42d6 --- /dev/null +++ b/cap/jsonschemas/sipstore/agent-v0.0.1.json @@ -0,0 +1,22 @@ +{ + "allow_all": true, + "experiment": null, + "fullname": null, + "is_deposit": false, + "jsonschema": { + "type": "object", + "title": "SIPStore Agent schema.", + "description": "User agent information making the SIP.", + "properties": { + "orcid": { + "type": "string" + }, + "email": { + "type": "string" + }, + "ip_address": { + "type": "string" + } + } + } +} diff --git a/cap/jsonschemas/sipstore/bagit-v0.0.1.json b/cap/jsonschemas/sipstore/bagit-v0.0.1.json new file mode 100644 index 0000000000..d9319a9c8a --- /dev/null +++ b/cap/jsonschemas/sipstore/bagit-v0.0.1.json @@ -0,0 +1,77 @@ +{ + "allow_all": true, + "experiment": null, + "fullname": null, + "is_deposit": false, + "jsonschema": { + "definitions": { + "file": { + "type": "object", + "title": "Archived file information.", + "description": "JSON describing a single file.", + "additionalProperties": false, + "properties": { + "filepath": { + "description": + "Filepath to the archived file, relative to the archived directory root.", + "type": "string" + }, + "fullpath": { + "description": + "Absolute filepath to the file in the archive file system.", + "type": "string" + }, + "size": { + "description": "Size of the file in bytes.", + "type": "number" + }, + "checksum": { + "description": + "MD5 checksum of the file. Always starts with 'md5:' prefix.", + "type": "string" + }, + "file_uuid": { + "description": + "UUID of the related FileInstance object. Used for Record's data files only.", + "type": "string" + }, + "metadata_id": { + "description": + "ID of the type (SIPMetadataType.id) of the related SIPMetadata object. Used for Record's metadata files only.", + "type": "number" + }, + "sipfilepath": { + "description": + "Original SIPFile.filepath value. Used for Record's data files only.", + "type": "string" + }, + "filename": { + "description": + "Filename of the SIPFile in the archive. Used for Record's data files only.", + "type": "string" + }, + "content": { + "description": + "Text-content of the file. Used for BagIt metadata files only.", + "type": "string" + }, + "fetched": { + "description": + "Marks whether given file is fetched from another bag (specified in 'fetch.txt'). If the key does not exist or is set to false, it is assumed that the file is written down in the bag, hence NOT fetched. Used for Record's data files only.", + "type": "boolean" + } + }, + "required": ["filepath", "fullpath", "size", "checksum"] + } + }, + "properties": { + "files": { + "description": "All files stored in this archive package.", + "type": "array", + "items": { + "$ref": "#/definitions/file" + } + } + } + } +} diff --git a/cap/jsonschemas/sipstore/file-v0.0.1.json b/cap/jsonschemas/sipstore/file-v0.0.1.json new file mode 100644 index 0000000000..47eb08e35e --- /dev/null +++ b/cap/jsonschemas/sipstore/file-v0.0.1.json @@ -0,0 +1,55 @@ +{ + "allow_all": true, + "experiment": null, + "fullname": null, + "is_deposit": false, + "jsonschema": { + "properties": { + "filepath": { + "description": + "Filepath to the archived file, relative to the archived directory root.", + "type": "string" + }, + "fullpath": { + "description": + "Absolute filepath to the file in the archive file system.", + "type": "string" + }, + "size": { + "description": "Size of the file in bytes.", + "type": "number" + }, + "checksum": { + "description": + "MD5 checksum of the file. Always starts with 'md5:' prefix.", + "type": "string" + }, + "file_uuid": { + "description": + "UUID of the related FileInstance object. Used for Record's data files only.", + "type": "string" + }, + "metadata_id": { + "description": + "ID of the type (SIPMetadataType.id) of the related SIPMetadata object. Used for Record's metadata files only.", + "type": "number" + }, + "sipfilepath": { + "description": + "Original SIPFile.filepath value. Used for Record's data files only.", + "type": "string" + }, + "filename": { + "description": + "Filename of the SIPFile in the archive. Used for Record's data files only.", + "type": "string" + }, + "content": { + "description": + "Text-content of the file. Used for BagIt metadata files only.", + "type": "string" + } + }, + "required": ["filepath", "fullpath", "size", "checksum"] + } +} diff --git a/cap/modules/deposit/api.py b/cap/modules/deposit/api.py index 2082a67c01..380cc4718c 100644 --- a/cap/modules/deposit/api.py +++ b/cap/modules/deposit/api.py @@ -27,8 +27,6 @@ from __future__ import absolute_import, print_function import copy -import shutil -import tempfile from copy import deepcopy from functools import wraps @@ -43,15 +41,21 @@ from invenio_files_rest.errors import MultipartMissingParts from invenio_files_rest.models import Bucket, FileInstance, ObjectVersion from invenio_jsonschemas.errors import JSONSchemaNotFound +from invenio_pidstore.errors import PIDDoesNotExistError from invenio_records.models import RecordMetadata from invenio_records_files.models import RecordsBuckets from invenio_rest.errors import FieldError + +from invenio_sipstore.api import RecordSIP, SIP as SIPApi +from invenio_sipstore.archivers import BagItArchiver +from invenio_sipstore.models import SIP as SIPModel, \ + RecordSIP as RecordSIPModel + from jsonschema.validators import Draft4Validator, RefResolutionError from sqlalchemy.exc import IntegrityError from sqlalchemy.orm.exc import NoResultFound from werkzeug.local import LocalProxy -from cap.config import FILES_URL_MAX_SIZE from cap.modules.records.api import CAPRecord from cap.modules.repoimporter.repo_importer import RepoImporter from cap.modules.schemas.models import Schema @@ -59,13 +63,14 @@ from cap.modules.user.utils import (get_existing_or_register_role, get_existing_or_register_user) -from .errors import (DepositValidationError, FileUploadError, +from .errors import (ArchivingError, DepositValidationError, FileUploadError, UpdateDepositPermissionsError) from .fetchers import cap_deposit_fetcher from .minters import cap_deposit_minter from .permissions import (AdminDepositPermission, CloneDepositPermission, DepositAdminActionNeed, DepositReadActionNeed, DepositUpdateActionNeed, UpdateDepositPermission) +from .utils import compare_files, task_commit, ensure_content_length _datastore = LocalProxy(lambda: current_app.extensions['security'].datastore) @@ -197,7 +202,52 @@ def publish(self, *args, **kwargs): if file_.data['checksum'] is None: raise MultipartMissingParts() - return super(CAPDeposit, self).publish(*args, **kwargs) + try: + _, last_record = self.fetch_published() + is_first_publishing = False + fetched_files = last_record.files + create_sip_files = not compare_files(fetched_files, self.files) + except (PIDDoesNotExistError, KeyError): + is_first_publishing = True + create_sip_files = True if self.files else False + + deposit = super(CAPDeposit, self).publish(*args, **kwargs) + recid, record = deposit.fetch_published() + sip_patch_of = None + if not is_first_publishing: + sip_recid = recid + + sip_patch_of = ( + db.session.query(SIPModel) + .join(RecordSIPModel, RecordSIPModel.sip_id == SIPModel.id) + .filter(RecordSIPModel.pid_id == sip_recid.id) + .order_by(SIPModel.created.desc()) + .first() + ) + + recordsip = RecordSIP.create( + recid, record, archivable=True, + create_sip_files=create_sip_files, + sip_metadata_type='json', + user_id=current_user.id, + agent=None) + + archiver = BagItArchiver( + recordsip.sip, include_all_previous=(not is_first_publishing), + patch_of=sip_patch_of) + + archiver.save_bagit_metadata() + + sip = ( + RecordSIPModel.query + .filter_by(pid_id=recid.id) + .order_by(RecordSIPModel.created.desc()) + .first().sip + ) + + archive_sip.delay(str(sip.id)) + + return deposit @mark_as_action def upload(self, pid=None, *args, **kwargs): @@ -601,32 +651,31 @@ def download_repo(pid, url, filename): task_commit(record, response.raw, filename, total) -def task_commit(record, response, filename, total): - """Commit file to the record.""" - record.files[filename].file.set_contents( - response, - default_location=record.files.bucket.location.uri, - size=total - ) - db.session.commit() - - -def ensure_content_length( - url, method='GET', - session=None, - max_size=FILES_URL_MAX_SIZE or 2**20, - *args, **kwargs): - """Add Content-Length when no present.""" - kwargs['stream'] = True - session = session or requests.Session() - r = session.request(method, url, *args, **kwargs) - if 'Content-Length' not in r.headers: - # stream content into a temporary file so we can get the real size - spool = tempfile.SpooledTemporaryFile(max_size) - shutil.copyfileobj(r.raw, spool) - r.headers['Content-Length'] = str(spool.tell()) - spool.seek(0) - # replace the original socket with our temporary file - r.raw._fp.close() - r.raw._fp = spool - return r +@shared_task(ignore_result=True, max_retries=6, + default_retry_delay=4 * 60 * 60) +def archive_sip(sip_uuid): + """Send the SIP for archiving. + + Retries every 4 hours, six times, which should work for up to 24 hours + archiving system downtime. + :param sip_uuid: UUID of the SIP for archiving. + :type sip_uuid: str + """ + try: + sip = SIPApi(SIPModel.query.get(sip_uuid)) + archiver = BagItArchiver(sip) + bagmeta = archiver.get_bagit_metadata(sip) + if bagmeta is None: + raise ArchivingError( + 'Bagit metadata does not exist for SIP: {0}.'.format(sip.id)) + if sip.archived: + raise ArchivingError( + 'SIP was already archived {0}.'.format(sip.id)) + archiver.write_all_files() + sip.archived = True + db.session.commit() + except Exception as exc: + # On ArchivingError (see above), do not retry, but re-raise + if not isinstance(exc, ArchivingError): + archive_sip.retry(exc=exc) + raise diff --git a/cap/modules/deposit/errors.py b/cap/modules/deposit/errors.py index 1ef4b167ca..e6271e29aa 100644 --- a/cap/modules/deposit/errors.py +++ b/cap/modules/deposit/errors.py @@ -28,6 +28,10 @@ from invenio_rest.errors import RESTException +class ArchivingError(Exception): + """Represents a SIP archiving error that can occur during task.""" + + class DepositDoesNotExist(Exception): """Deposit with given key does not exist exception.""" diff --git a/cap/modules/deposit/utils.py b/cap/modules/deposit/utils.py index 3795b5bc51..3e8b142b14 100644 --- a/cap/modules/deposit/utils.py +++ b/cap/modules/deposit/utils.py @@ -27,6 +27,13 @@ from __future__ import absolute_import, print_function +import requests +import shutil +import tempfile + +from invenio_db import db +from cap.config import FILES_URL_MAX_SIZE + def clean_empty_values(data): """Remove empty values from model.""" @@ -36,3 +43,49 @@ def clean_empty_values(data): return [v for v in (clean_empty_values(v) for v in data) if v] return {k: v for k, v in ( (k, clean_empty_values(v)) for k, v in data.items()) if v} + + +def task_commit(record, response, filename, total): + """Commit file to the record.""" + record.files[filename].file.set_contents( + response, + default_location=record.files.bucket.location.uri, + size=total + ) + db.session.commit() + + +def ensure_content_length( + url, method='GET', + session=None, + max_size=FILES_URL_MAX_SIZE or 2**20, + *args, **kwargs): + """Add Content-Length when no present.""" + kwargs['stream'] = True + session = session or requests.Session() + r = session.request(method, url, *args, **kwargs) + if 'Content-Length' not in r.headers: + # stream content into a temporary file so we can get the real size + spool = tempfile.SpooledTemporaryFile(max_size) + shutil.copyfileobj(r.raw, spool) + r.headers['Content-Length'] = str(spool.tell()) + spool.seek(0) + # replace the original socket with our temporary file + r.raw._fp.close() + r.raw._fp = spool + return r + + +def compare_files(files1, files2): + """Compare file lists.""" + if files1 is None or files2 is None: + return False + if len(files1) != len(files2): + return False + + checksums = (f['checksum'] for f in files2) + for f in files1: + if f['checksum'] not in checksums: + return False + + return True diff --git a/cap/modules/fixtures/cli.py b/cap/modules/fixtures/cli.py index 06319fb4ae..0dfe214c6b 100644 --- a/cap/modules/fixtures/cli.py +++ b/cap/modules/fixtures/cli.py @@ -30,6 +30,9 @@ import click from flask_cli import with_appcontext +from invenio_db import db +from invenio_sipstore.models import SIPMetadataType + from cap.modules.experiments.utils.cms import \ cache_das_datasets_in_es_from_file # noqa from cap.modules.experiments.utils.cms import synchronize_cadi_entries @@ -96,3 +99,32 @@ def schemas(dir): add_or_update_schema(fullpath=fullpath.replace(dir, ''), data=json_content) + + +@fixtures.command() +@with_appcontext +def sipmetadata(): + """Load sipmetadata types.""" + data = [ + { + "title": "CAP Alice Record JSON", + "name": "json", + "format": "json", + "schema": "" + }, + { + "title": "BagIt Archiver metadata", + "name": "bagit", + "format": "json", + "schema": "https://analysispreservation.cern.ch/schemas/" + + "sipstore/bagit-v0.0.1.json" + } + ] + + click.secho('Loading SIP metadata types...', fg='blue') + with click.progressbar(data) as types: + with db.session.begin_nested(): + for type in types: + db.session.add(SIPMetadataType(**type)) + db.session.commit() + click.secho('SIP metadata types loaded!', fg='green') diff --git a/requirements-local-forks.txt b/requirements-local-forks.txt index 677ba25f1d..391e915843 100644 --- a/requirements-local-forks.txt +++ b/requirements-local-forks.txt @@ -1,4 +1,5 @@ -e git+git://github.com/annatrz/invenio-deposit.git#egg=invenio-deposit +-e git+git://github.com/inveniosoftware/invenio-sipstore.git@master#egg=invenio-sipstore -e git+git://github.com/reanahub/reana-client.git@master#egg=reana-client -e git+git://github.com/reanahub/reana-commons.git@master#egg=reana-commons -e git+git://github.com/cernanalysispreservation/invenio-oauthclient.git@master#egg=invenio-oauthclient diff --git a/scripts/clean-and-init.sh b/scripts/clean-and-init.sh index 4b3ece171a..82c7d0d577 100644 --- a/scripts/clean-and-init.sh +++ b/scripts/clean-and-init.sh @@ -42,12 +42,14 @@ cap index init # Create default location for files if [[ -z "${DEBUG}" ]]; then cap files location local var/data --default + cap files location archive var/archive fi cap alembic upgrade heads # install schemas in db cap fixtures schemas +cap fixtures sipmetadata # install demo users cap users create info@inveniosoftware.org -a --password infoinfo diff --git a/setup.py b/setup.py index e40b8e023c..23962ed771 100644 --- a/setup.py +++ b/setup.py @@ -78,6 +78,7 @@ 'fs==0.5.4', 'invenio-accounts-rest>=1.0.0a4', 'invenio-oauthclient>=1.0.0', + # 'invenio-sipstore>=1.0.0a7', 'invenio-userprofiles>=1.0.0', 'invenio-query-parser>=0.3.0', 'invenio[{db},{es},base,auth,metadata]~={version}'.format(