# ======================================================================
# File: invenio_vcs/alembic/1754318294_switch_to_generic_git_services.py
# (new file)
# ======================================================================
# -*- coding: utf-8 -*-
# This file is part of Invenio.
# Copyright (C) 2025 CERN.
#
# Invenio is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.

"""Switch to a generic VCS module (not GitHub-specific)."""

import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql
from sqlalchemy_utils import JSONType, UUIDType

# revision identifiers, used by Alembic.
revision = "1754318294"
down_revision = "b0eaee37b545"
# You cannot rename an Alembic branch, so the branch label `invenio-github`
# is kept despite the module being renamed to `invenio-vcs`.
branch_labels = ()
depends_on = None


def upgrade():
    """Create the provider-agnostic ``vcs_*`` tables.

    Only schema is created here; no rows are copied and the pre-existing
    ``github_*`` tables are not touched by this revision.
    """
    op.create_table(
        "vcs_repositories",
        sa.Column("id", UUIDType()),
        sa.Column("provider_id", sa.String(length=255), nullable=False),
        sa.Column(
            "provider", sa.String(length=255), nullable=False, server_default="github"
        ),
        sa.Column("name", sa.String(length=255), nullable=False),
        sa.Column(
            "default_branch",
            sa.String(length=255),
            nullable=False,
            server_default="main",
        ),
        sa.Column("description", sa.String(length=10000)),
        sa.Column("license_spdx", sa.String(length=255)),
        sa.Column("hook", sa.String(length=255)),
        sa.Column("enabled_by_user_id", sa.Integer),
        sa.Column("created", sa.DateTime(), nullable=False),
        sa.Column("updated", sa.DateTime(), nullable=False),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_vcs_repositories")),
        sa.ForeignKeyConstraint(
            ["enabled_by_user_id"],
            ["accounts_user.id"],
            # Named after the table (`vcs_repositories`), like every other
            # constraint created by this revision.
            name=op.f("fk_vcs_repositories_enabled_by_user_id_accounts_user"),
        ),
        sa.UniqueConstraint(
            "provider",
            "provider_id",
            name=op.f("uq_vcs_repositories_provider_provider_id"),
        ),
    )

    op.create_table(
        "vcs_repository_users",
        sa.Column("repository_id", UUIDType()),
        sa.Column("user_id", sa.Integer()),
        sa.Column("created", sa.DateTime(), nullable=False),
        sa.Column("updated", sa.DateTime(), nullable=False),
        sa.PrimaryKeyConstraint(
            "repository_id", "user_id", name=op.f("pk_vcs_repository_users")
        ),
        sa.ForeignKeyConstraint(
            ["repository_id"],
            ["vcs_repositories.id"],
            name=op.f("fk_vcs_repository_users_repository_id_vcs_repositories"),
        ),
        sa.ForeignKeyConstraint(
            ["user_id"],
            ["accounts_user.id"],
            name=op.f("fk_vcs_repository_users_user_id_accounts_user"),
        ),
    )

    # NOTE(review): the `Release` model in invenio_vcs/models.py declares
    # `provider_id`, `tag` and `repository_id` as nullable, whereas they are
    # NOT NULL here. Confirm which side is intended and align the other.
    op.create_table(
        "vcs_releases",
        sa.Column("id", UUIDType()),
        sa.Column("provider_id", sa.String(length=255), nullable=False),
        sa.Column(
            "provider", sa.String(length=255), nullable=False, server_default="github"
        ),
        sa.Column("tag", sa.String(length=255), nullable=False),
        sa.Column(
            "errors",
            sa.JSON()
            .with_variant(postgresql.JSONB(), "postgresql")
            .with_variant(JSONType(), "sqlite")
            .with_variant(JSONType(), "mysql"),
        ),
        sa.Column("repository_id", UUIDType(), nullable=False),
        sa.Column("event_id", UUIDType(), nullable=True),
        sa.Column("record_id", UUIDType()),
        # The model declares `status` as `nullable=False`.
        sa.Column("status", sa.CHAR(1), nullable=False),
        sa.Column("created", sa.DateTime(), nullable=False),
        sa.Column("updated", sa.DateTime(), nullable=False),
        sa.PrimaryKeyConstraint("id", name=op.f("pk_vcs_releases")),
        sa.ForeignKeyConstraint(
            ["event_id"],
            ["webhooks_events.id"],
            name=op.f("fk_vcs_releases_event_id_webhooks_events"),
        ),
        sa.ForeignKeyConstraint(
            ["repository_id"],
            ["vcs_repositories.id"],
            name=op.f("fk_vcs_releases_repository_id_vcs_repositories"),
        ),
        # Column order follows the constraint name and the model's
        # `__table_args__`.
        sa.UniqueConstraint(
            "provider_id",
            "provider",
            name=op.f("uq_vcs_releases_provider_id_provider"),
        ),
    )

    op.create_index(
        op.f("ix_vcs_releases_record_id"),
        table_name="vcs_releases",
        columns=["record_id"],
    )


def downgrade():
    """Drop everything created by :func:`upgrade`, dependants first."""
    op.drop_index(op.f("ix_vcs_releases_record_id"), table_name="vcs_releases")
    op.drop_table("vcs_releases")
    op.drop_table("vcs_repository_users")
    op.drop_table("vcs_repositories")


# ======================================================================
# File: invenio_vcs/models.py
# (new file)
# ======================================================================
# -*- coding: utf-8 -*-
# This file is part of Invenio.
# Copyright (C) 2025 CERN.
#
# Invenio is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.

"""Models for the VCS integration."""

import uuid
from datetime import datetime, timezone
from enum import Enum

from invenio_accounts.models import User
from invenio_db import db
from invenio_i18n import lazy_gettext as _
from invenio_webhooks.models import Event
from sqlalchemy import UniqueConstraint, delete, insert, select
from sqlalchemy.dialects import postgresql
from sqlalchemy.ext.mutable import MutableDict
from sqlalchemy_utils.models import Timestamp
from sqlalchemy_utils.types import ChoiceType, JSONType, UUIDType


class ReleaseStatus(Enum):
    """Constants for possible status of a Release.

    Each member's value is the single character stored in the ``status``
    column. Members compare equal to that raw character, so
    ``ReleaseStatus.FAILED == "F"`` is true.
    """

    __order__ = "RECEIVED PROCESSING PUBLISHED FAILED DELETED"

    RECEIVED = "R"
    """Release has been received and is pending processing."""

    PROCESSING = "P"
    """Release is still being processed."""

    PUBLISHED = "D"
    """Release was successfully processed and published."""

    FAILED = "F"
    """Release processing has failed."""

    DELETED = "E"
    """Release has been deleted."""

    def __init__(self, value):
        """Hack."""
        # NOTE(review): intentionally a no-op; the reason this override is
        # needed is not visible from this module.

    def __eq__(self, other):
        """Compare the member's raw value against ``other``."""
        return self.value == other

    def __hash__(self):
        """Hash like the raw value.

        Defining ``__eq__`` alone would make the members unhashable; hashing
        the value keeps ``__hash__`` consistent with ``__eq__``.
        """
        return hash(self.value)

    def __str__(self):
        """Return its value."""
        return self.value


repository_user_association = db.Table(
    "vcs_repository_users",
    db.Model.metadata,
    db.Column(
        "repository_id",
        UUIDType,
        db.ForeignKey("vcs_repositories.id"),
        primary_key=True,
    ),
    db.Column(
        "user_id", db.Integer, db.ForeignKey("accounts_user.id"), primary_key=True
    ),
    db.Column("created", db.DateTime, nullable=False),
    db.Column("updated", db.DateTime, nullable=False),
)


class Repository(db.Model, Timestamp):
    """Information about a vcs repository."""

    __tablename__ = "vcs_repositories"

    __table_args__ = (
        UniqueConstraint(
            "provider",
            "provider_id",
            name="uq_vcs_repositories_provider_provider_id",
        ),
    )

    id = db.Column(
        UUIDType,
        primary_key=True,
        default=uuid.uuid4,
    )
    """Repository identifier."""

    provider_id = db.Column(
        db.String(255),
        nullable=False,
    )
    """Unique VCS provider identifier for a repository.

    .. note::

        Past implementations of GitHub for Invenio used the repository name
        (e.g. 'inveniosoftware/invenio-github') in order to track
        repositories. This however leads to problems, since repository names
        can change and thus render the stored repository name useless. In
        order to tackle this issue, the `provider_id` should be used to track
        repositories, which is a unique identifier that GitHub uses for each
        repository and doesn't change on renames/transfers.

        In order to be able to keep deleted repositories with releases that
        have been published, it is possible to keep an entry without a
        `provider_id`, that only has a `name`. This only applies to the
        default `github` provider on migrated pre-VCS instances.

        NOTE(review): the column is declared ``nullable=False``, which
        contradicts the paragraph above. Confirm which one is intended.
    """

    provider = db.Column(db.String(255), nullable=False)
    """Which VCS provider the repository is hosted by (and therefore the context in which to consider the provider_id)"""

    description = db.Column(db.String(10000), nullable=True)
    license_spdx = db.Column(db.String(255), nullable=True)
    default_branch = db.Column(db.String(255), nullable=False)

    full_name = db.Column("name", db.String(255), nullable=False)
    """Fully qualified name of the repository including user/organization."""

    hook = db.Column(db.String(255), nullable=True)
    """Hook identifier."""

    enabled_by_user_id = db.Column(db.Integer, db.ForeignKey(User.id), nullable=True)

    #
    # Relationships
    #
    users = db.relationship(User, secondary=repository_user_association)
    enabled_by_user = db.relationship(User, foreign_keys=[enabled_by_user_id])

    @classmethod
    def create(
        cls,
        provider,
        provider_id,
        default_branch,
        full_name=None,
        description=None,
        license_spdx=None,
        **kwargs,
    ):
        """Create a repository and add it to the session (without committing).

        :param provider: Registered ID of the VCS provider.
        :param provider_id: VCS provider repository identifier.
        :param default_branch: Name of the repository's default branch.
        :param full_name: Fully qualified repository name.
        :param description: Optional repository description.
        :param license_spdx: Optional SPDX licence identifier.
        :param kwargs: Extra column values forwarded to the constructor.
        :returns: The new, not yet committed, repository object.
        """
        obj = cls(
            provider=provider,
            provider_id=provider_id,
            full_name=full_name,
            default_branch=default_branch,
            description=description,
            license_spdx=license_spdx,
            **kwargs,
        )
        db.session.add(obj)
        return obj

    def add_user(self, user_id: int):
        """Add permission for a user to access the repository."""
        # The association columns are plain (timezone-naive) ``DateTime``s.
        # Store naive UTC so the value cannot be shifted by the database
        # session's time zone when an aware datetime is cast to it.
        now = datetime.now(tz=timezone.utc).replace(tzinfo=None)
        stmt = insert(repository_user_association).values(
            repository_id=self.id, user_id=user_id, created=now, updated=now
        )
        db.session.execute(stmt)

    def remove_user(self, user_id: int):
        """Remove permission for a user to access the repository."""
        stmt = delete(repository_user_association).filter_by(
            repository_id=self.id, user_id=user_id
        )
        db.session.execute(stmt)

    def list_users(self):
        """Return the association rows of users with access to the repository.

        :returns: The result of selecting from ``vcs_repository_users`` for
            this repository. Rows are association rows, not ``User`` objects.
        """
        return db.session.execute(
            select(repository_user_association).filter_by(repository_id=self.id)
        )

    @classmethod
    def get(cls, provider: str, provider_id: str) -> "Repository | None":
        """Return a repository given its provider ID.

        :param str provider: Registered ID of the VCS provider.
        :param str provider_id: VCS provider repository identifier.
        :returns: The repository object or None if one with the given ID and provider doesn't exist.
        """
        return cls.query.filter(
            cls.provider_id == provider_id, cls.provider == provider
        ).one_or_none()

    @property
    def enabled(self):
        """Return if the repository has webhooks enabled."""
        return bool(self.hook)

    def latest_release(self, status=None):
        """Return the most recently created release of the repository.

        :param status: When given, only releases with this status are
            considered; otherwise releases of any status are.
        :returns: The latest release, or None if there is none or if this
            repository is not in the DB session.
        """
        # Bail out fast if object (Repository) not in DB session.
        if self not in db.session:
            return None

        q = self.releases if status is None else self.releases.filter_by(status=status)
        return q.order_by(db.desc(Release.created)).first()

    def __repr__(self):
        """Get repository representation."""
        return f"<Repository {self.provider}:{self.provider_id} {self.full_name}>"


class Release(db.Model, Timestamp):
    """Information about a VCS release."""

    __tablename__ = "vcs_releases"

    __table_args__ = (
        UniqueConstraint(
            "provider_id",
            "provider",
            name="uq_vcs_releases_provider_id_provider",
        ),
    )

    id = db.Column(
        UUIDType,
        primary_key=True,
        default=uuid.uuid4,
    )
    """Release identifier."""

    provider_id = db.Column(db.String(255), nullable=True)
    """Unique VCS provider release identifier."""

    provider = db.Column(db.String(255), nullable=False)
    """Which VCS provider the release is hosted by (and therefore the context in which to consider the provider_id)"""

    tag = db.Column(db.String(255))
    """Release tag."""

    errors = db.Column(
        MutableDict.as_mutable(
            db.JSON()
            .with_variant(postgresql.JSONB(), "postgresql")
            .with_variant(JSONType(), "sqlite")
            .with_variant(JSONType(), "mysql")
        ),
        nullable=True,
    )
    """Release processing errors."""

    repository_id = db.Column(UUIDType, db.ForeignKey(Repository.id))
    """Repository identifier."""

    event_id = db.Column(UUIDType, db.ForeignKey(Event.id), nullable=True)
    """Incoming webhook event identifier."""

    record_id = db.Column(
        UUIDType,
        index=True,
        nullable=True,
    )
    """Weak reference to a record identifier."""

    status = db.Column(
        ChoiceType(ReleaseStatus, impl=db.CHAR(1)),
        nullable=False,
    )
    """Status of the release, e.g. 'processing', 'published', 'failed', etc."""

    repository = db.relationship(
        Repository, backref=db.backref("releases", lazy="dynamic")
    )

    event = db.relationship(Event)

    def __repr__(self):
        """Get release representation."""
        return f"<Release {self.tag}:{self.provider_id} ({self.status})>"


# ======================================================================
# File: invenio_vcs/upgrade_scripts/__init__.py
# (new file)
# ======================================================================
# -*- coding: utf-8 -*-
#
# Copyright (C) 2025 CERN.
#
# Invenio-VCS is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.
"""Upgrade scripts for InvenioVCS."""


# ======================================================================
# File: invenio_vcs/upgrade_scripts/migrate_3_0_to_4_0.py
# (new file)
# ======================================================================
# -*- coding: utf-8 -*-
#
# Copyright (C) 2025 CERN.
#
# Invenio-VCS is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.
"""Migration script for v3 (old GitHub-only integration) to v4 (new generic VCS integration)."""

import sys
import uuid
from datetime import datetime, timezone

import sqlalchemy as sa
from alembic.runtime.migration import MigrationContext
from click import progressbar, secho
from invenio_db import db
from sqlalchemy.dialects import postgresql
from sqlalchemy.ext.mutable import MutableDict
from sqlalchemy_utils import JSONType, UUIDType

# Every row migrated by this script is attributed to this provider.
_GITHUB_PROVIDER = "github"
# Used when the old data does not record a default branch; the column is
# NOT NULL, so some value is required.
_DEFAULT_BRANCH = "main"
# Alembic revision that creates the `vcs_*` tables this script writes to.
_EXPECTED_ALEMBIC_REVISION = "1754318294"

# Lightweight models for all of the tables (incl old and new versions)
remote_account_table = sa.table(
    "oauthclient_remoteaccount",
    sa.Column("id", sa.Integer, primary_key=True),
    sa.Column("user_id", sa.Integer, sa.ForeignKey("accounts_user.id")),
    sa.Column("client_id", sa.String(255)),
    # We may have changed this if we merge https://github.com/inveniosoftware/invenio-oauthclient/pull/360
    # but we're only reading this column so it shouldn't make a difference.
    sa.Column("extra_data", MutableDict.as_mutable(JSONType)),
)
github_repositories_table = sa.table(
    "github_repositories",
    sa.Column("id", UUIDType, primary_key=True),
    sa.Column("github_id", sa.String(255), nullable=True),
    sa.Column("name", sa.String(255), nullable=False),
    sa.Column("hook", sa.Integer, nullable=True),
    sa.Column("user_id", sa.Integer, sa.ForeignKey("accounts_user.id"), nullable=True),
    sa.Column("created", sa.DateTime, nullable=False),
    sa.Column("updated", sa.DateTime, nullable=False),
)
vcs_repositories_table = sa.table(
    "vcs_repositories",
    sa.Column("id", UUIDType, primary_key=True),
    sa.Column("provider_id", sa.String(255), nullable=False),
    sa.Column("provider", sa.String(255), nullable=False),
    sa.Column("description", sa.String(10000), nullable=True),
    sa.Column("license_spdx", sa.String(255), nullable=True),
    sa.Column("default_branch", sa.String(255), nullable=False),
    sa.Column("name", sa.String(255), nullable=False),
    sa.Column("hook", sa.String(255), nullable=True),
    sa.Column(
        "enabled_by_user_id",
        sa.Integer,
        sa.ForeignKey("accounts_user.id"),
        nullable=True,
    ),
    sa.Column("created", sa.DateTime, nullable=False),
    sa.Column("updated", sa.DateTime, nullable=False),
)
github_releases_table = sa.table(
    "github_releases",
    sa.Column("id", UUIDType, primary_key=True),
    sa.Column("release_id", sa.Integer, nullable=True),
    sa.Column("tag", sa.String(255), nullable=True),
    sa.Column("errors", MutableDict.as_mutable(JSONType), nullable=True),
    sa.Column(
        "repository_id",
        UUIDType,
        sa.ForeignKey("github_repositories.id"),
        nullable=True,
    ),
    sa.Column("event_id", UUIDType, sa.ForeignKey("webhooks_events.id"), nullable=True),
    sa.Column("record_id", UUIDType, nullable=True),
    sa.Column("status", sa.CHAR(1), nullable=False),
    sa.Column("created", sa.DateTime, nullable=False),
    sa.Column("updated", sa.DateTime, nullable=False),
)
vcs_releases_table = sa.table(
    "vcs_releases",
    sa.Column("id", UUIDType, primary_key=True),
    sa.Column("provider_id", sa.String(255), nullable=False),
    sa.Column("provider", sa.String(255), nullable=False),
    sa.Column("tag", sa.String(255), nullable=False),
    sa.Column(
        "errors",
        MutableDict.as_mutable(
            sa.JSON()
            .with_variant(postgresql.JSONB(), "postgresql")
            .with_variant(JSONType(), "sqlite")
            .with_variant(JSONType(), "mysql")
        ),
        nullable=True,
    ),
    sa.Column(
        "repository_id",
        UUIDType,
        sa.ForeignKey("vcs_repositories.id"),
        nullable=True,
    ),
    # The Alembic revision creates `event_id` as a nullable column.
    sa.Column("event_id", UUIDType, sa.ForeignKey("webhooks_events.id"), nullable=True),
    sa.Column("record_id", UUIDType, nullable=True),
    sa.Column("status", sa.CHAR(1), nullable=False),
    sa.Column("created", sa.DateTime, nullable=False),
    sa.Column("updated", sa.DateTime, nullable=False),
)


def _utcnow():
    """Return the current UTC time as a timezone-naive ``datetime``.

    The ``created``/``updated`` columns are plain ``DateTime`` columns. Writing
    a naive UTC value avoids the stored time being shifted by the database
    session's time zone when an aware datetime is cast to a naive column.
    """
    return datetime.now(tz=timezone.utc).replace(tzinfo=None)


def _str_or_none(value):
    """Stringify ``value``, keeping ``None`` as ``None`` rather than ``"None"``."""
    return None if value is None else str(value)


def _find_github_repo_id(provider_id):
    """Return the ``vcs_repositories.id`` of a GitHub repo, or None if absent."""
    return db.session.scalar(
        sa.select(vcs_repositories_table.c.id).where(
            vcs_repositories_table.c.provider == _GITHUB_PROVIDER,
            vcs_repositories_table.c.provider_id == provider_id,
        )
    )


def _upsert_repo_from_remote_account(provider_id, github_repo):
    """Insert or refresh one ``vcs_repositories`` row from a stored repo dict.

    :param provider_id: GitHub repository ID. It is a string because it is
        the key of a JSON object.
    :param github_repo: The repo metadata stored under that key.
    """
    now = _utcnow()
    metadata = {
        "description": github_repo.get("description"),
        "name": github_repo["full_name"],
        "default_branch": github_repo.get("default_branch") or _DEFAULT_BRANCH,
        "updated": now,
    }

    # We might have already created it for another user
    existing_id = _find_github_repo_id(provider_id)

    if existing_id is None:
        # We are now storing _all_ repositories (even non-enabled ones) in the DB.
        # The repo-user association will be created on the first sync after this
        # migration, we need to download the list of users with access to the
        # repo from the GitHub API.
        db.session.execute(
            vcs_repositories_table.insert().values(
                id=uuid.uuid4(),
                provider_id=provider_id,
                provider=_GITHUB_PROVIDER,
                # We have never stored this, it is queried at runtime right now.
                # When the first sync happens after this migration, we will
                # download all the license IDs from the VCS.
                license_spdx=None,
                # Enablement data (hook, enabling user) is copied from
                # `github_repositories` by
                # `run_upgrade_for_existing_db_repositories`.
                hook=None,
                enabled_by_user_id=None,
                created=now,
                **metadata,
            )
        )
    else:
        db.session.execute(
            vcs_repositories_table.update()
            .where(vcs_repositories_table.c.id == existing_id)
            .values(**metadata)
        )


def run_upgrade_for_oauthclient_repositories():
    """Move the JSON repos from oauthclient_remoteaccount to the new vcs_repositories table.

    For every remote account whose ``extra_data`` has a ``repos`` key, each
    repo is inserted into (or refreshed in) ``vcs_repositories`` and the
    ``repos`` key is then removed from ``extra_data``. Everything is committed
    once at the end.
    """
    secho(
        "Migrating JSON data from oauthclient_remoteaccount into vcs_repositories table...",
        fg="green",
    )

    # We don't know the client ID as this is a config variable.
    # So to find the RemoteAccounts that correspond to GitHub, we need to check for the existence
    # of the `repos` key in the `extra_data` JSON. We cannot make this very efficient sadly, because
    # (a) in Postgres we are using JSON not JSONB so there is no efficient JSON querying and (b) the
    # instance might be using MySQL/SQLite where we store it as `TEXT`.

    # We can make this a little bit faster if https://github.com/inveniosoftware/invenio-oauthclient/pull/328
    # were merged and released and all instances were using it, but this is unlikely to be the case
    # by the time we release Invenio VCS v4.

    remote_accounts = db.session.execute(sa.select(remote_account_table)).mappings()
    with progressbar(remote_accounts) as bar:
        for remote_account in bar:
            # `extra_data` may be NULL for accounts that never stored anything.
            extra_data = remote_account["extra_data"] or {}
            if "repos" not in extra_data:
                continue

            for provider_id, github_repo in extra_data["repos"].items():
                _upsert_repo_from_remote_account(provider_id, github_repo)

            # Drop `repos` but keep every other key (e.g. `last_sync`), without
            # assuming any particular key is present.
            remaining = {
                key: value for key, value in extra_data.items() if key != "repos"
            }
            db.session.execute(
                remote_account_table.update()
                .where(remote_account_table.c.id == remote_account["id"])
                .values(extra_data=remaining)
            )

    db.session.commit()


def run_upgrade_for_existing_db_repositories():
    """Copy the rows of ``github_repositories`` into ``vcs_repositories``.

    These are (almost) all repos that are enabled and have a hook. However repos that have been enabled and then
    later disabled are also included. Rows already created from the
    remote-account JSON are updated in place; the others are inserted.
    """
    secho(
        "Migrating old repo table entries to new vcs_repositories table...", fg="green"
    )

    old_db_repos = db.session.execute(sa.select(github_repositories_table)).mappings()
    with progressbar(old_db_repos) as bar:
        for old_db_repo in bar:
            # NOTE(review): `github_id` is nullable in the old table but
            # `vcs_repositories.provider_id` is NOT NULL. NULL is passed
            # through so the database rejects the row, instead of several
            # repos silently sharing the provider ID "None".
            provider_id = _str_or_none(old_db_repo["github_id"])
            # The old hook is an integer and the new column is a string. A
            # NULL hook must stay NULL, otherwise the repo would look enabled.
            hook = _str_or_none(old_db_repo["hook"])

            existing_id = _find_github_repo_id(provider_id)

            if existing_id is None:
                # We only have very limited metadata available at this point.
                # The first sync job after this migration will fill in the rest.
                db.session.execute(
                    vcs_repositories_table.insert().values(
                        id=old_db_repo["id"],
                        provider_id=provider_id,
                        provider=_GITHUB_PROVIDER,
                        name=old_db_repo["name"],
                        default_branch=_DEFAULT_BRANCH,
                        license_spdx=None,
                        hook=hook,
                        enabled_by_user_id=old_db_repo["user_id"],
                        created=old_db_repo["created"],
                        updated=_utcnow(),
                    )
                )
            else:
                db.session.execute(
                    vcs_repositories_table.update()
                    .where(vcs_repositories_table.c.id == existing_id)
                    .values(
                        # Take over the old primary key: releases are copied
                        # with their original `repository_id`.
                        id=old_db_repo["id"],
                        hook=hook,
                        enabled_by_user_id=old_db_repo["user_id"],
                        created=old_db_repo["created"],
                    )
                )

    db.session.commit()


def run_upgrade_for_releases():
    """Copy releases from old table to new vcs_releases table."""
    secho(
        "Migrating old release table entries to new vcs_releases table...", fg="green"
    )

    # Finally, we copy over the releases
    old_db_releases = db.session.execute(sa.select(github_releases_table)).mappings()
    with progressbar(old_db_releases) as bar:
        for old_db_release in bar:
            # The repositories were migrated with their original IDs, so the
            # release's `repository_id` is copied unchanged.
            db.session.execute(
                vcs_releases_table.insert().values(
                    id=old_db_release["id"],
                    # NULL stays NULL rather than becoming the string "None".
                    provider_id=_str_or_none(old_db_release["release_id"]),
                    provider=_GITHUB_PROVIDER,
                    tag=old_db_release["tag"],
                    errors=old_db_release["errors"],
                    repository_id=old_db_release["repository_id"],
                    event_id=old_db_release["event_id"],
                    record_id=old_db_release["record_id"],
                    status=old_db_release["status"],
                    created=old_db_release["created"],
                    updated=_utcnow(),
                )
            )

    db.session.commit()


def verify_alembic_version(expected_revision: str):
    """Verify that the Alembic migration for this version has been executed.

    Attempting to run the other steps of this upgrade script on an old migration version
    will have unexpected consequences.

    :param expected_revision: Revision ID that must be among the database's
        current Alembic heads. The process exits with status 1 if it is not.
    """
    secho("Verifying Alembic migration is up-to-date...", fg="green")

    with db.engine.connect() as connection:
        alembic_ctx = MigrationContext.configure(connection)
        # This returns a tuple of the versions of each branch (without the branch name).
        current_revs = alembic_ctx.get_current_heads()

    # We just need to check that our expected version ID is included in the tuple
    if expected_revision in current_revs:
        return

    secho(
        "The invenio-github Alembic branch is not at the latest revision. Please upgrade it before continuing.",
        fg="red",
    )
    sys.exit(1)


def execute_upgrade():
    """Execute all of the steps for the upgrade of InvenioVCS v3 to v4."""
    secho("Starting Invenio-VCS v3->v4 data migration...", fg="green")

    verify_alembic_version(_EXPECTED_ALEMBIC_REVISION)

    # Order matters: repositories must exist before the releases that
    # reference them are copied.
    run_upgrade_for_oauthclient_repositories()
    run_upgrade_for_existing_db_repositories()
    run_upgrade_for_releases()


if __name__ == "__main__":
    execute_upgrade()