-
Notifications
You must be signed in to change notification settings - Fork 30
feat(vcs): new data model #192
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: vcs-staging
Are you sure you want to change the base?
Changes from 5 commits
449f41d
66c42c0
bf91a21
24cfce3
f8a3d84
1c52d2d
e24790e
98b86fe
65ef06c
8750be9
7ba04da
455a2c0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,349 @@ | ||
| # -*- coding: utf-8 -*- | ||
| # This file is part of Invenio. | ||
| # Copyright (C) 2025 CERN. | ||
| # | ||
| # Invenio is free software; you can redistribute it and/or modify it | ||
| # under the terms of the MIT License; see LICENSE file for more details. | ||
|
|
||
| """Switch to a generic VCS module (not GitHub-specific).""" | ||
|
|
||
| import uuid | ||
| from datetime import datetime, timezone | ||
|
|
||
| import sqlalchemy as sa | ||
| from alembic import op | ||
| from sqlalchemy.ext.mutable import MutableDict | ||
| from sqlalchemy_utils import JSONType, UUIDType | ||
|
|
||
| # revision identifiers, used by Alembic. | ||
| revision = "1754318294" | ||
| down_revision = "b0eaee37b545" | ||
| # You cannot rename an Alembic branch. So we will have to keep | ||
| # the branch label `invenio-github` despite changing the module | ||
| # to `invenio-vcs`. | ||
| branch_labels = () | ||
| depends_on = None | ||
|
|
||
|
|
||
| def upgrade(): | ||
| """Upgrade database.""" | ||
| op.rename_table("github_repositories", "vcs_repositories") | ||
| op.alter_column( | ||
| "vcs_repositories", | ||
| "github_id", | ||
| new_column_name="provider_id", | ||
|
||
| type_=sa.String(length=255), | ||
| nullable=False, | ||
| existing_type=sa.Integer(), | ||
| existing_nullable=True, | ||
| ) | ||
| op.alter_column( | ||
|
||
| "vcs_repositories", | ||
| "hook", | ||
| type_=sa.String(length=255), | ||
| nullable=True, | ||
| existing_type=sa.Integer(), | ||
| existing_nullable=True, | ||
| ) | ||
| op.add_column( | ||
| "vcs_repositories", | ||
| # We use the provider name "github" by default as this is what we're already using across the codebase | ||
| sa.Column("provider", sa.String(255), nullable=False, server_default="github"), | ||
| ) | ||
| op.add_column( | ||
| "vcs_repositories", | ||
| sa.Column( | ||
| "default_branch", sa.String(255), nullable=False, server_default="master" | ||
|
||
| ), | ||
| ) | ||
| op.add_column( | ||
| "vcs_repositories", sa.Column("description", sa.String(10000), nullable=True) | ||
| ) | ||
| op.add_column( | ||
| # Nullable for now (see below) | ||
| "vcs_repositories", | ||
| sa.Column("html_url", sa.String(10000), nullable=True), | ||
palkerecsenyi marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| ) | ||
| op.add_column( | ||
| "vcs_repositories", sa.Column("license_spdx", sa.String(255), nullable=True) | ||
| ) | ||
| op.alter_column("vcs_repositories", "user_id", new_column_name="enabled_by_user_id") | ||
| op.drop_index("ix_github_repositories_name") | ||
|
||
| op.drop_index("ix_github_repositories_github_id") | ||
|
|
||
| # Because they rely on the `provider` column, these are automatically | ||
| # deleted when downgrading so we don't need a separate drop command | ||
| # for them. | ||
| op.create_unique_constraint( | ||
| constraint_name=op.f("uq_vcs_repositories_provider_provider_id"), | ||
| table_name="vcs_repositories", | ||
| columns=["provider", "provider_id"], | ||
| ) | ||
| op.create_unique_constraint( | ||
| constraint_name=op.f("uq_vcs_repositories_provider_name"), | ||
| table_name="vcs_repositories", | ||
| columns=["provider", "name"], | ||
| ) | ||
|
|
||
| # Migrate data from the OAuth remote `extra_data` field to the repositories table | ||
| # where we will now store everything directly. | ||
| # | ||
| # We need to recreate the SQLAlchemy models for `RemoteAccount` and `Repository` here but | ||
| # in a much more lightweight way. We cannot simply import the models because (a) they depend | ||
| # on the full Invenio app being initialised and all extensions available and (b) we need | ||
|
||
| # to work with the models as they stand precisely at this point in the migration chain | ||
| # rather than the model file itself which may be at a later commit. | ||
| # | ||
| # We only include here the columns, constraints, and relations that we actually need to | ||
| # perform the migration, therefore keeping these models as lightweight as possible. | ||
| remote_account_table = sa.table( | ||
| "oauthclient_remoteaccount", | ||
| sa.Column("id", sa.Integer, primary_key=True), | ||
| sa.Column("user_id", sa.Integer, sa.ForeignKey("account_user.id")), | ||
| sa.Column("client_id", sa.String(255)), | ||
| sa.Column("extra_data", MutableDict.as_mutable(JSONType)), | ||
| ) | ||
| vcs_repositories_table = sa.table( | ||
| "vcs_repositories", | ||
| sa.Column("id", UUIDType, primary_key=True), | ||
| sa.Column("provider_id", sa.String(255), nullable=True), | ||
| sa.Column("provider", sa.String(255), nullable=True), | ||
| sa.Column("description", sa.String(10000), nullable=True), | ||
| sa.Column("html_url", sa.String(10000), nullable=False), | ||
| sa.Column("license_spdx", sa.String(255), nullable=True), | ||
| sa.Column("default_branch", sa.String(255), nullable=False), | ||
| sa.Column("name", sa.String(255), nullable=False), | ||
| sa.Column("hook", sa.String(255), nullable=True), | ||
| sa.Column( | ||
| "enabled_by_user_id", | ||
| sa.Integer, | ||
| sa.ForeignKey("account_user.id"), | ||
| nullable=True, | ||
| ), | ||
| sa.Column("created", sa.DateTime, nullable=False), | ||
| sa.Column("updated", sa.DateTime, nullable=False), | ||
| ) | ||
|
|
||
| # This is the recommended way to run SQLAlchemy operations in a migration, see https://alembic.sqlalchemy.org/en/latest/ops.html#alembic.operations.Operations.execute | ||
| session = op.get_bind() | ||
|
|
||
| # We don't know the client ID as this is a config variable. | ||
| # So to find the RemoteAccounts that correspond to GitHub, we need to check for the existence | ||
| # of the `repos` key in the `extra_data` JSON. We cannot make this very efficient sadly, because | ||
| # (a) in Postgres we are using JSON not JSONB so there is no efficient JSON querying and (b) the | ||
| # instance might be using MySQL/SQLite where we store it as `TEXT`. | ||
|
|
||
| remote_accounts = session.execute(sa.select(remote_account_table)) | ||
| for remote_account in remote_accounts.mappings(): | ||
| if "repos" not in remote_account["extra_data"]: | ||
| continue | ||
|
|
||
| repos = remote_account["extra_data"]["repos"] | ||
|
|
||
| for id, github_repo in repos.items(): | ||
| # `id` (the dict key) is a string because JSON keys must be strings | ||
|
|
||
| matching_db_repo_id = session.scalar( | ||
| sa.select(vcs_repositories_table).filter_by(provider_id=id) | ||
| ) | ||
|
|
||
| if matching_db_repo_id is None: | ||
| # We are now storing _all_ repositories (even non-enabled ones) in the DB. | ||
| # The repo-user association will be created on the first sync after this migration, we need to download | ||
| # the list of users with access to the repo from the GitHub API. | ||
| session.execute( | ||
| vcs_repositories_table.insert().values( | ||
| id=uuid.uuid4(), | ||
| provider_id=id, | ||
| provider="github", | ||
| description=github_repo["description"], | ||
| name=github_repo["full_name"], | ||
| default_branch=github_repo["default_branch"], | ||
| # So far we have only supported github.com so we can safely assume the URL | ||
| html_url=f'https://github.com/{github_repo["full_name"]}', | ||
| # We have never stored this, it is queried at runtime right now. When the first | ||
| # sync happens after this migration, we will download all the license IDs from the VCS. | ||
| license_spdx=None, | ||
| # This repo wasn't enabled | ||
| hook=None, | ||
| enabled_by_user_id=None, | ||
| created=datetime.now(tz=timezone.utc), | ||
| updated=datetime.now(tz=timezone.utc), | ||
| ) | ||
| ) | ||
| else: | ||
| session.execute( | ||
| vcs_repositories_table.update() | ||
| .filter_by(id=matching_db_repo_id) | ||
| .values( | ||
| description=github_repo["description"], | ||
| name=github_repo["full_name"], | ||
| default_branch=github_repo["default_branch"], | ||
| html_url=f'https://github.com/{github_repo["full_name"]}', | ||
| updated=datetime.now(tz=timezone.utc), | ||
| ) | ||
| ) | ||
|
|
||
| # Remove `repos` from the existing `extra_data`, leaving only the last sync timestamp | ||
| session.execute( | ||
| remote_account_table.update() | ||
| .filter_by(id=remote_account["id"]) | ||
| .values(extra_data={"last_sync": remote_account["extra_data"]["last_sync"]}) | ||
| ) | ||
|
|
||
| # Fill in HTML URL for orphaned repos to ensure we don't have any NULLs | ||
| vcs_repositories = session.execute(sa.select(vcs_repositories_table)) | ||
| for vcs_repository in vcs_repositories.mappings(): | ||
| if vcs_repository["html_url"] is None: | ||
| session.execute( | ||
| vcs_repositories_table.update() | ||
| .filter_by(id=vcs_repository["id"]) | ||
| .values( | ||
| html_url=f'https://github.com/{vcs_repository["name"]}', | ||
| updated=datetime.now(tz=timezone.utc), | ||
| ) | ||
| ) | ||
|
|
||
| session.commit() | ||
|
|
||
| # We initially set this to nullable=True so we can create the column without an error | ||
| # (it would be null for existing records) but after the SQLAlchemy operations above we | ||
| # have populated it so we can mark it non-nullable. | ||
| op.alter_column( | ||
| "vcs_repositories", "html_url", nullable=False, existing_nullable=True | ||
| ) | ||
|
|
||
| op.rename_table("github_releases", "vcs_releases") | ||
| op.alter_column( | ||
| "vcs_releases", | ||
| "release_id", | ||
| new_column_name="provider_id", | ||
| type_=sa.String(length=255), | ||
| nullable=False, | ||
| existing_type=sa.Integer(), | ||
| existing_nullable=True, | ||
| ) | ||
| op.add_column( | ||
| "vcs_releases", | ||
| sa.Column("provider", sa.String(255), nullable=False, server_default="github"), | ||
| ) | ||
| if op.get_context().dialect.name == "postgresql": | ||
| op.alter_column( | ||
| "vcs_releases", | ||
| "errors", | ||
| type_=sa.dialects.postgresql.JSONB, | ||
| postgresql_using="errors::text::jsonb", | ||
| ) | ||
|
|
||
| op.drop_constraint( | ||
| op.f("uq_github_releases_release_id"), table_name="vcs_releases", type_="unique" | ||
| ) | ||
| # A given provider cannot have duplicate repository IDs. | ||
| # These constraints are also inherently deleted when the `provider` column is dropped | ||
| op.create_unique_constraint( | ||
| constraint_name=op.f("uq_vcs_releases_provider_id_provider"), | ||
| table_name="vcs_releases", | ||
| columns=["provider_id", "provider"], | ||
| ) | ||
| # A specific repository from a given provider cannot have multiple releases of the same tag | ||
| op.create_unique_constraint( | ||
| constraint_name=op.f("uq_vcs_releases_provider_id_provider_tag"), | ||
| table_name="vcs_releases", | ||
| columns=["provider_id", "provider", "tag"], | ||
| ) | ||
|
|
||
| op.create_table( | ||
| "vcs_repository_users", | ||
| sa.Column("repository_id", UUIDType(), primary_key=True), | ||
| sa.Column("user_id", sa.Integer(), primary_key=True), | ||
|
||
| sa.ForeignKeyConstraint( | ||
| ["repository_id"], | ||
| ["vcs_repositories.id"], | ||
| name=op.f("fk_vcs_repository_users_repository_id_vcs_repositories"), | ||
| ), | ||
| sa.ForeignKeyConstraint( | ||
| ["user_id"], | ||
| ["accounts_user.id"], | ||
| name=op.f("fk_vcs_repository_users_user_id_accounts_user"), | ||
| ), | ||
| ) | ||
| # ### end Alembic commands ### | ||
|
|
||
|
|
||
| def downgrade(): | ||
| """Downgrade database. | ||
|
|
||
| Currently, the downgrade can only be peformed **without data**. The tables are transformed but | ||
| data will not be successfully migrated. The upgrade migration has a large amount of custom logic | ||
| for migrating the data into the new format, and this is not replicated/reversed for downgrading. | ||
| """ | ||
| op.alter_column( | ||
| "vcs_repositories", | ||
| "enabled_by_user_id", | ||
| new_column_name="user_id", | ||
| ) | ||
| op.drop_table("vcs_repository_users") | ||
|
|
||
| op.rename_table("vcs_repositories", "github_repositories") | ||
| op.alter_column( | ||
| "github_repositories", | ||
| "provider_id", | ||
| new_column_name="github_id", | ||
| type_=sa.Integer(), | ||
| nullable=True, | ||
| existing_type=sa.String(length=255), | ||
| existing_nullable=False, | ||
| postgresql_using="provider_id::integer", | ||
| ) | ||
| op.alter_column( | ||
| "github_repositories", | ||
| "hook", | ||
| type_=sa.Integer(), | ||
| nullable=True, | ||
| existing_type=sa.String(length=255), | ||
| existing_nullable=True, | ||
| postgresql_using="hook::integer", | ||
| ) | ||
| op.drop_column("github_repositories", "provider") | ||
| op.drop_column("github_repositories", "description") | ||
| op.drop_column("github_repositories", "html_url") | ||
| op.drop_column("github_repositories", "license_spdx") | ||
| op.drop_column("github_repositories", "default_branch") | ||
| op.create_index( | ||
| op.f("ix_github_repositories_github_id"), | ||
| "github_repositories", | ||
| ["github_id"], | ||
| unique=True, | ||
| ) | ||
| op.create_index( | ||
| op.f("ix_github_repositories_name"), | ||
| "github_repositories", | ||
| ["name"], | ||
| unique=True, | ||
| ) | ||
|
|
||
| op.rename_table("vcs_releases", "github_releases") | ||
| op.alter_column( | ||
| "github_releases", | ||
| "provider_id", | ||
| new_column_name="release_id", | ||
| type_=sa.Integer(), | ||
| nullable=True, | ||
| existing_type=sa.String(length=255), | ||
| existing_nullable=False, | ||
| postgresql_using="provider_id::integer", | ||
| ) | ||
| op.drop_column("github_releases", "provider") | ||
| if op.get_context().dialect.name == "postgresql": | ||
| op.alter_column( | ||
| "github_releases", | ||
| "errors", | ||
| type_=sa.dialects.postgresql.JSON, | ||
| postgresql_using="errors::text::json", | ||
| ) | ||
| op.create_unique_constraint( | ||
| op.f("uq_github_releases_release_id"), | ||
| table_name="github_releases", | ||
| columns=["release_id"], | ||
| ) | ||
| # ### end Alembic commands ### | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Question: given the complexity of this migration, would it be maybe easier to create a new table, copy over the data, and delete the old one instead?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have considered this and I have recommended creating a new table as the manual migration method for very large instances as it makes it easier to split up the migration into gradual batches without a full DB lock. However, the code is arguably even more complicated (see the new docs in my PR, especially under 'Example script'), and the risk of accidentally losing data due to bugs in the script is slightly higher.