Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
349 changes: 349 additions & 0 deletions invenio_vcs/alembic/1754318294_switch_to_generic_git_services.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,349 @@
# -*- coding: utf-8 -*-
# This file is part of Invenio.
# Copyright (C) 2025 CERN.
#
# Invenio is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.

"""Switch to a generic VCS module (not GitHub-specific)."""

import uuid
from datetime import datetime, timezone

import sqlalchemy as sa
from alembic import op
from sqlalchemy.ext.mutable import MutableDict
from sqlalchemy_utils import JSONType, UUIDType

# revision identifiers, used by Alembic.
revision = "1754318294"
down_revision = "b0eaee37b545"
# You cannot rename an Alembic branch. So we will have to keep
# the branch label `invenio-github` despite changing the module
# to `invenio-vcs`.
branch_labels = ()
depends_on = None


def upgrade():
"""Upgrade database."""
op.rename_table("github_repositories", "vcs_repositories")

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question: given the complexity of this migration, would it be maybe easier to create a new table, copy over the data, and delete the old one instead?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have considered this and I have recommended creating a new table as the manual migration method for very large instances as it makes it easier to split up the migration into gradual batches without a full DB lock. However, the code is arguably even more complicated (see the new docs in my PR, especially under 'Example script'), and the risk of accidentally losing data due to bugs in the script is slightly higher.

op.alter_column(
"vcs_repositories",
"github_id",
new_column_name="provider_id",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess this is the id of the repository supplied by the specific provider?
if this is the case, my first thought was that provider_id means we assign an identifier to a provider (as in (github, 1), (gitlab, 2)... ) so the name of the column might not be descriptive enough to remove the ambiguity...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes I think it's been mentioned before the provider and provider_id have confusing names, so it's probably worth changing them. Maybe id_from_provider instead of provider_id or something similar?

type_=sa.String(length=255),
nullable=False,
existing_type=sa.Integer(),
existing_nullable=True,
)
op.alter_column(
Copy link
Contributor

@kpsherva kpsherva Oct 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what is the purpose of this column and why we modify it?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This stores the provider-specific ID of the webhook if the repository has been activated. On GitHub, this ID is always an integer so until now we have stored it as an integer. It also happens to be an integer on GitLab. But we don't know that this will be the case for all VCSes so we change it here to be stored as a string which is more flexible.

"vcs_repositories",
"hook",
type_=sa.String(length=255),
nullable=True,
existing_type=sa.Integer(),
existing_nullable=True,
)
op.add_column(
"vcs_repositories",
# We use the provider name "github" by default as this is what we're already using across the codebase
sa.Column("provider", sa.String(255), nullable=False, server_default="github"),
)
op.add_column(
"vcs_repositories",
sa.Column(
"default_branch", sa.String(255), nullable=False, server_default="master"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how is this information used later on? why do we need to specify the default branch?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need it to be able to generate a new file link for creating the CITATION.cff file, which is shown in the UI as a matter of convenience. It's not absolutely essential though.

https://github.com/palkerecsenyi/invenio-vcs/blob/b9c8884f99435c900234c1ebeb5abcb59c24b238/invenio_vcs/views/vcs.py#L135-L137

),
)
op.add_column(
"vcs_repositories", sa.Column("description", sa.String(10000), nullable=True)
)
op.add_column(
# Nullable for now (see below)
"vcs_repositories",
sa.Column("html_url", sa.String(10000), nullable=True),
)
op.add_column(
"vcs_repositories", sa.Column("license_spdx", sa.String(255), nullable=True)
)
op.alter_column("vcs_repositories", "user_id", new_column_name="enabled_by_user_id")
op.drop_index("ix_github_repositories_name")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is it OK to drop these indices? especially the id

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We currently have two indexes:

  • ix_github_repositories_name on the repo name
  • ix_github_repositories_github_id on the repo's provider (GitHub) ID

We are replacing them with these two:

  • uq_vcs_repositories_provider_provider_id on the combination of provider and provider_id, since each repository must have a unique ID within the context of a provider.
  • uq_vcs_repositories_provider_name since each repository must have a unique full name (e.g. inveniosoftware/invenio-github) within the context of a provider.

op.drop_index("ix_github_repositories_github_id")

# Because they rely on the `provider` column, these are automatically
# deleted when downgrading so we don't need a separate drop command
# for them.
op.create_unique_constraint(
constraint_name=op.f("uq_vcs_repositories_provider_provider_id"),
table_name="vcs_repositories",
columns=["provider", "provider_id"],
)
op.create_unique_constraint(
constraint_name=op.f("uq_vcs_repositories_provider_name"),
table_name="vcs_repositories",
columns=["provider", "name"],
)

# Migrate data from the OAuth remote `extra_data` field to the repositories table
# where we will now store everything directly.
#
# We need to recreate the SQLAlchemy models for `RemoteAccount` and `Repository` here but
# in a much more lightweight way. We cannot simply import the models because (a) they depend
# on the full Invenio app being initialised and all extensions available and (b) we need
Copy link
Contributor

@kpsherva kpsherva Oct 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't fully understand why we replicate oauth remote account, won't this recipe fail the moment we try to upgrade an existing instance? or this is not "really" creating the table?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is just creating an SQLAlchemy model of the table so we can interact with it in a similar way to the rest of our codebase. It doesn't actually attempt to create or modify the table itself.

The alternative is using raw SQL to read/insert rows, which would be confusing for such a complex migration.

# to work with the models as they stand precisely at this point in the migration chain
# rather than the model file itself which may be at a later commit.
#
# We only include here the columns, constraints, and relations that we actually need to
# perform the migration, therefore keeping these models as lightweight as possible.
remote_account_table = sa.table(
"oauthclient_remoteaccount",
sa.Column("id", sa.Integer, primary_key=True),
sa.Column("user_id", sa.Integer, sa.ForeignKey("account_user.id")),
sa.Column("client_id", sa.String(255)),
sa.Column("extra_data", MutableDict.as_mutable(JSONType)),
)
vcs_repositories_table = sa.table(
"vcs_repositories",
sa.Column("id", UUIDType, primary_key=True),
sa.Column("provider_id", sa.String(255), nullable=True),
sa.Column("provider", sa.String(255), nullable=True),
sa.Column("description", sa.String(10000), nullable=True),
sa.Column("html_url", sa.String(10000), nullable=False),
sa.Column("license_spdx", sa.String(255), nullable=True),
sa.Column("default_branch", sa.String(255), nullable=False),
sa.Column("name", sa.String(255), nullable=False),
sa.Column("hook", sa.String(255), nullable=True),
sa.Column(
"enabled_by_user_id",
sa.Integer,
sa.ForeignKey("account_user.id"),
nullable=True,
),
sa.Column("created", sa.DateTime, nullable=False),
sa.Column("updated", sa.DateTime, nullable=False),
)

# This is the recommended way to run SQLAlchemy operations in a migration, see https://alembic.sqlalchemy.org/en/latest/ops.html#alembic.operations.Operations.execute
session = op.get_bind()

# We don't know the client ID as this is a config variable.
# So to find the RemoteAccounts that correspond to GitHub, we need to check for the existence
# of the `repos` key in the `extra_data` JSON. We cannot make this very efficient sadly, because
# (a) in Postgres we are using JSON not JSONB so there is no efficient JSON querying and (b) the
# instance might be using MySQL/SQLite where we store it as `TEXT`.

remote_accounts = session.execute(sa.select(remote_account_table))
for remote_account in remote_accounts.mappings():
if "repos" not in remote_account["extra_data"]:
continue

repos = remote_account["extra_data"]["repos"]

for id, github_repo in repos.items():
# `id` (the dict key) is a string because JSON keys must be strings

matching_db_repo_id = session.scalar(
sa.select(vcs_repositories_table).filter_by(provider_id=id)
)

if matching_db_repo_id is None:
# We are now storing _all_ repositories (even non-enabled ones) in the DB.
# The repo-user association will be created on the first sync after this migration, we need to download
# the list of users with access to the repo from the GitHub API.
session.execute(
vcs_repositories_table.insert().values(
id=uuid.uuid4(),
provider_id=id,
provider="github",
description=github_repo["description"],
name=github_repo["full_name"],
default_branch=github_repo["default_branch"],
# So far we have only supported github.com so we can safely assume the URL
html_url=f'https://github.com/{github_repo["full_name"]}',
# We have never stored this, it is queried at runtime right now. When the first
# sync happens after this migration, we will download all the license IDs from the VCS.
license_spdx=None,
# This repo wasn't enabled
hook=None,
enabled_by_user_id=None,
created=datetime.now(tz=timezone.utc),
updated=datetime.now(tz=timezone.utc),
)
)
else:
session.execute(
vcs_repositories_table.update()
.filter_by(id=matching_db_repo_id)
.values(
description=github_repo["description"],
name=github_repo["full_name"],
default_branch=github_repo["default_branch"],
html_url=f'https://github.com/{github_repo["full_name"]}',
updated=datetime.now(tz=timezone.utc),
)
)

# Remove `repos` from the existing `extra_data`, leaving only the last sync timestamp
session.execute(
remote_account_table.update()
.filter_by(id=remote_account["id"])
.values(extra_data={"last_sync": remote_account["extra_data"]["last_sync"]})
)

# Fill in HTML URL for orphaned repos to ensure we don't have any NULLs
vcs_repositories = session.execute(sa.select(vcs_repositories_table))
for vcs_repository in vcs_repositories.mappings():
if vcs_repository["html_url"] is None:
session.execute(
vcs_repositories_table.update()
.filter_by(id=vcs_repository["id"])
.values(
html_url=f'https://github.com/{vcs_repository["name"]}',
updated=datetime.now(tz=timezone.utc),
)
)

session.commit()

# We initially set this to nullable=True so we can create the column without an error
# (it would be null for existing records) but after the SQLAlchemy operations above we
# have populated it so we can mark it non-nullable.
op.alter_column(
"vcs_repositories", "html_url", nullable=False, existing_nullable=True
)

op.rename_table("github_releases", "vcs_releases")
op.alter_column(
"vcs_releases",
"release_id",
new_column_name="provider_id",
type_=sa.String(length=255),
nullable=False,
existing_type=sa.Integer(),
existing_nullable=True,
)
op.add_column(
"vcs_releases",
sa.Column("provider", sa.String(255), nullable=False, server_default="github"),
)
if op.get_context().dialect.name == "postgresql":
op.alter_column(
"vcs_releases",
"errors",
type_=sa.dialects.postgresql.JSONB,
postgresql_using="errors::text::jsonb",
)

op.drop_constraint(
op.f("uq_github_releases_release_id"), table_name="vcs_releases", type_="unique"
)
# A given provider cannot have duplicate repository IDs.
# These constraints are also inherently deleted when the `provider` column is dropped
op.create_unique_constraint(
constraint_name=op.f("uq_vcs_releases_provider_id_provider"),
table_name="vcs_releases",
columns=["provider_id", "provider"],
)
# A specific repository from a given provider cannot have multiple releases of the same tag
op.create_unique_constraint(
constraint_name=op.f("uq_vcs_releases_provider_id_provider_tag"),
table_name="vcs_releases",
columns=["provider_id", "provider", "tag"],
)

op.create_table(
"vcs_repository_users",
sa.Column("repository_id", UUIDType(), primary_key=True),
sa.Column("user_id", sa.Integer(), primary_key=True),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is it our user_id or user id from the VCS provider? I guess from vcs judging from the FK constraints, but can we be more explicit. Also from our previous experiences, having int type on ids can be very problematic, I would suggest another approach. What if some vcs provider has alphanumeric user ids?

Copy link
Member Author

@palkerecsenyi palkerecsenyi Oct 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's our ID in this case, this table is storing which Invenio users have access to which repo. The foreign key maps it to accounts_user.id. Hence also why it's an int.

I agree the naming is confusing, maybe accounts_user_id or something similar would work better?

sa.ForeignKeyConstraint(
["repository_id"],
["vcs_repositories.id"],
name=op.f("fk_vcs_repository_users_repository_id_vcs_repositories"),
),
sa.ForeignKeyConstraint(
["user_id"],
["accounts_user.id"],
name=op.f("fk_vcs_repository_users_user_id_accounts_user"),
),
)
# ### end Alembic commands ###


def downgrade():
"""Downgrade database.

Currently, the downgrade can only be peformed **without data**. The tables are transformed but
data will not be successfully migrated. The upgrade migration has a large amount of custom logic
for migrating the data into the new format, and this is not replicated/reversed for downgrading.
"""
op.alter_column(
"vcs_repositories",
"enabled_by_user_id",
new_column_name="user_id",
)
op.drop_table("vcs_repository_users")

op.rename_table("vcs_repositories", "github_repositories")
op.alter_column(
"github_repositories",
"provider_id",
new_column_name="github_id",
type_=sa.Integer(),
nullable=True,
existing_type=sa.String(length=255),
existing_nullable=False,
postgresql_using="provider_id::integer",
)
op.alter_column(
"github_repositories",
"hook",
type_=sa.Integer(),
nullable=True,
existing_type=sa.String(length=255),
existing_nullable=True,
postgresql_using="hook::integer",
)
op.drop_column("github_repositories", "provider")
op.drop_column("github_repositories", "description")
op.drop_column("github_repositories", "html_url")
op.drop_column("github_repositories", "license_spdx")
op.drop_column("github_repositories", "default_branch")
op.create_index(
op.f("ix_github_repositories_github_id"),
"github_repositories",
["github_id"],
unique=True,
)
op.create_index(
op.f("ix_github_repositories_name"),
"github_repositories",
["name"],
unique=True,
)

op.rename_table("vcs_releases", "github_releases")
op.alter_column(
"github_releases",
"provider_id",
new_column_name="release_id",
type_=sa.Integer(),
nullable=True,
existing_type=sa.String(length=255),
existing_nullable=False,
postgresql_using="provider_id::integer",
)
op.drop_column("github_releases", "provider")
if op.get_context().dialect.name == "postgresql":
op.alter_column(
"github_releases",
"errors",
type_=sa.dialects.postgresql.JSON,
postgresql_using="errors::text::json",
)
op.create_unique_constraint(
op.f("uq_github_releases_release_id"),
table_name="github_releases",
columns=["release_id"],
)
# ### end Alembic commands ###
Loading