diff --git a/datalad_registry/overview.py b/datalad_registry/overview.py
index 7aadcc39..c45f3108 100644
--- a/datalad_registry/overview.py
+++ b/datalad_registry/overview.py
@@ -14,19 +14,39 @@
 lgr = logging.getLogger(__name__)
 
 bp = Blueprint("overview", __name__, url_prefix="/overview")
 
-_SORT_ATTRS = {
-    "keys-asc": ("annex_key_count", "asc"),
-    "keys-desc": ("annex_key_count", "desc"),
-    "update-asc": ("last_update_dt", "asc"),
-    "update-desc": ("last_update_dt", "desc"),
-    "url-asc": ("url", "asc"),
-    "url-desc": ("url", "desc"),
-    "annexed_files_in_wt_count-asc": ("annexed_files_in_wt_count", "asc"),
-    "annexed_files_in_wt_count-desc": ("annexed_files_in_wt_count", "desc"),
-    "annexed_files_in_wt_size-asc": ("annexed_files_in_wt_size", "asc"),
-    "annexed_files_in_wt_size-desc": ("annexed_files_in_wt_size", "desc"),
-    "git_objects_kb-asc": ("git_objects_kb", "asc"),
-    "git_objects_kb-desc": ("git_objects_kb", "desc"),
+# _SORT_ATTRS removed - now using _AVAILABLE_COLUMNS with its db_field mapping
+
+# Available columns with their metadata
+_AVAILABLE_COLUMNS = {
+    "url": {"label": "URL", "sortable": True, "db_field": "url"},
+    "dataset": {"label": "Dataset", "sortable": False},
+    "commit": {"label": "Commit", "sortable": False},
+    "head_dt": {"label": "Last commit date", "sortable": True, "db_field": "head_dt"},
+    "keys": {
+        "label": "Annex keys",
+        "sortable": True,
+        "tooltip": "Number of annex keys",
+        "db_field": "annex_key_count",
+    },
+    "annexed_files_count": {
+        "label": "Nr of Annexed files",
+        "sortable": True,
+        "tooltip": "Number of annexed files in working tree",
+        "db_field": "annexed_files_in_wt_count",
+    },
+    "annexed_files_size": {
+        "label": "Size of Annexed files",
+        "sortable": True,
+        "tooltip": "Size of annexed files in working tree",
+        "db_field": "annexed_files_in_wt_size",
+    },
+    "update": {"label": "Last update", "sortable": True, "db_field": "last_update_dt"},
+    "git_objects": {
+        "label": "Size of .git/objects",
+        "sortable": True,
+        "db_field": "git_objects_kb",
+    },
+    "metadata": {"label": "Metadata", "sortable": False},
 }
 
@@ -36,7 +56,8 @@
 
 @bp.get("/")
 def overview():  # No type hints due to mypy#7187.
-    default_sort_scheme = "update-desc"
+    default_sort_column = "update"
+    default_sort_direction = "desc"
 
     base_select_stmt = select(RepoUrl)
 
@@ -54,12 +75,34 @@ def overview():  # No type hints due to mypy#7187.
         else:
             base_select_stmt = base_select_stmt.filter(criteria)
 
-    # Decipher sorting scheme
-    sort_by = request.args.get("sort", default_sort_scheme, type=str)
-    if sort_by not in _SORT_ATTRS:
-        lgr.debug("Ignoring unknown sort parameter: %s", sort_by)
-        sort_by = default_sort_scheme
-    col, sort_method = _SORT_ATTRS[sort_by]
+    # Handle configurable columns
+    columns_param = request.args.get("columns", None, type=str)
+    if columns_param:
+        # Parse the comma-separated column names
+        requested_columns = [col.strip() for col in columns_param.split(",")]
+        # Keep only valid column names
+        visible_columns = [
+            col for col in requested_columns if col in _AVAILABLE_COLUMNS
+        ]
+        if not visible_columns:
+            visible_columns = list(_AVAILABLE_COLUMNS)
+    else:
+        visible_columns = list(_AVAILABLE_COLUMNS)
+
+    # Handle sorting: "sort_by" picks the column, "sort" the direction
+    sort_by_param = request.args.get("sort_by", default_sort_column, type=str)
+    sort_direction = request.args.get("sort", default_sort_direction, type=str)
+    # Validate the sort_by parameter
+    if sort_by_param in _AVAILABLE_COLUMNS and _AVAILABLE_COLUMNS[sort_by_param].get(
+        "sortable"
+    ):
+        col = _AVAILABLE_COLUMNS[sort_by_param]["db_field"]
+        sort_method = sort_direction if sort_direction in ["asc", "desc"] else "asc"
+    else:
+        lgr.debug("Ignoring unknown or unsortable sort_by parameter: %s", sort_by_param)
+        sort_by_param = default_sort_column
+        col = _AVAILABLE_COLUMNS[default_sort_column]["db_field"]
+        sort_method = default_sort_direction
 
     # Apply sorting
     select_stmt = base_select_stmt.order_by(
@@ -76,7 +119,11 @@ def overview():  # No type hints due to mypy#7187.
         "overview.html",
         pagination=pagination,
         stats=stats,
-        sort_by=sort_by,
+        sort_by_param=sort_by_param,
+        sort_direction=sort_method,  # the validated direction
         search_query=query,
         search_error=search_error,
+        visible_columns=visible_columns,
+        available_columns=_AVAILABLE_COLUMNS,
+        columns_param=columns_param,
     )
diff --git a/datalad_registry/tasks/__init__.py b/datalad_registry/tasks/__init__.py
index 56d52dae..c7db27d4 100644
--- a/datalad_registry/tasks/__init__.py
+++ b/datalad_registry/tasks/__init__.py
@@ -23,6 +23,7 @@
 from datalad_registry.utils import StrEnum
 from datalad_registry.utils.datalad_tls import (
     clone,
+    get_head_commit_date,
     get_head_describe,
     get_origin_annex_key_count,
     get_origin_annex_uuid,
@@ -100,6 +101,7 @@ def _update_dataset_url_info(dataset_url: RepoUrl, ds: Dataset) -> None:
     dataset_url.head = ds.repo.get_hexsha("origin/HEAD")
     dataset_url.head_describe = get_head_describe(ds)
+    dataset_url.head_dt = get_head_commit_date(ds)
     dataset_url.branches = get_origin_branches(ds)
diff --git a/datalad_registry/templates/overview.html b/datalad_registry/templates/overview.html
index 84905c80..43a25bf7 100644
--- a/datalad_registry/templates/overview.html
+++ b/datalad_registry/templates/overview.html
@@ -1,10 +1,10 @@
-{%- macro sort_href(default, other) -%}
-  {%- if sort_by == default -%}
-    {%- set new_sort = other -%}
+{%- macro sort_href(column_name, current_sort_by, current_direction) -%}
+  {%- if current_sort_by == column_name -%}
+    {%- set new_direction = 'asc' if current_direction == 'desc' else 'desc' -%}
   {%- else -%}
-    {%- set new_sort = default -%}
+    {%- set new_direction = 'desc' -%}
   {%- endif -%}
-  href="{{ url_for('.overview', query=search_query, sort='{}'.format(new_sort)) }}"
+  href="{{ url_for('.overview', query=search_query, sort_by=column_name, sort=new_direction, columns=columns_param) }}"
 {%- endmacro -%}
 
 {% macro render_pagination_widget(pagination, endpoint) %}
@@ -16,7 +16,7 @@
     {% for page in pagination.iter_pages() %}
       {% if page %}
         {% if page != pagination.page %}
-          <a href="{{ url_for(endpoint, page=page, sort=sort_by, query=search_query) }}">{{ page }}</a>
+          <a href="{{ url_for(endpoint, page=page, sort_by=sort_by_param, sort=sort_direction, columns=columns_param, query=search_query) }}">{{ page }}</a>
         {% else %}
           {{ page }}
         {% endif %}
@@ -60,7 +60,11 @@
         {%- endif -%}
         placeholder='Search query'
       />
-      <input type="hidden" name="sort" value="{{ sort_by }}" />
+      <input type="hidden" name="sort_by" value="{{ sort_by_param }}" />
+      <input type="hidden" name="sort" value="{{ sort_direction }}" />
+      {% if columns_param %}
+      <input type="hidden" name="columns" value="{{ columns_param }}" />
+      {% endif %}
       {% if search_error -%}
         ERROR: {{ search_error }}
       {%- endif %}
@@ -69,7 +73,11 @@
     {% if search_query -%}
-      <input type="hidden" name="sort" value="{{ sort_by }}" />
+      <input type="hidden" name="sort_by" value="{{ sort_by_param }}" />
+      <input type="hidden" name="sort" value="{{ sort_direction }}" />
+      {% if columns_param %}
+      <input type="hidden" name="columns" value="{{ columns_param }}" />
+      {% endif %}
     {%- endif %}
@@ -132,58 +140,53 @@
       Search query syntax
   <table>
     <tr>
-      <th><a {{ sort_href("url-asc", "url-desc") }}>URL</a></th>
-      <th>Dataset</th>
-      <th>Commit</th>
-      <th><a {{ sort_href("keys-asc", "keys-desc") }}>Annex keys</a></th>
-      <th>
-        <a {{ sort_href("annexed_files_in_wt_count-asc", "annexed_files_in_wt_count-desc") }}>Nr of Annexed files</a>
-        <span class="tooltip">Number of annexed files in working tree</span>
-      </th>
-      <th>
-        <a {{ sort_href("annexed_files_in_wt_size-asc", "annexed_files_in_wt_size-desc") }}>Size of Annexed files</a>
-        <span class="tooltip">Size of annexed files in working tree</span>
-      </th>
-      <th><a {{ sort_href("update-asc", "update-desc") }}>Last update</a></th>
-      <th><a {{ sort_href("git_objects_kb-asc", "git_objects_kb-desc") }}>Size of .git/objects</a></th>
-      <th>Metadata</th>
+      {%- for column in visible_columns -%}
+      <th>
+        {%- if available_columns[column]['sortable'] -%}
+        <a {{ sort_href(column, sort_by_param, sort_direction) }}>{{ available_columns[column]['label'] }}</a>
+        {%- else -%}
+        {{ available_columns[column]['label'] }}
+        {%- endif -%}
+        {%- if available_columns[column].get('tooltip') -%}
+        <span class="tooltip">{{ available_columns[column]['tooltip'] }}</span>
+        {%- endif -%}
+      </th>
+      {%- endfor -%}
     </tr>
     {%- for i in pagination -%}
     <tr>
-      <td><a href="{{ i.url }}">{{ i.url }}</a></td>
-      <td>
-        {% if i.ds_id is not none %}
-        {{ i.ds_id }}
-        {% endif %}
-      </td>
-      <td>{{ i.head_describe if i.head_describe is not none }}</td>
-      <td>{{ i.annex_key_count|intcomma if i.annex_key_count is not none }}</td>
-      <td>{{ i.annexed_files_in_wt_count|intcomma if i.annexed_files_in_wt_count is not none }}</td>
-      <td>{{ i.annexed_files_in_wt_size|filesizeformat if i.annexed_files_in_wt_size is not none }}</td>
-      <td>{{ i.last_update_dt.strftime("%Y-%m-%dT%H:%M:%S%z") if i.last_update_dt is not none }}</td>
-      <td>{{ (i.git_objects_kb * 1024)|filesizeformat if i.git_objects_kb is not none }}</td>
-      <td>
-        {%- for data in i.metadata_ -%}
-        {{ data.extractor_name }}{{ ", " if not loop.last else "" }}
-        {%- endfor -%}
-      </td>
+      {%- for column in visible_columns -%}
+      <td>
+        {%- if column == 'url' -%}
+        <a href="{{ i.url }}">{{ i.url }}</a>
+        {%- elif column == 'dataset' -%}
+        {% if i.ds_id is not none %}
+        {{ i.ds_id }}
+        {% endif %}
+        {%- elif column == 'commit' -%}
+        {{ i.head_describe if i.head_describe is not none }}
+        {%- elif column == 'head_dt' -%}
+        {{ i.head_dt.strftime("%Y-%m-%dT%H:%M:%S%z") if i.head_dt is not none }}
+        {%- elif column == 'keys' -%}
+        {{ i.annex_key_count|intcomma if i.annex_key_count is not none }}
+        {%- elif column == 'annexed_files_count' -%}
+        {{ i.annexed_files_in_wt_count|intcomma if i.annexed_files_in_wt_count is not none }}
+        {%- elif column == 'annexed_files_size' -%}
+        {{ i.annexed_files_in_wt_size|filesizeformat if i.annexed_files_in_wt_size is not none }}
+        {%- elif column == 'update' -%}
+        {{ i.last_update_dt.strftime("%Y-%m-%dT%H:%M:%S%z") if i.last_update_dt is not none }}
+        {%- elif column == 'git_objects' -%}
+        {{ (i.git_objects_kb * 1024)|filesizeformat if i.git_objects_kb is not none }}
+        {%- elif column == 'metadata' -%}
+        {%- for data in i.metadata_ -%}
+        {{ data.extractor_name }}{{ ", " if not loop.last else "" }}
+        {%- endfor -%}
+        {%- endif -%}
+      </td>
+      {%- endfor -%}
     </tr>
     {%- endfor -%}
   </table>
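
A minimal usage sketch (not part of the patch) showing how the new query
parameters compose. The app-factory import path, datalad_registry.factory
with a create_app function, is an assumption and does not appear in this diff:

    from datalad_registry.factory import create_app  # assumed import path

    client = create_app().test_client()

    # Only the URL and annex-key columns, sorted by key count, descending
    resp = client.get(
        "/overview/",
        query_string={"columns": "url,keys", "sort_by": "keys", "sort": "desc"},
    )

    # Unknown column names are dropped; if none survive, all columns are
    # shown, and an unknown sort_by falls back to the default (update, desc)
    resp = client.get(
        "/overview/",
        query_string={"columns": "bogus", "sort_by": "bogus"},
    )
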
diff --git a/datalad_registry/tests/test_overview.py b/datalad_registry/tests/test_overview.py
index 2763f051..ad9680de 100644
--- a/datalad_registry/tests/test_overview.py
+++ b/datalad_registry/tests/test_overview.py
@@ -127,6 +127,25 @@ class TestOverView:
                     "https://www.dandiarchive.org",
                 ],
             ),
+            # head_dt is null for every standard URL (null values come first)
+            (
+                "head_dt-asc",
+                [
+                    "https://www.example.com",
+                    "http://www.datalad.org",
+                    "https://handbook.datalad.org",
+                    "https://www.dandiarchive.org",
+                ],
+            ),
+            (
+                "head_dt-desc",
+                [
+                    "https://www.example.com",
+                    "http://www.datalad.org",
+                    "https://handbook.datalad.org",
+                    "https://www.dandiarchive.org",
+                ],
+            ),
         ],
     )
     def test_sorting(
@@ -135,7 +154,35 @@ def test_sorting(
         self, sort_by: str, expected_order: list[str], flask_client
     ):
         """
        Test for the sorting of dataset URLs in the overview page
        """
-        resp = flask_client.get("/overview/", query_string={"sort": sort_by})
+        query_string = {}
+        if sort_by:
+            # Translate the old "<column>-<direction>" sort scheme into the
+            # new sort_by/sort parameter pair
+            if sort_by.endswith("-asc"):
+                column = sort_by[: -len("-asc")]
+                direction = "asc"
+            elif sort_by.endswith("-desc"):
+                column = sort_by[: -len("-desc")]
+                direction = "desc"
+            else:
+                column, direction = sort_by, "asc"
+
+            # Map renamed columns to their new names; anything not listed
+            # here is passed through unchanged
+            column_mapping = {
+                "annexed_files_in_wt_count": "annexed_files_count",
+                "annexed_files_in_wt_size": "annexed_files_size",
+                "git_objects_kb": "git_objects",
+            }
+            query_string = {
+                "sort_by": column_mapping.get(column, column),
+                "sort": direction,
+            }
+
+        resp = flask_client.get("/overview/", query_string=query_string)
 
         soup = BeautifulSoup(resp.text, "html.parser")
@@ -296,3 +343,151 @@ def test_metadata(self, flask_client):
             "https://handbook.datalad.org": {"metalad_core"},
             "https://www.dandiarchive.org": set(),
         }
+
+    @pytest.mark.usefixtures("populate_with_std_ds_urls")
+    @pytest.mark.parametrize(
+        "columns_param, expected_header_count",
+        [
+            (None, 10),  # all available columns
+            ("url,dataset,commit", 3),
+            ("url,head_dt,update", 3),
+            ("url,keys,metadata", 3),
+            ("invalid_column", 10),  # invalid-only falls back to all columns
+            ("url,invalid_column,dataset", 2),  # only the valid columns remain
+        ],
+    )
+    def test_configurable_columns(
+        self, columns_param: Optional[str], expected_header_count: int, flask_client
+    ):
+        """
+        Test the configurable-columns functionality
+        """
+        query_params = {}
+        if columns_param:
+            query_params["columns"] = columns_param
+
+        resp = flask_client.get("/overview/", query_string=query_params)
+
+        soup = BeautifulSoup(resp.text, "html.parser")
+
+        # Count the header columns
+        header_row = soup.body.table.tr
+        header_count = len(header_row.find_all("th"))
+
+        assert header_count == expected_header_count
+
+        # Verify that each data row has a matching column count
+        data_rows = soup.body.table.find_all("tr")[1:]
+        for row in data_rows:
+            assert len(row.find_all("td")) == expected_header_count
+
+    @pytest.mark.usefixtures("populate_with_std_ds_urls")
+    def test_last_commit_date_column_present(self, flask_client):
+        """
+        Test that the last commit date column is displayed when requested
+        """
+        resp = flask_client.get("/overview/", query_string={"columns": "url,head_dt"})
+
+        soup = BeautifulSoup(resp.text, "html.parser")
+
+        # Check that the <th> header contains "Last commit date"
+        headers = [th.text.strip() for th in soup.body.table.tr.find_all("th")]
+        assert "Last commit date" in headers
+
+        # Verify we have exactly the two requested columns
+        assert len(headers) == 2
+
+    @pytest.mark.usefixtures("populate_with_std_ds_urls")
+    @pytest.mark.parametrize(
+        "sort_by_param, sort_direction, expected_order",
+        [
+            (
+                "keys",
+                "asc",
+                [
+                    "https://www.example.com",
+                    "http://www.datalad.org",
+                    "https://handbook.datalad.org",
+                    "https://www.dandiarchive.org",
+                ],
+            ),
+            (
+                "keys",
+                "desc",
+                [
+                    "https://handbook.datalad.org",
+                    "http://www.datalad.org",
+                    "https://www.example.com",
+                    "https://www.dandiarchive.org",
+                ],
+            ),
+            (
+                "url",
+                "asc",
+                [
+                    "http://www.datalad.org",
+                    "https://handbook.datalad.org",
+                    "https://www.dandiarchive.org",
+                    "https://www.example.com",
+                ],
+            ),
+            (
+                "head_dt",
+                "asc",
+                [
+                    # null values come first; head_dt is null for every
+                    # standard URL, so the insertion order is kept
+                    "https://www.example.com",
+                    "http://www.datalad.org",
+                    "https://handbook.datalad.org",
+                    "https://www.dandiarchive.org",
+                ],
+            ),
+        ],
+    )
+    def test_new_sorting_system(
+        self,
+        sort_by_param: str,
+        sort_direction: str,
+        expected_order: list[str],
+        flask_client,
+    ):
+        """
+        Test the new sorting system with the sort_by and sort parameters
+        """
+        resp = flask_client.get(
+            "/overview/",
+            query_string={"sort_by": sort_by_param, "sort": sort_direction},
+        )
+
+        soup = BeautifulSoup(resp.text, "html.parser")
+
+        url_list = [row.td.a.string for row in soup.body.table.find_all("tr")[1:]]
+
+        assert url_list == expected_order
+
+    @pytest.mark.usefixtures("populate_with_std_ds_urls")
+    def test_new_sorting_with_columns(self, flask_client):
+        """
+        Test that the new sorting system works with the column configuration
+        """
+        resp = flask_client.get(
+            "/overview/",
+            query_string={"sort_by": "keys", "sort": "desc", "columns": "url,keys"},
+        )
+
+        soup = BeautifulSoup(resp.text, "html.parser")
+
+        # Check that we have the right number of columns
+        headers = soup.body.table.tr.find_all("th")
+        assert len(headers) == 2
+
+        # Check that the ordering is still correct
+        url_list = [row.td.a.string for row in soup.body.table.find_all("tr")[1:]]
+        expected_order = [
+            "https://handbook.datalad.org",
+            "http://www.datalad.org",
+            "https://www.example.com",
+            "https://www.dandiarchive.org",
+        ]
+        assert url_list == expected_order
diff --git a/datalad_registry/tests/test_utils/test_datalad_tls.py b/datalad_registry/tests/test_utils/test_datalad_tls.py
index a917ba63..412d1783 100644
--- a/datalad_registry/tests/test_utils/test_datalad_tls.py
+++ b/datalad_registry/tests/test_utils/test_datalad_tls.py
@@ -7,6 +7,7 @@
 from datalad_registry.utils.datalad_tls import (
     WtAnnexedFileInfo,
     clone,
+    get_head_commit_date,
     get_origin_annex_key_count,
     get_origin_annex_uuid,
     get_origin_branches,
@@ -272,3 +273,36 @@ def test_normal_operation(self, ds_name, branch_name, request, tmp_path):
         l2_clone.repo.call_git(["push", "-u", "origin", branch_name])
 
         assert get_origin_upstream_branch(l2_clone) == branch_name
+
+
+@pytest.mark.parametrize(
+    "ds_name",
+    [
+        "empty_ds_annex",
+        "two_files_ds_annex",
+        "empty_ds_non_annex",
+        "two_files_ds_non_annex",
+    ],
+)
+def test_get_head_commit_date(ds_name, request, tmp_path):
+    """
+    Test the get_head_commit_date function
+    """
+    from datetime import datetime
+
+    ds: Dataset = request.getfixturevalue(ds_name)
+    ds_clone = clone(source=ds.path, path=tmp_path)
+
+    # Get the commit date through the function under test
+    commit_date = get_head_commit_date(ds_clone)
+
+    # Verify it is a datetime object
+    assert isinstance(commit_date, datetime)
+
+    # Verify against git itself; origin/HEAD exists in the clone, so ask ds_clone
+    expected_date_str = ds_clone.repo.call_git(
+        ["log", "-1", "--format=%aI", "origin/HEAD"]
+    ).strip()
+    expected_date = datetime.fromisoformat(expected_date_str)
+
+    assert commit_date == expected_date
diff --git a/datalad_registry/utils/datalad_tls.py b/datalad_registry/utils/datalad_tls.py
index 80d4ef49..311a9d4e 100644
--- a/datalad_registry/utils/datalad_tls.py
+++ b/datalad_registry/utils/datalad_tls.py
@@ -1,4 +1,5 @@
 from dataclasses import dataclass
+from datetime import datetime
 import re
 from typing import Optional
 from uuid import UUID
@@ -104,6 +105,26 @@ def get_head_describe(ds: Dataset) -> str:
     return ds.repo.describe(tags=True, always=True)
 
 
+def get_head_commit_date(ds: Dataset) -> datetime:
+    """
+    Get the author date of the HEAD commit of the origin remote of a dataset
+
+    :param ds: The given dataset
+    :return: The author date of the HEAD commit of the origin remote as a
+        timezone-aware datetime object
+    """
+    # Get the author date of origin/HEAD in strict ISO 8601 format
+    commit_info = ds.repo.for_each_ref_(
+        pattern="refs/remotes/origin/HEAD", fields=["authordate:iso8601-strict"]
+    )
+
+    if not commit_info:
+        raise RuntimeError("Failed to get the commit date of origin/HEAD")
+
+    # Parse the ISO 8601 date string into a datetime object
+    return datetime.fromisoformat(commit_info[0]["authordate:iso8601-strict"])
+
+
 def get_origin_branches(ds: Dataset) -> dict[str, dict[str, str]]:
     """
     Get the branches of the origin remote of a given dataset
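
A note on the parsing step in get_head_commit_date: git's iso8601-strict
dates (the same format %aI prints in the test above) carry a numeric UTC
offset such as +02:00, which datetime.fromisoformat has accepted since
Python 3.7. A standalone sketch with a made-up timestamp:

    from datetime import datetime

    # Shape of an authordate:iso8601-strict value as emitted by git
    raw = "2023-05-17T09:41:26+02:00"  # hypothetical example value

    dt = datetime.fromisoformat(raw)

    # The parsed value is timezone-aware, so equality comparisons like the
    # one in test_get_head_commit_date are well-defined across time zones
    assert dt.utcoffset() is not None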