Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
d3ef817
Suppress known git progress logs
candleindark Oct 18, 2023
412923d
Define Git progress report matching patterns
candleindark Oct 19, 2023
638b044
Reimplement `filter()` in `SuppressKnownGitProgressLogs`
candleindark Oct 19, 2023
f5e47b8
Redefine `known_git_progress_log_types` as a set
candleindark Oct 19, 2023
5e07ecc
Eliminate unneeded group capturing in re expressions
candleindark Oct 19, 2023
5c5016e
Test `celery_app` in `make_celery.py` is instantiated correctly
candleindark Oct 19, 2023
e7b4e5a
Rename `SuppressKnownGitProgressLogs` to `SuppressKnownGitProgressRep…
candleindark Oct 19, 2023
1731edb
Rename `known_git_progress_log_types` to `known_git_progress_report_t…
candleindark Oct 19, 2023
fbea870
Modify comments
candleindark Oct 19, 2023
4461a37
RF:
candleindark Oct 19, 2023
45808f9
Merge with suggested change on return base on if op name is known
candleindark Oct 19, 2023
63736c6
Add additional comment
candleindark Oct 19, 2023
653f2b2
Provide tests for the log filter
candleindark Oct 20, 2023
ce5ca72
Merge branch 'master' into suppress-git-progress-logs
candleindark Oct 26, 2023
e60c28d
Enable datalad library mode in datalad-registry
candleindark Oct 26, 2023
d0d6270
Enable datalad library mode in datalad-registry-client
candleindark Oct 26, 2023
337bf31
Enable datalad library mode for datalad-registry in module level
candleindark Oct 26, 2023
74bf6ec
Revert "Enable datalad library mode in datalad-registry-client"
candleindark Oct 26, 2023
195ffd2
Remove custom log handler from the datalad package
candleindark Oct 27, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion datalad_registry/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import logging
from pathlib import Path
import sys

from celery import Celery, Task
import datalad
from flask import Flask, request
from flask_openapi3 import Info, OpenAPI
from kombu.serialization import register
Expand All @@ -19,12 +21,21 @@

__version__ = version("datalad-registry")

datalad.enable_librarymode()

# === A temporary solution for removing the custom log handlers set
# by the datalad package
# (until https://github.com/datalad/datalad/pull/7521 is resolved and released) ===
datalad_lgr = logging.getLogger("datalad")
for h in datalad_lgr.handlers:
datalad_lgr.removeHandler(h)
# === End of temporary solution ===


def create_app() -> Flask:
"""
Factory function for producing Flask app
"""

config = compile_config_from_env()

app = OpenAPI(
Expand Down
51 changes: 51 additions & 0 deletions datalad_registry/make_celery.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,60 @@
# This file provides Celery commands access to the Celery app created through
# the factory functions in datalad_registry/__init__.py
import logging
import re

from celery import Celery

from . import create_app


# === Code for suppressing known git progress reports ===
class SuppressKnownGitProgressReport(logging.Filter):
# Known git progress report types
# These types can be found in the definition of
# `datalad.support.gitrepo.GitProgress`
known_git_progress_report_types = {
"Counting objects",
"Compressing objects",
"Writing objects",
"Receiving objects",
"Resolving deltas",
"Finding sources",
"Checking out files",
"Enumerating objects",
}

re_op_absolute = re.compile(r"(?:remote: )?([\w\s]+):\s+\d+.*")
re_op_relative = re.compile(r"(?:remote: )?([\w\s]+):\s+\d+% \(\d+/\d+\).*")

def filter(self, record):
# The following logic is based on the logic in
# `datalad.support.gitrepo.GitProgress._parse_progress_line`

msg = record.getMessage()

match = self.re_op_relative.match(msg)
if match is None:
match = self.re_op_absolute.match(msg)

if match is None:
# === msg does not match the pattern of a git progress report ===
return True

op_name = match.group(1)

# Return False (filtering out the log message) only
# if the message matches the pattern of a git progress report and
# is of a known git progress report type
return op_name not in self.known_git_progress_report_types


# Retrieve a reference to the "datalad.gitrepo" logger
dl_gitrepo_lgr = logging.getLogger("datalad.gitrepo")

# Add a filter to the "datalad.gitrepo" logger to suppress known git progress reports
dl_gitrepo_lgr.addFilter(SuppressKnownGitProgressReport())
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't need any longer right?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently, it is still needed. I added the solution in your previous comment. However, it only removed the duplications of the "start messages" such as "Start counting objects".

I also discovered that the log level setting,

"celery -A datalad_registry.make_celery:celery_app worker --loglevel INFO --pool prefork"
, of the logger seemingly has not affect on what messages are logged in execution.

This PR requires further digging. I will come back to this PR after #252 which is more important.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, so here, I guess, now we are having log_progress still causing logging of the progress... complimentary issue I guess. I guess we need to dig deeper, most likely adding a way to completely disable log_progress. It might be that some workaround like

def no_log_progress(*args, **kwargs):
     pass

from datalad.support import gitrepo
gitrepo.log_progress = no_log_progress

would do the drill.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This solution would take away the logs we want to take away and more. I think I still have to "start" and "finishing" logs, for example, to remain.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Additionally, I still want an explanation for why changing the log level of the Celery worker command doesn't change the behavior of the worker's logger.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is again datalad to blame: I added a note (see "edit 1") to datalad/datalad#7521

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Michael also pointed to the fact that we might want to use .propagate to avoid propagation of messages from our datalad logger: https://docs.python.org/3/library/logging.html#logging.Logger

# === End of code for suppressing known git progress reports ===

flask_app = create_app()
celery_app: Celery = flask_app.extensions["celery"]
63 changes: 63 additions & 0 deletions datalad_registry/tests/test_make_celery.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import logging

from celery import Celery
import pytest


@pytest.mark.usefixtures("set_test_env")
def test_celery_app_instantiation():
"""
Test that the Celery app is instantiated correctly.
"""
from datalad_registry.make_celery import celery_app, flask_app

assert celery_app is flask_app.extensions["celery"]
assert isinstance(celery_app, Celery)


class TestSuppressKnownGitProgressReport:
# Set needed environment to instantiate a flask app in datalad_registry.make_celery
@pytest.mark.usefixtures("set_test_env")
@pytest.mark.parametrize(
"msg, expected_result",
[
("hello", True),
("Start receiving objects", True),
("Start counting objects", True),
(
"Failed to get 'sourcedata/templateflow/tpl-NKI.path', "
"skipping this submodule",
True,
),
(
"Failed to get 'sourcedata/templateflow/tpl-WHS.path', "
"skipping this submodule",
True,
),
("Finished enumerating objects", True),
("Finished compressing objects", True),
("Resolving deltas: 65% (19502/30003)", False),
("Receiving objects: 2% (8943/447132)", False),
("Receiving objects: 57% (194348/340960), 9.80 MiB | 9.79 MiB/s", False),
("remote: Compressing objects: 100% (111150/111150), done.", False),
("remote: Compressing objects: 89% (98924/111150)", False),
("Resolving deltas: 90% (27003/30003)", False),
],
)
def test_filter(self, msg, expected_result):
"""
Test the filter method of the SuppressKnownGitProgressReport class
"""
from datalad_registry.make_celery import SuppressKnownGitProgressReport

record = logging.LogRecord(
name="logger",
level=logging.DEBUG,
pathname="",
lineno=1,
msg=msg,
args=(),
exc_info=None,
)

assert SuppressKnownGitProgressReport().filter(record) is expected_result