diff --git a/nb2workflow/deploy.py b/nb2workflow/deploy.py
index d0948f81..bf7681b1 100644
--- a/nb2workflow/deploy.py
+++ b/nb2workflow/deploy.py
@@ -19,6 +19,7 @@
 import rdflib
 from oda_api.ontology_helper import Ontology
 from nb2workflow.nbadapter import NotebookAdapter
+from dynaconf import Dynaconf
 
 logger = logging.getLogger(__name__)
 
@@ -30,7 +31,9 @@
     "filename_pattern": '.*',
 }
 
-default_ontology_path = "https://odahub.io/ontology/ontology.ttl"
+local_config = Dynaconf(settings_files=['settings.toml'])
+config_ontology_path = local_config.get('default.service.ontology_path', 'https://odahub.io/ontology/ontology.ttl')
+
 default_python_version = '3.10'
 
@@ -69,8 +72,9 @@ def build_container(git_origin,
                     engine="docker",
                     cleanup=False,
                     nb2wversion=version(),
-                    ontology_path=default_ontology_path,
+                    ontology_path=config_ontology_path,
                     **kwargs):
+
     if engine == "docker":
         return _build_with_docker(git_origin=git_origin,
                                   local=local,
@@ -200,7 +204,10 @@ def _build_with_kaniko(git_origin,
                        namespace="oda-staging",
                        cleanup=True,
                        nb2wversion=version(),
-                       ontology_path=default_ontology_path):
+                       ontology_path=None):
+
+    if ontology_path is None:
+        ontology_path = config_ontology_path
 
     #secret should be created beforehand https://github.com/GoogleContainerTools/kaniko#pushing-to-docker-hub
 
@@ -316,7 +323,8 @@ def _build_with_kaniko(git_origin,
 
     return container_metadata
 
-def _extract_resource_requirements(local_repo_path, ontology_path=default_ontology_path):
+def _extract_resource_requirements(local_repo_path, ontology_path=config_ontology_path):
+
     ontology = Ontology(ontology_path)
 
     resources = {}
@@ -345,7 +353,8 @@ def _build_with_docker(git_origin,
                        source_from='localdir',
                        cleanup=False,
                        nb2wversion=version(),
-                       ontology_path=default_ontology_path):
+                       ontology_path=config_ontology_path):
+
     if cleanup:
         logger.warning('Post-build cleanup is not implemented for docker builds')
 
@@ -575,8 +584,8 @@ def deploy(git_origin,
            build_timestamp=False,
            cleanup=False,
            nb2wversion=version(),
-           ontology_path=default_ontology_path):
-
+           ontology_path=config_ontology_path):
+
     container = build_container(git_origin,
                                 local=local,
                                 run_tests=run_tests,
@@ -617,10 +626,26 @@ def main():
     parser.add_argument('--local', action="store_true", default=False)
     parser.add_argument('--build-engine', metavar="build_engine", default="docker")
     parser.add_argument('--nb2wversion', metavar="nb2wversion", default=version())
-    parser.add_argument('--ontology-path', metavar="ontology_path", default=default_ontology_path)
-
+    parser.add_argument('--ontology-path', metavar="ontology_path")
+    parser.add_argument('--settings-path', action="append", default=None)
+    parser.add_argument('-s', '--settings', action="append", default=[])
+
     args = parser.parse_args()
 
+    if args.settings_path is not None:
+        for settings_file in args.settings_path:
+            print("loading settings file from", settings_file)
+            local_config.load_file(path=settings_file)
+
+    if args.settings:
+        for item in args.settings:
+            key, value = item.split('=', 1)
+            local_config.set(f'deploy.{key}', value)
+
+    deploy_ontology_path = args.ontology_path
+    if deploy_ontology_path is None:
+        deploy_ontology_path = config_ontology_path
+
     setup_logging()
 
     deploy(args.repository,
@@ -629,7 +653,7 @@ def main():
            local=args.local,
           build_engine=args.build_engine,
            nb2wversion=args.nb2wversion,
-           ontology_path=args.ontology_path)
+           ontology_path=deploy_ontology_path)
 
 
 if __name__ == "__main__":
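For context on the lookup pattern introduced above: `Dynaconf.get()` accepts a dotted key that traverses nested TOML tables, plus a fallback that is returned when the key (or the whole settings file) is missing. A minimal sketch, assuming a `settings.toml` shaped like the one added at the end of this diff:

```python
# Minimal sketch of the module-level lookup above; file name and key
# mirror the diff, the fallback URL is the previous hard-coded default.
from dynaconf import Dynaconf

local_config = Dynaconf(settings_files=['settings.toml'])

# Dotted keys traverse nested tables ([default.service] -> ontology_path);
# the second argument is returned when the key or file is absent.
ontology_path = local_config.get(
    'default.service.ontology_path',
    'https://odahub.io/ontology/ontology.ttl',
)
```

Note that the lookup happens at import time, so a file layered on later via `--settings-path` only affects code that re-reads `local_config`, not the already-computed `config_ontology_path`.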
diff --git a/nb2workflow/galaxy.py b/nb2workflow/galaxy.py
index ff616cfd..f682a4c9 100644
--- a/nb2workflow/galaxy.py
+++ b/nb2workflow/galaxy.py
@@ -15,6 +15,7 @@
 import nbformat
 from nbconvert.exporters import ScriptExporter
+from dynaconf import Dynaconf
 
 from ensureconda.api import ensureconda
 import subprocess as sp
@@ -29,8 +30,8 @@
 
 logger = logging.getLogger()
 
-
-default_ontology_path = 'http://odahub.io/ontology/ontology.ttl'
+local_config = Dynaconf(settings_files=['settings.toml'])
+config_ontology_path = local_config.get('default.service.ontology_path', 'http://odahub.io/ontology/ontology.ttl')
 
 global_req = []
 
@@ -539,7 +540,7 @@ def to_galaxy(input_path,
               citations_bibfile = None,
               help_file = None,
               available_channels = ['default', 'conda-forge'],
-              ontology_path = default_ontology_path,
+              ontology_path = config_ontology_path,
               test_data_baseurl = None
               ):
 
@@ -690,7 +691,7 @@ def main():
     tool_version = args.tool_version
     ontology_path = args.ontology_path
     if ontology_path is None:
-        ontology_path = default_ontology_path
+        ontology_path = config_ontology_path
     bibfile = args.citations_bibfile
     help_file = args.help_file
     test_data_baseurl = args.test_data_baseurl
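One caveat this diff does not address: a relative `settings.toml` is resolved against the process working directory (plus dynaconf's search paths), so tools started from another directory silently fall back to the hard-coded defaults. A hypothetical variant, not part of this diff, that pins the file to the module's own directory:

```python
# Hypothetical alternative: anchor settings.toml to the package directory
# rather than the current working directory.
import os
from dynaconf import Dynaconf

_here = os.path.dirname(os.path.abspath(__file__))
local_config = Dynaconf(settings_files=[os.path.join(_here, 'settings.toml')])
```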
" + f"Sleeping {self.download_retry_sleep_s} seconds until retry") + ) + time.sleep(self.download_retry_sleep_s) + continue + size_ok = True + step = 'downloading file' response = requests.get(file_url) if response.status_code == 200: - with open(os.path.join(tmpdir, file_name), 'wb') as file: + with open(file_path, 'wb') as file: file.write(response.content) + file_downloaded = True break else: - n_download_tries_left -= 1 - if n_download_tries_left > 0: - logger.warning( - f"An issue occurred when attempting to download the file at the url {file_url}, " - f"sleeping {self.download_retry_sleep_s} seconds until retry") - time.sleep(self.download_retry_sleep_s) - else: - msg = (f"An issue occurred when attempting to download the url {file_url}, " - "this might be related to an invalid url, please check the input provided") - logger.warning(msg) - sentry.capture_message(msg) - raise Exception(msg) + logger.warning( + (f"An issue occurred when attempting to {step} the file at the url {file_url}. " + f"Sleeping {self.download_retry_sleep_s} seconds until retry") + ) + time.sleep(self.download_retry_sleep_s) + continue + + if not (file_downloaded and size_ok): + msg = (f"An issue occurred when attempting to {step} at the url {file_url}. " + "This might be related to an invalid url, please check the input provided") + logger.warning(msg) + sentry.capture_message(msg) + raise Exception(msg) return file_name @@ -712,13 +744,12 @@ def remove_tmpdir(self): def notebook_short_name(ipynb_fn): return os.path.basename(ipynb_fn).replace(".ipynb","") -def find_notebooks(source, tests=False, pattern = r'.*') -> Dict[str, NotebookAdapter]: +def find_notebooks(source, tests=False, pattern = r'.*', config=None) -> Dict[str, NotebookAdapter]: def base_filter(fn): good = "output" not in fn and "preproc" not in fn good = good and re.match(pattern, os.path.basename(fn)) return good - if tests: filt = lambda fn: base_filter(fn) and "/test_" in fn @@ -734,7 +765,10 @@ def base_filter(fn): raise Exception("no notebooks found in the directory:",source) notebook_adapters=dict([ - (notebook_short_name(notebook),NotebookAdapter(notebook)) for notebook in notebooks + ( + notebook_short_name(notebook), + NotebookAdapter(notebook, config=config) + ) for notebook in notebooks ]) logger.debug("notebook adapters: %s",notebook_adapters) @@ -742,15 +776,15 @@ def base_filter(fn): elif os.path.isfile(source): if pattern != r'.*': logger.warning('Filename pattern is set but source %s is a single file. 
diff --git a/nb2workflow/sentry.py b/nb2workflow/sentry.py
index 98647ad5..a5e56ef1 100644
--- a/nb2workflow/sentry.py
+++ b/nb2workflow/sentry.py
@@ -20,6 +20,10 @@ def sentry_url(self):
             self._sentry_url = os.environ.get('SENTRY_URL', "https://63ae106793010d836c74830fa75b300c@o264756.ingest.sentry.io/4506186624335872")
         return self._sentry_url
 
+    @sentry_url.setter
+    def sentry_url(self, url):
+        self._sentry_url = url
+
     @property
     def have_sentry(self):
         if self.sentry_url is None or self.sentry_url == '' or sentry_sdk is None:
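The new setter completes a lazy-property pattern: the first read falls back to the `SENTRY_URL` environment variable, while an explicit assignment (as `NotebookAdapter.__init__` now performs) pins the value. A condensed sketch of the pattern, with the DSN replaced by a placeholder:

```python
import os

class Sentry:
    def __init__(self):
        self._sentry_url = None

    @property
    def sentry_url(self):
        # Lazily derive the DSN from the environment on first read.
        if self._sentry_url is None:
            self._sentry_url = os.environ.get('SENTRY_URL', 'https://example-dsn.invalid')
        return self._sentry_url

    @sentry_url.setter
    def sentry_url(self, url):
        # An explicit assignment overrides the environment-derived value;
        # assigning None re-arms the environment fallback above.
        self._sentry_url = url
```

Since `NotebookAdapter.__init__` assigns `config.get('SERVICE.SENTRY_URL', None)`, a missing setting re-arms the environment fallback rather than disabling Sentry.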
diff --git a/nb2workflow/service.py b/nb2workflow/service.py
index 30eb98cc..9ec95e4c 100644
--- a/nb2workflow/service.py
+++ b/nb2workflow/service.py
@@ -4,6 +4,7 @@
 
 from werkzeug.routing import RequestRedirect
 
+
 try:
     from werkzeug.exceptions import MethodNotAllowed, NotFound
 except ImportError:
@@ -36,6 +37,8 @@
 from flask_caching import Cache
 from flask_cors import CORS
 
+from dynaconf import FlaskDynaconf
+
 from flasgger import LazyString, Swagger, swag_from
 
 from nb2workflow.workflows import serialize_workflow_exception
@@ -92,6 +95,8 @@ def create_app():
             "version": "0.0.1"
         }
     }
+
+    FlaskDynaconf(app, settings_files=["settings.toml"], env_switcher="MERGE_ENABLED_FOR_DYNACONF")
 
     swagger = Swagger(app, template=template)
     app.wsgi_app = ReverseProxied(app.wsgi_app)
     app.json_encoder = CustomJSONEncoder
@@ -173,7 +178,8 @@ def _run(self):
             return
 
         template_nba = app.notebook_adapters.get(self.target)
-        nba = NotebookAdapter(template_nba.notebook_fn, tempdir_cache=app.async_workflow_jobdirs)
+
+        nba = NotebookAdapter(template_nba.notebook_fn, tempdir_cache=app.async_workflow_jobdirs, config=app.config)
 
         app.async_workflows[self.key] = 'started'
         self.perform_callback(action='progress')
@@ -265,7 +271,8 @@ def workflow(target, background=False, async_request=False):
     logger.debug("raw parameters %s", request.args)
 
     template_nba = app.notebook_adapters.get(target)
-    nba = NotebookAdapter(template_nba.notebook_fn)
+
+    nba = NotebookAdapter(template_nba.notebook_fn, config=app.config)
 
     if nba is None:
         interpreted_parameters = None
@@ -678,9 +685,8 @@ def main():
     parser = argparse.ArgumentParser(description='Process some integers.')
 
     parser.add_argument('notebook', metavar='notebook', type=str)
-    parser.add_argument('--host', metavar='host',
-                        type=str, default="127.0.0.1")
-    parser.add_argument('--port', metavar='port', type=int, default=9191)
+    parser.add_argument('--host', metavar='host', type=str)
+    parser.add_argument('--port', metavar='port', type=int)
     parser.add_argument('--async-workers', metavar='N', type=int, default=3)
     #parser.add_argument('--tmpdir', metavar='tmpdir', type=str, default=None)
     parser.add_argument('--publish', metavar='upstream-url',
@@ -692,9 +698,31 @@ def main():
     parser.add_argument('--debug', action="store_true")
     parser.add_argument('--one-shot', metavar='workflow', type=str)
     parser.add_argument('--pattern', type=str, default=r'.*')
+    parser.add_argument('--settings-path', action="append", default=None)
+    parser.add_argument('-s', '--settings', action="append", default=[])
 
     args = parser.parse_args()
 
+    if args.settings_path is not None:
+        for settings_file in args.settings_path:
+            print("loading settings file from", settings_file)
+            app.config.load_file(path=settings_file)
+
+    if args.settings:
+        for item in args.settings:
+            key, value = item.split('=', 1)
+            app.config.set(f'global.{key}', value)
+
+    service_port = app.config.get('default.service.port', 9191)
+    if args.port is not None:
+        service_port = args.port
+    app.config['service.port'] = service_port
+
+    service_host = app.config.get('default.service.host', "127.0.0.1")
+    if args.host is not None:
+        service_host = args.host
+    app.config['service.host'] = service_host
+
     handler = logging.StreamHandler()
     handler.setLevel(logging.INFO)
 
@@ -713,7 +740,7 @@ def main():
     root.setLevel(logging.INFO)
     handler.setLevel(logging.INFO)
 
-    app.notebook_adapters = find_notebooks(args.notebook, pattern=args.pattern)
+    app.notebook_adapters = find_notebooks(args.notebook, pattern=args.pattern, config=app.config)
     setup_routes(app)
     app.service_semantic_signature = ontology.service_semantic_signature(
         app.notebook_adapters)
@@ -725,13 +752,13 @@ def main():
         s = args.publish_as.split(":")
         publish_host, publish_port = ":".join(s[:-1]), int(s[-1])
     else:
-        publish_host, publish_port = args.host, args.port
+        publish_host, publish_port = service_host, service_port
 
     for nba_name, nba in app.notebook_adapters.items():
         publish.publish(args.publish, nba_name, publish_host, publish_port)
 
-    # for rule in app.url_map.iter_rules():
-    #     logger.debug("==>> %s %s %s %s",rule,rule.endpoint,rule.__class__,rule.__dict__)
+    # for rule in app.url_map.iter_rules():
+    #     logger.debug("==>> %s %s %s %s",rule,rule.endpoint,rule.__class__,rule.__dict__)
 
     for worker_i in range(args.async_workers):
         async_worker = AsyncWorker('default-%i' % worker_i)
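The `-s key=value` handling above writes each override into the `global` table of the Dynaconf-backed config, splitting only on the first `=` so values may themselves contain `=` (DSNs, URLs with query strings, tokens). A standalone sketch of that parsing, with hypothetical override values:

```python
from dynaconf import Dynaconf

config = Dynaconf(settings_files=['settings.toml'])

# Hypothetical overrides as they would arrive from repeated -s flags.
for item in ["sentry_url=https://key@example.invalid/42", "max_download_size=1000"]:
    key, value = item.split('=', 1)   # split once: values may contain '='
    config.set(f'global.{key}', value)
```

One consequence worth noting: values arriving from the command line stay strings, so numeric settings overridden this way need casting wherever they are read.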
diff --git a/nb2workflow/workflows.py b/nb2workflow/workflows.py
index e32a8fd5..aa35348f 100644
--- a/nb2workflow/workflows.py
+++ b/nb2workflow/workflows.py
@@ -12,6 +12,8 @@
 
 from nb2workflow import nbadapter
 
+from dynaconf import Dynaconf
+
 cache = Cache('.nb2workflow/cache')
 
 enable_cache = False
@@ -58,6 +60,8 @@ def evaluate(router, *args, **kwargs):
         print("async_request is not used here, but is set to", async_request)
 
+    config = Dynaconf(settings_files=['settings.toml'])
+    sentry.sentry_url = config.get('default.service.sentry_url', None)
+
     logstasher.set_context(dict(router=router, args=args, kwargs=kwargs))
     logstasher.log(dict(event='starting'))
 
@@ -80,7 +84,7 @@ def evaluate(router, *args, **kwargs):
         location = args[0]
         args = args[1:]
 
-        nba = nbadapter.NotebookAdapter(location+"/%s.ipynb"%args[0])
+        nba = nbadapter.NotebookAdapter(location+"/%s.ipynb"%args[0], config=config)
 
         # unused args
 
diff --git a/settings.toml b/settings.toml
new file mode 100644
index 00000000..b01901fe
--- /dev/null
+++ b/settings.toml
@@ -0,0 +1,8 @@
+[default.service]
+host = "127.0.0.1"
+port = 9191
+max_download_size = 1000000000
+n_download_max_tries = 10
+download_retry_sleep = 0.5
+sentry_url = "https://63ae106793010d836c74830fa75b300c@o264756.ingest.sentry.io/4506186624335872"
+ontology_path = "https://odahub.io/ontology/ontology.ttl"
diff --git a/setup.py b/setup.py
index 751a427c..6a84ca09 100644
--- a/setup.py
+++ b/setup.py
@@ -98,7 +98,8 @@
         'validators',
         'sentry_sdk',
         'rdflib',
-        'GitPython'
+        'GitPython',
+        'dynaconf'
     ],
 
diff --git a/tests/conftest.py b/tests/conftest.py
index 87debf7c..55f16681 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -46,6 +46,7 @@ def test_notebook_lfs_repo():
 
     return path
 
+
 @pytest.fixture
 def app(test_notebook):
     app = nb2workflow.service.app
@@ -55,6 +56,17 @@ def app(test_notebook):
 
     return app
 
+
+@pytest.fixture
+def app_low_download_limit(app):
+    testfiles_path = os.path.join(os.path.dirname(__file__), 'testfiles')
+    app = nb2workflow.service.app
+    app.config['SERVICE.MAX_DOWNLOAD_SIZE'] = 1
+    app.notebook_adapters = nb2workflow.nbadapter.find_notebooks(testfiles_path, config=app.config)
+    nb2workflow.service.setup_routes(app)
+    print("creating app with low limit on the download of files")
+    return app
+
+
 def kill_child_processes(parent_pid, sig=signal.SIGTERM):
     try:
         parent = psutil.Process(parent_pid)
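Since `NotebookAdapter.__init__` only calls `.get()` on whatever it receives, the `config` argument does not have to be a Dynaconf object: the fixture above mutates the Flask config in place, and a plain dict works just as well in scripts or tests. A sketch with illustrative values and a hypothetical notebook path:

```python
from nb2workflow.nbadapter import NotebookAdapter

# 'workflow.ipynb' is a hypothetical path; it must point at a real notebook.
nba = NotebookAdapter(
    'workflow.ipynb',
    config={
        'SERVICE.N_DOWNLOAD_MAX_TRIES': 3,
        'SERVICE.DOWNLOAD_RETRY_SLEEP': 1.0,
        'SERVICE.MAX_DOWNLOAD_SIZE': 10_000_000,
    },
)
```

The keys are the exact strings read in `__init__`, which is also why the fixture's flat `app.config['SERVICE.MAX_DOWNLOAD_SIZE'] = 1` assignment is picked up.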
diff --git a/tests/test_input_types.py b/tests/test_input_types.py
index 34eada36..d8520669 100644
--- a/tests/test_input_types.py
+++ b/tests/test_input_types.py
@@ -20,10 +20,15 @@ def test_posix_download_file_with_arg(client):
     r = client.get('/api/v1.0/get/testposixpath', query_string={'fits_file_path': 'https://fits.gsfc.nasa.gov/samples/testkeys.fits'})
     assert r.json['output']['output_file_download'] == 'file downloaded successfully'
 
+def test_posix_download_file_with_arg_low_download_limit(client, app_low_download_limit):
+    print(app_low_download_limit.config)
+    r = client.get('/api/v1.0/get/testposixpath', query_string={'fits_file_path': 'https://fits.gsfc.nasa.gov/samples/testkeys.fits'})
+    assert r.json['output']['output_file_download'] == 'file not downloaded'
+
 def test_posix_download_file_with_arg_wrong_url(client):
     r = client.get('/api/v1.0/get/testposixpath', query_string={'fits_file_path': 'https://fits.gsfc.nasa.gov/samples/aaaaaa.fits'})
-    assert r.json['exceptions'][0] == ("Exception('An issue occurred when attempting to download the url "
-                                       "https://fits.gsfc.nasa.gov/samples/aaaaaa.fits, this might be related "
+    assert r.json['exceptions'][0] == ("Exception('An issue occurred when attempting to get the file size at the url "
+                                       "https://fits.gsfc.nasa.gov/samples/aaaaaa.fits. This might be related "
                                        "to an invalid url, please check the input provided')")
 
 def test_boolean_default(client):
diff --git a/tests/test_service.py b/tests/test_service.py
index d1805188..e2b21970 100644
--- a/tests/test_service.py
+++ b/tests/test_service.py
@@ -57,6 +57,7 @@ def test_service(client):
 
     for l in sorted(r.json, key=lambda x:x['ctime']):
         logger.info(l)
+
     job = r.json[-1]['fn'].split("/")[-1]
 
     logger.info("job %s", job)