diff --git a/tools/saqc/.shed.yml b/tools/saqc/.shed.yml new file mode 100644 index 00000000..e9463701 --- /dev/null +++ b/tools/saqc/.shed.yml @@ -0,0 +1,9 @@ +categories: +- Statistics +description: System for automated Quality Control +long_description: Anomalies and errors are the rule not the exception when working with time series data. This is especially true, if such data originates from in-situ measurements of environmental properties. Almost all applications, however, implicitly rely on data, that complies with some definition of 'correct'. In order to infer reliable data products and tools, there is no alternative to quality control. SaQC provides all the building blocks to comfortably bridge the gap between 'usually faulty' and 'expected to be corrected' in an accessible, consistent, objective and reproducible way. +name: saqc +owner: mbernt +remote_repository_url: https://github.com/bernt-matthias/mb-galaxy-tools/blob/master/tools/longorf/ +homepage_url: https://git.ufz.de/rdm-software/saqc +type: unrestricted diff --git a/tools/saqc/TODO.md b/tools/saqc/TODO.md new file mode 100644 index 00000000..4248c188 --- /dev/null +++ b/tools/saqc/TODO.md @@ -0,0 +1,7 @@ +saqc: + +- module docstrings missing (except for breaks) +- flagPatternByDTW missing type annotation +- missing `--version` CLI argument +- allow tsv for `--config` +- list[x] should be List[X]? 
#!/usr/bin/env python
"""Demo script: build a Galaxy tool descriptor with galaxyxml and print it.

Every statement below constructs a galaxyxml object, attaches it to ``tool``
and the final ``print`` writes the exported XML to stdout.

NOTE(review): the tool described here is "aragorn" and nothing in this script
refers to SaQC -- it looks like the generic galaxyxml example that was copied
next to ``gen.py`` as a reference. Confirm whether it should be shipped in
``tools/saqc`` at all.
"""
import galaxyxml.tool as gxt
import galaxyxml.tool.parameters as gxtp

# Tool header; the arguments are passed positionally
# (presumably name, id, version, description, executable -- confirm against
# the galaxyxml ``Tool`` signature).
tool = gxt.Tool(
    "aragorn",
    "se.lu.mbioekol.mbio-serv2.aragorn",
    "1.2.36",
    "Aragorn is a tRNA finder",
    "aragorn.exe",
    version_command="aragorn.exe --version",
)

# Containers that collect all input and output parameters defined below.
inputs = gxtp.Inputs()
outputs = gxtp.Outputs()

# Add requirements
requirements = gxtp.Requirements()
requirements.append(gxtp.Requirement("package", "samtools", version="1.0.0"))
requirements.append(gxtp.Container("docker", "one_super_image"))
tool.requirements = requirements

# A parameter
param = gxtp.BooleanParam("flag", label="Flag label", help="Flag help", num_dashes=1)
# The separator between flag and value has to be set as an attribute after
# construction (upstream author's own remark: this is clumsy, PRs welcome).
param.space_between_arg = " "
inputs.append(param)


# A float in a section
section = gxtp.Section("float_section", "Float section")
param = gxtp.FloatParam(
    "float", value=0, label="Float label", help="Float help", num_dashes=1
)
param.space_between_arg = " "
section.append(param)
# Second float: no explicit name (``None``), only ``argument`` is given.
param = gxtp.FloatParam(
    None, argument="--float-fromarg", value=0, label="Float label", help="Float help"
)
section.append(param)
inputs.append(section)

# A conditional
param = gxtp.Conditional("cond", label="Conditional")
param.append(gxtp.SelectParam("Select", options={"hi": "1", "bye": "2"}))
# One ``when`` per select option; only the "bye" branch carries a parameter.
when_a = gxtp.When(value="hi")
when_b = gxtp.When(value="bye")
when_b.append(
    gxtp.IntegerParam("some_int", value=0, num_dashes=1, label="Advanced value")
)
param.append(when_a)
param.append(when_b)
inputs.append(param)

# Integer parameters
param_min = gxtp.IntegerParam(
    "int_min", label="int_min label", help="int_min help", value=0, num_dashes=1
)
param_max = gxtp.IntegerParam(
    "int_max", label="int_max label", help="int_max help", value=0, num_dashes=1
)

# Positional integer parameter.
posint = gxtp.IntegerParam(
    "posint",
    label="posint label",
    positional=True,
    help="posinthelp",
    value=0,
    num_dashes=2,
)

# int_min's override renders both values as one "-i<min>,<max>" argument;
# int_max gets an empty override so it is not rendered a second time.
param_min.command_line_override = "-i$int_min,$int_max"
param_max.command_line_override = ""
param_min.space_between_arg = " "
param_max.space_between_arg = " "
inputs.append(param_min)
inputs.append(param_max)
inputs.append(posint)

# Add Select with options from_file with columns and filter
param = gxtp.SelectParam("select_local")
options = gxtp.Options(from_file="loc_file.loc")
column_a = gxtp.Column("name", 0)
options.append(column_a)
column_b = gxtp.Column("value", 1)
options.append(column_b)
filter_a = gxtp.Filter("sort_by", name="sorted", column="1")
options.append(filter_a)
param.append(options)
inputs.append(param)

# A repeat wrapping an optional, multiple fasta data parameter.
param = gxtp.Repeat("repeat", "repeat title")
data = gxtp.DataParam(
    "data",
    argument="--data",
    optional=True,
    format="fasta",
    multiple=True,
    label="data label",
    help="data help",
)
param.append(data)
inputs.append(param)

# Configfiles
configfiles = gxtp.Configfiles()
configfiles.append(gxtp.Configfile(name="testing", text="Hello <> World"))
configfiles.append(gxtp.ConfigfileDefaultInputs(name="inputs"))

# Outputs
param = gxtp.OutputData("output", format="tabular", num_dashes=1)
param.space_between_arg = " "
outputs.append(param)
# Collection
collection = gxtp.OutputCollection("supercollection", label="a small label")
discover = gxtp.DiscoverDatasets(r"(?P<designation>.+)\.pdf.fasta", format="fasta")
collection.append(discover)
outputs.append(collection)

# Attach everything built so far to the tool.
tool.inputs = inputs
tool.outputs = outputs
tool.help = "HI"
tool.configfiles = configfiles

# Add Tests sections
tool.tests = gxtp.Tests()
test_a = gxtp.Test()
param = gxtp.TestParam("float", value=5.4)
test_a.append(param)
test_out = gxtp.TestOutput(name="output", value="file.out")
test_a.append(test_out)
# NOTE(review): a collection named "pdf_out" is appended here (empty) and a
# second time further down (with one element), while the only output
# collection declared above is "supercollection" -- confirm this is intended.
coll_out = gxtp.TestOutputCollection(name="pdf_out")
test_a.append(coll_out)
rep_out = gxtp.TestRepeat(name="testrepeat")
param = gxtp.TestParam("repeatchild", value="foo")
rep_out.append(param)
test_a.append(rep_out)
test_coll = gxtp.TestOutputCollection(name="pdf_out")
test_elem = gxtp.TestOCElement(name="apdf", file="apdf", ftype="pdf")
test_coll.append(test_elem)
test_a.append(test_coll)
# ``rep_out`` is rebound: this second repeat holds an output instead of a param.
rep_out = gxtp.TestRepeat(name="output_repeat")
param = gxtp.TestOutput(name="repeatout", value="repeatfile.out")
rep_out.append(param)
test_a.append(rep_out)
tool.tests.append(test_a)


# Add comment to the wrapper
tool.add_comment("This tool descriptor has been generated using galaxyxml.")

# Emit the finished descriptor on stdout.
print(tool.export())
is_union_type + + +def _get_doc(doc_str: Optional[str]) -> str: + if not doc_str: + return "" + doc_str = str(doc_str) + doc_str = [x for x in doc_str.split("\n") if x != ""] + doc_str = doc_str[0] + doc_str = doc_str.strip(" .,") + return doc_str + + +def parse_docstring(method: Callable) -> Dict[str, str]: + """ + parse sections from rst formatted doc string + + returns a mapping from section titles to section contents + 1st section title may be '' + """ + + docstring = method.__doc__ + if not docstring: + return "", "" + + sections = {} + + # Regular expressions for sections and paragraphs + section_pattern = r"^([^\S\n]*)(?P\S.*?)(\n\1([=-])+\n)" + + # Extract sections and paragraphs + section_matches = re.finditer(section_pattern, docstring, re.MULTILINE) + + end = 0 + title = "" + for i, match in enumerate(section_matches): + if i == 0 and match.start() > 0: + sections[""] = docstring[: match.start()] + else: + sections[title] = docstring[end: match.start()] + title = match.group("title") + end = match.end() + + return sections + + +def parse_parameter_docs(sections: Dict[str, str]) -> Dict[str, str]: + parameter_doc = {} + parameters = sections.get("Parameters", "") + parameter_pattern = r"^([\S\n]+)( : .*)?$" + for line in parameters.splitlines(): + match = re.match(parameter_pattern, line) + if match: + parameter = match.group(1) + parameter_doc[parameter] = [] + else: + parameter_doc[parameter].append(line) + for key in parameter_doc: + parameter_doc[key] = "\n".join(parameter_doc[key]) + return parameter_doc + + +def get_label_help(param_name, parameter_docs): + + parameter_doc = parameter_docs.get(param_name) + if not parameter_doc: + return param_name, "" + label_split = parameter_doc.split("\n", maxsplit=1) + label = label_split[0] + help = "" + if "." 
def get_modules() -> list[Tuple[str, "module"]]:
    """Return the ``(name, module)`` pairs of all submodules of ``saqc.funcs``."""
    return inspect.getmembers(saqc.funcs, inspect.ismodule)


def get_methods(module):
    """Collect the SaQC methods defined by the classes of *module*.

    A function counts as SaQC method if it is a member of a class found in
    *module* and its ``self`` parameter carries the (stringified) annotation
    ``'SaQC'``.
    """
    saqc_methods = []
    for _, cls in inspect.getmembers(module, inspect.isclass):
        for _, method in inspect.getmembers(cls, inspect.isfunction):
            self_param = inspect.signature(method).parameters.get("self")
            if self_param is not None and self_param.annotation == "'SaQC'":
                saqc_methods.append(method)
    return saqc_methods


def _none_when(param_name):
    """Return the ``none`` branch of a conditional.

    The hidden value ``__none__`` is the marker that
    ``json_to_saqc_config.py`` translates into Python's ``None``.
    """
    when = When(value="none")
    when.append(HiddenParam(name=param_name, value="__none__"))
    return when


def _hidden_when(param_name, option):
    """Return a branch that passes the selected *option* on as the parameter value."""
    when = When(value=option)
    when.append(HiddenParam(name=param_name, value=option))
    return when


def _param_when(option, param):
    """Return a branch for *option* that contains the single input *param*."""
    when = When(value=option)
    when.append(param)
    return when


def _select_conditional(param_name, label, options, **select_kwargs):
    """Return ``<param_name>_cond`` with its ``<param_name>_select`` selector appended."""
    cond = Conditional(name=f"{param_name}_cond")
    cond.append(
        SelectParam(
            name=f"{param_name}_select",
            label=label,
            options=options,
            **select_kwargs,
        )
    )
    return cond


def get_method_params(method, module):
    """Translate the signature of a SaQC method into galaxyxml parameters.

    Each parameter of *method* (except ``self`` and the ``*kwargs``
    arguments) is mapped to a Galaxy input according to its type annotation;
    label and help are taken from the "Parameters" section of the docstring.
    Parameters that accept alternative kinds of input (e.g. a number or an
    offset string) become a conditional ``<name>_cond`` with a selector
    ``<name>_select``. Annotations that are known but cannot be represented
    are reported on stderr and skipped.

    Parameters
    ----------
    method
        the SaQC method to inspect
    module
        the module defining the method (currently unused)

    Returns
    -------
    list
        galaxyxml parameter objects in signature order

    Raises
    ------
    ValueError
        if a parameter has no type annotation
    SystemExit
        if a non-generic annotation is of an unknown type
    """
    sections = parse_docstring(method)
    param_docs = parse_parameter_docs(sections)

    xml_params = []
    for param_name, param in inspect.signature(method).parameters.items():
        # TODO check if *kwargs* really not needed
        if param_name in ("self", "kwargs", "store_kwargs", "ax_kwargs"):
            continue
        annotation = param.annotation
        if annotation is inspect.Parameter.empty:
            raise ValueError(f"missing type annotation for {param_name}")
        if isinstance(annotation, str):
            # NOTE: eval of code from the installed saqc package (trusted);
            # needed to resolve the stringified annotations against the
            # names imported by this module. Never feed external input here.
            annotation = eval(annotation)

        has_default = param.default is not inspect.Parameter.empty
        default = param.default if has_default else None
        value = param.default if has_default else ""

        label, help_text = get_label_help(param_name, param_docs)
        kwargs = {"label": label, "help": help_text, "space_between_arg": "="}
        # keyword arguments shared by (nearly) all value carrying inputs
        common = {
            "argument": param_name,
            "value": value,
            "optional": optional_placeholder,
        } if False else None  # replaced below once `optional` is known

        is_union = is_union_type(annotation)
        args = get_args(annotation)

        # a parameter is optional if None is a valid value
        # this should be the case if None is in the Union and
        # also if None is the default
        optional = param.default is None or (
            is_union and any(a is type(None) for a in args)
        )
        common = {
            "argument": param_name,
            "value": value,
            "optional": optional,
            **kwargs,
        }

        # remove None (we just determined if the parameter is optional)
        if is_union:
            args_wo_none = [a for a in args if a is not type(None)]
            if len(args_wo_none) == 1:
                annotation = args_wo_none[0]
            else:
                annotation = Union[tuple(args_wo_none)]
        origin = get_origin(annotation)
        args = get_args(annotation)

        if param_name in ("field", "target"):
            if annotation != str:
                # several variables allowed: one text input per repeat entry
                parent = Repeat(
                    name=f"{param_name}_repeat", title=f"{param_name}(s)", min=1
                )
                xml_params.append(parent)
            else:
                parent = xml_params
            parent.append(TextParam(**common))
            # TODO should have a validator/sanitizer
        elif origin is None:
            if annotation == bool:
                xml_params.append(
                    BooleanParam(
                        argument=param_name, truevalue="", checked=default, **kwargs
                    )
                )
            elif annotation == str:
                xml_params.append(TextParam(**common))
            elif annotation == int:
                xml_params.append(IntegerParam(**common))
            elif annotation == float:
                xml_params.append(FloatParam(**common))
            elif (
                annotation == GenericFunction
                or annotation == CurveFitter
                or annotation == Any
                or annotation == slice
                or annotation == mpl.axes.Axes
            ):
                sys.stderr.write(
                    f"Ignoring {annotation} simple parameter {param_name} ({method.__name__})\n"
                )
            else:
                # sys.exit instead of the interactive-only builtin ``exit``
                sys.exit(
                    f"Unknown simple parameter type {annotation}: {param_name} {method.__name__}"
                )
        elif annotation == str | Tuple[str, str]:  # window
            # TODO make proper timedelta text, validator regex
            # r"[\dDW:]+(,[\dDW:]+)$": "needs to be a single timedelta or
            # two comma separated timedeltas"
            xml_params.append(TextParam(**common))
        elif annotation == int | Tuple[int, int]:  # periods
            # TODO validator regex r"[\d]+(,[\d]+)$": "needs to be a single
            # number or two comma separated numbers"
            xml_params.append(TextParam(**common))
        elif annotation == int | str and param_name in ("limit", "window"):
            options = {"none": "None"} if optional else {}
            options.update({"number": "number", "timedelta": "timedelta"})
            # the selector is addressed by ``name`` like all other selectors
            # (it was created with ``argument=`` before)
            cond = _select_conditional(
                param_name, f"{param_name} input mode", options
            )
            cond.append(
                _param_when(
                    "number", IntegerParam(**{**common, "help": "Number of values"})
                )
            )
            # TODO validator regex: see `pandas.rolling` for more information
            cond.append(
                _param_when(
                    "timedelta",
                    TextParam(
                        **{**common, "help": "Temporal extensions (offset string)"}
                    ),
                )
            )
            if optional:
                cond.append(_none_when(param_name))
            xml_params.append(cond)
        elif annotation == float | str and param_name in ("cutoff", "freq"):
            number_labels = {
                "cutoff": "Give as multiple of sampling rate",
                "freq": "Give as period length",
            }
            options = {"none": "None"} if optional else {}
            options.update(
                {"number": number_labels[param_name], "offset": "specify as offset"}
            )
            cond = _select_conditional(
                param_name, f"{param_name} input mode", options
            )
            # NOTE(review): the help of the numeric input is the same for
            # cutoff and freq although the option labels differ - confirm.
            cond.append(
                _param_when(
                    "number",
                    FloatParam(**{**common, "help": "Multiple of sampling rate"}),
                )
            )
            # TODO validator regex: see `pandas.rolling` for more information
            cond.append(
                _param_when(
                    "offset",
                    TextParam(**{**common, "help": "offset frequency string"}),
                )
            )
            # exactly one ``none`` branch (an additional empty one was
            # emitted before, yielding a duplicate <when value="none">)
            if optional:
                cond.append(_none_when(param_name))
            xml_params.append(cond)
        elif (
            annotation == Literal["auto"] | float
            or annotation == Literal["auto"] | float | Callable
        ):
            with_custom = annotation == Literal["auto"] | float | Callable
            options = {"auto": "automatic", "linear": "linear"}
            if with_custom:
                options["custom"] = "custom"
            if optional:
                options["none"] = "None"
            cond = _select_conditional(param_name, f"{param_name} mode", options)
            cond.append(_hidden_when(param_name, "auto"))
            # a numeric default prefills the float input, everything else
            # the custom text input; None and callables raise TypeError
            try:
                fvalue = float(value)
                cvalue = ""
            except (TypeError, ValueError):
                fvalue = ""
                cvalue = value
            cond.append(
                _param_when("linear", FloatParam(**{**common, "value": fvalue}))
            )
            if with_custom:
                # TODO validator
                cond.append(
                    _param_when("custom", TextParam(**{**common, "value": cvalue}))
                )
            if optional:
                cond.append(_none_when(param_name))
            xml_params.append(cond)
        elif (annotation == Literal["valid", "complete"] | list[str]) or (
            annotation == Union[Literal["valid", "complete"], list[str]]
        ):
            options = {
                "valid": "valid",
                "complete": "complete",
                "list": "list",
            }
            if optional:
                options["none"] = "None"
            cond = _select_conditional(
                param_name, f"{param_name} mode", options, default=default
            )
            for option in ("valid", "complete"):
                cond.append(_hidden_when(param_name, option))
            # the text input was created but never added to its branch
            # TODO validator
            cond.append(_param_when("list", TextParam(**common)))
            # ``none`` is passed as ``__none__`` (-> None in the config)
            # only; the extra branch passing the string "none" is gone
            if optional:
                cond.append(_none_when(param_name))
            xml_params.append(cond)
        elif is_callable_type(annotation):
            # set default to "" (otherwise potentially 'cryptic' default is shown)
            # add default to help
            callable_help = f"{help_text} function {args} (default: {value})".strip()
            # TODO validator
            xml_params.append(
                TextParam(**{**common, "value": "", "help": callable_help})
            )
        elif annotation == str | pd.Timedelta:
            timedelta_help = (
                help_text
                + " see: https://pandas.pydata.org/docs/user_guide/timedeltas.html#parsing"
            )
            # TODO validator
            xml_params.append(TextParam(**{**common, "help": timedelta_help}))
        elif origin is Literal:
            xml_params.append(SelectParam(options={o: o for o in args}, **common))
        elif (
            annotation
            == Sequence[ForwardRef("SaQC")] | dict["SaQC", str | Sequence[str]]
        ):
            sys.stderr.write(
                f"TODO Ignoring {annotation} parameter {param_name} ({method.__name__})\n"
            )
        elif (
            is_union
            and len(args) > 1  # Optional[X] leaves the args of X here
            and is_callable_type(args[0])
            and args[1] == Literal["linear", "exponential"]
        ):
            options = {
                "linear": "linear",
                "exponential": "exponential",
                "custom": "custom",
            }
            if optional:
                options["none"] = "None"
            cond = _select_conditional(param_name, "Model function", options)
            # the predefined models need to carry their name as value,
            # empty branches would never reach the generated config
            cond.append(_hidden_when(param_name, "linear"))
            cond.append(_hidden_when(param_name, "exponential"))
            # TODO validator
            cond.append(_param_when("custom", TextParam(**common)))
            if optional:
                cond.append(_none_when(param_name))
            xml_params.append(cond)
        elif annotation == pd.Series | pd.DataFrame | DictOfSeries | list | np.ndarray:
            # for instance mdata in flagtools.flagManual, should refer to a field
            xml_params.append(TextParam(**common))
            # TODO should have a validator/sanitizer
        else:
            sys.stderr.write(
                f"Unknown parameter type {annotation}: {param_name} {method.__name__}\n"
            )
    return xml_params


def get_methods_conditional(methods, module):
    """Build the ``method_cond`` conditional offering all *methods* of *module*.

    The selector ``method_select`` lists every method as
    ``"<name>: <first docstring line>"``; each method gets a branch holding
    the inputs returned by :func:`get_method_params`. A method whose
    parameters cannot be translated (``ValueError``) is reported on stderr
    and keeps an empty branch.
    """
    method_conditional = Conditional(name="method_cond", label="Method")

    select_options = {}
    for method in methods:
        method_name = method.__name__
        summary = _get_doc(method.__doc__) or method_name
        select_options[method_name] = f"{method_name}: {summary}"
    method_conditional.append(
        SelectParam(name="method_select", label="Method", options=select_options)
    )

    for method in methods:
        method_name = method.__name__
        method_when = When(value=method_name)
        try:
            for p in get_method_params(method, module):
                method_when.append(p)
        except ValueError as e:
            # TODO mark somehow
            sys.stderr.write(
                f"Skipping {method_name} in {module.__name__} due to {e}\n"
            )
        method_conditional.append(method_when)

    return method_conditional
scheme to use +# --nodata FLOAT nodata value +# --log-level [DEBUG|INFO|WARNING] +# set output verbosity + +tool = Tool( + "SaQC", + "saqc", + version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@", + description="quality control pipelines for environmental sensor data", + executable="saqc", + macros=["macros.xml"], + command_override=command_override, + profile="22.01", + version_command="python -c 'import saqc; print(saqc.__version__)'", +) +tool.help = "TODO" + +tool.configfiles = Configfiles() +tool.configfiles.append(ConfigfileDefaultInputs(name="param_conf")) +inputs = tool.inputs = Inputs() +inputs.append(DataParam(argument="--data", format="csv", multiple=True, label="Input table")) + +outputs = tool.outputs = Outputs() + +outputs.append(OutputData(name="output", format="csv", from_work_dir="output.csv")) +plot_outputs = OutputCollection( + name="plots", type="list", label="${tool.name} on ${on_string}: Plots" +) +plot_outputs.append(DiscoverDatasets(pattern=r"(?P<name>.*)\.png", ext="png")) +# plot_outputs.append(OutputFilter(text="TODO")) +outputs.append(OutputData(name="config", format="txt", from_work_dir="config.csv")) +# TODO filter +outputs.append(plot_outputs) + +modules = get_modules() + +module_repeat = Repeat(name="methods_repeat", title="Methods") +inputs.append(module_repeat) +module_conditional = Conditional(name="module_cond", label="Module") +module_select_options = [] +for module_name, module in modules: + module_doc = _get_doc(module.__doc__) + if not module_doc: + module_doc = module_name + module_select_options.append((module_name, f"{module_name}: {module_doc}")) +module_select = SelectParam( + name="module_select", label="saqc module", options=dict(module_select_options) +) +module_conditional.append(module_select) +for module_name, module in modules: + module_when = When(value=module_name) + methods = get_methods(module) + methods_conditional = get_methods_conditional(methods, module) + module_when.append(methods_conditional) + 
#!/usr/bin/env python
"""Convert the JSON dump of the Galaxy tool inputs into a SaQC configuration.

Usage: ``json_to_saqc_config.py INPUTS_JSON > config.csv``

For every entry of the ``methods_repeat`` repeat one configuration line
``<field>; <method>(<param>=<value>, ...)`` is written to stdout.
"""

import json
import sys


def _pop_field(method_params):
    """Remove and return the variable name(s) the method is applied to.

    The generator emits either a plain ``field`` input or, if several
    variables are allowed, a repeat ``field_repeat`` holding one ``field``
    per entry. The latter is returned as list of names.
    """
    if "field" in method_params:
        return method_params.pop("field")
    return [entry["field"] for entry in method_params.pop("field_repeat")]


def _flatten_params(method_params):
    """Return the ``(name, value)`` pairs of the method call in input order.

    Structures that only exist because of the tool form are resolved:

    - conditionals (``<name>_cond``) contribute their inner parameters, the
      selector ``<name>_select`` is dropped and the marker ``__none__`` is
      translated to ``None``
    - repeats (``<name>_repeat``) of single inputs become a list of values
    """
    items = []
    for key, value in method_params.items():
        if isinstance(value, dict) and key.endswith("_cond"):
            selector = f"{key[:-5]}_select"
            for name, inner in value.items():
                if name == selector:
                    continue
                items.append((name, None if inner == "__none__" else inner))
        elif (
            key.endswith("_repeat")
            and isinstance(value, list)
            and all(isinstance(e, dict) and key[:-7] in e for e in value)
        ):
            name = key[:-7]
            items.append((name, [entry[name] for entry in value]))
        else:
            items.append((key, value))
    return items


def _format_value(value):
    """Render *value* as it should appear in the configuration.

    Strings are double quoted with quotes, backslashes and control
    characters escaped, so a value can not break the generated line; all
    other values use their Python representation.
    """
    if isinstance(value, str):
        return json.dumps(value, ensure_ascii=False)
    return str(value)


def format_config_line(method_params):
    """Return the SaQC configuration line for one ``method_cond`` entry.

    *method_params* is the dictionary found under
    ``methods_repeat[i]["module_cond"]["method_cond"]``; it is not modified.
    """
    remaining = dict(method_params)
    method = remaining.pop("method_select")
    field = _pop_field(remaining)

    varname = ",".join(field) if isinstance(field, list) else field
    arguments = ", ".join(
        f"{name}={_format_value(value)}" for name, value in _flatten_params(remaining)
    )
    return f"{varname}; {method}({arguments})"


def main(argv=None):
    """Read the JSON file given as first argument and print the configuration."""
    argv = sys.argv if argv is None else argv
    with open(argv[1], encoding="utf-8") as fh:
        params = json.load(fh)

    # print header (important: SaQC ignores the 1st line)
    print("varname; function")
    for entry in params["methods_repeat"]:
        print(format_config_line(entry["module_cond"]["method_cond"]))


if __name__ == "__main__":
    main()
version="@TOOL_VERSION@">saqc</requirement> + </requirements> +</xml> + +<token name="@TOOL_VERSION@">2.4.1</token> +<token name="@VERSION_SUFFIX@">0</token> + +<xml name="citations"/> + +<xml name="saqc_tests"> +<tests> + <!-- https://rdm-software.pages.ufz.de/saqc/gettingstarted/TutorialCLI.html#get-toy-data-and-configuration + varname;test + SM2 ;flagRange(min=10, max=60) + SM2 ;flagMAD(window="30d", z=3.5) + SM2 ;plot() --> + <test> + <param name="data" value="test1/data.csv" ftype="csv"/> + <repeat name="methods_repeat"> + <conditional name="module_cond"> + <param name="module_select" value="outliers"/> + <conditional name="method_cond"> + <param name="method_select" value="flagRange"/> + <param name="field" value="SM2"/> + <param name="min" value="10"/> + <param name="max" value="60"/> + </conditional> + </conditional> + </repeat> + <repeat name="methods_repeat"> + <conditional name="module_cond"> + <param name="module_select" value="outliers"/> + <conditional name="method_cond"> + <param name="method_select" value="flagMAD"/> + <param name="field" value="SM2"/> + <param name="center" value="false"/> + <conditional name="window_cond"> + <param name="window_select" value="timedelta"/> + <param name="window" value="30d"/> + </conditional> + <param name="z" value="3.5"/> + </conditional> + </conditional> + </repeat> + <repeat name="methods_repeat"> + <conditional name="module_cond"> + <param name="module_select" value="tools"/> + <conditional name="method_cond"> + <!-- plot names should be determined automatically --> + <param name="path" value="test"/> + <param name="method_select" value="plot"/> + <param name="field" value="SM2"/> + </conditional> + </conditional> + </repeat> + <output name="output" value="test1/out.csv" ftype="csv"/> + <output name="config" ftype="txt"> + <assert_contents> + <has_n_lines n="4"/> + <has_n_columns n="2" sep=";"/> + <has_line_matching expression="SM2; flagRange\(min=10.*, max=60.*\)"/> + <has_line_matching expression='SM2; 
flagMAD\(window=\"30d\", z=3\.5.*\)'/> + </assert_contents> + </output> + <output_collection name="plots" type="list"> + <element name="test" ftype="png"> + <assert_contents> + <has_text text="PNG"/> + <has_size value="150k" delta="10k"/> + </assert_contents> + </element> + </output_collection> + </test> + + <!-- data.csv holds the actual observation data, maint.csv holds manually maintained records of maintenance work on the sensors. SaQC is then run with the following configuration: --> + <test> + <!-- + SAK254 ; flagManual(mdata="maint", method="closed") + SAK254 ; flagRange(min=1, max=200) + SAK254 ; flagConstants(thresh=0, window="4h") + SAK254 ; flagByStatLowPass(func="var", window="8h", sub_window="2h", thresh=0.19) + SAK254 ; flagUniLOF(thresh=2.2) + SAK254 ; flagIsolated(gap_window="30min", group_window="1h") --> + + <param name="data" value="test2/data.csv,test2/maint.csv" ftype="csv"/> + <repeat name="methods_repeat"> + <conditional name="module_cond"> + <param name="module_select" value="flagtools"/> + <conditional name="method_cond"> + <param name="method_select" value="flagManual"/> + <param name="field" value="SAK254"/> + <param name="mdata" value="maint"/> + <param name="method" value="closed"/> + </conditional> + </conditional> + </repeat> + <repeat name="methods_repeat"> + <conditional name="module_cond"> + <param name="module_select" value="outliers"/> + <conditional name="method_cond"> + <param name="method_select" value="flagRange"/> + <param name="field" value="SAK254"/> + <param name="min" value="1"/> + <param name="max" value="200"/> + </conditional> + </conditional> + </repeat> + <repeat name="methods_repeat"> + <conditional name="module_cond"> + <param name="module_select" value="constants"/> + <conditional name="method_cond"> + <param name="method_select" value="flagConstants"/> + <param name="field" value="SAK254"/> + <param name="thresh" value="0"/> + <conditional name="window_cond"> + <param name="window_select" value="timedelta"/> + <param name="window" 
value="4h"/> + </conditional> + </conditional> + </conditional> + </repeat> + <repeat name="methods_repeat"> + <conditional name="module_cond"> + <param name="module_select" value="noise"/> + <conditional name="method_cond"> + <param name="method_select" value="flagByStatLowPass"/> + <param name="field" value="SAK254"/> + <param name="func" value="var"/> + <param name="window" value="8h"/> + <param name="sub_window" value="2h"/> + <param name="thresh" value="0.19"/> + </conditional> + </conditional> + </repeat> + <repeat name="methods_repeat"> + <conditional name="module_cond"> + <param name="module_select" value="outliers"/> + <conditional name="method_cond"> + <param name="method_select" value="flagUniLOF"/> + <param name="field" value="SAK254"/> + <conditional name="thresh_cond"> + <param name="thresh_select" value="linear"/> + <param name="thresh" value="2.2"/> + </conditional> + </conditional> + </conditional> + </repeat> + <repeat name="methods_repeat"> + <conditional name="module_cond"> + <param name="module_select" value="breaks"/> + <conditional name="method_cond"> + <param name="method_select" value="flagIsolated"/> + <param name="field" value="SAK254"/> + <param name="gap_window" value="30min"/> + <param name="group_window" value="1h"/> + </conditional> + </conditional> + </repeat> + <output name="config" ftype="txt"> + <assert_contents> + <!-- <has_n_lines n="4"/> --> + <has_n_columns n="2" sep=";"/> + <has_line_matching expression='SAK254; flagManual\(mdata=\"maint\", method=\"closed\".*\)'/> + <has_line_matching expression='SAK254; flagRange\(min=1.*, max=200.*\)'/> + <has_line_matching expression='SAK254; flagConstants\(thresh=0.*, window=\"4h\".*\)'/> + <has_line_matching expression='SAK254; flagByStatLowPass\(func="var", window=\"8h\", thresh=0.19, sub_window=\"2h\".*\)'/> + <has_line_matching expression='SAK254; flagUniLOF\(.*thresh=2.2.*\)'/> + <has_line_matching expression='SAK254; flagIsolated\(gap_window=\"30min\", group_window=\"1h\".*\)'/> + 
</assert_contents> + </output> + <output name="output" value="test2/out.csv" ftype="csv"> + <!-- <assert_contents> + <has_n_lines n="14696"/> + <has_n_columns n="7" sep=","/> + <has_text text=",UNFLAGGED,"/> + </assert_contents> --> + </output> + </test> + <!-- docs/resources/data/config_ci.csv + +# SM2;align(freq="15Min", method="nshift") +# '.*';flagRange(min=10, max=60) +SM2;flagMissing() +# SM2;flagRange(min=10, max=60) +# SM2;flagMAD(window="30d", z=3.5) --> + + <!-- <test> + <repeat name="methods_repeat"> + <conditional name="module_cond"> + <param name="module_select" value="interpolation"/> + <conditional name="method_cond"> + <param name="method_select" value="align"/> + <param name="field" value="SM2"/> + <param name="freq" value="15Min"/> + <param name="method" value="polynomial"/> + </conditional> + </conditional> + </repeat> + <assert_command> + <has_text text="SM2"/> + </assert_command> + </test> --> +</tests> +</xml> +</macros> \ No newline at end of file diff --git a/tools/saqc/saqc.xml b/tools/saqc/saqc.xml new file mode 100644 index 00000000..dadc0cd3 --- /dev/null +++ b/tools/saqc/saqc.xml @@ -0,0 +1,1212 @@ +<tool name="SaQC" id="saqc" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="22.01"> + <description>quality control pipelines for environmental sensor data</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements"/> + <stdio> + <exit_code range="1:" level="fatal"/> + </stdio> + <version_command><![CDATA[python -c 'import saqc; print(saqc.__version__)']]></version_command> + <command><![CDATA[python '$__tool_directory__/json_to_saqc_config.py' '$param_conf' > config.csv && +#for $i, $d in enumerate($data) + ## TODO maybe link to element_identifier + ln -s '$d' '${i}.csv' && +#end for +saqc -c config.csv +#for $i, $d in enumerate($data) + -d '${i}.csv' +#end for +-o output.csv]]></command> + <configfiles> + <inputs name="param_conf"/> + </configfiles> + <inputs> + <param argument="--data" type="data"
label="Input table" format="csv" multiple="true"/> + <repeat name="methods_repeat" title="Methods"> + <conditional name="module_cond" label="Module"> + <param name="module_select" type="select" label="saqc module"> + <option value="breaks">breaks: Detecting breaks in data</option> + <option value="changepoints">changepoints: changepoints</option> + <option value="constants">constants: constants</option> + <option value="curvefit">curvefit: curvefit</option> + <option value="drift">drift: drift</option> + <option value="flagtools">flagtools: flagtools</option> + <option value="generic">generic: generic</option> + <option value="interpolation">interpolation: interpolation</option> + <option value="noise">noise: noise</option> + <option value="outliers">outliers: outliers</option> + <option value="pattern">pattern: pattern</option> + <option value="resampling">resampling: resampling</option> + <option value="residuals">residuals: residuals</option> + <option value="rolling">rolling: rolling</option> + <option value="scores">scores: scores</option> + <option value="tools">tools: tools</option> + <option value="transformation">transformation: transformation</option> + </param> + <when value="breaks"> + <conditional name="method_cond" label="Method"> + <param name="method_select" type="select" label="Method"> + <option value="flagIsolated">flagIsolated: Find and flag temporal isolated groups of data</option> + <option value="flagJumps">flagJumps: Flag jumps and drops in data</option> + <option value="flagMissing">flagMissing: Flag NaNs in data</option> + </param> + <when value="flagIsolated"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="gap_window" type="text" value="" optional="false" label="Minimum gap size required before and after a data group to consider it" help="isolated. 
See condition (2) and (3)"/> + <param argument="group_window" type="text" value="" optional="false" label="Maximum size of a data chunk to consider it a candidate for an isolated group" help="Data chunks that are bigger than the ``group_window`` are ignored. This does not include the possible gaps surrounding it. See condition (1)."/> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + <when value="flagJumps"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="thresh" type="float" value="" optional="false" label="Threshold value by which the mean of data has to jump, to trigger flagging" help=""/> + <param argument="window" type="text" value="" optional="false" label="Size of the two moving windows" help="for calculating the mean in every window. The window size should be big enough to yield enough samples for a reliable mean calculation, but it should also not be arbitrarily big, since it also limits the density of jumps that can be detected. 
More precisely: Jumps that are not distanced to each other by more than three fourths (3/4) of the selected window size, will not be detected reliably."/> + <param argument="min_periods" type="integer" value="1" optional="false" label="The minimum number of observations in window required to calculate a valid" help="mean value."/> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark observations" help=""/> + <param argument="dfilter" type="float" value="-inf" optional="false" label="Defines which observations will be masked based on the already existing flags" help=""/> + </when> + <when value="flagMissing"> + <param argument="field" type="text" value="" optional="false" label="field" help=""/> + <param argument="flag" type="float" value="255.0" optional="false" label="flag" help=""/> + <param argument="dfilter" type="float" value="-inf" optional="false" label="dfilter" help=""/> + </when> + </conditional> + </when> + <when value="changepoints"> + <conditional name="method_cond" label="Method"> + <param name="method_select" type="select" label="Method"> + <option value="assignChangePointCluster">assignChangePointCluster: Label data where it changes significantly</option> + <option value="flagChangePoints">flagChangePoints: Flag values that represent a system state transition</option> + </param> + <when value="assignChangePointCluster"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="stat_func" type="text" value="" optional="false" label="A function that assigns a value to every twin window" help="be passed to first variable, right window content will be passed to the second.function ([&lt;class 'numpy.ndarray'&gt;, &lt;class 'numpy.ndarray'&gt;], &lt;class 'float'&gt;) (default: )"/> + <param argument="thresh_func" type="text" value="" optional="false" label="A function that determines the value level, exceeding which qualifies a" help="timestamps
func value as denoting a changepoint.function ([&lt;class 'numpy.ndarray'&gt;, &lt;class 'numpy.ndarray'&gt;], &lt;class 'float'&gt;) (default: )"/> + <param argument="window" type="text" value="" optional="false" label="Size of the rolling windows the calculation is performed in" help="frequency offset, it applies for the backward- and the forward-facing window. If two offsets (as a tuple) is passed the first defines the size of the backward facing window, the second the size of the forward facing window."/> + <param argument="min_periods" type="text" value="" optional="false" label="Minimum number of observations in a window required to perform the changepoint" help="test. If it is a tuple of two int, the first refer to the backward-, the second to the forward-facing window."/> + <param argument="reduce_window" type="text" optional="true" label="The sliding window search method is not an exact CP search method and usually" help="there won't be detected a single changepoint, but a &quot;region&quot; of change around a changepoint. If `reduce_window` is given, for every window of size `reduce_window`, there will be selected the value with index `reduce_func(x, y)` and the others will be dropped. If `reduce_window` is None, the reduction window size equals the twin window size, the changepoints have been detected with."/> + <param argument="reduce_func" type="text" value="" optional="false" label="A function that must return an index value upon input of two arrays x and y" help="First input parameter will hold the result from the stat_func evaluation for every reduction window. Second input parameter holds the result from the thresh_func evaluation.
The default reduction function just selects the value that maximizes the stat_func.function ([&lt;class 'numpy.ndarray'&gt;, &lt;class 'numpy.ndarray'&gt;], &lt;class 'float'&gt;) (default: &lt;function ChangepointsMixin.&lt;lambda&gt; at 0x7fb9ff491e40&gt;)"/> + <param argument="model_by_resids" type="boolean" label="If True, the results of `stat_funcs` are written, otherwise the regime labels" help="" checked="false" truevalue="" falsevalue=""/> + </when> + <when value="flagChangePoints"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="stat_func" type="text" value="" optional="false" label="A function that assigns a value to every twin window" help="window content will be passed as the first array, the forward-facing window content as the second.function ([&lt;class 'numpy.ndarray'&gt;, &lt;class 'numpy.ndarray'&gt;], &lt;class 'float'&gt;) (default: )"/> + <param argument="thresh_func" type="text" value="" optional="false" label="A function that determines the value level, exceeding which qualifies a" help="timestamps func value as denoting a change-point.function ([&lt;class 'numpy.ndarray'&gt;, &lt;class 'numpy.ndarray'&gt;], &lt;class 'float'&gt;) (default: )"/> + <param argument="window" type="text" value="" optional="false" label="Size of the moving windows" help="calculating the statistic. If it is a single frequency offset, it applies for the backward- and the forward-facing window. If two offsets (as a tuple) is passed the first defines the size of the backward facing window, the second the size of the forward facing window."/> + <param argument="min_periods" type="text" value="" optional="false" label="Minimum number of observations in a window required to perform the changepoint" help="test.
If it is a tuple of two int, the first refer to the backward-, the second to the forward-facing window."/> + <param argument="reduce_window" type="text" optional="true" label="The sliding window search method is not an exact CP search method and usually" help="there won't be detected a single changepoint, but a &quot;region&quot; of change around a changepoint. If `reduce_window` is given, for every window of size `reduce_window`, there will be selected the value with index `reduce_func(x, y)` and the others will be dropped. If `reduce_window` is None, the reduction window size equals the twin window size, the changepoints have been detected with."/> + <param argument="reduce_func" type="text" value="" optional="false" label="A function that must return an index value upon input of two arrays x and y" help="First input parameter will hold the result from the stat_func evaluation for every reduction window. Second input parameter holds the result from the `thresh_func` evaluation. The default reduction function just selects the value that maximizes the `stat_func`.function ([&lt;class 'numpy.ndarray'&gt;, &lt;class 'numpy.ndarray'&gt;], &lt;class 'int'&gt;) (default: &lt;function ChangepointsMixin.&lt;lambda&gt; at 0x7fb9ff491a80&gt;)"/> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + </conditional> + </when> + <when value="constants"> + <conditional name="method_cond" label="Method"> + <param name="method_select" type="select" label="Method"> + <option value="flagByVariance">flagByVariance: Flag low-variance data</option> + <option value="flagConstants">flagConstants: Flag constant data values</option> + </param> + <when value="flagByVariance"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="window" type="text" value="" optional="false" label="Size of the moving window" help="for calculating the statistic.
Each window will be a fixed size. If its an offset then this will be the time period of each window. Each window will be sized, based on the number of observations included in the time-period."/> + <param argument="thresh" type="float" value="" optional="false" label="Maximum total variance allowed per window" help=""/> + <param argument="maxna" type="integer" optional="true" label="Maximum number of NaNs allowed in window" help="If more NaNs are present, the window is not flagged."/> + <param argument="maxna_group" type="integer" optional="true" label="Same as `maxna` but for consecutive NaNs" help=""/> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + <when value="flagConstants"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="thresh" type="float" value="" optional="false" label="Maximum total change allowed per window" help=""/> + <conditional name="window_cond"> + <param argument="window_select" type="select" label="window input mode"> + <option value="number">number</option> + <option value="timedelta">timedelta</option> + </param> + <when value="number"> + <param argument="window" type="integer" value="" optional="false" label="Size of the moving window" help="Number of values"/> + </when> + <when value="timedelta"> + <param argument="window" type="text" value="" optional="false" label="Size of the moving window" help="Temporal extensions (offset string)"/> + </when> + </conditional> + <param argument="min_periods" type="integer" value="2" optional="false" label="min_periods" help=""/> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + </conditional> + </when> + <when value="curvefit"> + <conditional name="method_cond" label="Method"> + <param name="method_select" type="select" 
label="Method"> + <option value="fitLowpassFilter">fitLowpassFilter: Fits the data using the Butterworth filter</option> + <option value="fitPolynomial">fitPolynomial: Fits a polynomial model to the data</option> + </param> + <when value="fitLowpassFilter"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <conditional name="cutoff_cond"> + <param name="cutoff_select" type="select" label="cutoff input mode"> + <option value="number">Give as multiple of sampling rate</option> + <option value="offset">specify as offset</option> + </param> + <when value="number"> + <param argument="cutoff" type="float" value="" optional="false" label="The cutoff-frequency, either an offset freq string, or expressed in multiples of the sampling rate" help="Multiple of sampling rate"/> + </when> + <when value="offset"> + <param argument="cutoff" type="text" value="" optional="false" label="The cutoff-frequency, either an offset freq string, or expressed in multiples of the sampling rate" help="offset frequency string"/> + </when> + </conditional> + <param argument="nyq" type="float" value="0.5" optional="false" label="The Nyquist-frequency" help=""/> + <param argument="filter_order" type="integer" value="2" optional="false" label="filter_order" help=""/> + <param argument="fill_method" type="select" value="linear" optional="false" label="Fill method to be applied on the data before filtering (butterfilter can't" help="handle ''np.nan'').
See documentation of pandas.Series.interpolate method for details on the methods associated with the different keywords."> + <option value="linear">linear</option> + <option value="nearest">nearest</option> + <option value="zero">zero</option> + <option value="slinear">slinear</option> + <option value="quadratic">quadratic</option> + <option value="cubic">cubic</option> + <option value="spline">spline</option> + <option value="barycentric">barycentric</option> + <option value="polynomial">polynomial</option> + </param> + </when> + <when value="fitPolynomial"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <conditional name="window_cond"> + <param argument="window_select" type="select" label="window input mode"> + <option value="number">number</option> + <option value="timedelta">timedelta</option> + </param> + <when value="number"> + <param argument="window" type="integer" value="" optional="false" label="Size of the window you want to use for fitting" help="Number of values"/> + </when> + <when value="timedelta"> + <param argument="window" type="text" value="" optional="false" label="Size of the window you want to use for fitting" help="Temporal extensions (offset string)"/> + </when> + </conditional> + <param argument="order" type="integer" value="" optional="false" label="Degree of the polynomial used for fitting" help=""/> + <param argument="min_periods" type="integer" value="0" optional="false" label="Minimum number of observations in a window required to perform the fit," help="otherwise NaNs will be assigned. If ``None``, `min_periods` defaults to 1 for integer windows and to the size of the window for offset based windows. 
Passing 0, disables the feature and will result in over-fitting for too sparse windows."/> + </when> + </conditional> + </when> + <when value="drift"> + <conditional name="method_cond" label="Method"> + <param name="method_select" type="select" label="Method"> + <option value="assignRegimeAnomaly">assignRegimeAnomaly: A function to detect values belonging to an anomalous regime regarding modelling</option> + <option value="correctDrift">correctDrift: The function corrects drifting behavior</option> + <option value="correctOffset">correctOffset: Parameters</option> + <option value="correctRegimeAnomaly">correctRegimeAnomaly: Function fits the passed model to the different regimes in data[field] and tries to correct</option> + <option value="flagDriftFromNorm">flagDriftFromNorm: Flags data that deviates from an average data course</option> + <option value="flagDriftFromReference">flagDriftFromReference: Flags data that deviates from a reference course. Deviation is measured by a</option> + <option value="flagRegimeAnomaly">flagRegimeAnomaly: Flags anomalous regimes regarding to modelling regimes of ``field``</option> + </param> + <when value="assignRegimeAnomaly"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="cluster_field" type="text" value="" optional="false" label="Column in data, holding the cluster labels for the samples in field" help="(has to be indexed equal to field)"/> + <param argument="spread" type="float" value="" optional="false" label="A threshold denoting the value level, up to which clusters are agglomerated" help=""/> + <param argument="method" type="select" value="single" optional="false" label="The linkage method for hierarchical (agglomerative) clustering of the variables" help=""> + <option value="single">single</option> + <option value="complete">complete</option> + <option value="average">average</option> + <option value="weighted">weighted</option> + <option
value="centroid">centroid</option> + <option value="median">median</option> + <option value="ward">ward</option> + </param> + <param argument="metric" type="text" value="" optional="false" label="A metric function for calculating the dissimilarity between 2 regimes" help="Defaults to the difference in mean.function ([&lt;class 'numpy.ndarray'&gt;, &lt;class 'numpy.ndarray'&gt;], &lt;class 'float'&gt;) (default: &lt;function DriftMixin.&lt;lambda&gt; at 0x7fb9fd16f2e0&gt;)"/> + <param argument="frac" type="float" value="0.5" optional="false" label="Has to be in [0,1]" help="the &quot;normal&quot; group has to comprise to be the normal group actually."/> + </when> + <when value="correctDrift"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="maintenance_field" type="text" value="" optional="false" label="Column holding the support-points information" help="The data is expected to have the following form: The index of the series represents the beginning of a maintenance event, whereas the values represent its endings."/> + <conditional name="model_cond"> + <param name="model_select" type="select" label="Model function"> + <option value="linear">linear</option> + <option value="exponential">exponential</option> + <option value="custom">custom</option> + </param> + <when value="linear"/> + <when value="exponential"/> + <when value="custom"> + <param argument="model" type="text" value="" optional="false" label="A model function describing the drift behavior, that is to be corrected" help="Either use built-in exponential or linear drift model by passing a string, or pass a custom callable. The model function must always contain the keyword parameters 'origin' and 'target'. The starting parameter must always be the parameter, by which the data is passed to the model. After the data parameter, there can occur an arbitrary number of model calibration arguments in the signature.
See the Notes section for an extensive description."/> + </when> + </conditional> + <param argument="cal_range" type="integer" value="5" optional="false" label="Number of values to calculate the mean of, for obtaining the value level directly" help="after and directly before a maintenance event. Needed for shift calibration."/> + </when> + <when value="correctOffset"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="max_jump" type="float" value="" optional="false" label="when searching for changepoints in mean - this is the threshold a mean difference in the" help="sliding window search must exceed to trigger changepoint detection."/> + <param argument="spread" type="float" value="" optional="false" label="threshold denoting the maximum, regimes are allowed to absolutely differ in their means" help="to form the &quot;normal group&quot; of values."/> + <param argument="window" type="text" value="" optional="false" label="Size of the adjacent windows that are used to search for the mean changepoints" help=""/> + <param argument="min_periods" type="integer" value="" optional="false" label="Minimum number of periods a search window has to contain, for the result of the changepoint" help="detection to be considered valid."/> + <param argument="tolerance" type="text" optional="true" label="If an offset string is passed, a data chunk of length `offset` right from the" help="start and right before the end of any regime is ignored when calculating a regime's mean for data correction.
This is to account for the unreliability of data near the changepoints of regimes."/> + </when> + <when value="correctRegimeAnomaly"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="cluster_field" type="text" value="" optional="false" label="A string denoting the field in data, holding the cluster label for the data you want" help="to correct."/> + <param argument="tolerance" type="text" optional="true" label="If an offset string is passed, a data chunk of length `offset` right at the" help="start and right at the end is ignored when fitting the model. This is to account for the unreliability of data near the changepoints of regimes."/> + <param argument="epoch" type="boolean" label="If True, use &quot;seconds from epoch&quot; as x input to the model func, instead of" help="&quot;seconds from regime start&quot;." checked="false" truevalue="" falsevalue=""/> + </when> + <when value="flagDriftFromNorm"> + <repeat name="field_repeat" title="field(s)" min="1"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + </repeat> + <param argument="window" type="text" value="" optional="false" label="Frequency, that split the data in chunks" help=""/> + <param argument="spread" type="float" value="" optional="false" label="Maximum spread allowed in the group of *normal* data" help=""/> + <param argument="frac" type="float" value="0.5" optional="false" label="Fraction defining the normal group" help="The higher the value, the more stable the algorithm will be. For values below 0.5 the results are undefined."/> + <param argument="metric" type="text" value="" optional="false" label="Distance function that takes two arrays as input and returns a scalar float" help="This value is interpreted as the distance of the two input arrays.
Defaults to the `averaged manhattan metric` (see Notes).function ([numpy.ndarray | pandas.core.series.Series, numpy.ndarray | pandas.core.series.Series], &lt;class 'numpy.ndarray'&gt;) (default: &lt;function cityblock at 0x7fb9fd156480&gt;)"/> + <param argument="method" type="select" value="single" optional="false" label="Linkage method used for hierarchical (agglomerative) clustering of the data" help="`method` is directly passed to ``scipy.cluster.hierarchy.linkage``. See its documentation [1] for more details. For a general introduction on hierarchical clustering see [2]."> + <option value="single">single</option> + <option value="complete">complete</option> + <option value="average">average</option> + <option value="weighted">weighted</option> + <option value="centroid">centroid</option> + <option value="median">median</option> + <option value="ward">ward</option> + </param> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + <when value="flagDriftFromReference"> + <repeat name="field_repeat" title="field(s)" min="1"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + </repeat> + <param argument="reference" type="text" value="" optional="false" label="Reference variable, the deviation is calculated from" help=""/> + <param argument="freq" type="text" value="" optional="false" label="Frequency, that split the data in chunks" help=""/> + <param argument="thresh" type="float" value="" optional="false" label="Maximum deviation from reference" help=""/> + <param argument="metric" type="text" value="" optional="false" label="Distance function" help="This value is interpreted as the mutual distance of the two input arrays.
Defaults to the `averaged manhattan metric` (see Notes).function ([numpy.ndarray | pandas.core.series.Series, numpy.ndarray | pandas.core.series.Series], &lt;class 'numpy.ndarray'&gt;) (default: &lt;function cityblock at 0x7fb9fd156480&gt;)"/> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + <when value="flagRegimeAnomaly"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="cluster_field" type="text" value="" optional="false" label="Column in data, holding the cluster labels for the samples in field" help="(has to be indexed equal to field)"/> + <param argument="spread" type="float" value="" optional="false" label="A threshold denoting the value level, up to which clusters are agglomerated" help=""/> + <param argument="method" type="select" value="single" optional="false" label="The linkage method for hierarchical (agglomerative) clustering of the variables" help=""> + <option value="single">single</option> + <option value="complete">complete</option> + <option value="average">average</option> + <option value="weighted">weighted</option> + <option value="centroid">centroid</option> + <option value="median">median</option> + <option value="ward">ward</option> + </param> + <param argument="metric" type="text" value="" optional="false" label="A metric function for calculating the dissimilarity between 2 regimes" help="Defaults to the difference in mean.function ([numpy.ndarray | pandas.core.series.Series, numpy.ndarray | pandas.core.series.Series], &lt;class 'float'&gt;) (default: &lt;function DriftMixin.&lt;lambda&gt; at 0x7fb9fd16ef20&gt;)"/> + <param argument="frac" type="float" value="0.5" optional="false" label="Has to be in [0,1]" help="the &quot;normal&quot; group has to comprise to be the normal group actually."/> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark
observations" help=""/> + </when> + </conditional> + </when> + <when value="flagtools"> + <conditional name="method_cond" label="Method"> + <param name="method_select" type="select" label="Method"> + <option value="andGroup">andGroup: Flag all values, if all of the given ``field`` values are already flagged</option> + <option value="clearFlags">clearFlags: Set whole column to UNFLAGGED</option> + <option value="flagDummy">flagDummy: Function does nothing but returning data and flags</option> + <option value="flagManual">flagManual: Flag data by given, "manually generated" data</option> + <option value="flagUnflagged">flagUnflagged: Function sets a flag at all unflagged positions</option> + <option value="forceFlags">forceFlags: Set whole column to a flag value</option> + <option value="orGroup">orGroup: Flag all values, if at least one of the given ``field`` values is already flagged</option> + <option value="propagateFlags">propagateFlags: Flag values before or after flags set by the last test</option> + <option value="transferFlags">transferFlags: Transfer Flags of one variable to another</option> + </param> + <when value="andGroup"> + <repeat name="field_repeat" title="field(s)" min="1"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + </repeat> + <param argument="target" type="text" optional="true" label="Variable name to which the results are written" help=""/> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + <when value="clearFlags"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + </when> + <when value="flagDummy"> + <param argument="field" type="text" value="" optional="false" label="field" help=""/> + </when> + <when value="flagManual"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param 
argument="mdata" type="text" value="" optional="false" label="The Data determining, which intervals are to be flagged, or a string, denoting under which field the data is" help="accessible."/> + <param argument="method" type="select" value="left-open" optional="false" label="Defines how mdata is projected on data" help="index. * 'plain': mdata must have the same length as data and is projected one-to-one on data. * 'ontime': works only with indexed mdata. mdata entries are matched with data entries that have the same index. * 'right-open': mdata defines intervals, values are to be projected on. The intervals are defined, (1) Either, by any two consecutive timestamps t_1 and t_2 where t_1 is valued with mflag, or by a series, (2) Or, a Series, where the index contains in the t1 timestamps and the values the respective t2 stamps. The value at t_1 gets projected onto all data timestamps t with t_1 &lt;= t &lt; t_2. * 'left-open': like 'right-open', but the projected interval now covers all t with t_1 &lt; t &lt;= t_2. * 'closed': like 'right-open', but the projected interval now covers all t with t_1 &lt;= t &lt;= t_2."> + <option value="left-open">left-open</option> + <option value="right-open">right-open</option> + <option value="closed">closed</option> + <option value="plain">plain</option> + <option value="ontime">ontime</option> + </param> + <param argument="mformat" type="select" value="start-end" optional="false" label="* &quot;start-end&quot;: mdata is a Series, where every entry indicates an interval to-flag" help="bound, the value defines the right bound. * &quot;mflag&quot;: mdata is an array like, with entries containing 'mflag', where flags shall be set.
See documentation for examples."> + <option value="start-end">start-end</option> + <option value="mflag">mflag</option> + </param> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + <when value="flagUnflagged"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + <when value="forceFlags"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + <when value="orGroup"> + <repeat name="field_repeat" title="field(s)" min="1"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + </repeat> + <param argument="target" type="text" optional="true" label="Variable name to which the results are written" help=""/> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + <when value="propagateFlags"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <conditional name="window_cond"> + <param argument="window_select" type="select" label="window input mode"> + <option value="number">number</option> + <option value="timedelta">timedelta</option> + </param> + <when value="number"> + <param argument="window" type="integer" value="" optional="false" label="Size of the repetition window" help="Number of values"/> + </when> + <when value="timedelta"> + <param argument="window" type="text" value="" optional="false" label="Size of the repetition window" help="Temporal extensions (offset string)"/> + 
</when> + </conditional> + <param argument="method" type="select" value="ffill" optional="false" label="Direction of repetition" help="repeat, with "bfill" the previous values."> + <option value="ffill">ffill</option> + <option value="bfill">bfill</option> + </param> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark observations" help=""/> + <param argument="dfilter" type="float" value="-inf" optional="false" label="Defines which observations will be masked based on the already existing flags" help=""/> + </when> + <when value="transferFlags"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="target" type="text" value="" optional="false" label="Variable name to which the results are written" help=""/> + </when> + </conditional> + </when> + <when value="generic"> + <conditional name="method_cond" label="Method"> + <param name="method_select" type="select" label="Method"> + <option value="flagGeneric">flagGeneric: Flag data based on a given function</option> + <option value="processGeneric">processGeneric: Generate/process data with user defined functions</option> + </param> + <when value="flagGeneric"> + <repeat name="field_repeat" title="field(s)" min="1"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + </repeat> + <repeat name="target_repeat" title="target(s)" min="1"> + <param argument="target" type="text" optional="true" label="Variable name to which the results are written" help=""/> + </repeat> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + <when value="processGeneric"> + <repeat name="field_repeat" title="field(s)" min="1"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + </repeat> + <repeat
name="target_repeat" title="target(s)" min="1"> + <param argument="target" type="text" optional="true" label="Variable name to which the results are written" help=""/> + </repeat> + <param argument="dfilter" type="float" value="-inf" optional="false" label="Defines which observations will be masked based on the already existing flags" help=""/> + </when> + </conditional> + </when> + <when value="interpolation"> + <conditional name="method_cond" label="Method"> + <param name="method_select" type="select" label="Method"> + <option value="align">align: Convert time series to specified frequency. Values affected by frequency</option> + <option value="interpolate">interpolate: Fill NaN and flagged values using an interpolation method</option> + <option value="interpolateByRolling">interpolateByRolling: Interpolates nan-values in the data by assigning them the aggregation result of the window surrounding them</option> + <option value="interpolateIndex">interpolateIndex: Function to interpolate the data at regular (equidistant) timestamps (or Grid points)</option> + <option value="interpolateInvalid">interpolateInvalid: deprecated:: 2.4.0</option> + </param> + <when value="align"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="freq" type="text" value="" optional="false" label="Target frequency" help=""/> + <param argument="method" type="select" value="time" optional="false" label="Interpolation technique to use" help="* ``'nshift'``: shift grid points to the nearest time stamp in the range = +/- 0.5 * ``freq`` * ``'bshift'``: shift grid points to the first succeeding time stamp (if any) * ``'fshift'``: shift grid points to the last preceding time stamp (if any) * ``'linear'``: Ignore the index and treat the values as equally spaced. * ``'time'``, ``'index'``, 'values': Use the actual numerical values of the index. * ``'pad'``: Fill in NaNs using existing values.
* ``'nearest'``, ``'zero'``, ``'slinear'``, ``'quadratic'``, ``'cubic'``, ``'spline'``, ``'barycentric'``, ``'polynomial'``: Passed to ``scipy.interpolate.interp1d``. These methods use the numerical values of the index. Both ``'polynomial'`` and ``'spline'`` require that you also specify an ``order``, e.g. ``qc.interpolate(method='polynomial', order=5)``. * ``'krogh'``, ``'spline'``, ``'pchip'``, ``'akima'``, ``'cubicspline'``: Wrappers around the SciPy interpolation methods of similar names. * ``'from_derivatives'``: Refers to ``scipy.interpolate.BPoly.from_derivatives``"> + <option value="linear">linear</option> + <option value="time">time</option> + <option value="nearest">nearest</option> + <option value="zero">zero</option> + <option value="slinear">slinear</option> + <option value="quadratic">quadratic</option> + <option value="cubic">cubic</option> + <option value="spline">spline</option> + <option value="barycentric">barycentric</option> + <option value="polynomial">polynomial</option> + <option value="krogh">krogh</option> + <option value="piecewise_polynomial">piecewise_polynomial</option> + <option value="pchip">pchip</option> + <option value="akima">akima</option> + </param> + <param argument="order" type="integer" value="2" optional="false" label="Order of the interpolation method, ignored if not supported by the chosen ``method``" help=""/> + <param argument="extrapolate" type="select" optional="true" label="Use parameter to perform extrapolation instead of interpolation onto the trailing and/or leading chunks of" help="NaN values in data series. 
* ``None`` (default) - perform interpolation * ``'forward'``/``'backward'`` - perform forward/backward extrapolation * ``'both'`` - perform forward and backward extrapolation"> + <option value="forward">forward</option> + <option value="backward">backward</option> + <option value="both">both</option> + </param> + <param argument="overwrite" type="boolean" label="If set to True, existing flags will be cleared" help="" checked="false" truevalue="" falsevalue=""/> + </when> + <when value="interpolate"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="method" type="select" value="time" optional="false" label="Interpolation technique to use" help="* ‘linear’: Ignore the index and treat the values as equally spaced. * ‘time’: Works on daily and higher resolution data to interpolate given length of interval. * ‘index’, ‘values’: Use the actual numerical values of the index. * ‘pad’: Fill in NaNs using existing values. * ‘nearest’, ‘zero’, ‘slinear’, ‘quadratic’, ‘cubic’, ‘spline’, ‘barycentric’, ‘polynomial’: Passed to scipy.interpolate.interp1d. These methods use the numerical values of the index. Both ‘polynomial’ and ‘spline’ require that you also specify an order (int), e.g. ``qc.interpolate(method='polynomial', order=5)``. * ‘krogh’, ‘spline’, ‘pchip’, ‘akima’, ‘cubicspline’: Wrappers around the SciPy interpolation methods of similar names. 
* ‘from_derivatives’: Refers to scipy.interpolate.BPoly.from_derivatives"> + <option value="linear">linear</option> + <option value="time">time</option> + <option value="nearest">nearest</option> + <option value="zero">zero</option> + <option value="slinear">slinear</option> + <option value="quadratic">quadratic</option> + <option value="cubic">cubic</option> + <option value="spline">spline</option> + <option value="barycentric">barycentric</option> + <option value="polynomial">polynomial</option> + <option value="krogh">krogh</option> + <option value="piecewise_polynomial">piecewise_polynomial</option> + <option value="pchip">pchip</option> + <option value="akima">akima</option> + </param> + <param argument="order" type="integer" value="2" optional="false" label="Order of the interpolation method, ignored if not supported by the chosen ``method``" help=""/> + <conditional name="limit_cond"> + <param argument="limit_select" type="select" label="limit input mode"> + <option value="none">None</option> + <option value="number">number</option> + <option value="timedelta">timedelta</option> + </param> + <when value="number"> + <param argument="limit" type="integer" optional="true" label="Maximum number of missing values to interpolate" help="Number of values"/> + </when> + <when value="timedelta"> + <param argument="limit" type="text" optional="true" label="Maximum number of missing values to interpolate" help="Temporal extensions (offset string)"/> + </when> + <when value="none"> + <param name="limit" type="hidden" value="__none__" label=""/> + </when> + </conditional> + <param argument="extrapolate" type="select" optional="true" label="Use parameter to perform extrapolation instead of interpolation onto the trailing and/or leading chunks of" help="NaN values in data series. 
* 'None' (default) - perform interpolation * 'forward'/'backward' - perform forward/backward extrapolation * 'both' - perform forward and backward extrapolation"> + <option value="forward">forward</option> + <option value="backward">backward</option> + <option value="both">both</option> + </param> + <param argument="flag" type="float" value="-inf" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + <when value="interpolateByRolling"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <conditional name="window_cond"> + <param argument="window_select" type="select" label="window input mode"> + <option value="number">number</option> + <option value="timedelta">timedelta</option> + </param> + <when value="number"> + <param argument="window" type="integer" value="" optional="false" label="The size of the window, the aggregation is computed from" help="Number of values"/> + </when> + <when value="timedelta"> + <param argument="window" type="text" value="" optional="false" label="The size of the window, the aggregation is computed from" help="Temporal extensions (offset string)"/> + </when> + </conditional> + <param argument="func" type="text" value="" optional="false" label="The function used for aggregation" help="function ([<class 'pandas.core.series.Series'>], <class 'float'>) (default: <function median at 0x7fba139720c0>)"/> + <param argument="center" type="boolean" label="Center the window around the value" help="" checked="true" truevalue="" falsevalue=""/> + <param argument="min_periods" type="integer" value="0" optional="false" label="Minimum number of valid (not np" help="computed."/> + <param argument="flag" type="float" value="-inf" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + <when value="interpolateIndex"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" 
help=""/> + <param argument="freq" type="text" value="" optional="false" label="An Offset String, interpreted as the frequency of" help="the grid you want to interpolate your data to."/> + <param argument="method" type="select" value="" optional="false" label="The interpolation method you want to apply" help=""> + <option value="linear">linear</option> + <option value="time">time</option> + <option value="nearest">nearest</option> + <option value="zero">zero</option> + <option value="slinear">slinear</option> + <option value="quadratic">quadratic</option> + <option value="cubic">cubic</option> + <option value="spline">spline</option> + <option value="barycentric">barycentric</option> + <option value="polynomial">polynomial</option> + <option value="krogh">krogh</option> + <option value="piecewise_polynomial">piecewise_polynomial</option> + <option value="pchip">pchip</option> + <option value="akima">akima</option> + </param> + <param argument="order" type="integer" value="2" optional="false" label="If your selected interpolation method can be performed at different 'orders' - here you pass the desired" help="order."/> + <param argument="limit" type="integer" value="2" optional="true" label="Upper limit of missing index values (with respect to ``freq``) to fill" help="as the number of consecutive missing values (integer) or temporal extension of the gaps to be filled (Offset String). 
If ``None`` is passed, no limit is set."/> + <param argument="extrapolate" type="select" optional="true" label="extrapolate" help=""> + <option value="forward">forward</option> + <option value="backward">backward</option> + <option value="both">both</option> + </param> + </when> + <when value="interpolateInvalid"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="method" type="select" value="" optional="false" label="method" help=""> + <option value="linear">linear</option> + <option value="time">time</option> + <option value="nearest">nearest</option> + <option value="zero">zero</option> + <option value="slinear">slinear</option> + <option value="quadratic">quadratic</option> + <option value="cubic">cubic</option> + <option value="spline">spline</option> + <option value="barycentric">barycentric</option> + <option value="polynomial">polynomial</option> + <option value="krogh">krogh</option> + <option value="piecewise_polynomial">piecewise_polynomial</option> + <option value="pchip">pchip</option> + <option value="akima">akima</option> + </param> + <param argument="order" type="integer" value="2" optional="false" label="order" help=""/> + <param argument="limit" type="integer" optional="true" label="limit" help=""/> + <param argument="extrapolate" type="select" optional="true" label="extrapolate" help=""> + <option value="forward">forward</option> + <option value="backward">backward</option> + <option value="both">both</option> + </param> + <param argument="flag" type="float" value="-inf" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + </conditional> + </when> + <when value="noise"> + <conditional name="method_cond" label="Method"> + <param name="method_select" type="select" label="Method"> + <option value="flagByStatLowPass">flagByStatLowPass: Flag data chunks of length ``window``, if:</option> + </param> + <when value="flagByStatLowPass"> + 
<param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="func" type="text" value="" optional="false" label="Aggregation function applied on every chunk" help="function ([<class 'numpy.ndarray'>, <class 'pandas.core.series.Series'>], <class 'float'>) (default: )"/> + <param argument="window" type="text" value="" optional="false" label="Window (i" help=" see: https://pandas.pydata.org/docs/user_guide/timedeltas.html#parsing"/> + <param argument="thresh" type="float" value="" optional="false" label="Threshold" help=""/> + <param argument="sub_window" type="text" optional="true" label="Window size of sub chunks, that are additionally tested for exceeding ``sub_thresh``" help="with respect to ``func``. see: https://pandas.pydata.org/docs/user_guide/timedeltas.html#parsing"/> + <param argument="sub_thresh" type="float" optional="true" label="Threshold" help=""/> + <param argument="min_periods" type="integer" optional="true" label="Minimum number of values needed in a chunk to perform the test" help="Ignored if ``window`` is an integer."/> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + </conditional> + </when> + <when value="outliers"> + <conditional name="method_cond" label="Method"> + <param name="method_select" type="select" label="Method"> + <option value="flagByGrubbs">flagByGrubbs: Flag outliers using the Grubbs algorithm</option> + <option value="flagByStray">flagByStray: Flag outliers in 1-dimensional (score) data using the STRAY Algorithm</option> + <option value="flagCrossStatistics">flagCrossStatistics: Function checks for outliers relatively to the "horizontal" input data axis</option> + <option value="flagLOF">flagLOF: Flag values where the Local Outlier Factor (LOF) exceeds cutoff</option> + <option value="flagMAD">flagMAD: Flag outliers using the modified Z-score outlier detection
method</option> + <option value="flagMVScores">flagMVScores: The algorithm implements a 3-step outlier detection procedure for simultaneously</option> + <option value="flagOffset">flagOffset: A basic outlier test that works on regularly and irregularly sampled data</option> + <option value="flagRaise">flagRaise: The function flags raises and drops in value courses, that exceed a certain threshold</option> + <option value="flagRange">flagRange: Function flags values exceeding the closed interval [:py:attr:`min`, :py:attr:`max`]</option> + <option value="flagUniLOF">flagUniLOF: Flag "univariate" Local Outlier Factor (LOF) exceeding cutoff</option> + <option value="flagZScore">flagZScore: Flag data where its (rolling) Zscore exceeds a threshold</option> + </param> + <when value="flagByGrubbs"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <conditional name="window_cond"> + <param argument="window_select" type="select" label="window input mode"> + <option value="number">number</option> + <option value="timedelta">timedelta</option> + </param> + <when value="number"> + <param argument="window" type="integer" value="" optional="false" label="Size of the testing window" help="Number of values"/> + </when> + <when value="timedelta"> + <param argument="window" type="text" value="" optional="false" label="Size of the testing window" help="Temporal extensions (offset string)"/> + </when> + </conditional> + <param argument="alpha" type="float" value="0.05" optional="false" label="Level of significance, the Grubbs test is to be performed at" help=""/> + <param argument="min_periods" type="integer" value="8" optional="false" label="Minimum number of values needed in a :py:attr:`window` in order to perform the Grubbs test" help="Ignored if :py:attr:`window` is an integer."/> + <param argument="pedantic" type="boolean" label="If ``True``, every value gets checked twice" help="and second in a rolling window that is lagging by
:py:attr:`window` / 2. Recommended to avoid false positives at the window edges. Ignored if :py:attr:`window` is an offset string." checked="false" truevalue="" falsevalue=""/> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + <when value="flagByStray"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <conditional name="window_cond"> + <param argument="window_select" type="select" label="window input mode"> + <option value="none">None</option> + <option value="number">number</option> + <option value="timedelta">timedelta</option> + </param> + <when value="number"> + <param argument="window" type="integer" optional="true" label="Determines the segmentation of the data into partitions, the kNN algorithm is" help="Number of values"/> + </when> + <when value="timedelta"> + <param argument="window" type="text" optional="true" label="Determines the segmentation of the data into partitions, the kNN algorithm is" help="Temporal extensions (offset string)"/> + </when> + <when value="none"> + <param name="window" type="hidden" value="__none__" label=""/> + </when> + </conditional> + <param argument="min_periods" type="integer" value="11" optional="false" label="Minimum number of periods per partition that have to be present for a valid" help="outlier detection to be made in this partition (only of effect, if :py:attr:`freq` is an integer)."/> + <param argument="iter_start" type="float" value="0.5" optional="false" label="Float in ``[0, 1]`` that determines which percentage of data is considered" help=""normal". ``0.5`` results in the stray algorithm to search only the upper 50% of the scores for the cut off point. 
(See reference section for more information)"/> + <param argument="alpha" type="float" value="0.05" optional="false" label="Level of significance by which it is tested, if a score might be drawn from" help="another distribution than the majority of the data."/> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + <when value="flagCrossStatistics"> + <repeat name="field_repeat" title="field(s)" min="1"> + <param argument="field" type="text" value="" optional="false" label="List of variables names to process" help=""/> + </repeat> + <param argument="thresh" type="float" value="" optional="false" label="Threshold which the outlier score of an value must exceed, for being flagged an outlier" help=""/> + <param argument="method" type="select" value="modZscore" optional="false" label="Method used for calculating the outlier scores" help="* ``'modZscore'``: Median based "sigma"-ish approach. See References [1]. * ``'Zscore'``: Score values by how many times the standard deviation they differ from the median. See References [1]."> + <option value="modZscore">modZscore</option> + <option value="Zscore">Zscore</option> + </param> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + <when value="flagLOF"> + <repeat name="field_repeat" title="field(s)" min="1"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + </repeat> + <param argument="n" type="integer" value="20" optional="false" label="Number of neighbors to be included into the LOF calculation" help="value found to be suitable in the literature. * :py:attr:`n` determines the "locality" of an observation (its :py:attr:`n` nearest neighbors) and sets the upper limit to the number of values in outlier clusters (i.e. consecutive outliers). 
Outlier clusters of size greater than :py:attr:`n`/2 may not be detected reliably. * The larger :py:attr:`n`, the lesser the algorithm's sensitivity to local outliers and small or singleton outlier points. Higher values greatly increase numerical costs."/> + <conditional name="thresh_cond"> + <param name="thresh_select" type="select" label="thresh mode"> + <option value="auto">automatic</option> + <option value="linear">linear</option> + </param> + <when value="auto"> + <param name="thresh" type="hidden" value="auto" label=""/> + </when> + <when value="linear"> + <param argument="thresh" type="float" value="1.5" optional="false" label="The threshold for flagging the calculated LOF" help="most likely corresponds to inlier points. * The "automatic" threshing introduced with the publication of the algorithm defaults to ``1.5``. * In this implementation, :py:attr:`thresh` defaults (``'auto'``) to flagging the scores with a modified 3-sigma rule, resulting in a :py:attr:`thresh` `` > 1.5`` which usually mitigates overflagging compared to the literature recommendation."/> + </when> + </conditional> + <param argument="algorithm" type="select" value="ball_tree" optional="false" label="Algorithm used for calculating the :py:attr:`n`-nearest neighbors" help=""> + <option value="ball_tree">ball_tree</option> + <option value="kd_tree">kd_tree</option> + <option value="brute">brute</option> + <option value="auto">auto</option> + </param> + <param argument="p" type="integer" value="1" optional="false" label="Degree of the metric ("Minkowski"), according to which the distance to neighbors is determined" help="Most important values are: * ``1`` - Manhattan Metric * ``2`` - Euclidean Metric"/> + <conditional name="density_cond"> + <param name="density_select" type="select" label="density mode"> + <option value="auto">automatic</option> + <option value="linear">linear</option> + <option value="custom">custom</option> + </param> + <when value="auto"> + <param name="density"
type="hidden" value="auto" label=""/> + </when> + <when value="linear"> + <param argument="density" type="float" value="" optional="false" label="density" help=""/> + </when> + <when value="custom"> + <param argument="density" type="text" value="auto" optional="false" label="density" help=""/> + </when> + </conditional> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + <when value="flagMAD"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <conditional name="window_cond"> + <param argument="window_select" type="select" label="window input mode"> + <option value="none">None</option> + <option value="number">number</option> + <option value="timedelta">timedelta</option> + </param> + <when value="number"> + <param argument="window" type="integer" optional="true" label="Size of the window" help="Number of values"/> + </when> + <when value="timedelta"> + <param argument="window" type="text" optional="true" label="Size of the window" help="Temporal extensions (offset string)"/> + </when> + <when value="none"> + <param name="window" type="hidden" value="__none__" label=""/> + </when> + </conditional> + <param argument="z" type="float" value="3.5" optional="false" label="The value the Z-score is tested against" help=""/> + <param argument="min_residuals" type="integer" optional="true" label="min_residuals" help=""/> + <param argument="min_periods" type="integer" optional="true" label="Minimum number of valid measurements in a scoring window, to consider the resulting score valid" help=""/> + <param argument="center" type="boolean" label="Whether or not to center the target value in the scoring window" help="target value is the last value in the window."
checked="false" truevalue="" falsevalue=""/> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + <when value="flagMVScores"> + <repeat name="field_repeat" title="field(s)" min="1"> + <param argument="field" type="text" value="" optional="false" label="List of variables names to process" help=""/> + </repeat> + <param argument="trafo" type="text" value="" optional="false" label="Transformation to be applied onto every column before scoring" help="control, the data could also be transformed before :py:meth:`~saqc.SaQC.flagMVScores` is called.function ([<class 'pandas.core.series.Series'>], <class 'pandas.core.series.Series'>) (default: <function OutliersMixin.<lambda> at 0x7fb9fce605e0>)"/> + <param argument="alpha" type="float" value="0.05" optional="false" label="Level of significance by which it is tested, if an observations score might" help="be drawn from another distribution than the majority of the data."/> + <param argument="n" type="integer" value="10" optional="false" label="Number of neighbors included in the scoring process for every datapoint" help=""/> + <param argument="func" type="text" value="" optional="false" label="Function that aggregates a value's k-smallest distances, returning a scalar score" help="function ([<class 'pandas.core.series.Series'>], <class 'float'>) (default: <function sum at 0x7fba13a1da80>)"/> + <param argument="iter_start" type="float" value="0.5" optional="false" label="Value in ``[0,1]`` that determines which percentage of data is considered" help=""normal". 0.5 results in the threshing algorithm to search only the upper 50% of the scores for the cut off point. 
(See reference section for more information)"/> + <conditional name="window_cond"> + <param argument="window_select" type="select" label="window input mode"> + <option value="none">None</option> + <option value="number">number</option> + <option value="timedelta">timedelta</option> + </param> + <when value="number"> + <param argument="window" type="integer" optional="true" label="Only effective if :py:attr:`threshing` is set to ``'stray'``" help="Number of values"/> + </when> + <when value="timedelta"> + <param argument="window" type="text" optional="true" label="Only effective if :py:attr:`threshing` is set to ``'stray'``" help="Temporal extensions (offset string)"/> + </when> + <when value="none"> + <param name="window" type="hidden" value="__none__" label=""/> + </when> + </conditional> + <param argument="min_periods" type="integer" value="11" optional="false" label="Only effective if :py:attr:`threshing` is set to ``'stray'`` and :py:attr:`partition` is an integer" help="Minimum number of periods per :py:attr:`partition` that have to be present for a valid outlier detection to be made in this partition."/> + <param argument="stray_range" type="text" optional="true" label="If not ``None``, it is tried to reduce the stray result onto single outlier components" help="of the input :py:attr:`field`. The offset string denotes the range of the temporal surrounding to include into the MAD testing while trying to reduce flags."/> + <param argument="drop_flagged" type="boolean" label="Only effective when :py:attr:`stray_range` is not ``None``" help="values from the temporal surroundings." checked="false" truevalue="" falsevalue=""/> + <param argument="thresh" type="float" value="3.5" optional="false" label="Only effective when :py:attr:`stray_range` is not ``None``" help="controlling whether the MAD score is considered referring to an outlier or not. Higher values result in less rigid flagging.
The default value is widely considered appropriate in the literature."/> + <param argument="min_periods_r" type="integer" value="1" optional="false" label="Only effective when :py:attr:`stray_range` is not ``None``" help="necessary in an interval to actually perform the reduction step."/> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + <when value="flagOffset"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="tolerance" type="float" value="" optional="false" label="Maximum difference allowed between the value, directly preceding and the value directly" help="succeeding an offset to trigger flagging of the offsetting values. See condition (4)."/> + <conditional name="window_cond"> + <param argument="window_select" type="select" label="window input mode"> + <option value="number">number</option> + <option value="timedelta">timedelta</option> + </param> + <when value="number"> + <param argument="window" type="integer" value="" optional="false" label="Maximum length allowed for offset value courses, to trigger flagging of the offsetting values" help="Number of values"/> + </when> + <when value="timedelta"> + <param argument="window" type="text" value="" optional="false" label="Maximum length allowed for offset value courses, to trigger flagging of the offsetting values" help="Temporal extensions (offset string)"/> + </when> + </conditional> + <param argument="thresh" type="float" optional="true" label="Minimum difference between a value and its successors, to consider the successors an anomalous" help="offset group. See condition (1). If ``None``, condition (1) is not tested."/> + <param argument="thresh_relative" type="float" optional="true" label="Minimum relative change between a value and its successors, to consider the successors an anomalous" help="offset group. See condition (2).
If ``None``, condition (2) is not tested."/> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + <when value="flagRaise"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="thresh" type="float" value="" optional="false" label="The threshold, for the total rise (:py:attr:`thresh` ``> 0``), or total drop" help="(:py:attr:`thresh` ``< 0``), value courses must not exceed within a timespan of length :py:attr:`raise_window`."/> + <param argument="raise_window" type="text" value="" optional="false" label="An offset string, determining the timespan, the rise/drop thresholding refers" help="to. Window is inclusively defined."/> + <param argument="freq" type="text" value="" optional="false" label="An offset string, determining the frequency, the timeseries to flag is supposed" help="to be sampled at. The window is inclusively defined."/> + <param argument="average_window" type="text" optional="true" label="See condition (2) of the description given in the Notes" help="inclusively defined, defaults to 1.5 times the size of :py:attr:`raise_window`."/> + <param argument="raise_factor" type="float" value="2.0" optional="false" label="See condition (2)" help=""/> + <param argument="slope" type="float" optional="true" label="See condition (3)" help=""/> + <param argument="weight" type="float" value="0.8" optional="false" label="See condition (3)" help=""/> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + <when value="flagRange"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="min" type="float" value="-inf" optional="false" label="Lower bound for valid data" help=""/> + <param argument="max" type="float" value="inf" optional="false" 
label="Upper bound for valid data" help=""/> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + <when value="flagUniLOF"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="n" type="integer" value="20" optional="false" label="Number of periods to be included into the LOF calculation" help="value found to be suitable in the literature. * :py:attr:`n` determines the "locality" of an observation (its :py:attr:`n` nearest neighbors) and sets the upper limit to the number of values in an outlier clusters (i.e. consecutive outliers). Outlier clusters of size greater than :py:attr:`n`/2 may not be detected reliably. * The larger :py:attr:`n`, the lesser the algorithm's sensitivity to local outliers and small or singleton outlier points. Higher values greatly increase numerical costs."/> + <conditional name="thresh_cond"> + <param name="thresh_select" type="select" label="thresh mode"> + <option value="auto">automatic</option> + <option value="linear">linear</option> + </param> + <when value="auto"> + <param name="thresh" type="hidden" value="auto" label=""/> + </when> + <when value="linear"> + <param argument="thresh" type="float" value="1.5" optional="false" label="The threshold for flagging the calculated LOF" help="most likely corresponds to inlier points. This parameter is considered the main calibration parameter of the algorithm. * The threshing defaults to ``1.5``, wich is the default value found to be suitable in the literature. * ``'auto'`` enables flagging the scores with a modified 3-sigma rule, resulting in a thresh around ``4``, which usually greatly mitigates overflagging compared to the literature recommendation, but often is too high. 
* sensitive range for the parameter may be ``[1,15]``, assuming default settings for the other parameters."/> + </when> + </conditional> + <param argument="algorithm" type="select" value="ball_tree" optional="false" label="Algorithm used for calculating the :py:attr:`n`-nearest neighbors needed for LOF calculation" help=""> + <option value="ball_tree">ball_tree</option> + <option value="kd_tree">kd_tree</option> + <option value="brute">brute</option> + <option value="auto">auto</option> + </param> + <param argument="p" type="integer" value="1" optional="false" label="Degree of the metric ('Minkowski'), according to which distance to neighbors is determined" help="Most important values are: * ``1`` - Manhattan Metric * ``2`` - Euclidean Metric"/> + <conditional name="density_cond"> + <param name="density_select" type="select" label="density mode"> + <option value="auto">automatic</option> + <option value="linear">linear</option> + <option value="custom">custom</option> + </param> + <when value="auto"> + <param name="density" type="hidden" value="auto" label=""/> + </when> + <when value="linear"> + <param argument="density" type="float" value="" optional="false" label="How to calculate the temporal distance/density for the variable to flag" help="* ``'auto'`` - introduces linear density with an increment equal to the median of the absolute diff of the variable to flag. * ``float`` - introduces linear density with an increment equal to :py:attr:`density` * Callable - calculates the density by applying the function passed onto the variable to flag (passed as Series)."/> + </when> + <when value="custom"> + <param argument="density" type="text" value="auto" optional="false" label="How to calculate the temporal distance/density for the variable to flag" help="* ``'auto'`` - introduces linear density with an increment equal to the median of the absolute diff of the variable to flag.
* ``float`` - introduces linear density with an increment equal to :py:attr:`density` * Callable - calculates the density by applying the function passed onto the variable to flag (passed as Series)."/> + </when> + </conditional> + <param argument="fill_na" type="text" value="linear" optional="false" label="Whether or not to fill NaN values in the data with a linear interpolation" help=""/> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + <when value="flagZScore"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <conditional name="window_cond"> + <param argument="window_select" type="select" label="window input mode"> + <option value="none">None</option> + <option value="number">number</option> + <option value="timedelta">timedelta</option> + </param> + <when value="number"> + <param argument="window" type="integer" optional="true" label="Size of the window" help="Number of values"/> + </when> + <when value="timedelta"> + <param argument="window" type="text" optional="true" label="Size of the window" help="Temporal extensions (offset string)"/> + </when> + <when value="none"> + <param name="window" type="hidden" value="__none__" label=""/> + </when> + </conditional> + <param argument="thresh" type="float" value="3" optional="false" label="Cutoff level for the Zscores, above which associated points are marked as outliers" help=""/> + <param argument="min_residuals" type="integer" optional="true" label="Minimum residual value points must have to be considered outliers" help=""/> + <param argument="min_periods" type="integer" optional="true" label="Minimum number of valid measurements in a scoring window, to consider the resulting score valid" help=""/> + <param argument="model_func" type="text" value="" optional="false" label="Function to calculate the center moment in every window" help="function
([numpy.ndarray | pandas.core.series.Series], float) (default: nanmean)"/> + <param argument="norm_func" type="text" value="" optional="false" label="Function to calculate the scaling for every window" help="function ([numpy.ndarray | pandas.core.series.Series], float) (default: nanstd)"/> + <param argument="center" type="boolean" label="Whether or not to center the target value in the scoring window" help="target value is the last value in the window." checked="true" truevalue="" falsevalue=""/> + <param argument="flag" type="float" value="255.0" optional="false" label="The flag value the function uses to mark observations" help=""/> + </when> + </conditional> + </when> + <when value="pattern"> + <conditional name="method_cond" label="Method"> + <param name="method_select" type="select" label="Method"> + <option value="flagPatternByDTW">flagPatternByDTW: Pattern Recognition via Dynamic Time Warping</option> + </param> + <when value="flagPatternByDTW"/> + </conditional> + </when> + <when value="resampling"> + <conditional name="method_cond" label="Method"> + <param name="method_select" type="select" label="Method"> + <option value="concatFlags">concatFlags: Project flags/history of :py:attr:`field` to :py:attr:`target` and adjust to the frequency grid</option> + <option value="linear">linear: A method to "regularize" data by linearly interpolating the data at regular timestamps</option> + <option value="resample">resample: Resample data points and flags to a regular frequency</option> + <option value="shift">shift: Shift data points and flags to a regular frequency grid</option> + </param> + <when value="concatFlags"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="target" type="text" optional="true" label="Variable name to which the results are written" help=""/> + <param argument="method" type="select" value="match"
optional="false" label="Method to project the flags of :py:attr:`field` to :py:attr:`target`:" help="* ``'auto'``: invert the last alignment/resampling operations * ``'inverse_nagg'``: project a flag of :py:attr:`field` to all timestamps of :py:attr:`target` within the range +/- :py:attr:`freq`/2. * ``'inverse_bagg'``: project a flag of :py:attr:`field` to all preceding timestamps of :py:attr:`target` within the range :py:attr:`freq` * ``'inverse_fagg'``: project a flag of :py:attr:`field` to all succeeding timestamps of :py:attr:`target` within the range :py:attr:`freq` * ``'inverse_interpolation'`` - project a flag of :py:attr:`field` to all timestamps of :py:attr:`target` within the range +/- :py:attr:`freq` * ``'inverse_nshift'`` - project a flag of :py:attr:`field` to the nearest timestamps in :py:attr:`target` within the range +/- :py:attr:`freq`/2 * ``'inverse_bshift'`` - project a flag of :py:attr:`field` to nearest preceding timestamps in :py:attr:`target` * ``'inverse_fshift'`` - project a flag of :py:attr:`field` to nearest succeeding timestamps in :py:attr:`target` * ``'match'`` - project a flag of :py:attr:`field` to all identical timestamps of :py:attr:`target`"> + <option value="inverse_fagg">inverse_fagg</option> + <option value="inverse_bagg">inverse_bagg</option> + <option value="inverse_nagg">inverse_nagg</option> + <option value="inverse_fshift">inverse_fshift</option> + <option value="inverse_bshift">inverse_bshift</option> + <option value="inverse_nshift">inverse_nshift</option> + <option value="inverse_interpolation">inverse_interpolation</option> + <option value="match">match</option> + <option value="auto">auto</option> + </param> + <param argument="freq" type="text" optional="true" label="Projection range" help=""/> + <param argument="drop" type="boolean" label="Remove :py:attr:`field` if ``True``" help="" checked="false" truevalue="" falsevalue=""/> + <param argument="squeeze" type="boolean" label="Squeeze the history into a
single column if ``True``" help="" checked="false" truevalue="" falsevalue=""/> + <param argument="overwrite" type="boolean" label="Overwrite existing flags if ``True``" help="" checked="false" truevalue="" falsevalue=""/> + </when> + <when value="linear"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="freq" type="text" value="" optional="false" label="An offset string" help=""/> + </when> + <when value="resample"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="freq" type="text" value="" optional="false" label="Offset string" help=""/> + <param argument="func" type="text" value="" optional="false" label="Aggregation function" help="function ([<class 'pandas.core.series.Series'>], <class 'pandas.core.series.Series'>) (default: <function mean at 0x7fba13a1f100>)"/> + <param argument="method" type="select" value="bagg" optional="false" label="Specifies which intervals to be aggregated for a certain timestamp" help="succeeding or "surrounding" interval). 
See description above for more details."> + <option value="fagg">fagg</option> + <option value="bagg">bagg</option> + <option value="nagg">nagg</option> + </param> + <param argument="maxna" type="integer" optional="true" label="Maximum number of allowed ``NaN``s in a resampling interval" help="entire interval is filled with ``NaN``."/> + <param argument="maxna_group" type="integer" optional="true" label="Same as `maxna` but for consecutive NaNs" help=""/> + </when> + <when value="shift"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="freq" type="text" value="" optional="false" label="Offset string" help=""/> + <param argument="method" type="select" value="nshift" optional="false" label="Method to propagate values:" help="* 'nshift' : shift grid points to the nearest time stamp in the range = +/- 0.5 * ``freq`` * 'bshift' : shift grid points to the first succeeding time stamp (if any) * 'fshift' : shift grid points to the last preceeding time stamp (if any)"> + <option value="fshift">fshift</option> + <option value="bshift">bshift</option> + <option value="nshift">nshift</option> + </param> + </when> + </conditional> + </when> + <when value="residuals"> + <conditional name="method_cond" label="Method"> + <param name="method_select" type="select" label="Method"> + <option value="calculatePolynomialResiduals">calculatePolynomialResiduals: Fits a polynomial model to the data and calculate the residuals</option> + <option value="calculateRollingResiduals">calculateRollingResiduals: Calculate the diff of a rolling-window function and the data</option> + </param> + <when value="calculatePolynomialResiduals"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <conditional name="window_cond"> + <param argument="window_select" type="select" label="window input mode"> + <option value="number">number</option> + <option 
value="timedelta">timedelta</option> + </param> + <when value="number"> + <param argument="window" type="integer" value="" optional="false" label="The size of the window you want to use for fitting" help="Number of values"/> + </when> + <when value="timedelta"> + <param argument="window" type="text" value="" optional="false" label="The size of the window you want to use for fitting" help="Temporal extensions (offset string)"/> + </when> + </conditional> + <param argument="order" type="integer" value="" optional="false" label="The degree of the polynomial used for fitting" help=""/> + <param argument="min_periods" type="integer" value="0" optional="false" label="The minimum number of periods, that has to be available in every values" help="fitting surrounding for the polynomial fit to be performed. If there are not enough values, np.nan gets assigned. Default (0) results in fitting regardless of the number of values present (results in overfitting for too sparse intervals). To automatically set the minimum number of periods to the number of values in an offset defined window size, pass np.nan."/> + </when> + <when value="calculateRollingResiduals"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <conditional name="window_cond"> + <param argument="window_select" type="select" label="window input mode"> + <option value="number">number</option> + <option value="timedelta">timedelta</option> + </param> + <when value="number"> + <param argument="window" type="integer" value="" optional="false" label="The size of the window you want to roll with" help="Number of values"/> + </when> + <when value="timedelta"> + <param argument="window" type="text" value="" optional="false" label="The size of the window you want to roll with" help="Temporal extensions (offset string)"/> + </when> + </conditional> + <param argument="func" type="text" value="" optional="false" label="Function to roll with" help="function ([<class 
'pandas.core.series.Series'>], <class 'numpy.ndarray'>) (default: <function mean at 0x7fba13a1f100>)"/> + <param argument="min_periods" type="integer" value="0" optional="false" label="The minimum number of periods to get a valid value" help=""/> + <param argument="center" type="boolean" label="If True, center the rolling window" help="" checked="true" truevalue="" falsevalue=""/> + </when> + </conditional> + </when> + <when value="rolling"> + <conditional name="method_cond" label="Method"> + <param name="method_select" type="select" label="Method"> + <option value="roll">roll: Calculate a rolling-window function on the data</option> + <option value="rolling">rolling: Calculate a rolling-window function on the data</option> + </param> + <when value="roll"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <conditional name="window_cond"> + <param argument="window_select" type="select" label="window input mode"> + <option value="number">number</option> + <option value="timedelta">timedelta</option> + </param> + <when value="number"> + <param argument="window" type="integer" value="" optional="false" label="The size of the window you want to roll with" help="Number of values"/> + </when> + <when value="timedelta"> + <param argument="window" type="text" value="" optional="false" label="The size of the window you want to roll with" help="Temporal extensions (offset string)"/> + </when> + </conditional> + <param argument="func" type="text" value="" optional="false" label="Function to roll with" help="function ([<class 'pandas.core.series.Series'>], <class 'numpy.ndarray'>) (default: <function mean at 0x7fba13a1f100>)"/> + <param argument="min_periods" type="integer" value="0" optional="false" label="The minimum number of periods to get a valid value" help=""/> + <param argument="center" type="boolean" label="If True, center the rolling window" help="" checked="true" truevalue="" falsevalue=""/> + </when> + <when 
value="rolling"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <conditional name="window_cond"> + <param argument="window_select" type="select" label="window input mode"> + <option value="number">number</option> + <option value="timedelta">timedelta</option> + </param> + <when value="number"> + <param argument="window" type="integer" value="" optional="false" label="The size of the window you want to roll with" help="Number of values"/> + </when> + <when value="timedelta"> + <param argument="window" type="text" value="" optional="false" label="The size of the window you want to roll with" help="Temporal extensions (offset string)"/> + </when> + </conditional> + <param argument="func" type="text" value="" optional="false" label="Function to roll with" help="function ([<class 'pandas.core.series.Series'>], <class 'numpy.ndarray'>) (default: <function mean at 0x7fba13a1f100>)"/> + <param argument="min_periods" type="integer" value="0" optional="false" label="The minimum number of periods to get a valid value" help=""/> + <param argument="center" type="boolean" label="If True, center the rolling window" help="" checked="true" truevalue="" falsevalue=""/> + </when> + </conditional> + </when> + <when value="scores"> + <conditional name="method_cond" label="Method"> + <param name="method_select" type="select" label="Method"> + <option value="assignKNNScore">assignKNNScore: Score datapoints by an aggregation of the dictances to their k nearest neighbors</option> + <option value="assignLOF">assignLOF: Assign Local Outlier Factor (LOF)</option> + <option value="assignUniLOF">assignUniLOF: Assign "univariate" Local Outlier Factor (LOF)</option> + <option value="assignZScore">assignZScore: Calculate (rolling) Zscores</option> + </param> + <when value="assignKNNScore"> + <repeat name="field_repeat" title="field(s)" min="1"> + <param argument="field" type="text" value="" optional="false" label="List of variables names to 
process" help=""/> + </repeat> + <param argument="target" type="text" value="" optional="false" label="Variable name to which the results are written" help=""/> + <param argument="n" type="integer" value="10" optional="false" label="The number of nearest neighbors to which the distance is comprised in every datapoints scoring calculation" help=""/> + <param argument="func" type="text" value="" optional="false" label="A function that assigns a score to every one dimensional array, containing the distances" help="to every datapoints `n` nearest neighbors.function ([<class 'pandas.core.series.Series'>], <class 'float'>) (default: <function sum at 0x7fba13a1da80>)"/> + <conditional name="freq_cond"> + <param name="freq_select" type="select" label="freq input mode"> + <option value="none">None</option> + <option value="number">Give as period length</option> + <option value="offset">specify as offset</option> + </param> + <when value="none"/> + <when value="number"> + <param argument="freq" type="float" value="inf" optional="true" label="Determines the segmentation of the data into partitions, the kNN algorithm is" help="Multiple of sampling rate"/> + </when> + <when value="offset"> + <param argument="freq" type="text" value="inf" optional="true" label="Determines the segmentation of the data into partitions, the kNN algorithm is" help="offset frequency string"/> + </when> + <when value="none"> + <param name="freq" type="hidden" value="__none__" label=""/> + </when> + </conditional> + <param argument="min_periods" type="integer" value="2" optional="false" label="The minimum number of periods that have to be present in a window for the kNN scoring" help="to be applied. 
If the number of periods present is below `min_periods`, the score for the datapoints in that window will be np.nan."/> + <param argument="algorithm" type="select" value="ball_tree" optional="false" label="The search algorithm to find each datapoint's k nearest neighbors" help="The keyword just gets passed on to the underlying sklearn method. See reference [1] for more information on the algorithm."> + <option value="ball_tree">ball_tree</option> + <option value="kd_tree">kd_tree</option> + <option value="brute">brute</option> + <option value="auto">auto</option> + </param> + <param argument="metric" type="text" value="minkowski" optional="false" label="The metric the distances to any datapoint's neighbors are computed with" help="together with the default of `p` results in the Euclidean metric being applied. The keyword just gets passed on to the underlying sklearn method. See reference [1] for more information on the algorithm."/> + <param argument="p" type="integer" value="2" optional="false" label="The grade of the metric specified by parameter `metric`" help="The keyword just gets passed on to the underlying sklearn method. See reference [1] for more information on the algorithm."/> + </when> + <when value="assignLOF"> + <repeat name="field_repeat" title="field(s)" min="1"> + <param argument="field" type="text" value="" optional="false" label="List of variable names to process" help=""/> + </repeat> + <param argument="target" type="text" value="" optional="false" label="Variable name to which the results are written" help=""/> + <param argument="n" type="integer" value="20" optional="false" label="Number of periods to be included into the LOF calculation" help="suitable in the literature. * `n` determines the 'locality' of an observation (its `n` nearest neighbors) and sets the upper limit of values of an outlier cluster (i.e. consecutive outliers). Outlier clusters of size greater than `n/2` may not be detected reliably.
* The larger `n`, the lesser the algorithm's sensitivity to local outliers and small or singleton outlier points. Higher values greatly increase numerical costs."/> + <conditional name="freq_cond"> + <param name="freq_select" type="select" label="freq input mode"> + <option value="none">None</option> + <option value="number">Give as period length</option> + <option value="offset">specify as offset</option> + </param> + <when value="none"/> + <when value="number"> + <param argument="freq" type="float" value="inf" optional="true" label="Determines the segmentation of the data into partitions, the kNN algorithm is" help="Multiple of sampling rate"/> + </when> + <when value="offset"> + <param argument="freq" type="text" value="inf" optional="true" label="Determines the segmentation of the data into partitions, the kNN algorithm is" help="offset frequency string"/> + </when> + <when value="none"> + <param name="freq" type="hidden" value="__none__" label=""/> + </when> + </conditional> + <param argument="min_periods" type="integer" value="2" optional="false" label="min_periods" help=""/> + <param argument="algorithm" type="select" value="ball_tree" optional="false" label="Algorithm used for calculating the `n`-nearest neighbors needed for LOF calculation" help=""> + <option value="ball_tree">ball_tree</option> + <option value="kd_tree">kd_tree</option> + <option value="brute">brute</option> + <option value="auto">auto</option> + </param> + <param argument="p" type="integer" value="2" optional="false" label="Degree of the metric ('Minkowski'), according to which distance to neighbors is determined" help="Most important values are: * `1` - Manhattan Metric * `2` - Euclidean Metric"/> + </when> + <when value="assignUniLOF"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="n" type="integer" value="20" optional="false" label="Number of periods to be included into the LOF calculation" help="suitable in the
literature. * `n` determines the 'locality' of an observation (its `n` nearest neighbors) and sets the upper limit of values of an outlier cluster (i.e. consecutive outliers). Outlier clusters of size greater than `n/2` may not be detected reliably. * The larger `n`, the lesser the algorithm's sensitivity to local outliers and small or singleton outlier points. Higher values greatly increase numerical costs."/> + <param argument="algorithm" type="select" value="ball_tree" optional="false" label="Algorithm used for calculating the `n`-nearest neighbors needed for LOF calculation" help=""> + <option value="ball_tree">ball_tree</option> + <option value="kd_tree">kd_tree</option> + <option value="brute">brute</option> + <option value="auto">auto</option> + </param> + <param argument="p" type="integer" value="1" optional="false" label="Degree of the metric ('Minkowski'), according to which distance to neighbors is determined" help="Most important values are: * `1` - Manhattan Metric * `2` - Euclidean Metric"/> + <conditional name="density_cond"> + <param name="density_select" type="select" label="density mode"> + <option value="auto">automatic</option> + <option value="linear">linear</option> + <option value="custom">custom</option> + </param> + <when value="auto"> + <param name="density" type="hidden" value="auto" label=""/> + </when> + <when value="linear"> + <param argument="density" type="float" value="" optional="false" label="How to calculate the temporal distance/density for the variable-to-be-flagged" help="* `auto` - introduces linear density with an increment equal to the median of the absolute diff of the variable to be flagged * float - introduces linear density with an increment equal to `density` * Callable - calculates the density by applying the function passed onto the variable to be flagged (passed as Series)."/> + </when> + <when value="custom"> + <param argument="density" type="text" value="auto" optional="false" label="How to calculate the temporal
distance/density for the variable-to-be-flagged" help="* `auto` - introduces linear density with an increment equal to the median of the absolute diff of the variable to be flagged * float - introduces linear density with an increment equal to `density` * Callable - calculates the density by applying the function passed onto the variable to be flagged (passed as Series)."/> + </when> + </conditional> + <param argument="fill_na" type="text" value="linear" optional="false" label="Whether or not to fill NaN values in the data with a linear interpolation" help=""/> + </when> + <when value="assignZScore"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="window" type="text" optional="true" label="Size of the window" help="by an integer, denoting the window's number of periods. `NaN` measurements also count as periods. If `None` is passed, all data points share the same scoring window, which then equals the whole data."/> + <param argument="norm_func" type="text" value="" optional="false" label="Function to calculate the scaling for every window" help="function () (default: nanstd)"/> + <param argument="model_func" type="text" value="" optional="false" label="Function to calculate the center moment in every window" help="function () (default: nanmean)"/> + <param argument="center" type="boolean" label="Whether or not to center the target value in the scoring window" help="target value is the last value in the window."
checked="true" truevalue="" falsevalue=""/> + <param argument="min_periods" type="integer" optional="true" label="Minimum number of valid measurements in a scoring window, to consider the resulting score valid" help=""/> + </when> + </conditional> + </when> + <when value="tools"> + <conditional name="method_cond" label="Method"> + <param name="method_select" type="select" label="Method"> + <option value="copyField">copyField: Copy data and flags to a new name (preserve flags history)</option> + <option value="dropField">dropField: Drops field from the data and flags</option> + <option value="plot">plot: Plot data and flags or store plot to file</option> + <option value="renameField">renameField: Rename field in data and flags</option> + <option value="selectTime">selectTime: Realizes masking within saqc</option> + </param> + <when value="copyField"> + <param argument="field" type="text" value="" optional="false" label="field" help=""/> + <param argument="target" type="text" value="" optional="false" label="target" help=""/> + <param argument="overwrite" type="boolean" label="overwrite" help="" checked="false" truevalue="" falsevalue=""/> + </when> + <when value="dropField"> + <param argument="field" type="text" value="" optional="false" label="field" help=""/> + </when> + <when value="plot"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="path" type="text" optional="true" label="If ``None`` is passed, interactive mode is entered; plots are shown immediately" help="and a user needs to close them manually before execution continues. If a filepath is passed instead, store-mode is entered and the plot is stored under the passed location."/> + <param argument="max_gap" type="text" optional="true" label="If ``None``, all data points will be connected, resulting in long linear" help="lines, in case of large data gaps. ``NaN`` values will be removed before plotting.
If an offset string is passed, only points that have a distance below ``max_gap`` are connected via the plotting line."/> + <conditional name="history_cond"> + <param name="history_select" type="select" label="history mode"> + <option selected="true" value="valid">valid</option> + <option value="complete">complete</option> + <option value="list">list</option> + <option value="none">None</option> + </param> + <when value="valid"> + <param name="history" type="hidden" value="valid" label=""/> + </when> + <when value="complete"> + <param name="history" type="hidden" value="complete" label=""/> + </when> + <when value="list"/> + <when value="none"> + <param name="history" type="hidden" value="none" label=""/> + </when> + <when value="none"> + <param name="history" type="hidden" value="__none__" label=""/> + </when> + </conditional> + <param argument="phaseplot" type="text" optional="true" label="If a string is passed, plot ``field`` in the phase space it forms together with the" help="variable ``phaseplot``."/> + <param argument="dfilter" type="float" value="inf" optional="false" label="Defines which observations will be masked based on the already existing flags" help=""/> + </when> + <when value="renameField"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="new_name" type="text" value="" optional="false" label="String, field is to be replaced with" help=""/> + </when> + <when value="selectTime"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="mode" type="select" value="" optional="false" label="The masking mode" help="- 'periodic': parameters 'start', 'end' are evaluated to generate a periodical mask - 'selection_field': data[selection_field] is expected to be a boolean valued timeseries and is used as mask."> + <option value="periodic">periodic</option> + <option value="selection_field">selection_field</option> + </param> +
<param argument="selection_field" type="text" optional="true" label="Only effective if mode == &quot;selection_field&quot;" help="Fieldname of the column, holding the data that is to be used as mask. (must be boolean series) Neither the series' length nor its labels have to match data[field]'s index and length. An inner join of the indices will be calculated and values get masked where the values of the inner join are ``True``."/> + <param argument="start" type="text" optional="true" label="Only effective if mode == &quot;periodic&quot;" help="String denoting starting point of every period. Formally, it has to be a truncated instance of &quot;mm-ddTHH:MM:SS&quot;. Has to be of same length as `end` parameter. See examples section below for some examples."/> + <param argument="end" type="text" optional="true" label="Only effective if mode == &quot;periodic&quot;" help="String denoting ending point of every period. Formally, it has to be a truncated instance of &quot;mm-ddTHH:MM:SS&quot;. Has to be of same length as `start` parameter. See examples section below for some examples."/> + <param argument="closed" type="boolean" label="Whether or not to include the mask defining bounds in the mask" help="" checked="true" truevalue="" falsevalue=""/> + </when> + </conditional> + </when> + <when value="transformation"> + <conditional name="method_cond" label="Method"> + <param name="method_select" type="select" label="Method"> + <option value="transform">transform: Transform data by applying a custom function on data chunks of variable size.
Existing flags are preserved</option> + </param> + <when value="transform"> + <param argument="field" type="text" value="" optional="false" label="Variable to process" help=""/> + <param argument="func" type="text" value="" optional="false" label="Transformation function" help="function ([pandas.core.series.Series | numpy.ndarray], &lt;class 'pandas.core.series.Series'&gt;) (default: )"/> + <conditional name="freq_cond"> + <param name="freq_select" type="select" label="freq input mode"> + <option value="none">None</option> + <option value="number">Give as period length</option> + <option value="offset">Specify as offset</option> + </param> + <when value="none"/> + <when value="number"> + <param argument="freq" type="float" optional="true" label="Size of the data window" help="Multiple of sampling rate"/> + </when> + <when value="offset"> + <param argument="freq" type="text" optional="true" label="Size of the data window" help="offset frequency string"/> + </when> + <when value="none"> + <param name="freq" type="hidden" value="__none__" label=""/> + </when> + </conditional> + </when> + </conditional> + </when> + </conditional> + </repeat> + </inputs> + <outputs> + <data name="output" format="csv" from_work_dir="output.csv" hidden="false"/> + <data name="config" format="txt" from_work_dir="config.csv" hidden="false"/> + <collection name="plots" type="list" label="${tool.name} on ${on_string}: Plots"> + <discover_datasets pattern="(?P&lt;name&gt;.*)\.png" ext="png"/> + </collection> + </outputs> + <expand macro="saqc_tests"/> + <help><![CDATA[TODO]]></help> + <expand macro="citations"/> +</tool> +