Merge: Fix dtypes related to floating point precision (#254)

Scienfitz · web-flow · commit 95f7f2571f03 · 2024-06-04T14:38:04.000+02:00
Since floating point precision can be controlled via env vars (#226), various problems have surfaced that cause tests to fail in single precision. This PR fixes those. They were mostly related to the way `values` and `comp_df` were created for parameters, the way `selection` was treated in `SubSelectionCondition`, and a `lookup` with a different float precision being used in a simulation. The only remaining issues with tests in single precision are numerical instabilities (out of scope).
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Fixed
 - Non-GP surrogates not working with `deepcopy` and the simulation module due to slotted
   base class
+- Datatype inconsistencies for various parameters' `values` and `comp_df` and 
+  `SubSelectionCondition`'s `selection` related to floating point precision
 
 ## [0.9.0] - 2024-05-21
 ### Added
diff --git a/baybe/constraints/base.py b/baybe/constraints/base.py
@@ -6,18 +6,19 @@
 from collections.abc import Sequence
 from typing import TYPE_CHECKING, Any, ClassVar
 
+import numpy as np
 import pandas as pd
 from attr import define, field
 from attr.validators import min_len
 
-from baybe.constraints.conditions import Condition
 from baybe.parameters import NumericalContinuousParameter
 from baybe.serialization import (
     SerialMixin,
     converter,
     get_base_structure_hook,
     unstructure_base,
 )
+from baybe.utils.numerical import DTypeFloatNumpy
 
 if TYPE_CHECKING:
     from torch import Tensor
@@ -173,16 +174,13 @@ def to_botorch(
             if p in param_names
         ]
 
-        # TODO: Cast rhs to correct precision once BoTorch also supports single point.
         return (
             torch.tensor(param_indices),
             torch.tensor(self.coefficients, dtype=DTypeFloatTorch),
-            self.rhs,
+            np.asarray(self.rhs, dtype=DTypeFloatNumpy).item(),
         )
 
 
 # Register (un-)structure hooks
-converter.register_unstructure_hook(Condition, unstructure_base)
-converter.register_structure_hook(Condition, get_base_structure_hook(Condition))
 converter.register_unstructure_hook(Constraint, unstructure_base)
 converter.register_structure_hook(Constraint, get_base_structure_hook(Constraint))
diff --git a/baybe/constraints/conditions.py b/baybe/constraints/conditions.py
@@ -2,16 +2,26 @@
 
 import operator as ops
 from abc import ABC, abstractmethod
+from functools import partial
 from typing import Any, Callable, Optional, Union
 
 import numpy as np
 import pandas as pd
 from attr import define, field
 from attr.validators import in_
+from attrs.validators import min_len
+from cattrs.gen import override
 from funcy import rpartial
 from numpy.typing import ArrayLike
 
-from baybe.serialization import SerialMixin
+from baybe.parameters.validation import validate_unique_values
+from baybe.serialization import (
+    SerialMixin,
+    converter,
+    get_base_structure_hook,
+    unstructure_base,
+)
+from baybe.utils.numerical import DTypeFloatNumpy
 
 
 def _is_not_close(x: ArrayLike, y: ArrayLike, rtol: float, atol: float) -> np.ndarray:
@@ -135,9 +145,38 @@ class SubSelectionCondition(Condition):
     """Class for defining valid parameter entries."""
 
     # object variables
-    selection: list[Any] = field()
-    """The list of items which are considered valid."""
+    _selection: tuple = field(
+        converter=tuple,
+        # FIXME[typing]: https://github.com/python-attrs/attrs/issues/1197
+        validator=[
+            min_len(1),
+            validate_unique_values,  # type: ignore
+        ],
+    )
+    """The internal list of items which are considered valid."""
+
+    @property
+    def selection(self) -> tuple:  # noqa: D102
+        """The list of items which are considered valid."""
+        return tuple(
+            DTypeFloatNumpy(itm) if isinstance(itm, (float, int, bool)) else itm
+            for itm in self._selection
+        )
 
     def evaluate(self, data: pd.Series) -> pd.Series:  # noqa: D102
         # See base class.
         return data.isin(self.selection)
+
+
+# Register (un-)structure hooks
+_overrides = {
+    "_selection": override(rename="selection"),
+}
+# FIXME[typing]: https://github.com/python/mypy/issues/4717
+converter.register_structure_hook(
+    Condition,
+    get_base_structure_hook(Condition, overrides=_overrides),  # type: ignore
+)
+converter.register_unstructure_hook(
+    Condition, partial(unstructure_base, overrides=_overrides)
+)
diff --git a/baybe/objectives/desirability.py b/baybe/objectives/desirability.py
@@ -17,6 +17,7 @@
 from baybe.targets.numerical import NumericalTarget
 from baybe.utils.basic import to_tuple
 from baybe.utils.numerical import geom_mean
+from baybe.utils.validation import finite_float
 
 
 def _is_all_numerical_targets(
@@ -73,7 +74,7 @@ class DesirabilityObjective(Objective):
 
     weights: tuple[float, ...] = field(
         converter=lambda w: cattrs.structure(w, tuple[float, ...]),
-        validator=deep_iterable(member_validator=gt(0.0)),
+        validator=deep_iterable(member_validator=[finite_float, gt(0.0)]),
     )
     """The weights to balance the different targets.
     By default, all targets are considered equally important."""
diff --git a/baybe/parameters/base.py b/baybe/parameters/base.py
@@ -130,15 +130,15 @@ class ContinuousParameter(Parameter):
 
 
 # Register (un-)structure hooks
-overrides = {
+_overrides = {
     "_values": override(rename="values"),
     "decorrelate": override(struct_hook=lambda x, _: x),
 }
 # FIXME[typing]: https://github.com/python/mypy/issues/4717
 converter.register_structure_hook(
     Parameter,
-    get_base_structure_hook(Parameter, overrides=overrides),  # type: ignore
+    get_base_structure_hook(Parameter, overrides=_overrides),  # type: ignore
 )
 converter.register_unstructure_hook(
-    Parameter, partial(unstructure_base, overrides=overrides)
+    Parameter, partial(unstructure_base, overrides=_overrides)
 )
diff --git a/baybe/parameters/categorical.py b/baybe/parameters/categorical.py
@@ -11,6 +11,7 @@
 from baybe.parameters.base import DiscreteParameter
 from baybe.parameters.enum import CategoricalEncoding
 from baybe.parameters.validation import validate_unique_values
+from baybe.utils.numerical import DTypeFloatNumpy
 
 
 @define(frozen=True, slots=False)
@@ -47,9 +48,13 @@ def comp_df(self) -> pd.DataFrame:  # noqa: D102
         # See base class.
         if self.encoding is CategoricalEncoding.OHE:
             cols = [f"{self.name}_{val}" for val in self.values]
-            comp_df = pd.DataFrame(np.eye(len(self.values), dtype=int), columns=cols)
+            comp_df = pd.DataFrame(
+                np.eye(len(self.values), dtype=DTypeFloatNumpy), columns=cols
+            )
         elif self.encoding is CategoricalEncoding.INT:
-            comp_df = pd.DataFrame(range(len(self.values)), columns=[self.name])
+            comp_df = pd.DataFrame(
+                range(len(self.values)), dtype=DTypeFloatNumpy, columns=[self.name]
+            )
         comp_df.index = pd.Index(self.values)
 
         return comp_df
diff --git a/baybe/parameters/custom.py b/baybe/parameters/custom.py
@@ -13,6 +13,7 @@
 from baybe.parameters.validation import validate_decorrelation
 from baybe.utils.boolean import eq_dataframe
 from baybe.utils.dataframe import df_uncorrelated_features
+from baybe.utils.numerical import DTypeFloatNumpy
 
 
 @define(frozen=True, slots=False)
@@ -100,7 +101,9 @@ def comp_df(self) -> pd.DataFrame:  # noqa: D102
         # The encoding is directly provided by the user
         # We prepend the parameter name to the columns names to avoid potential
         # conflicts with other parameters
-        comp_df = self.data.rename(columns=lambda x: f"{self.name}_{x}")
+        comp_df = self.data.rename(columns=lambda x: f"{self.name}_{x}").astype(
+            DTypeFloatNumpy
+        )
 
         # Get a decorrelated subset of the provided features
         if self.decorrelate:
diff --git a/baybe/parameters/numerical.py b/baybe/parameters/numerical.py
@@ -62,8 +62,8 @@ def _validate_tolerance(  # noqa: DOC101, DOC103
         if tolerance == 0.0:
             return
 
-        min_dist = np.diff(self.values).min()
-        if min_dist == (eps := np.nextafter(0, 1, dtype=DTypeFloatNumpy)):
+        min_dist = np.diff(self._values).min()
+        if min_dist == (eps := np.nextafter(0, 1)):
             raise NumericalUnderflowError(
                 f"The distance between any two parameter values must be at least "
                 f"twice the size of the used floating point resolution of {eps}."
@@ -79,12 +79,14 @@ def _validate_tolerance(  # noqa: DOC101, DOC103
     @property
     def values(self) -> tuple:  # noqa: D102
         # See base class.
-        return self._values
+        return tuple(DTypeFloatNumpy(itm) for itm in self._values)
 
     @cached_property
     def comp_df(self) -> pd.DataFrame:  # noqa: D102
         # See base class.
-        comp_df = pd.DataFrame({self.name: self.values}, index=self.values)
+        comp_df = pd.DataFrame(
+            {self.name: self.values}, index=self.values, dtype=DTypeFloatNumpy
+        )
         return comp_df
 
     def is_in_range(self, item: float) -> bool:  # noqa: D102
diff --git a/baybe/parameters/substance.py b/baybe/parameters/substance.py
@@ -108,8 +108,6 @@ def _validate_substance_data(  # noqa: DOC101, DOC103
     @property
     def values(self) -> tuple:
         """Returns the labels of the given set of molecules."""
-        # Since the order of dictionary keys is important here, this will only work
-        # for Python 3.7 or higher
         return tuple(self.data.keys())
 
     @cached_property
diff --git a/baybe/simulation/core.py b/baybe/simulation/core.py
@@ -16,7 +16,7 @@
 from baybe.simulation.lookup import _look_up_target_values
 from baybe.targets.enum import TargetMode
 from baybe.utils.dataframe import add_parameter_noise
-from baybe.utils.numerical import closer_element, closest_element
+from baybe.utils.numerical import DTypeFloatNumpy, closer_element, closest_element
 from baybe.utils.random import temporary_seed
 
 
@@ -112,6 +112,12 @@ def simulate_experiment(
                 "Impute mode 'ignore' is only available for dataframe lookups."
             )
 
+        # Enforce correct float precision in lookup dataframes
+        if isinstance(lookup, pd.DataFrame):
+            lookup = lookup.copy()
+            float_cols = lookup.select_dtypes(include=["float"]).columns
+            lookup[float_cols] = lookup[float_cols].astype(DTypeFloatNumpy)
+
         # Clone the campaign to avoid mutating the original object
         # TODO: Reconsider if deepcopies are required once [16605] is resolved
         campaign = deepcopy(campaign)
diff --git a/baybe/simulation/lookup.py b/baybe/simulation/lookup.py
@@ -59,7 +59,7 @@ def _look_up_target_values(
         #   column ordering, which is not robust. Instead, the callable should return
         #   a dataframe with properly labeled columns.
 
-        # Since the return of a lookup function is a a tuple, the following code stores
+        # Since the return of a lookup function is a tuple, the following code stores
         # tuples of floats in a single column with label 0:
         measured_targets = queries.apply(lambda x: lookup(*x.values), axis=1).to_frame()
         # We transform this column to a DataFrame in which there is an individual
@@ -79,7 +79,7 @@ def _look_up_target_values(
             queries[target.name] = measured_targets.iloc[:, k_target]
 
     # Get results via dataframe lookup (works only for exact matches)
-    # IMPROVE: Although its not too important for a simulation, this
+    # IMPROVE: Although it's not too important for a simulation, this
     #  could also be implemented for approximate matches
     elif isinstance(lookup, pd.DataFrame):
         all_match_vals = []
diff --git a/baybe/utils/chemistry.py b/baybe/utils/chemistry.py
@@ -88,13 +88,10 @@ def _smiles_to_mordred_features(smiles: str) -> np.ndarray:
     """
     try:
         return np.asarray(
-            _mordred_calculator(Chem.MolFromSmiles(smiles)).fill_missing(),
-            dtype=DTypeFloatNumpy,
+            _mordred_calculator(Chem.MolFromSmiles(smiles)).fill_missing()
         )
     except Exception:
-        return np.full(
-            len(_mordred_calculator.descriptors), np.NaN, dtype=DTypeFloatNumpy
-        )
+        return np.full(len(_mordred_calculator.descriptors), np.NaN)
 
 
 def smiles_to_mordred_features(
@@ -117,7 +114,7 @@ def smiles_to_mordred_features(
     features = [_smiles_to_mordred_features(smiles) for smiles in smiles_list]
     descriptor_names = list(_mordred_calculator.descriptors)
     columns = [prefix + "MORDRED_" + str(name) for name in descriptor_names]
-    dataframe = pd.DataFrame(data=features, columns=columns)
+    dataframe = pd.DataFrame(data=features, columns=columns, dtype=DTypeFloatNumpy)
 
     if dropna:
         dataframe = dataframe.dropna(axis=1)
@@ -169,7 +166,7 @@ def smiles_to_rdkit_features(
     res = []
     for mol in mols:
         desc = {
-            prefix + "RDKIT_" + dname: func(mol)
+            prefix + "RDKIT_" + dname: DTypeFloatNumpy(func(mol))
             for dname, func in Chem.Descriptors.descList
         }
         res.append(desc)
diff --git a/baybe/utils/memory.py b/baybe/utils/memory.py
@@ -14,4 +14,4 @@ def bytes_to_human_readable(num: float, /) -> tuple[float, str]:
         if abs(num) < 1024.0:
             return num, unit
         num /= 1024.0
-    return num, "YB"
+    return round(num, 2), "YB"
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -231,7 +231,7 @@ def fixture_parameters(
         CategoricalParameter(
             name="Categorical_2",
             values=("bad", "OK", "good"),
-            encoding="OHE",
+            encoding="INT",
         ),
         CategoricalParameter(
             name="Switch_1",
diff --git a/tests/hypothesis_strategies/acquisition.py b/tests/hypothesis_strategies/acquisition.py
@@ -17,20 +17,18 @@
     qUpperConfidenceBound,
 )
 
+from ..hypothesis_strategies.basic import finite_floats
+
 # These acqfs are ordered roughly according to increasing complexity
 acquisition_functions = st.one_of(
     st.builds(ExpectedImprovement),
     st.builds(ProbabilityOfImprovement),
-    st.builds(
-        UpperConfidenceBound, beta=st.floats(min_value=0.0, allow_infinity=False)
-    ),
+    st.builds(UpperConfidenceBound, beta=finite_floats(min_value=0.0)),
     st.builds(PosteriorMean),
     st.builds(LogExpectedImprovement),
     st.builds(qExpectedImprovement),
     st.builds(qProbabilityOfImprovement),
-    st.builds(
-        qUpperConfidenceBound, beta=st.floats(min_value=0.0, allow_infinity=False)
-    ),
+    st.builds(qUpperConfidenceBound, beta=finite_floats(min_value=0.0)),
     st.builds(qSimpleRegret),
     st.builds(qLogExpectedImprovement),
     st.builds(qNoisyExpectedImprovement),
diff --git a/tests/hypothesis_strategies/basic.py b/tests/hypothesis_strategies/basic.py
@@ -3,6 +3,14 @@
 from functools import partial
 
 import hypothesis.strategies as st
+import numpy as np
 
-finite_floats = partial(st.floats, allow_infinity=False, allow_nan=False)
+from baybe.utils.numerical import DTypeFloatNumpy
+
+finite_floats = partial(
+    st.floats,
+    allow_infinity=False,
+    allow_nan=False,
+    width=32 if DTypeFloatNumpy == np.float32 else 64,
+)
 """A strategy producing finite (i.e., non-nan and non-infinite) floats."""
diff --git a/tests/hypothesis_strategies/constraints.py b/tests/hypothesis_strategies/constraints.py
diff --git a/tests/hypothesis_strategies/dataframes.py b/tests/hypothesis_strategies/dataframes.py
diff --git a/tests/hypothesis_strategies/objectives.py b/tests/hypothesis_strategies/objectives.py
diff --git a/tests/hypothesis_strategies/parameters.py b/tests/hypothesis_strategies/parameters.py
diff --git a/tests/test_searchspace.py b/tests/test_searchspace.py
diff --git a/tests/validation/test_objective_validation.py b/tests/validation/test_objective_validation.py
diff --git a/tests/validation/test_parameter_validation.py b/tests/validation/test_parameter_validation.py
diff --git a/tox.ini b/tox.ini

Original file line number	Diff line number	Diff line change
`@@ -130,15 +130,15 @@ class ContinuousParameter(Parameter):`
`130`	`130`
`131`	`131`
`132`	`132`	`# Register (un-)structure hooks`
`133`		`-overrides = {`
	`133`	`+_overrides = {`
`134`	`134`	`"_values": override(rename="values"),`
`135`	`135`	`"decorrelate": override(struct_hook=lambda x, _: x),`
`136`	`136`	`}`
`137`	`137`	`# FIXME[typing]: https://github.com/python/mypy/issues/4717`
`138`	`138`	`converter.register_structure_hook(`
`139`	`139`	`Parameter,`
`140`		`- get_base_structure_hook(Parameter, overrides=overrides), # type: ignore`
	`140`	`+ get_base_structure_hook(Parameter, overrides=_overrides), # type: ignore`
`141`	`141`	`)`
`142`	`142`	`converter.register_unstructure_hook(`
`143`		`- Parameter, partial(unstructure_base, overrides=overrides)`
	`143`	`+ Parameter, partial(unstructure_base, overrides=_overrides)`
`144`	`144`	`)`