CCSI-Toolset · franflame · Mar 1, 2024 · Mar 9, 2024 · Mar 9, 2024 · Mar 25, 2024
diff --git a/foqus_lib/framework/surrogate/keras_nn.py b/foqus_lib/framework/surrogate/keras_nn.py
@@ -41,6 +41,7 @@
 from pathlib import Path
 from tokenize import String
 
+from typing import Tuple
 import numpy as np
 import pandas as pd
 import tensorflow as tf  # pylint: disable=import-error
@@ -52,6 +53,20 @@
 from foqus_lib.framework.surrogate.surrogate import surrogate
 from foqus_lib.framework.uq.SurrogateParser import SurrogateParser
 
+from foqus_lib.framework.surrogate.scaling import (
+    BaseScaler,
+    LinearScaler,
+    LogScaler,
+    LogScaler2,
+    PowerScaler,
+    PowerScaler2,
+    map_name_to_scaler,
+    scale_dataframe,
+)
+
+# map_name_to_scaler (imported above) maps the human-readable name of each
+# scaling variant to an instance of the corresponding scaler class
+
 
 # custom class to define Keras NN layers
 @tf.keras.utils.register_keras_serializable()
@@ -293,6 +308,14 @@ def __init__(self, dat=None):
             desc="Name of output file for model, should have file extension: .keras",
             hint="Enter a custom file name if desired",
         )
+        # add the scaling_function option (normalization form), make it a dropdown option
+        self.options.add(
+            name="scaling_function",
+            default="Linear",
+            dtype=str,
+            desc="Scaling/normalization function for input data",
+            validValues=list(map_name_to_scaler.keys()),
+        )
 
     def run(self):
         """
@@ -316,6 +339,9 @@ def run(self):
         self.msgQueue.put(f"input data columns: {input_data.columns}")
         self.msgQueue.put(f"output data columns: {output_data.columns}")
 
+        # the scaling function option is extracted further below, where the
+        # scaler object is looked up and applied to the input and output data
+
         # np.random.seed(46)
         # rn.seed(1342)
         # tf.random.set_seed(62)
@@ -341,22 +367,13 @@ def run(self):
         xdata = input_data
         zdata = output_data
 
-        xdata_bounds = {i: (xdata[i].min(), xdata[i].max()) for i in xdata}  # x bounds
-        zdata_bounds = {j: (zdata[j].min(), zdata[j].max()) for j in zdata}  # z bounds
-
-        # normalize data using Linear form
-        # users can normalize with any allowed form # manually, and then pass the
-        # appropriate flag to FOQUS from the allowed list:
-        # ["Linear", "Log", "Power", "Log 2", "Power 2"] - see the documentation for
-        # details on the scaling formulations
-        xmax, xmin = xdata.max(axis=0), xdata.min(axis=0)
-        zmax, zmin = zdata.max(axis=0), zdata.min(axis=0)
-        xdata, zdata = np.array(xdata), np.array(zdata)
-        for i in range(len(xdata)):
-            for j in range(len(xlabels)):
-                xdata[i, j] = (xdata[i, j] - xmin[j]) / (xmax[j] - xmin[j])
-            for j in range(len(zlabels)):
-                zdata[i, j] = (zdata[i, j] - zmin[j]) / (zmax[j] - zmin[j])
+        scaling_func_option = self.options["scaling_function"].value
+
+        scaler_instance = map_name_to_scaler[scaling_func_option]
+        xdata, xdata_bounds = scale_dataframe(xdata, scaler_instance)
+        zdata, zdata_bounds = scale_dataframe(zdata, scaler_instance)
+
+        print(f"using scaling function: {scaling_func_option}")
 
         # method to create model
         def create_model():
@@ -370,7 +387,7 @@ def create_model():
                 input_bounds=xdata_bounds,
                 output_bounds=zdata_bounds,
                 normalized=True,
-                normalization_form="Linear",
+                normalization_form=scaling_func_option,
             )
 
             outputs = layers(inputs)  # use network as function outputs = f(inputs)

diff --git a/foqus_lib/framework/surrogate/pytorch_nn.py b/foqus_lib/framework/surrogate/pytorch_nn.py
@@ -50,6 +50,16 @@
 # from foqus_lib.framework.graph.graph import Graph
 from foqus_lib.framework.surrogate.surrogate import surrogate
 from foqus_lib.framework.uq.SurrogateParser import SurrogateParser
+from foqus_lib.framework.surrogate.scaling import (
+    BaseScaler,
+    LinearScaler,
+    LogScaler,
+    LogScaler2,
+    PowerScaler,
+    PowerScaler2,
+    map_name_to_scaler,
+    scale_dataframe,
+)
 
 # custom class to define Keras NN layers
 np.random.seed(46)
@@ -284,6 +294,13 @@ def __init__(self, dat=None):
             desc="Name of output file for model, should have file extension: .pt",
             hint="Enter a custom file name if desired",
         )
+        self.options.add(
+            name="scaling_function",
+            default="Linear",
+            dtype=str,
+            desc="Scaling/normalization function for input data",
+            validValues=["Linear", "Log", "Log2", "Power", "Power2"],
+        )
 
     def run(self):
         """
@@ -326,22 +343,16 @@ def run(self):
         zlabels = list(output_data.columns)
         xdata = input_data
         zdata = output_data
-        xdata_bounds = {i: (xdata[i].min(), xdata[i].max()) for i in xdata}  # x bounds
-        zdata_bounds = {j: (zdata[j].min(), zdata[j].max()) for j in zdata}  # z bounds
-
-        # normalize data using Linear form, pass as custom string and parse with SymPy
-        # users can normalize with any allowed form # manually, and then pass the
-        # appropriate flag to FOQUS from the allowed list:
-        # ["Linear", "Log", "Power", "Log 2", "Power 2", "Custom] - see the
-        # documentation for details on the scaling formulations
-        xmax, xmin = xdata.max(axis=0), xdata.min(axis=0)
-        zmax, zmin = zdata.max(axis=0), zdata.min(axis=0)
-        xdata, zdata = np.array(xdata), np.array(zdata)
-        for i in range(len(xdata)):
-            for j in range(len(xlabels)):
-                xdata[i, j] = (xdata[i, j] - xmin[j]) / (xmax[j] - xmin[j])
-            for j in range(len(zlabels)):
-                zdata[i, j] = (zdata[i, j] - zmin[j]) / (zmax[j] - zmin[j])
+        # xdata_bounds = {i: (xdata[i].min(), xdata[i].max()) for i in xdata}  # x bounds
+        # zdata_bounds = {j: (zdata[j].min(), zdata[j].max()) for j in zdata}  # z bounds
+
+        scaling_func_option = self.options["scaling_function"].value
+
+        scaler_instance = map_name_to_scaler[scaling_func_option]
+        xdata, xdata_bounds = scale_dataframe(xdata, scaler_instance)
+        zdata, zdata_bounds = scale_dataframe(zdata, scaler_instance)
+
+        print(f"using scaling function: {scaling_func_option}")
 
         model_data = np.concatenate(
             (xdata, zdata), axis=1
@@ -353,8 +364,11 @@ def run(self):
 
         # raise exception here after BPC position
         # create model
-        x_train = torch.from_numpy(xdata).float().to(device)
-        z_train = torch.from_numpy(zdata).float().to(device)
+
+        # need to convert xdata to a numpy array for the below to work
+        # otherwise causes TypeError: expected np.ndarray (got DataFrame)
+        x_train = torch.from_numpy(xdata.to_numpy()).float().to(device)
+        z_train = torch.from_numpy(zdata.to_numpy()).float().to(device)
 
         # print type at this point
         # can also print inside create_model

diff --git a/foqus_lib/framework/surrogate/scaling.py b/foqus_lib/framework/surrogate/scaling.py
@@ -0,0 +1,202 @@
+import copy
+import json
+import logging
+import math
+from collections import OrderedDict
+
+import numpy as np
+import pandas as pd
+from typing import Tuple
+
+
+def validate_for_scaling(array_in, lo, hi) -> None:
+    """Check that ``array_in`` can be scaled between ``lo`` and ``hi``.
+
+    The checks run in the order listed below and the first failure raises.
+
+    Args:
+        array_in: data to validate; its ``ndim``, ``size`` and ``dtype``
+            attributes are read.
+        lo: lower bound that will be used for scaling.
+        hi: upper bound that will be used for scaling.
+
+    Raises:
+        ValueError: if the data contains NaN or inf, is not 1D, has fewer
+            than 2 values, has bounds that ``np.allclose`` treats as equal,
+            or fails ``check_under_or_overflow``.
+    """
+    # Lambdas keep each check lazy, so a later check only runs once every
+    # earlier one has passed.
+    ordered_checks = (
+        (
+            lambda: np.all(np.isfinite(array_in)),
+            "Input data cannot contain NaN or inf values",
+        ),
+        (lambda: array_in.ndim == 1, "Only 1D arrays supported"),
+        (lambda: array_in.size >= 2, "Array must have at least 2 values"),
+        (
+            lambda: not np.allclose(lo, hi),
+            "Array must contain non-identical values",
+        ),
+        (
+            lambda: check_under_or_overflow(array_in),
+            "Array contains under/overflow values for dtype",
+        ),
+    )
+    for passes, message in ordered_checks:
+        if not passes():
+            raise ValueError(message)
+
+
+def check_under_or_overflow(arr):
+    """Return whether all values lie strictly inside the range of ``arr.dtype``.
+
+    Args:
+        arr: array whose ``dtype`` is an integer or floating point type.
+
+    Returns:
+        True when every value is strictly below the dtype maximum and
+        strictly above the dtype minimum, False otherwise.
+
+    Raises:
+        ValueError: if the dtype is neither integer nor floating point.
+    """
+    # pick np.iinfo for integer dtypes and np.finfo for floating dtypes
+    for abstract_kind, limits_for in (
+        (np.integer, np.iinfo),
+        (np.floating, np.finfo),
+    ):
+        if np.issubdtype(arr.dtype, abstract_kind):
+            limits = limits_for(arr.dtype)
+            break
+    else:
+        raise ValueError("Unsupported data type")
+    below_upper = np.all(arr < limits.max)
+    above_lower = np.all(arr > limits.min)
+    return below_upper & above_lower
+
+
+def scale_linear(array_in, lo=None, hi=None):
+    """Linearly rescale ``array_in`` so that ``lo`` maps to 0 and ``hi`` to 1.
+
+    Args:
+        array_in: 1D data to scale.
+        lo: lower bound; defaults to the minimum of ``array_in``.
+        hi: upper bound; defaults to the maximum of ``array_in``.
+
+    Returns:
+        ``(array_in - lo) / (hi - lo)``.
+
+    Raises:
+        ValueError: if ``validate_for_scaling`` rejects the data or bounds.
+    """
+    if lo is None:
+        lo = np.min(array_in)
+    if hi is None:
+        hi = np.max(array_in)
+    # validate_for_scaling raises when lo and hi are (nearly) identical, so
+    # the span below is never zero and no special case is needed for it
+    validate_for_scaling(array_in, lo, hi)
+    return (array_in - lo) / (hi - lo)
+
+
+def scale_log(array_in, lo=None, hi=None):
+    """Scale ``array_in`` on a log10 axis so that ``lo`` maps to 0 and ``hi`` to 1.
+
+    Args:
+        array_in: 1D data to scale; every value must be at least 1e-8.
+        lo: lower bound; defaults to the minimum of ``array_in``.
+        hi: upper bound; defaults to the maximum of ``array_in``.
+
+    Returns:
+        ``(log10(array_in) - log10(lo)) / (log10(hi) - log10(lo))``.
+
+    Raises:
+        ValueError: if any value is below 1e-8, if ``validate_for_scaling``
+            rejects the data or bounds, or if ``lo`` or ``hi`` is not
+            positive.
+    """
+    # need to account for log domain
+    epsilon = 1e-8
+    if np.any(array_in < epsilon):
+        raise ValueError(f"All values must be greater than {epsilon}")
+    if lo is None:
+        lo = np.min(array_in)
+    if hi is None:
+        hi = np.max(array_in)
+    validate_for_scaling(array_in, lo, hi)
+    # explicitly supplied bounds are not covered by the epsilon check on the
+    # data; non-positive bounds would silently turn the result into NaN/inf
+    if lo <= 0 or hi <= 0:
+        raise ValueError("Bounds lo and hi must be positive for log scaling")
+    log_lo = np.log10(lo)
+    return (np.log10(array_in) - log_lo) / (np.log10(hi) - log_lo)
+
+
+def scale_log2(array_in, lo=None, hi=None):
+    """Scale ``array_in`` with ``log10(9 * (x - lo) / (hi - lo) + 1)``.
+
+    ``lo`` maps to 0 and ``hi`` maps to 1.
+
+    Args:
+        array_in: 1D data to scale.
+        lo: lower bound; defaults to the minimum of ``array_in``.
+        hi: upper bound; defaults to the maximum of ``array_in``.
+
+    Raises:
+        ValueError: if ``validate_for_scaling`` rejects the data or bounds.
+    """
+    lo = np.min(array_in) if lo is None else lo
+    hi = np.max(array_in) if hi is None else hi
+    validate_for_scaling(array_in, lo, hi)
+    offset_from_lo = array_in - lo
+    span = hi - lo
+    return np.log10(9 * offset_from_lo / span + 1)
+
+
+def scale_power(array_in, lo=None, hi=None):
+    """Scale ``array_in`` on a power-of-ten axis so ``lo`` maps to 0 and ``hi`` to 1.
+
+    Args:
+        array_in: 1D data to scale.
+        lo: lower bound; defaults to the minimum of ``array_in``.
+        hi: upper bound; defaults to the maximum of ``array_in``.
+
+    Returns:
+        ``(10**array_in - 10**lo) / (10**hi - 10**lo)``.
+
+    Raises:
+        ValueError: if ``validate_for_scaling`` rejects the data or bounds.
+    """
+    if lo is None:
+        lo = np.min(array_in)
+    if hi is None:
+        hi = np.max(array_in)
+    validate_for_scaling(array_in, lo, hi)
+    # A float base keeps the powers in floating point: with an integer base,
+    # integer data/bounds raise on negative exponents and overflow silently
+    # on large ones.
+    pow_lo = np.power(10.0, lo)
+    pow_hi = np.power(10.0, hi)
+    return (np.power(10.0, array_in) - pow_lo) / (pow_hi - pow_lo)
+
+
+def scale_power2(array_in, lo=None, hi=None):
+    """Scale ``array_in`` with ``(10 ** ((x - lo) / (hi - lo)) - 1) / 9``.
+
+    ``lo`` maps to 0 and ``hi`` maps to 1.
+
+    Args:
+        array_in: 1D data to scale.
+        lo: lower bound; defaults to the minimum of ``array_in``.
+        hi: upper bound; defaults to the maximum of ``array_in``.
+
+    Raises:
+        ValueError: if ``validate_for_scaling`` rejects the data or bounds.
+    """
+    lo = np.min(array_in) if lo is None else lo
+    hi = np.max(array_in) if hi is None else hi
+    validate_for_scaling(array_in, lo, hi)
+    linear_fraction = (array_in - lo) / (hi - lo)
+    return 1 / 9 * (np.power(10, linear_fraction) - 1)
+
+
+def unscale_linear(array_in, lo, hi):
+    """Map linearly scaled values back to the ``[lo, hi]`` range.
+
+    Args:
+        array_in: scaled values (0 corresponds to ``lo``, 1 to ``hi``).
+        lo: lower bound used when scaling.
+        hi: upper bound used when scaling.
+
+    Returns:
+        ``array_in * (hi - lo) + lo``, computed in floating point.
+    """
+    span = hi - lo
+    # dividing by 1.0 forces a float result even for integer input
+    stretched = array_in * span / 1.0
+    return stretched + lo
+
+
+def unscale_log(array_in, lo, hi):
+    """Map log-scaled values back to the ``[lo, hi]`` range.
+
+    Args:
+        array_in: scaled values (0 corresponds to ``lo``, 1 to ``hi``).
+        lo: lower bound used when scaling.
+        hi: upper bound used when scaling.
+
+    Returns:
+        ``lo * (hi / lo) ** array_in``.
+    """
+    bound_ratio = hi / lo
+    growth = np.power(bound_ratio, array_in)
+    return lo * growth
+
+
+def unscale_log2(array_in, lo=None, hi=None):
+    """Map values scaled with the "Log2" form back to the ``[lo, hi]`` range.
+
+    Args:
+        array_in: scaled values (0 corresponds to ``lo``, 1 to ``hi``).
+        lo: lower bound used when scaling; required despite the default.
+        hi: upper bound used when scaling; required despite the default.
+
+    Returns:
+        ``(10 ** array_in - 1) * (hi - lo) / 9 + lo``.
+
+    Raises:
+        TypeError: if ``lo`` or ``hi`` is left as None.
+    """
+    # The None defaults are kept so the signature stays unchanged, but the
+    # arithmetic needs real bounds; fail with a clear message instead of an
+    # opaque NoneType operand error.
+    if lo is None or hi is None:
+        raise TypeError("unscale_log2 requires both lo and hi bounds")
+    return (np.power(10, array_in / 1.0) - 1) * (hi - lo) / 9.0 + lo
+
+
+def unscale_power(array_in, lo, hi):
+    """Map values scaled with the "Power" form back to the ``[lo, hi]`` range.
+
+    Args:
+        array_in: scaled values (0 corresponds to ``lo``, 1 to ``hi``).
+        lo: lower bound used when scaling.
+        hi: upper bound used when scaling.
+
+    Returns:
+        ``log10(array_in * (10**hi - 10**lo) + 10**lo)``.
+    """
+    # A float base keeps the powers in floating point: with an integer base,
+    # integer bounds raise on negative exponents and overflow silently on
+    # large ones.
+    pow_lo = np.power(10.0, lo)
+    pow_hi = np.power(10.0, hi)
+    return np.log10((array_in / 1.0) * (pow_hi - pow_lo) + pow_lo)
+
+
+def unscale_power2(array_in, lo, hi):
+    """Map values scaled with the "Power2" form back to the ``[lo, hi]`` range.
+
+    Args:
+        array_in: scaled values (0 corresponds to ``lo``, 1 to ``hi``).
+        lo: lower bound used when scaling.
+        hi: upper bound used when scaling.
+
+    Returns:
+        ``log10(9 * array_in + 1) * (hi - lo) + lo``.
+    """
+    log_argument = 9.0 * array_in / 1.0 + 1
+    linear_fraction = np.log10(log_argument)
+    return linear_fraction * (hi - lo) + lo
+
+
+class BaseScaler:
+    """Common base for the scaler classes defined in this module.
+
+    Provides the scikit-learn style transformer methods ``fit``,
+    ``fit_transform``, ``transform`` and ``inverse_transform``. Subclasses
+    supply the last two; this class is not meant to be instantiated directly.
+    """
+
+    def fit(self, X: np.ndarray):
+        """Store the minimum and maximum of ``X`` as ``lo_`` / ``hi_``; return self."""
+        self.lo_, self.hi_ = np.min(X), np.max(X)
+        return self
+
+    def fit_transform(self, X: np.ndarray) -> np.ndarray:
+        """Fit the bounds on ``X`` and return ``X`` transformed with them."""
+        fitted = self.fit(X)
+        return fitted.transform(X)
+
+    def transform(self, X: np.ndarray) -> np.ndarray:
+        """Scale ``X``; must be provided by subclasses."""
+        raise NotImplementedError
+
+    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
+        """Undo the scaling of ``X``; must be provided by subclasses."""
+        raise NotImplementedError
+
+
+class LinearScaler(BaseScaler):
+    """Scaler that applies ``scale_linear`` / ``unscale_linear`` with the fitted bounds."""
+
+    def transform(self, X: np.ndarray) -> np.ndarray:
+        """Scale ``X`` using the bounds stored by ``fit``."""
+        return scale_linear(X, lo=self.lo_, hi=self.hi_)
+
+    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
+        """Map scaled ``X`` back using the bounds stored by ``fit``."""
+        return unscale_linear(X, lo=self.lo_, hi=self.hi_)
+
+
+class LogScaler(BaseScaler):
+    """Scaler that applies ``scale_log`` / ``unscale_log`` with the fitted bounds."""
+
+    def transform(self, X: np.ndarray) -> np.ndarray:
+        """Scale ``X`` using the bounds stored by ``fit``."""
+        return scale_log(X, lo=self.lo_, hi=self.hi_)
+
+    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
+        """Map scaled ``X`` back using the bounds stored by ``fit``."""
+        return unscale_log(X, lo=self.lo_, hi=self.hi_)
+
+
+class LogScaler2(BaseScaler):
+    """Scaler that applies ``scale_log2`` / ``unscale_log2`` with the fitted bounds."""
+
+    def transform(self, X: np.ndarray) -> np.ndarray:
+        """Scale ``X`` using the bounds stored by ``fit``."""
+        return scale_log2(X, lo=self.lo_, hi=self.hi_)
+
+    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
+        """Map scaled ``X`` back using the bounds stored by ``fit``."""
+        return unscale_log2(X, lo=self.lo_, hi=self.hi_)
+
+
+class PowerScaler(BaseScaler):
+    """Scaler that applies ``scale_power`` / ``unscale_power`` with the fitted bounds."""
+
+    def transform(self, X: np.ndarray) -> np.ndarray:
+        """Scale ``X`` using the bounds stored by ``fit``."""
+        return scale_power(X, lo=self.lo_, hi=self.hi_)
+
+    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
+        """Map scaled ``X`` back using the bounds stored by ``fit``."""
+        return unscale_power(X, lo=self.lo_, hi=self.hi_)
+
+
+class PowerScaler2(BaseScaler):
+    """Scaler that applies ``scale_power2`` / ``unscale_power2`` with the fitted bounds."""
+
+    def transform(self, X: np.ndarray) -> np.ndarray:
+        """Scale ``X`` using the bounds stored by ``fit``."""
+        return scale_power2(X, lo=self.lo_, hi=self.hi_)
+
+    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
+        """Map scaled ``X`` back using the bounds stored by ``fit``."""
+        return unscale_power2(X, lo=self.lo_, hi=self.hi_)
+
+
+# Human-readable scaling option name -> scaler instance. Each instance is
+# created once, when this module is imported, and shared by every lookup.
+map_name_to_scaler = {
+    option_name: scaler_class()
+    for option_name, scaler_class in (
+        ("Linear", LinearScaler),
+        ("Log", LogScaler),
+        ("Log2", LogScaler2),
+        ("Power", PowerScaler),
+        ("Power2", PowerScaler2),
+    )
+}
+
+
+def scale_dataframe(df: pd.DataFrame, scaler: BaseScaler) -> Tuple[pd.DataFrame, dict]:
+    """Scale every column of ``df`` independently with ``scaler``.
+
+    The scaler is re-fitted on each column in turn, so its ``lo_`` / ``hi_``
+    attributes are overwritten on every iteration.
+
+    Args:
+        df: data to scale, one variable per column.
+        scaler: scaler whose ``fit_transform`` is applied to each column.
+
+    Returns:
+        A tuple ``(scaled_df, bounds)``: a frame with the same columns and
+        index as ``df`` holding the scaled values, and a dict mapping each
+        column name to the ``(lo_, hi_)`` pair fitted for that column.
+    """
+    bounds = {}
+    # start from an all-NaN frame with the same labels, then fill it in
+    # one column at a time
+    result = pd.DataFrame(np.nan, columns=df.columns, index=df.index)
+
+    for column in df:
+        scaled_column = scaler.fit_transform(df[column])
+        bounds[column] = (scaler.lo_, scaler.hi_)
+        result.loc[:, column] = scaled_column
+
+    return result, bounds