Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add first implementation of scaling functions #1197

Draft
wants to merge 14 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 34 additions & 17 deletions foqus_lib/framework/surrogate/keras_nn.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
from pathlib import Path
from tokenize import String

from typing import Tuple
import numpy as np
import pandas as pd
import tensorflow as tf # pylint: disable=import-error
Expand All @@ -52,6 +53,20 @@
from foqus_lib.framework.surrogate.surrogate import surrogate
from foqus_lib.framework.uq.SurrogateParser import SurrogateParser

from foqus_lib.framework.surrogate.scaling import (
BaseScaler,
LinearScaler,
LogScaler,
LogScaler2,
PowerScaler,
PowerScaler2,
map_name_to_scaler,
scale_dataframe,
)

# mapping between the human-readable name for the scaling variant
# and an instance of the corresponding scaler class


# custom class to define Keras NN layers
@tf.keras.utils.register_keras_serializable()
Expand Down Expand Up @@ -293,6 +308,14 @@ def __init__(self, dat=None):
desc="Name of output file for model, should have file extension: .keras",
hint="Enter a custom file name if desired",
)
# add option for normalization_form, make dropdown option
self.options.add(
name="scaling_function",
default="Linear",
dtype=str,
desc="Scaling/normalization function for input data",
validValues=list(map_name_to_scaler.keys()),
)

def run(self):
"""
Expand All @@ -316,6 +339,9 @@ def run(self):
self.msgQueue.put(f"input data columns: {input_data.columns}")
self.msgQueue.put(f"output data columns: {output_data.columns}")

# extract scaling function option, apply it to the input data
# get scaler object

# np.random.seed(46)
# rn.seed(1342)
# tf.random.set_seed(62)
Expand All @@ -341,22 +367,13 @@ def run(self):
xdata = input_data
zdata = output_data

xdata_bounds = {i: (xdata[i].min(), xdata[i].max()) for i in xdata} # x bounds
zdata_bounds = {j: (zdata[j].min(), zdata[j].max()) for j in zdata} # z bounds

# normalize data using Linear form
# users can normalize with any allowed form # manually, and then pass the
# appropriate flag to FOQUS from the allowed list:
# ["Linear", "Log", "Power", "Log 2", "Power 2"] - see the documentation for
# details on the scaling formulations
xmax, xmin = xdata.max(axis=0), xdata.min(axis=0)
zmax, zmin = zdata.max(axis=0), zdata.min(axis=0)
xdata, zdata = np.array(xdata), np.array(zdata)
for i in range(len(xdata)):
for j in range(len(xlabels)):
xdata[i, j] = (xdata[i, j] - xmin[j]) / (xmax[j] - xmin[j])
for j in range(len(zlabels)):
zdata[i, j] = (zdata[i, j] - zmin[j]) / (zmax[j] - zmin[j])
scaling_func_option = self.options["scaling_function"].value

scaler_instance = map_name_to_scaler[scaling_func_option]
xdata, xdata_bounds = scale_dataframe(xdata, scaler_instance)
zdata, zdata_bounds = scale_dataframe(zdata, scaler_instance)

print(f"using scaling function: {scaling_func_option}")

# method to create model
def create_model():
Expand All @@ -370,7 +387,7 @@ def create_model():
input_bounds=xdata_bounds,
output_bounds=zdata_bounds,
normalized=True,
normalization_form="Linear",
normalization_form=scaling_func_option,
)

outputs = layers(inputs) # use network as function outputs = f(inputs)
Expand Down
50 changes: 32 additions & 18 deletions foqus_lib/framework/surrogate/pytorch_nn.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,16 @@
# from foqus_lib.framework.graph.graph import Graph
from foqus_lib.framework.surrogate.surrogate import surrogate
from foqus_lib.framework.uq.SurrogateParser import SurrogateParser
from foqus_lib.framework.surrogate.scaling import (
BaseScaler,
LinearScaler,
LogScaler,
LogScaler2,
PowerScaler,
PowerScaler2,
map_name_to_scaler,
scale_dataframe,
)

# custom class to define PyTorch NN layers
np.random.seed(46)
Expand Down Expand Up @@ -284,6 +294,13 @@ def __init__(self, dat=None):
desc="Name of output file for model, should have file extension: .pt",
hint="Enter a custom file name if desired",
)
self.options.add(
name="scaling_function",
default="Linear",
dtype=str,
desc="Scaling/normalization function for input data",
validValues=["Linear", "Log", "Log2", "Power", "Power2"],
)

def run(self):
"""
Expand Down Expand Up @@ -326,22 +343,16 @@ def run(self):
zlabels = list(output_data.columns)
xdata = input_data
zdata = output_data
xdata_bounds = {i: (xdata[i].min(), xdata[i].max()) for i in xdata} # x bounds
zdata_bounds = {j: (zdata[j].min(), zdata[j].max()) for j in zdata} # z bounds

# normalize data using Linear form, pass as custom string and parse with SymPy
# users can normalize with any allowed form # manually, and then pass the
# appropriate flag to FOQUS from the allowed list:
# ["Linear", "Log", "Power", "Log 2", "Power 2", "Custom] - see the
# documentation for details on the scaling formulations
xmax, xmin = xdata.max(axis=0), xdata.min(axis=0)
zmax, zmin = zdata.max(axis=0), zdata.min(axis=0)
xdata, zdata = np.array(xdata), np.array(zdata)
for i in range(len(xdata)):
for j in range(len(xlabels)):
xdata[i, j] = (xdata[i, j] - xmin[j]) / (xmax[j] - xmin[j])
for j in range(len(zlabels)):
zdata[i, j] = (zdata[i, j] - zmin[j]) / (zmax[j] - zmin[j])
# xdata_bounds = {i: (xdata[i].min(), xdata[i].max()) for i in xdata} # x bounds
# zdata_bounds = {j: (zdata[j].min(), zdata[j].max()) for j in zdata} # z bounds

scaling_func_option = self.options["scaling_function"].value

scaler_instance = map_name_to_scaler[scaling_func_option]
xdata, xdata_bounds = scale_dataframe(xdata, scaler_instance)
zdata, zdata_bounds = scale_dataframe(zdata, scaler_instance)

print(f"using scaling function: {scaling_func_option}")

model_data = np.concatenate(
(xdata, zdata), axis=1
Expand All @@ -353,8 +364,11 @@ def run(self):

# raise exception here after BPC position
# create model
x_train = torch.from_numpy(xdata).float().to(device)
z_train = torch.from_numpy(zdata).float().to(device)

# need to convert xdata to a numpy array for the below to work
# otherwise causes TypeError: expected np.ndarray (got DataFrame)
x_train = torch.from_numpy(xdata.to_numpy()).float().to(device)
z_train = torch.from_numpy(zdata.to_numpy()).float().to(device)

# print type at this point
# can also print inside create_model
Expand Down
202 changes: 202 additions & 0 deletions foqus_lib/framework/surrogate/scaling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
import copy
import json
import logging
import math
from collections import OrderedDict

import numpy as np
import pandas as pd
from typing import Tuple


def validate_for_scaling(array_in, lo, hi) -> None:
    """Raise ``ValueError`` if ``array_in`` / ``lo`` / ``hi`` cannot be scaled.

    The checks run in this order, and the first one that fails raises:

    1. every element of ``array_in`` is finite (no NaN, no inf);
    2. ``array_in`` is one-dimensional;
    3. ``array_in`` holds at least two values;
    4. ``lo`` and ``hi`` are not numerically equal (``np.allclose``);
    5. ``check_under_or_overflow`` accepts ``array_in``.

    Returns:
        None. The function is called only for its validation side effect.
    """
    all_finite = np.all(np.isfinite(array_in))
    if not all_finite:
        raise ValueError("Input data cannot contain NaN or inf values")

    if array_in.ndim != 1:
        raise ValueError("Only 1D arrays supported")

    if array_in.size < 2:
        raise ValueError("Array must have at least 2 values")

    # Equal bounds would make the scaling denominators vanish.
    bounds_coincide = np.allclose(lo, hi)
    if bounds_coincide:
        raise ValueError("Array must contain non-identical values")

    within_dtype_range = check_under_or_overflow(array_in)
    if not within_dtype_range:
        raise ValueError("Array contains under/overflow values for dtype")

Check warning on line 22 in foqus_lib/framework/surrogate/scaling.py

View check run for this annotation

Codecov / codecov/patch

foqus_lib/framework/surrogate/scaling.py#L22

Added line #L22 was not covered by tests


def check_under_or_overflow(arr):
    """Report whether every value lies strictly inside its dtype's range.

    Integer dtypes are compared against ``np.iinfo`` limits and floating
    dtypes against ``np.finfo`` limits. Values equal to a limit count as
    out of range because both comparisons are strict.

    Returns:
        A truthy value when all elements are strictly between the dtype's
        minimum and maximum, falsy otherwise.

    Raises:
        ValueError: if the dtype is neither integer nor floating.
    """
    dtype = arr.dtype
    if np.issubdtype(dtype, np.integer):
        limits = np.iinfo(dtype)
    elif np.issubdtype(dtype, np.floating):
        limits = np.finfo(dtype)
    else:
        raise ValueError("Unsupported data type")

    upper = limits.max
    lower = limits.min
    below_upper = np.all(arr < upper)
    above_lower = np.all(arr > lower)
    return below_upper & above_lower


def scale_linear(array_in, lo=None, hi=None):
    """Linearly rescale ``array_in`` so that ``lo`` maps to 0 and ``hi`` to 1.

    Args:
        array_in: 1D numeric array to scale.
        lo: lower bound; defaults to ``np.min(array_in)``.
        hi: upper bound; defaults to ``np.max(array_in)``.

    Returns:
        ``(array_in - lo) / (hi - lo)``.

    Raises:
        ValueError: propagated from ``validate_for_scaling`` when the input
            or the bounds are unsuitable.
    """
    if lo is None:
        lo = np.min(array_in)
    if hi is None:
        hi = np.max(array_in)
    # validate_for_scaling rejects numerically equal bounds, so the
    # denominator below is never zero and no special case is needed.
    validate_for_scaling(array_in, lo, hi)
    return (array_in - lo) / (hi - lo)


def scale_log(array_in, lo=None, hi=None):
    """Scale ``array_in`` on a base-10 logarithmic axis between ``lo`` and ``hi``.

    Args:
        array_in: 1D numeric array; every value must be at least 1e-8.
        lo: lower bound; defaults to ``np.min(array_in)``.
        hi: upper bound; defaults to ``np.max(array_in)``.

    Returns:
        ``(log10(array_in) - log10(lo)) / (log10(hi) - log10(lo))``.

    Raises:
        ValueError: if any value is below 1e-8, or if
            ``validate_for_scaling`` rejects the input.
    """
    # Guard the logarithm's domain before anything else is computed.
    epsilon = 1e-8
    if np.any(array_in < epsilon):
        raise ValueError(f"All values must be greater than {epsilon}")

    lo = np.min(array_in) if lo is None else lo
    hi = np.max(array_in) if hi is None else hi
    validate_for_scaling(array_in, lo, hi)

    log_lo = np.log10(lo)
    return (np.log10(array_in) - log_lo) / (np.log10(hi) - log_lo)


def scale_log2(array_in, lo=None, hi=None):
    """Apply the "Log 2" scaling: ``log10(9 * (x - lo) / (hi - lo) + 1)``.

    Args:
        array_in: 1D numeric array to scale.
        lo: lower bound; defaults to ``np.min(array_in)``.
        hi: upper bound; defaults to ``np.max(array_in)``.

    Raises:
        ValueError: propagated from ``validate_for_scaling``.
    """
    lo = np.min(array_in) if lo is None else lo
    hi = np.max(array_in) if hi is None else hi
    validate_for_scaling(array_in, lo, hi)

    # 0 at x == lo and 9 at x == hi, so the log argument spans [1, 10].
    stretched = 9 * (array_in - lo) / (hi - lo)
    return np.log10(stretched + 1)


def scale_power(array_in, lo=None, hi=None):
    """Apply the "Power" scaling: ``(10**x - 10**lo) / (10**hi - 10**lo)``.

    Args:
        array_in: 1D numeric array to scale.
        lo: lower bound; defaults to ``np.min(array_in)``.
        hi: upper bound; defaults to ``np.max(array_in)``.

    Returns:
        The scaled values as floating-point numbers.

    Raises:
        ValueError: propagated from ``validate_for_scaling``.
    """
    if lo is None:
        lo = np.min(array_in)
    if hi is None:
        hi = np.max(array_in)
    validate_for_scaling(array_in, lo, hi)

    # Use a float base: with an integer base and integer data, np.power
    # raises on negative exponents and wraps around silently once the
    # result no longer fits the integer dtype.
    pow_lo = np.power(10.0, lo)
    return (np.power(10.0, array_in) - pow_lo) / (np.power(10.0, hi) - pow_lo)


def scale_power2(array_in, lo=None, hi=None):
    """Apply the "Power 2" scaling: ``(10**((x - lo) / (hi - lo)) - 1) / 9``.

    Args:
        array_in: 1D numeric array to scale.
        lo: lower bound; defaults to ``np.min(array_in)``.
        hi: upper bound; defaults to ``np.max(array_in)``.

    Raises:
        ValueError: propagated from ``validate_for_scaling``.
    """
    lo = np.min(array_in) if lo is None else lo
    hi = np.max(array_in) if hi is None else hi
    validate_for_scaling(array_in, lo, hi)

    # Linear position of each value between the bounds.
    exponent = (array_in - lo) / (hi - lo)
    return 1 / 9 * (np.power(10, exponent) - 1)


def unscale_linear(array_in, lo, hi):
    """Undo the linear scaling: 0 maps back to ``lo`` and 1 back to ``hi``."""
    span = hi - lo
    return array_in * span / 1.0 + lo


def unscale_log(array_in, lo, hi):
    """Undo the logarithmic scaling: returns ``lo * (hi / lo) ** array_in``."""
    ratio = hi / lo
    return lo * np.power(ratio, array_in)


def unscale_log2(array_in, lo=None, hi=None):
    """Undo the "Log 2" scaling: ``(10**y - 1) * (hi - lo) / 9 + lo``.

    Args:
        array_in: scaled values.
        lo: lower bound of the original data. Required despite the ``None``
            default, which is kept only for signature compatibility.
        hi: upper bound of the original data. Required, as for ``lo``.

    Raises:
        TypeError: if ``lo`` or ``hi`` is left as ``None``.
    """
    # The None defaults cannot be used in the arithmetic below; fail with
    # an explicit message instead of an opaque NoneType operand error.
    if lo is None or hi is None:
        raise TypeError("unscale_log2() requires both lo and hi bounds")
    return (np.power(10, array_in / 1.0) - 1) * (hi - lo) / 9.0 + lo


def unscale_power(array_in, lo, hi):
    """Undo the "Power" scaling.

    Returns ``log10(y * (10**hi - 10**lo) + 10**lo)``.

    Args:
        array_in: scaled values.
        lo: lower bound of the original data.
        hi: upper bound of the original data.
    """
    # Use a float base so integer bounds work: np.power with an integer
    # base rejects negative integer exponents and can overflow silently.
    pow_lo = np.power(10.0, lo)
    pow_hi = np.power(10.0, hi)
    return np.log10((array_in / 1.0) * (pow_hi - pow_lo) + pow_lo)


def unscale_power2(array_in, lo, hi):
    """Undo the "Power 2" scaling: ``log10(9 * y + 1) * (hi - lo) + lo``."""
    unit_position = np.log10(9.0 * array_in / 1.0 + 1)
    return unit_position * (hi - lo) + lo


class BaseScaler:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I may be misunderstanding the usage of the annotations here, but what is the outcome of this class? It seems that arrays that are transformed will raise exceptions for any input.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll add a comment to explain the purpose of the BaseScaler class; transform() and inverse_transform() should be implemented by the derived classes, so it raises an error if called from the base class.

"""BaseScaler is the base class for the scaler classes defined
below. It exposes the transformer interface from scikit-learn,
and is not supposed to be instantiated directly."""

def fit(self, X: np.ndarray):
    """Record the minimum and maximum of ``X`` as ``lo_`` and ``hi_``.

    Returns:
        ``self``, so that calls can be chained.
    """
    lower = np.min(X)
    upper = np.max(X)
    self.lo_, self.hi_ = lower, upper
    return self

def fit_transform(self, X: np.ndarray) -> np.ndarray:
    """Fit on ``X`` and return ``X`` transformed with the fitted bounds."""
    fitted = self.fit(X)
    return fitted.transform(X)

def transform(self, X: np.ndarray) -> np.ndarray:
    """Scale ``X``; the base class has no implementation and always raises.

    Raises:
        NotImplementedError: always; the derived scaler classes override it.
    """
    raise NotImplementedError()

Check warning on line 137 in foqus_lib/framework/surrogate/scaling.py

View check run for this annotation

Codecov / codecov/patch

foqus_lib/framework/surrogate/scaling.py#L137

Added line #L137 was not covered by tests

def inverse_transform(self, X: np.ndarray) -> np.ndarray:
    """Unscale ``X``; the base class has no implementation and always raises.

    Raises:
        NotImplementedError: always; the derived scaler classes override it.
    """
    raise NotImplementedError()

Check warning on line 140 in foqus_lib/framework/surrogate/scaling.py

View check run for this annotation

Codecov / codecov/patch

foqus_lib/framework/surrogate/scaling.py#L140

Added line #L140 was not covered by tests


class LinearScaler(BaseScaler):
    """Scaler that delegates to ``scale_linear`` / ``unscale_linear``."""

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Scale ``X`` using the stored ``lo_`` / ``hi_`` bounds."""
        return scale_linear(X, lo=self.lo_, hi=self.hi_)

    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """Map scaled values back using the stored ``lo_`` / ``hi_`` bounds."""
        return unscale_linear(X, lo=self.lo_, hi=self.hi_)

Check warning on line 148 in foqus_lib/framework/surrogate/scaling.py

View check run for this annotation

Codecov / codecov/patch

foqus_lib/framework/surrogate/scaling.py#L148

Added line #L148 was not covered by tests


class LogScaler(BaseScaler):
    """Scaler that delegates to ``scale_log`` / ``unscale_log``."""

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Scale ``X`` using the stored ``lo_`` / ``hi_`` bounds."""
        return scale_log(X, lo=self.lo_, hi=self.hi_)

    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """Map scaled values back using the stored ``lo_`` / ``hi_`` bounds."""
        return unscale_log(X, lo=self.lo_, hi=self.hi_)

Check warning on line 156 in foqus_lib/framework/surrogate/scaling.py

View check run for this annotation

Codecov / codecov/patch

foqus_lib/framework/surrogate/scaling.py#L156

Added line #L156 was not covered by tests


class LogScaler2(BaseScaler):
    """Scaler that delegates to ``scale_log2`` / ``unscale_log2``."""

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Scale ``X`` using the stored ``lo_`` / ``hi_`` bounds."""
        return scale_log2(X, lo=self.lo_, hi=self.hi_)

    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """Map scaled values back using the stored ``lo_`` / ``hi_`` bounds."""
        return unscale_log2(X, lo=self.lo_, hi=self.hi_)

Check warning on line 164 in foqus_lib/framework/surrogate/scaling.py

View check run for this annotation

Codecov / codecov/patch

foqus_lib/framework/surrogate/scaling.py#L164

Added line #L164 was not covered by tests


class PowerScaler(BaseScaler):
    """Scaler that delegates to ``scale_power`` / ``unscale_power``."""

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Scale ``X`` using the stored ``lo_`` / ``hi_`` bounds."""
        return scale_power(X, lo=self.lo_, hi=self.hi_)

    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """Map scaled values back using the stored ``lo_`` / ``hi_`` bounds."""
        return unscale_power(X, lo=self.lo_, hi=self.hi_)

Check warning on line 172 in foqus_lib/framework/surrogate/scaling.py

View check run for this annotation

Codecov / codecov/patch

foqus_lib/framework/surrogate/scaling.py#L172

Added line #L172 was not covered by tests


class PowerScaler2(BaseScaler):
    """Scaler that delegates to ``scale_power2`` / ``unscale_power2``."""

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Scale ``X`` using the stored ``lo_`` / ``hi_`` bounds."""
        return scale_power2(X, lo=self.lo_, hi=self.hi_)

    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """Map scaled values back using the stored ``lo_`` / ``hi_`` bounds."""
        return unscale_power2(X, lo=self.lo_, hi=self.hi_)

Check warning on line 180 in foqus_lib/framework/surrogate/scaling.py

View check run for this annotation

Codecov / codecov/patch

foqus_lib/framework/surrogate/scaling.py#L180

Added line #L180 was not covered by tests


# Lookup from the scaling option name to a scaler object. Each scaler is
# instantiated once, when this module is imported, so every lookup of a
# given name returns the same instance.
map_name_to_scaler = {
    option_name: scaler_class()
    for option_name, scaler_class in (
        ("Linear", LinearScaler),
        ("Log", LogScaler),
        ("Log2", LogScaler2),
        ("Power", PowerScaler),
        ("Power2", PowerScaler2),
    )
}


def scale_dataframe(df: pd.DataFrame, scaler: BaseScaler) -> Tuple[pd.DataFrame, dict]:
    """Scale every column of ``df`` independently with ``scaler``.

    The same ``scaler`` object is refit on each column in turn via
    ``fit_transform``, and its ``lo_`` / ``hi_`` attributes are read
    straight after each fit.

    Args:
        df: frame whose columns are scaled one at a time.
        scaler: object providing ``fit_transform`` and exposing ``lo_`` and
            ``hi_`` afterwards.

    Returns:
        A tuple ``(scaled_df, bounds)``: ``scaled_df`` has the same index
        and columns as ``df``, and ``bounds`` maps each column name to its
        ``(lo_, hi_)`` pair.
    """
    # Start from an all-NaN frame with identical labels, then fill it in.
    result = pd.DataFrame(np.nan, columns=df.columns, index=df.index)
    bounds_by_column = {}

    for label in df:
        scaled_column = scaler.fit_transform(df[label])
        bounds_by_column[label] = (scaler.lo_, scaler.hi_)
        result.loc[:, label] = scaled_column

    return result, bounds_by_column
Loading
Loading