howso/utilities/feature_attributes/pandas.py (4 changes: 3 additions & 1 deletion)
@@ -718,9 +718,11 @@ def _infer_floating_point_attributes(self, feature_name: str) -> dict:

         # Determine number of decimal places using
         # np.format_float_positional to handle scientific notation.
+        # Convert to numpy array for faster iteration
+        col_array = col.to_numpy()
         decimals = max([
             len((str(np.format_float_positional(r))).split('.')[1])
-            for r in col
+            for r in col_array
         ])

         # specify decimal place. Proceed with training but issue a warning.
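A note on the technique: `np.format_float_positional` renders a float without scientific notation, so the length of the fractional part can be measured directly, and iterating the raw ndarray avoids pandas' per-element boxing. A minimal standalone sketch with invented sample values:

```python
import numpy as np
import pandas as pd

# Hypothetical column: str(1e-05) would give '1e-05', but
# np.format_float_positional expands it to '0.00001'.
col = pd.Series([0.5, 1.25, 1e-05])

# to_numpy() lets the comprehension iterate plain C doubles
# instead of going through pandas' Series protocol.
col_array = col.to_numpy()
decimals = max(
    len(np.format_float_positional(r).split('.')[1])
    for r in col_array
)
print(decimals)  # 5
```

The `str()` wrapper in the original is redundant (`format_float_positional` already returns a string) but harmless.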
howso/utilities/feature_attributes/relational.py (23 changes: 14 additions & 9 deletions)
@@ -878,24 +878,29 @@ def _infer_feature_bounds(  # noqa: C901
             # This loop grabs all the distinct values, then converts
             # them according to the `format_dt` to a proper datetime
             # instance, then compares them to find min and max values.
-            min_date_obj = datetime.datetime.max
-            max_date_obj = datetime.datetime.min

             try:
                 unique_values = self._get_unique_values(feature_name)
+                # Collect all date objects first, then find min/max once
+                # (more efficient than comparing in loop, especially for many values)
+                date_objects = []
                 # The comma in this loop is necessary since
                 # unique_values is a list of sqlalchemy Row values
                 for dt_str, in unique_values:
                     # Parse using the `format_dt` into a datetime
                     if dt_str:  # skip any empty values
                         date_obj = datetime.datetime.strptime(dt_str, format_dt)
-                        min_date_obj = min(min_date_obj, date_obj)
-                        max_date_obj = max(max_date_obj, date_obj)
-                    else:
-                        warnings.warn(
-                            f'Cannot guess the bounds for feature '
-                            f'"{feature_name}" without samples.')
-                        return None
+                        date_objects.append(date_obj)
+                # If no valid dates were found, warn and return None
+                if not date_objects:
+                    warnings.warn(
+                        f'Cannot guess the bounds for feature '
+                        f'"{feature_name}" without samples.')
+                    return None
+
+                # Compute min/max from collected date objects
+                min_date_obj = min(date_objects)
+                max_date_obj = max(date_objects)
             except Exception:  # noqa: Intentionally broad
                 warnings.warn(
                     f'Feature "{feature_name}" does not match the '
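Besides doing one `min()`/`max()` pass instead of two comparisons per row, the rewrite changes behavior slightly: a single empty value no longer aborts with a warning; only the all-empty case does. A self-contained sketch of the new flow, with an invented format string and sample rows:

```python
import datetime
import warnings

format_dt = '%Y-%m-%d'  # illustrative format, not from the PR
unique_values = [('2021-03-01',), ('',), ('2020-07-15',)]  # Row-like 1-tuples

date_objects = []
for dt_str, in unique_values:  # trailing comma unpacks each 1-tuple
    if dt_str:  # empty values are now skipped rather than fatal
        date_objects.append(datetime.datetime.strptime(dt_str, format_dt))

if not date_objects:
    warnings.warn('Cannot guess the bounds without samples.')
else:
    min_date_obj = min(date_objects)
    max_date_obj = max(date_objects)
    print(min_date_obj, max_date_obj)
    # 2020-07-15 00:00:00 2021-03-01 00:00:00
```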
howso/utilities/feature_attributes/time_series.py (58 changes: 41 additions & 17 deletions)
@@ -32,7 +32,15 @@

 def _apply_date_to_epoch(df: pd.DataFrame, feature_name: str, dt_format: str):
     """Internal function to aid multiprocessing of series feature attributes."""
-    return df[feature_name].apply(lambda x: date_to_epoch(x, dt_format))
+    series = df[feature_name]
+    # Optimize for datetime64 dtypes: use vectorized conversion to epoch
+    if pd.api.types.is_datetime64_any_dtype(series.dtype):
+        # Convert datetime64 to epoch seconds (vectorized, much faster than apply)
+        epoch_zero = pd.Timestamp('1970-01-01')
+        return (series - epoch_zero).dt.total_seconds()
+    else:
+        # Fall back to apply for string dates or other types
+        return series.apply(lambda x: date_to_epoch(x, dt_format))


 class InferFeatureAttributesTimeSeries:
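The fast path works because subtracting a reference `Timestamp` from a datetime64 column yields a timedelta64 column, and `.dt.total_seconds()` on that is the epoch value, all computed column-at-a-time. A sketch of the branch, assuming a tz-naive column (a tz-aware one would need `tz_localize(None)` or a tz-aware epoch reference first); the `strptime` lambda is a stand-in for the library's `date_to_epoch` helper:

```python
from datetime import datetime

import pandas as pd

# Illustrative frame; in the PR the column comes from a chunked DataFrame.
df = pd.DataFrame({'ts': pd.to_datetime(['2020-01-01', '2020-01-02 06:00:00'])})
series = df['ts']

if pd.api.types.is_datetime64_any_dtype(series.dtype):
    # Whole-column arithmetic instead of one Python call per row.
    epoch_zero = pd.Timestamp('1970-01-01')
    epochs = (series - epoch_zero).dt.total_seconds()
else:
    # String columns still go through a scalar parser row by row.
    epochs = series.apply(lambda x: datetime.strptime(x, '%Y-%m-%d').timestamp())

print(epochs.tolist())  # [1577836800.0, 1577944800.0]
```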
@@ -140,6 +148,14 @@ def _infer_delta_min_max_from_chunk(  # noqa: C901
         derived_orders = dict()

         time_feature_deltas = None
+        # Cache groupby object for id_feature_name to avoid recreating it for each feature
+        cached_groupby = None
+        if id_feature_name:
+            if isinstance(id_feature_name, list):
+                cached_groupby = chunk.groupby(id_feature_name)
+            elif isinstance(id_feature_name, str):
+                cached_groupby = chunk.groupby([id_feature_name])
+
         for f_name in feature_names:
             if features[f_name].get('data_type') in {"json", "yaml", "amalgam", "string_mixable"}:
                 # continuous semi-structured data should not infer these derived values
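Caching pays off because the expensive part of a groupby, hashing and bucketing the ID keys, depends only on the ID columns; selecting a different value column off the same `DataFrameGroupBy` is cheap. A small sketch of the reuse pattern with invented column names:

```python
import pandas as pd

chunk = pd.DataFrame({
    'id': ['a', 'a', 'a', 'b', 'b'],
    'x':  [1.0, 3.0, 6.0, 10.0, 11.0],
    'y':  [5.0, 5.5, 7.0, 2.0, 4.0],
})

# Group once; the key hashing/bucketing happens here.
cached_groupby = chunk.groupby(['id'])

# Per-feature loops then only pay for the diff itself.
for f_name in ['x', 'y']:
    deltas = cached_groupby[f_name].diff(1)
    print(f_name, deltas.tolist())
# x [nan, 2.0, 3.0, nan, 1.0]
# y [nan, 0.5, 1.5, nan, 2.0]
```

Normalizing the str case to a one-element list up front also means later consumers only need to check `cached_groupby is not None` rather than re-dispatching on type.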
Expand Down Expand Up @@ -226,7 +242,9 @@ def _infer_delta_min_max_from_chunk( # noqa: C901
future.cancel()
continue

df_c[f_name] = pd.concat(temp_results)
# Concatenate all results at once (more efficient than multiple concats)
if temp_results:
df_c[f_name] = pd.concat(temp_results, ignore_index=True)
else:
try:
df_c[f_name] = _apply_date_to_epoch(df_c, f_name, dt_format)
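The truthiness guard matters because `pd.concat` raises `ValueError: No objects to concatenate` on an empty list, so an empty `temp_results` now leaves the column untouched instead of crashing. A tiny illustration:

```python
import pandas as pd

pieces = [pd.Series([1.0, 2.0]), pd.Series([3.0])]

if pieces:  # pd.concat([]) would raise ValueError
    combined = pd.concat(pieces, ignore_index=True)
    print(combined.index.tolist())  # [0, 1, 2] rather than [0, 1, 0]
```

A single concat of k pieces also allocates the result once, whereas concatenating incrementally re-copies earlier rows each time.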
@@ -255,10 +273,9 @@ def _infer_delta_min_max_from_chunk(  # noqa: C901

             else:
                 # Use pandas' diff() to pull all the deltas for this feature
-                if isinstance(id_feature_name, list):
-                    deltas = chunk.groupby(id_feature_name)[f_name].diff(1)
-                elif isinstance(id_feature_name, str):
-                    deltas = chunk.groupby([id_feature_name])[f_name].diff(1)
+                # Use cached groupby if available (more efficient than recreating for each feature)
+                if cached_groupby is not None:
+                    deltas = cached_groupby[f_name].diff(1)
                 else:
                     deltas = chunk[f_name].diff(1)

@@ -286,22 +303,29 @@

                     # compute each 1st order rate as: delta x / delta time
                     # higher order rates as: delta previous rate / delta time
-                    rates = [
-                        dx / (dt if dt != 0 else SMALLEST_TIME_DELTA)
-                        for dx, dt in zip(rates, time_feature_deltas)
-                    ]
-
-                    # remove NaNs
-                    no_nan_rates = [x for x in rates if pd.isna(x) is False]
+                    # Vectorized computation using numpy for better performance
+                    rates_array = np.asarray(rates)
+                    time_deltas_array = np.asarray(time_feature_deltas)
+                    # Avoid division by zero
+                    time_deltas_safe = np.where(
+                        time_deltas_array != 0,
+                        time_deltas_array,
+                        SMALLEST_TIME_DELTA
+                    )
+                    rates = rates_array / time_deltas_safe
+
+                    # remove NaNs using numpy boolean indexing (faster than list comprehension)
+                    no_nan_mask = ~np.isnan(rates)
+                    no_nan_rates = rates[no_nan_mask]
                     if len(no_nan_rates) == 0:
                         continue

                     # TODO: 15550: support user-specified min/max values
-                    rate_max = max(no_nan_rates)
+                    rate_max = float(np.max(no_nan_rates))
                     rate_max = rate_max * e if rate_max > 0 else rate_max / e
                     features[f_name]['time_series']['rate_max'].append(rate_max)

-                    rate_min = min(no_nan_rates)
+                    rate_min = float(np.min(no_nan_rates))
                     rate_min = rate_min / e if rate_min > 0 else rate_min * e
                     features[f_name]['time_series']['rate_min'].append(rate_min)
                 else:  # 'type' == "delta"
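Both list comprehensions collapse into array expressions: `np.where` patches zero time deltas before the division happens, and a boolean mask drops NaNs. A self-contained sketch; the `SMALLEST_TIME_DELTA` value is an illustrative stand-in, not the module's constant:

```python
import numpy as np

SMALLEST_TIME_DELTA = 1e-9  # stand-in for the module constant

rates = [2.0, np.nan, 4.0, 1.0]             # feature deltas
time_feature_deltas = [1.0, 2.0, 0.0, 4.0]  # time deltas (one zero)

rates_array = np.asarray(rates)
time_deltas_array = np.asarray(time_feature_deltas)

# Substitute exact zeros so the division never produces inf.
time_deltas_safe = np.where(time_deltas_array != 0,
                            time_deltas_array,
                            SMALLEST_TIME_DELTA)
rates_out = rates_array / time_deltas_safe

# Boolean indexing drops NaNs without a Python-level loop.
no_nan_rates = rates_out[~np.isnan(rates_out)]
print(no_nan_rates)                      # approx. [2.0e+00 4.0e+09 2.5e-01]
rate_max = float(np.max(no_nan_rates))   # 4000000000.0
```

One nuance: `np.isnan` requires a numeric array, whereas the old `pd.isna(x) is False` check also tolerated `None` entries, so this assumes the accumulated rates are floats.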
@@ -319,11 +343,11 @@ def _infer_delta_min_max_from_chunk(  # noqa: C901
                     no_nan_deltas: pd.Series = deltas.dropna()
                     if len(no_nan_deltas) == 0:
                         continue
-                    delta_max = max(no_nan_deltas)
+                    delta_max = float(no_nan_deltas.max())
                     delta_max = delta_max * e if delta_max > 0 else delta_max / e
                     features[f_name]['time_series']['delta_max'].append(delta_max)

-                    delta_min = min(no_nan_deltas)
+                    delta_min = float(no_nan_deltas.min())
                     # don't allow the time series time feature to go back in time
                     # TODO: 15550: support user-specified min/max values
                     if f_name == self.time_feature_name:
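The `float()` casts turn the numpy scalars returned by `.max()`/`.min()` into plain Python floats, presumably to keep the emitted feature attributes cleanly serializable. The surrounding `* e` / `/ e` adjustment then widens each observed extreme away from the data by a factor of Euler's number; worked numerically with invented deltas:

```python
from math import e

import pandas as pd

no_nan_deltas = pd.Series([0.5, 2.0, 3.0])  # illustrative deltas

delta_max = float(no_nan_deltas.max())  # 3.0 as a plain float
delta_max = delta_max * e if delta_max > 0 else delta_max / e
print(delta_max)  # ~8.15: positive max pushed further from the data

delta_min = float(no_nan_deltas.min())  # 0.5
delta_min = delta_min / e if delta_min > 0 else delta_min * e
print(delta_min)  # ~0.18: positive min pulled toward zero
```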