Merged

Changes from all commits (28 commits)
b0edca9
support for logging in `Moon` & `Planet`
sgathrid Oct 15, 2025
65454a6
suppress sklearn deprecation warnings thrown by kmapper + hdbscan
sgathrid Oct 15, 2025
6e28e0f
implement filter functions + fix filter handling in `Galaxy` class
sgathrid Oct 15, 2025
9b80463
add logging to galaxy, remove old print statements
sgathrid Oct 15, 2025
bdc7152
update `Planet.writeParams_toYaml()` to work if a yaml file does NOT a…
sgathrid Oct 15, 2025
ab77b53
fix params yaml creation in `Planet` class
sgathrid Oct 15, 2025
beb39f0
OOPS! Add min-coverage filter for Retire use
sgathrid Oct 15, 2025
07ab478
support for filtering + running curvature on a list of filepaths, in …
sgathrid Oct 15, 2025
a6f87d7
fix bug in `Galaxy.collapse()` preventing a `filter=None` from runnin…
sgathrid Oct 15, 2025
af54e48
update `tqdm` handling in notebooks
sgathrid Oct 15, 2025
2dfa6b3
preference refactor: no need for 2 util files in `thema` module. Main en…
jeremy-wayland Oct 16, 2025
eb0b67f
add collapse to `Thema.galaxy_genesis`
jeremy-wayland Oct 16, 2025
5c0a672
add detailed DEBUG logging for multiverse of objects
sgathrid Oct 20, 2025
4cab13d
add filter support via YAML file
sgathrid Oct 20, 2025
1f3a36c
support graph filtering params in YAML
sgathrid Oct 20, 2025
ee359b2
move filter config into `config.py` for best practices
sgathrid Oct 20, 2025
7a63b91
chore(format): black-format starFilters utilities
jeremy-wayland Oct 21, 2025
9514eb6
fix(planet): robust outDir handling and richer progress logging
jeremy-wayland Oct 21, 2025
e7e0630
refactor(oort): detailed logging, timing, and output delta reporting
jeremy-wayland Oct 21, 2025
f5df11e
feat(galaxy): robust filter setup, safer path handling, and richer co…
jeremy-wayland Oct 21, 2025
e85ebde
feat(geodesics): set default curvature to ollivier_ricci_curvature an…
jeremy-wayland Oct 21, 2025
e366a99
fix(thema): initialize and populate selected_model_files; tidy doc ex…
jeremy-wayland Oct 21, 2025
13e4bf4
refactor(galaxy): prefer callable filter_fn branch and log provided f…
jeremy-wayland Oct 21, 2025
40c9d95
[copilot]: catch `ImportError` on `tqdm` import
sgathrid Oct 21, 2025
7e5f862
[copilot]: remove duplicate import
sgathrid Oct 21, 2025
e7e5ca9
[copilot]: Update yaml logging in thema/multiverse/system/inner/plane…
sgathrid Oct 21, 2025
183bcca
docs(geodesics): expand curvature parameter docstring with explanatio…
sgathrid Oct 21, 2025
2e3f46a
remove redundant comments
sgathrid Oct 21, 2025
87 changes: 87 additions & 0 deletions thema/__init__.py
@@ -27,6 +27,91 @@
from .multiverse.universe.utils.starGraph import starGraph
from .thema import Thema

import logging
import warnings

# Suppress sklearn deprecation warnings about force_all_finite -> ensure_all_finite
warnings.filterwarnings(
"ignore",
message=".*'force_all_finite' was renamed to 'ensure_all_finite'.*",
category=FutureWarning,
module="sklearn.*",
)


def enable_logging(level="INFO"):
"""
Enable thema logging for interactive use (e.g., notebooks).

Parameters
----------
level : str, optional
Logging level ('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL')
Default is 'INFO' for moderate verbosity.
Use 'DEBUG' for detailed operational info.
Use 'WARNING' for warnings and errors only.
Use 'ERROR' for errors only.

Examples
--------
>>> import thema
>>> thema.enable_logging('DEBUG') # Detailed logging
>>> thema.enable_logging('INFO') # Moderate logging
>>> thema.enable_logging('WARNING') # Warnings/errors only
"""
thema_logger = logging.getLogger("thema")

for handler in thema_logger.handlers[:]:
if not isinstance(handler, logging.NullHandler):
thema_logger.removeHandler(handler)

handler = logging.StreamHandler()
formatter = logging.Formatter("%(name)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)
thema_logger.addHandler(handler)

# Set level
level_map = {
"DEBUG": logging.DEBUG,
"INFO": logging.INFO,
"WARNING": logging.WARNING,
"ERROR": logging.ERROR,
"CRITICAL": logging.CRITICAL,
}
log_level = level_map.get(level.upper(), logging.INFO)
thema_logger.setLevel(log_level)

for name in list(logging.Logger.manager.loggerDict.keys()):
if name.startswith("thema."):
child_logger = logging.getLogger(name)
# Reset level to NOTSET so parent logger controls the level
child_logger.setLevel(logging.NOTSET)

thema_logger.propagate = False

print(f"Thema logging enabled at {level.upper()} level")


def disable_logging():
"""
Disable thema logging (return to quiet mode).

Examples
--------
>>> import thema
>>> thema.disable_logging()
"""
thema_logger = logging.getLogger("thema")

for handler in thema_logger.handlers[:]:
if not isinstance(handler, logging.NullHandler):
thema_logger.removeHandler(handler)

thema_logger.setLevel(logging.ERROR)

print("Thema logging disabled (errors only)")


# Package metadata
__version__ = "0.1.3"
__author__ = "Krv-Analytics"
@@ -41,4 +126,6 @@
"Star",
"Galaxy",
"starGraph",
"enable_logging",
"disable_logging",
]
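
The two toggles above are meant to be flipped from an interactive session. A minimal usage sketch (the printed lines come from the print calls in the diff; the rest is illustrative):

import thema

thema.enable_logging("DEBUG")    # prints: Thema logging enabled at DEBUG level
# ... run thema operations; module loggers under the "thema" namespace now
# propagate their records up to the handler configured on the "thema" logger ...
thema.disable_logging()          # prints: Thema logging disabled (errors only)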
24 changes: 24 additions & 0 deletions thema/config.py
@@ -146,3 +146,27 @@ class jmapObservatoryConfig:
star_to_observatory = {
"jmapStar": "jmapObservatoryConfig",
}

# Map from filter YAML tags to filter functions and their parameter names
filter_configs = {
"component_count": {
"function": "component_count_filter",
"params": {"target_components": 1}
},
"component_count_range": {
"function": "component_count_range_filter",
"params": {"min_components": 1, "max_components": 10}
},
"minimum_nodes": {
"function": "minimum_nodes_filter",
"params": {"min_nodes": 3}
},
"minimum_edges": {
"function": "minimum_edges_filter",
"params": {"min_edges": 2}
},
"minimum_unique_items": {
"function": "minimum_unique_items_filter",
"params": {"min_unique_items": 10}
}
}
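
Note that filter_configs only maps YAML tags to function names and default parameters; the actual lookup happens elsewhere in this PR (the `Galaxy` filter handling). A sketch of what such a resolver could look like; the `resolve_filter` helper here is hypothetical and not part of this diff:

from thema.config import filter_configs

def resolve_filter(tag, overrides=None):
    # Hypothetical helper: merge YAML-supplied overrides over the defaults
    # declared in filter_configs and return the target function's name.
    spec = filter_configs[tag]
    params = {**spec["params"], **(overrides or {})}
    return spec["function"], params

fn_name, params = resolve_filter("minimum_nodes", {"min_nodes": 5})
# fn_name == "minimum_nodes_filter", params == {"min_nodes": 5}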
129 changes: 67 additions & 62 deletions thema/multiverse/system/inner/moon.py
@@ -1,9 +1,9 @@
# File: multiverse/system/inner/moon.py
# Last Update: 05/15/24
# Updated By: JW
# Last Update: 10/15/25
# Updated By: SG

import pickle
import warnings
import logging

import category_encoders as ce
import pandas as pd
@@ -12,6 +12,9 @@
from ....core import Core
from . import inner_utils

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())


class Moon(Core):
"""
@@ -122,81 +125,79 @@ def __init__(
self.seed = seed
self.imputeData = None

def fit(self):
"""
Performs the cleaning procedure according to the constructor arguments.
Initializes the imputeData member as a DataFrame, which is a scaled,
numeric, and complete representation of the original raw data set.
# Log initial state
logger.debug(f"Moon initialized with data shape: {self.data.shape}")
logger.debug(f"Drop columns: {self.dropColumns}")
logger.debug(f"Impute columns: {self.imputeColumns}")
logger.debug(f"Impute methods: {self.imputeMethods}")
logger.debug(
f"Encoding: {self.encoding}, Scaler: {self.scaler}, Seed: {self.seed}"
)

Examples
----------
>>> moon = Moon()
>>> moon.fit()
"""
def fit(self):
# Add imputed flags
self.imputeData = inner_utils.add_imputed_flags(self.data, self.imputeColumns)
logger.debug("Added imputed flags to columns")
logger.debug(f"Data shape after adding flags: {self.imputeData.shape}")

self.imputeData = inner_utils.add_imputed_flags(
self.data, self.imputeColumns
)
# Apply imputation
for index, column in enumerate(self.imputeColumns):
impute_function = getattr(inner_utils, self.imputeMethods[index])
self.imputeData[column] = impute_function(
self.data[column], self.seed
self.imputeData[column] = impute_function(self.data[column], self.seed)
logger.debug(
f"Column '{column}' imputed using '{self.imputeMethods[index]}'. "
f"NaNs remaining: {self.imputeData[column].isna().sum()}"
)

self.dropColumns = [
col for col in self.dropColumns if col in self.data.columns
]
# Drop Columns
if not self.dropColumns == []:
self.imputeData = self.data.drop(columns=self.dropColumns)
# Drop specified columns
self.dropColumns = [col for col in self.dropColumns if col in self.data.columns]
if self.dropColumns:
before_drop = self.imputeData.shape
self.imputeData = self.imputeData.drop(columns=self.dropColumns)
logger.debug(
f"Dropped columns: {self.dropColumns}. Shape before: {before_drop}, after: {self.imputeData.shape}"
)

# Drop Rows with Nans
# Drop rows with NaNs
nan_cols = self.imputeData.columns[self.imputeData.isna().any()]
logger.debug(f"Columns with NaN values before dropping rows: {list(nan_cols)}")
self.imputeData.dropna(axis=0, inplace=True)
logger.debug(f"Shape after dropping rows with NaNs: {self.imputeData.shape}")

if type(self.encoding) == str:
# Ensure encoding is a list
if isinstance(self.encoding, str):
self.encoding = [
self.encoding
for _ in range(
len(
self.imputeData.select_dtypes(
include=["object"]
).columns
)
len(self.imputeData.select_dtypes(include=["object"]).columns)
)
]

# Encoding
assert len(self.encoding) == len(
self.imputeData.select_dtypes(include=["object"]).columns
), f"length of encoding: {len(self.encoding)}, length of cat variables: {len(self.imputeData.select_dtypes(include=['object']).columns)}"

for i, column in enumerate(
self.imputeData.select_dtypes(include=["object"]).columns
):
encoding = self.encoding[i]

if encoding == "one_hot":
if self.imputeData[column].dtype == object:
self.imputeData = pd.get_dummies(
self.imputeData, prefix=f"OH_{column}", columns=[column]
)

elif encoding == "integer":
if self.imputeData[column].dtype == object:
vals = self.imputeData[column].values
self.imputeData[column] = inner_utils.integer_encoder(vals)

elif encoding == "hash":
if self.imputeData[column].dtype == object:
hashing_encoder = ce.HashingEncoder(
cols=[column], n_components=10
)
self.imputeData = hashing_encoder.fit_transform(
self.imputeData
)

else:
pass
cat_cols = self.imputeData.select_dtypes(include=["object"]).columns
assert len(self.encoding) == len(cat_cols), (
f"length of encoding: {len(self.encoding)}, "
f"length of categorical variables: {len(cat_cols)}"
)
for i, column in enumerate(cat_cols):
encoding_method = self.encoding[i]
if encoding_method == "one_hot" and self.imputeData[column].dtype == object:
self.imputeData = pd.get_dummies(
self.imputeData, prefix=f"OH_{column}", columns=[column]
)
logger.debug(f"Column '{column}' one-hot encoded")

elif (
encoding_method == "integer" and self.imputeData[column].dtype == object
):
vals = self.imputeData[column].values
self.imputeData[column] = inner_utils.integer_encoder(vals)
logger.debug(f"Column '{column}' integer encoded")

elif encoding_method == "hash" and self.imputeData[column].dtype == object:
hashing_encoder = ce.HashingEncoder(cols=[column], n_components=10)
self.imputeData = hashing_encoder.fit_transform(self.imputeData)
logger.debug(f"Column '{column}' hash encoded")

# Scaling
assert self.scaler in ["standard"], "Invalid Scaler"
@@ -206,6 +207,9 @@ def fit(self):
scaler.fit_transform(self.imputeData),
columns=list(self.imputeData.columns),
)
logger.debug(
f"Data scaled using StandardScaler. Final shape: {self.imputeData.shape}"
)

def save(self, file_path):
"""
@@ -224,3 +228,4 @@
"""
with open(file_path, "wb") as f:
pickle.dump(self, f)
logger.debug(f"Moon object saved to {file_path}")