MSDLLCpapers · fima5 · Aug 13, 2025 · Aug 14, 2025 · Aug 14, 2025 · Aug 26, 2025
diff --git a/demo/APO Sample Existing Demo.ipynb b/demo/APO Sample Existing Demo.ipynb
@@ -0,0 +1,184 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "75f106cc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "sys.path.insert(0, '../')\n",
+    "\n",
+    "print(sys.path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "117a382f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import obsidian\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "print(f'obsidian version: ' + obsidian.__version__)\n",
+    "\n",
+    "from obsidian.experiment import AdvExpDesigner\n",
+    "from obsidian.experiment.sampling import sample_with_bias, best_sample"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1cd0a6b0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#generate random data for this demo\n",
+    "np.random.seed(42)\n",
+    "\n",
+    "n = 1000\n",
+    "demo_data = pd.DataFrame({\n",
+    "    'reagent_conc': np.round(np.random.uniform(0.1, 1.0, n), 2),\n",
+    "    'ionic_strength': np.round(np.random.uniform(10, 100, n), 2),\n",
+    "    'surfactant_conc': np.round(np.random.uniform(0.01, 0.2, n), 3),\n",
+    "    'compound_A': np.round(np.random.uniform(0, 50, n), 2),\n",
+    "    'compound_B': np.round(np.random.uniform(0, 50, n), 2),\n",
+    "    'sugar': np.random.choice(['glucose', 'fructose', 'sucrose'], n),\n",
+    "    'surfactant': np.random.choice(['SDS', 'Tween20', 'TritonX'], n),\n",
+    "    'buffer': np.random.choice(['PBS', 'Tris', 'HEPES'], n),\n",
+    "    'pH': np.round(np.random.uniform(5.5, 8.5, n), 2)\n",
+    "})\n",
+    "\n",
+    "demo_data.index.name = 'FormulationID'\n",
+    "demo_data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "57ed0226",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Initialize existing experimental data as an AdvExpDesigner object\n",
+    "designer = AdvExpDesigner(design_df=demo_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9feae24b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\"\"\n",
+    "You can sample an existing dataset with or without bias: \n",
+    "Bias dictionary format : {\"column\": [lower_bound, upper_bound, relative_weight]}\n",
+    "\n",
+    "- Weight >1 increases sampling probability for in-range rows.\n",
+    "- Weight <1 decreases it.\n",
+    "- Weight = 0 excludes those rows entirely.\n",
+    "\"\"\"\n",
+    "\n",
+    "bias = {\n",
+    "    \"ionic_strength\": [50, 60, 3.0], \n",
+    "}\n",
+    "\n",
+    "seed = np.random.randint(0,1000)\n",
+    "print(f\"Random seed for reproducibility: {seed}\")\n",
+    "\n",
+    "# Draw a weighted random sample of n rows using built-in pandas functions\n",
+    "# enforce=True forces the bias bounds to be respected; the resulting sample may not be space-filling.\n",
+    "sample = sample_with_bias(designer.design, n=1000, replace=False, seed=seed, bias=bias, plot_weights=True, enforce=False)\n",
+    "\n",
+    "sample"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ab457ed8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# One-hot encode the categorical columns so they can be used in Euclidean distance calculations\n",
+    "df_encoded = pd.get_dummies(designer.design, columns=[\"sugar\", \"surfactant\", \"buffer\"], dtype=int) "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9a46bcd0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\"\"\n",
+    "Perform random sampling n_trials times and keep the best sample according to the chosen metric.\n",
+    "metric:\n",
+    "    - \"maximin\":   maximize the minimum pairwise Euclidean distance\n",
+    "    - \"mean_nn\":   maximize the mean nearest-neighbor Euclidean distance\n",
+    "    - \"hybrid\":    0.6*maximin + 0.4*mean_nn \n",
+    "\"\"\"\n",
+    "seed = np.random.randint(0,1000)\n",
+    "print(f\"Random seed for reproducibility: {seed}\")\n",
+    "\n",
+    "optimal_sample, info = best_sample(\n",
+    "    df_encoded, 10, feature_cols=df_encoded.columns, n_trials=1000,\n",
+    "    bias=bias, plot_weights=True, enforce=False, random_state=seed, metric=\"hybrid\"\n",
+    ")\n",
+    "\n",
+    "print(info)\n",
+    "optimal_sample\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3ea1bcd8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Decode the one-hot columns back to categorical values (the first 6 columns are the numeric features)\n",
+    "normal_cols = list(optimal_sample.columns)[0:6]\n",
+    "encoded_cols = list(optimal_sample.columns)[6:]\n",
+    "decoded = pd.from_dummies(optimal_sample[encoded_cols],sep=\"_\")\n",
+    "optimal_design_decoded = pd.concat([optimal_sample[normal_cols], decoded], axis=1)\n",
+    "optimal_design_decoded"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2137e315",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(designer.plot_histograms(optimal_design_decoded))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv (3.13.5)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/obsidian/experiment/advanced_design.py b/obsidian/experiment/advanced_design.py
@@ -22,25 +22,32 @@ class AdvExpDesigner:
     """
 
     def __init__(
-        self, continuous_params, conditional_subparameters, subparam_mapping=None
+        self, continuous_params=None, conditional_subparameters=None, subparam_mapping=None, design_df=None
     ):
         """
         Initializes the AdvExpDesigner with experimental parameters and optional subparameter mappings.
 
         :param continuous_params: A dictionary containing the continuous parameters for the design.
         :param conditional_subparameters: A dictionary containing the conditional subparameters for the design.
         :param subparam_mapping: A dictionary for mapping, will be inferred if not provided.
+        :param design_df: Optional pandas DataFrame of an existing experimental design; if non-empty it is stored as ``self.design`` and used to infer parameter keys (default None).
         """
-        self.continuous_params = continuous_params
-        self.conditional_subparameters = conditional_subparameters
-        self.subparam_mapping = subparam_mapping or infer_subparam_mapping(
-            self.conditional_subparameters
-        )
-        self.continuous_keys = list(self.continuous_params.keys())
-        self.categorical_keys = list(self.conditional_subparameters.keys())
-        self.subparam_key = (
-            list(self.subparam_mapping.values())[0] if self.subparam_mapping else None
-        )
+        self.continuous_params = continuous_params if continuous_params else {}
+        self.conditional_subparameters = conditional_subparameters if conditional_subparameters else {}
+
+        if design_df is not None and not design_df.empty:
+            self.design = design_df
+            self.categorical_keys = design_df.select_dtypes(exclude=['number']).columns.tolist()
+            if continuous_params:
+                self.continuous_keys = list(self.continuous_params.keys())
+            else:
+                self.continuous_keys = design_df.select_dtypes(include=['number']).columns.tolist()
+        else:
+            self.continuous_keys = list(self.continuous_params.keys()) if continuous_params else []
+            self.categorical_keys = list(self.conditional_subparameters.keys()) if conditional_subparameters else []
+
+        self.subparam_mapping = subparam_mapping or infer_subparam_mapping(self.conditional_subparameters)
+        self.subparam_key = (list(self.subparam_mapping.values())[0] if self.subparam_mapping else None)
 
     def generate_design(self, seed, n_samples, optimize_categories=True):
         """
@@ -426,13 +433,16 @@ def assign_conditional_subparameter(
 
 def infer_subparam_mapping(conditional_subparameters):
     mapping = {}
-    for cat_param, levels in conditional_subparameters.items():
-        subparam_candidates = set()
-        for level_info in levels.values():
-            subparams = [k for k in level_info if k != "freq"]
-            subparam_candidates.update(subparams)
-        if len(subparam_candidates) == 1:
-            mapping[cat_param] = subparam_candidates.pop()
+    if len(conditional_subparameters) == 0:
+        return mapping
+    else:
+        for cat_param, levels in conditional_subparameters.items():
+            subparam_candidates = set()
+            for level_info in levels.values():
+                subparams = [k for k in level_info if k != "freq"]
+                subparam_candidates.update(subparams)
+            if len(subparam_candidates) == 1:
+                mapping[cat_param] = subparam_candidates.pop()
     return mapping
 
 
@@ -915,14 +925,7 @@ def plot_design_quality_evolution(metrics_df):
     metrics_df = metrics_df.sort_values("seed")
 
     fig, axes = plt.subplots(2, 3, figsize=(15, 10))
-    metrics = [
-        "D-optimality",
-        "A-optimality",
-        "Pairwise Distance CV",
-        "Max Continuous Corr",
-        "Max Categorical Corr",
-        "score",
-    ]
+    metrics = metrics_df.columns
 
     for i, metric in enumerate(metrics):
         ax = axes[i // 3, i % 3]