Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
924 changes: 924 additions & 0 deletions Experiment_MultiTaskForestClassifier.ipynb

Large diffs are not rendered by default.

720 changes: 720 additions & 0 deletions MNIST(Sporf).ipynb

Large diffs are not rendered by default.

406 changes: 406 additions & 0 deletions MultiTaskClf.ipynb

Large diffs are not rendered by default.

598 changes: 598 additions & 0 deletions Sporf_XOR_Spiral.ipynb

Large diffs are not rendered by default.

11,760 changes: 11,760 additions & 0 deletions Sporf_XOR_Spiral_hyperparamter tuning.ipynb

Large diffs are not rendered by default.

109 changes: 109 additions & 0 deletions multi_task_forest1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# -*- coding: utf-8 -*-
"""MultiTaskClf.ipynb

Automatically generated by Colab.

Original file is located at
https://colab.research.google.com/drive/1d7ipMRMx9b10KeaPRe-yLQq5tAo6Wnvd
"""

import numpy as np
from sklearn.metrics import accuracy_score
from treeple import ObliqueRandomForestClassifier

class MultiTaskForestClassifier:
    """Joint multi-task forest classifier.

    Pools the training data of several tasks into one design matrix,
    appends the task id as an extra feature column, and fits a single
    forest on the combined data.  Predictions for a given task append
    that task's id the same way.  Also provides
    ``evaluate_transfer_general`` for forward / backward / reverse
    transfer experiments between tasks.
    """

    def __init__(self, clf_type="SPORF", random_state=42, **kwargs):
        """Configure the underlying forest.

        Parameters
        ----------
        clf_type : str
            Only ``"SPORF"`` (treeple's ObliqueRandomForestClassifier) is
            currently implemented.  ``"MORF"`` and ``"HonestForest"`` are
            planned backends and raise ``NotImplementedError``.
        random_state : int
            Seed forwarded to the underlying forest (default 42, the value
            previously hard-coded inside ``fit``).
        **kwargs
            Overrides merged on top of the default hyper-parameters.

        Raises
        ------
        NotImplementedError
            For the ``"MORF"`` / ``"HonestForest"`` placeholders.
        ValueError
            For any other unknown ``clf_type``.
        """
        if clf_type == "SPORF":
            self.model_cls = ObliqueRandomForestClassifier
            self.default_params = {
                "n_estimators": 200,
                "feature_combinations": 2.0,
                "max_depth": 20,
                "min_samples_split": 5,
                "min_samples_leaf": 1,
                "max_features": 0.5,
                "bootstrap": True,
            }
        elif clf_type in ("MORF", "HonestForest"):
            # The original placeholders referenced undefined classes and set
            # default_params to `{ ... }` (a set literal containing Ellipsis),
            # which would crash later in the dict merge.  Fail fast instead.
            raise NotImplementedError(f"{clf_type} backend is not implemented yet")
        else:
            raise ValueError(f"Unsupported tree: {clf_type}")

        self.random_state = random_state
        self.params = {**self.default_params, **kwargs}
        self.model = None
        # task_id -> (X, y) arrays registered via add_task().
        self.task_data = {}

    def add_task(self, task_id, X, y):
        """Register feature matrix `X` and labels `y` for `task_id`."""
        self.task_data[task_id] = (X, y)

    def fit(self, task_ids):
        """Train one joint forest on the pooled data of `task_ids`.

        The task id is appended as the last feature column so the forest
        can condition its splits on the task.

        Raises
        ------
        ValueError
            If `task_ids` is empty.
        KeyError
            If a task id was never registered via ``add_task``.
        """
        if not task_ids:
            raise ValueError("task_ids must contain at least one registered task")

        X_parts, y_parts, tid_parts = [], [], []
        for task_id in task_ids:
            X, y = self.task_data[task_id]
            X_parts.append(X)
            y_parts.append(y)
            tid_parts.append(np.full(len(y), task_id))

        X_all = np.column_stack((np.vstack(X_parts), np.concatenate(tid_parts)))
        y_all = np.concatenate(y_parts)

        self.model = self.model_cls(**self.params, random_state=self.random_state)
        self.model.fit(X_all, y_all)
        return self

    def predict(self, X, task_id):
        """Predict labels for `X`, treating every row as `task_id` data."""
        if self.model is None:
            raise RuntimeError("fit() must be called before predict()")
        X_task = np.column_stack((X, np.full(len(X), task_id)))
        return self.model.predict(X_task)

    def score(self, X, y, task_id):
        """Return accuracy of the fitted model on `(X, y)` for `task_id`."""
        return accuracy_score(y, self.predict(X, task_id))

    def evaluate_transfer_general(self, forward_train_ids, forward_test_id,
                                  backward_train_ids, backward_test_ids,
                                  do_reverse=False):
        """Run forward / backward (and optionally reverse) transfer experiments.

        - Forward: train on `forward_train_ids`, test on `forward_test_id`.
        - Backward: train on `backward_train_ids`, test individually on each
          task in `backward_test_ids`.
        - Reverse (only when `do_reverse` is True): train on `forward_test_id`
          alone, test on each task in `forward_train_ids`.

        Each phase refits the model, so ``self.model`` ends up holding the
        forest from the last phase that ran.

        Returns
        -------
        dict
            Nested results with the train/test task ids and accuracies.
        """
        results = {}

        # Forward transfer: pooled source tasks -> held-out target task.
        self.fit(forward_train_ids)
        X_test, y_test = self.task_data[forward_test_id]
        results["forward_transfer"] = {
            "train_on": forward_train_ids,
            "test_on": forward_test_id,
            "accuracy": self.score(X_test, y_test, task_id=forward_test_id),
        }

        # Backward transfer: refit, then score each listed task separately.
        self.fit(backward_train_ids)
        backward_accuracies = {}
        for tid in backward_test_ids:
            X, y = self.task_data[tid]
            backward_accuracies[f"task{tid}"] = {
                "train_on": backward_train_ids,
                "test_on": tid,
                "accuracy": self.score(X, y, task_id=tid),
            }
        results["backward_transfer"] = backward_accuracies

        # Optional reverse transfer: target task alone -> original sources.
        if do_reverse:
            self.fit([forward_test_id])
            reverse_accuracies = {}
            for tid in forward_train_ids:
                X, y = self.task_data[tid]
                reverse_accuracies[f"task{tid}"] = {
                    "train_on": [forward_test_id],
                    "test_on": tid,
                    "accuracy": self.score(X, y, task_id=tid),
                }
            results["reverse_transfer"] = reverse_accuracies

        return results
78 changes: 78 additions & 0 deletions multitaskclf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# -*- coding: utf-8 -*-
"""MultiTaskClf.ipynb

Automatically generated by Colab.

Original file is located at
https://colab.research.google.com/drive/1d7ipMRMx9b10KeaPRe-yLQq5tAo6Wnvd
"""

import numpy as np
from sklearn.metrics import accuracy_score
from treeple import ObliqueRandomForestClassifier

class MultiTaskForestClassifier:
    """Multi-task forest that trains jointly over per-task train/test splits.

    Each task contributes a (train, test) pair via ``add_task``.  Training
    pools every task's train split into one matrix with the task id appended
    as an extra feature column; evaluation uses each task's held-out split.
    """

    def __init__(self, clf_type="SPORF", random_state=42, **kwargs):
        """Configure the underlying forest.

        Parameters
        ----------
        clf_type : str
            Only ``"SPORF"`` (treeple's ObliqueRandomForestClassifier) is
            currently implemented.  ``"MORF"`` and ``"HonestForest"`` are
            planned backends and raise ``NotImplementedError``.
        random_state : int
            Seed forwarded to the underlying forest (default 42, the value
            previously hard-coded inside ``fit``).
        **kwargs
            Overrides merged on top of the default hyper-parameters.

        Raises
        ------
        NotImplementedError
            For the ``"MORF"`` / ``"HonestForest"`` placeholders.
        ValueError
            For any other unknown ``clf_type``.
        """
        if clf_type == "SPORF":
            self.model_cls = ObliqueRandomForestClassifier
            self.default_params = {
                "n_estimators": 200,
                "feature_combinations": 2.0,
                "max_depth": 20,
                "min_samples_split": 5,
                "min_samples_leaf": 1,
                "max_features": 0.5,
                "bootstrap": True,
            }
        elif clf_type in ("MORF", "HonestForest"):
            # The original placeholders referenced undefined classes and set
            # default_params to `{ ... }` (a set literal containing Ellipsis),
            # which would crash later in the dict merge.  Fail fast instead.
            raise NotImplementedError(f"{clf_type} backend is not implemented yet")
        else:
            raise ValueError(f"Unsupported tree: {clf_type}")

        self.random_state = random_state
        self.params = {**self.default_params, **kwargs}
        self.model = None
        # task_id -> {"train": (X, y), "test": (X, y)}
        self.task_data = {}

    def add_task(self, task_id, X_train, y_train, X_test, y_test):
        """Register both training and testing data for `task_id`."""
        self.task_data[task_id] = {
            "train": (X_train, y_train),
            "test": (X_test, y_test),
        }

    def get_task_ids(self):
        """Return the registered task ids in insertion order."""
        return list(self.task_data)

    def fit(self):
        """Train on all registered tasks jointly (multi-task learning).

        The task id is appended as the last feature column so the forest
        can condition its splits on the task.

        Raises
        ------
        ValueError
            If no task has been registered via ``add_task``.
        """
        if not self.task_data:
            raise ValueError("no tasks registered; call add_task() first")

        X_parts, y_parts, tid_parts = [], [], []
        for task_id, data in self.task_data.items():
            X, y = data["train"]
            X_parts.append(X)
            y_parts.append(y)
            tid_parts.append(np.full(len(y), task_id))

        # Append the task id as a feature on the pooled design matrix.
        X_all = np.column_stack((np.vstack(X_parts), np.concatenate(tid_parts)))
        y_all = np.concatenate(y_parts)

        self.model = self.model_cls(**self.params, random_state=self.random_state)
        self.model.fit(X_all, y_all)
        return self

    def predict(self, X, task_id):
        """Predict labels for `X`, treating every row as `task_id` data."""
        if self.model is None:
            raise RuntimeError("fit() must be called before predict()")
        X_task = np.column_stack((X, np.full(len(X), task_id)))
        return self.model.predict(X_task)

    def score(self, task_id):
        """Return accuracy on the held-out test set of `task_id`."""
        X_test, y_test = self.task_data[task_id]["test"]
        return accuracy_score(y_test, self.predict(X_test, task_id))
96 changes: 96 additions & 0 deletions multitaskclf_ratiosplit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# -*- coding: utf-8 -*-
"""MultiTaskClf_ratiosplit.ipynb

Automatically generated by Colab.

Original file is located at
https://colab.research.google.com/drive/1d7ipMRMx9b10KeaPRe-yLQq5tAo6Wnvd
"""

import numpy as np
from sklearn.metrics import accuracy_score
from treeple import ObliqueRandomForestClassifier

"""
MultiTaskForestClassifier:
A unified multi-task learning wrapper for SPORF, MORF, and HonestForest.
Trains on all tasks jointly and evaluates per-task performance.
"""

class MultiTaskForestClassifier:
    """Multi-task forest with per-task subsampling ratios.

    Like the joint multi-task wrapper, but ``add_task`` first keeps only a
    configurable stratified fraction of each task's data (``task_ratios``)
    before carving out a held-out test split — useful for simulating
    asymmetric data availability between tasks.
    """

    def __init__(self, clf_type="SPORF", task_ratios=None, random_state=42, **kwargs):
        """Configure the underlying forest and the per-task data ratios.

        Parameters
        ----------
        clf_type : str
            Only ``"SPORF"`` (treeple's ObliqueRandomForestClassifier) is
            currently implemented.  ``"MORF"`` and ``"HonestForest"`` are
            planned backends and raise ``NotImplementedError``.
        task_ratios : dict[int, float] | None
            Fraction of each task's data kept before the train/test split.
            The default ``{0: 0.1, 1: 0.9}`` simulates an asymmetric
            setting in which task 0 is data-poor.
        random_state : int
            Seed used for every split and for the forest itself.
        **kwargs
            Overrides merged on top of the default hyper-parameters.

        Raises
        ------
        NotImplementedError
            For the ``"MORF"`` / ``"HonestForest"`` placeholders.
        ValueError
            For any other unknown ``clf_type``.
        """
        if clf_type == "SPORF":
            self.model_cls = ObliqueRandomForestClassifier
            self.default_params = {
                "n_estimators": 200,
                "feature_combinations": 2.0,
                "max_depth": 20,
                "min_samples_split": 5,
                "min_samples_leaf": 1,
                "max_features": 0.5,
                "bootstrap": True,
            }
        elif clf_type in ("MORF", "HonestForest"):
            # The original placeholders referenced undefined classes and set
            # default_params to `{ ... }` (a set literal containing Ellipsis),
            # which would crash later in the dict merge.  Fail fast instead.
            raise NotImplementedError(f"{clf_type} backend is not implemented yet")
        else:
            raise ValueError(f"Unsupported tree: {clf_type}")

        self.params = {**self.default_params, **kwargs}
        self.model = None
        # task_id -> {"train": (X, y), "test": (X, y)}
        self.task_data = {}
        self.task_ratios = {0: 0.1, 1: 0.9} if task_ratios is None else task_ratios
        self.random_state = random_state

    def add_task(self, task_id, X, y, test_size=0.2):
        """Subsample `task_id`'s data (if a ratio is configured) and split it.

        Both the optional subsampling and the train/test split are
        stratified on `y` and seeded with ``self.random_state``.
        """
        from sklearn.model_selection import train_test_split

        if task_id in self.task_ratios:
            # Keep only a stratified fraction of this task's data.
            X, _, y, _ = train_test_split(
                X, y,
                train_size=self.task_ratios[task_id],
                stratify=y,
                random_state=self.random_state,
            )

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, stratify=y, random_state=self.random_state,
        )
        self.task_data[task_id] = {
            "train": (X_train, y_train),
            "test": (X_test, y_test),
        }

    def get_task_ids(self):
        """Return the registered task ids in insertion order."""
        return list(self.task_data)

    def fit(self):
        """Train on all registered tasks jointly (multi-task learning).

        The task id is appended as the last feature column so the forest
        can condition its splits on the task.

        Raises
        ------
        ValueError
            If no task has been registered via ``add_task``.
        """
        if not self.task_data:
            raise ValueError("no tasks registered; call add_task() first")

        X_parts, y_parts, tid_parts = [], [], []
        for task_id, data in self.task_data.items():
            X, y = data["train"]
            X_parts.append(X)
            y_parts.append(y)
            tid_parts.append(np.full(len(y), task_id))

        X_all = np.column_stack((np.vstack(X_parts), np.concatenate(tid_parts)))
        y_all = np.concatenate(y_parts)

        # Bug fix: the seed was previously hard-coded to 42 here, silently
        # ignoring the random_state passed to __init__.
        self.model = self.model_cls(**self.params, random_state=self.random_state)
        self.model.fit(X_all, y_all)
        return self

    def predict(self, X, task_id):
        """Predict labels for `X`, treating every row as `task_id` data."""
        if self.model is None:
            raise RuntimeError("fit() must be called before predict()")
        X_task = np.column_stack((X, np.full(len(X), task_id)))
        return self.model.predict(X_task)

    def score(self, task_id):
        """Return accuracy on the held-out test set of `task_id`."""
        X_test, y_test = self.task_data[task_id]["test"]
        return accuracy_score(y_test, self.predict(X_test, task_id))

Loading