Merge pull request #85 from sdpython/dev

sdpython · web-flow · commit 029928b62e54 · 2021-05-03T00:29:59.000+02:00
Fixes #70, implements DecisionTreeLogisticRegression
diff --git a/_unittests/ut_mlmodel/test_decision_tree_logistic_regression.py b/_unittests/ut_mlmodel/test_decision_tree_logistic_regression.py
@@ -146,6 +146,13 @@ def test_decision_path(self):
         leaves = predict_leaves(dtlr, X_test)
         self.assertEqual(leaves.shape[0], X_test.shape[0])
 
+    def test_classifier_strat(self):
+        X = numpy.array([[0.1, 0.2], [0.2, 0.3], [-0.2, -0.3], [0.4, 0.3]])
+        Y = numpy.array([0, 1, 0, 1])
+        dtlr = DecisionTreeLogisticRegression(
+            fit_improve_algo=None, strategy='')
+        self.assertRaise(lambda: dtlr.fit(X, Y), ValueError)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/mlinsights/mlmodel/decision_tree_logreg.py b/mlinsights/mlmodel/decision_tree_logreg.py
@@ -315,6 +315,8 @@ class DecisionTreeLogisticRegression(BaseEstimator, ClassifierMixin):
         where *p* is the proportion of samples falling in the first
         fold.
     :param verbose: prints out information about the training
+    :param strategy: `'parallel'` or `'perpendicular'`,
+        see below
 
     Fitted attributes:
 
@@ -323,6 +325,14 @@ class DecisionTreeLogisticRegression(BaseEstimator, ClassifierMixin):
         or a list of arrays of class labels (multi-output problem).
     * `tree_`: Tree
         The underlying Tree object.
+
+    The class implements two strategies to build the tree.
+    The first one `'parallel'` splits the feature space using
+    the hyperplan defined by a logistic regression, the second
+    strategy `'perpendicular'` splis the feature space based on
+    a hyperplan perpendicular to a logistic regression. By doing
+    this, two logistic regression fit on both sub parts must
+    necessary decreases the training error.
     """
 
     _fit_improve_algo_values = (
@@ -332,7 +342,7 @@ def __init__(self, estimator=None,
                  max_depth=20, min_samples_split=2,
                  min_samples_leaf=2, min_weight_fraction_leaf=0.0,
                  fit_improve_algo='auto', p1p2=0.09,
-                 gamma=1., verbose=0):
+                 gamma=1., verbose=0, strategy='parallel'):
         "constructor"
         ClassifierMixin.__init__(self)
         BaseEstimator.__init__(self)
@@ -354,6 +364,7 @@ def __init__(self, estimator=None,
         self.p1p2 = p1p2
         self.gamma = gamma
         self.verbose = verbose
+        self.strategy = strategy
 
         if self.fit_improve_algo not in DecisionTreeLogisticRegression._fit_improve_algo_values:
             raise ValueError(
@@ -392,13 +403,27 @@ def fit(self, X, y, sample_weight=None):
             raise RuntimeError(
                 "The model only supports binary classification but labels are "
                 "{}.".format(self.classes_))
+
+        if self.strategy == 'parallel':
+            return self._fit_parallel(X, y, sample_weight)
+        if self.strategy == 'perpendicular':
+            return self._fit_perpendicular(X, y, sample_weight)
+        raise ValueError(
+            "Unknown strategy '{}'.".format(self.strategy))
+
+    def _fit_parallel(self, X, y, sample_weight):
+        "Implements the parallel strategy."
         cls = (y == self.classes_[1]).astype(numpy.int32)
         estimator = clone(self.estimator)
         self.tree_ = _DecisionTreeLogisticRegressionNode(estimator, 0.5)
         self.n_nodes_ = self.tree_.fit(
             X, cls, sample_weight, self, X.shape[0]) + 1
         return self
 
+    def _fit_perpendicular(self, X, y, sample_weight):
+        "Implements the perpendicular strategy."
+        raise NotImplementedError()
+
     def predict(self, X):
         """
         Runs the predictions.
diff --git a/mlinsights/mlmodel/kmeans_l1.py b/mlinsights/mlmodel/kmeans_l1.py
@@ -18,7 +18,7 @@
 from sklearn.utils.extmath import stable_cumsum
 try:
     from sklearn.cluster._kmeans import _check_sample_weight
-except ImportError:
+except ImportError:  # pragma: no cover
     from sklearn.cluster._kmeans import (
         _check_normalize_sample_weight as _check_sample_weight)
 from ._kmeans_022 import (
@@ -144,7 +144,7 @@ def _init_centroids(norm, X, k, init, random_state=None,
         X = X[init_indices]
         n_samples = X.shape[0]
     elif n_samples < k:
-        raise ValueError(
+        raise ValueError(  # pragma: no cover
             "n_samples=%d should be larger than k=%d" % (n_samples, k))
 
     if isinstance(init, str) and init == 'k-means++':
@@ -160,21 +160,22 @@ def _init_centroids(norm, X, k, init, random_state=None,
         centers = init(norm, X, k, random_state=random_state)
         centers = numpy.asarray(centers, dtype=X.dtype)
     else:
-        raise ValueError("the init parameter for the k-means should "
-                         "be 'k-means++' or 'random' or an ndarray, "
-                         "'%s' (type '%s') was passed." % (init, type(init)))
+        raise ValueError(  # pragma: no cover
+            "init parameter for the k-means should "
+            "be 'k-means++' or 'random' or an ndarray, "
+            "'%s' (type '%s') was passed." % (init, type(init)))
 
     if issparse(centers):
         centers = centers.toarray()
 
     def _validate_center_shape(X, k, centers):
         """Check if centers is compatible with X and n_clusters"""
         if centers.shape[0] != k:
-            raise ValueError(
+            raise ValueError(  # pragma: no cover
                 f"The shape of the initial centers {centers.shape} does not "
                 f"match the number of clusters {k}.")
         if centers.shape[1] != X.shape[1]:
-            raise ValueError(
+            raise ValueError(  # pragma: no cover
                 f"The shape of the initial centers {centers.shape} does not "
                 f"match the number of features of the data {X.shape[1]}.")
 
@@ -598,7 +599,7 @@ def _fit_l1(self, X, y=None, sample_weight=None):
                     X, init)
 
             if n_init != 1:
-                warnings.warn(
+                warnings.warn(  # pragma: no cover
                     'Explicit initial center position passed: '
                     'performing only one init in k-means instead of n_init=%d'
                     % n_init, RuntimeWarning, stacklevel=2)
diff --git a/mlinsights/mlmodel/sklearn_testing.py b/mlinsights/mlmodel/sklearn_testing.py
@@ -93,7 +93,7 @@ def _get_test_instance():
     try:
         from pyquickhelper.pycode import ExtTestCase  # pylint: disable=C0415
         cls = ExtTestCase
-    except ImportError:
+    except ImportError:  # pragma: no cover
 
         class _ExtTestCase(TestCase):
             "simple test classe with a more methods"
@@ -185,7 +185,7 @@ def _assert_dict_equal(a, b, ext):
         if key not in b:
             rows.append("** Removed key '{0}' in a".format(key))
     if len(rows) > 0:
-        raise AssertionError(
+        raise AssertionError(  # pragma: no cover
             "Dictionaries are different\n{0}".format('\n'.join(rows)))
 
 
@@ -290,7 +290,7 @@ def adjust(obj1, obj2):
                 if hasattr(obj2, k):
                     v1 = getattr(obj1, k)
                     if callable(v1):
-                        raise RuntimeError(
+                        raise RuntimeError(  # pragma: no cover
                             "Cannot migrate trained parameters for {}.".format(obj1))
                     elif isinstance(v1, BaseEstimator):
                         v1 = getattr(obj1, k)
@@ -302,7 +302,7 @@ def adjust(obj1, obj2):
                     v1 = getattr(obj1, k)
                     setattr(obj2, k, clone_with_fitted_parameters(v1))
                 else:
-                    raise RuntimeError(
+                    raise RuntimeError(  # pragma: no cover
                         "Cloned object is missing '{0}' in {1}.".format(k, obj2))
 
     if isinstance(est, BaseEstimator):