diff --git a/afsklearn/__init__.py b/afsklearn/__init__.py index ada3caa..a0774b7 100644 --- a/afsklearn/__init__.py +++ b/afsklearn/__init__.py @@ -4,6 +4,7 @@ import yaml app_dir = Path(__file__).resolve().parent +__version__ = '0.1.0' def load_yaml_file(name: str, directory: Path = app_dir) -> Any: @@ -13,3 +14,13 @@ def load_yaml_file(name: str, directory: Path = app_dir) -> Any: patches_info = load_yaml_file("patched_modules.yml") + +from .patcher import Patcher + +def patch_sklearn(): + Patcher.patch_all() + +def unpatch_sklearn(): + Patcher.rollback_all() + +__all__ = ['Patcher', 'patch_sklearn', 'unpatch_sklearn'] diff --git a/afsklearn/_nn_utils.py b/afsklearn/_nn_utils.py index 4ff98a5..ed5560f 100644 --- a/afsklearn/_nn_utils.py +++ b/afsklearn/_nn_utils.py @@ -4,10 +4,9 @@ # License: BSD 3 clause import arrayfire as af -#import numpy as np import numpy as np import numpy -from ._type_utils import typemap +from ..utils._type_utils import typemap def logistic_sigmoid(x): diff --git a/afsklearn/afClassifierMixin.py b/afsklearn/afClassifierMixin.py new file mode 100644 index 0000000..db81060 --- /dev/null +++ b/afsklearn/afClassifierMixin.py @@ -0,0 +1,154 @@ +import arrayfire as af + +def _weighted_sum(sample_score, sample_weight, normalize=False): + if normalize: + return np.average(sample_score, weights=sample_weight) + elif sample_weight is not None: + return np.dot(sample_score, sample_weight) + else: + return sample_score.sum() + +def _check_targets(y_true, y_pred): + """Check that y_true and y_pred belong to the same classification task + This converts multiclass or binary types to a common shape, and raises a + ValueError for a mix of multilabel and multiclass targets, a mix of + multilabel formats, for the presence of continuous-valued or multioutput + targets, or for targets of different lengths. + Column vectors are squeezed to 1d, while multilabel formats are returned + as CSR sparse label indicators. + Parameters + ---------- + y_true : array-like + y_pred : array-like + Returns + ------- + type_true : one of {'multilabel-indicator', 'multiclass', 'binary'} + The type of the true target data, as output by + ``utils.multiclass.type_of_target`` + y_true : array or indicator matrix + y_pred : array or indicator matrix + """ + check_consistent_length(y_true, y_pred) + type_true = type_of_target(y_true) + type_pred = type_of_target(y_pred) + + y_type = {type_true, type_pred} + if y_type == {"binary", "multiclass"}: + y_type = {"multiclass"} + + if len(y_type) > 1: + raise ValueError("Classification metrics can't handle a mix of {0} " + "and {1} targets".format(type_true, type_pred)) + + # We can't have more than one value on y_type => The set is no more needed + y_type = y_type.pop() + + # No metrics support "multiclass-multioutput" format + if (y_type not in ["binary", "multiclass", "multilabel-indicator"]): + raise ValueError("{0} is not supported".format(y_type)) + + if y_type in ["binary", "multiclass"]: + y_true = column_or_1d(y_true) + y_pred = column_or_1d(y_pred) + if y_type == "binary": + unique_values = np.union1d(y_true, y_pred) + if len(unique_values) > 2: + y_type = "multiclass" + + if y_type.startswith('multilabel'): + y_true = csr_matrix(y_true) + y_pred = csr_matrix(y_pred) + y_type = 'multilabel-indicator' + + return y_type, y_true, y_pred + + + + +def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None): + """Accuracy classification score. 
+ In multilabel classification, this function computes subset accuracy: + the set of labels predicted for a sample must *exactly* match the + corresponding set of labels in y_true. + Read more in the :ref:`User Guide `. + Parameters + ---------- + y_true : 1d array-like, or label indicator array / sparse matrix + Ground truth (correct) labels. + y_pred : 1d array-like, or label indicator array / sparse matrix + Predicted labels, as returned by a classifier. + normalize : bool, optional (default=True) + If ``False``, return the number of correctly classified samples. + Otherwise, return the fraction of correctly classified samples. + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + Returns + ------- + score : float + If ``normalize == True``, return the fraction of correctly + classified samples (float), else returns the number of correctly + classified samples (int). + The best performance is 1 with ``normalize == True`` and the number + of samples with ``normalize == False``. + See also + -------- + jaccard_score, hamming_loss, zero_one_loss + Notes + ----- + In binary and multiclass classification, this function is equal + to the ``jaccard_score`` function. + Examples + -------- + >>> from sklearn.metrics import accuracy_score + >>> y_pred = [0, 2, 1, 3] + >>> y_true = [0, 1, 2, 3] + >>> accuracy_score(y_true, y_pred) + 0.5 + >>> accuracy_score(y_true, y_pred, normalize=False) + 2 + In the multilabel case with binary label indicators: + >>> import numpy as np + >>> accuracy_score(np.array([[0, 1], [1, 1]]), np.ones((2, 2))) + 0.5 + """ + + # Compute accuracy for each possible representation + y_type, y_true, y_pred = _check_targets(y_true, y_pred) + check_consistent_length(y_true, y_pred, sample_weight) + if y_type.startswith('multilabel'): + differing_labels = count_nonzero(y_true - y_pred, axis=1) + score = differing_labels == 0 + else: + score = y_true == y_pred + + return _weighted_sum(score, sample_weight, normalize) + +class afClassifierMixin: + """ArrayFire enabled Mixin class for all classifiers in scikit-learn.""" + + _estimator_type = "classifier" + + def score(self, X, y, sample_weight=None): + """ + Return the mean accuracy on the given test data and labels. + In multi-label classification, this is the subset accuracy + which is a harsh metric since you require for each sample that + each label set be correctly predicted. + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Test samples. + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + True labels for X. + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + Returns + ------- + score : float + Mean accuracy of self.predict(X) wrt. y. 
+ """ + #return accuracy_score(y, self.predict(X), sample_weight=sample_weight) + return #TMP + + def _more_tags(self): + return {'requires_y': True} diff --git a/afsklearn/afLabelBinarizer.py b/afsklearn/afLabelBinarizer.py new file mode 100644 index 0000000..1ed5ead --- /dev/null +++ b/afsklearn/afLabelBinarizer.py @@ -0,0 +1,640 @@ +import arrayfire as af +#import cupy as np +import numpy as np +import numpy +import scipy.sparse as sp +import warnings +import numbers +from collections.abc import Sequence +from scipy.sparse.base import spmatrix +from itertools import chain + +from sklearn.utils.validation import _deprecate_positional_args +from sklearn.preprocessing import LabelBinarizer +from af_validation import _num_samples +from af_validation import check_is_fitted +from af_validation import check_array +from af_validation import column_or_1d + +def _unique_multiclass(y): + if hasattr(y, '__array__'): + return np.unique(np.asarray(y)) + else: + return set(list(y)) + + +def _unique_indicator(y): + return np.arange( + check_array(y, accept_sparse=['csr', 'csc', 'coo']).shape[1] + ) + + +_FN_UNIQUE_LABELS = { + 'binary': _unique_multiclass, + 'multiclass': _unique_multiclass, + 'multilabel-indicator': _unique_indicator, +} + +def unique_labels(*ys): + """Extract an ordered array of unique labels + + We don't allow: + - mix of multilabel and multiclass (single label) targets + - mix of label indicator matrix and anything else, + because there are no explicit labels) + - mix of label indicator matrices of different sizes + - mix of string and integer labels + + At the moment, we also don't allow "multiclass-multioutput" input type. + + Parameters + ---------- + *ys : array-likes + + Returns + ------- + out : numpy array of shape [n_unique_labels] + An ordered array of unique labels. + + Examples + -------- + >>> from sklearn.utils.multiclass import unique_labels + >>> unique_labels([3, 5, 5, 5, 7, 7]) + array([3, 5, 7]) + >>> unique_labels([1, 2, 3, 4], [2, 2, 3, 4]) + array([1, 2, 3, 4]) + >>> unique_labels([1, 2, 10], [5, 11]) + array([ 1, 2, 5, 10, 11]) + """ + if not ys: + raise ValueError('No argument has been passed.') + # Check that we don't mix label format + + ys_types = set(type_of_target(x) for x in ys) + if ys_types == {"binary", "multiclass"}: + ys_types = {"multiclass"} + + if len(ys_types) > 1: + raise ValueError("Mix type of y not allowed, got types %s" % ys_types) + + label_type = ys_types.pop() + + # Check consistency for the indicator format + if (label_type == "multilabel-indicator" and + len(set(check_array(y, + accept_sparse=['csr', 'csc', 'coo']).shape[1] + for y in ys)) > 1): + raise ValueError("Multi-label binary indicator input with " + "different numbers of labels") + + # Get the unique set of labels + _unique_labels = _FN_UNIQUE_LABELS.get(label_type, None) + if not _unique_labels: + raise ValueError("Unknown label type: %s" % repr(ys)) + + #ys_labels = set(chain.from_iterable(_unique_labels(y.tolist()) for y in ys)) + ys_labels = set(chain.from_iterable(_unique_labels(y.to_list()) for y in ys)) + + # Check that we don't mix string type with number type + if (len(set(isinstance(label, str) for label in ys_labels)) > 1): + raise ValueError("Mix of label input types (string and number)") + + return np.array(sorted(ys_labels)) + +def is_multilabel(y): + """ Check if ``y`` is in a multilabel format. + + Parameters + ---------- + y : numpy array of shape [n_samples] + Target values. 
+ + Returns + ------- + out : bool, + Return ``True``, if ``y`` is in a multilabel format, else ```False``. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.utils.multiclass import is_multilabel + >>> is_multilabel([0, 1, 0, 1]) + False + >>> is_multilabel([[1], [0, 2], []]) + False + >>> is_multilabel(np.array([[1, 0], [0, 0]])) + True + >>> is_multilabel(np.array([[1], [0], [0]])) + False + >>> is_multilabel(np.array([[1, 0, 0]])) + True + """ + if hasattr(y, '__array__') or isinstance(y, Sequence): + y = np.asarray(y) + if not (hasattr(y, "shape") and y.ndim == 2 and y.shape[1] > 1): + return False + + if issparse(y): + if isinstance(y, (dok_matrix, lil_matrix)): + y = y.tocsr() + return (len(y.data) == 0 or np.unique(y.data).size == 1 and + (y.dtype.kind in 'biu' or # bool, int, uint + _is_integral_float(np.unique(y.data)))) + else: + labels = np.unique(y) + + return len(labels) < 3 and (y.dtype.kind in 'biu' or # bool, int, uint + _is_integral_float(labels)) + +def type_of_target(y): + """Determine the type of data indicated by the target. + + Note that this type is the most specific type that can be inferred. + For example: + + * ``binary`` is more specific but compatible with ``multiclass``. + * ``multiclass`` of integers is more specific but compatible with + ``continuous``. + * ``multilabel-indicator`` is more specific but compatible with + ``multiclass-multioutput``. + + Parameters + ---------- + y : array-like + + Returns + ------- + target_type : string + One of: + + * 'continuous': `y` is an array-like of floats that are not all + integers, and is 1d or a column vector. + * 'continuous-multioutput': `y` is a 2d array of floats that are + not all integers, and both dimensions are of size > 1. + * 'binary': `y` contains <= 2 discrete values and is 1d or a column + vector. + * 'multiclass': `y` contains more than two discrete values, is not a + sequence of sequences, and is 1d or a column vector. + * 'multiclass-multioutput': `y` is a 2d array that contains more + than two discrete values, is not a sequence of sequences, and both + dimensions are of size > 1. + * 'multilabel-indicator': `y` is a label indicator matrix, an array + of two dimensions with at least two columns, and at most 2 unique + values. + * 'unknown': `y` is array-like but none of the above, such as a 3d + array, sequence of sequences, or an array of non-sequence objects. 
+ + Examples + -------- + >>> import numpy as np + >>> type_of_target([0.1, 0.6]) + 'continuous' + >>> type_of_target([1, -1, -1, 1]) + 'binary' + >>> type_of_target(['a', 'b', 'a']) + 'binary' + >>> type_of_target([1.0, 2.0]) + 'binary' + >>> type_of_target([1, 0, 2]) + 'multiclass' + >>> type_of_target([1.0, 0.0, 3.0]) + 'multiclass' + >>> type_of_target(['a', 'b', 'c']) + 'multiclass' + >>> type_of_target(np.array([[1, 2], [3, 1]])) + 'multiclass-multioutput' + >>> type_of_target([[1, 2]]) + 'multilabel-indicator' + >>> type_of_target(np.array([[1.5, 2.0], [3.0, 1.6]])) + 'continuous-multioutput' + >>> type_of_target(np.array([[0, 1], [1, 1]])) + 'multilabel-indicator' + """ + valid = ((isinstance(y, (Sequence, spmatrix)) or hasattr(y, '__array__')) + and not isinstance(y, str)) + + if not valid: + raise ValueError('Expected array-like (array or non-string sequence), ' + 'got %r' % y) + + sparse_pandas = (y.__class__.__name__ in ['SparseSeries', 'SparseArray']) + if sparse_pandas: + raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'") + + if is_multilabel(y): + return 'multilabel-indicator' + + try: + y = np.asarray(y) + except ValueError: + # Known to fail in numpy 1.3 for array of arrays + return 'unknown' + + # The old sequence of sequences format + try: + if (not hasattr(y[0], '__array__') and isinstance(y[0], Sequence) + and not isinstance(y[0], str)): + raise ValueError('You appear to be using a legacy multi-label data' + ' representation. Sequence of sequences are no' + ' longer supported; use a binary array or sparse' + ' matrix instead - the MultiLabelBinarizer' + ' transformer can convert to this format.') + except IndexError: + pass + + # Invalid inputs + if y.ndim > 2 or (y.dtype == object and len(y) and + not isinstance(y.flat[0], str)): + return 'unknown' # [[[1, 2]]] or [obj_1] and not ["label_1"] + + if y.ndim == 2 and y.shape[1] == 0: + return 'unknown' # [[]] + + if y.ndim == 2 and y.shape[1] > 1: + suffix = "-multioutput" # [[1, 2], [1, 2]] + else: + suffix = "" # [1, 2, 3] or [[1], [2], [3]] + + # check float and contains non-integer float values + if y.dtype.kind == 'f' and np.any(y != y.astype(int)): + # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.] + _assert_all_finite(y) + return 'continuous' + suffix + + if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1): + return 'multiclass' + suffix # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]] + else: + return 'binary' # [1, 2] or [["a"], ["b"]] + +def _inverse_binarize_multiclass(y, classes): + """Inverse label binarization transformation for multiclass. + + Multiclass uses the maximal score instead of a threshold. 
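+
+    For example, with a dense score matrix the column holding each row's
+    maximal score determines the class (ties resolve to the first such
+    column):
+
+    >>> import numpy as np
+    >>> _inverse_binarize_multiclass(np.array([[0.2, 0.9, 0.1],
+    ...                                        [0.8, 0.1, 0.1]]), [1, 2, 3])
+    array([2, 1])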
+ """ + classes = np.asarray(classes) + + if sp.issparse(y): + # Find the argmax for each row in y where y is a CSR matrix + + y = y.tocsr() + n_samples, n_outputs = y.shape + outputs = np.arange(n_outputs) + row_max = min_max_axis(y, 1)[1] + row_nnz = np.diff(y.indptr) + + y_data_repeated_max = np.repeat(row_max, row_nnz) + # picks out all indices obtaining the maximum per row + y_i_all_argmax = np.flatnonzero(y_data_repeated_max == y.data) + + # For corner case where last row has a max of 0 + if row_max[-1] == 0: + y_i_all_argmax = np.append(y_i_all_argmax, [len(y.data)]) + + # Gets the index of the first argmax in each row from y_i_all_argmax + index_first_argmax = np.searchsorted(y_i_all_argmax, y.indptr[:-1]) + # first argmax of each row + y_ind_ext = np.append(y.indices, [0]) + y_i_argmax = y_ind_ext[y_i_all_argmax[index_first_argmax]] + # Handle rows of all 0 + y_i_argmax[np.where(row_nnz == 0)[0]] = 0 + + # Handles rows with max of 0 that contain negative numbers + samples = np.arange(n_samples)[(row_nnz > 0) & + (row_max.ravel() == 0)] + for i in samples: + ind = y.indices[y.indptr[i]:y.indptr[i + 1]] + y_i_argmax[i] = classes[np.setdiff1d(outputs, ind)][0] + + return classes[y_i_argmax] + else: + return classes.take(y.argmax(axis=1), mode="clip") + + +def _inverse_binarize_thresholding(y, output_type, classes, threshold): + """Inverse label binarization transformation using thresholding.""" + + if output_type == "binary" and y.ndim == 2 and y.shape[1] > 2: + raise ValueError("output_type='binary', but y.shape = {0}". + format(y.shape)) + + if output_type != "binary" and y.shape[1] != len(classes): + raise ValueError("The number of class is not equal to the number of " + "dimension of y.") + + classes = np.asarray(classes) + + # Perform thresholding + if sp.issparse(y): + if threshold > 0: + if y.format not in ('csr', 'csc'): + y = y.tocsr() + y.data = np.array(y.data > threshold, dtype=np.int) + y.eliminate_zeros() + else: + y = np.array(y.toarray() > threshold, dtype=np.int) + else: + y = np.array(y > threshold, dtype=np.int) + + # Inverse transform data + if output_type == "binary": + if sp.issparse(y): + y = y.toarray() + if y.ndim == 2 and y.shape[1] == 2: + return classes[y[:, 1]] + else: + if len(classes) == 1: + return np.repeat(classes[0], len(y)) + else: + return classes[y.ravel()] + + elif output_type == "multilabel-indicator": + return y + + else: + raise ValueError("{0} format is not supported".format(output_type)) + + +def af_in1d(arr0, arr1): + #temporarily perform computation in numy, potentially change to arrayfire + a0 = arr0.to_ndarray() + #a1 = arr1.to_ndarray() + isin = np.in1d(a0, arr1) + return isin + +@_deprecate_positional_args +def label_binarize(y, *, classes, neg_label=0, pos_label=1, + sparse_output=False): + """Binarize labels in a one-vs-all fashion + + Several regression and binary classification algorithms are + available in scikit-learn. A simple way to extend these algorithms + to the multi-class classification case is to use the so-called + one-vs-all scheme. + + This function makes it possible to compute this transformation for a + fixed set of class labels known ahead of time. + + Parameters + ---------- + y : array-like + Sequence of integer labels or multilabel data to encode. + + classes : array-like of shape [n_classes] + Uniquely holds the label for each class. + + neg_label : int (default: 0) + Value with which negative labels must be encoded. + + pos_label : int (default: 1) + Value with which positive labels must be encoded. 
+ + sparse_output : boolean (default: False), + Set to true if output binary array is desired in CSR sparse format + + Returns + ------- + Y : numpy array or CSR matrix of shape [n_samples, n_classes] + Shape will be [n_samples, 1] for binary problems. + + Examples + -------- + >>> from sklearn.preprocessing import label_binarize + >>> label_binarize([1, 6], classes=[1, 2, 4, 6]) + array([[1, 0, 0, 0], + [0, 0, 0, 1]]) + + The class ordering is preserved: + + >>> label_binarize([1, 6], classes=[1, 6, 4, 2]) + array([[1, 0, 0, 0], + [0, 1, 0, 0]]) + + Binary targets transform to a column vector + + >>> label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes']) + array([[1], + [0], + [0], + [1]]) + + See also + -------- + LabelBinarizer : class used to wrap the functionality of label_binarize and + allow for fitting to classes independently of the transform operation + """ + if not isinstance(y, list): + # XXX Workaround that will be removed when list of list format is + # dropped + y = check_array(y, accept_sparse='csr', ensure_2d=False, dtype=None) + else: + if _num_samples(y) == 0: + raise ValueError('y has 0 samples: %r' % y) + if neg_label >= pos_label: + raise ValueError("neg_label={0} must be strictly less than " + "pos_label={1}.".format(neg_label, pos_label)) + + if (sparse_output and (pos_label == 0 or neg_label != 0)): + raise ValueError("Sparse binarization is only supported with non " + "zero pos_label and zero neg_label, got " + "pos_label={0} and neg_label={1}" + "".format(pos_label, neg_label)) + + # To account for pos_label == 0 in the dense case + pos_switch = pos_label == 0 + if pos_switch: + pos_label = -neg_label + + y_type = type_of_target(y) + if 'multioutput' in y_type: + raise ValueError("Multioutput target data is not supported with label " + "binarization") + if y_type == 'unknown': + raise ValueError("The type of target data is not known") + + n_samples = y.shape[0] if sp.issparse(y) else len(y) + n_classes = len(classes) + classes = np.asarray(classes) + + if y_type == "binary": + if n_classes == 1: + if sparse_output: + return sp.csr_matrix((n_samples, 1), dtype=int) + else: + Y = np.zeros((len(y), 1), dtype=np.int) + Y += neg_label + return Y + elif len(classes) >= 3: + y_type = "multiclass" + + sorted_class = np.sort(classes) + if y_type == "multilabel-indicator": + y_n_classes = y.shape[1] if hasattr(y, 'shape') else len(y[0]) + if classes.size != y_n_classes: + raise ValueError("classes {0} mismatch with the labels {1}" + " found in the data" + .format(classes, unique_labels(y))) + + if y_type in ("binary", "multiclass"): + y = column_or_1d(y) + + # pick out the known labels from y + y_in_classes = af_in1d(y, classes) + y_in_classes = af.interop.from_ndarray(y_in_classes, copy=True) + y[y_in_classes] + y_seen = y[y_in_classes] + y_seen = y_seen.to_ndarray() + indices = np.searchsorted(sorted_class, y_seen) + indptr = np.hstack((0, np.cumsum(y_in_classes))) + + data = np.empty_like(indices) + data.fill(pos_label) + Y = data + + #Y = sp.csr_matrix((data, indices, indptr), + #shape=(n_samples, n_classes)) + elif y_type == "multilabel-indicator": + Y = sp.csr_matrix(y) + if pos_label != 1: + data = np.empty_like(Y.data) + data.fill(pos_label) + Y.data = data + else: + raise ValueError("%s target data is not supported with label " + "binarization" % y_type) + + if not sparse_output: + #Y = Y.toarray() #TODO: test if ndarray, then cast if not + Y = Y.astype(int, copy=False) + + if neg_label != 0: + Y[Y == 0] = neg_label + + if pos_switch: + Y[Y == 
pos_label] = 0 + else: + Y.data = Y.data.astype(int, copy=False) + + # preserve label ordering + if np.any(classes != sorted_class): + indices = np.searchsorted(sorted_class, classes) + Y = Y[:, indices] + + if y_type == "binary": + if sparse_output: + Y = Y.getcol(-1) + else: + Y = Y[:, -1].reshape((-1, 1)) + + + #return Y + return Y + + +class afLabelBinarizer(LabelBinarizer): + def fit(self, y): + """Fit label binarizer + + Parameters + ---------- + y : array of shape [n_samples,] or [n_samples, n_classes] + Target values. The 2-d matrix should only contain 0 and 1, + represents multilabel classification. + + Returns + ------- + self : returns an instance of self. + """ + self.y_type_ = type_of_target(y) + if 'multioutput' in self.y_type_: + raise ValueError("Multioutput target data is not supported with " + "label binarization") + if _num_samples(y) == 0: + raise ValueError('y has 0 samples: %r' % y) + + self.sparse_input_ = sp.issparse(y) + self.classes_ = unique_labels(y) + return self + + def transform(self, y): + """Transform multi-class labels to binary labels + + The output of transform is sometimes referred to by some authors as + the 1-of-K coding scheme. + + Parameters + ---------- + y : array or sparse matrix of shape [n_samples,] or \ + [n_samples, n_classes] + Target values. The 2-d matrix should only contain 0 and 1, + represents multilabel classification. Sparse matrix can be + CSR, CSC, COO, DOK, or LIL. + + Returns + ------- + Y : numpy array or CSR matrix of shape [n_samples, n_classes] + Shape will be [n_samples, 1] for binary problems. + """ + check_is_fitted(self) + + y_is_multilabel = type_of_target(y).startswith('multilabel') + if y_is_multilabel and not self.y_type_.startswith('multilabel'): + raise ValueError("The object was not fitted with multilabel" + " input.") + + return label_binarize(y, classes=self.classes_, + pos_label=self.pos_label, + neg_label=self.neg_label, + sparse_output=self.sparse_output) + + def inverse_transform(self, Y, threshold=None): + """Transform binary labels back to multi-class labels + + Parameters + ---------- + Y : numpy array or sparse matrix with shape [n_samples, n_classes] + Target values. All sparse matrices are converted to CSR before + inverse transformation. + + threshold : float or None + Threshold used in the binary and multi-label cases. + + Use 0 when ``Y`` contains the output of decision_function + (classifier). + Use 0.5 when ``Y`` contains the output of predict_proba. + + If None, the threshold is assumed to be half way between + neg_label and pos_label. + + Returns + ------- + y : numpy array or CSR matrix of shape [n_samples] Target values. + + Notes + ----- + In the case when the binary labels are fractional + (probabilistic), inverse_transform chooses the class with the + greatest value. Typically, this allows to use the output of a + linear model's decision_function method directly as the input + of inverse_transform. + """ + check_is_fitted(self) + + if threshold is None: + threshold = (self.pos_label + self.neg_label) / 2. 
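+
+        # Y is expected to arrive as an arrayfire Array; the inverse
+        # binarization itself runs on the host via NumPy/SciPy, and the
+        # result is pushed back to the device with af.from_ndarray below.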
+ + Y = Y.to_ndarray() + if Y.ndim == 1: + Y = Y[:, np.newaxis] + if self.y_type_ == "multiclass": + y_inv = _inverse_binarize_multiclass(Y, self.classes_) + else: + y_inv = _inverse_binarize_thresholding(Y, self.y_type_, + self.classes_, threshold) + + if self.sparse_input_: + y_inv = sp.csr_matrix(y_inv) + elif sp.issparse(y_inv): + y_inv = y_inv.toarray() + + af_yinv = af.from_ndarray(y_inv) + return af_yinv diff --git a/afsklearn/afRegressorMixin.py b/afsklearn/afRegressorMixin.py new file mode 100644 index 0000000..7ff54b0 --- /dev/null +++ b/afsklearn/afRegressorMixin.py @@ -0,0 +1,246 @@ +import arrayfire as af + +from sklearn.utils.validation import _deprecate_positional_args +from af_validation import check_consistent_length +from af_validation import check_array +from af_validation import _num_samples + +def _check_reg_targets(y_true, y_pred, multioutput, dtype="numeric"): + """Check that y_true and y_pred belong to the same regression task + Parameters + ---------- + y_true : array-like + y_pred : array-like + multioutput : array-like or string in ['raw_values', uniform_average', + 'variance_weighted'] or None + None is accepted due to backward compatibility of r2_score(). + Returns + ------- + type_true : one of {'continuous', continuous-multioutput'} + The type of the true target data, as output by + 'utils.multiclass.type_of_target' + y_true : array-like of shape (n_samples, n_outputs) + Ground truth (correct) target values. + y_pred : array-like of shape (n_samples, n_outputs) + Estimated target values. + multioutput : array-like of shape (n_outputs) or string in ['raw_values', + uniform_average', 'variance_weighted'] or None + Custom output weights if ``multioutput`` is array-like or + just the corresponding argument if ``multioutput`` is a + correct keyword. + dtype: str or list, default="numeric" + the dtype argument passed to check_array + """ + check_consistent_length(y_true, y_pred) + y_true = check_array(y_true, ensure_2d=False, dtype=dtype) + y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype) + + # irrelevant in af, dim[1] always valid + #if y_true.numdims() == 1: + #y_true = y_true.reshape((-1, 1)) + + #if y_pred.numdims() == 1: + #y_pred = y_pred.reshape((-1, 1)) + + print(type(y_true)) + print(type(y_pred)) + if y_true.numdims() != 1 and y_pred.numdims() !=1: + if y_true.shape[1] != y_pred.shape[1]: + raise ValueError("y_true and y_pred have different number of output " + "({0}!={1})".format(y_true.shape[1], y_pred.shape[1])) + + n_outputs = 1 if y_true.numdims() == 1 else y_true.shape[1] + allowed_multioutput_str = ('raw_values', 'uniform_average', + 'variance_weighted') + if isinstance(multioutput, str): + if multioutput not in allowed_multioutput_str: + raise ValueError("Allowed 'multioutput' string values are {}. " + "You provided multioutput={!r}".format( + allowed_multioutput_str, + multioutput)) + elif multioutput is not None: + multioutput = check_array(multioutput, ensure_2d=False) + if n_outputs == 1: + raise ValueError("Custom weights are useful only in " + "multi-output cases.") + elif n_outputs != len(multioutput): + raise ValueError(("There must be equally many custom weights " + "(%d) as outputs (%d).") % + (len(multioutput), n_outputs)) + y_type = 'continuous' if n_outputs == 1 else 'continuous-multioutput' + + return y_type, y_true, y_pred, multioutput + + +@_deprecate_positional_args +def r2_score(y_true, y_pred, *, sample_weight=None, + multioutput="uniform_average"): + """R^2 (coefficient of determination) regression score function. 
+ Best possible score is 1.0 and it can be negative (because the + model can be arbitrarily worse). A constant model that always + predicts the expected value of y, disregarding the input features, + would get a R^2 score of 0.0. + Read more in the :ref:`User Guide `. + Parameters + ---------- + y_true : array-like of shape (n_samples,) or (n_samples, n_outputs) + Ground truth (correct) target values. + y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs) + Estimated target values. + sample_weight : array-like of shape (n_samples,), optional + Sample weights. + multioutput : string in ['raw_values', 'uniform_average', \ +'variance_weighted'] or None or array-like of shape (n_outputs) + Defines aggregating of multiple output scores. + Array-like value defines weights used to average scores. + Default is "uniform_average". + 'raw_values' : + Returns a full set of scores in case of multioutput input. + 'uniform_average' : + Scores of all outputs are averaged with uniform weight. + 'variance_weighted' : + Scores of all outputs are averaged, weighted by the variances + of each individual output. + .. versionchanged:: 0.19 + Default value of multioutput is 'uniform_average'. + Returns + ------- + z : float or ndarray of floats + The R^2 score or ndarray of scores if 'multioutput' is + 'raw_values'. + Notes + ----- + This is not a symmetric function. + Unlike most other scores, R^2 score may be negative (it need not actually + be the square of a quantity R). + This metric is not well-defined for single samples and will return a NaN + value if n_samples is less than two. + References + ---------- + .. [1] `Wikipedia entry on the Coefficient of determination + `_ + Examples + -------- + >>> from sklearn.metrics import r2_score + >>> y_true = [3, -0.5, 2, 7] + >>> y_pred = [2.5, 0.0, 2, 8] + >>> r2_score(y_true, y_pred) + 0.948... + >>> y_true = [[0.5, 1], [-1, 1], [7, -6]] + >>> y_pred = [[0, 2], [-1, 2], [8, -5]] + >>> r2_score(y_true, y_pred, + ... multioutput='variance_weighted') + 0.938... + >>> y_true = [1, 2, 3] + >>> y_pred = [1, 2, 3] + >>> r2_score(y_true, y_pred) + 1.0 + >>> y_true = [1, 2, 3] + >>> y_pred = [2, 2, 2] + >>> r2_score(y_true, y_pred) + 0.0 + >>> y_true = [1, 2, 3] + >>> y_pred = [3, 2, 1] + >>> r2_score(y_true, y_pred) + -3.0 + """ + y_type, y_true, y_pred, multioutput = _check_reg_targets( + y_true, y_pred, multioutput) + check_consistent_length(y_true, y_pred, sample_weight) + + if _num_samples(y_pred) < 2: + msg = "R^2 score is not well-defined with less than two samples." + warnings.warn(msg, UndefinedMetricWarning) + return float('nan') + + if sample_weight is not None: + sample_weight = column_or_1d(sample_weight) + weight = sample_weight[:, np.newaxis] + else: + weight = 1. 
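+
+    # Per-output R^2 is 1 - SS_res / SS_tot; both sums of squares below are
+    # (optionally sample-weighted) reductions over the sample axis computed
+    # on the device with af.sum(..., dim=0).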
+ + numerator = af.sum((weight * (y_true - y_pred) ** 2), dim=0) + #denominator = (weight * (y_true - np.average( + #y_true, axis=0, weights=sample_weight)) ** 2).sum(axis=0, + #dtype=np.float64) + denominator = af.sum((weight * (y_true - af.tile(af.mean(y_true, weights=sample_weight, dim=0), y_true.shape[0])) ** 2), dim=0) + + nonzero_denominator = denominator != 0 + nonzero_numerator = numerator != 0 + valid_score = nonzero_denominator & nonzero_numerator + y_sz_1 = 1 if y_true.numdims() == 1 else y_true.shape[1] + output_scores = af.constant(0, y_sz_1) + if(af.any_true(valid_score)): + output_scores[valid_score] = (1.0 - (numerator[valid_score] / + denominator[valid_score])).as_type(output_scores.dtype()) + # arbitrary set to zero to avoid -inf scores, having a constant + # y_true is not interesting for scoring a regression anyway + output_scores[nonzero_numerator & ~nonzero_denominator] = 0. + if isinstance(multioutput, str): + if multioutput == 'raw_values': + # return scores individually + return output_scores + elif multioutput == 'uniform_average': + # passing None as weights results is uniform mean + avg_weights = None + elif multioutput == 'variance_weighted': + avg_weights = denominator + # avoid fail on constant y or one-element arrays + if not af.any_true(nonzero_denominator): + if not af.any_true(nonzero_numerator): + return 1.0 + else: + return 0.0 + else: + avg_weights = multioutput + + #return np.average(output_scores, weights=avg_weights) + return af.mean(output_scores, weights=avg_weights) + + +class afRegressorMixin: + """Mixin class for all regression estimators in scikit-learn.""" + _estimator_type = "regressor" + + def score(self, X, y, sample_weight=None): + """Return the coefficient of determination R^2 of the prediction. + The coefficient R^2 is defined as (1 - u/v), where u is the residual + sum of squares ((y_true - y_pred) ** 2).sum() and v is the total + sum of squares ((y_true - y_true.mean()) ** 2).sum(). + The best possible score is 1.0 and it can be negative (because the + model can be arbitrarily worse). A constant model that always + predicts the expected value of y, disregarding the input features, + would get a R^2 score of 0.0. + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Test samples. For some estimators this may be a + precomputed kernel matrix or a list of generic objects instead, + shape = (n_samples, n_samples_fitted), + where n_samples_fitted is the number of + samples used in the fitting for the estimator. + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + True values for X. + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. + Returns + ------- + score : float + R^2 of self.predict(X) wrt. y. + Notes + ----- + The R2 score used when calling ``score`` on a regressor uses + ``multioutput='uniform_average'`` from version 0.23 to keep consistent + with default value of :func:`~sklearn.metrics.r2_score`. + This influences the ``score`` method of all the multioutput + regressors (except for + :class:`~sklearn.multioutput.MultiOutputRegressor`). 
+ """ + + y_pred = self.predict(X) + return r2_score(y, y_pred, sample_weight=sample_weight) + + def _more_tags(self): + return {'requires_y': True} + + diff --git a/afsklearn/afSKImputer.py b/afsklearn/afSKImputer.py new file mode 100644 index 0000000..96b7beb --- /dev/null +++ b/afsklearn/afSKImputer.py @@ -0,0 +1,389 @@ +from typing import Any, List, Dict, Optional, Sequence +from scipy import sparse +import os +import sklearn + +import numpy # NOTE numpy is used only to set types and nan values. FIXME with correct types from AF +# import arrayfire as af # FIXME uncommnet when line 300 is resolved +from numpy import ndarray # FIXME to arrayfire ndarray typing +import pandas + +# Custom import commands if any +from .afSimpleImputer import afSimpleImputer as SimpleImputer +# FIXME uncommnet when line 300 is resolved +# from .SKImputer_base_af import _get_mask + +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + statistics_: Optional[ndarray] + indicator_: Optional[sklearn.base.BaseEstimator] + input_column_names: Optional[pandas.core.indexes.base.Index] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[Dict]] + + +class Hyperparams(hyperparams.Hyperparams): + missing_values = hyperparams.Union( + configuration={ + 'int': hyperparams.Hyperparameter[int]( + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'float': hyperparams.Hyperparameter[float]( + default=numpy.nan, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }, + default='float', + description='The placeholder for the missing values. All occurrences of `missing_values` will be imputed.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + strategy = hyperparams.Enumeration[str]( + default='mean', + values=['median', 'most_frequent', 'mean', 'constant'], + description='The imputation strategy. - If "mean", then replace missing values using the mean along each column. Can only be used with numeric data. - If "median", then replace missing values using the median along each column. Can only be used with numeric data. - If "most_frequent", then replace missing using the most frequent value along each column. Can be used with strings or numeric data. - If "constant", then replace missing values with fill_value. Can be used with strings or numeric data. .. 
versionadded:: 0.20 strategy="constant" for fixed value imputation.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + add_indicator = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + fill_value = hyperparams.Union( + configuration={ + 'int': hyperparams.Hyperparameter[int]( + default=0, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'none': hyperparams.Constant( + default=None, + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }, + default='none', + description='When strategy == "constant", fill_value is used to replace all occurrences of missing_values. If left to the default, fill_value will be 0 when imputing numerical data and "missing_value" for strings or object data types.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + +class SKImputer(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive for ArrayFire accelerated variant of sklearn SimpleImputer + `sklearn documentation `_ + + """ + + __author__ = "ArrayFire" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.IMPUTATION, ], + "name": "ArrayFire.impute.SimpleImputer", + "primitive_family": metadata_base.PrimitiveFamily.DATA_CLEANING, + "python_path": "d3m.primitives.data_cleaning.imputer.ArrayFire", + "source": {'name': 'ArrayFire', 'contact': 'mailto:support@arrayfire.com', 'uris': ['https://github.com/arrayfire/d3m-arrayfire-primitives.git']}, + "version": "0.1.0", + "id": "21709973-f877-4700-8675-92ac10a208d3", + "hyperparams_to_tune": ['strategy'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://github.com/arrayfire/d3m-arrayfire-primitives.git@{git_commit}#egg=af_primitives'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: int = 0) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = SimpleImputer( + missing_values=self.hyperparams['missing_values'], + strategy=self.hyperparams['strategy'], + add_indicator=self.hyperparams['add_indicator'], + fill_value=self.hyperparams['fill_value'], + verbose=_verbose + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[Dict] = None + self._input_column_names = None + self._fitted = False + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices, _ = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns.astype(str) + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use, _ = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.transform(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise 
PrimitiveNotFittedError("Primitive not fitted.") from error + if sparse.issparse(sk_output): + sk_output = pandas.DataFrame.sparse.from_spmatrix(sk_output) + target_columns_metadata = self._copy_columns_metadata( + inputs.metadata, self._training_indices, self.hyperparams) + output = self._wrap_predictions(inputs, sk_output, target_columns_metadata) + + output.columns = [ + inputs.columns[idx] for idx in range(len(inputs.columns)) if idx in self._training_indices] + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + _, _, dropped_cols = self._get_columns_to_fit(inputs, self.hyperparams) + outputs = base_utils.combine_columns( + return_result=self.hyperparams['return_result'], add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices + dropped_cols, columns_list=output) + return CallResult(outputs) + + def get_params(self) -> Params: + if not self._fitted: + return Params( + statistics_=None, + indicator_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + statistics_=getattr(self._clf, 'statistics_', None), + indicator_=getattr(self._clf, 'indicator_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.statistics_ = params['statistics_'] + self._clf.indicator_ = params['indicator_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['statistics_'] is not None: + self._fitted = True + if params['indicator_'] is not None: + self._fitted = True + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + columns_to_produce = list(range(len(inputs.columns))) + + else: + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use( + inputs_metadata, use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], can_use_column=can_produce_column) + + columns_to_drop = cls._get_columns_to_drop(inputs, columns_to_produce, hyperparams) + for col in columns_to_drop: + columns_to_produce.remove(col) + + return inputs.iloc[:, columns_to_produce], columns_to_produce, columns_to_drop + + @classmethod + def _get_columns_to_drop(cls, inputs: Inputs, column_indices: List[int], hyperparams: Hyperparams): + """ + Check for columns that contain missing_values that need to be imputed + If strategy is constant and missin_values is nan, then all nan columns will not be dropped + :param inputs: + :param column_indices: + :return: + """ + columns_to_remove = [] + if hyperparams['strategy'] != "constant": + for _, col in 
enumerate(column_indices): + # BUG + # FIXME with uncomment below when resolved + # inp = inputs.iloc[:, [col]].values + # mask = _get_mask(inp, hyperparams['missing_values']) + # if af.all_true(mask): + # columns_to_remove.append(col) + # FIXME remove pass when bug is fixed + pass + return columns_to_remove + + @classmethod + def _can_produce_column( + cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[Dict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[Dict] = [] + for column_index in range(outputs_length): + column_metadata = dict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata( + cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[Dict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray, target_columns_metadata) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + @classmethod + def _copy_columns_metadata( + cls, inputs_metadata: metadata_base.DataMetadata, column_indices, hyperparams) -> List[Dict]: + + target_columns_metadata: List[Dict] = [] + for column_index in column_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + column_metadata = dict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove 
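+            # Attach the configured return_semantic_type so downstream
+            # primitives can identify the generated attribute columns.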
+ semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +afSKImputer.__doc__ = SimpleImputer.__doc__ diff --git a/afsklearn/afSKMLPClassifier.py b/afsklearn/afSKMLPClassifier.py new file mode 100644 index 0000000..43cd26e --- /dev/null +++ b/afsklearn/afSKMLPClassifier.py @@ -0,0 +1,740 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing +import pandas + +# Custom import commands if any +from af_multilayer_perceptron import afMLPClassifier + + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions + + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + classes_: Optional[ndarray] + loss_: Optional[float] + coefs_: Optional[Sequence[Any]] + intercepts_: Optional[Sequence[Any]] + n_iter_: Optional[int] + n_layers_: Optional[int] + n_outputs_: Optional[int] + out_activation_: Optional[str] + _best_coefs: Optional[Sequence[Any]] + _best_intercepts: Optional[Sequence[Any]] + _label_binarizer: Optional[sklearn.preprocessing.LabelBinarizer] + _no_improvement_count: Optional[int] + _random_state: Optional[numpy.random.mtrand.RandomState] + best_validation_score_: Optional[numpy.float64] + loss_curve_: Optional[Sequence[Any]] + t_: Optional[int] + _optimizer: Optional[sklearn.neural_network._stochastic_optimizers.AdamOptimizer] + validation_scores_: Optional[Sequence[Any]] + input_column_names: Optional[pandas.core.indexes.base.Index] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + hidden_layer_sizes = hyperparams.List( + elements=hyperparams.Bounded(1, None, 100), + default=(100, ), + min_size=1, + max_size=None, + description='The ith element represents the number of neurons in the ith hidden layer.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + activation = hyperparams.Enumeration[str]( + values=['identity', 'logistic', 'tanh', 'relu'], + default='relu', + description='Activation function for the hidden layer. - \'identity\', no-op activation, useful to implement linear bottleneck, returns f(x) = x - \'logistic\', the logistic sigmoid function, returns f(x) = 1 / (1 + exp(-x)). - \'tanh\', the hyperbolic tan function, returns f(x) = tanh(x). 
- \'relu\', the rectified linear unit function, returns f(x) = max(0, x)', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + solver = hyperparams.Choice( + choices={ + 'lbfgs': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'max_fun': hyperparams.Bounded[int]( + default=15000, + lower=1, + upper=None, + description='Maximum number of loss function calls', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'sgd': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'learning_rate': hyperparams.Enumeration[str]( + values=['constant', 'invscaling', 'adaptive'], + default='constant', + description='Learning rate schedule for weight updates. Only used when solver=’sgd’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'learning_rate_init': hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.001, + description='The initial learning rate used. It controls the step-size in updating the weights. Only used when solver=’sgd’ or ‘adam’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'power_t': hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.5, + description='The exponent for inverse scaling learning rate. Only used when solver=’sgd’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'shuffle': hyperparams.UniformBool( + default=True, + description='Whether to shuffle samples in each iteration. Only used when solver=’sgd’ or ‘adam’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'momentum': hyperparams.Bounded[float]( + default=0.9, + lower=0, + upper=1, + description='Momentum for gradient descent update. Should be between 0 and 1. Only used when solver=’sgd’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'nesterovs_momentum': hyperparams.UniformBool( + default=True, + description='Whether to use Nesterov’s momentum. Only used when solver=’sgd’ and momentum > 0.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'early_stopping': hyperparams.UniformBool( + default=False, + description='Whether to use early stopping to terminate training when validation score is not improving.If set to true, it will automatically set aside 10% of training data as validation and terminate training when validation score is not improving by at least tol for n_iter_no_change consecutive epochs.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'n_iter_no_change': hyperparams.Bounded[int]( + default=10, + lower=1, + upper=None, + description='Maximum number of epochs to not meet tol improvement. Only effective when solver=’sgd’ or ‘adam’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'adam': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'learning_rate_init': hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.001, + description='The initial learning rate used. It controls the step-size in updating the weights. Only used when solver=’sgd’ or ‘adam’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'shuffle': hyperparams.UniformBool( + default=True, + description='Whether to shuffle samples in each iteration. 
Only used when solver=’sgd’ or ‘adam’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'early_stopping': hyperparams.UniformBool( + default=False, + description='Whether to use early stopping to terminate training when validation score is not improving.If set to true, it will automatically set aside 10% of training data as validation and terminate training when validation score is not improving by at least tol for n_iter_no_change consecutive epochs.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'beta_1': hyperparams.Bounded[float]( + default=0.9, + lower=0, + upper=1, + description='Exponential decay rate for estimates of first moment vector in adam, should be in [0, 1).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'beta_2': hyperparams.Bounded[float]( + default=0.999, + lower=0, + upper=1, + description='Exponential decay rate for estimates of second moment vector in adam, should be in [0, 1).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'epsilon': hyperparams.Bounded[float]( + default=1e-08, + lower=0, + upper=None, + description='Value for numerical stability in adam. Only used when solver=’adam’', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'n_iter_no_change': hyperparams.Bounded[int]( + default=10, + lower=1, + upper=None, + description='Maximum number of epochs to not meet tol improvement. Only effective when solver=’sgd’ or ‘adam’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ) + }, + default='adam', + description='The solver for weight optimization. - \'lbfgs\' is an optimizer in the family of quasi-Newton methods. - \'sgd\' refers to stochastic gradient descent. - \'adam\' refers to a stochastic gradient-based optimizer proposed by Kingma, Diederik, and Jimmy Ba Note: The default solver \'adam\' works pretty well on relatively large datasets (with thousands of training samples or more) in terms of both training time and validation score. For small datasets, however, \'lbfgs\' can converge faster and perform better.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + alpha = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.0001, + description='L2 penalty (regularization term) parameter.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + batch_size = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=16, + description='Size of minibatches for stochastic optimizers. If the solver is ‘lbfgs’, the classifier will not use minibatch', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'auto': hyperparams.Constant( + default='auto', + description='When set to “auto”, batch_size=min(200, n_samples)', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='auto', + description='Size of minibatches for stochastic optimizers. If the solver is \'lbfgs\', the classifier will not use minibatch. When set to "auto", `batch_size=min(200, n_samples)`', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_iter = hyperparams.Bounded[int]( + lower=0, + upper=None, + default=200, + description='Maximum number of iterations. 
The solver iterates until convergence (determined by \'tol\') or this number of iterations. For stochastic solvers (\'sgd\', \'adam\'), note that this determines the number of epochs (how many times each data point will be used), not the number of gradient steps.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + tol = hyperparams.Bounded[float]( + default=0.0001, + lower=0, + upper=None, + description='Tolerance for the optimization. When the loss or score is not improving by at least ``tol`` for ``n_iter_no_change`` consecutive iterations, unless ``learning_rate`` is set to \'adaptive\', convergence is considered to be reached and training stops.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + validation_fraction = hyperparams.Bounded[float]( + default=0.1, + lower=0, + upper=None, + description='The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. Only used if early_stopping is True', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + warm_start = hyperparams.UniformBool( + default=False, + description='When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. See :term:`the Glossary `.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. 
Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class afSKMLPClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], + ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive for ArrayFire accelerated variant of sklearn MLPClassifier + `sklearn documentation `_ + + """ + + __author__ = "ArrayFire" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.MULTILAYER_PERCEPTRON, ], + "name": "ArrayFire.afMLPClassifier", + "primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, + "python_path": "d3m.primitives.classification.mlp.ArrayFire", + "source": {'name': 'ArrayFire', 'contact': 'mailto:support@arrayfire.com', 'uris': ['https://github.com/arrayfire/d3m-arrayfire-primitives.git']}, + "version": "0.1.0", + "id": "dcc94f2b-8c70-4095-ba21-ff923ab1bae2", + "hyperparams_to_tune": ['hidden_layer_sizes', 'activation', 'solver', 'alpha'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://github.com/arrayfire/d3m-arrayfire-primitives.git@{git_commit}#egg=af_primitives'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: bool = False) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = afMLPClassifier( + hidden_layer_sizes=self.hyperparams['hidden_layer_sizes'], + activation=self.hyperparams['activation'], + solver=self.hyperparams['solver']['choice'], + learning_rate=self.hyperparams['solver'].get('learning_rate', 'constant'), + learning_rate_init=self.hyperparams['solver'].get('learning_rate_init', 0.001), + power_t=self.hyperparams['solver'].get('power_t', 0.5), + shuffle=self.hyperparams['solver'].get('shuffle', True), + momentum=self.hyperparams['solver'].get('momentum', 0.9), + nesterovs_momentum=self.hyperparams['solver'].get('nesterovs_momentum', True), + early_stopping=self.hyperparams['solver'].get('early_stopping', False), + beta_1=self.hyperparams['solver'].get('beta_1', 0.9), + beta_2=self.hyperparams['solver'].get('beta_2', 0.999), + epsilon=self.hyperparams['solver'].get('epsilon', 
1e-08), + n_iter_no_change=self.hyperparams['solver'].get('n_iter_no_change', 10), + max_fun=self.hyperparams['solver'].get('max_fun', 15000), + alpha=self.hyperparams['alpha'], + batch_size=self.hyperparams['batch_size'], + max_iter=self.hyperparams['max_iter'], + tol=self.hyperparams['tol'], + validation_fraction=self.hyperparams['validation_fraction'], + warm_start=self.hyperparams['warm_start'], + verbose=_verbose, + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns.astype(str) + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = pandas.DataFrame.sparse.from_spmatrix(sk_output) + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + classes_=None, + 
loss_=None, + coefs_=None, + intercepts_=None, + n_iter_=None, + n_layers_=None, + n_outputs_=None, + out_activation_=None, + _best_coefs=None, + _best_intercepts=None, + _label_binarizer=None, + _no_improvement_count=None, + _random_state=None, + best_validation_score_=None, + loss_curve_=None, + t_=None, + _optimizer=None, + validation_scores_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + classes_=getattr(self._clf, 'classes_', None), + loss_=getattr(self._clf, 'loss_', None), + coefs_=getattr(self._clf, 'coefs_', None), + intercepts_=getattr(self._clf, 'intercepts_', None), + n_iter_=getattr(self._clf, 'n_iter_', None), + n_layers_=getattr(self._clf, 'n_layers_', None), + n_outputs_=getattr(self._clf, 'n_outputs_', None), + out_activation_=getattr(self._clf, 'out_activation_', None), + _best_coefs=getattr(self._clf, '_best_coefs', None), + _best_intercepts=getattr(self._clf, '_best_intercepts', None), + _label_binarizer=getattr(self._clf, '_label_binarizer', None), + _no_improvement_count=getattr(self._clf, '_no_improvement_count', None), + _random_state=getattr(self._clf, '_random_state', None), + best_validation_score_=getattr(self._clf, 'best_validation_score_', None), + loss_curve_=getattr(self._clf, 'loss_curve_', None), + t_=getattr(self._clf, 't_', None), + _optimizer=getattr(self._clf, '_optimizer', None), + validation_scores_=getattr(self._clf, 'validation_scores_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.classes_ = params['classes_'] + self._clf.loss_ = params['loss_'] + self._clf.coefs_ = params['coefs_'] + self._clf.intercepts_ = params['intercepts_'] + self._clf.n_iter_ = params['n_iter_'] + self._clf.n_layers_ = params['n_layers_'] + self._clf.n_outputs_ = params['n_outputs_'] + self._clf.out_activation_ = params['out_activation_'] + self._clf._best_coefs = params['_best_coefs'] + self._clf._best_intercepts = params['_best_intercepts'] + self._clf._label_binarizer = params['_label_binarizer'] + self._clf._no_improvement_count = params['_no_improvement_count'] + self._clf._random_state = params['_random_state'] + self._clf.best_validation_score_ = params['best_validation_score_'] + self._clf.loss_curve_ = params['loss_curve_'] + self._clf.t_ = params['t_'] + self._clf._optimizer = params['_optimizer'] + self._clf.validation_scores_ = params['validation_scores_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['classes_'] is not None: + self._fitted = True + if params['loss_'] is not None: + self._fitted = True + if params['coefs_'] is not None: + self._fitted = True + if params['intercepts_'] is not None: + self._fitted = True + if params['n_iter_'] is not None: + self._fitted = True + if params['n_layers_'] is not None: + self._fitted = True + if params['n_outputs_'] is not None: + self._fitted = True + if 
params['out_activation_'] is not None: + self._fitted = True + if params['_best_coefs'] is not None: + self._fitted = True + if params['_best_intercepts'] is not None: + self._fitted = True + if params['_label_binarizer'] is not None: + self._fitted = True + if params['_no_improvement_count'] is not None: + self._fitted = True + if params['_random_state'] is not None: + self._fitted = True + if params['best_validation_score_'] is not None: + self._fitted = True + if params['loss_curve_'] is not None: + self._fitted = True + if params['t_'] is not None: + self._fitted = True + if params['_optimizer'] is not None: + self._fitted = True + if params['validation_scores_'] is not None: + self._fitted = True + + + def log_likelihoods(self, *, + outputs: Outputs, + inputs: Inputs, + timeout: float = None, + iterations: int = None) -> CallResult[Sequence[float]]: + inputs = inputs.iloc[:, self._training_indices] # Get ndarray + outputs = outputs.iloc[:, self._target_column_indices] + + if len(inputs.columns) and len(outputs.columns): + + if outputs.shape[1] != self._clf.n_outputs_: + raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.") + + log_proba = self._clf.predict_log_proba(inputs) + + # Making it always a list, even when only one target. + if self._clf.n_outputs_ == 1: + log_proba = [log_proba] + classes = [self._clf.classes_] + else: + classes = self._clf.classes_ + + samples_length = inputs.shape[0] + + log_likelihoods = [] + for k in range(self._clf.n_outputs_): + # We have to map each class to its internal (numerical) index used in the learner. + # This allows "outputs" to contain string classes. + outputs_column = outputs.iloc[:, k] + classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k]) + mapped_outputs_column = outputs_column.map(classes_map) + + # For each target column (column in "outputs"), for each sample (row) we pick the log + # likelihood for a given class. 
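+            # (Advanced indexing: for sample i this picks log_proba[k][i, mapped_outputs_column[i]],
+            # i.e. the log-probability the model assigned to that sample's observed class.)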
+ log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column]) + + results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True) + results.columns = outputs.columns + + for k in range(self._clf.n_outputs_): + column_metadata = outputs.metadata.query_column(k) + if 'name' in column_metadata: + results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']}) + + else: + results = d3m_dataframe(generate_metadata=True) + + return CallResult(results) + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, 
hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +afSKMLPClassifier.__doc__ = afMLPClassifier.__doc__ diff --git a/afsklearn/afSKMLPRegressor.py b/afsklearn/afSKMLPRegressor.py new file mode 100644 index 0000000..0ef9c56 --- /dev/null +++ b/afsklearn/afSKMLPRegressor.py @@ -0,0 +1,677 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import sklearn +import numpy +import typing +import pandas + +# Custom import commands if any +from af_multilayer_perceptron import afMLPRegressor + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer 
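+# NOTE: the regressor wrapper below mirrors afSKMLPClassifier above, minus the
+# classifier-only state (classes_, _label_binarizer) and log_likelihoods support.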
+ +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions + + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + loss_: Optional[float] + coefs_: Optional[Sequence[Any]] + intercepts_: Optional[Sequence[Any]] + n_iter_: Optional[int] + n_layers_: Optional[int] + n_outputs_: Optional[int] + out_activation_: Optional[str] + _best_coefs: Optional[Sequence[Any]] + _best_intercepts: Optional[Sequence[Any]] + _no_improvement_count: Optional[int] + _random_state: Optional[numpy.random.mtrand.RandomState] + best_validation_score_: Optional[numpy.float64] + loss_curve_: Optional[Sequence[Any]] + t_: Optional[int] + _optimizer: Optional[sklearn.neural_network._stochastic_optimizers.AdamOptimizer] + validation_scores_: Optional[Sequence[Any]] + input_column_names: Optional[pandas.core.indexes.base.Index] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + + +class Hyperparams(hyperparams.Hyperparams): + hidden_layer_sizes = hyperparams.List( + elements=hyperparams.Bounded(1, None, 100), + default=(100, ), + min_size=1, + max_size=None, + description='The ith element represents the number of neurons in the ith hidden layer.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + activation = hyperparams.Enumeration[str]( + values=['identity', 'logistic', 'tanh', 'relu'], + default='relu', + description='Activation function for the hidden layer. - \'identity\', no-op activation, useful to implement linear bottleneck, returns f(x) = x - \'logistic\', the logistic sigmoid function, returns f(x) = 1 / (1 + exp(-x)). - \'tanh\', the hyperbolic tan function, returns f(x) = tanh(x). - \'relu\', the rectified linear unit function, returns f(x) = max(0, x)', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + solver = hyperparams.Choice( + choices={ + 'lbfgs': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'max_fun': hyperparams.Bounded[int]( + default=15000, + lower=1, + upper=None, + description='Maximum number of loss function calls', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'sgd': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'learning_rate': hyperparams.Enumeration[str]( + values=['constant', 'invscaling', 'adaptive'], + default='constant', + description='Learning rate schedule for weight updates. Only used when solver=’sgd’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'learning_rate_init': hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.001, + description='The initial learning rate used. It controls the step-size in updating the weights. Only used when solver=’sgd’ or ‘adam’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'power_t': hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.5, + description='The exponent for inverse scaling learning rate. Only used when solver=’sgd’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'shuffle': hyperparams.UniformBool( + default=True, + description='Whether to shuffle samples in each iteration. 
Only used when solver=’sgd’ or ‘adam’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'momentum': hyperparams.Bounded[float]( + default=0.9, + lower=0, + upper=1, + description='Momentum for gradient descent update. Should be between 0 and 1. Only used when solver=’sgd’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'nesterovs_momentum': hyperparams.UniformBool( + default=True, + description='Whether to use Nesterov’s momentum. Only used when solver=’sgd’ and momentum > 0.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'early_stopping': hyperparams.UniformBool( + default=False, + description='Whether to use early stopping to terminate training when validation score is not improving.If set to true, it will automatically set aside 10% of training data as validation and terminate training when validation score is not improving by at least tol for n_iter_no_change consecutive epochs.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'n_iter_no_change': hyperparams.Bounded[int]( + default=10, + lower=1, + upper=None, + description='Maximum number of epochs to not meet tol improvement. Only effective when solver=’sgd’ or ‘adam’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ), + 'adam': hyperparams.Hyperparams.define( + configuration=OrderedDict({ + 'learning_rate_init': hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.001, + description='The initial learning rate used. It controls the step-size in updating the weights. Only used when solver=’sgd’ or ‘adam’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'shuffle': hyperparams.UniformBool( + default=True, + description='Whether to shuffle samples in each iteration. Only used when solver=’sgd’ or ‘adam’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'early_stopping': hyperparams.UniformBool( + default=False, + description='Whether to use early stopping to terminate training when validation score is not improving.If set to true, it will automatically set aside 10% of training data as validation and terminate training when validation score is not improving by at least tol for n_iter_no_change consecutive epochs.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'beta_1': hyperparams.Bounded[float]( + default=0.9, + lower=0, + upper=1, + description='Exponential decay rate for estimates of first moment vector in adam, should be in [0, 1).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'beta_2': hyperparams.Bounded[float]( + default=0.999, + lower=0, + upper=1, + description='Exponential decay rate for estimates of second moment vector in adam, should be in [0, 1).', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'epsilon': hyperparams.Bounded[float]( + default=1e-08, + lower=0, + upper=None, + description='Value for numerical stability in adam. Only used when solver=’adam’', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ), + 'n_iter_no_change': hyperparams.Bounded[int]( + default=10, + lower=1, + upper=None, + description='Maximum number of epochs to not meet tol improvement. 
Only effective when solver=’sgd’ or ‘adam’.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + }) + ) + }, + default='adam', + description='The solver for weight optimization. - \'lbfgs\' is an optimizer in the family of quasi-Newton methods. - \'sgd\' refers to stochastic gradient descent. - \'adam\' refers to a stochastic gradient-based optimizer proposed by Kingma, Diederik, and Jimmy Ba Note: The default solver \'adam\' works pretty well on relatively large datasets (with thousands of training samples or more) in terms of both training time and validation score. For small datasets, however, \'lbfgs\' can converge faster and perform better.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + alpha = hyperparams.Bounded[float]( + lower=0, + upper=None, + default=0.0001, + description='L2 penalty (regularization term) parameter.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + batch_size = hyperparams.Union( + configuration=OrderedDict({ + 'int': hyperparams.Bounded[int]( + lower=0, + upper=None, + default=16, + description='Size of minibatches for stochastic optimizers. If the solver is \'lbfgs\', the classifier will not use minibatch', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ), + 'auto': hyperparams.Constant( + default='auto', + description='When set to \'auto\', batch_size=min(200, n_samples)', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], + ) + }), + default='auto', + description='Size of minibatches for stochastic optimizers. If the solver is \'lbfgs\', the classifier will not use minibatch. When set to "auto", `batch_size=min(200, n_samples)`', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_iter = hyperparams.Bounded[int]( + lower=0, + upper=None, + default=200, + description='Maximum number of iterations. The solver iterates until convergence (determined by \'tol\') or this number of iterations. For stochastic solvers (\'sgd\', \'adam\'), note that this determines the number of epochs (how many times each data point will be used), not the number of gradient steps.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + tol = hyperparams.Bounded[float]( + default=0.0001, + lower=0, + upper=None, + description='Tolerance for the optimization. When the loss or score is not improving by at least ``tol`` for ``n_iter_no_change`` consecutive iterations, unless ``learning_rate`` is set to \'adaptive\', convergence is considered to be reached and training stops.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + warm_start = hyperparams.UniformBool( + default=False, + description='When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. See :term:`the Glossary `.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + validation_fraction = hyperparams.Bounded[float]( + default=0.1, + lower=0, + upper=None, + description='The proportion of training data to set aside as validation set for early stopping. Must be between 0 and 1. 
Only used if early_stopping is True', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class afSKMLPRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive for ArrayFire accelerated variant of sklearn MLPRegressor + `sklearn documentation `_ + + """ + + __author__ = "JPL MARVIN" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.MULTILAYER_PERCEPTRON, ], + "name": "sklearn.neural_network.multilayer_perceptron.afMLPRegressor", + "primitive_family": metadata_base.PrimitiveFamily.REGRESSION, + "python_path": "d3m.primitives.regression.mlp.ArrayFire", + "source": {'name': 'ArrayFire', 'contact': 'mailto:support@arrayfire.com', 'uris': ['https://github.com/arrayfire/d3m-arrayfire-primitives.git']}, + "version": "0.1.0", + "id": "36762aa3-d8d7-4875-a17c-384b3a7d2d7c", + "hyperparams_to_tune": ['hidden_layer_sizes', 'activation', 'solver', 'alpha'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://github.com/arrayfire/d3m-arrayfire-primitives.git@{git_commit}#egg=af_primitives'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: bool = False) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = afMLPRegressor( + hidden_layer_sizes=self.hyperparams['hidden_layer_sizes'], + activation=self.hyperparams['activation'], + solver=self.hyperparams['solver']['choice'], + learning_rate=self.hyperparams['solver'].get('learning_rate', 'constant'), + learning_rate_init=self.hyperparams['solver'].get('learning_rate_init', 0.001), + power_t=self.hyperparams['solver'].get('power_t', 0.5), + shuffle=self.hyperparams['solver'].get('shuffle', True), + momentum=self.hyperparams['solver'].get('momentum', 0.9), + nesterovs_momentum=self.hyperparams['solver'].get('nesterovs_momentum', True), + early_stopping=self.hyperparams['solver'].get('early_stopping', False), + beta_1=self.hyperparams['solver'].get('beta_1', 0.9), + beta_2=self.hyperparams['solver'].get('beta_2', 0.999), + epsilon=self.hyperparams['solver'].get('epsilon', 1e-08), + n_iter_no_change=self.hyperparams['solver'].get('n_iter_no_change', 10), + max_fun=self.hyperparams['solver'].get('max_fun', 15000), + alpha=self.hyperparams['alpha'], + batch_size=self.hyperparams['batch_size'], + max_iter=self.hyperparams['max_iter'], + tol=self.hyperparams['tol'], + warm_start=self.hyperparams['warm_start'], + validation_fraction=self.hyperparams['validation_fraction'], + verbose=_verbose, + random_state=self.random_seed, + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: 
List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns.astype(str) + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = numpy.ravel(sk_training_output) + + self._clf.fit(self._training_inputs, sk_training_output) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.predict(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(sk_output): + sk_output = pandas.DataFrame.sparse.from_spmatrix(sk_output) + output = self._wrap_predictions(inputs, sk_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + loss_=None, + coefs_=None, + intercepts_=None, + n_iter_=None, + n_layers_=None, + n_outputs_=None, + out_activation_=None, + _best_coefs=None, + _best_intercepts=None, + _no_improvement_count=None, + _random_state=None, + best_validation_score_=None, + loss_curve_=None, + t_=None, + _optimizer=None, + validation_scores_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + loss_=getattr(self._clf, 'loss_', None), + coefs_=getattr(self._clf, 'coefs_', None), + intercepts_=getattr(self._clf, 
'intercepts_', None), + n_iter_=getattr(self._clf, 'n_iter_', None), + n_layers_=getattr(self._clf, 'n_layers_', None), + n_outputs_=getattr(self._clf, 'n_outputs_', None), + out_activation_=getattr(self._clf, 'out_activation_', None), + _best_coefs=getattr(self._clf, '_best_coefs', None), + _best_intercepts=getattr(self._clf, '_best_intercepts', None), + _no_improvement_count=getattr(self._clf, '_no_improvement_count', None), + _random_state=getattr(self._clf, '_random_state', None), + best_validation_score_=getattr(self._clf, 'best_validation_score_', None), + loss_curve_=getattr(self._clf, 'loss_curve_', None), + t_=getattr(self._clf, 't_', None), + _optimizer=getattr(self._clf, '_optimizer', None), + validation_scores_=getattr(self._clf, 'validation_scores_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.loss_ = params['loss_'] + self._clf.coefs_ = params['coefs_'] + self._clf.intercepts_ = params['intercepts_'] + self._clf.n_iter_ = params['n_iter_'] + self._clf.n_layers_ = params['n_layers_'] + self._clf.n_outputs_ = params['n_outputs_'] + self._clf.out_activation_ = params['out_activation_'] + self._clf._best_coefs = params['_best_coefs'] + self._clf._best_intercepts = params['_best_intercepts'] + self._clf._no_improvement_count = params['_no_improvement_count'] + self._clf._random_state = params['_random_state'] + self._clf.best_validation_score_ = params['best_validation_score_'] + self._clf.loss_curve_ = params['loss_curve_'] + self._clf.t_ = params['t_'] + self._clf._optimizer = params['_optimizer'] + self._clf.validation_scores_ = params['validation_scores_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['loss_'] is not None: + self._fitted = True + if params['coefs_'] is not None: + self._fitted = True + if params['intercepts_'] is not None: + self._fitted = True + if params['n_iter_'] is not None: + self._fitted = True + if params['n_layers_'] is not None: + self._fitted = True + if params['n_outputs_'] is not None: + self._fitted = True + if params['out_activation_'] is not None: + self._fitted = True + if params['_best_coefs'] is not None: + self._fitted = True + if params['_best_intercepts'] is not None: + self._fitted = True + if params['_no_improvement_count'] is not None: + self._fitted = True + if params['_random_state'] is not None: + self._fitted = True + if params['best_validation_score_'] is not None: + self._fitted = True + if params['loss_curve_'] is not None: + self._fitted = True + if params['t_'] is not None: + self._fitted = True + if params['_optimizer'] is not None: + self._fitted = True + if params['validation_scores_'] is not None: + self._fitted = True + + + + + + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + 
columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
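+            # TrueTarget/SuggestedTarget are removed; PredictedTarget plus the configured
+            # 'return_semantic_type' are added to each output column's metadata.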
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +afSKMLPRegressor.__doc__ = afMLPRegressor.__doc__ diff --git a/afsklearn/afSKStringImputer.py b/afsklearn/afSKStringImputer.py new file mode 100644 index 0000000..debb698 --- /dev/null +++ b/afsklearn/afSKStringImputer.py @@ -0,0 +1,364 @@ +from typing import Any, List, Dict, Optional, Sequence +from numpy import ndarray # FIXME to arrayfire ndarray typing +from scipy import sparse +import os +import sklearn +import pandas + +# Custom import commands if any +from .afSimpleImputer import afSimpleImputer as SimpleImputer +from .afSimpleImputer import _get_mask + +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + statistics_: Optional[ndarray] + indicator_: Optional[sklearn.base.BaseEstimator] + input_column_names: Optional[pandas.core.indexes.base.Index] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + 
target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[Dict]] + + +class Hyperparams(hyperparams.Hyperparams): + missing_values = hyperparams.Hyperparameter[str]( + default='', + description='The placeholder for the missing values. All occurrences of `missing_values` will be imputed.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + add_indicator = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + strategy = hyperparams.Enumeration[str]( + default='most_frequent', + values=['most_frequent', 'constant'], + description='The imputation strategy. - If "mean", then replace missing values using the mean along each column. Can only be used with numeric data. - If "median", then replace missing values using the median along each column. Can only be used with numeric data. - If "most_frequent", then replace missing using the most frequent value along each column. Can be used with strings or numeric data. - If "constant", then replace missing values with fill_value. Can be used with strings or numeric data. .. versionadded:: 0.20 strategy="constant" for fixed value imputation.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + fill_value = hyperparams.Hyperparameter[str]( + default='', + description='When strategy == "constant", fill_value is used to replace all occurrences of missing_values. If left to the default, fill_value will be 0 when imputing numerical data and "missing_value" for strings or object data types.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + + use_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", + ) + exclude_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. 
Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', + 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], + default='https://metadata.datadrivendiscovery.org/types/Attribute', + description='Decides what semantic type to attach to generated attributes', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + +class SKStringImputer(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive for ArrayFire accelerated variant of sklearn SimpleImputer + `sklearn documentation `_ + + """ + + __author__ = "ArrayFire" + metadata = metadata_base.PrimitiveMetadata({ + "algorithm_types": [metadata_base.PrimitiveAlgorithmType.IMPUTATION, ], + "name": "ArrayFire.impute.SimpleImputer", + "primitive_family": metadata_base.PrimitiveFamily.DATA_CLEANING, + "python_path": "d3m.primitives.data_cleaning.string_imputer.ArrayFire", + "source": {'name': 'ArrayFire', 'contact': 'mailto:support@arrayfire.com', 'uris': ['https://github.com/arrayfire/d3m-arrayfire-primitives.git']}, + "version": "0.1.0", + "id": "3b7a2e76-5277-45f9-b361-419af3127f61", + "hyperparams_to_tune": ['strategy'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://github.com/arrayfire/d3m-arrayfire-primitives.git@{git_commit}#egg=af_primitives'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }] + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: int = 0) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + # False + self._clf = SimpleImputer( + missing_values=self.hyperparams['missing_values'], + add_indicator=self.hyperparams['add_indicator'], + strategy=self.hyperparams['strategy'], + fill_value=self.hyperparams['fill_value'], + verbose=_verbose + ) + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[Dict] = None + self._input_column_names = None + self._fitted = False + + def set_training_data(self, *, inputs: Inputs) -> None: + self._inputs = inputs + self._fitted = False + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._fitted: + return CallResult(None) + + self._training_inputs, self._training_indices, _ = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns.astype(str) + + if self._training_inputs is None: + return CallResult(None) + + if len(self._training_indices) > 0: + self._clf.fit(self._training_inputs) + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + return CallResult(None) + + def produce(self, *, 
inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use, _ = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + sk_output = self._clf.transform(sk_inputs) + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + if sparse.issparse(sk_output): + sk_output = pandas.DataFrame.sparse.from_spmatrix(sk_output) + target_columns_metadata = self._copy_columns_metadata( + inputs.metadata, self._training_indices, self.hyperparams) + output = self._wrap_predictions(inputs, sk_output, target_columns_metadata) + + output.columns = [inputs.columns[idx] + for idx in range(len(inputs.columns)) if idx in self._training_indices] + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + _, _, dropped_cols = self._get_columns_to_fit(inputs, self.hyperparams) + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._training_indices + dropped_cols, + columns_list=output) + return CallResult(outputs) + + def get_params(self) -> Params: + if not self._fitted: + return Params( + statistics_=None, + indicator_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + statistics_=getattr(self._clf, 'statistics_', None), + indicator_=getattr(self._clf, 'indicator_', None), + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + def set_params(self, *, params: Params) -> None: + self._clf.statistics_ = params['statistics_'] + self._clf.indicator_ = params['indicator_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + if params['statistics_'] is not None: + self._fitted = True + if params['indicator_'] is not None: + self._fitted = True + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + + if not hyperparams['use_semantic_types']: + columns_to_produce = list(range(len(inputs.columns))) + + else: + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use( + inputs_metadata, use_columns=hyperparams['use_columns'], + exclude_columns=hyperparams['exclude_columns'], can_use_column=can_produce_column) + + columns_to_drop = cls._get_columns_to_drop(inputs, columns_to_produce, hyperparams) + for col in columns_to_drop: + columns_to_produce.remove(col) + + return inputs.iloc[:, columns_to_produce], columns_to_produce, columns_to_drop + + @classmethod + def _get_columns_to_drop(cls, inputs: Inputs, column_indices: List[int], hyperparams: 
Hyperparams): + """ + Check for columns that contain missing_values and therefore need to be imputed. + Columns whose values are all missing are dropped, unless strategy is 'constant', + in which case all-missing columns are kept and filled with fill_value. + :param inputs: input dataframe + :param column_indices: indices of the candidate columns + :return: list of column indices to drop + """ + columns_to_remove = [] + if hyperparams['strategy'] != "constant": + for _, col in enumerate(column_indices): + inp = inputs.iloc[:, [col]].values + mask = _get_mask(inp, hyperparams['missing_values']) + if mask.all(): + columns_to_remove.append(col) + return columns_to_remove + + @classmethod + def _can_produce_column( + cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (str,) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[Dict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[Dict] = [] + for column_index in range(outputs_length): + column_metadata = dict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. 
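+ # Here only the configured 'return_semantic_type' (Attribute or
+ # ConstructedAttribute) is meant to be added on top of whatever semantic
+ # types the column already carries; nothing is stripped, since imputation
+ # does not change a column's role.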
+ semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = [] + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata( + cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[Dict]) -> metadata_base.DataMetadata: + + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray, target_columns_metadata) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) + return outputs + + @classmethod + def _copy_columns_metadata( + cls, inputs_metadata: metadata_base.DataMetadata, column_indices, hyperparams) -> List[Dict]: + + target_columns_metadata: List[Dict] = [] + for column_index in column_indices: + column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + column_metadata = dict(inputs_metadata.query_column(column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set([]) + add_semantic_types = set() + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + +afSKStringImputer.__doc__ = SimpleImputer.__doc__ diff --git a/afsklearn/afSimpleImputer.py b/afsklearn/afSimpleImputer.py new file mode 100644 index 0000000..3714d03 --- /dev/null +++ b/afsklearn/afSimpleImputer.py @@ -0,0 +1,342 @@ +from sklearn.impute import SimpleImputer as _SimpleImputer +from sklearn.impute._base import _most_frequent, _BaseImputer +from sklearn.utils.validation import ( + #ret = a @ b + _deprecate_positional_args, _ensure_no_complex_data, _ensure_sparse_format, check_is_fitted, FLOAT_DTYPES) +from sklearn import get_config as _get_config +from sklearn.exceptions import DataConversionWarning +from sklearn.utils.fixes import _object_dtype_isnan # FIXME +from sklearn.utils.sparsefuncs import _get_median +from afBaseEstimator import afBaseEstimator +from af_type_utils import typemap +from scipy import stats + +import numbers +import warnings +import numpy.ma as ma +from numpy.core.numeric import ComplexWarning + +import scipy.sparse as sp +from contextlib import suppress +import arrayfire as af +import numpy as np + +def _most_frequent(array, extra_value, n_repeat): + """Compute the most frequent value in a 1d array extended with + [extra_value] * n_repeat, where extra_value is assumed to be not part + of the array.""" + #TODO: af + # Compute the most frequent value in array only + if array.size > 0: + with warnings.catch_warnings(): + # 
stats.mode raises a warning when input array contains objects due + # to incapacity to detect NaNs. Irrelevant here since input array + # has already been NaN-masked. + warnings.simplefilter("ignore", RuntimeWarning) + mode = stats.mode(array) + + most_frequent_value = mode[0][0] + most_frequent_count = mode[1][0] + else: + most_frequent_value = 0 + most_frequent_count = 0 + + # Compare to array + [extra_value] * n_repeat + if most_frequent_count == 0 and n_repeat == 0: + return np.nan + elif most_frequent_count < n_repeat: + return extra_value + elif most_frequent_count > n_repeat: + return most_frequent_value + elif most_frequent_count == n_repeat: + # Ties the breaks. Copy the behaviour of scipy.stats.mode + if most_frequent_value < extra_value: + return most_frequent_value + else: + return extra_value + +class BaseImputer(_BaseImputer, afBaseEstimator): + def _concatenate_indicator(self, X_imputed, X_indicator): + """Concatenate indicator mask with the imputed data.""" + if not self.add_indicator: + return X_imputed + + hstack = sp.hstack if sp.issparse(X_imputed) else af.hstack + if X_indicator is not None: + return hstack((X_imputed, X_indicator)) + + raise ValueError( + "Data from the missing indicator are not provided. Call " + "_fit_indicator and _transform_indicator in the imputer implementation.") + + +class afSimpleImputer(_SimpleImputer, BaseImputer): + + def _validate_input(self, X, in_fit): # NOTE: is duplicated due to a type checks + #import pdb; pdb.set_trace() + allowed_strategies = ["mean", "median", "most_frequent", "constant"] + if self.strategy not in allowed_strategies: + raise ValueError(f"Can only use these strategies: {allowed_strategies} got strategy={self.strategy}") + + if self.strategy in ("most_frequent", "constant"): + dtype = None + else: + dtype = FLOAT_DTYPES + + if not is_scalar_nan(self.missing_values): + force_all_finite = True + else: + force_all_finite = "allow-nan" + + try: + X = self._validate_data( + X, reset=in_fit, accept_sparse='csc', dtype=dtype, force_all_finite=force_all_finite, copy=self.copy) + except ValueError as ve: + if "could not convert" in str(ve): + new_ve = ValueError("Cannot use {self.strategy} strategy with non-numeric data:\n{ve}") + raise new_ve from None + else: + raise ve + + # BUG + # _check_inputs_dtype(X, self.missing_values) + # if X.dtype.kind not in ("i", "u", "f", "O"): + # raise ValueError("SimpleImputer does not support data with dtype " + # f"{X.dtype}. Please provide either a numeric array (with" + # " a floating point or integer dtype) or " + # "categorical data represented either as an array " + # "with integer dtype or an array of string values " + # "with an object dtype.") + + return X + + def fit(self, X, y=None): + """Fit the imputer on X. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Input data, where ``n_samples`` is the number of samples and + ``n_features`` is the number of features. 
+ Returns + ------- + self : SimpleImputer + """ + X = self._validate_input(X, in_fit=True) + super()._fit_indicator(X) + + # default fill_value is 0 for numerical input and "missing_value" + # otherwise + # BUG uncomment below + # if self.fill_value is None: + # if X.dtype.kind in ("i", "u", "f"): + # fill_value = 0 + # else: + # fill_value = "missing_value" + # else: + # fill_value = self.fill_value + fill_value = 0 # FIXME: remove after bug is fixed + + # fill_value should be numerical in case of numerical input + npdtype = typemap(X.dtype()) + if self.strategy == "constant" and npdtype.kind in ("i", "u", "f") and not isinstance(fill_value, numbers.Real): + raise ValueError( + f"'fill_value'={fill_value} is invalid. Expected a numerical value when imputing numerical data") + + if sp.issparse(X): + # missing_values = 0 not allowed with sparse data as it would + # force densification + if self.missing_values == 0: + raise ValueError( + "Imputation not possible when missing_values " + "== 0 and input is sparse. Provide a dense array instead.") + else: + self.statistics_ = self._sparse_fit(X, self.strategy, self.missing_values, fill_value) + else: + self.statistics_ = self._dense_fit(X, self.strategy, self.missing_values, fill_value) + return self + + def _sparse_fit(self, X, strategy, missing_values, fill_value): + """Fit the transformer on sparse data.""" + mask_data = _get_mask(X.data, missing_values) + n_implicit_zeros = X.shape[0] - af.diff(X.indptr) + + statistics = af.empty(X.shape[1]) + + if strategy == "constant": + # for constant strategy, self.statistcs_ is used to store + # fill_value in each column + statistics.fill(fill_value) + else: + for i in range(X.shape[1]): + column = X.data[X.indptr[i]:X.indptr[i + 1]] + mask_column = mask_data[X.indptr[i]:X.indptr[i + 1]] + column = column[~mask_column] + + # combine explicit and implicit zeros + mask_zeros = _get_mask(column, 0) + column = column[~mask_zeros] + n_explicit_zeros = mask_zeros.sum() + n_zeros = n_implicit_zeros[i] + n_explicit_zeros + + if strategy == "mean": + s = column.size + n_zeros + statistics[i] = np.nan if s == 0 else column.sum() / s + + elif strategy == "median": + statistics[i] = _get_median(column, n_zeros) + + elif strategy == "most_frequent": + statistics[i] = _most_frequent(column, 0, n_zeros) + return statistics + + def _dense_fit(self, X, strategy, missing_values, fill_value): + """Fit the transformer on dense data.""" + mask = _get_mask(X, missing_values) + X_np = X.to_ndarray() + masked_X = ma.masked_array(X_np, mask=np.isnan(X_np)) # FIXME + + # Mean + if strategy == "mean": + mean_masked = ma.mean(masked_X, axis=0) + # Avoid the warning "Warning: converting a masked element to nan." + mean = ma.getdata(mean_masked) + mean[ma.getmask(mean_masked)] = np.nan + + return mean + + # Median + elif strategy == "median": + median_masked = ma.median(masked_X, axis=0) + # Avoid the warning "Warning: converting a masked element to nan." + median = ma.getdata(median_masked) + median[ma.getmaskarray(median_masked)] = np.nan + + return median + + # Most frequent + elif strategy == "most_frequent": + # Avoid use of scipy.stats.mstats.mode due to the required + # additional overhead and slow benchmarking performance. + # See Issue 14325 and PR 14399 for full discussion. 
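+ # Each column is handled on its own: the matrix is transposed so a column can
+ # be sliced out as a row, the missing entries are masked away, and the
+ # surviving values are copied back to the host where the NumPy/SciPy based
+ # _most_frequent helper above picks the modal value (ties with the extra
+ # value are broken toward the smaller value, mirroring scipy.stats.mode).
+ # A quick sanity check of that helper:
+ #   >>> import numpy as np
+ #   >>> _most_frequent(np.array([1, 2, 2, 3]), extra_value=np.nan, n_repeat=0)
+ #   2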
+ + # To be able access the elements by columns + + X = X.T + mask = mask.T + + npdtype = typemap(X.dtype()) + if npdtype.kind == "O": + most_frequent = af.constant(0, X.shape[0], dtype=object) + else: + most_frequent = af.constant(0, X.shape[0]) + + for i, (row, row_mask) in enumerate(zip(X[:], mask[:])): + row_mask = row_mask.logical_not() + row = row[row_mask].to_ndarray() + #most_frequent[i] = _most_frequent(row, np.nan, 0) + most_frequent[i] = _most_frequent(row, np.nan, 0) + + return most_frequent + + # Constant + elif strategy == "constant": + # for constant strategy, self.statistcs_ is used to store + # fill_value in each column + return af.constant(fill_value, X.shape[1], dtype=X.dtype()) + + def transform(self, X): + """Impute all missing values in X. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data to complete. + """ + check_is_fitted(self) + + X = self._validate_input(X, in_fit=False) + #X = af.Array.to_ndarray(X) + X_indicator = super()._transform_indicator(X) + + statistics = self.statistics_ + + if X.shape[1] != statistics.shape[0]: + raise ValueError(f"X has {X.shape[1]} features per sample, expected {self.statistics_.shape[0]}") + + # Delete the invalid columns if strategy is not constant + if self.strategy == "constant": + valid_statistics = statistics + else: + # same as af.isnan but also works for object dtypes + # invalid_mask = _get_mask(statistics, np.nan) # BUG: af runtime error + invalid_mask = af.isnan(statistics) # FIXME + valid_mask = invalid_mask.logical_not() + valid_statistics = statistics[valid_mask] + valid_statistics_indexes = np.flatnonzero(valid_mask) + + if af.any_true(invalid_mask): + missing = af.arange(X.shape[1])[invalid_mask] + if self.verbose: + warnings.warn(f"Deleting features without observed values: {missing}") + X = X[:, valid_statistics_indexes] + + # Do actual imputation + if sp.issparse(X): + if self.missing_values == 0: + raise ValueError( + "Imputation not possible when missing_values == 0 and input is sparse." + "Provide a dense array instead.") + else: + mask = _get_mask(X.data, self.missing_values) + indexes = af.repeat(af.arange(len(X.indptr) - 1, dtype=af.int), af.diff(X.indptr))[mask] + + X.data[mask] = valid_statistics[indexes].astype(X.dtype, copy=False) + else: + # mask = _get_mask(X, self.missing_values) # BUG + mask = af.isnan(X) # FIXME + # n_missing = af.sum(mask, axis=0) # BUG af + n_missing = af.sum(mask, dim=0) + coordinates = af.where(mask.T)[::-1] # BUG + valid_statistics = valid_statistics.to_ndarray().ravel() + n_missing = n_missing.to_ndarray().ravel() + values = np.repeat(valid_statistics, n_missing) # BUG + values = af.interop.from_ndarray(values) + + odims = X.dims() + X = af.flat(X) + X[coordinates] = values + X = af.moddims(X, *odims) + + return super()._concatenate_indicator(X, X_indicator) + +def _get_mask(X, value_to_mask): + """Compute the boolean mask X == value_to_mask.""" + # BUG: doesnt work properly + npdtype = typemap(X.dtype()) + if is_scalar_nan(value_to_mask): + if npdtype.kind == "f": + return af.isnan(X) + elif X.dtype.kind in ("i", "u"): + # can't have NaNs in integer array. + return af.constant(0, X.shape[0], X.shape[1], dtype=af.Dtype.b8) + else: + # np.isnan does not work on object dtypes. 
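+ # scikit-learn's _object_dtype_isnan relies on the `x != x` property of NaN
+ # and is written for NumPy object arrays, which is why this fallback is
+ # still flagged below as needing an ArrayFire-aware replacement.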
+ return _object_dtype_isnan(X) #todo:fix + else: + return X == value_to_mask + + +def is_scalar_nan(x): + """ + Ref: https://github.com/scikit-learn/scikit-learn/blob/fd237278e895b42abe8d8d09105cbb82dc2cbba7/sklearn/utils/__init__.py#L1004 + """ + # convert from numpy.bool_ to python bool to ensure that testing + # is_scalar_nan(x) is True does not fail. + # import ipdb; ipdb.set_trace() + return bool(isinstance(x, numbers.Real) and np.isnan(x)) + + +def _check_inputs_dtype(X, missing_values): + if X.dtype.kind in ("f", "i", "u") and not isinstance(missing_values, numbers.Real): + raise ValueError( + "'X' and 'missing_values' types are expected to be" + f" both numerical. Got X.dtype={X.dtype} and type(missing_values)={type(missing_values)}.") diff --git a/afsklearn/af_KNeigborsClassifier.py b/afsklearn/af_KNeigborsClassifier.py new file mode 100644 index 0000000..b53679b --- /dev/null +++ b/afsklearn/af_KNeigborsClassifier.py @@ -0,0 +1,459 @@ +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from numpy import ndarray +from collections import OrderedDict +from scipy import sparse +import os +import numpy +import typing + +# Custom import commands if any +import arrayfire as af + +from d3m.container.numpy import ndarray as d3m_ndarray +from d3m.container import DataFrame as d3m_dataframe +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m import utils +from d3m.base import utils as base_utils +from d3m.exceptions import PrimitiveNotFittedError +from d3m.primitive_interfaces.base import CallResult, DockerContainer + +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin +from d3m import exceptions +import pandas + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + +class Hyperparams(hyperparams.Hyperparams): + n_neighbors = hyperparams.Bounded[int]( + default=5, + lower=0, + upper=None, + description='Number of neighbors to use by default for :meth:`k_neighbors` queries.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + weights = hyperparams.Enumeration[str]( + values=['uniform', 'distance'], + default='uniform', + description='weight function used in prediction. Possible values: - \'uniform\' : uniform weights. All points in each neighborhood are weighted equally. - \'distance\' : weight points by the inverse of their distance. in this case, closer neighbors of a query point will have a greater influence than neighbors which are further away. - [callable] : a user-defined function which accepts an array of distances, and returns an array of the same shape containing the weights.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + dist_type = hyperparams.Enumeration[str]( + values=['sad', 'ssd', 'hamming'], + default='ssd', + description='The distance computation type. 
Currently \'sad\' (sum of absolute differences), \'ssd\' (sum of squared differences), and \'hamming\' (hamming distances) are supported.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. 
To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + +class af_KNeighborsClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], + ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]): + """ + Primitive implementing KNeighborsClassifier using ArrayFire library + """ + + __author__ = 'ArrayFire' + metadata = metadata_base.PrimitiveMetadata({ + 'name': 'ArrayFire KNN Classifier', + 'source': { + 'name': 'ArrayFire', + 'contact': 'mailto:support@arrayfire.com', + 'uris': ['https://github.com/arrayfire/d3m-arrayfire-primitives.git']}, + 'id': '78c4acd6-ca23-456c-ab1c-c6d687b0957f', + 'version': '0.1.0', + 'python_path': 'd3m.primitives.classification.k_neighbors.ArrayFire', + 'keywords' : ['arrayfire', 'knearestneighbors', 'knn'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://github.com/arrayfire/d3m-arrayfire-primitives.git{git_commit}#egg=af_primitives'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [metadata_base.PrimitiveAlgorithmType.K_NEAREST_NEIGHBORS, ], + 'primitive_family': metadata_base.PrimitiveFamily.CLASSIFICATION, + 'hyperparams_to_tune': ['n_neighbors', 'dist_type'], + }) + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + self._n_neighbors=self.hyperparams['n_neighbors'], + self._weights=self.hyperparams['weights'], + self._data = None + self._labels = None + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._fitted = False + self._new_training_data = False + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + training_output = self._training_outputs.values + + shape = 
training_output.shape + if len(shape) == 2 and shape[1] == 1: + training_output = numpy.ravel(training_output) + + # "fit" data + self._data = af.from_ndarray(self._training_inputs.values) + self._labels = af.from_ndarray(training_output.astype('int32')) + + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return CallResult(None) + + + @classmethod + def _get_neighbor_weights(self, dists, weight_by_dist, k): + weights = None + if weight_by_dist: + inv_dists = 1./dists + sum_inv_dists = af.sum(inv_dists) + weights = inv_dists / sum_inv_dists + else: + weights = af.Array.copy(dists) + weights[:] = 1./k + return weights + + + @classmethod + def _get_dist_type(self, dist_type_str): + dist_type = None + if dist_type_str == 'sad': + dist_type = af.MATCH.SAD + elif dist_type_str == 'ssd': + dist_type = af.MATCH.SSD + elif dist_type_str == 'hamming': + dist_type = af.MATCH.SHD + else: + raise RuntimeError('Invalid ArrayFire nearest neighbour distance type') + return dist_type + + + @classmethod + def _predict(self, query, train_feats, train_labels, k, dist_type, weight_by_dist): + near_locs, near_dists = af.vision.nearest_neighbour(query, train_feats, 1, \ + k, dist_type) + weights = self._get_neighbor_weights(near_dists, weight_by_dist, k) + top_labels = af.moddims(train_labels[near_locs], \ + near_locs.dims()[0], near_locs.dims()[1]) + accum_weights = af.scan_by_key(top_labels, weights) # reduce by key would be more ideal + _, max_weight_locs = af.imax(accum_weights, dim=0) + pred_idxs = af.range(accum_weights.dims()[1]) * accum_weights.dims()[0] + max_weight_locs.T + top_labels_flat = af.flat(top_labels) + pred_classes = top_labels_flat[pred_idxs] + return pred_classes + + + @classmethod + def _predict_proba(self, query, train_feats, train_labels, k, dist_type, weight_by_dist): + near_locs, near_dists = af.vision.nearest_neighbour(query, train_feats, 1, \ + k, dist_type) + weights = self._get_neighbor_weights(near_dists, weight_by_dist, k) + top_labels = af.moddims(train_labels[near_locs], \ + near_locs.dims()[0], near_locs.dims()[1]) + accum_weights = af.scan_by_key(top_labels, weights) # reduce by key would be more ideal + probs, _ = af.imax(accum_weights, dim=0) + return probs.T + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + output = [] + if len(sk_inputs.columns): + try: + af_inputs = af.from_ndarray(sk_inputs.values) + weight_by_dist = self._weights == 'distance' + dist_type = self._get_dist_type(self.hyperparams['dist_type']) + af_output = self._predict(af_inputs, self._data, self._labels, \ + self.hyperparams['n_neighbors'], dist_type, \ + weight_by_dist) + af_ndarray_output = af_output.to_ndarray().astype('int32') + except sklearn.exceptions.NotFittedError as error: + raise PrimitiveNotFittedError("Primitive not fitted.") from error + # For primitives that allow predicting without fitting like GaussianProcessRegressor + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + if sparse.issparse(af_ndarray_output): + af_ndarray_output = af_ndarray_output.toarray() + output = self._wrap_predictions(inputs, af_ndarray_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") 
+ self.logger.warn("No input columns were selected") + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + + def set_params(self, *, params: Params) -> None: + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + + def log_likelihoods(self, *, + outputs: Outputs, + inputs: Inputs, + timeout: float = None, + iterations: int = None) -> CallResult[Sequence[float]]: + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + af_inputs = af.from_ndarray(sk_inputs.values) + weight_by_dist = self._weights == 'distance' + dist_type = self._get_dist_type(self.hyperparams['dist_type']) + probs = self._predict_proba(af_inputs, self._data, self._labels, \ + self.hyperparams['n_neighbors'], dist_type, \ + weight_by_dist) + return CallResult(af.log(probs).to_ndarray()) + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, numpy.integer, numpy.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, 
list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + 
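# Every produced column is tagged as a predicted target below; the column
+ # name is taken from the existing output metadata, falling back to
+ # "output_<index>" when none is present.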
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + diff --git a/afsklearn/af_LogisticRegression.py b/afsklearn/af_LogisticRegression.py new file mode 100644 index 0000000..55f3998 --- /dev/null +++ b/afsklearn/af_LogisticRegression.py @@ -0,0 +1,595 @@ +import os +from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple +from collections import OrderedDict +import numpy as np +from numpy import ndarray + +from d3m import exceptions +from d3m import utils +from d3m.base import utils as base_utils +from d3m.container import DataFrame as d3m_dataframe +from d3m.exceptions import PrimitiveNotFittedError +from d3m.metadata import hyperparams, params, base as metadata_base +from d3m.primitive_interfaces.base import CallResult, DockerContainer +from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin +from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase + +import arrayfire as af +from arrayfire.algorithm import max, imax, count, sum +from arrayfire.arith import abs, sigmoid, log +from arrayfire.array import read_array, transpose +from arrayfire.blas import matmul, matmulTN +from arrayfire.data import constant, join, moddims +from arrayfire.device import sync, eval +from arrayfire.interop import from_ndarray + + +Inputs = d3m_dataframe +Outputs = d3m_dataframe + + +class Params(params.Params): + classes_: Optional[ndarray] + input_column_names: Optional[Any] + target_names_: Optional[Sequence[Any]] + training_indices_: Optional[Sequence[int]] + target_column_indices_: Optional[Sequence[int]] + target_columns_metadata_: Optional[List[OrderedDict]] + + +class Hyperparams(hyperparams.Hyperparams): + penalty = hyperparams.Enumeration[str]( + values=['l1', 'l2'], + default='l2', + description='Used to specify the norm used in the penalization. The \'newton-cg\', \'sag\' and \'lbfgs\' solvers support only l2 penalties.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + use_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", + ) + use_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", + ) + exclude_inputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training inputs. 
Applicable only if \"use_columns\" is not provided.", + ) + exclude_outputs_columns = hyperparams.Set( + elements=hyperparams.Hyperparameter[int](-1), + default=(), + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", + ) + return_result = hyperparams.Enumeration( + values=['append', 'replace', 'new'], + default='new', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", + ) + use_semantic_types = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" + ) + add_index_columns = hyperparams.UniformBool( + default=False, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", + ) + error_on_no_input = hyperparams.UniformBool( + default=True, + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], + description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", + ) + + return_semantic_type = hyperparams.Enumeration[str]( + values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], + default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', + description='Decides what semantic type to attach to generated output', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + learning_rate = hyperparams.Hyperparameter[float]( + default=0.1, + description='(alpha) Rate at which to update the weights at each iteration during gradient descent', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + reg_constant = hyperparams.Hyperparameter[float]( + default=1.0, + description='(lambda) Weight decay', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_err = hyperparams.Hyperparameter[float]( + default=0.01, + description='Maximum error', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + max_iter = hyperparams.Bounded[int]( + default=1000, + lower=0, + upper=None, + description='Maximum number of iterations taken for the solver to converge.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] + ) + verbose = hyperparams.Hyperparameter[int]( + default=0, + description='Controls the verbosity of the building process.', + semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] + ) + + +class af_LogisticRegression(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], + ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]): + ''' + Primitive 
implementing LogisticRegression using the ArrayFire library + ''' + + __author__ = 'ArrayFire' + metadata = metadata_base.PrimitiveMetadata({ + 'name': 'ArrayFire Logistic Regression', + 'source': { + 'name': 'ArrayFire', + 'contact': 'mailto:support@arrayfire.com', + 'uris': ['https://github.com/arrayfire/d3m-arrayfire-primitives.git']}, + 'id': '25b08bb7-12f0-4447-a75b-5856ead6227e', + 'version': '0.1.0', + 'python_path': 'd3m.primitives.classification.logistic_regression.ArrayFire', + 'keywords' : ['arrayfire', 'logistic regression', 'logistic regressor'], + 'installation': [ + {'type': metadata_base.PrimitiveInstallationType.PIP, + 'package_uri': 'git+https://github.com/arrayfire/d3m-arrayfire-primitives.git@{git_commit}#egg=af_primitives'.format( + git_commit=utils.current_git_commit(os.path.dirname(__file__)), + ), + }], + 'algorithm_types': [ metadata_base.PrimitiveAlgorithmType.LOGISTIC_REGRESSION, ], + 'primitive_family': metadata_base.PrimitiveFamily.CLASSIFICATION, + 'hyperparameters_to_tune': ['learning_rate', 'reg_constant', 'max_err', 'max_iter'] + }) + + + def __init__(self, *, + hyperparams: Hyperparams, + random_seed: int = 0, + docker_containers: Dict[str, DockerContainer] = None, + _verbose: int = 0) -> None: + + super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) + + self._learning_rate = self.hyperparams['learning_rate'] + self._reg_constant = self.hyperparams['reg_constant'] + self._penalty = self.hyperparams['penalty'] + self._max_err = self.hyperparams['max_err'] + self._max_iter = self.hyperparams['max_iter'] + self._verbose = bool(self.hyperparams['verbose']) + self._classes = None + self._n_classes = 0 + self._label_offset = 0 + self._max_feature_value = 0 + self._max_feature_value_defined = False + + self._inputs = None + self._outputs = None + self._training_inputs = None + self._training_outputs = None + self._target_names = None + self._training_indices = None + self._target_column_indices = None + self._target_columns_metadata: List[OrderedDict] = None + self._input_column_names = None + self._weights = None + self._fitted = False + self._new_training_data = False + + + def _predict_proba(self, X: af.Array, Weights: af.Array) -> af.Array: + Z = af.matmul(X, Weights) + return af.sigmoid(Z) + + + def _predict_log_proba(self, X: af.Array, Weights: af.Array) -> af.Array: + return af.log(self._predict_proba(X, Weights)) + + + def _predict(self, X: af.Array, Weights: af.Array) -> af.Array: + probs = self._predict_proba(X, Weights) + _, classes = af.imax(probs, 1) + classes = classes + self._label_offset + return classes + + + def _cost(self, Weights: af.Array, X: af.Array, Y: af.Array, + reg_constant: float, penalty: str) -> (af.Array, af.Array): + # Number of samples + m = Y.dims()[0] + + dim0 = Weights.dims()[0] + dim1 = Weights.dims()[1] if len(Weights.dims()) > 1 else None + dim2 = Weights.dims()[2] if len(Weights.dims()) > 2 else None + dim3 = Weights.dims()[3] if len(Weights.dims()) > 3 else None + + # Make the lambda corresponding to Weights(0) == 0 + lambdat = af.constant(reg_constant, dim0, dim1, dim2, dim3) + + # No regularization for bias weights + lambdat[0, :] = 0 + + # Get the prediction + H = self._predict_proba(X, Weights) + + # Cost of misprediction + Jerr = -1 * af.sum(Y * af.log(H) + (1 - Y) * af.log(1 - H), dim=0) + + # Regularization cost + penalty_norm = None + if penalty == 'l2': + penalty_norm = Weights * Weights + else: + penalty_norm = af.abs(Weights) + Jreg = 0.5 * af.sum(lambdat * 
penalty_norm, dim=0) + + # Total cost + J = (Jerr + Jreg) / m + + # Find the gradient of cost + D = (H - Y) + dJ = (af.matmulTN(X, D) + lambdat * Weights) / m + + return J, dJ + + + def _ints_to_onehots(self, digits: np.ndarray, num_classes: int) -> np.ndarray: + # Need labels to start with 0, but some datasets might start with 1 or other numbers + self._label_offset = np.amin(digits) + onehots = np.zeros((digits.shape[0], num_classes), dtype='float32') + onehots[np.arange(digits.shape[0]), digits - self._label_offset] = 1 + return onehots + + + def _train(self, X:af.Array, Y:af.Array, alpha: float, lambda_param: float, + penalty: str, maxerr: float, maxiter: int) -> af.Array: + # Add bias feature + bias = af.constant(1, X.dims()[0], 1) + X_biased = af.join(1, bias, X) + + # Initialize parameters to 0 + Weights = af.constant(0, X_biased.dims()[1], Y.dims()[1]) + + for i in range(maxiter): + # Get the cost and gradient + J, dJ = self._cost(Weights, X_biased, Y, lambda_param, penalty) + err = af.max(af.abs(J)) + if err < maxerr: + Weights = Weights[1:] # Remove bias weights + return Weights + + # Update the weights via gradient descent + Weights = Weights - alpha * dJ + + # Remove bias weights + Weights = Weights[1:] + + return Weights + + + def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: + self._inputs = inputs + self._outputs = outputs + self._fitted = False + self._new_training_data = True + + + def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: + if self._inputs is None or self._outputs is None: + raise ValueError("Missing training data.") + + if not self._new_training_data: + return CallResult(None) + self._new_training_data = False + + self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) + self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) + self._input_column_names = self._training_inputs.columns + + if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: + self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) + sk_training_output = self._training_outputs.values + + shape = sk_training_output.shape + if len(shape) == 2 and shape[1] == 1: + sk_training_output = np.ravel(sk_training_output) + + # Assume training input data is an ndarray + training_inputs = self._training_inputs.values.astype('float32') + training_outputs = sk_training_output.astype('uint32') + + if self._n_classes == 0: + # Assume that class labels are integers and nonnegative + self._n_classes = np.amax(training_outputs).astype('uint32').item() + 1 + + self._classes = list(range(self._n_classes)) + + # Convert ndarray to af array + train_feats = af.from_ndarray(training_inputs) + train_targets = af.from_ndarray( + self._ints_to_onehots(training_outputs, self._n_classes) + ) + num_train = train_feats.dims()[0] + + # Normalize feature values + self._max_feature_value = af.max(train_feats) + self._max_feature_value_defined = True + train_feats = train_feats / self._max_feature_value + + # Start training + self._weights = self._train(train_feats, train_targets, + self._learning_rate, + self._reg_constant, + self._penalty, + self._max_err, + self._max_iter + ) + + self._fitted = True + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + return 
CallResult(None) + + + def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: + if not self._fitted: + raise PrimitiveNotFittedError("Primitive not fitted.") + + sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) + + output = [] + if len(sk_inputs.columns): + af_inputs = af.from_ndarray(sk_inputs.values.astype('float32')) + + # Normalize feature values + if not self._max_feature_value_defined: + self._max_feature_value = af.max(train_feats) + af_inputs = af_inputs / self._max_feature_value + + af_output = self._predict(af_inputs, self._weights) + ndarray_output = af_output.to_ndarray() + + output = self._wrap_predictions(inputs, ndarray_output) + output.columns = self._target_names + output = [output] + else: + if self.hyperparams['error_on_no_input']: + raise RuntimeError("No input columns were selected") + self.logger.warn("No input columns were selected") + + outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], + add_index_columns=self.hyperparams['add_index_columns'], + inputs=inputs, column_indices=self._target_column_indices, + columns_list=output) + + return CallResult(outputs) + + + def get_params(self) -> Params: + if not self._fitted: + return Params( + classes_=None, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + return Params( + classes_=self._classes, + input_column_names=self._input_column_names, + training_indices_=self._training_indices, + target_names_=self._target_names, + target_column_indices_=self._target_column_indices, + target_columns_metadata_=self._target_columns_metadata + ) + + + def set_params(self, *, params: Params) -> None: + self._classes_ = params['classes_'] + self._input_column_names = params['input_column_names'] + self._training_indices = params['training_indices_'] + self._target_names = params['target_names_'] + self._target_column_indices = params['target_column_indices_'] + self._target_columns_metadata = params['target_columns_metadata_'] + + + def log_likelihoods(self, *, + outputs: Outputs, + inputs: Inputs, + timeout: float = None, + iterations: int = None) -> CallResult[Sequence[float]]: + inputs = inputs.iloc[:, self._training_indices] # Get ndarray + outputs = outputs.iloc[:, self._target_column_indices] + + if len(inputs.columns) and len(outputs.columns): + + if outputs.shape[1] != self._n_classes: + raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.") + + log_proba = self._predict_log_proba(inputs, self._weights) + + # Making it always a list, even when only one target. + if self._n_classes == 1: + log_proba = [log_proba] + classes = [self._classes_] + else: + classes = self._classes_ + + samples_length = inputs.shape[0] + + log_likelihoods = [] + for k in range(self._n_classes): + # We have to map each class to its internal (numerical) index used in the learner. + # This allows "outputs" to contain string classes. + outputs_column = outputs.iloc[:, k] + classes_map = pandas.Series(np.arange(len(classes[k])), index=classes[k]) + mapped_outputs_column = outputs_column.map(classes_map) + + # For each target column (column in "outputs"), for each sample (row) we pick the log + # likelihood for a given class. 
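+                # For example, with classes[k] == ['cat', 'dog'], classes_map is
+                # {'cat': 0, 'dog': 1}, so an outputs column ['dog', 'cat'] maps to the
+                # column indices [1, 0] that are then used to index log_proba row by row.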
+ log_likelihoods.append(log_proba[k][np.arange(samples_length), mapped_outputs_column]) + + results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True) + results.columns = outputs.columns + + for k in range(self._n_classes): + column_metadata = outputs.metadata.query_column(k) + if 'name' in column_metadata: + results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']}) + + else: + results = d3m_dataframe(generate_metadata=True) + + return CallResult(results) + + + @classmethod + def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return inputs, list(range(len(inputs.columns))) + + inputs_metadata = inputs.metadata + + def can_produce_column(column_index: int) -> bool: + return cls._can_produce_column(inputs_metadata, column_index, hyperparams) + + columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, + use_columns=hyperparams['use_inputs_columns'], + exclude_columns=hyperparams['exclude_inputs_columns'], + can_use_column=can_produce_column) + return inputs.iloc[:, columns_to_produce], columns_to_produce + # return columns_to_produce + + + @classmethod + def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: + column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + + accepted_structural_types = (int, float, np.integer, np.float64) + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") + if not issubclass(column_metadata['structural_type'], accepted_structural_types): + return False + + semantic_types = set(column_metadata.get('semantic_types', [])) + + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + + return False + + + @classmethod + def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): + if not hyperparams['use_semantic_types']: + return data, list(data.columns), list(range(len(data.columns))) + + metadata = data.metadata + + def can_produce_column(column_index: int) -> bool: + accepted_semantic_types = set() + accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") + column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) + semantic_types = set(column_metadata.get('semantic_types', [])) + if len(semantic_types) == 0: + cls.logger.warning("No semantic types found in column metadata") + return False + # Making sure all accepted_semantic_types are available in semantic_types + if len(accepted_semantic_types - semantic_types) == 0: + return True + return False + + target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, + use_columns=hyperparams[ + 'use_outputs_columns'], + exclude_columns= + hyperparams[ + 'exclude_outputs_columns'], + can_use_column=can_produce_column) + + targets = [] + if target_column_indices: + targets = data.select_columns(target_column_indices) + target_column_names = [] + for idx in target_column_indices: + target_column_names.append(data.columns[idx]) + return targets, target_column_names, target_column_indices + + + @classmethod + def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> 
List[OrderedDict]: + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) + + # Update semantic types and prepare it for predicted targets. + semantic_types = set(column_metadata.get('semantic_types', [])) + semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) + add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) + add_semantic_types.add(hyperparams["return_semantic_type"]) + semantic_types = semantic_types - semantic_types_to_remove + semantic_types = semantic_types.union(add_semantic_types) + column_metadata['semantic_types'] = list(semantic_types) + + target_columns_metadata.append(column_metadata) + + return target_columns_metadata + + + @classmethod + def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], + target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: + outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) + + for column_index, column_metadata in enumerate(target_columns_metadata): + column_metadata.pop("structural_type", None) + outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) + + return outputs_metadata + + + def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: + outputs = d3m_dataframe(predictions, generate_metadata=False) + outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) + return outputs + + + @classmethod + def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): + outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] + + target_columns_metadata: List[OrderedDict] = [] + for column_index in range(outputs_length): + column_metadata = OrderedDict() + semantic_types = [] + semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') + column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") + if column_name is None: + column_name = "output_{}".format(column_index) + column_metadata["semantic_types"] = semantic_types + column_metadata["name"] = str(column_name) + target_columns_metadata.append(column_metadata) + + return target_columns_metadata diff --git a/afsklearn/af_extmath.py b/afsklearn/af_extmath.py new file mode 100644 index 0000000..2808b6a --- /dev/null +++ b/afsklearn/af_extmath.py @@ -0,0 +1,54 @@ +import arrayfire as af +import cupy as np +import numpy +import scipy.sparse as sparse +#import scipy.sparse as sp +import warnings +import numbers +from collections.abc import Sequence +from scipy.sparse.base import spmatrix +from itertools import chain + +from sklearn.utils.validation import _deprecate_positional_args + +@_deprecate_positional_args +def safe_sparse_dot(a, b, *, dense_output=False): + """Dot product that handle the sparse matrix case correctly + Parameters + ---------- + a : array or sparse matrix + b : array or sparse matrix + dense_output : boolean, (default=False) + When False, ``a`` and ``b`` both being sparse will yield sparse output. + When True, output will always be a dense array. 
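+        (In this ArrayFire port the dense 2-D case is computed with
+        ``af.blas.matmul`` after casting both operands to f32.)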
+ Returns + ------- + dot_product : array or sparse matrix + sparse if ``a`` and ``b`` are sparse and ``dense_output=False``. + """ + #if a.ndim > 2 or b.ndim > 2: + if a.numdims() > 2 or b.numdims() > 2: + if sparse.issparse(a): + # sparse is always 2D. Implies b is 3D+ + # [i, j] @ [k, ..., l, m, n] -> [i, k, ..., l, n] + b_ = np.rollaxis(b, -2) + b_2d = b_.reshape((b.shape[-2], -1)) + ret = a @ b_2d + ret = ret.reshape(a.shape[0], *b_.shape[1:]) + elif sparse.issparse(b): + # sparse is always 2D. Implies a is 3D+ + # [k, ..., l, m] @ [i, j] -> [k, ..., l, j] + a_2d = a.reshape(-1, a.shape[-1]) + ret = a_2d @ b + ret = ret.reshape(*a.shape[:-1], b.shape[1]) + else: + ret = np.dot(a, b) + else: + #ret = a @ b + ret = af.blas.matmul(a.as_type(af.Dtype.f32), b.as_type(af.Dtype.f32)) + + if (sparse.issparse(a) and sparse.issparse(b) + and dense_output and hasattr(ret, "toarray")): + return ret.toarray() + return ret + diff --git a/afsklearn/af_multilayer_perceptron.py b/afsklearn/af_multilayer_perceptron.py new file mode 100644 index 0000000..eb3bf55 --- /dev/null +++ b/afsklearn/af_multilayer_perceptron.py @@ -0,0 +1,1257 @@ +#import numpy +#import cupy as np +import numpy as np +import arrayfire as af +import time +from math import sqrt + +from abc import ABCMeta, abstractmethod +import warnings + +import sklearn +from afClassifierMixin import afClassifierMixin +from afRegressorMixin import afRegressorMixin +from afBaseEstimator import afBaseEstimator +from afLabelBinarizer import afLabelBinarizer +from af_stochastic_optimizers import SGDOptimizer, AdamOptimizer + +from sklearn.utils.validation import _deprecate_positional_args +from sklearn.utils import check_random_state +from sklearn.utils import gen_batches +from af_validation import _safe_indexing +from af_validation import check_is_fitted +from af_validation import check_array +from af_validation import column_or_1d +from af_extmath import safe_sparse_dot + + +from sklearn.base import is_classifier +from nn_utils import ACTIVATIONS, DERIVATIVES, LOSS_FUNCTIONS +from sklearn.utils import shuffle + +from sklearn.model_selection import train_test_split + +# from ..exceptions import ConvergenceWarning +# from ..utils.extmath import safe_sparse_dot +# from ..utils.multiclass import _check_partial_fit_first_call, unique_labels +# from ..utils.multiclass import type_of_target +# from ..utils.optimize import _check_optimize_result +# import scipy.optimize + + +_STOCHASTIC_SOLVERS = ['sgd', 'adam'] + + +def _pack(coefs_, intercepts_): + """Pack the parameters into a single vector.""" + return np.hstack([l.ravel() for l in coefs_ + intercepts_]) + + +class BaseMultilayerPerceptron(afBaseEstimator, metaclass=ABCMeta): + """ + Base class for MLP classification and regression. 
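+    Weights (``coefs_``) and biases (``intercepts_``) are held as ArrayFire arrays;
+    training is driven by the stochastic solvers ('sgd', 'adam'), while the upstream
+    L-BFGS path remains commented out in this port.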
+ """ + + @abstractmethod + def __init__(self, hidden_layer_sizes, activation, solver, + alpha, batch_size, learning_rate, learning_rate_init, power_t, + max_iter, loss, shuffle, random_state, tol, verbose, + warm_start, momentum, nesterovs_momentum, early_stopping, + validation_fraction, beta_1, beta_2, epsilon, + n_iter_no_change, max_fun): + self.activation = activation + self.solver = solver + self.alpha = alpha + self.batch_size = batch_size + self.learning_rate = learning_rate + self.learning_rate_init = learning_rate_init + self.power_t = power_t + self.max_iter = max_iter + self.loss = loss + self.hidden_layer_sizes = hidden_layer_sizes + self.shuffle = shuffle + self.random_state = random_state + self.tol = tol + self.verbose = verbose + self.warm_start = warm_start + self.momentum = momentum + self.nesterovs_momentum = nesterovs_momentum + self.early_stopping = early_stopping + self.validation_fraction = validation_fraction + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.epsilon = epsilon + self.n_iter_no_change = n_iter_no_change + self.max_fun = max_fun + +# def _unpack(self, packed_parameters): +# """Extract the coefficients and intercepts from packed_parameters.""" +# for i in range(self.n_layers_ - 1): +# start, end, shape = self._coef_indptr[i] +# self.coefs_[i] = np.reshape(packed_parameters[start:end], shape) +# +# start, end = self._intercept_indptr[i] +# self.intercepts_[i] = packed_parameters[start:end] +# + def _forward_pass(self, activations): + """Perform a forward pass on the network by computing the values + of the neurons in the hidden layers and the output layer. + Parameters + ---------- + activations : list, length = n_layers - 1 + The ith element of the list holds the values of the ith layer. + """ + + hidden_activation = ACTIVATIONS[self.activation] + # Iterate over the hidden layers + for i in range(self.n_layers_ - 1): + activations[i + 1] = safe_sparse_dot(activations[i], + self.coefs_[i]) + activations[i + 1] += af.tile(self.intercepts_[i].T, activations[i+1].dims()[0]) + + # For the hidden layers + if (i + 1) != (self.n_layers_ - 1): + activations[i + 1] = hidden_activation(activations[i + 1]) + #activations[i + 1] = af.tile(self.intercepts_[i].T, activations[i+1].dims()[0]) + + # For the last layer + output_activation = ACTIVATIONS[self.out_activation_] + activations[i + 1] = output_activation(activations[i + 1]) + + return activations +# + def _compute_loss_grad(self, layer, n_samples, activations, deltas, + coef_grads, intercept_grads): + """Compute the gradient of loss with respect to coefs and intercept for + specified layer. + This function does backpropagation for the specified one layer. + """ + coef_grads[layer] = safe_sparse_dot(activations[layer].T, + deltas[layer]) + coef_grads[layer] += (self.alpha * self.coefs_[layer]) + coef_grads[layer] /= n_samples + + intercept_grads[layer] = af.flat(af.mean(deltas[layer], dim=0)) + + return coef_grads, intercept_grads + +# def _loss_grad_lbfgs(self, packed_coef_inter, X, y, activations, deltas, +# coef_grads, intercept_grads): +# """Compute the MLP loss function and its corresponding derivatives +# with respect to the different parameters given in the initialization. +# Returned gradients are packed in a single vector so it can be used +# in lbfgs +# Parameters +# ---------- +# packed_coef_inter : ndarray +# A vector comprising the flattened coefficients and intercepts. +# X : {array-like, sparse matrix} of shape (n_samples, n_features) +# The input data. 
+# y : ndarray of shape (n_samples,) +# The target values. +# activations : list, length = n_layers - 1 +# The ith element of the list holds the values of the ith layer. +# deltas : list, length = n_layers - 1 +# The ith element of the list holds the difference between the +# activations of the i + 1 layer and the backpropagated error. +# More specifically, deltas are gradients of loss with respect to z +# in each layer, where z = wx + b is the value of a particular layer +# before passing through the activation function +# coef_grads : list, length = n_layers - 1 +# The ith element contains the amount of change used to update the +# coefficient parameters of the ith layer in an iteration. +# intercept_grads : list, length = n_layers - 1 +# The ith element contains the amount of change used to update the +# intercept parameters of the ith layer in an iteration. +# Returns +# ------- +# loss : float +# grad : array-like, shape (number of nodes of all layers,) +# """ +# self._unpack(packed_coef_inter) +# loss, coef_grads, intercept_grads = self._backprop( +# X, y, activations, deltas, coef_grads, intercept_grads) +# grad = _pack(coef_grads, intercept_grads) +# return loss, grad +# + def _backprop(self, X, y, activations, deltas, coef_grads, + intercept_grads): + """Compute the MLP loss function and its corresponding derivatives + with respect to each parameter: weights and bias vectors. + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input data. + y : ndarray of shape (n_samples,) + The target values. + activations : list, length = n_layers - 1 + The ith element of the list holds the values of the ith layer. + deltas : list, length = n_layers - 1 + The ith element of the list holds the difference between the + activations of the i + 1 layer and the backpropagated error. + More specifically, deltas are gradients of loss with respect to z + in each layer, where z = wx + b is the value of a particular layer + before passing through the activation function + coef_grads : list, length = n_layers - 1 + The ith element contains the amount of change used to update the + coefficient parameters of the ith layer in an iteration. + intercept_grads : list, length = n_layers - 1 + The ith element contains the amount of change used to update the + intercept parameters of the ith layer in an iteration. 
+ Returns + ------- + loss : float + coef_grads : list, length = n_layers - 1 + intercept_grads : list, length = n_layers - 1 + """ + n_samples = X.shape[0] + + # Forward propagate + activations = self._forward_pass(activations) + + # Get loss + loss_func_name = self.loss + if loss_func_name == 'log_loss' and self.out_activation_ == 'logistic': + loss_func_name = 'binary_log_loss' + + loss = LOSS_FUNCTIONS[loss_func_name](y, activations[-1]) + # Add L2 regularization term to loss + values = np.sum( + np.array([af.dot(af.flat(s), af.flat(s), return_scalar=True) for s in self.coefs_])) + loss += (0.5 * self.alpha) * values / n_samples + + # Backward propagate + last = self.n_layers_ - 2 + + # The calculation of delta[last] here works with following + # combinations of output activation and loss function: + # sigmoid and binary cross entropy, softmax and categorical cross + # entropy, and identity with squared loss + deltas[last] = activations[-1] - y + + # Compute gradient for the last layer + coef_grads, intercept_grads = self._compute_loss_grad( + last, n_samples, activations, deltas, coef_grads, intercept_grads) + + # Iterate over the hidden layers + for i in range(self.n_layers_ - 2, 0, -1): + deltas[i - 1] = safe_sparse_dot(deltas[i], self.coefs_[i].T) + inplace_derivative = DERIVATIVES[self.activation] + inplace_derivative(activations[i], deltas[i - 1]) + + coef_grads, intercept_grads = self._compute_loss_grad( + i - 1, n_samples, activations, deltas, coef_grads, + intercept_grads) + + return loss, coef_grads, intercept_grads + + def _initialize(self, y, layer_units): + # set all attributes, allocate weights etc for first call + # Initialize parameters + self.n_iter_ = 0 + self.t_ = 0 + self.n_outputs_ = y.shape[1] if y.numdims() > 1 else 1 + + # Compute the number of layers + self.n_layers_ = len(layer_units) + + # Output for regression + if not is_classifier(self): + self.out_activation_ = 'identity' + # Output for multi class + elif self._label_binarizer.y_type_ == 'multiclass': + self.out_activation_ = 'softmax' + # Output for binary class and multi-label + else: + self.out_activation_ = 'logistic' + + # Initialize coefficient and intercept layers + self.coefs_ = [] + self.intercepts_ = [] + + for i in range(self.n_layers_ - 1): + coef_init, intercept_init = self._init_coef(layer_units[i], + layer_units[i + 1]) + self.coefs_.append(coef_init) + self.intercepts_.append(intercept_init) + + if self.solver in _STOCHASTIC_SOLVERS: + self.loss_curve_ = [] + self._no_improvement_count = 0 + if self.early_stopping: + self.validation_scores_ = [] + self.best_validation_score_ = -np.inf + else: + self.best_loss_ = np.inf + + def _init_coef(self, fan_in, fan_out): + # Use the initialization method recommended by + # Glorot et al. + factor = 6. + if self.activation == 'logistic': + factor = 2. + init_bound = sqrt(factor / (fan_in + fan_out)) + + # Generate weights and bias: + coef_init = (2 * init_bound) * af.randu(fan_in, fan_out) - init_bound + intercept_init = (2 * init_bound) * af.randu(fan_out) - init_bound + + return coef_init, intercept_init + + def _fit(self, X, y, incremental=False): + # Make sure self.hidden_layer_sizes is a list + hidden_layer_sizes = self.hidden_layer_sizes + if not hasattr(hidden_layer_sizes, "__iter__"): + hidden_layer_sizes = [hidden_layer_sizes] + hidden_layer_sizes = list(hidden_layer_sizes) + + # Validate input parameters. 
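+        # _validate_hyperparameters raises ValueError for out-of-range settings
+        # (e.g. alpha < 0, momentum outside [0, 1], an unknown activation or solver).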
+ self._validate_hyperparameters() + if np.any(np.array(hidden_layer_sizes) <= 0): + raise ValueError("hidden_layer_sizes must be > 0, got %s." % + hidden_layer_sizes) + + X, y = self._validate_input(X, y, incremental) + n_samples, n_features = X.shape + + # Ensure y is 2D + #if y.numdims() == 1: + #y = af.moddims(y, y.elements(), 1) + + self.n_outputs_ = y.shape[1] if y.numdims() > 1 else 1 + + layer_units = ([n_features] + hidden_layer_sizes + + [self.n_outputs_]) + + # check random state + self._random_state = check_random_state(self.random_state) + + if not hasattr(self, 'coefs_') or (not self.warm_start and not + incremental): + # First time training the model + self._initialize(y, layer_units) + + # lbfgs does not support mini-batches + if self.solver == 'lbfgs': + batch_size = n_samples + elif self.batch_size == 'auto': + batch_size = min(200, n_samples) + else: + if self.batch_size < 1 or self.batch_size > n_samples: + warnings.warn("Got `batch_size` less than 1 or larger than " + "sample size. It is going to be clipped") + batch_size = np.clip(self.batch_size, 1, n_samples) + + # Initialize lists + activations = [X] + [None] * (len(layer_units) - 1) + deltas = [None] * (len(activations) - 1) + + coef_grads = [np.empty((n_fan_in_, n_fan_out_)) for n_fan_in_, + n_fan_out_ in zip(layer_units[:-1], + layer_units[1:])] + + intercept_grads = [af.constant(0, n_fan_out_) for n_fan_out_ in + layer_units[1:]] + + # Run the Stochastic optimization solver + if self.solver in _STOCHASTIC_SOLVERS: + self._fit_stochastic(X, y, activations, deltas, coef_grads, + intercept_grads, layer_units, incremental) + +# # Run the LBFGS solver +# elif self.solver == 'lbfgs': +# self._fit_lbfgs(X, y, activations, deltas, coef_grads, +# intercept_grads, layer_units) + + return self + + def _validate_hyperparameters(self): + if not isinstance(self.shuffle, bool): + raise ValueError("shuffle must be either True or False, got %s." % + self.shuffle) + if self.max_iter <= 0: + raise ValueError("max_iter must be > 0, got %s." % self.max_iter) + if self.max_fun <= 0: + raise ValueError("max_fun must be > 0, got %s." % self.max_fun) + if self.alpha < 0.0: + raise ValueError("alpha must be >= 0, got %s." % self.alpha) + if (self.learning_rate in ["constant", "invscaling", "adaptive"] and + self.learning_rate_init <= 0.0): + raise ValueError("learning_rate_init must be > 0, got %s." % + self.learning_rate) + if self.momentum > 1 or self.momentum < 0: + raise ValueError("momentum must be >= 0 and <= 1, got %s" % + self.momentum) + if not isinstance(self.nesterovs_momentum, bool): + raise ValueError("nesterovs_momentum must be either True or False," + " got %s." % self.nesterovs_momentum) + if not isinstance(self.early_stopping, bool): + raise ValueError("early_stopping must be either True or False," + " got %s." % self.early_stopping) + if self.validation_fraction < 0 or self.validation_fraction >= 1: + raise ValueError("validation_fraction must be >= 0 and < 1, " + "got %s" % self.validation_fraction) + if self.beta_1 < 0 or self.beta_1 >= 1: + raise ValueError("beta_1 must be >= 0 and < 1, got %s" % + self.beta_1) + if self.beta_2 < 0 or self.beta_2 >= 1: + raise ValueError("beta_2 must be >= 0 and < 1, got %s" % + self.beta_2) + if self.epsilon <= 0.0: + raise ValueError("epsilon must be > 0, got %s." % self.epsilon) + if self.n_iter_no_change <= 0: + raise ValueError("n_iter_no_change must be > 0, got %s." 
+ % self.n_iter_no_change) + + # raise ValueError if not registered + if self.activation not in ACTIVATIONS: + raise ValueError("The activation '%s' is not supported. Supported " + "activations are %s." + % (self.activation, list(sorted(ACTIVATIONS)))) + if self.learning_rate not in ["constant", "invscaling", "adaptive"]: + raise ValueError("learning rate %s is not supported. " % + self.learning_rate) + supported_solvers = _STOCHASTIC_SOLVERS + ["lbfgs"] + if self.solver not in supported_solvers: + raise ValueError("The solver %s is not supported. " + " Expected one of: %s" % + (self.solver, ", ".join(supported_solvers))) + +# def _fit_lbfgs(self, X, y, activations, deltas, coef_grads, +# intercept_grads, layer_units): +# # Store meta information for the parameters +# self._coef_indptr = [] +# self._intercept_indptr = [] +# start = 0 +# +# # Save sizes and indices of coefficients for faster unpacking +# for i in range(self.n_layers_ - 1): +# n_fan_in, n_fan_out = layer_units[i], layer_units[i + 1] +# +# end = start + (n_fan_in * n_fan_out) +# self._coef_indptr.append((start, end, (n_fan_in, n_fan_out))) +# start = end +# +# # Save sizes and indices of intercepts for faster unpacking +# for i in range(self.n_layers_ - 1): +# end = start + layer_units[i + 1] +# self._intercept_indptr.append((start, end)) +# start = end +# +# # Run LBFGS +# packed_coef_inter = _pack(self.coefs_, +# self.intercepts_) +# +# if self.verbose is True or self.verbose >= 1: +# iprint = 1 +# else: +# iprint = -1 +# +# opt_res = scipy.optimize.minimize( +# self._loss_grad_lbfgs, packed_coef_inter, +# method="L-BFGS-B", jac=True, +# options={ +# "maxfun": self.max_fun, +# "maxiter": self.max_iter, +# "iprint": iprint, +# "gtol": self.tol +# }, +# args=(X, y, activations, deltas, coef_grads, intercept_grads)) +# self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter) +# self.loss_ = opt_res.fun +# self._unpack(opt_res.x) +# + def _fit_stochastic(self, X, y, activations, deltas, coef_grads, + intercept_grads, layer_units, incremental): + + + if not incremental or not hasattr(self, '_optimizer'): + params = self.coefs_ + self.intercepts_ + + if self.solver == 'sgd': + self._optimizer = SGDOptimizer( + params, self.learning_rate_init, self.learning_rate, + self.momentum, self.nesterovs_momentum, self.power_t) + elif self.solver == 'adam': + self._optimizer = AdamOptimizer( + params, self.learning_rate_init, self.beta_1, self.beta_2, + self.epsilon) + + # early_stopping in partial_fit doesn't make sense + early_stopping = self.early_stopping and not incremental + if early_stopping: + # don't stratify in multilabel classification + should_stratify = is_classifier(self) and self.n_outputs_ == 1 + stratify = y if should_stratify else None + X, X_val, y, y_val = train_test_split( + X, y, random_state=self._random_state, + test_size=self.validation_fraction, + stratify=stratify) + if is_classifier(self): + y_val = self._label_binarizer.inverse_transform(y_val) + else: + X_val = None + y_val = None + + n_samples = X.shape[0] + sample_idx = np.arange(n_samples, dtype=int) + + if self.batch_size == 'auto': + batch_size = min(4096, n_samples) + else: + batch_size = np.clip(self.batch_size, 1, n_samples) + + try: + for it in range(self.max_iter): + if self.shuffle: + # Only shuffle the sample indices instead of X and y to + # reduce the memory footprint. These indices will be used + # to slice the X and y. 
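+                    # shuffle() here is sklearn.utils.shuffle applied to the NumPy
+                    # index array only; X and y themselves are never reordered in memory.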
+ sample_idx = shuffle(sample_idx, + random_state=self._random_state) + + #sloooow loop + accumulated_loss = 0.0 + for batch_slice in gen_batches(n_samples, batch_size): + if self.shuffle: + X_batch = _safe_indexing(X, sample_idx[batch_slice]) + ii = af.interop.from_ndarray(sample_idx[batch_slice]) + y_batch = y[ii] + else: + X_batch = X[batch_slice] + y_batch = y[batch_slice] + + activations[0] = X_batch + batch_loss, coef_grads, intercept_grads = self._backprop( + X_batch, y_batch, activations, deltas, + coef_grads, intercept_grads) + accumulated_loss += batch_loss * (batch_slice.stop - + batch_slice.start) + + # update weights + grads = coef_grads + intercept_grads + self._optimizer.update_params(grads) + + self.n_iter_ += 1 + self.loss_ = accumulated_loss / X.shape[0] + + self.t_ += n_samples + self.loss_curve_.append(self.loss_) + if self.verbose: + print("Iteration %d, loss = %.8f" % (self.n_iter_, + self.loss_)) + + # update no_improvement_count based on training loss or + # validation score according to early_stopping + self._update_no_improvement_count(early_stopping, X_val, y_val) + + # for learning rate that needs to be updated at iteration end + self._optimizer.iteration_ends(self.t_) + + if self._no_improvement_count > self.n_iter_no_change: + # not better than last `n_iter_no_change` iterations by tol + # stop or decrease learning rate + if early_stopping: + msg = ("Validation score did not improve more than " + "tol=%f for %d consecutive epochs." % ( + self.tol, self.n_iter_no_change)) + else: + msg = ("Training loss did not improve more than tol=%f" + " for %d consecutive epochs." % ( + self.tol, self.n_iter_no_change)) + + is_stopping = self._optimizer.trigger_stopping( + msg, self.verbose) + if is_stopping: + break + else: + self._no_improvement_count = 0 + + if incremental: + break + + if self.n_iter_ == self.max_iter: + warnings.warn( + "Stochastic Optimizer: Maximum iterations (%d) " + "reached and the optimization hasn't converged yet." + % self.max_iter, ConvergenceWarning) + + + except KeyboardInterrupt: + warnings.warn("Training interrupted by user.") + + if early_stopping: + # restore best weights + self.coefs_ = self._best_coefs + self.intercepts_ = self._best_intercepts + + def _update_no_improvement_count(self, early_stopping, X_val, y_val): + if early_stopping: + # compute validation score, use that for stopping + self.validation_scores_.append(self.score(X_val, y_val)) + + if self.verbose: + print("Validation score: %f" % self.validation_scores_[-1]) + # update best parameters + # use validation_scores_, not loss_curve_ + # let's hope no-one overloads .score with mse + last_valid_score = self.validation_scores_[-1] + + if last_valid_score < (self.best_validation_score_ + + self.tol): + self._no_improvement_count += 1 + else: + self._no_improvement_count = 0 + + if last_valid_score > self.best_validation_score_: + self.best_validation_score_ = last_valid_score + self._best_coefs = [c.copy() for c in self.coefs_] + self._best_intercepts = [i.copy() + for i in self.intercepts_] + else: + if self.loss_curve_[-1] > self.best_loss_ - self.tol: + self._no_improvement_count += 1 + else: + self._no_improvement_count = 0 + if self.loss_curve_[-1] < self.best_loss_: + self.best_loss_ = self.loss_curve_[-1] + + def fit(self, X, y): + """Fit the model to data matrix X and target(s) y. + Parameters + ---------- + X : ndarray or sparse matrix of shape (n_samples, n_features) + The input data. 
+ y : ndarray of shape (n_samples,) or (n_samples, n_outputs) + The target values (class labels in classification, real numbers in + regression). + Returns + ------- + self : returns a trained MLP model. + """ + return self._fit(X, y, incremental=False) + + @property + def partial_fit(self): + """Update the model with a single iteration over the given data. + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input data. + y : ndarray of shape (n_samples,) + The target values. + Returns + ------- + self : returns a trained MLP model. + """ + if self.solver not in _STOCHASTIC_SOLVERS: + raise AttributeError("partial_fit is only available for stochastic" + " optimizers. %s is not stochastic." + % self.solver) + return self._partial_fit + + def _partial_fit(self, X, y): + return self._fit(X, y, incremental=True) + + def _predict(self, X): + """Predict using the trained mode + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input data. + Returns + ------- + y_pred : ndarray of shape (n_samples,) or (n_samples, n_outputs) + The decision function of the samples for each class in the model. + """ + X = check_array(X, accept_sparse=['csr', 'csc']) + + # Make sure self.hidden_layer_sizes is a list + hidden_layer_sizes = self.hidden_layer_sizes + if not hasattr(hidden_layer_sizes, "__iter__"): + hidden_layer_sizes = [hidden_layer_sizes] + hidden_layer_sizes = list(hidden_layer_sizes) + + layer_units = [X.shape[1]] + hidden_layer_sizes + \ + [self.n_outputs_] + + # Initialize layers + activations = [X] + + for i in range(self.n_layers_ - 1): + #activations.append(np.empty((X.shape[0], + #layer_units[i + 1]))) + activations.append(af.constant(0, X.shape[0], layer_units[i + 1])) + + # forward propagate + self._forward_pass(activations) + y_pred = activations[-1] + + return y_pred + + +class afMLPClassifier(afClassifierMixin, BaseMultilayerPerceptron): + """Multi-layer Perceptron classifier. + This model optimizes the log-loss function using LBFGS or stochastic + gradient descent. + .. versionadded:: 0.18 + Parameters + ---------- + hidden_layer_sizes : tuple, length = n_layers - 2, default=(100,) + The ith element represents the number of neurons in the ith + hidden layer. + activation : {'identity', 'logistic', 'tanh', 'relu'}, default='relu' + Activation function for the hidden layer. + - 'identity', no-op activation, useful to implement linear bottleneck, + returns f(x) = x + - 'logistic', the logistic sigmoid function, + returns f(x) = 1 / (1 + exp(-x)). + - 'tanh', the hyperbolic tan function, + returns f(x) = tanh(x). + - 'relu', the rectified linear unit function, + returns f(x) = max(0, x) + solver : {'lbfgs', 'sgd', 'adam'}, default='adam' + The solver for weight optimization. + - 'lbfgs' is an optimizer in the family of quasi-Newton methods. + - 'sgd' refers to stochastic gradient descent. + - 'adam' refers to a stochastic gradient-based optimizer proposed + by Kingma, Diederik, and Jimmy Ba + Note: The default solver 'adam' works pretty well on relatively + large datasets (with thousands of training samples or more) in terms of + both training time and validation score. + For small datasets, however, 'lbfgs' can converge faster and perform + better. + alpha : float, default=0.0001 + L2 penalty (regularization term) parameter. + batch_size : int, default='auto' + Size of minibatches for stochastic optimizers. + If the solver is 'lbfgs', the classifier will not use minibatch. 
+ When set to "auto", `batch_size=min(200, n_samples)` + learning_rate : {'constant', 'invscaling', 'adaptive'}, default='constant' + Learning rate schedule for weight updates. + - 'constant' is a constant learning rate given by + 'learning_rate_init'. + - 'invscaling' gradually decreases the learning rate at each + time step 't' using an inverse scaling exponent of 'power_t'. + effective_learning_rate = learning_rate_init / pow(t, power_t) + - 'adaptive' keeps the learning rate constant to + 'learning_rate_init' as long as training loss keeps decreasing. + Each time two consecutive epochs fail to decrease training loss by at + least tol, or fail to increase validation score by at least tol if + 'early_stopping' is on, the current learning rate is divided by 5. + Only used when ``solver='sgd'``. + learning_rate_init : double, default=0.001 + The initial learning rate used. It controls the step-size + in updating the weights. Only used when solver='sgd' or 'adam'. + power_t : double, default=0.5 + The exponent for inverse scaling learning rate. + It is used in updating effective learning rate when the learning_rate + is set to 'invscaling'. Only used when solver='sgd'. + max_iter : int, default=200 + Maximum number of iterations. The solver iterates until convergence + (determined by 'tol') or this number of iterations. For stochastic + solvers ('sgd', 'adam'), note that this determines the number of epochs + (how many times each data point will be used), not the number of + gradient steps. + shuffle : bool, default=True + Whether to shuffle samples in each iteration. Only used when + solver='sgd' or 'adam'. + random_state : int, RandomState instance, default=None + Determines random number generation for weights and bias + initialization, train-test split if early stopping is used, and batch + sampling when solver='sgd' or 'adam'. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + tol : float, default=1e-4 + Tolerance for the optimization. When the loss or score is not improving + by at least ``tol`` for ``n_iter_no_change`` consecutive iterations, + unless ``learning_rate`` is set to 'adaptive', convergence is + considered to be reached and training stops. + verbose : bool, default=False + Whether to print progress messages to stdout. + warm_start : bool, default=False + When set to True, reuse the solution of the previous + call to fit as initialization, otherwise, just erase the + previous solution. See :term:`the Glossary `. + momentum : float, default=0.9 + Momentum for gradient descent update. Should be between 0 and 1. Only + used when solver='sgd'. + nesterovs_momentum : boolean, default=True + Whether to use Nesterov's momentum. Only used when solver='sgd' and + momentum > 0. + early_stopping : bool, default=False + Whether to use early stopping to terminate training when validation + score is not improving. If set to true, it will automatically set + aside 10% of training data as validation and terminate training when + validation score is not improving by at least tol for + ``n_iter_no_change`` consecutive epochs. The split is stratified, + except in a multilabel setting. + Only effective when solver='sgd' or 'adam' + validation_fraction : float, default=0.1 + The proportion of training data to set aside as validation set for + early stopping. Must be between 0 and 1. + Only used if early_stopping is True + beta_1 : float, default=0.9 + Exponential decay rate for estimates of first moment vector in adam, + should be in [0, 1). 
Only used when solver='adam' + beta_2 : float, default=0.999 + Exponential decay rate for estimates of second moment vector in adam, + should be in [0, 1). Only used when solver='adam' + epsilon : float, default=1e-8 + Value for numerical stability in adam. Only used when solver='adam' + n_iter_no_change : int, default=10 + Maximum number of epochs to not meet ``tol`` improvement. + Only effective when solver='sgd' or 'adam' + .. versionadded:: 0.20 + max_fun : int, default=15000 + Only used when solver='lbfgs'. Maximum number of loss function calls. + The solver iterates until convergence (determined by 'tol'), number + of iterations reaches max_iter, or this number of loss function calls. + Note that number of loss function calls will be greater than or equal + to the number of iterations for the `MLPClassifier`. + .. versionadded:: 0.22 + Attributes + ---------- + classes_ : ndarray or list of ndarray of shape (n_classes,) + Class labels for each output. + loss_ : float + The current loss computed with the loss function. + coefs_ : list, length n_layers - 1 + The ith element in the list represents the weight matrix corresponding + to layer i. + intercepts_ : list, length n_layers - 1 + The ith element in the list represents the bias vector corresponding to + layer i + 1. + n_iter_ : int, + The number of iterations the solver has ran. + n_layers_ : int + Number of layers. + n_outputs_ : int + Number of outputs. + out_activation_ : string + Name of the output activation function. + Examples + -------- + >>> from sklearn.neural_network import MLPClassifier + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import train_test_split + >>> X, y = make_classification(n_samples=100, random_state=1) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, + ... random_state=1) + >>> clf = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train) + >>> clf.predict_proba(X_test[:1]) + array([[0.038..., 0.961...]]) + >>> clf.predict(X_test[:5, :]) + array([1, 0, 1, 0, 1]) + >>> clf.score(X_test, y_test) + 0.8... + Notes + ----- + MLPClassifier trains iteratively since at each time step + the partial derivatives of the loss function with respect to the model + parameters are computed to update the parameters. + It can also have a regularization term added to the loss function + that shrinks model parameters to prevent overfitting. + This implementation works with data represented as dense numpy arrays or + sparse scipy arrays of floating point values. + References + ---------- + Hinton, Geoffrey E. + "Connectionist learning procedures." Artificial intelligence 40.1 + (1989): 185-234. + Glorot, Xavier, and Yoshua Bengio. "Understanding the difficulty of + training deep feedforward neural networks." International Conference + on Artificial Intelligence and Statistics. 2010. + He, Kaiming, et al. "Delving deep into rectifiers: Surpassing human-level + performance on imagenet classification." arXiv preprint + arXiv:1502.01852 (2015). + Kingma, Diederik, and Jimmy Ba. "Adam: A method for stochastic + optimization." arXiv preprint arXiv:1412.6980 (2014). 
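+    Note for this ArrayFire port: ``predict`` maps network outputs back to class
+    labels through ``afLabelBinarizer.inverse_transform``; ``predict_proba`` and
+    ``predict_log_proba`` are still commented out.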
+ """ + @_deprecate_positional_args + def __init__(self, hidden_layer_sizes=(100,), activation="relu", *, + solver='adam', alpha=0.0001, + batch_size='auto', learning_rate="constant", + learning_rate_init=0.001, power_t=0.5, max_iter=200, + shuffle=True, random_state=None, tol=1e-4, + verbose=False, warm_start=False, momentum=0.9, + nesterovs_momentum=True, early_stopping=False, + validation_fraction=0.1, beta_1=0.9, beta_2=0.999, + epsilon=1e-8, n_iter_no_change=10, max_fun=15000): + super().__init__( + hidden_layer_sizes=hidden_layer_sizes, + activation=activation, solver=solver, alpha=alpha, + batch_size=batch_size, learning_rate=learning_rate, + learning_rate_init=learning_rate_init, power_t=power_t, + max_iter=max_iter, loss='log_loss', shuffle=shuffle, + random_state=random_state, tol=tol, verbose=verbose, + warm_start=warm_start, momentum=momentum, + nesterovs_momentum=nesterovs_momentum, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, + n_iter_no_change=n_iter_no_change, max_fun=max_fun) + print("done init") + + def _validate_input(self, X, y, incremental): + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'], + multi_output=True) + #if y.ndim == 2 and y.shape[1] == 1: + #y = column_or_1d(y, warn=True) + if y.numdims() == 2 and y.dims(1) == 1: + y = column_or_1d(y, warn=True) + + if not incremental: + self._label_binarizer = afLabelBinarizer() + self._label_binarizer.fit(y) + self.classes_ = self._label_binarizer.classes_ + elif self.warm_start: + classes = unique_labels(y) + if set(classes) != set(self.classes_): + raise ValueError("warm_start can only be used where `y` has " + "the same classes as in the previous " + "call to fit. Previously got %s, `y` has %s" % + (self.classes_, classes)) + else: + classes = unique_labels(y) + if len(np.setdiff1d(classes, self.classes_, assume_unique=True)): + raise ValueError("`y` has classes not in `self.classes_`." + " `self.classes_` has %s. 'y' has %s." % + (self.classes_, classes)) + + y = af.interop.from_ndarray(self._label_binarizer.transform(y)) + return X, y + + def predict(self, X): + """Predict using the multi-layer perceptron classifier + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input data. + Returns + ------- + y : ndarray, shape (n_samples,) or (n_samples, n_classes) + The predicted classes. + """ + check_is_fitted(self) + y_pred = self._predict(X) + + if self.n_outputs_ == 1: + y_pred = af.flat(y_pred) + + return self._label_binarizer.inverse_transform(y_pred) + + def fit(self, X, y): + """Fit the model to data matrix X and target(s) y. + Parameters + ---------- + X : ndarray or sparse matrix of shape (n_samples, n_features) + The input data. + y : ndarray, shape (n_samples,) or (n_samples, n_outputs) + The target values (class labels in classification, real numbers in + regression). + Returns + ------- + self : returns a trained MLP model. + """ + return self._fit(X, y, incremental=(self.warm_start and + hasattr(self, "classes_"))) + +# @property +# def partial_fit(self): +# """Update the model with a single iteration over the given data. +# Parameters +# ---------- +# X : {array-like, sparse matrix}, shape (n_samples, n_features) +# The input data. +# y : array-like, shape (n_samples,) +# The target values. +# classes : array, shape (n_classes), default None +# Classes across all calls to partial_fit. 
+# Can be obtained via `np.unique(y_all)`, where y_all is the +# target vector of the entire dataset. +# This argument is required for the first call to partial_fit +# and can be omitted in the subsequent calls. +# Note that y doesn't need to contain all labels in `classes`. +# Returns +# ------- +# self : returns a trained MLP model. +# """ +# if self.solver not in _STOCHASTIC_SOLVERS: +# raise AttributeError("partial_fit is only available for stochastic" +# " optimizer. %s is not stochastic" +# % self.solver) +# return self._partial_fit +# +# def _partial_fit(self, X, y, classes=None): +# if _check_partial_fit_first_call(self, classes): +# self._label_binarizer = LabelBinarizer() +# if type_of_target(y).startswith('multilabel'): +# self._label_binarizer.fit(y) +# else: +# self._label_binarizer.fit(classes) +# +# super()._partial_fit(X, y) +# +# return self +# +# def predict_log_proba(self, X): +# """Return the log of probability estimates. +# Parameters +# ---------- +# X : ndarray of shape (n_samples, n_features) +# The input data. +# Returns +# ------- +# log_y_prob : ndarray of shape (n_samples, n_classes) +# The predicted log-probability of the sample for each class +# in the model, where classes are ordered as they are in +# `self.classes_`. Equivalent to log(predict_proba(X)) +# """ +# y_prob = self.predict_proba(X) +# return np.log(y_prob, out=y_prob) +# +# def predict_proba(self, X): +# """Probability estimates. +# Parameters +# ---------- +# X : {array-like, sparse matrix} of shape (n_samples, _features) +# The input data. +# Returns +# ------- +# y_prob : ndarray of shape (n_samples, n_classes) +# The predicted probability of the sample for each class in the +# model, where classes are ordered as they are in `self.classes_`. +# """ +# check_is_fitted(self) +# y_pred = self._predict(X) +# +# if self.n_outputs_ == 1: +# y_pred = y_pred.ravel() +# +# if y_pred.ndim == 1: +# return np.vstack([1 - y_pred, y_pred]).T +# else: +# return y_pred + +class afMLPRegressor(afRegressorMixin, BaseMultilayerPerceptron): + """Multi-layer Perceptron regressor. + This model optimizes the squared-loss using LBFGS or stochastic gradient + descent. + .. versionadded:: 0.18 + Parameters + ---------- + hidden_layer_sizes : tuple, length = n_layers - 2, default=(100,) + The ith element represents the number of neurons in the ith + hidden layer. + activation : {'identity', 'logistic', 'tanh', 'relu'}, default='relu' + Activation function for the hidden layer. + - 'identity', no-op activation, useful to implement linear bottleneck, + returns f(x) = x + - 'logistic', the logistic sigmoid function, + returns f(x) = 1 / (1 + exp(-x)). + - 'tanh', the hyperbolic tan function, + returns f(x) = tanh(x). + - 'relu', the rectified linear unit function, + returns f(x) = max(0, x) + solver : {'lbfgs', 'sgd', 'adam'}, default='adam' + The solver for weight optimization. + - 'lbfgs' is an optimizer in the family of quasi-Newton methods. + - 'sgd' refers to stochastic gradient descent. + - 'adam' refers to a stochastic gradient-based optimizer proposed by + Kingma, Diederik, and Jimmy Ba + Note: The default solver 'adam' works pretty well on relatively + large datasets (with thousands of training samples or more) in terms of + both training time and validation score. + For small datasets, however, 'lbfgs' can converge faster and perform + better. + alpha : float, default=0.0001 + L2 penalty (regularization term) parameter. + batch_size : int, default='auto' + Size of minibatches for stochastic optimizers. 
+ If the solver is 'lbfgs', the classifier will not use minibatch. + When set to "auto", `batch_size=min(200, n_samples)` + learning_rate : {'constant', 'invscaling', 'adaptive'}, default='constant' + Learning rate schedule for weight updates. + - 'constant' is a constant learning rate given by + 'learning_rate_init'. + - 'invscaling' gradually decreases the learning rate ``learning_rate_`` + at each time step 't' using an inverse scaling exponent of 'power_t'. + effective_learning_rate = learning_rate_init / pow(t, power_t) + - 'adaptive' keeps the learning rate constant to + 'learning_rate_init' as long as training loss keeps decreasing. + Each time two consecutive epochs fail to decrease training loss by at + least tol, or fail to increase validation score by at least tol if + 'early_stopping' is on, the current learning rate is divided by 5. + Only used when solver='sgd'. + learning_rate_init : double, default=0.001 + The initial learning rate used. It controls the step-size + in updating the weights. Only used when solver='sgd' or 'adam'. + power_t : double, default=0.5 + The exponent for inverse scaling learning rate. + It is used in updating effective learning rate when the learning_rate + is set to 'invscaling'. Only used when solver='sgd'. + max_iter : int, default=200 + Maximum number of iterations. The solver iterates until convergence + (determined by 'tol') or this number of iterations. For stochastic + solvers ('sgd', 'adam'), note that this determines the number of epochs + (how many times each data point will be used), not the number of + gradient steps. + shuffle : bool, default=True + Whether to shuffle samples in each iteration. Only used when + solver='sgd' or 'adam'. + random_state : int, RandomState instance, default=None + Determines random number generation for weights and bias + initialization, train-test split if early stopping is used, and batch + sampling when solver='sgd' or 'adam'. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. + tol : float, default=1e-4 + Tolerance for the optimization. When the loss or score is not improving + by at least ``tol`` for ``n_iter_no_change`` consecutive iterations, + unless ``learning_rate`` is set to 'adaptive', convergence is + considered to be reached and training stops. + verbose : bool, default=False + Whether to print progress messages to stdout. + warm_start : bool, default=False + When set to True, reuse the solution of the previous + call to fit as initialization, otherwise, just erase the + previous solution. See :term:`the Glossary `. + momentum : float, default=0.9 + Momentum for gradient descent update. Should be between 0 and 1. Only + used when solver='sgd'. + nesterovs_momentum : boolean, default=True + Whether to use Nesterov's momentum. Only used when solver='sgd' and + momentum > 0. + early_stopping : bool, default=False + Whether to use early stopping to terminate training when validation + score is not improving. If set to true, it will automatically set + aside 10% of training data as validation and terminate training when + validation score is not improving by at least ``tol`` for + ``n_iter_no_change`` consecutive epochs. + Only effective when solver='sgd' or 'adam' + validation_fraction : float, default=0.1 + The proportion of training data to set aside as validation set for + early stopping. Must be between 0 and 1. 
+ Only used if early_stopping is True + beta_1 : float, default=0.9 + Exponential decay rate for estimates of first moment vector in adam, + should be in [0, 1). Only used when solver='adam' + beta_2 : float, default=0.999 + Exponential decay rate for estimates of second moment vector in adam, + should be in [0, 1). Only used when solver='adam' + epsilon : float, default=1e-8 + Value for numerical stability in adam. Only used when solver='adam' + n_iter_no_change : int, default=10 + Maximum number of epochs to not meet ``tol`` improvement. + Only effective when solver='sgd' or 'adam' + .. versionadded:: 0.20 + max_fun : int, default=15000 + Only used when solver='lbfgs'. Maximum number of function calls. + The solver iterates until convergence (determined by 'tol'), number + of iterations reaches max_iter, or this number of function calls. + Note that number of function calls will be greater than or equal to + the number of iterations for the MLPRegressor. + .. versionadded:: 0.22 + Attributes + ---------- + loss_ : float + The current loss computed with the loss function. + coefs_ : list, length n_layers - 1 + The ith element in the list represents the weight matrix corresponding + to layer i. + intercepts_ : list, length n_layers - 1 + The ith element in the list represents the bias vector corresponding to + layer i + 1. + n_iter_ : int, + The number of iterations the solver has ran. + n_layers_ : int + Number of layers. + n_outputs_ : int + Number of outputs. + out_activation_ : string + Name of the output activation function. + Examples + -------- + >>> from sklearn.neural_network import MLPRegressor + >>> from sklearn.datasets import make_regression + >>> from sklearn.model_selection import train_test_split + >>> X, y = make_regression(n_samples=200, random_state=1) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, + ... random_state=1) + >>> regr = MLPRegressor(random_state=1, max_iter=500).fit(X_train, y_train) + >>> regr.predict(X_test[:2]) + array([-0.9..., -7.1...]) + >>> regr.score(X_test, y_test) + 0.4... + Notes + ----- + MLPRegressor trains iteratively since at each time step + the partial derivatives of the loss function with respect to the model + parameters are computed to update the parameters. + It can also have a regularization term added to the loss function + that shrinks model parameters to prevent overfitting. + This implementation works with data represented as dense and sparse numpy + arrays of floating point values. + References + ---------- + Hinton, Geoffrey E. + "Connectionist learning procedures." Artificial intelligence 40.1 + (1989): 185-234. + Glorot, Xavier, and Yoshua Bengio. "Understanding the difficulty of + training deep feedforward neural networks." International Conference + on Artificial Intelligence and Statistics. 2010. + He, Kaiming, et al. "Delving deep into rectifiers: Surpassing human-level + performance on imagenet classification." arXiv preprint + arXiv:1502.01852 (2015). + Kingma, Diederik, and Jimmy Ba. "Adam: A method for stochastic + optimization." arXiv preprint arXiv:1412.6980 (2014). 
+ """ + @_deprecate_positional_args + def __init__(self, hidden_layer_sizes=(100,), activation="relu", *, + solver='adam', alpha=0.0001, + batch_size='auto', learning_rate="constant", + learning_rate_init=0.001, + power_t=0.5, max_iter=200, shuffle=True, + random_state=None, tol=1e-4, + verbose=False, warm_start=False, momentum=0.9, + nesterovs_momentum=True, early_stopping=False, + validation_fraction=0.1, beta_1=0.9, beta_2=0.999, + epsilon=1e-8, n_iter_no_change=10, max_fun=15000): + super().__init__( + hidden_layer_sizes=hidden_layer_sizes, + activation=activation, solver=solver, alpha=alpha, + batch_size=batch_size, learning_rate=learning_rate, + learning_rate_init=learning_rate_init, power_t=power_t, + max_iter=max_iter, loss='squared_loss', shuffle=shuffle, + random_state=random_state, tol=tol, verbose=verbose, + warm_start=warm_start, momentum=momentum, + nesterovs_momentum=nesterovs_momentum, + early_stopping=early_stopping, + validation_fraction=validation_fraction, + beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, + n_iter_no_change=n_iter_no_change, max_fun=max_fun) + + def predict(self, X): + """Predict using the multi-layer perceptron model. + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input data. + Returns + ------- + y : ndarray of shape (n_samples, n_outputs) + The predicted values. + """ + check_is_fitted(self) + y_pred = self._predict(X) + if y_pred.numdims() > 2 and y_pred.shape[1] == 1: + return af.flat(y_pred) + return y_pred + + def _validate_input(self, X, y, incremental): + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'], + multi_output=True, y_numeric=True) + if y.numdims() == 2 and y.shape[1] == 1: + y = column_or_1d(y, warn=True) + return X, y diff --git a/afsklearn/af_stochastic_optimizers.py b/afsklearn/af_stochastic_optimizers.py new file mode 100644 index 0000000..4c16a1b --- /dev/null +++ b/afsklearn/af_stochastic_optimizers.py @@ -0,0 +1,276 @@ +"""Stochastic optimization methods for MLP +""" + +# Authors: Jiyuan Qian +# License: BSD 3 clause + +import time +import numpy +import cupy as np +import arrayfire as af +import numpy +from math import sqrt + + +class BaseOptimizer: + """Base (Stochastic) gradient descent optimizer + Parameters + ---------- + params : list, length = len(coefs_) + len(intercepts_) + The concatenated list containing coefs_ and intercepts_ in MLP model. + Used for initializing velocities and updating params + learning_rate_init : float, default=0.1 + The initial learning rate used. It controls the step-size in updating + the weights + Attributes + ---------- + learning_rate : float + the current learning rate + """ + + def __init__(self, params, learning_rate_init=0.1): + self.params = [param for param in params] + self.learning_rate_init = learning_rate_init + self.learning_rate = float(learning_rate_init) + + def update_params(self, grads): + """Update parameters with given gradients + Parameters + ---------- + grads : list, length = len(params) + Containing gradients with respect to coefs_ and intercepts_ in MLP + model. 
So length should be aligned with params + """ + t0 = time.time() + updates = self._get_updates(grads) + #print(f'\tupdate0 time: {time.time() - t0:.6f}') + t1 = time.time() + for param, update in zip(self.params, updates): + param += update + #print(f'\tupdate1 time: {time.time() - t1:.6f}') + + def iteration_ends(self, time_step): + """Perform update to learning rate and potentially other states at the + end of an iteration + """ + pass + + def trigger_stopping(self, msg, verbose): + """Decides whether it is time to stop training + Parameters + ---------- + msg : str + Message passed in for verbose output + verbose : bool + Print message to stdin if True + Returns + ------- + is_stopping : bool + True if training needs to stop + """ + if verbose: + print(msg + " Stopping.") + return True + + +class SGDOptimizer(BaseOptimizer): + """Stochastic gradient descent optimizer with momentum + Parameters + ---------- + params : list, length = len(coefs_) + len(intercepts_) + The concatenated list containing coefs_ and intercepts_ in MLP model. + Used for initializing velocities and updating params + learning_rate_init : float, default=0.1 + The initial learning rate used. It controls the step-size in updating + the weights + lr_schedule : {'constant', 'adaptive', 'invscaling'}, default='constant' + Learning rate schedule for weight updates. + -'constant', is a constant learning rate given by + 'learning_rate_init'. + -'invscaling' gradually decreases the learning rate 'learning_rate_' at + each time step 't' using an inverse scaling exponent of 'power_t'. + learning_rate_ = learning_rate_init / pow(t, power_t) + -'adaptive', keeps the learning rate constant to + 'learning_rate_init' as long as the training keeps decreasing. + Each time 2 consecutive epochs fail to decrease the training loss by + tol, or fail to increase validation score by tol if 'early_stopping' + is on, the current learning rate is divided by 5. + momentum : float, default=0.9 + Value of momentum used, must be larger than or equal to 0 + nesterov : bool, default=True + Whether to use nesterov's momentum or not. Use nesterov's if True + power_t : float, default=0.5 + Power of time step 't' in inverse scaling. See `lr_schedule` for + more details. + Attributes + ---------- + learning_rate : float + the current learning rate + velocities : list, length = len(params) + velocities that are used to update params + """ + + def __init__(self, params, learning_rate_init=0.1, lr_schedule='constant', + momentum=0.9, nesterov=True, power_t=0.5): + super().__init__(params, learning_rate_init) + + self.lr_schedule = lr_schedule + self.momentum = momentum + self.nesterov = nesterov + self.power_t = power_t + + dims_list = [paddims2(param.dims()) for param in params] + + self.velocities = [af.constant(0, dim[0], dim[1]) for dim in dims_list] + + def iteration_ends(self, time_step): + """Perform updates to learning rate and potential other states at the + end of an iteration + Parameters + ---------- + time_step : int + number of training samples trained on so far, used to update + learning rate for 'invscaling' + """ + if self.lr_schedule == 'invscaling': + self.learning_rate = (float(self.learning_rate_init) / + (time_step + 1) ** self.power_t) + + def trigger_stopping(self, msg, verbose): + if self.lr_schedule != 'adaptive': + if verbose: + print(msg + " Stopping.") + return True + + if self.learning_rate <= 1e-6: + if verbose: + print(msg + " Learning rate too small. Stopping.") + return True + + self.learning_rate /= 5. 
+ if verbose: + print(msg + " Setting learning rate to %f" % + self.learning_rate) + return False + + def _get_updates(self, grads): + """Get the values used to update params with given gradients + Parameters + ---------- + grads : list, length = len(coefs_) + len(intercepts_) + Containing gradients with respect to coefs_ and intercepts_ in MLP + model. So length should be aligned with params + Returns + ------- + updates : list, length = len(grads) + The values to add to params + """ + updates = [self.momentum * velocity - self.learning_rate * grad + for velocity, grad in zip(self.velocities, grads)] + self.velocities = updates + + if self.nesterov: + updates = [self.momentum * velocity - self.learning_rate * grad + for velocity, grad in zip(self.velocities, grads)] + + return updates + +def paddims2(dims): + if len(dims) > 1: + return (dims[0], dims[1]) + return (dims[0], 1) + + +class AdamOptimizer(BaseOptimizer): + """Stochastic gradient descent optimizer with Adam + Note: All default values are from the original Adam paper + Parameters + ---------- + params : list, length = len(coefs_) + len(intercepts_) + The concatenated list containing coefs_ and intercepts_ in MLP model. + Used for initializing velocities and updating params + learning_rate_init : float, default=0.001 + The initial learning rate used. It controls the step-size in updating + the weights + beta_1 : float, default=0.9 + Exponential decay rate for estimates of first moment vector, should be + in [0, 1) + beta_2 : float, default=0.999 + Exponential decay rate for estimates of second moment vector, should be + in [0, 1) + epsilon : float, default=1e-8 + Value for numerical stability + Attributes + ---------- + learning_rate : float + The current learning rate + t : int + Timestep + ms : list, length = len(params) + First moment vectors + vs : list, length = len(params) + Second moment vectors + References + ---------- + Kingma, Diederik, and Jimmy Ba. + "Adam: A method for stochastic optimization." + arXiv preprint arXiv:1412.6980 (2014). + """ + + def __init__(self, params, learning_rate_init=0.001, beta_1=0.9, + beta_2=0.999, epsilon=1e-8): + super().__init__(params, learning_rate_init) + + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.epsilon = epsilon + self.t = 0 + + + dims_list = [paddims2(param.dims()) for param in params] + + self.ms = [af.constant(0, dim[0], dim[1]) for dim in dims_list] + self.vs = [af.constant(0, dim[0], dim[1]) for dim in dims_list] + + def _get_updates(self, grads): + """Get the values used to update params with given gradients + Parameters + ---------- + grads : list, length = len(coefs_) + len(intercepts_) + Containing gradients with respect to coefs_ and intercepts_ in MLP + model. 
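For reference, the momentum update that SGDOptimizer._get_updates performs above, written with NumPy arrays instead of ArrayFire (a sketch only): the velocity becomes momentum * velocity - lr * grad, and with Nesterov enabled the values actually added to the parameters re-evaluate that expression using the freshly updated velocities.

import numpy as np

def sgd_updates(velocities, grads, lr=0.1, momentum=0.9, nesterov=True):
    # velocity update, as in SGDOptimizer._get_updates
    new_velocities = [momentum * v - lr * g for v, g in zip(velocities, grads)]
    updates = new_velocities
    if nesterov:
        # Nesterov look-ahead: recompute the step with the updated velocities
        updates = [momentum * v - lr * g for v, g in zip(new_velocities, grads)]
    return new_velocities, updates

v, u = sgd_updates([np.zeros(3)], [np.ones(3)])
print(v[0])  # [-0.1 -0.1 -0.1]
print(u[0])  # [-0.19 -0.19 -0.19]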
So length should be aligned with params + Returns + ------- + updates : list, length = len(grads) + The values to add to params + """ + self.t += 1 + #af.sync() + t0 = time.time() + + self.ms = [self.beta_1 * m + (1.0 - self.beta_1) * grad + for m, grad in zip(self.ms, grads)] + #af.sync() + #print(f'\tget_update0 time: {time.time() - t0:.6f}') + t1 = time.time() + #cupy super slow here + #get_update0 time: 0.000916 + #SKget_update0 time: 0.000066 + #get_update1 time: 0.001256 + #SKget_update1 time: 0.000099 + self.vs = [self.beta_2 * v + (1 - self.beta_2) * (grad * grad) + for v, grad in zip(self.vs, grads)]#slooow + #af.sync() + #print(f'\tget_update1 time: {time.time() - t1:.6f}') + #2 = time.time() + self.learning_rate = (self.learning_rate_init * + sqrt(1 - self.beta_2 ** self.t) / + (1 - self.beta_1 ** self.t)) + #print(f'\tget_update2 time: {time.time() - t2:.6f}') + #t3 = time.time() + updates = [-self.learning_rate * m / (af.sqrt(v) + self.epsilon) + for m, v in zip(self.ms, self.vs)] #sloow + #print(f'\tget_update3 time: {time.time() - t3:.6f}') + #import pdb; pdb.set_trace() + + return updates diff --git a/afsklearn/base.py b/afsklearn/base.py index 097d2df..6dbf68d 100644 --- a/afsklearn/base.py +++ b/afsklearn/base.py @@ -7,8 +7,8 @@ from numpy.core.numeric import ComplexWarning from sklearn.base import BaseEstimator, TransformerMixin -from ._validation import ( - _assert_all_finite, _ensure_no_complex_data, _num_samples, _safe_accumulator_op, check_array, +from .utils import ( + assert_all_finite, _num_samples, _safe_accumulator_op, check_array, check_consistent_length, check_X_y, column_or_1d) @@ -117,8 +117,6 @@ class afTransformerMixin(TransformerMixin): from sklearn.preprocessing import LabelBinarizer from sklearn.utils.validation import _deprecate_positional_args -from ._validation import _num_samples, check_array, check_is_fitted, column_or_1d - def _unique_multiclass(y): if hasattr(y, '__array__'): diff --git a/afsklearn/neural_network/_nn_utils.py b/afsklearn/neural_network/_nn_utils.py new file mode 100644 index 0000000..7e15dbf --- /dev/null +++ b/afsklearn/neural_network/_nn_utils.py @@ -0,0 +1,245 @@ +"""Utilities for the neural network modules +""" +# Author: Issam H. Laradji +# License: BSD 3 clause + +import arrayfire as af +#import numpy as np +import numpy as np +import numpy +from ..utils._type_utils import typemap + + +def logistic_sigmoid(x): + # return 1 / (1 + af.exp(-x)) + return 1 / (1 + np.exp(-x)) + + +def xlogy(x, y): + return x * af.log(af.to_array(y)) + + +def identity(X): + """Simply return the input array. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Data, where n_samples is the number of samples + and n_features is the number of features. + Returns + ------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Same as the input data. + """ + return X + + +def logistic(X): + """Compute the logistic function inplace. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data. + Returns + ------- + X_new : {array-like, sparse matrix}, shape (n_samples, n_features) + The transformed data. + """ + return logistic_sigmoid(X) + + +def tanh(X): + """Compute the hyperbolic tan function inplace. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data. + Returns + ------- + X_new : {array-like, sparse matrix}, shape (n_samples, n_features) + The transformed data. 
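The Adam step implemented in AdamOptimizer._get_updates above keeps exponential moving averages of the gradient and squared gradient and applies a bias-corrected step size. A compact NumPy mirror of that computation, for comparison only (not part of the patch):

from math import sqrt
import numpy as np

def adam_updates(ms, vs, grads, t, lr_init=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8):
    # moving averages of the gradient and of the squared gradient
    ms = [beta_1 * m + (1 - beta_1) * g for m, g in zip(ms, grads)]
    vs = [beta_2 * v + (1 - beta_2) * (g * g) for v, g in zip(vs, grads)]
    # bias-corrected learning rate, as in AdamOptimizer._get_updates
    lr = lr_init * sqrt(1 - beta_2 ** t) / (1 - beta_1 ** t)
    updates = [-lr * m / (np.sqrt(v) + epsilon) for m, v in zip(ms, vs)]
    return ms, vs, updates

g = [np.array([0.5, -0.5])]
ms, vs, upd = adam_updates([np.zeros(2)], [np.zeros(2)], g, t=1)
print(upd[0])  # roughly [-0.001, 0.001]: the first step is about -lr_init * sign(grad)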
+ """ + return np.tanh(X, out=X) + + +def relu(X): + """Compute the rectified linear unit function inplace. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data. + Returns + ------- + X_new : {array-like, sparse matrix}, shape (n_samples, n_features) + The transformed data. + """ + #np.clip(X, 0, np.finfo(X.dtype).max, out=X) + ii = (X < 0) + if len(ii) > 0: + X[ii] = 0 + return X + + +def softmax(X): + """Compute the K-way softmax function inplace. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data. + Returns + ------- + X_new : {array-like, sparse matrix}, shape (n_samples, n_features) + The transformed data. + """ + tmp = X - af.max(X, dim=1) + X = af.exp(tmp) + X /= af.sum(X, dim=1) + + return X + + +ACTIVATIONS = {'identity': identity, 'tanh': tanh, 'logistic': logistic, + 'relu': relu, 'softmax': softmax} + + +def inplace_identity_derivative(Z, delta): + """Apply the derivative of the identity function: do nothing. + Parameters + ---------- + Z : {array-like, sparse matrix}, shape (n_samples, n_features) + The data which was output from the identity activation function during + the forward pass. + delta : {array-like}, shape (n_samples, n_features) + The backpropagated error signal to be modified inplace. + """ + # Nothing to do + + +def inplace_logistic_derivative(Z, delta): + """Apply the derivative of the logistic sigmoid function. + It exploits the fact that the derivative is a simple function of the output + value from logistic function. + Parameters + ---------- + Z : {array-like, sparse matrix}, shape (n_samples, n_features) + The data which was output from the logistic activation function during + the forward pass. + delta : {array-like}, shape (n_samples, n_features) + The backpropagated error signal to be modified inplace. + """ + delta *= Z + delta *= (1 - Z) + + +def inplace_tanh_derivative(Z, delta): + """Apply the derivative of the hyperbolic tanh function. + It exploits the fact that the derivative is a simple function of the output + value from hyperbolic tangent. + Parameters + ---------- + Z : {array-like, sparse matrix}, shape (n_samples, n_features) + The data which was output from the hyperbolic tangent activation + function during the forward pass. + delta : {array-like}, shape (n_samples, n_features) + The backpropagated error signal to be modified inplace. + """ + delta *= (1 - Z ** 2) + + +def inplace_relu_derivative(Z, delta): + """Apply the derivative of the relu function. + It exploits the fact that the derivative is a simple function of the output + value from rectified linear units activation function. + Parameters + ---------- + Z : {array-like, sparse matrix}, shape (n_samples, n_features) + The data which was output from the rectified linear units activation + function during the forward pass. + delta : {array-like}, shape (n_samples, n_features) + The backpropagated error signal to be modified inplace. + """ + delta[Z == 0] = 0 + + +DERIVATIVES = {'identity': inplace_identity_derivative, + 'tanh': inplace_tanh_derivative, + 'logistic': inplace_logistic_derivative, + 'relu': inplace_relu_derivative} + + +def squared_loss(y_true, y_pred): + """Compute the squared loss for regression. + Parameters + ---------- + y_true : array-like or label indicator matrix + Ground truth (correct) values. + y_pred : array-like or label indicator matrix + Predicted values, as returned by a regression estimator. 
+ Returns + ------- + loss : float + The degree to which the samples are correctly predicted. + """ + return af.mean(af.flat((y_true - y_pred) ** 2)) / 2 + + +def log_loss(y_true, y_prob): + """Compute Logistic loss for classification. + Parameters + ---------- + y_true : array-like or label indicator matrix + Ground truth (correct) labels. + y_prob : array-like of float, shape = (n_samples, n_classes) + Predicted probabilities, as returned by a classifier's + predict_proba method. + Returns + ------- + loss : float + The degree to which the samples are correctly predicted. + """ +# eps = np.finfo(y_prob.dtype).eps +# y_prob = np.clip(y_prob, eps, 1 - eps) +# if y_prob.shape[1] == 1: +# y_prob = np.append(1 - y_prob, y_prob, axis=1) +# +# if y_true.shape[1] == 1: +# y_true = np.append(1 - y_true, y_true, axis=1) +# +# return - xlogy(y_true, y_prob).sum() / y_prob.shape[0] + + eps = numpy.finfo(typemap(y_prob.dtype())).eps + y_prob[y_prob < eps] = eps + y_prob[y_prob > (1.0 - eps)] = 1.0 - eps + + if y_prob.numdims() == 1: + y_prob = af.join(1, (1.0 - y_prob).as_type(y_prob.dtype()), y_prob) + + if y_true.numdims() == 1: + y_true = af.join(1, (1.0 - y_true).as_type(y_true.dtype()), y_true) + + return - af.sum(af.flat(xlogy(y_true, y_prob))) / y_prob.shape[0] + + +def binary_log_loss(y_true, y_prob): + """Compute binary logistic loss for classification. + This is identical to log_loss in binary classification case, + but is kept for its use in multilabel case. + Parameters + ---------- + y_true : array-like or label indicator matrix + Ground truth (correct) labels. + y_prob : array-like of float, shape = (n_samples, 1) + Predicted probabilities, as returned by a classifier's + predict_proba method. + Returns + ------- + loss : float + The degree to which the samples are correctly predicted. + """ + eps = np.finfo(y_prob.dtype).eps + y_prob = np.clip(y_prob, eps, 1 - eps) + return af.sum(-(xlogy(y_true, y_prob) + xlogy(1 - y_true, 1 - y_prob))) / y_prob.shape[0] + + +LOSS_FUNCTIONS = {'squared_loss': squared_loss, 'log_loss': log_loss, + 'binary_log_loss': binary_log_loss} diff --git a/afsklearn/neural_network/base.py b/afsklearn/neural_network/base.py index 605ecaa..4458520 100644 --- a/afsklearn/neural_network/base.py +++ b/afsklearn/neural_network/base.py @@ -16,9 +16,10 @@ from ..base import afBaseEstimator from .._stochastic_optimizers import SGDOptimizer, AdamOptimizer -from .._validation import _safe_indexing, check_is_fitted, check_array, column_or_1d +from ..utils import check_array, column_or_1d +from ..utils.validation import _safe_indexing, check_is_fitted#, check_array, column_or_1d from .._extmath import safe_sparse_dot -from .._nn_utils import ACTIVATIONS, DERIVATIVES, LOSS_FUNCTIONS +from ._nn_utils import ACTIVATIONS, DERIVATIVES, LOSS_FUNCTIONS # from ..exceptions import ConvergenceWarning # from ..utils.extmath import safe_sparse_dot diff --git a/afsklearn/neural_network/nn_utils.py b/afsklearn/neural_network/nn_utils.py new file mode 100644 index 0000000..b3e201a --- /dev/null +++ b/afsklearn/neural_network/nn_utils.py @@ -0,0 +1,245 @@ +"""Utilities for the neural network modules +""" +# Author: Issam H. Laradji +# License: BSD 3 clause + +import arrayfire as af +#import numpy as np +import cupy as np +import numpy +from af_type_utils import typemap + +def logistic_sigmoid(x): + #return 1 / (1 + af.exp(-x)) + return 1 / (1 + np.exp(-x)) + +def xlogy(x, y): + return x * af.log(y) + + +def identity(X): + """Simply return the input array. 
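The ArrayFire log_loss above clips probabilities to [eps, 1 - eps] and, when only one probability column is given, joins the complementary column before taking the cross-entropy. The commented-out NumPy version it replaces can be reconstructed as follows (a reference sketch, assuming 2-D inputs):

import numpy as np

def log_loss_reference(y_true, y_prob, eps=None):
    y_true = np.asarray(y_true, dtype=float)
    y_prob = np.asarray(y_prob, dtype=float)
    eps = np.finfo(y_prob.dtype).eps if eps is None else eps
    y_prob = np.clip(y_prob, eps, 1 - eps)       # avoid log(0)
    if y_prob.shape[1] == 1:
        y_prob = np.append(1 - y_prob, y_prob, axis=1)
        y_true = np.append(1 - y_true, y_true, axis=1)
    return -(y_true * np.log(y_prob)).sum() / y_prob.shape[0]

y_true = [[0, 1], [1, 0]]
y_prob = [[0.2, 0.8], [0.6, 0.4]]
print(round(log_loss_reference(y_true, y_prob), 4))  # 0.367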
+ Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Data, where n_samples is the number of samples + and n_features is the number of features. + Returns + ------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Same as the input data. + """ + return X + + +def logistic(X): + """Compute the logistic function inplace. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data. + Returns + ------- + X_new : {array-like, sparse matrix}, shape (n_samples, n_features) + The transformed data. + """ + return logistic_sigmoid(X, out=X) + + +def tanh(X): + """Compute the hyperbolic tan function inplace. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data. + Returns + ------- + X_new : {array-like, sparse matrix}, shape (n_samples, n_features) + The transformed data. + """ + return np.tanh(X, out=X) + + +def relu(X): + """Compute the rectified linear unit function inplace. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data. + Returns + ------- + X_new : {array-like, sparse matrix}, shape (n_samples, n_features) + The transformed data. + """ + #np.clip(X, 0, np.finfo(X.dtype).max, out=X) + ii = (X < 0) + if len(ii) > 0: + X[ii] = 0 + return X + + +def softmax(X): + """Compute the K-way softmax function inplace. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + The input data. + Returns + ------- + X_new : {array-like, sparse matrix}, shape (n_samples, n_features) + The transformed data. + """ + tmp = X - af.max(X, dim=1) + X = af.exp(tmp) + X /= af.sum(X, dim=1) + + return X + + +ACTIVATIONS = {'identity': identity, 'tanh': tanh, 'logistic': logistic, + 'relu': relu, 'softmax': softmax} + + +def inplace_identity_derivative(Z, delta): + """Apply the derivative of the identity function: do nothing. + Parameters + ---------- + Z : {array-like, sparse matrix}, shape (n_samples, n_features) + The data which was output from the identity activation function during + the forward pass. + delta : {array-like}, shape (n_samples, n_features) + The backpropagated error signal to be modified inplace. + """ + # Nothing to do + + +def inplace_logistic_derivative(Z, delta): + """Apply the derivative of the logistic sigmoid function. + It exploits the fact that the derivative is a simple function of the output + value from logistic function. + Parameters + ---------- + Z : {array-like, sparse matrix}, shape (n_samples, n_features) + The data which was output from the logistic activation function during + the forward pass. + delta : {array-like}, shape (n_samples, n_features) + The backpropagated error signal to be modified inplace. + """ + delta *= Z + delta *= (1 - Z) + + +def inplace_tanh_derivative(Z, delta): + """Apply the derivative of the hyperbolic tanh function. + It exploits the fact that the derivative is a simple function of the output + value from hyperbolic tangent. + Parameters + ---------- + Z : {array-like, sparse matrix}, shape (n_samples, n_features) + The data which was output from the hyperbolic tangent activation + function during the forward pass. + delta : {array-like}, shape (n_samples, n_features) + The backpropagated error signal to be modified inplace. + """ + delta *= (1 - Z ** 2) + + +def inplace_relu_derivative(Z, delta): + """Apply the derivative of the relu function. 
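The in-place derivative helpers above exploit the fact that each activation's derivative can be expressed through its own output Z; a quick NumPy check of that identity for the logistic case (sketch only, not part of the patch):

import numpy as np

x = np.linspace(-2.0, 2.0, 5)
Z = 1.0 / (1.0 + np.exp(-x))       # logistic forward-pass output
delta = np.ones_like(Z)            # incoming backpropagated error
delta *= Z
delta *= (1 - Z)                   # what inplace_logistic_derivative applies

analytic = np.exp(-x) / (1.0 + np.exp(-x)) ** 2
print(np.allclose(delta, analytic))  # True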
+ It exploits the fact that the derivative is a simple function of the output + value from rectified linear units activation function. + Parameters + ---------- + Z : {array-like, sparse matrix}, shape (n_samples, n_features) + The data which was output from the rectified linear units activation + function during the forward pass. + delta : {array-like}, shape (n_samples, n_features) + The backpropagated error signal to be modified inplace. + """ + delta[Z == 0] = 0 + + +DERIVATIVES = {'identity': inplace_identity_derivative, + 'tanh': inplace_tanh_derivative, + 'logistic': inplace_logistic_derivative, + 'relu': inplace_relu_derivative} + + +def squared_loss(y_true, y_pred): + """Compute the squared loss for regression. + Parameters + ---------- + y_true : array-like or label indicator matrix + Ground truth (correct) values. + y_pred : array-like or label indicator matrix + Predicted values, as returned by a regression estimator. + Returns + ------- + loss : float + The degree to which the samples are correctly predicted. + """ + return af.mean(af.flat((y_true - y_pred) ** 2)) / 2 + + +def log_loss(y_true, y_prob): + """Compute Logistic loss for classification. + Parameters + ---------- + y_true : array-like or label indicator matrix + Ground truth (correct) labels. + y_prob : array-like of float, shape = (n_samples, n_classes) + Predicted probabilities, as returned by a classifier's + predict_proba method. + Returns + ------- + loss : float + The degree to which the samples are correctly predicted. + """ +# eps = np.finfo(y_prob.dtype).eps +# y_prob = np.clip(y_prob, eps, 1 - eps) +# if y_prob.shape[1] == 1: +# y_prob = np.append(1 - y_prob, y_prob, axis=1) +# +# if y_true.shape[1] == 1: +# y_true = np.append(1 - y_true, y_true, axis=1) +# +# return - xlogy(y_true, y_prob).sum() / y_prob.shape[0] + + eps = numpy.finfo(typemap(y_prob.dtype())).eps + y_prob[y_prob < eps] = eps + y_prob[y_prob > (1.0 - eps)] = 1.0 - eps + + if y_prob.numdims() == 1: + y_prob = af.join(1, (1.0 - y_prob).as_type(y_prob.dtype()), y_prob) + + if y_true.numdims() == 1: + y_true = af.join(1, (1.0 - y_true).as_type(y_true.dtype()), y_true) + + return - af.sum(af.flat(xlogy(y_true, y_prob))) / y_prob.shape[0] + + + +def binary_log_loss(y_true, y_prob): + """Compute binary logistic loss for classification. + This is identical to log_loss in binary classification case, + but is kept for its use in multilabel case. + Parameters + ---------- + y_true : array-like or label indicator matrix + Ground truth (correct) labels. + y_prob : array-like of float, shape = (n_samples, 1) + Predicted probabilities, as returned by a classifier's + predict_proba method. + Returns + ------- + loss : float + The degree to which the samples are correctly predicted. 
+ """ + eps = np.finfo(y_prob.dtype).eps + y_prob = np.clip(y_prob, eps, 1 - eps) + return -(xlogy(y_true, y_prob) + + xlogy(1 - y_true, 1 - y_prob)).sum() / y_prob.shape[0] + + +LOSS_FUNCTIONS = {'squared_loss': squared_loss, 'log_loss': log_loss, + 'binary_log_loss': binary_log_loss} diff --git a/afsklearn/patched_modules.yml b/afsklearn/patched_modules.yml index cd1c424..a7c4b34 100644 --- a/afsklearn/patched_modules.yml +++ b/afsklearn/patched_modules.yml @@ -2,3 +2,8 @@ mlp_classifier: name: MLPClassifier module: sklearn.neural_network module_patch: afsklearn.neural_network.mlp_classifier + +QuantileTransformer: + name: QuantileTransformer + module: sklearn.preprocessing + module_patch: afsklearn.preprocessing diff --git a/afsklearn/patcher/__init__.py b/afsklearn/patcher/__init__.py index b940bbf..605ea9a 100644 --- a/afsklearn/patcher/__init__.py +++ b/afsklearn/patcher/__init__.py @@ -20,11 +20,16 @@ def rollback(module_name: str) -> None: patch_config = patches_info[module_name] _apply_patch(patch_config["module"], patch_config["name"], None) - def patch_all(): - raise NotImplemented + @staticmethod + def patch_all() -> None: + for name in patches_info: + Patcher.patch(name) - def rollback_all(): - raise NotImplemented + @staticmethod + def rollback_all() -> None: + for name in patches_info: + if name in temporary_storage: + Patcher.rollback(name) def _load_module(module_path: str) -> Any: diff --git a/afsklearn/preprocessing/_data.py b/afsklearn/preprocessing/_data.py index 8c1d8c2..990764a 100644 --- a/afsklearn/preprocessing/_data.py +++ b/afsklearn/preprocessing/_data.py @@ -2,14 +2,16 @@ import warnings from itertools import combinations_with_replacement as combinations_w_r +import time import numpy as np +import arrayfire as af from scipy import sparse from scipy import stats from scipy import optimize from scipy.special import boxcox from ..base import afBaseEstimator, afTransformerMixin -from .._validation import check_array +from ..utils import check_array #from ..utils.deprecation import deprecated #from ..utils.extmath import row_norms #from ..utils.extmath import (_incremental_mean_and_var, @@ -19,165 +21,196 @@ #from ..utils.sparsefuncs import (inplace_column_scale, #mean_variance_axis, incr_mean_variance_axis, #min_max_axis) -#from ..utils.validation import (check_is_fitted, check_random_state, - #_check_sample_weight, - #FLOAT_DTYPES, _deprecate_positional_args) +from ..utils.validation import (check_is_fitted, check_random_state, _check_sample_weight) +import sklearn +from sklearn.utils.validation import FLOAT_DTYPES, _deprecate_positional_args #from ._csr_polynomial_expansion import _csr_polynomial_expansion # #from ._encoders import OneHotEncoder +def interp1_af(pos, x, y): + # distances to normalize between two data points + dists = af.diff1(x) + + # tile data locations to test against all desired positions + #x = af.tile(x, 1, pos.dims()[0]) + # get indices of two nearest data points + #idxs = af.count(x < af.tile(pos, 1, x.dims()[0]).T, dim=0).as_type(af.Dtype.s32) - 1; + + idxs = af.constant(0.0, 1, pos.dims()[0], dtype=af.Dtype.s32); + for i in range(pos.dims()[0]): + temp = af.tile(pos[i], x.dims()[0]); + idxs[0, i] = af.count( temp > x, 0).as_type(af.Dtype.s32) - 1 ; + + idxs[idxs < 0] = 0; + idxs[idxs > (x.dims()[0] - 1)] = x.dims()[0] - 1; + + minvals = x[idxs] + + #import pdb; pdb.set_trace(); + pos -= minvals + pos /= dists[idxs] + pos += idxs.T + + return af.approx1(y, pos) + + class QuantileTransformer(afTransformerMixin, afBaseEstimator): - def 
fit(self, x): - print('nope') -# """Transform features using quantiles information. -# -# This method transforms the features to follow a uniform or a normal -# distribution. Therefore, for a given feature, this transformation tends -# to spread out the most frequent values. It also reduces the impact of -# (marginal) outliers: this is therefore a robust preprocessing scheme. -# -# The transformation is applied on each feature independently. First an -# estimate of the cumulative distribution function of a feature is -# used to map the original values to a uniform distribution. The obtained -# values are then mapped to the desired output distribution using the -# associated quantile function. Features values of new/unseen data that fall -# below or above the fitted range will be mapped to the bounds of the output -# distribution. Note that this transform is non-linear. It may distort linear -# correlations between variables measured at the same scale but renders -# variables measured at different scales more directly comparable. -# -# Read more in the :ref:`User Guide `. -# -# .. versionadded:: 0.19 -# -# Parameters -# ---------- -# n_quantiles : int, default=1000 or n_samples -# Number of quantiles to be computed. It corresponds to the number -# of landmarks used to discretize the cumulative distribution function. -# If n_quantiles is larger than the number of samples, n_quantiles is set -# to the number of samples as a larger number of quantiles does not give -# a better approximation of the cumulative distribution function -# estimator. -# -# output_distribution : {'uniform', 'normal'}, default='uniform' -# Marginal distribution for the transformed data. The choices are -# 'uniform' (default) or 'normal'. -# -# ignore_implicit_zeros : bool, default=False -# Only applies to sparse matrices. If True, the sparse entries of the -# matrix are discarded to compute the quantile statistics. If False, -# these entries are treated as zeros. -# -# subsample : int, default=1e5 -# Maximum number of samples used to estimate the quantiles for -# computational efficiency. Note that the subsampling procedure may -# differ for value-identical sparse and dense matrices. -# -# random_state : int, RandomState instance or None, default=None -# Determines random number generation for subsampling and smoothing -# noise. -# Please see ``subsample`` for more details. -# Pass an int for reproducible results across multiple function calls. -# See :term:`Glossary ` -# -# copy : bool, default=True -# Set to False to perform inplace transformation and avoid a copy (if the -# input is already a numpy array). -# -# Attributes -# ---------- -# n_quantiles_ : int -# The actual number of quantiles used to discretize the cumulative -# distribution function. -# -# quantiles_ : ndarray of shape (n_quantiles, n_features) -# The values corresponding the quantiles of reference. -# -# references_ : ndarray of shape (n_quantiles, ) -# Quantiles of references. -# -# Examples -# -------- -# >>> import numpy as np -# >>> from sklearn.preprocessing import QuantileTransformer -# >>> rng = np.random.RandomState(0) -# >>> X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0) -# >>> qt = QuantileTransformer(n_quantiles=10, random_state=0) -# >>> qt.fit_transform(X) -# array([...]) -# -# See Also -# -------- -# quantile_transform : Equivalent function without the estimator API. -# PowerTransformer : Perform mapping to a normal distribution using a power -# transform. 
-# StandardScaler : Perform standardization that is faster, but less robust -# to outliers. -# RobustScaler : Perform robust standardization that removes the influence -# of outliers but does not put outliers and inliers on the same scale. -# -# Notes -# ----- -# NaNs are treated as missing values: disregarded in fit, and maintained in -# transform. -# -# For a comparison of the different scalers, transformers, and normalizers, -# see :ref:`examples/preprocessing/plot_all_scaling.py -# `. -# """ -# -# @_deprecate_positional_args -# def __init__(self, *, n_quantiles=1000, output_distribution='uniform', -# ignore_implicit_zeros=False, subsample=int(1e5), -# random_state=None, copy=True): -# self.n_quantiles = n_quantiles -# self.output_distribution = output_distribution -# self.ignore_implicit_zeros = ignore_implicit_zeros -# self.subsample = subsample -# self.random_state = random_state -# self.copy = copy -# -# def _dense_fit(self, X, random_state): -# """Compute percentiles for dense matrices. -# -# Parameters -# ---------- -# X : ndarray of shape (n_samples, n_features) -# The data used to scale along the features axis. -# """ -# if self.ignore_implicit_zeros: -# warnings.warn("'ignore_implicit_zeros' takes effect only with" -# " sparse matrix. This parameter has no effect.") -# -# n_samples, n_features = X.shape -# references = self.references_ * 100 -# -# self.quantiles_ = [] -# for col in X.T: -# if self.subsample < n_samples: -# subsample_idx = random_state.choice(n_samples, -# size=self.subsample, -# replace=False) -# col = col.take(subsample_idx, mode='clip') -# self.quantiles_.append(np.nanpercentile(col, references)) -# self.quantiles_ = np.transpose(self.quantiles_) -# # Due to floating-point precision error in `np.nanpercentile`, -# # make sure that quantiles are monotonically increasing. -# # Upstream issue in numpy: -# # https://github.com/numpy/numpy/issues/14685 -# self.quantiles_ = np.maximum.accumulate(self.quantiles_) -# -# def _sparse_fit(self, X, random_state): -# """Compute percentiles for sparse matrices. -# -# Parameters -# ---------- -# X : sparse matrix of shape (n_samples, n_features) -# The data used to scale along the features axis. The sparse matrix -# needs to be nonnegative. If a sparse matrix is provided, -# it will be converted into a sparse ``csc_matrix``. -# """ + """Transform features using quantiles information. + + This method transforms the features to follow a uniform or a normal + distribution. Therefore, for a given feature, this transformation tends + to spread out the most frequent values. It also reduces the impact of + (marginal) outliers: this is therefore a robust preprocessing scheme. + + The transformation is applied on each feature independently. First an + estimate of the cumulative distribution function of a feature is + used to map the original values to a uniform distribution. The obtained + values are then mapped to the desired output distribution using the + associated quantile function. Features values of new/unseen data that fall + below or above the fitted range will be mapped to the bounds of the output + distribution. Note that this transform is non-linear. It may distort linear + correlations between variables measured at the same scale but renders + variables measured at different scales more directly comparable. + + Read more in the :ref:`User Guide `. + + .. versionadded:: 0.19 + + Parameters + ---------- + n_quantiles : int, default=1000 or n_samples + Number of quantiles to be computed. 
It corresponds to the number + of landmarks used to discretize the cumulative distribution function. + If n_quantiles is larger than the number of samples, n_quantiles is set + to the number of samples as a larger number of quantiles does not give + a better approximation of the cumulative distribution function + estimator. + + output_distribution : {'uniform', 'normal'}, default='uniform' + Marginal distribution for the transformed data. The choices are + 'uniform' (default) or 'normal'. + + ignore_implicit_zeros : bool, default=False + Only applies to sparse matrices. If True, the sparse entries of the + matrix are discarded to compute the quantile statistics. If False, + these entries are treated as zeros. + + subsample : int, default=1e5 + Maximum number of samples used to estimate the quantiles for + computational efficiency. Note that the subsampling procedure may + differ for value-identical sparse and dense matrices. + + random_state : int, RandomState instance or None, default=None + Determines random number generation for subsampling and smoothing + noise. + Please see ``subsample`` for more details. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary ` + + copy : bool, default=True + Set to False to perform inplace transformation and avoid a copy (if the + input is already a numpy array). + + Attributes + ---------- + n_quantiles_ : int + The actual number of quantiles used to discretize the cumulative + distribution function. + + quantiles_ : ndarray of shape (n_quantiles, n_features) + The values corresponding the quantiles of reference. + + references_ : ndarray of shape (n_quantiles, ) + Quantiles of references. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import QuantileTransformer + >>> rng = np.random.RandomState(0) + >>> X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0) + >>> qt = QuantileTransformer(n_quantiles=10, random_state=0) + >>> qt.fit_transform(X) + array([...]) + + See Also + -------- + quantile_transform : Equivalent function without the estimator API. + PowerTransformer : Perform mapping to a normal distribution using a power + transform. + StandardScaler : Perform standardization that is faster, but less robust + to outliers. + RobustScaler : Perform robust standardization that removes the influence + of outliers but does not put outliers and inliers on the same scale. + + Notes + ----- + NaNs are treated as missing values: disregarded in fit, and maintained in + transform. + + For a comparison of the different scalers, transformers, and normalizers, + see :ref:`examples/preprocessing/plot_all_scaling.py + `. + """ + + @_deprecate_positional_args + def __init__(self, *, n_quantiles=1000, output_distribution='uniform', + ignore_implicit_zeros=False, subsample=int(1e5), + random_state=None, copy=True): + self.n_quantiles = n_quantiles + self.output_distribution = output_distribution + self.ignore_implicit_zeros = ignore_implicit_zeros + self.subsample = subsample + self.random_state = random_state + self.copy = copy + + def _dense_fit(self, X, random_state): + """Compute percentiles for dense matrices. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The data used to scale along the features axis. + """ + tic = time.perf_counter() + + if self.ignore_implicit_zeros: + warnings.warn("'ignore_implicit_zeros' takes effect only with" + " sparse matrix. 
This parameter has no effect.") + + n_samples, n_features = X.shape + references = self.references_ * 100 + + #TODO: fix slowdowns heeere + self.quantiles_ = [] + for col in X.T: + if self.subsample < n_samples: + subsample_idx = random_state.choice(n_samples, + size=self.subsample, + replace=False) + col = col.take(subsample_idx, mode='clip') + self.quantiles_.append(np.nanpercentile(col, references)) + self.quantiles_ = np.transpose(self.quantiles_) + # Due to floating-point precision error in `np.nanpercentile`, + # make sure that quantiles are monotonically increasing. + # Upstream issue in numpy: + # https://github.com/numpy/numpy/issues/14685 + self.quantiles_ = np.maximum.accumulate(self.quantiles_) + + toc = time.perf_counter() + print(f"sklearn _dense_fit time {(toc - tic):0.4f} seconds") + + def _sparse_fit(self, X, random_state): + """Compute percentiles for sparse matrices. + + Parameters + ---------- + X : sparse matrix of shape (n_samples, n_features) + The data used to scale along the features axis. The sparse matrix + needs to be nonnegative. If a sparse matrix is provided, + it will be converted into a sparse ``csc_matrix``. + """ # n_samples, n_features = X.shape # references = self.references_ * 100 # @@ -216,210 +249,312 @@ def fit(self, x): # # Upstream issue in numpy: # # https://github.com/numpy/numpy/issues/14685 # self.quantiles_ = np.maximum.accumulate(self.quantiles_) -# -# def fit(self, X, y=None): -# """Compute the quantiles used for transforming. -# -# Parameters -# ---------- -# X : {array-like, sparse matrix} of shape (n_samples, n_features) -# The data used to scale along the features axis. If a sparse -# matrix is provided, it will be converted into a sparse -# ``csc_matrix``. Additionally, the sparse matrix needs to be -# nonnegative if `ignore_implicit_zeros` is False. -# -# y : None -# Ignored. -# -# Returns -# ------- -# self : object -# Fitted transformer. -# """ -# if self.n_quantiles <= 0: -# raise ValueError("Invalid value for 'n_quantiles': %d. " -# "The number of quantiles must be at least one." -# % self.n_quantiles) -# -# if self.subsample <= 0: -# raise ValueError("Invalid value for 'subsample': %d. " -# "The number of subsamples must be at least one." -# % self.subsample) -# -# if self.n_quantiles > self.subsample: -# raise ValueError("The number of quantiles cannot be greater than" -# " the number of samples used. Got {} quantiles" -# " and {} samples.".format(self.n_quantiles, -# self.subsample)) -# -# X = self._check_inputs(X, in_fit=True, copy=False) -# n_samples = X.shape[0] -# -# if self.n_quantiles > n_samples: -# warnings.warn("n_quantiles (%s) is greater than the total number " -# "of samples (%s). n_quantiles is set to " -# "n_samples." 
-# % (self.n_quantiles, n_samples)) -# self.n_quantiles_ = max(1, min(self.n_quantiles, n_samples)) -# -# rng = check_random_state(self.random_state) -# -# # Create the quantiles of reference -# self.references_ = np.linspace(0, 1, self.n_quantiles_, -# endpoint=True) -# if sparse.issparse(X): -# self._sparse_fit(X, rng) -# else: -# self._dense_fit(X, rng) -# -# return self -# -# def _transform_col(self, X_col, quantiles, inverse): -# """Private function to transform a single feature.""" -# -# output_distribution = self.output_distribution -# -# if not inverse: -# lower_bound_x = quantiles[0] -# upper_bound_x = quantiles[-1] -# lower_bound_y = 0 -# upper_bound_y = 1 -# else: -# lower_bound_x = 0 -# upper_bound_x = 1 -# lower_bound_y = quantiles[0] -# upper_bound_y = quantiles[-1] -# # for inverse transform, match a uniform distribution -# with np.errstate(invalid='ignore'): # hide NaN comparison warnings -# if output_distribution == 'normal': -# X_col = stats.norm.cdf(X_col) -# # else output distribution is already a uniform distribution -# -# # find index for lower and higher bounds -# with np.errstate(invalid='ignore'): # hide NaN comparison warnings -# if output_distribution == 'normal': -# lower_bounds_idx = (X_col - BOUNDS_THRESHOLD < -# lower_bound_x) -# upper_bounds_idx = (X_col + BOUNDS_THRESHOLD > -# upper_bound_x) -# if output_distribution == 'uniform': -# lower_bounds_idx = (X_col == lower_bound_x) -# upper_bounds_idx = (X_col == upper_bound_x) -# -# isfinite_mask = ~np.isnan(X_col) -# X_col_finite = X_col[isfinite_mask] -# if not inverse: -# # Interpolate in one direction and in the other and take the -# # mean. This is in case of repeated values in the features -# # and hence repeated quantiles -# # -# # If we don't do this, only one extreme of the duplicated is -# # used (the upper when we do ascending, and the -# # lower for descending). We take the mean of these two -# X_col[isfinite_mask] = .5 * ( -# np.interp(X_col_finite, quantiles, self.references_) -# - np.interp(-X_col_finite, -quantiles[::-1], -# -self.references_[::-1])) -# else: -# X_col[isfinite_mask] = np.interp(X_col_finite, -# self.references_, quantiles) -# -# X_col[upper_bounds_idx] = upper_bound_y -# X_col[lower_bounds_idx] = lower_bound_y -# # for forward transform, match the output distribution -# if not inverse: -# with np.errstate(invalid='ignore'): # hide NaN comparison warnings -# if output_distribution == 'normal': -# X_col = stats.norm.ppf(X_col) -# # find the value to clip the data to avoid mapping to -# # infinity. Clip such that the inverse transform will be -# # consistent -# clip_min = stats.norm.ppf(BOUNDS_THRESHOLD - np.spacing(1)) -# clip_max = stats.norm.ppf(1 - (BOUNDS_THRESHOLD - -# np.spacing(1))) -# X_col = np.clip(X_col, clip_min, clip_max) -# # else output distribution is uniform and the ppf is the -# # identity function so we let X_col unchanged -# -# return X_col -# -# def _check_inputs(self, X, in_fit, accept_sparse_negative=False, -# copy=False): -# """Check inputs before fit and transform.""" -# X = self._validate_data(X, reset=in_fit, -# accept_sparse='csc', copy=copy, -# dtype=FLOAT_DTYPES, -# force_all_finite='allow-nan') -# # we only accept positive sparse matrix when ignore_implicit_zeros is -# # false and that we call fit or transform. 
-# with np.errstate(invalid='ignore'): # hide NaN comparison warnings -# if (not accept_sparse_negative and not self.ignore_implicit_zeros -# and (sparse.issparse(X) and np.any(X.data < 0))): -# raise ValueError('QuantileTransformer only accepts' -# ' non-negative sparse matrices.') -# -# # check the output distribution -# if self.output_distribution not in ('normal', 'uniform'): -# raise ValueError("'output_distribution' has to be either 'normal'" -# " or 'uniform'. Got '{}' instead.".format( -# self.output_distribution)) -# -# return X -# -# def _transform(self, X, inverse=False): -# """Forward and inverse transform. -# -# Parameters -# ---------- -# X : ndarray of shape (n_samples, n_features) -# The data used to scale along the features axis. -# -# inverse : bool, default=False -# If False, apply forward transform. If True, apply -# inverse transform. -# -# Returns -# ------- -# X : ndarray of shape (n_samples, n_features) -# Projected data. -# """ -# -# if sparse.issparse(X): -# for feature_idx in range(X.shape[1]): -# column_slice = slice(X.indptr[feature_idx], -# X.indptr[feature_idx + 1]) -# X.data[column_slice] = self._transform_col( -# X.data[column_slice], self.quantiles_[:, feature_idx], -# inverse) -# else: -# for feature_idx in range(X.shape[1]): -# X[:, feature_idx] = self._transform_col( -# X[:, feature_idx], self.quantiles_[:, feature_idx], -# inverse) -# -# return X -# -# def transform(self, X): -# """Feature-wise transformation of the data. -# -# Parameters -# ---------- -# X : {array-like, sparse matrix} of shape (n_samples, n_features) -# The data used to scale along the features axis. If a sparse -# matrix is provided, it will be converted into a sparse -# ``csc_matrix``. Additionally, the sparse matrix needs to be -# nonnegative if `ignore_implicit_zeros` is False. -# -# Returns -# ------- -# Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) -# The projected data. -# """ -# check_is_fitted(self) -# X = self._check_inputs(X, in_fit=False, copy=self.copy) -# -# return self._transform(X, inverse=False) -# + + def fit(self, X, y=None): + """Compute the quantiles used for transforming. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to scale along the features axis. If a sparse + matrix is provided, it will be converted into a sparse + ``csc_matrix``. Additionally, the sparse matrix needs to be + nonnegative if `ignore_implicit_zeros` is False. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted transformer. + """ + if self.n_quantiles <= 0: + raise ValueError("Invalid value for 'n_quantiles': %d. " + "The number of quantiles must be at least one." + % self.n_quantiles) + + if self.subsample <= 0: + raise ValueError("Invalid value for 'subsample': %d. " + "The number of subsamples must be at least one." + % self.subsample) + + if self.n_quantiles > self.subsample: + raise ValueError("The number of quantiles cannot be greater than" + " the number of samples used. Got {} quantiles" + " and {} samples.".format(self.n_quantiles, + self.subsample)) + + X = self._check_inputs(X, in_fit=True, copy=False) + n_samples = X.shape[0] + + if self.n_quantiles > n_samples: + warnings.warn("n_quantiles (%s) is greater than the total number " + "of samples (%s). n_quantiles is set to " + "n_samples." 
+ % (self.n_quantiles, n_samples)) + self.n_quantiles_ = max(1, min(self.n_quantiles, n_samples)) + + rng = check_random_state(self.random_state) + + # Create the quantiles of reference + self.references_ = np.linspace(0, 1, self.n_quantiles_, + endpoint=True) + if sparse.issparse(X): + self._sparse_fit(X, rng) + else: + self._dense_fit(X, rng) + + return self + + def _transform_col(self, X_col, quantiles, inverse): + """Private function to transform a single feature.""" + tic = time.perf_counter() + #import pdb; pdb.set_trace() + + output_distribution = self.output_distribution + + if not inverse: + lower_bound_x = quantiles[0] + upper_bound_x = quantiles[-1] + lower_bound_y = 0 + upper_bound_y = 1 + else: + lower_bound_x = 0 + upper_bound_x = 1 + lower_bound_y = quantiles[0] + upper_bound_y = quantiles[-1] + # for inverse transform, match a uniform distribution + with np.errstate(invalid='ignore'): # hide NaN comparison warnings + if output_distribution == 'normal': + X_col = stats.norm.cdf(X_col) + # else output distribution is already a uniform distribution + + # find index for lower and higher bounds + with np.errstate(invalid='ignore'): # hide NaN comparison warnings + if output_distribution == 'normal': + lower_bounds_idx = (X_col - BOUNDS_THRESHOLD < + lower_bound_x) + upper_bounds_idx = (X_col + BOUNDS_THRESHOLD > + upper_bound_x) + if output_distribution == 'uniform': + lower_bounds_idx = (X_col == lower_bound_x) + upper_bounds_idx = (X_col == upper_bound_x) + + isfinite_mask = ~np.isnan(X_col) + X_col_finite = X_col[isfinite_mask] + if not inverse: + # Interpolate in one direction and in the other and take the + # mean. This is in case of repeated values in the features + # and hence repeated quantiles + # + # If we don't do this, only one extreme of the duplicated is + # used (the upper when we do ascending, and the + # lower for descending). We take the mean of these two + X_col[isfinite_mask] = .5 * ( + np.interp(X_col_finite, quantiles, self.references_) + - np.interp(-X_col_finite, -quantiles[::-1], + -self.references_[::-1])) + else: + X_col[isfinite_mask] = np.interp(X_col_finite, + self.references_, quantiles) + + X_col[upper_bounds_idx] = upper_bound_y + X_col[lower_bounds_idx] = lower_bound_y + # for forward transform, match the output distribution + if not inverse: + with np.errstate(invalid='ignore'): # hide NaN comparison warnings + if output_distribution == 'normal': + X_col = stats.norm.ppf(X_col) + # find the value to clip the data to avoid mapping to + # infinity. 
Clip such that the inverse transform will be + # consistent + clip_min = stats.norm.ppf(BOUNDS_THRESHOLD - np.spacing(1)) + clip_max = stats.norm.ppf(1 - (BOUNDS_THRESHOLD - + np.spacing(1))) + X_col = np.clip(X_col, clip_min, clip_max) + # else output distribution is uniform and the ppf is the + # identity function so we let X_col unchanged + + toc = time.perf_counter() + print(f"sklearn _transform_col time {(toc - tic):0.4f} seconds") + return X_col + + def _transform_col_af(self, X_col, quantiles, inverse): + """Private function to transform a single feature.""" + tic = time.perf_counter() + #import pdb; pdb.set_trace() + + output_distribution = self.output_distribution + + if not inverse: + lower_bound_x = quantiles[0] + upper_bound_x = quantiles[-1] + lower_bound_y = 0 + upper_bound_y = 1 + else: + lower_bound_x = 0 + upper_bound_x = 1 + lower_bound_y = quantiles[0] + upper_bound_y = quantiles[-1] + # for inverse transform, match a uniform distribution + with np.errstate(invalid='ignore'): # hide NaN comparison warnings + if output_distribution == 'normal': + X_col = stats.norm.cdf(X_col) + # else output distribution is already a uniform distribution + + # find index for lower and higher bounds + with np.errstate(invalid='ignore'): # hide NaN comparison warnings + if output_distribution == 'normal': + lower_bounds_idx = (X_col - BOUNDS_THRESHOLD < + lower_bound_x) + upper_bounds_idx = (X_col + BOUNDS_THRESHOLD > + upper_bound_x) + if output_distribution == 'uniform': + lower_bounds_idx = (X_col == lower_bound_x) + upper_bounds_idx = (X_col == upper_bound_x) + + isfinite_mask = ~np.isnan(X_col) + X_col_finite = X_col[isfinite_mask] + + X_col_af = af.interop.from_ndarray(X_col) + isfinite_mask_af = ~af.isnan(X_col_af) + X_col_finite_af = X_col_af[isfinite_mask_af] + if not inverse: + # Interpolate in one direction and in the other and take the + # mean. This is in case of repeated values in the features + # and hence repeated quantiles + # + # If we don't do this, only one extreme of the duplicated is + # used (the upper when we do ascending, and the + # lower for descending). We take the mean of these two + quantiles_af = af.interop.from_ndarray(quantiles) + references_af = af.interop.from_ndarray(self.references_) + #import pdb;pdb.set_trace() + #(Pdb) quantiles.shape + #(1000,) + #(Pdb) X_col_finite.shape + #(25000,) + #(Pdb) self.references_.shape + #(1000,) + ires0 = np.interp(X_col_finite, quantiles, self.references_) + #ires0_af = af.approx(X_col_finite, quantiles, self.references_) + ires0_af = interp1_af(X_col_finite_af, quantiles_af, references_af) + print(np.max(ires0_af.to_ndarray() - ires0)) + + X_col[isfinite_mask] = .5 * ( + np.interp(X_col_finite, quantiles, self.references_) + - np.interp(-X_col_finite, -quantiles[::-1], + -self.references_[::-1])) + else: + X_col[isfinite_mask] = np.interp(X_col_finite, + self.references_, quantiles) + + X_col[upper_bounds_idx] = upper_bound_y + X_col[lower_bounds_idx] = lower_bound_y + # for forward transform, match the output distribution + if not inverse: + with np.errstate(invalid='ignore'): # hide NaN comparison warnings + if output_distribution == 'normal': + X_col = stats.norm.ppf(X_col) + # find the value to clip the data to avoid mapping to + # infinity. 
Clip such that the inverse transform will be + # consistent + clip_min = stats.norm.ppf(BOUNDS_THRESHOLD - np.spacing(1)) + clip_max = stats.norm.ppf(1 - (BOUNDS_THRESHOLD - + np.spacing(1))) + X_col = np.clip(X_col, clip_min, clip_max) + # else output distribution is uniform and the ppf is the + # identity function so we let X_col unchanged + + toc = time.perf_counter() + print(f"sklearn _transform_col_af time {(toc - tic):0.4f} seconds") + return X_col + + def _check_inputs(self, X, in_fit, accept_sparse_negative=False, + copy=False): + """Check inputs before fit and transform.""" + X = self._validate_data(X, reset=in_fit, + accept_sparse='csc', copy=copy, + dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') + # we only accept positive sparse matrix when ignore_implicit_zeros is + # false and that we call fit or transform. + with np.errstate(invalid='ignore'): # hide NaN comparison warnings + if (not accept_sparse_negative and not self.ignore_implicit_zeros + and (sparse.issparse(X) and np.any(X.data < 0))): + raise ValueError('QuantileTransformer only accepts' + ' non-negative sparse matrices.') + + # check the output distribution + if self.output_distribution not in ('normal', 'uniform'): + raise ValueError("'output_distribution' has to be either 'normal'" + " or 'uniform'. Got '{}' instead.".format( + self.output_distribution)) + + return X + + def _transform(self, X, inverse=False): + """Forward and inverse transform. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The data used to scale along the features axis. + + inverse : bool, default=False + If False, apply forward transform. If True, apply + inverse transform. + + Returns + ------- + X : ndarray of shape (n_samples, n_features) + Projected data. + """ + + tic = time.perf_counter() + if sparse.issparse(X): + for feature_idx in range(X.shape[1]): + column_slice = slice(X.indptr[feature_idx], + X.indptr[feature_idx + 1]) + X.data[column_slice] = self._transform_col( + X.data[column_slice], self.quantiles_[:, feature_idx], + inverse) + else: + for feature_idx in range(X.shape[1]): + feature_update = self._transform_col( + X[:, feature_idx], self.quantiles_[:, feature_idx], + inverse) + af_feature_update = self._transform_col_af( + X[:, feature_idx], self.quantiles_[:, feature_idx], + inverse) + X[:, feature_idx] = feature_update + + toc = time.perf_counter() + print(f"sklearn _transform time {(toc - tic):0.4f} seconds") + return X + + def transform(self, X): + """Feature-wise transformation of the data. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data used to scale along the features axis. If a sparse + matrix is provided, it will be converted into a sparse + ``csc_matrix``. Additionally, the sparse matrix needs to be + nonnegative if `ignore_implicit_zeros` is False. + + Returns + ------- + Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) + The projected data. + """ + check_is_fitted(self) + X = self._check_inputs(X, in_fit=False, copy=self.copy) + + return self._transform(X, inverse=False) + # def inverse_transform(self, X): # """Back-projection to the original space. 
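Taken together, _dense_fit and _transform_col implement a per-feature mapping through empirical quantiles: fit stores n_quantiles percentiles per column, and transform interpolates each value against those percentiles, averaging the ascending and descending interpolations so repeated quantiles are handled symmetrically. A condensed NumPy walk-through of that pipeline for a single feature (a reference sketch, not the patched implementation):

import numpy as np

rng = np.random.RandomState(0)
col = np.sort(rng.normal(loc=0.5, scale=0.25, size=100))

# fit: empirical quantiles at evenly spaced references, as in _dense_fit
n_quantiles = 10
references = np.linspace(0, 1, n_quantiles, endpoint=True)
quantiles = np.nanpercentile(col, references * 100)
quantiles = np.maximum.accumulate(quantiles)   # enforce monotonicity

# transform: two-sided interpolation averaged, as in _transform_col
forward = np.interp(col, quantiles, references)
backward = np.interp(-col, -quantiles[::-1], -references[::-1])
transformed = 0.5 * (forward - backward)

print(transformed.min(), transformed.max())  # 0.0 1.0
print(np.all(np.diff(transformed) >= 0))     # True: monotone in the input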
# diff --git a/afsklearn/test_af_LogisticRegression.py b/afsklearn/test_af_LogisticRegression.py new file mode 100644 index 0000000..fc00ab7 --- /dev/null +++ b/afsklearn/test_af_LogisticRegression.py @@ -0,0 +1,242 @@ +import unittest +import numpy as np +import pickle +from typing import NamedTuple + +from sklearn import datasets +from sklearn.utils import shuffle + +import af_LogisticRegression +# from autogenerate.d3m_sklearn_wrap.sklearn_wrap import af_LogisticRegression + +# Taken from original af logit example +import arrayfire as af +from arrayfire.algorithm import count, imax, sum +from arrayfire.arith import abs, log, sigmoid +from arrayfire.blas import matmul, matmulTN +from arrayfire.data import constant, join +from arrayfire.device import eval, sync +from arrayfire.interop import from_ndarray + +# Common random state +rng = np.random.RandomState(0) + +# Toy sample from sklearn tests +X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] +y_class = ["foo", "foo", "foo", 1, 1, 1] # test string class labels +T = [[-1, -1], [2, 2], [3, 2]] +y_t_class = ["foo", 1, 1] + +# Load the iris dataset and randomly permute it +iris = datasets.load_iris() +perm = rng.permutation(iris.target.size) +iris.data, iris.target = shuffle(iris.data, iris.target, random_state=rng) + +Hyperparams = NamedTuple('Hyperparams', [ + ('learning_rate', float), + ('reg_constant', float), + ('max_err', float), + ('max_iter', int), + ('verbose', int), +]) + + +def ints_to_onehots(ints, num_classes): + onehots = np.zeros((ints.shape[0], num_classes), dtype='float32') + onehots[np.arange(ints.shape[0]), ints] = 1 + return onehots + + +def onehots_to_ints(onehots): + return np.argmax(onehots, axis=1) + + +def accuracy(predicted, target): + _, tlabels = af.imax(target, 1) + _, plabels = af.imax(predicted, 1) + return 100 * af.count(plabels == tlabels) / tlabels.elements() + + +def abserr(predicted, target): + return 100 * af.sum(af.abs(predicted - target)) / predicted.elements() + + +class RefAfLogisticRegression: + def __init__(self, alpha=0.1, lambda_param=1.0, maxerr=0.01, maxiter=1000, verbose=False): + self.__alpha = alpha + self.__lambda_param = lambda_param + self.__maxerr = maxerr + self.__maxiter = maxiter + self.__verbose = verbose + self.__weights = None + + + def predict_proba(self, X): + Z = af.matmul(X, self.__weights) + return af.sigmoid(Z) + + + def predict_log_proba(self, X): + return af.log(self.predict_proba(X)) + + + def predict(self, X): + probs = self.predict_proba(X) + _, classes = af.imax(probs, 1) + return classes + + + def cost(self, X, Y): + # Number of samples + m = Y.dims()[0] + + dim0 = self.__weights.dims()[0] + dim1 = self.__weights.dims()[1] if len(self.__weights.dims()) > 1 else None + dim2 = self.__weights.dims()[2] if len(self.__weights.dims()) > 2 else None + dim3 = self.__weights.dims()[3] if len(self.__weights.dims()) > 3 else None + # Make the lambda corresponding to self.__weights(0) == 0 + lambdat = af.constant(self.__lambda_param, dim0, dim1, dim2, dim3) + + # No regularization for bias weights + lambdat[0, :] = 0 + + # Get the prediction + H = self.predict_proba(X) + + # Cost of misprediction + Jerr = -1 * af.sum(Y * af.log(H) + (1 - Y) * af.log(1 - H), dim=0) + + # Regularization cost + Jreg = 0.5 * af.sum(lambdat * self.__weights * self.__weights, dim=0) + + # Total cost + J = (Jerr + Jreg) / m + + # Find the gradient of cost + D = (H - Y) + dJ = (af.matmulTN(X, D) + lambdat * self.__weights) / m + + return J, dJ + + + def train(self, X, Y): + # Initialize 
parameters to 0 + self.__weights = af.constant(0, X.dims()[1], Y.dims()[1]) + + for i in range(self.__maxiter): + # Get the cost and gradient + J, dJ = self.cost(X, Y) + err = af.max(af.abs(J)) + if err < self.__maxerr: + print('Iteration {0:4d} Err: {1:4f}'.format(i + 1, err)) + print('Training converged') + return self.__weights + + if self.__verbose and ((i+1) % 10 == 0): + print('Iteration {0:4d} Err: {1:4f}'.format(i + 1, err)) + + # Update the weights via gradient descent + self.__weights = self.__weights - self.__alpha * dJ + + if self.__verbose: + print('Training stopped after {0:d} iterations'.format(self.__maxiter)) + + + def eval(self): + af.eval(self.__weights) + af.sync() + + +class Testaf_LogisticRegression(unittest.TestCase): + + def test(self): + ############# Pure arrayfire-python example ########### + + # Determine number of classes if not provided + num_classes = np.unique(iris.target).shape[0] + + # Convert numpy array to af array (and convert labels/targets from ints to + # one-hot encodings) + train_feats = af.from_ndarray(iris.data.astype('float32')) + train_targets = af.from_ndarray(ints_to_onehots(iris.target.astype('uint32'), num_classes)) + test_feats = af.from_ndarray(iris.data.astype('float32')) + test_targets = af.from_ndarray(ints_to_onehots(iris.target.astype('uint32'), num_classes)) + + num_train = train_feats.dims()[0] + num_test = test_feats.dims()[0] + + # Remove bias for now to match output with pure arrayfire example + # Pure arrayfire example uses features with bias column for train and test + # but we can't expect that for d3m's inputs in general + # Add bias + # train_bias = af.constant(1, num_train, 1) + # test_bias = af.constant(1, num_test, 1) + # train_feats = af.join(1, train_bias, train_feats) + # test_feats = af.join(1, test_bias, test_feats) + + ref_clf = RefAfLogisticRegression(alpha=0.1, # learning rate + lambda_param = 1.0, # regularization constant + maxerr=0.01, # max error + maxiter=1000, # max iters + verbose=False # verbose mode + ) + + ref_clf.train(train_feats, train_targets) + af_output = ref_clf.predict(test_feats) + ref_output = af_output.to_ndarray() + print('Completed reference calculation') + + ############# d3m-arrayfire example ########### + + hyperparams = af_LogisticRegression.Hyperparams.defaults() + # Create the model object + test_clf = af_LogisticRegression.af_LogisticRegression(hyperparams=hyperparams) + train_set = iris.data + targets = iris.target + test_clf.set_training_data(inputs=train_set, outputs=targets) + test_clf.fit() + + test_output = test_clf.produce(inputs=train_set) + print('Completed test calculation') + + self.assertTrue(np.array_equal(ref_output, test_output.value)) + print('SUCCESS: Pure arrayfire-python output equals d3m-arrayfire output') + + # Testing get_params() and set_params() + params = test_clf.get_params() + test_clf.set_params(params=params) + first_output = test_clf.produce(inputs=train_set) + self.assertTrue(np.array_equal(ref_output, first_output.value)) + print('SUCCESS: get_params and set_params work') + + # pickle the params and hyperparams + pickled_params = pickle.dumps(params) + unpickled_params = pickle.loads(pickled_params) + + pickled_hyperparams = pickle.dumps(hyperparams) + unpickled_hyperparams = pickle.loads(pickled_hyperparams) + + # Create a new object from pickled params and hyperparams + pickle_params_clf = af_LogisticRegression.af_LogisticRegression(hyperparams=unpickled_hyperparams) + pickle_params_clf.set_params(params=unpickled_params) + 
pickle_params_clf.set_training_data(inputs=train_set, outputs=targets) + pickle_params_clf.fit() + pickle_params_output = pickle_params_clf.produce(inputs=train_set) + + # Check if outputs match + self.assertTrue(np.array_equal(ref_output, pickle_params_output.value)) + # We want to test the running of the code without errors and not the correctness of it + # since that is assumed to be tested by sklearn + print("SUCCESS: Pickling and unpickling hyperparams and params work") + + # Pickling whole model doesn't work right now, probably because model + # is stored in device memory and pickle cannot serialize it trivially + # model = pickle.dumps(test_clf) + # pickle_model_clf = pickle.loads(model) + # pickle_model_output = pickle_model_clf.produce(inputs=train_set) + # self.assertTrue(np.array_equal(ref_output, pickle_model_output.value)) + # print("SUCCESS: Pickling whole model works") + + +if __name__ == '__main__': + unittest.main() diff --git a/afsklearn/utils/__init__.py b/afsklearn/utils/__init__.py new file mode 100644 index 0000000..5b9c406 --- /dev/null +++ b/afsklearn/utils/__init__.py @@ -0,0 +1,26 @@ +import sklearn +from sklearn.utils import (murmurhash3_32, as_float_array, assert_all_finite, + check_array, check_random_state, compute_class_weight, + compute_sample_weight, column_or_1d, check_consistent_length, + check_X_y, check_scalar, indexable, check_symmetric, indices_to_mask, + deprecated, parallel_backend, register_parallel_backend, resample, + shuffle, check_matplotlib_support, all_estimators, + DataConversionWarning, estimator_html_repr) + +# overwrite required functions with arrayfire versions +from .validation import (_assert_all_finite, check_random_state, _num_samples, _safe_accumulator_op, check_array, + check_consistent_length, check_X_y, column_or_1d) + +__all__ = ["murmurhash3_32", "as_float_array", + "_assert_all_finite", "check_array", + "check_random_state", + "compute_class_weight", "compute_sample_weight", + "column_or_1d", + "check_consistent_length", "check_X_y", "check_scalar", 'indexable', + "check_symmetric", "indices_to_mask", "deprecated", + "parallel_backend", "register_parallel_backend", + "resample", "shuffle", "check_matplotlib_support", "all_estimators", + "DataConversionWarning", "estimator_html_repr"] + + + diff --git a/afsklearn/_type_utils.py b/afsklearn/utils/_type_utils.py similarity index 100% rename from afsklearn/_type_utils.py rename to afsklearn/utils/_type_utils.py diff --git a/afsklearn/_validation.py b/afsklearn/utils/validation.py similarity index 93% rename from afsklearn/_validation.py rename to afsklearn/utils/validation.py index c867ad2..4c387eb 100644 --- a/afsklearn/_validation.py +++ b/afsklearn/utils/validation.py @@ -788,3 +788,58 @@ def check_random_state(seed): return seed raise ValueError('%r cannot be used to seed a numpy.random.RandomState' ' instance' % seed) + +def _check_sample_weight(sample_weight, X, dtype=None, copy=False): + """Validate sample weights. + + Note that passing sample_weight=None will output an array of ones. + Therefore, in some cases, you may want to protect the call with: + if sample_weight is not None: + sample_weight = _check_sample_weight(...) + + Parameters + ---------- + sample_weight : {ndarray, Number or None}, shape (n_samples,) + Input sample weights. + + X : {ndarray, list, sparse matrix} + Input data. + + dtype: dtype, default=None + dtype of the validated `sample_weight`. 
+ If None, and the input `sample_weight` is an array, the dtype of the + input is preserved; otherwise an array with the default numpy dtype + is be allocated. If `dtype` is not one of `float32`, `float64`, + `None`, the output will be of dtype `float64`. + + copy : bool, default=False + If True, a copy of sample_weight will be created. + + Returns + ------- + sample_weight : ndarray of shape (n_samples,) + Validated sample weight. It is guaranteed to be "C" contiguous. + """ + n_samples = _num_samples(X) + + if dtype is not None and dtype not in [np.float32, np.float64]: + dtype = np.float64 + + if sample_weight is None: + sample_weight = np.ones(n_samples, dtype=dtype) + elif isinstance(sample_weight, numbers.Number): + sample_weight = np.full(n_samples, sample_weight, dtype=dtype) + else: + if dtype is None: + dtype = [np.float64, np.float32] + sample_weight = check_array( + sample_weight, accept_sparse=False, ensure_2d=False, dtype=dtype, + order="C", copy=copy + ) + if sample_weight.ndim != 1: + raise ValueError("Sample weights must be 1D array or scalar") + + if sample_weight.shape != (n_samples,): + raise ValueError("sample_weight.shape == {}, expected {}!" + .format(sample_weight.shape, (n_samples,))) + return sample_weight diff --git a/test/__init__.py b/test/__init__.py index 7b1b1c1..e69de29 100644 --- a/test/__init__.py +++ b/test/__init__.py @@ -1,14 +0,0 @@ -from functools import wraps -import time - -def measure_time(func): - @wraps(func) - def time_it(*args, **kwargs): - print(f'Testing "{func.__name__}"') - start_timer = time.perf_counter() - try: - return func(*args, **kwargs) - finally: - end_timer = time.perf_counter() - print(f"Execution time: {end_timer - start_timer} sec") - return time_it diff --git a/test/test_mlp.py b/test/test_mlp.py index 3f02b66..23b5827 100644 --- a/test/test_mlp.py +++ b/test/test_mlp.py @@ -5,7 +5,7 @@ from sklearn.model_selection import train_test_split from afsklearn.patcher import Patcher -from . 
import measure_time +from timing import measure_time X, y = make_classification(n_samples=10000, random_state=1) X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1) @@ -31,4 +31,4 @@ def test_afsklearn_mlp() -> None: if __name__ == "__main__": test_afsklearn_mlp() - test_sklearn_mlp() + #test_sklearn_mlp() diff --git a/test/test_quantile.py b/test/test_quantile.py index 99e707c..2509d1b 100644 --- a/test/test_quantile.py +++ b/test/test_quantile.py @@ -3,7 +3,6 @@ import time import afsklearn import numpy as np -from afsklearn.preprocessing import QuantileTransformer as afQT nbench = 5 rng = np.random.RandomState(0) @@ -13,14 +12,28 @@ #import pdb; pdb.set_trace() for n in range(nbench): qt = sklearn.preprocessing.QuantileTransformer() - qt.fit_transform(X) + res = qt.fit_transform(X) toc = time.perf_counter() print(f"sklearn fit time {(toc - tic)/nbench:0.4f} seconds") -sklearn.preprocessing.QuantileTransformer = afQT +#daal4py.sklearn.patch_sklearn() +#tic = time.perf_counter() +#for n in range(nbench): +# qt = sklearn.preprocessing.QuantileTransformer() +# qt.fit_transform(X) +#toc = time.perf_counter() +#print(f"daal4py sklearn fit time {(toc - tic)/nbench:0.4f} seconds") +# +#daal4py.sklearn.unpatch_sklearn() + +afsklearn.patch_sklearn() +tic = time.perf_counter() for n in range(nbench): qt = sklearn.preprocessing.QuantileTransformer() - qt.fit_transform(X) + res_af = qt.fit_transform(X) toc = time.perf_counter() -print(f"sklearn fit time {(toc - tic)/nbench:0.4f} seconds") +print(f"afsklearn fit time {(toc - tic)/nbench:0.4f} seconds") +afsklearn.unpatch_sklearn() + +print(np.max(np.abs(res - res_af))) diff --git a/test/timing.py b/test/timing.py new file mode 100644 index 0000000..7b1b1c1 --- /dev/null +++ b/test/timing.py @@ -0,0 +1,14 @@ +from functools import wraps +import time + +def measure_time(func): + @wraps(func) + def time_it(*args, **kwargs): + print(f'Testing "{func.__name__}"') + start_timer = time.perf_counter() + try: + return func(*args, **kwargs) + finally: + end_timer = time.perf_counter() + print(f"Execution time: {end_timer - start_timer} sec") + return time_it
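# Illustrative usage sketch (not part of the patch above): how the relocated
# measure_time decorator and the module-level patch helpers exercised in
# test_quantile.py are meant to combine in a benchmark script. Only names
# introduced in this diff are assumed (afsklearn.patch_sklearn,
# afsklearn.unpatch_sklearn, timing.measure_time); the estimator and the data
# size are arbitrary choices for the example.
import numpy as np
import sklearn.preprocessing

import afsklearn
from timing import measure_time  # test/timing.py added above

rng = np.random.RandomState(0)
X = rng.rand(100000, 10)

@measure_time
def fit_quantile_transformer(data):
    # return the transformed data so the two runs can be compared
    return sklearn.preprocessing.QuantileTransformer().fit_transform(data)

res = fit_quantile_transformer(X)      # stock scikit-learn

afsklearn.patch_sklearn()              # swap in the ArrayFire-backed classes
res_af = fit_quantile_transformer(X)
afsklearn.unpatch_sklearn()            # restore the stock implementations

print(np.max(np.abs(res - res_af)))    # the two paths should agree closely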