
Commit 2207fc9

Merge pull request #44 from histogrammar/fix_filling_spark_bool
categorize histogram now handles nans in friendlier way
2 parents 63ffe40 + 6d43206 commit 2207fc9

15 files changed: +103 −29 lines changed

CHANGES.rst

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+=============
+Release notes
+=============
+
+Version 1.0.24, Apr 2021
+------------------------
+* Categorize histograms now handle Nones and NaNs in a friendlier way: they are converted to "NaN".
+* make_histogram() now casts Spark nulls to NaN for numeric columns, since Scala interprets null as 0.
+* SparselyBin histograms did not add up nanflow when added. Now fixed.
+* Added a unit test that checks the conversion of nulls to NaNs.
+* Use the new histogrammar-scala jar files, v1.0.20.
+* Added the histogrammar-scala v1.0.20 jar files to tests/jars/.
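
The first change can be illustrated with a minimal sketch (assuming the histogrammar Python API; the identity lambda quantity and the fill values are illustrative only):

    import histogrammar as hg

    # None and float NaN now land in a shared "NaN" bin instead of
    # raising a TypeError during filling.
    hist = hg.Categorize(quantity=lambda x: x)
    for value in ["apple", "banana", None, float("nan")]:
        hist.fill(value)

    # expected bins: {'apple': 1.0, 'banana': 1.0, 'NaN': 2.0}
    print({label: b.entries for label, b in hist.bins.items()})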

README.rst

Lines changed: 2 additions & 2 deletions
@@ -19,7 +19,7 @@ Histograms and other aggregators may also be converted into CUDA code for inclus
 PyCUDA is available, they can also be filled from Numpy arrays by JIT-compiling the CUDA code.
 This Python implementation of histogrammar has been tested to guarantee compatibility with its Scala implementation.

-Latest Python release: v1.0.23 (Mar 2021).
+Latest Python release: v1.0.24 (April 2021).

 Announcements
 =============
@@ -31,7 +31,7 @@ With Spark 3.0, based on Scala 2.12, make sure to pick up the correct histogramm

 .. code-block:: python

-    spark = SparkSession.builder.config("spark.jars.packages", "io.github.histogrammar:histogrammar_2.12:1.0.11,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.11").getOrCreate()
+    spark = SparkSession.builder.config("spark.jars.packages", "io.github.histogrammar:histogrammar_2.12:1.0.20,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.20").getOrCreate()

 For Spark 2.X compiled against Scala 2.11, in the string above simply replace "2.12" with "2.11".

histogrammar/dfinterface/histogram_filler_base.py

Lines changed: 6 additions & 1 deletion
@@ -231,6 +231,7 @@ def assign_and_check_features(self, df, cols_by_type):
         all_cols = (
             list(cols_by_type["num"]) +
             list(cols_by_type["dt"]) +
+            list(cols_by_type["bool"]) +
             list(cols_by_type["str"])
         )
@@ -421,6 +422,10 @@ def categorize_features(self, df):
                 colset = cols_by_type["dt"]
                 if col not in colset:
                     colset.add(col)
+            elif np.issubdtype(dt, np.bool_):
+                colset = cols_by_type["bool"]
+                if col not in colset:
+                    colset.add(col)
             else:
                 colset = cols_by_type["str"]
                 if col not in colset:
@@ -500,7 +505,7 @@ def transform(self, datastore):
     def get_hist_bin(self, hist, features, quant, col, dt):
         is_number = np.issubdtype(dt, np.number)
         is_timestamp = np.issubdtype(dt, np.datetime64)
-        is_bool = np.issubdtype(dt, bool)
+        is_bool = np.issubdtype(dt, np.bool_)
         specs = self.var_bin_specs(features, features.index(col))

         if is_number or is_timestamp:
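
The dtype dispatch above can be checked standalone; a quick sketch (the pandas DataFrame and column names are illustrative only):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "x": [1.0, 2.0],                                     # num
        "flag": [True, False],                               # bool
        "ts": pd.to_datetime(["2021-04-01", "2021-04-02"]),  # dt
    })
    for col in df.columns:
        dt = df[col].dtype
        if np.issubdtype(dt, np.datetime64):
            kind = "dt"
        elif np.issubdtype(dt, np.bool_):
            kind = "bool"  # bool columns no longer fall through to "str"
        elif np.issubdtype(dt, np.number):
            kind = "num"
        else:
            kind = "str"
        print(col, "->", kind)  # x -> num, flag -> bool, ts -> dt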

histogrammar/dfinterface/pandas_histogrammar.py

Lines changed: 2 additions & 2 deletions
@@ -157,7 +157,7 @@ def process_features(self, df, cols_by_type):
         """
         # timestamp variables are converted to ns here
         # make temp df for value counting (used below)
-        idf = df[list(cols_by_type["num"]) + list(cols_by_type["str"])].copy()
+        idf = df[list(cols_by_type["num"]) + list(cols_by_type["str"]) + list(cols_by_type["bool"])].copy()
         for col in cols_by_type["dt"]:
             self.logger.debug(
                 'Converting column "{col}" of type "{type}" to nanosec.'.format(
@@ -215,7 +215,7 @@ def construct_empty_hist(self, features):
         # histogram type depends on the data type
         dt = self.var_dtype[col]

-        # processing function, e.g. only accept boolians during filling
+        # processing function, e.g. only accept booleans during filling
         f = QUANTITY[dt]
         if len(features) == 1:
             # df[col] is a pd.series
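
A toy check of why boolean columns are now included in the temporary value-counting frame (data is illustrative):

    import pandas as pd

    df = pd.DataFrame({"flag": [True, True, False]})
    idf = df[["flag"]].copy()
    # boolean features now get value counts like numeric and string ones
    print(idf["flag"].value_counts())  # True: 2, False: 1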

histogrammar/dfinterface/spark_histogrammar.py

Lines changed: 13 additions & 6 deletions
@@ -15,7 +15,7 @@
 try:
     from pyspark.sql import DataFrame
     from pyspark.sql.functions import approxCountDistinct
-    from pyspark.sql.functions import col as sparkcol
+    from pyspark.sql import functions as f
 except (ModuleNotFoundError, AttributeError):
     pass
@@ -148,7 +148,7 @@ def get_nunique(self, df, columns=[]):
         """
         if not columns:
             columns = df.columns
-        qdf = df.agg(*(approxCountDistinct(sparkcol(c)).alias(c) for c in columns))
+        qdf = df.agg(*(approxCountDistinct(f.col(c)).alias(c) for c in columns))
         return qdf.toPandas().T[0].to_dict()

     def get_data_type(self, df, col):
@@ -185,19 +185,26 @@ def process_features(self, df, cols_by_type):
         idf = df.alias("")

         # timestamp variables are converted here to ns since 1970-1-1
-        # histogrammar does not yet support long integers, so convert timestamps to float
-        # epoch = (sparkcol("ts").cast("bigint") * 1000000000).cast("bigint")
+        # histogrammar does not (yet) support long integers, so convert timestamps to float
         for col in cols_by_type["dt"]:
             self.logger.debug(
                 'Converting column "{col}" of type "{type}" to nanosec.'.format(
                     col=col, type=self.var_dtype[col]
                 )
             )
-
             # first cast to timestamp (in case column is stored as date)
-            to_ns = sparkcol(col).cast("timestamp").cast("float") * 1e9
+            to_ns = f.col(col).cast("timestamp").cast("float") * 1e9
             idf = idf.withColumn(col, to_ns)

+        # spark nulls are interpreted as 0 when cast to double in scala, which happens when they are given as input to numeric histograms
+        # in columns that have them, replace Nones by NaNs
+        for col in cols_by_type["num"]:
+            if len(idf.where(f.col(col).isNull()).limit(1).collect()) > 0:
+                self.logger.debug(
+                    'In numeric column "{col}" converting each None to NaN.'.format(col=col)
+                )
+                idf = idf.withColumn(col, f.when(f.col(col).isNotNull(), f.col(col)).otherwise(float('nan')))
+
         return idf

     def construct_empty_hist(self, df, features):
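
The None-to-NaN replacement can be exercised on a toy DataFrame (a sketch; the session setup and column name are illustrative):

    from pyspark.sql import SparkSession, functions as f

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame([(1.0,), (None,)], ["x"])

    # Scala casts null to 0.0 when filling numeric histograms,
    # so make the missing value an explicit NaN instead
    sdf = sdf.withColumn("x", f.when(f.col("x").isNotNull(), f.col("x")).otherwise(float("nan")))
    sdf.show()  # the null row now shows as NaN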

histogrammar/notebooks/histogrammar_tutorial_advanced.ipynb

Lines changed: 7 additions & 2 deletions
@@ -103,6 +103,7 @@
     "try:\n",
     "    from pyspark.sql import SparkSession\n",
     "    from pyspark.sql.functions import col\n",
+    "    from pyspark import __version__ as pyspark_version\n",
     "    pyspark_installed = True\n",
     "except ImportError:\n",
     "    print(\"pyspark needs to be installed for this example\")\n",
@@ -119,8 +120,12 @@
     "# for spark 2.X, in the jars string, for both jar files change \"_2.12\" into \"_2.11\".\n",
     "\n",
     "if pyspark_installed:\n",
+    "    scala = '2.12' if int(pyspark_version[0]) >= 3 else '2.11'\n",
+    "    hist_jar = f'io.github.histogrammar:histogrammar_{scala}:1.0.20'\n",
+    "    hist_spark_jar = f'io.github.histogrammar:histogrammar-sparksql_{scala}:1.0.20'\n",
+    "    \n",
     "    spark = SparkSession.builder.config(\n",
-    "        \"spark.jars.packages\", \"io.github.histogrammar:histogrammar_2.12:1.0.11,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.11\"\n",
+    "        \"spark.jars.packages\", f'{hist_spark_jar},{hist_jar}'\n",
     "    ).getOrCreate()\n",
     "\n",
     "    sdf = spark.createDataFrame(df)"
@@ -518,7 +523,7 @@
     "name": "python",
     "nbconvert_exporter": "python",
     "pygments_lexer": "ipython3",
-    "version": "3.8.6"
+    "version": "3.7.6"
     },
     "nteract": {
     "version": "0.15.0"

histogrammar/primitives/categorize.py

Lines changed: 16 additions & 9 deletions
@@ -16,6 +16,7 @@

 import math
 import numbers
+import numpy as np

 from histogrammar.defs import Container, Factory, identity, JsonFormatException, ContainerException
 from histogrammar.util import n_dim, datatype, serializable, inheritdoc, maybeAdd, floatToJson, hasKeys, numeq, \
@@ -186,8 +187,12 @@ def fill(self, datum, weight=1.0):

         if weight > 0.0:
             q = self.quantity(datum)
-            if not isinstance(q, basestring):
-                raise TypeError("function return value ({0}) must be a string".format(q))
+            if isinstance(q, (basestring, bool)):
+                pass
+            elif q is None or np.isnan(q):
+                q = 'NaN'
+            if not isinstance(q, (basestring, bool)):
+                raise TypeError("function return value ({0}) must be a string or bool".format(q))

             if q not in self.bins:
                 self.bins[q] = self.value.zero()
@@ -275,6 +280,8 @@ def _c99StructName(self):

     def _numpy(self, data, weights, shape):
         q = self.quantity(data)
+        if isinstance(q, (list, tuple)):
+            q = np.array(q)
         self._checkNPQuantity(q, shape)
         self._checkNPWeights(weights, shape)
         weights = self._makeNPWeights(weights, shape)
@@ -283,17 +290,19 @@ def _numpy(self, data, weights, shape):
         subweights = weights.copy()
         subweights[weights < 0.0] = 0.0

-        import numpy
-        selection = numpy.empty(q.shape, dtype=numpy.bool)
-
-        uniques, inverse = numpy.unique(q, return_inverse=True)
+        selection = np.empty(q.shape, dtype=np.bool)
+        uniques, inverse = np.unique(q, return_inverse=True)

         # no possibility of exception from here on out (for rollback)
         for i, x in enumerate(uniques):
+            if isinstance(x, (basestring, bool)):
+                pass
+            elif x is None or np.isnan(x):
+                x = 'NaN'
             if x not in self.bins:
                 self.bins[x] = self.value.zero()

-            numpy.not_equal(inverse, i, selection)
+            np.not_equal(inverse, i, selection)
             subweights[:] = weights
             subweights[selection] = 0.0
             self.bins[x]._numpy(data, subweights, shape)
@@ -412,7 +421,6 @@ def bin_entries(self, labels=[]):
         :returns: array of bin-entries
         :rtype: numpy.array
         """
-        import numpy as np
         if len(labels) == 0:
             return np.array([self.bins[i].entries for i in self.bins])
         entries = [self.bins[lab].entries if lab in self.bins else 0.0 for lab in labels]
@@ -426,7 +434,6 @@ def bin_labels(self, max_length=-1):
         :returns: array of labels
         :rtype: numpy.array
         """
-        import numpy as np
         labels = []

         for i, key in enumerate(self.bins.keys()):
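
The same normalization appears in both fill() and _numpy() above; distilled into a standalone helper (a sketch, with str standing in for basestring on Python 3):

    import math

    def normalize_category(q):
        # strings and bools pass through; None and float NaN collapse into "NaN"
        if isinstance(q, (str, bool)):
            return q
        if q is None or (isinstance(q, float) and math.isnan(q)):
            return "NaN"
        raise TypeError("function return value ({0}) must be a string or bool".format(q))

    assert normalize_category("foo") == "foo"
    assert normalize_category(True) is True
    assert normalize_category(None) == "NaN"
    assert normalize_category(float("nan")) == "NaN"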

histogrammar/primitives/sparselybin.py

Lines changed: 1 addition & 0 deletions
@@ -186,6 +186,7 @@ def __iadd__(self, other):
                     self.bins[i] += v
                 else:
                     self.bins[i] = v.copy()
+            self.nanflow += other.nanflow
             return self
         else:
             raise ContainerException("cannot add {0} and {1}".format(self.name, other.name))
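
The nanflow fix can be verified directly (a sketch assuming the histogrammar Python API; the bin width and identity quantity are illustrative):

    import histogrammar as hg

    h1 = hg.SparselyBin(binWidth=1.0, quantity=lambda x: x)
    h2 = hg.SparselyBin(binWidth=1.0, quantity=lambda x: x)
    h1.fill(float("nan"))  # NaN goes to nanflow, not a regular bin
    h2.fill(float("nan"))

    h1 += h2
    print(h1.nanflow.entries)  # 2.0 -- previously h2's nanflow was dropped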

histogrammar/version.py

Lines changed: 3 additions & 3 deletions
@@ -3,9 +3,9 @@
 import re

 name = "histogrammar"
-__version__ = "1.0.22"
-version = "1.0.22"
-full_version = "1.0.22"
+__version__ = "1.0.24"
+version = "1.0.24"
+full_version = "1.0.24"
 release = True

 version_info = tuple(re.split(r"[-\.]", __version__))

setup.py

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@

 MAJOR = 1
 REVISION = 0
-PATCH = 23
+PATCH = 24
 DEV = False
 # NOTE: also update version at: README.rst
