
Commit 2207fc9

Merge pull request #44 from histogrammar/fix_filling_spark_bool
categorize histogram now handles nans in friendlier way
2 parents 63ffe40 + 6d43206 commit 2207fc9

15 files changed: +103 −29 lines changed

CHANGES.rst

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+=============
+Release notes
+=============
+
+Version 1.0.24, Apr 2021
+------------------------
+* Categorize histograms now handle Nones and NaNs in a friendlier way: they are converted to "NaN".
+* make_histogram() now casts Spark nulls to NaN for numeric columns, since Scala interprets null as 0.
+* SparselyBin histograms did not add up nanflow when added. Now fixed.
+* Added a unit test that checks the conversion of nulls to NaNs.
+* Use the new histogrammar-scala jar files, v1.0.20.
+* Added the histogrammar-scala v1.0.20 jar files to tests/jars/.
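
The first change can be illustrated with a minimal sketch (assuming the histogrammar Python API; the identity lambda quantity and the fill values are illustrative only):

    import histogrammar as hg

    # None and float NaN now land in a shared "NaN" bin instead of
    # raising a TypeError during filling.
    hist = hg.Categorize(quantity=lambda x: x)
    for value in ["apple", "banana", None, float("nan")]:
        hist.fill(value)

    # expected bins: {'apple': 1.0, 'banana': 1.0, 'NaN': 2.0}
    print({label: b.entries for label, b in hist.bins.items()})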

README.rst

Lines changed: 2 additions & 2 deletions
@@ -19,7 +19,7 @@ Histograms and other aggregators may also be converted into CUDA code for inclus
 PyCUDA is available, they can also be filled from Numpy arrays by JIT-compiling the CUDA code.
 This Python implementation of histogrammar has been tested to guarantee compatibility with its Scala implementation.

-Latest Python release: v1.0.23 (Mar 2021).
+Latest Python release: v1.0.24 (April 2021).

 Announcements
 =============
@@ -31,7 +31,7 @@ With Spark 3.0, based on Scala 2.12, make sure to pick up the correct histogramm

 .. code-block:: python

-    spark = SparkSession.builder.config("spark.jars.packages", "io.github.histogrammar:histogrammar_2.12:1.0.11,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.11").getOrCreate()
+    spark = SparkSession.builder.config("spark.jars.packages", "io.github.histogrammar:histogrammar_2.12:1.0.20,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.20").getOrCreate()

 For Spark 2.X compiled against Scala 2.11, in the string above simply replace "2.12" with "2.11".

histogrammar/dfinterface/histogram_filler_base.py

Lines changed: 6 additions & 1 deletion
@@ -231,6 +231,7 @@ def assign_and_check_features(self, df, cols_by_type):
         all_cols = (
             list(cols_by_type["num"]) +
             list(cols_by_type["dt"]) +
+            list(cols_by_type["bool"]) +
             list(cols_by_type["str"])
         )
@@ -421,6 +422,10 @@ def categorize_features(self, df):
                 colset = cols_by_type["dt"]
                 if col not in colset:
                     colset.add(col)
+            elif np.issubdtype(dt, np.bool_):
+                colset = cols_by_type["bool"]
+                if col not in colset:
+                    colset.add(col)
             else:
                 colset = cols_by_type["str"]
                 if col not in colset:
@@ -500,7 +505,7 @@ def transform(self, datastore):
     def get_hist_bin(self, hist, features, quant, col, dt):
         is_number = np.issubdtype(dt, np.number)
         is_timestamp = np.issubdtype(dt, np.datetime64)
-        is_bool = np.issubdtype(dt, bool)
+        is_bool = np.issubdtype(dt, np.bool_)
         specs = self.var_bin_specs(features, features.index(col))

         if is_number or is_timestamp:
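
The dtype dispatch above can be checked standalone; a quick sketch (the pandas DataFrame and column names are illustrative only):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "x": [1.0, 2.0],                                     # num
        "flag": [True, False],                               # bool
        "ts": pd.to_datetime(["2021-04-01", "2021-04-02"]),  # dt
    })
    for col in df.columns:
        dt = df[col].dtype
        if np.issubdtype(dt, np.datetime64):
            kind = "dt"
        elif np.issubdtype(dt, np.bool_):
            kind = "bool"  # bool columns no longer fall through to "str"
        elif np.issubdtype(dt, np.number):
            kind = "num"
        else:
            kind = "str"
        print(col, "->", kind)  # x -> num, flag -> bool, ts -> dt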

histogrammar/dfinterface/pandas_histogrammar.py

Lines changed: 2 additions & 2 deletions
@@ -157,7 +157,7 @@ def process_features(self, df, cols_by_type):
         """
         # timestamp variables are converted to ns here
         # make temp df for value counting (used below)
-        idf = df[list(cols_by_type["num"]) + list(cols_by_type["str"])].copy()
+        idf = df[list(cols_by_type["num"]) + list(cols_by_type["str"]) + list(cols_by_type["bool"])].copy()
         for col in cols_by_type["dt"]:
             self.logger.debug(
                 'Converting column "{col}" of type "{type}" to nanosec.'.format(
@@ -215,7 +215,7 @@ def construct_empty_hist(self, features):
         # histogram type depends on the data type
         dt = self.var_dtype[col]

-        # processing function, e.g. only accept boolians during filling
+        # processing function, e.g. only accept booleans during filling
         f = QUANTITY[dt]
         if len(features) == 1:
             # df[col] is a pd.series
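
A toy check of why boolean columns are now included in the temporary value-counting frame (data is illustrative):

    import pandas as pd

    df = pd.DataFrame({"flag": [True, True, False]})
    idf = df[["flag"]].copy()
    # boolean features now get value counts like numeric and string ones
    print(idf["flag"].value_counts())  # True: 2, False: 1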

histogrammar/dfinterface/spark_histogrammar.py

Lines changed: 13 additions & 6 deletions
@@ -15,7 +15,7 @@
 try:
     from pyspark.sql import DataFrame
     from pyspark.sql.functions import approxCountDistinct
-    from pyspark.sql.functions import col as sparkcol
+    from pyspark.sql import functions as f
 except (ModuleNotFoundError, AttributeError):
     pass
@@ -148,7 +148,7 @@ def get_nunique(self, df, columns=[]):
         """
         if not columns:
             columns = df.columns
-        qdf = df.agg(*(approxCountDistinct(sparkcol(c)).alias(c) for c in columns))
+        qdf = df.agg(*(approxCountDistinct(f.col(c)).alias(c) for c in columns))
         return qdf.toPandas().T[0].to_dict()

     def get_data_type(self, df, col):
@@ -185,19 +185,26 @@ def process_features(self, df, cols_by_type):
         idf = df.alias("")

         # timestamp variables are converted here to ns since 1970-1-1
-        # histogrammar does not yet support long integers, so convert timestamps to float
-        # epoch = (sparkcol("ts").cast("bigint") * 1000000000).cast("bigint")
+        # histogrammar does not (yet) support long integers, so convert timestamps to float
         for col in cols_by_type["dt"]:
             self.logger.debug(
                 'Converting column "{col}" of type "{type}" to nanosec.'.format(
                     col=col, type=self.var_dtype[col]
                 )
             )
-
             # first cast to timestamp (in case column is stored as date)
-            to_ns = sparkcol(col).cast("timestamp").cast("float") * 1e9
+            to_ns = f.col(col).cast("timestamp").cast("float") * 1e9
             idf = idf.withColumn(col, to_ns)

+        # spark nulls are interpreted as 0 when cast to double in scala, which happens when they are given as input to numeric histograms
+        # in columns that have them, replace Nones by NaNs
+        for col in cols_by_type["num"]:
+            if len(idf.where(f.col(col).isNull()).limit(1).collect()) > 0:
+                self.logger.debug(
+                    'In numeric column "{col}" converting each None to NaN.'.format(col=col)
+                )
+                idf = idf.withColumn(col, f.when(f.col(col).isNotNull(), f.col(col)).otherwise(float('nan')))
+
         return idf

     def construct_empty_hist(self, df, features):
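
The None-to-NaN replacement can be exercised on a toy DataFrame (a sketch; the session setup and column name are illustrative):

    from pyspark.sql import SparkSession, functions as f

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame([(1.0,), (None,)], ["x"])

    # Scala casts null to 0.0 when filling numeric histograms,
    # so make the missing value an explicit NaN instead
    sdf = sdf.withColumn("x", f.when(f.col("x").isNotNull(), f.col("x")).otherwise(float("nan")))
    sdf.show()  # the null row now shows as NaN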

histogrammar/notebooks/histogrammar_tutorial_advanced.ipynb

Lines changed: 7 additions & 2 deletions
@@ -103,6 +103,7 @@
     "try:\n",
     "    from pyspark.sql import SparkSession\n",
     "    from pyspark.sql.functions import col\n",
+    "    from pyspark import __version__ as pyspark_version\n",
     "    pyspark_installed = True\n",
     "except ImportError:\n",
     "    print(\"pyspark needs to be installed for this example\")\n",
@@ -119,8 +120,12 @@
     "# for spark 2.X, in the jars string, for both jar files change \"_2.12\" into \"_2.11\".\n",
     "\n",
     "if pyspark_installed:\n",
+    "    scala = '2.12' if int(pyspark_version[0]) >= 3 else '2.11'\n",
+    "    hist_jar = f'io.github.histogrammar:histogrammar_{scala}:1.0.20'\n",
+    "    hist_spark_jar = f'io.github.histogrammar:histogrammar-sparksql_{scala}:1.0.20'\n",
+    "    \n",
     "    spark = SparkSession.builder.config(\n",
-    "        \"spark.jars.packages\", \"io.github.histogrammar:histogrammar_2.12:1.0.11,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.11\"\n",
+    "        \"spark.jars.packages\", f'{hist_spark_jar},{hist_jar}'\n",
     "    ).getOrCreate()\n",
     "\n",
     "    sdf = spark.createDataFrame(df)"
@@ -518,7 +523,7 @@
     "name": "python",
     "nbconvert_exporter": "python",
     "pygments_lexer": "ipython3",
-    "version": "3.8.6"
+    "version": "3.7.6"
     },
     "nteract": {
     "version": "0.15.0"

histogrammar/primitives/categorize.py

Lines changed: 16 additions & 9 deletions
@@ -16,6 +16,7 @@

 import math
 import numbers
+import numpy as np

 from histogrammar.defs import Container, Factory, identity, JsonFormatException, ContainerException
 from histogrammar.util import n_dim, datatype, serializable, inheritdoc, maybeAdd, floatToJson, hasKeys, numeq, \
@@ -186,8 +187,12 @@ def fill(self, datum, weight=1.0):

         if weight > 0.0:
             q = self.quantity(datum)
-            if not isinstance(q, basestring):
-                raise TypeError("function return value ({0}) must be a string".format(q))
+            if isinstance(q, (basestring, bool)):
+                pass
+            elif q is None or np.isnan(q):
+                q = 'NaN'
+            if not isinstance(q, (basestring, bool)):
+                raise TypeError("function return value ({0}) must be a string or bool".format(q))

             if q not in self.bins:
                 self.bins[q] = self.value.zero()
@@ -275,6 +280,8 @@ def _c99StructName(self):

     def _numpy(self, data, weights, shape):
         q = self.quantity(data)
+        if isinstance(q, (list, tuple)):
+            q = np.array(q)
         self._checkNPQuantity(q, shape)
         self._checkNPWeights(weights, shape)
         weights = self._makeNPWeights(weights, shape)
@@ -283,17 +290,19 @@ def _numpy(self, data, weights, shape):
         subweights = weights.copy()
         subweights[weights < 0.0] = 0.0

-        import numpy
-        selection = numpy.empty(q.shape, dtype=numpy.bool)
-
-        uniques, inverse = numpy.unique(q, return_inverse=True)
+        selection = np.empty(q.shape, dtype=np.bool)
+        uniques, inverse = np.unique(q, return_inverse=True)

         # no possibility of exception from here on out (for rollback)
         for i, x in enumerate(uniques):
+            if isinstance(x, (basestring, bool)):
+                pass
+            elif x is None or np.isnan(x):
+                x = 'NaN'
             if x not in self.bins:
                 self.bins[x] = self.value.zero()

-            numpy.not_equal(inverse, i, selection)
+            np.not_equal(inverse, i, selection)
             subweights[:] = weights
             subweights[selection] = 0.0
             self.bins[x]._numpy(data, subweights, shape)
@@ -412,7 +421,6 @@ def bin_entries(self, labels=[]):
         :returns: array of bin-entries
         :rtype: numpy.array
         """
-        import numpy as np
         if len(labels) == 0:
             return np.array([self.bins[i].entries for i in self.bins])
         entries = [self.bins[lab].entries if lab in self.bins else 0.0 for lab in labels]
@@ -426,7 +434,6 @@ def bin_labels(self, max_length=-1):
         :returns: array of labels
         :rtype: numpy.array
         """
-        import numpy as np
         labels = []

         for i, key in enumerate(self.bins.keys()):
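
The same normalization appears in both fill() and _numpy() above; distilled into a standalone helper (a sketch, with str standing in for basestring on Python 3):

    import math

    def normalize_category(q):
        # strings and bools pass through; None and float NaN collapse into "NaN"
        if isinstance(q, (str, bool)):
            return q
        if q is None or (isinstance(q, float) and math.isnan(q)):
            return "NaN"
        raise TypeError("function return value ({0}) must be a string or bool".format(q))

    assert normalize_category("foo") == "foo"
    assert normalize_category(True) is True
    assert normalize_category(None) == "NaN"
    assert normalize_category(float("nan")) == "NaN"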

histogrammar/primitives/sparselybin.py

Lines changed: 1 addition & 0 deletions
@@ -186,6 +186,7 @@ def __iadd__(self, other):
                     self.bins[i] += v
                 else:
                     self.bins[i] = v.copy()
+            self.nanflow += other.nanflow
             return self
         else:
             raise ContainerException("cannot add {0} and {1}".format(self.name, other.name))
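
The nanflow fix can be verified directly (a sketch assuming the histogrammar Python API; the bin width and identity quantity are illustrative):

    import histogrammar as hg

    h1 = hg.SparselyBin(binWidth=1.0, quantity=lambda x: x)
    h2 = hg.SparselyBin(binWidth=1.0, quantity=lambda x: x)
    h1.fill(float("nan"))  # NaN goes to nanflow, not a regular bin
    h2.fill(float("nan"))

    h1 += h2
    print(h1.nanflow.entries)  # 2.0 -- previously h2's nanflow was dropped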

histogrammar/version.py

Lines changed: 3 additions & 3 deletions
@@ -3,9 +3,9 @@
 import re

 name = "histogrammar"
-__version__ = "1.0.22"
-version = "1.0.22"
-full_version = "1.0.22"
+__version__ = "1.0.24"
+version = "1.0.24"
+full_version = "1.0.24"
 release = True

 version_info = tuple(re.split(r"[-\.]", __version__))

setup.py

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@

 MAJOR = 1
 REVISION = 0
-PATCH = 23
+PATCH = 24
 DEV = False
 # NOTE: also update version at: README.rst
