Skip to content

Commit 9fdfbc6

Browse files
committed
Add CLI option to use treeReduce on Spark
1 parent 2b04dc2 commit 9fdfbc6

File tree

2 files changed

+12
-2
lines changed

2 files changed

+12
-2
lines changed

CHANGELOG.md

+2
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

88
## [Unreleased]
9+
### Added
10+
- Add CLI option to use `treeReduce` on Spark
911

1012
## [0.40.0] - 2024-08-22
1113
### Added

src/main/scala/io/github/dataunitylab/jsonoid/discovery/spark/JsonoidSpark.scala

+10-2
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,8 @@ private final case class Config(
1515
propertySet: PropertySet = PropertySets.AllProperties,
1616
addDefinitions: Boolean = false,
1717
detectDynamic: Boolean = false,
18-
detectDisjoint: Boolean = false
18+
detectDisjoint: Boolean = false,
19+
treeReduce: Boolean = false
1920
)
2021

2122
object JsonoidSpark {
@@ -60,6 +61,10 @@ object JsonoidSpark {
6061
opt[Unit]('j', "detect-disjoint")
6162
.action((x, c) => c.copy(detectDisjoint = true))
6263
.text("detect objects with disjoint keys")
64+
65+
opt[Unit]('t', "tree-reduce")
66+
.action((x, c) => c.copy(treeReduce = true))
67+
.text("use treeReduce for schema reduction")
6368
}
6469

6570
parser.parse(args, Config()) match {
@@ -71,8 +76,11 @@ object JsonoidSpark {
7176
val jsonRdd = JsonoidRDD.fromStringRDD(
7277
sc.textFile(config.input)
7378
)(p)
74-
var schema: ObjectSchema =
79+
var schema: ObjectSchema = if (config.treeReduce) {
80+
jsonRdd.treeReduceSchemas().asInstanceOf[ObjectSchema]
81+
} else {
7582
jsonRdd.reduceSchemas().asInstanceOf[ObjectSchema]
83+
}
7684

7785
// Skip transformation if we know the required properties don't exist
7886
if (!(config.propertySet === PropertySets.MinProperties)) {

0 commit comments

Comments
 (0)