Merge pull request #43 from racinmat/master

Using newer HierarchicalUtils, added a function for pruning JSONs
CTUAvastLab · Oct 14, 2020 · 604dce2 · 604dce2 · racinmat · Oct 14, 2020
2 parents 83fc395 + 10dfff6
commit 604dce2
Show file tree

Hide file tree

Showing 11 changed files with 134 additions and 75 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "JsonGrinder"
 uuid = "d201646e-a9c0-11e8-1063-23b139159713"
 authors = ["pevnak <[email protected]>", "Matej Racinsky <[email protected]>"]
-version = "1.6.1"
+version = "1.7.0"
 
 [deps]
 BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
@@ -29,12 +29,12 @@ BSON = "^0.2"
 DataStructures = "^0.17"
 FillArrays = "0.6, 0.7, 0.8"
 Flux = "~0.11"
-HierarchicalUtils = "~1.0"
+HierarchicalUtils = "^1.1"
 HttpCommon = "^0.5"
 JSON = "0.18, 0.19, 0.20, 0.21"
 LearnBase = "0.2, 0.3, 0.4"
 MLDataPattern = "^0.5"
-Mill = "^1.4"
+Mill = "^1.5"
 Mustache = "^1"
 StatsBase = "0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.30, 0.31, 0.32, 0.33"
 ThreadTools = "^0.2"

diff --git a/examples/recipes.jl b/examples/recipes.jl
@@ -49,7 +49,9 @@ extract_data(JsonGrinder.sample_synthetic(sch))
 data = tmap(extract_data, samples[1:5_000])
 data = reduce(catobs, data)
 target = tmap(extract_target, samples[1:5_000])
-target = reduce(catobs, target)[:cuisine].data
+# switch back to the commented-out line below once the correct handling of dicts is merged
+#target = reduce(catobs, target)[:cuisine].data
+target = reduce(catobs, target).data
 
 e = sch[:cuisine]
 

diff --git a/src/JsonGrinder.jl b/src/JsonGrinder.jl
@@ -25,7 +25,7 @@ export ExtractScalar, ExtractCategorical, ExtractArray, ExtractDict, ExtractOneH
 export suggestextractor, schema, extractbatch, generate_html
 
 Base.show(io::IO, ::T) where T <: Union{JSONEntry, AbstractExtractor} = show(io, Base.typename(T))
-Base.show(io::IO, ::MIME"text/plain", n::Union{JSONEntry, AbstractExtractor}) = HierarchicalUtils.printtree(io, n; trav=false, trunc=3)
+Base.show(io::IO, ::MIME"text/plain", n::Union{JSONEntry, AbstractExtractor}) = HierarchicalUtils.printtree(io, n; trav=false, htrunc=3, vtrunc=20)
 Base.getindex(n::Union{JSONEntry, AbstractExtractor}, i::AbstractString) = HierarchicalUtils.walk(n, i)
 
 end # module
diff --git a/src/extractors/extractdict.jl b/src/extractors/extractdict.jl
@@ -41,7 +41,14 @@ end
 (s::ExtractDict{S,V})(v::Dict) where {S<:Dict,V<:Nothing} = vcat([f(get(v,k,nothing)) for (k,f) in s.vec]...)
 
 function (s::ExtractDict{S,V})(v::Dict) where {S<:Nothing,V<:Dict}
+	# o = [f(get(v,k,nothing)) for (k,f) in s.other]
 	o = [Symbol(k) => f(get(v,String(k),nothing)) for (k,f) in s.other]
+	if length(o) == 1
+		# return(o[1])
+		return o[1].second
+	else
+		return ProductNode((;o...))
+	end
 	ProductNode((;o...))
 end
 

diff --git a/src/hierarchical_utils.jl b/src/hierarchical_utils.jl
@@ -11,8 +11,9 @@ noderepr(n::DictEntry) = "[" * (isnothing(n.childs) ? "Empty " : "") * "Dict] (u
 noderepr(n::MultiEntry) = "[" * (isempty(n.childs) ? "Empty " : "") * "MultiEntry] (updated = $(n.updated))"
 
 children(n::ArrayEntry) = (n.items,)
-children(n::DictEntry) = (; n.childs...)
-children(n::MultiEntry) = (; Dict( Symbol(k) => v for (k,v) in enumerate(n.childs))...)
+# using a vector of pairs because splatting into a named tuple is not good for the compiler
+children(n::DictEntry) = collect(n.childs)
+children(n::MultiEntry) = [Symbol(k) => v for (k,v) in enumerate(n.childs)]
 
 # for extractor structures
 # default extractor
@@ -35,4 +36,4 @@ children(n::ExtractArray) = (n.item,)
 children(n::MultipleRepresentation) = n.extractors
 children(e::ExtractKeyAsField) = (e.key, e.item)
 children(n::AuxiliaryExtractor) = (n.extractor,)
-children(n::ExtractDict) = (; Dict(Symbol(k)=>v for (k,v) in merge(filter(!isnothing, [n.vec, n.other])...))...)
+children(n::ExtractDict) = [Symbol(k)=>v for (k,v) in merge(filter(!isnothing, [n.vec, n.other])...)]
diff --git a/src/schema/schema.jl b/src/schema/schema.jl
@@ -78,3 +78,17 @@ function Base.delete!(sch::JSONEntry, path::AbstractString, field::AbstractStrin
 	item = reduce((s, f) -> f(s), map(make_selector, selectors), init=sch)
 	delete!(item.childs, Symbol(field))
 end
+
+prune_json(json, sch::Entry) = json
+
+prune_json(json, sch::ArrayEntry) = map(json) do el
+	prune_json(el, sch.items)
+end
+
+function prune_json(json, sch::DictEntry)
+    out = Dict()
+    for (k,v) in children(sch)
+        String(k) ∈ keys(json) && (out[String(k)] = prune_json(json[String(k)], v))
+    end
+    out
+end
diff --git a/test/Project.toml b/test/Project.toml
@@ -10,4 +10,4 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [compat]
-Mill = "~1.4"
+Mill = "~1.5"
diff --git a/test/extractors.jl b/test/extractors.jl
@@ -79,18 +79,18 @@ end
 	a2 = br(Dict("a" => 5, "b" => 7))
 	a3 = br(Dict("a" => 5, "c" => [1,2,3,4]))
 
-	@test all(a1[:c].data.data .== [-3 0 3 6])
-	@test all(a1[:c].bags .== [1:4])
-	@test all(catobs(a1,a1)[:c].data.data .== [-3 0 3 6 -3 0 3 6])
-	@test all(catobs(a1,a1)[:c].bags .== [1:4,5:8])
-
-	@test all(catobs(a1,a2)[:c].data.data .== [-3 0 3 6])
-	@test all(catobs(a1,a2)[:c].bags .== [1:4,0:-1])
-
-	@test all(a3[:c].data.data .== [-3 0 3 6])
-	@test all(a3[:c].bags .== [1:4])
-	@test all(catobs(a3,a3)[:c].data.data .== [-3 0 3 6 -3 0 3 6])
-	@test all(catobs(a3,a3)[:c].bags .== [1:4,5:8])
+	@test all(a1.data.data .== [-3 0 3 6])
+	@test all(a1.bags .== [1:4])
+	@test all(catobs(a1,a1).data.data .== [-3 0 3 6 -3 0 3 6])
+	@test all(catobs(a1,a1).bags .== [1:4,5:8])
+
+	@test all(catobs(a1,a2).data.data .== [-3 0 3 6])
+	@test all(catobs(a1,a2).bags .== [1:4,0:-1])
+
+	@test all(a3.data.data .== [-3 0 3 6])
+	@test all(a3.bags .== [1:4])
+	@test all(catobs(a3,a3).data.data .== [-3 0 3 6 -3 0 3 6])
+	@test all(catobs(a3,a3).bags .== [1:4,5:8])
 end
 
 @testset "Testing Nested Missing Arrays" begin
@@ -347,10 +347,10 @@ end
 	ext_j3 = ext(j3)
 	ext_j4 = ext(j4)
 
-	@test ext_j1[:a].data.data isa Array{Float32,2}
-	@test ext_j2[:a].data.data isa Array{Float32,2}
-	@test ext_j3[:a].data.data isa Array{Float32,2}
-	@test ext_j4[:a].data.data isa Array{Float32,2}
+	@test ext_j1.data.data isa Array{Float32,2}
+	@test ext_j2.data.data isa Array{Float32,2}
+	@test ext_j3.data.data isa Array{Float32,2}
+	@test ext_j4.data.data isa Array{Float32,2}
 end
 
 @testset "testing irregular extractor" begin
@@ -361,13 +361,13 @@ end
 	sch = schema([j1,j2,j3])
 	ext = suggestextractor(sch)
 	a = ext(j1)
-	@test a[:a].data[1].data[1] == 0
-	@test a[:a].data[2].data[:a].data.s[1] == ""
-	@test nobs(a[:a].data[3]) == 1
+	@test a[:e1].data[1] == 0
+	@test a[:e2].data[:a].data.s[1] == ""
+	@test nobs(a[:e3]) == 1
 	# this should be 0; there is a problem with handling missing values
 	# todo: make it an issue on github so we have it tracked
-	@test_broken nobs(a.data[3].data) == 0
-	@test nobs(a[:a].data[3].data) == 1
+	@test_broken nobs(a[:e3].data) == 0
+	@test nobs(a[:e3].data) == 1
 end
 
 @testset "Mixed scalar extraction" begin
@@ -386,10 +386,10 @@ end
 	e2 = ext(j2)
 	e3 = ext(j3)
 	e4 = ext(j4)
-	@test e1["k"].data ≈ [0]
-	@test e2["k"].data ≈ [1]
-	@test e3["k"].data ≈ [0.7]
-	@test e4["k"].data ≈ [0.5]
+	@test e1["U"].data ≈ [0]
+	@test e2["U"].data ≈ [1]
+	@test e3["U"].data ≈ [0.7]
+	@test e4["U"].data ≈ [0.5]
 end
 
 @testset "Mixed scalar extraction with other types" begin
@@ -417,32 +417,30 @@ end
 
 	@test buf_printtree(ext) ==
     """
-    Dict
-      └── a: MultiRepresentation
-               ├── e1: FeatureVector with 5 items
-               ├── e2: Dict
-               │         └── Sylvanas is the worst warchief ever: String
-               └── e3: Float32"""
+	Dict
+	  └── a: MultiRepresentation
+	           ├── e1: FeatureVector with 5 items
+	           ├── e2: Dict
+	           │         └── Sylvanas is the worst warchief ever: String
+	           └── e3: Float32"""
 
 	e1 = ext(j1)
 	e2 = ext(j2)
 	e3 = ext(j3)
 	e4 = ext(j4)
 	e5 = ext(j5)
-	@test e1["s"].data ≈ [0]
-	@test e2["s"].data ≈ [0.375]
-	@test e3["s"].data ≈ [0.525]
-	@test e4["s"].data ≈ [1.0]
-	@test e5["s"].data ≈ [0.875]
+	@test e1["k"].data ≈ [0]
+	@test e2["k"].data ≈ [0.375]
+	@test e3["k"].data ≈ [0.525]
+	@test e4["k"].data ≈ [1.0]
+	@test e5["k"].data ≈ [0.875]
 
 	@test buf_printtree(e1) ==
 	"""
 	ProductNode
-	  └── a: ProductNode
-	           ├── e1: ArrayNode(5, 1)
-	           ├── e2: ProductNode
-	           │         └── Sylvanas is the worst warchief ever: ArrayNode(2053, 1)
-	           └── e3: ArrayNode(1, 1)"""
+	  ├── e1: ArrayNode(5, 1)
+	  ├── e2: ArrayNode(2053, 1)
+	  └── e3: ArrayNode(1, 1)"""
 end
 
 @testset "mixing numeric and non-numeric strings" begin
@@ -477,11 +475,11 @@ end
 	e3 = ext(j3)
 	e4 = ext(j4)
 	e5 = ext(j5)
-	@test e1["k"].data ≈ [0]
-	@test e2["k"].data ≈ [0.5]
-	@test e3["k"].data ≈ [1.0]
-	@test e4["k"].data ≈ [0]
-	@test e5["k"].data ≈ [0]
+	@test e1["U"].data ≈ [0]
+	@test e2["U"].data ≈ [0.5]
+	@test e3["U"].data ≈ [1.0]
+	@test e4["U"].data ≈ [0]
+	@test e5["U"].data ≈ [0]
 
 	@test hash(ext) !== hash(suggestextractor(JsonGrinder.schema([j1, j2, j4, j5])))
 end
@@ -552,7 +550,6 @@ end
 	ext_j2 = ext(j2)
     @test buf_printtree(ext_j2) ==
     """
-	ProductNode
-	  └── a: BagNode with 1 bag(s)
-	           └── ArrayNode(1, 2)"""
+    BagNode with 1 bag(s)
+      └── ArrayNode(1, 2)"""
 end
diff --git a/test/hierarchical_utils_extractors.jl b/test/hierarchical_utils_extractors.jl
@@ -43,13 +43,13 @@ end
 end
 
 @testset "children" begin
-    @test children(ext) == (a=ext[:a], b=ext[:b], c=ext[:c])
+    @test children(ext) == [:a=>ext[:a], :b=>ext[:b], :c=>ext[:c]]
     @test children(ext[:a]) == ()
-    @test children(ext[:b]) == (a=ext[:b][:a], b=ext[:b][:b])
+    @test children(ext[:b]) == [:a=>ext[:b][:a], :b=>ext[:b][:b]]
     @test children(ext[:b][:a]) == (ext[:b][:a].item,)
     @test children(ext[:b][:b]) == ()
-    @test children(ext[:c]) == (a=ext[:c][:a],)
-    @test children(ext[:c][:a]) == (a=ext[:c][:a][:a], b=ext[:c][:a][:b])
+    @test children(ext[:c]) == [:a=>ext[:c][:a]]
+    @test children(ext[:c][:a]) == [:a=>ext[:c][:a][:a], :b=>ext[:c][:a][:b]]
     @test children(ext[:c][:a][:a]) == (ext[:c][:a][:a].item,)
     @test children(ext[:c][:a][:b]) == (ext[:c][:a][:b].item,)
 end
@@ -91,7 +91,7 @@ end
 end
 
 @testset "TypeIterator" begin
-    @test collect(TypeIterator(ext, ExtractArray)) == [ext["Y"], ext["u"], ext["w"]]
+    @test collect(TypeIterator(ExtractArray, ext)) == [ext["Y"], ext["u"], ext["w"]]
 end
 
 @testset "show" begin

diff --git a/test/hierarchical_utils_schema.jl b/test/hierarchical_utils_schema.jl
@@ -42,13 +42,13 @@ end
 end
 
 @testset "children" begin
-    @test children(sch) == (a=sch[:a], b=sch[:b], c=sch[:c])
+    @test children(sch) == [:a=>sch[:a], :b=>sch[:b], :c=>sch[:c]]
     @test children(sch[:a]) == ()
-    @test children(sch[:b]) == (a=sch[:b][:a], b=sch[:b][:b])
+    @test children(sch[:b]) == [:a=>sch[:b][:a], :b=>sch[:b][:b]]
     @test children(sch[:b][:a]) == (sch[:b][:a].items,)
     @test children(sch[:b][:b]) == ()
-    @test children(sch[:c]) == (a=sch[:c][:a],)
-    @test children(sch[:c][:a]) == (a=sch[:c][:a][:a], b=sch[:c][:a][:b])
+    @test children(sch[:c]) == [:a=>sch[:c][:a]]
+    @test children(sch[:c][:a]) == [:a=>sch[:c][:a][:a], :b=>sch[:c][:a][:b]]
     @test children(sch[:c][:a][:a]) == (sch[:c][:a][:a].items,)
     @test children(sch[:c][:a][:b]) == (sch[:c][:a][:b].items,)
 end
@@ -90,7 +90,7 @@ end
 end
 
 @testset "TypeIterator" begin
-    @test collect(TypeIterator(sch, DictEntry)) == [sch[""], sch["U"], sch["k"], sch["s"]]
+    @test collect(TypeIterator(DictEntry, sch)) == [sch[""], sch["U"], sch["k"], sch["s"]]
 end
 
 @testset "print with empty lists" begin
@@ -184,7 +184,7 @@ end
     str_repr = String(take!(buf))
 
     buf = IOBuffer()
-    HierarchicalUtils.printtree(buf, sch; trav=false, trunc=3)
+    HierarchicalUtils.printtree(buf, sch; trav=false, htrunc=3)
     str_repr2 = String(take!(buf))
     @test str_repr == str_repr2
 end
diff --git a/test/schema.jl b/test/schema.jl
@@ -275,12 +275,12 @@ end
 	j6 = JSON.parse("""{}""")
 
 	sch = JsonGrinder.schema([j1,j2,j3,j4,j5,j6])
-	@test children(sch[:c][:a]) == (a=sch[:c][:a][:a], b=sch[:c][:a][:b])
+	@test children(sch[:c][:a]) == [:a=>sch[:c][:a][:a], :b=>sch[:c][:a][:b]]
 	delete!(sch, ".c.a", "a")
-	@test children(sch[:c][:a]) == (b=sch[:c][:a][:b],)
-	@test children(sch[:b].items) == (a=sch[:b].items[:a], b=sch[:b].items[:b])
+	@test children(sch[:c][:a]) == [:b=>sch[:c][:a][:b]]
+	@test children(sch[:b].items) == [:a=>sch[:b].items[:a], :b=>sch[:b].items[:b]]
 	delete!(sch, ".b.[]", "a")
-	@test children(sch[:b].items) == (b=sch[:b].items[:b],)
+	@test children(sch[:b].items) == [:b=>sch[:b].items[:b]]
 end
 
 @testset "Extractor from schema" begin
@@ -304,8 +304,8 @@ end
 	@test e1.data.scalars.data[1, 1] == 0
 	@test e1.data.b.data.a.data ≈ [1., 2., 3.]
 	@test e1.data.b.data.scalars.data[1, 1] == 0.
-	@test e1.data.c.data.a.data.a.data.data == [0. 0.5 1.]
-	@test e1.data.c.data.a.data.b.data.data == [0. 0.5 1.]
+	@test e1.data.c.data.a.data.data == [0. 0.5 1.]
+	@test e1.data.c.data.b.data.data == [0. 0.5 1.]
 end
 
 @testset "Mixing substrings with strings" begin
@@ -491,3 +491,41 @@ end
 	sch = JsonGrinder.schema([j1,j2,j3,j4,j5,j6,j7])
 	@test sch[:a].updated == 7
 end
+
+@testset "prune_json" begin
+	j1 = JSON.parse("""{"a": 4, "b": {"a":[1,2,3],"b": 1},"c": { "a": {"a":[1,2,3],"b":[4,5,6]}}}""",inttype=Float64)
+	j2 = JSON.parse("""{"a": 4, "c": { "a": {"a":[2,3],"b":[5,6]}}}""")
+	j3 = JSON.parse("""{"a": 4, "b": {"a":[1,2,3],"b": 1}}""")
+	j4 = JSON.parse("""{"a": 4, "b": {}}""")
+	j5 = JSON.parse("""{"b": {}}""")
+	j6 = JSON.parse("""{}""")
+
+	sch = JsonGrinder.schema([j1,j2,j3,j4,j5,j6])
+
+	@test JsonGrinder.prune_json(j1, sch) == Dict(
+		"c" => Dict("a"=>Dict("b"=>[4.0, 5.0, 6.0],"a"=>[1.0, 2.0, 3.0])),
+		"b" => Dict("b"=>1.0,"a"=>[1.0, 2.0, 3.0]),
+		"a" => 4.0)
+
+	@test JsonGrinder.prune_json(j2, sch) == Dict(
+		"c" => Dict("a"=>Dict("b"=>[5, 6],"a"=>[2, 3])),
+		"a" => 4)
+
+	delete!(sch.childs, :b)
+
+	@test JsonGrinder.prune_json(j1, sch) == Dict(
+  		"c" => Dict("a"=>Dict("b"=>[4.0, 5.0, 6.0],"a"=>[1.0, 2.0, 3.0])),
+  		"a" => 4.0)
+
+	@test JsonGrinder.prune_json(j2, sch) == Dict(
+		"c" => Dict("a"=>Dict("b"=>[5, 6],"a"=>[2, 3])),
+		"a" => 4)
+
+	delete!(sch.childs, :c)
+
+	@test JsonGrinder.prune_json(j1, sch) == Dict(
+  		"a" => 4.0)
+
+	@test JsonGrinder.prune_json(j2, sch) == Dict(
+		"a" => 4)
+end