Skip to content

Commit

Permalink
Merge pull request #43 from racinmat/master
Browse files Browse the repository at this point in the history
Using newer HUtils, added function for pruning jsons
  • Loading branch information
racinmat authored Oct 14, 2020
2 parents 83fc395 + 10dfff6 commit 604dce2
Show file tree
Hide file tree
Showing 11 changed files with 134 additions and 75 deletions.
6 changes: 3 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "JsonGrinder"
uuid = "d201646e-a9c0-11e8-1063-23b139159713"
authors = ["pevnak <[email protected]>", "Matej Racinsky <[email protected]>"]
version = "1.6.1"
version = "1.7.0"

[deps]
BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
Expand Down Expand Up @@ -29,12 +29,12 @@ BSON = "^0.2"
DataStructures = "^0.17"
FillArrays = "0.6, 0.7, 0.8"
Flux = "~0.11"
HierarchicalUtils = "~1.0"
HierarchicalUtils = "^1.1"
HttpCommon = "^0.5"
JSON = "0.18, 0.19, 0.20, 0.21"
LearnBase = "0.2, 0.3, 0.4"
MLDataPattern = "^0.5"
Mill = "^1.4"
Mill = "^1.5"
Mustache = "^1"
StatsBase = "0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.30, 0.31, 0.32, 0.33"
ThreadTools = "^0.2"
Expand Down
4 changes: 3 additions & 1 deletion examples/recipes.jl
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,9 @@ extract_data(JsonGrinder.sample_synthetic(sch))
data = tmap(extract_data, samples[1:5_000])
data = reduce(catobs, data)
target = tmap(extract_target, samples[1:5_000])
target = reduce(catobs, target)[:cuisine].data
# switch back to the line below once correct handling of dicts is merged
#target = reduce(catobs, target)[:cuisine].data
target = reduce(catobs, target).data

e = sch[:cuisine]

Expand Down
2 changes: 1 addition & 1 deletion src/JsonGrinder.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ export ExtractScalar, ExtractCategorical, ExtractArray, ExtractDict, ExtractOneH
export suggestextractor, schema, extractbatch, generate_html

Base.show(io::IO, ::T) where T <: Union{JSONEntry, AbstractExtractor} = show(io, Base.typename(T))
Base.show(io::IO, ::MIME"text/plain", n::Union{JSONEntry, AbstractExtractor}) = HierarchicalUtils.printtree(io, n; trav=false, trunc=3)
Base.show(io::IO, ::MIME"text/plain", n::Union{JSONEntry, AbstractExtractor}) = HierarchicalUtils.printtree(io, n; trav=false, htrunc=3, vtrunc=20)
Base.getindex(n::Union{JSONEntry, AbstractExtractor}, i::AbstractString) = HierarchicalUtils.walk(n, i)

end # module
7 changes: 7 additions & 0 deletions src/extractors/extractdict.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,14 @@ end
(s::ExtractDict{S,V})(v::Dict) where {S<:Dict,V<:Nothing} = vcat([f(get(v,k,nothing)) for (k,f) in s.vec]...)

# Extract the dictionary `v` with an `ExtractDict` that has only `other`
# extractors (its `vec` part is `nothing`, as fixed by the `where` clause).
#
# Every `(k, f)` pair in `s.other` looks up `String(k)` in `v`; a key missing
# from `v` is handed to its extractor as `nothing`.
#
# Returns the single extracted child directly when there is exactly one child
# (it is not wrapped), otherwise a `ProductNode` over all children keyed by
# `Symbol(k)`.
function (s::ExtractDict{S,V})(v::Dict) where {S<:Nothing,V<:Dict}
    o = [Symbol(k) => f(get(v, String(k), nothing)) for (k, f) in s.other]
    # A lone child is returned as-is rather than as a one-field ProductNode.
    length(o) == 1 && return o[1].second
    return ProductNode((; o...))
end

Expand Down
7 changes: 4 additions & 3 deletions src/hierarchical_utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@ noderepr(n::DictEntry) = "[" * (isnothing(n.childs) ? "Empty " : "") * "Dict] (u
noderepr(n::MultiEntry) = "[" * (isempty(n.childs) ? "Empty " : "") * "MultiEntry] (updated = $(n.updated))"

children(n::ArrayEntry) = (n.items,)
children(n::DictEntry) = (; n.childs...)
children(n::MultiEntry) = (; Dict( Symbol(k) => v for (k,v) in enumerate(n.childs))...)
# using vector of pairs because splatting to named tuple is not good for compiler
children(n::DictEntry) = collect(n.childs)
children(n::MultiEntry) = [Symbol(k) => v for (k,v) in enumerate(n.childs)]

# for extractor structures
# default extractor
Expand All @@ -35,4 +36,4 @@ children(n::ExtractArray) = (n.item,)
children(n::MultipleRepresentation) = n.extractors
children(e::ExtractKeyAsField) = (e.key, e.item)
children(n::AuxiliaryExtractor) = (n.extractor,)
children(n::ExtractDict) = (; Dict(Symbol(k)=>v for (k,v) in merge(filter(!isnothing, [n.vec, n.other])...))...)
children(n::ExtractDict) = [Symbol(k)=>v for (k,v) in merge(filter(!isnothing, [n.vec, n.other])...)]
14 changes: 14 additions & 0 deletions src/schema/schema.jl
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,17 @@ function Base.delete!(sch::JSONEntry, path::AbstractString, field::AbstractStrin
item = reduce((s, f) -> f(s), map(make_selector, selectors), init=sch)
delete!(item.childs, Symbol(field))
end

"""
    prune_json(json, sch::Entry)

Return `json` unchanged; `sch` is used for dispatch only.
"""
function prune_json(json, sch::Entry)
    return json
end

"""
    prune_json(json, sch::ArrayEntry)

Prune each element of `json` against the item schema `sch.items` and return the
results as produced by `map`.
"""
function prune_json(json, sch::ArrayEntry)
    return map(element -> prune_json(element, sch.items), json)
end

"""
    prune_json(json, sch::DictEntry)

Return a new `Dict` containing only those entries of `json` whose key is also a
child of `sch`. Each kept value is pruned recursively against the matching
child schema. Keys of `json` that the schema does not know are dropped; schema
children absent from `json` are skipped.
"""
function prune_json(json, sch::DictEntry)
    out = Dict()
    for (k, v) in children(sch)
        # Schema child names are converted to `String` to look them up in `json`.
        key = String(k)
        key in keys(json) && (out[key] = prune_json(json[key], v))
    end
    return out
end
2 changes: 1 addition & 1 deletion test/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,4 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[compat]
Mill = "~1.4"
Mill = "~1.5"
95 changes: 46 additions & 49 deletions test/extractors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -79,18 +79,18 @@ end
a2 = br(Dict("a" => 5, "b" => 7))
a3 = br(Dict("a" => 5, "c" => [1,2,3,4]))

@test all(a1[:c].data.data .== [-3 0 3 6])
@test all(a1[:c].bags .== [1:4])
@test all(catobs(a1,a1)[:c].data.data .== [-3 0 3 6 -3 0 3 6])
@test all(catobs(a1,a1)[:c].bags .== [1:4,5:8])

@test all(catobs(a1,a2)[:c].data.data .== [-3 0 3 6])
@test all(catobs(a1,a2)[:c].bags .== [1:4,0:-1])

@test all(a3[:c].data.data .== [-3 0 3 6])
@test all(a3[:c].bags .== [1:4])
@test all(catobs(a3,a3)[:c].data.data .== [-3 0 3 6 -3 0 3 6])
@test all(catobs(a3,a3)[:c].bags .== [1:4,5:8])
@test all(a1.data.data .== [-3 0 3 6])
@test all(a1.bags .== [1:4])
@test all(catobs(a1,a1).data.data .== [-3 0 3 6 -3 0 3 6])
@test all(catobs(a1,a1).bags .== [1:4,5:8])

@test all(catobs(a1,a2).data.data .== [-3 0 3 6])
@test all(catobs(a1,a2).bags .== [1:4,0:-1])

@test all(a3.data.data .== [-3 0 3 6])
@test all(a3.bags .== [1:4])
@test all(catobs(a3,a3).data.data .== [-3 0 3 6 -3 0 3 6])
@test all(catobs(a3,a3).bags .== [1:4,5:8])
end

@testset "Testing Nested Missing Arrays" begin
Expand Down Expand Up @@ -347,10 +347,10 @@ end
ext_j3 = ext(j3)
ext_j4 = ext(j4)

@test ext_j1[:a].data.data isa Array{Float32,2}
@test ext_j2[:a].data.data isa Array{Float32,2}
@test ext_j3[:a].data.data isa Array{Float32,2}
@test ext_j4[:a].data.data isa Array{Float32,2}
@test ext_j1.data.data isa Array{Float32,2}
@test ext_j2.data.data isa Array{Float32,2}
@test ext_j3.data.data isa Array{Float32,2}
@test ext_j4.data.data isa Array{Float32,2}
end

@testset "testing irregular extractor" begin
Expand All @@ -361,13 +361,13 @@ end
sch = schema([j1,j2,j3])
ext = suggestextractor(sch)
a = ext(j1)
@test a[:a].data[1].data[1] == 0
@test a[:a].data[2].data[:a].data.s[1] == ""
@test nobs(a[:a].data[3]) == 1
@test a[:e1].data[1] == 0
@test a[:e2].data[:a].data.s[1] == ""
@test nobs(a[:e3]) == 1
# this should be 0, there is a problem with handling missing values
# todo: open an issue on github so we have it tracked
@test_broken nobs(a.data[3].data) == 0
@test nobs(a[:a].data[3].data) == 1
@test_broken nobs(a[:e3].data) == 0
@test nobs(a[:e3].data) == 1
end

@testset "Mixed scalar extraction" begin
Expand All @@ -386,10 +386,10 @@ end
e2 = ext(j2)
e3 = ext(j3)
e4 = ext(j4)
@test e1["k"].data ≈ [0]
@test e2["k"].data ≈ [1]
@test e3["k"].data ≈ [0.7]
@test e4["k"].data ≈ [0.5]
@test e1["U"].data ≈ [0]
@test e2["U"].data ≈ [1]
@test e3["U"].data ≈ [0.7]
@test e4["U"].data ≈ [0.5]
end

@testset "Mixed scalar extraction with other types" begin
Expand Down Expand Up @@ -417,32 +417,30 @@ end

@test buf_printtree(ext) ==
"""
Dict
└── a: MultiRepresentation
├── e1: FeatureVector with 5 items
├── e2: Dict
│ └── Sylvanas is the worst warchief ever: String
└── e3: Float32"""
Dict
└── a: MultiRepresentation
├── e1: FeatureVector with 5 items
├── e2: Dict
│ └── Sylvanas is the worst warchief ever: String
└── e3: Float32"""

e1 = ext(j1)
e2 = ext(j2)
e3 = ext(j3)
e4 = ext(j4)
e5 = ext(j5)
@test e1["s"].data ≈ [0]
@test e2["s"].data ≈ [0.375]
@test e3["s"].data ≈ [0.525]
@test e4["s"].data ≈ [1.0]
@test e5["s"].data ≈ [0.875]
@test e1["k"].data ≈ [0]
@test e2["k"].data ≈ [0.375]
@test e3["k"].data ≈ [0.525]
@test e4["k"].data ≈ [1.0]
@test e5["k"].data ≈ [0.875]

@test buf_printtree(e1) ==
"""
ProductNode
└── a: ProductNode
├── e1: ArrayNode(5, 1)
├── e2: ProductNode
│ └── Sylvanas is the worst warchief ever: ArrayNode(2053, 1)
└── e3: ArrayNode(1, 1)"""
├── e1: ArrayNode(5, 1)
├── e2: ArrayNode(2053, 1)
└── e3: ArrayNode(1, 1)"""
end

@testset "mixing numeric and non-numeric strings" begin
Expand Down Expand Up @@ -477,11 +475,11 @@ end
e3 = ext(j3)
e4 = ext(j4)
e5 = ext(j5)
@test e1["k"].data ≈ [0]
@test e2["k"].data ≈ [0.5]
@test e3["k"].data ≈ [1.0]
@test e4["k"].data ≈ [0]
@test e5["k"].data ≈ [0]
@test e1["U"].data ≈ [0]
@test e2["U"].data ≈ [0.5]
@test e3["U"].data ≈ [1.0]
@test e4["U"].data ≈ [0]
@test e5["U"].data ≈ [0]

@test hash(ext) !== hash(suggestextractor(JsonGrinder.schema([j1, j2, j4, j5])))
end
Expand Down Expand Up @@ -552,7 +550,6 @@ end
ext_j2 = ext(j2)
@test buf_printtree(ext_j2) ==
"""
ProductNode
└── a: BagNode with 1 bag(s)
└── ArrayNode(1, 2)"""
BagNode with 1 bag(s)
└── ArrayNode(1, 2)"""
end
10 changes: 5 additions & 5 deletions test/hierarchical_utils_extractors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,13 @@ end
end

@testset "children" begin
@test children(ext) == (a=ext[:a], b=ext[:b], c=ext[:c])
@test children(ext) == [:a=>ext[:a], :b=>ext[:b], :c=>ext[:c]]
@test children(ext[:a]) == ()
@test children(ext[:b]) == (a=ext[:b][:a], b=ext[:b][:b])
@test children(ext[:b]) == [:a=>ext[:b][:a], :b=>ext[:b][:b]]
@test children(ext[:b][:a]) == (ext[:b][:a].item,)
@test children(ext[:b][:b]) == ()
@test children(ext[:c]) == (a=ext[:c][:a],)
@test children(ext[:c][:a]) == (a=ext[:c][:a][:a], b=ext[:c][:a][:b])
@test children(ext[:c]) == [:a=>ext[:c][:a]]
@test children(ext[:c][:a]) == [:a=>ext[:c][:a][:a], :b=>ext[:c][:a][:b]]
@test children(ext[:c][:a][:a]) == (ext[:c][:a][:a].item,)
@test children(ext[:c][:a][:b]) == (ext[:c][:a][:b].item,)
end
Expand Down Expand Up @@ -91,7 +91,7 @@ end
end

@testset "TypeIterator" begin
@test collect(TypeIterator(ext, ExtractArray)) == [ext["Y"], ext["u"], ext["w"]]
@test collect(TypeIterator(ExtractArray, ext)) == [ext["Y"], ext["u"], ext["w"]]
end

@testset "show" begin
Expand Down
12 changes: 6 additions & 6 deletions test/hierarchical_utils_schema.jl
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,13 @@ end
end

@testset "children" begin
@test children(sch) == (a=sch[:a], b=sch[:b], c=sch[:c])
@test children(sch) == [:a=>sch[:a], :b=>sch[:b], :c=>sch[:c]]
@test children(sch[:a]) == ()
@test children(sch[:b]) == (a=sch[:b][:a], b=sch[:b][:b])
@test children(sch[:b]) == [:a=>sch[:b][:a], :b=>sch[:b][:b]]
@test children(sch[:b][:a]) == (sch[:b][:a].items,)
@test children(sch[:b][:b]) == ()
@test children(sch[:c]) == (a=sch[:c][:a],)
@test children(sch[:c][:a]) == (a=sch[:c][:a][:a], b=sch[:c][:a][:b])
@test children(sch[:c]) == [:a=>sch[:c][:a]]
@test children(sch[:c][:a]) == [:a=>sch[:c][:a][:a], :b=>sch[:c][:a][:b]]
@test children(sch[:c][:a][:a]) == (sch[:c][:a][:a].items,)
@test children(sch[:c][:a][:b]) == (sch[:c][:a][:b].items,)
end
Expand Down Expand Up @@ -90,7 +90,7 @@ end
end

@testset "TypeIterator" begin
@test collect(TypeIterator(sch, DictEntry)) == [sch[""], sch["U"], sch["k"], sch["s"]]
@test collect(TypeIterator(DictEntry, sch)) == [sch[""], sch["U"], sch["k"], sch["s"]]
end

@testset "print with empty lists" begin
Expand Down Expand Up @@ -184,7 +184,7 @@ end
str_repr = String(take!(buf))

buf = IOBuffer()
HierarchicalUtils.printtree(buf, sch; trav=false, trunc=3)
HierarchicalUtils.printtree(buf, sch; trav=false, htrunc=3)
str_repr2 = String(take!(buf))
@test str_repr == str_repr2
end
50 changes: 44 additions & 6 deletions test/schema.jl
Original file line number Diff line number Diff line change
Expand Up @@ -275,12 +275,12 @@ end
j6 = JSON.parse("""{}""")

sch = JsonGrinder.schema([j1,j2,j3,j4,j5,j6])
@test children(sch[:c][:a]) == (a=sch[:c][:a][:a], b=sch[:c][:a][:b])
@test children(sch[:c][:a]) == [:a=>sch[:c][:a][:a], :b=>sch[:c][:a][:b]]
delete!(sch, ".c.a", "a")
@test children(sch[:c][:a]) == (b=sch[:c][:a][:b],)
@test children(sch[:b].items) == (a=sch[:b].items[:a], b=sch[:b].items[:b])
@test children(sch[:c][:a]) == [:b=>sch[:c][:a][:b]]
@test children(sch[:b].items) == [:a=>sch[:b].items[:a], :b=>sch[:b].items[:b]]
delete!(sch, ".b.[]", "a")
@test children(sch[:b].items) == (b=sch[:b].items[:b],)
@test children(sch[:b].items) == [:b=>sch[:b].items[:b]]
end

@testset "Extractor from schema" begin
Expand All @@ -304,8 +304,8 @@ end
@test e1.data.scalars.data[1, 1] == 0
@test e1.data.b.data.a.data ≈ [1., 2., 3.]
@test e1.data.b.data.scalars.data[1, 1] == 0.
@test e1.data.c.data.a.data.a.data.data == [0. 0.5 1.]
@test e1.data.c.data.a.data.b.data.data == [0. 0.5 1.]
@test e1.data.c.data.a.data.data == [0. 0.5 1.]
@test e1.data.c.data.b.data.data == [0. 0.5 1.]
end

@testset "Mixing substrings with strings" begin
Expand Down Expand Up @@ -491,3 +491,41 @@ end
sch = JsonGrinder.schema([j1,j2,j3,j4,j5,j6,j7])
@test sch[:a].updated == 7
end

# Checks that `JsonGrinder.prune_json` keeps only the parts of a JSON document
# that are still present in the schema, before and after keys are deleted from it.
@testset "prune_json" begin
# j1 is parsed with `inttype=Float64`, which is why its expected values below
# are written as floats (4.0, 1.0, ...), while j2's stay integers.
j1 = JSON.parse("""{"a": 4, "b": {"a":[1,2,3],"b": 1},"c": { "a": {"a":[1,2,3],"b":[4,5,6]}}}""",inttype=Float64)
j2 = JSON.parse("""{"a": 4, "c": { "a": {"a":[2,3],"b":[5,6]}}}""")
j3 = JSON.parse("""{"a": 4, "b": {"a":[1,2,3],"b": 1}}""")
j4 = JSON.parse("""{"a": 4, "b": {}}""")
j5 = JSON.parse("""{"b": {}}""")
j6 = JSON.parse("""{}""")

# The schema is built from all six samples; only j1 and j2 are pruned below.
sch = JsonGrinder.schema([j1,j2,j3,j4,j5,j6])

# Full schema: the expected result contains every key of the input.
@test JsonGrinder.prune_json(j1, sch) == Dict(
"c" => Dict("a"=>Dict("b"=>[4.0, 5.0, 6.0],"a"=>[1.0, 2.0, 3.0])),
"b" => Dict("b"=>1.0,"a"=>[1.0, 2.0, 3.0]),
"a" => 4.0)

@test JsonGrinder.prune_json(j2, sch) == Dict(
"c" => Dict("a"=>Dict("b"=>[5, 6],"a"=>[2, 3])),
"a" => 4)

# Remove the top-level child `b` from the schema: `b` must vanish from j1's
# result, while j2 (which never had `b`) is unaffected.
delete!(sch.childs, :b)

@test JsonGrinder.prune_json(j1, sch) == Dict(
"c" => Dict("a"=>Dict("b"=>[4.0, 5.0, 6.0],"a"=>[1.0, 2.0, 3.0])),
"a" => 4.0)

@test JsonGrinder.prune_json(j2, sch) == Dict(
"c" => Dict("a"=>Dict("b"=>[5, 6],"a"=>[2, 3])),
"a" => 4)

# Remove the top-level child `c` as well: only `a` is expected to remain.
delete!(sch.childs, :c)

@test JsonGrinder.prune_json(j1, sch) == Dict(
"a" => 4.0)

@test JsonGrinder.prune_json(j2, sch) == Dict(
"a" => 4)
end

2 comments on commit 604dce2

@racinmat
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register()

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/22951

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v1.7.0 -m "<description of version>" 604dce27812c7d501ef917dc1f165eba5f07ad4c
git push origin v1.7.0

Please sign in to comment.