diff --git a/Project.toml b/Project.toml index d668ce18..25dce948 100644 --- a/Project.toml +++ b/Project.toml @@ -1,10 +1,11 @@ name = "UnROOT" uuid = "3cd96dde-e98d-4713-81e9-a4a1b0235ce9" -authors = ["Tamas Gal", "Jerry Ling"] -version = "0.3.4" +authors = ["Tamas Gal", "Jerry Ling", "Johannes Schumann", "Nick Amin"] +version = "0.3.5" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" +ArraysOfArrays = "65a8f2f4-9b39-5baf-92e2-a9cc46fdf018" CodecLz4 = "5ba52731-8f18-5e0d-9241-30f10d1ec561" CodecXz = "ba30903b-d9e8-5048-a5ec-d1f5b0d4b47b" CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" @@ -22,6 +23,7 @@ TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9" [compat] AbstractTrees = "^0.3.0" +ArraysOfArrays = "^0.5.3" CodecLz4 = "^0.3.0, ^0.4.0" CodecXz = "^0.6.0, ^0.7.0" CodecZlib = "^0.6.0, ^0.7.0" @@ -39,10 +41,10 @@ TypedTables = "^1.0.0" julia = "^1.3" [extras] +InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" MD5 = "6ac74813-4b46-53a4-afec-0b5dc9d7885c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" ThreadsX = "ac1d9e8a-700a-412c-b207-f0111f4b6c0d" -InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" [targets] test = ["Test", "ThreadsX", "MD5", "InteractiveUtils"] diff --git a/src/UnROOT.jl b/src/UnROOT.jl index ea0bb047..7c8d3d7c 100644 --- a/src/UnROOT.jl +++ b/src/UnROOT.jl @@ -6,9 +6,8 @@ import Base: keys, get, getindex, getproperty, show, length, iterate, position, ntoh(b::Bool) = b import AbstractTrees: children, printnode, print_tree -using Base.Threads: SpinLock -using CodecZlib, CodecLz4, CodecXz, CodecZstd, StaticArrays, LorentzVectors +using CodecZlib, CodecLz4, CodecXz, CodecZstd, StaticArrays, LorentzVectors, ArraysOfArrays using Mixers, Parameters, Memoization, LRUCache import Tables, TypedTables, PrettyTables, DataFrames diff --git a/src/custom.jl b/src/custom.jl index 39d74622..eaef9ae1 100644 --- a/src/custom.jl +++ b/src/custom.jl @@ -50,10 +50,16 @@ The `interped_data` method specialized for `LorentzVector`. This method will get [`basketarray`](@ref) instead of the default method for `TLorentzVector` branch. """ function interped_data(rawdata, rawoffsets, ::Type{Vector{LVF64}}, ::Type{Offsetjagg}) - @views map(1:length(rawoffsets)-1) do idx - idxrange = rawoffsets[idx]+10+1 : rawoffsets[idx+1] - interped_data(rawdata[idxrange], rawoffsets[idx], LVF64, Nojagg) + _size = 64 # needs to account for 32 bytes header + data = UInt8[] + offset = Int64[0] + @views @inbounds for i in 1:(length(rawoffsets) - 1) + rg = (rawoffsets[i]+10+1) : rawoffsets[i+1] + append!(data, rawdata[rg]) + push!(offset, last(offset) + length(rg)) end + real_data = interped_data(data, offset, LVF64, Nojagg) + VectorOfVectors(real_data, offset .÷ _size .+ 1) end function interped_data(rawdata, rawoffsets, ::Type{LVF64}, ::Type{J}) where {T, J <: JaggType} # even with rawoffsets, we know each TLV is destinied to be 64 bytes diff --git a/src/iteration.jl b/src/iteration.jl index 964aafdd..94459eb6 100644 --- a/src/iteration.jl +++ b/src/iteration.jl @@ -9,28 +9,30 @@ function arrays(f::ROOTFile, treename) Threads.@threads for i in eachindex(names) res[i] = array(f, "$treename/$(names[i])") end - res + return res end - """ array(f::ROOTFile, path; raw=false) Reads an array from a branch. Set `raw=true` to return raw data and correct offsets. """ -array(f::ROOTFile, path::AbstractString; raw=false) = array(f::ROOTFile, _getindex(f, path); raw=raw) +function array(f::ROOTFile, path::AbstractString; raw=false) + return array(f::ROOTFile, _getindex(f, path); raw=raw) +end function array(f::ROOTFile, branch; raw=false) ismissing(branch) && error("No branch found at $path") (!raw && length(branch.fLeaves.elements) > 1) && error( - "Branches with multiple leaves are not supported yet. Try reading with `array(...; raw=true)`.") + "Branches with multiple leaves are not supported yet. Try reading with `array(...; raw=true)`.", + ) rawdata, rawoffsets = readbranchraw(f, branch) if raw return rawdata, rawoffsets end - T, J = auto_T_JaggT(f, branch; customstructs = f.customstructs) - interped_data(rawdata, rawoffsets, T, J) + T, J = auto_T_JaggT(f, branch; customstructs=f.customstructs) + return interped_data(rawdata, rawoffsets, T, J) end """ @@ -38,19 +40,24 @@ end basketarray(f::ROOTFile, branch::Union{TBranch, TBranchElement}, ith) Reads actual data from ith basket of a branch. This function first calls [`readbasket`](@ref) -to obtain raw bytes and offsets of a basket, then calls [`auto_T_JaggT`](@ref) followed +to obtain raw bytes and offsets of a basket, then calls [`auto_T_JaggT`](@ref) followed by [`interped_data`](@ref) to translate raw bytes into actual data. """ -basketarray(f::ROOTFile, path::AbstractString, ithbasket) = basketarray(f, f[path], ithbasket) -@memoize LRU(; maxsize=1 * 1024^3, by=x->sum(sizeof, x)) function basketarray(f::ROOTFile, branch, ithbasket) -# function basketarray(f::ROOTFile, branch, ithbasket) +function basketarray(f::ROOTFile, path::AbstractString, ithbasket) + return basketarray(f, f[path], ithbasket) +end +@memoize LRU(; maxsize=1024^3, by=x -> sum(sizeof, x)) function basketarray( + f::ROOTFile, branch, ithbasket +) + # function basketarray(f::ROOTFile, branch, ithbasket) ismissing(branch) && error("No branch found at $path") length(branch.fLeaves.elements) > 1 && error( - "Branches with multiple leaves are not supported yet. Try reading with `array(...; raw=true)`.") + "Branches with multiple leaves are not supported yet. Try reading with `array(...; raw=true)`.", + ) rawdata, rawoffsets = readbasket(f, branch, ithbasket) - T, J = auto_T_JaggT(f, branch; customstructs = f.customstructs) - interped_data(rawdata, rawoffsets, T, J) + T, J = auto_T_JaggT(f, branch; customstructs=f.customstructs) + return interped_data(rawdata, rawoffsets, T, J) end # function barrior to make getting individual index faster @@ -83,17 +90,18 @@ julia> ab[begin:end] ... ``` """ -mutable struct LazyBranch{T, J} <: AbstractVector{T} +mutable struct LazyBranch{T,J,B} <: AbstractVector{T} f::ROOTFile - b::Union{TBranch, TBranchElement} + b::Union{TBranch,TBranchElement} L::Int64 fEntry::Vector{Int64} - buffer::Vector{T} + buffer::B buffer_range::UnitRange{Int64} - function LazyBranch(f::ROOTFile, b::Union{TBranch, TBranchElement}) - T, J = auto_T_JaggT(f, b; customstructs = f.customstructs) - new{T, J}(f, b, length(b), b.fBasketEntry, T[], 0:0) + function LazyBranch(f::ROOTFile, b::Union{TBranch,TBranchElement}) + T, J = auto_T_JaggT(f, b; customstructs=f.customstructs) + _buffer = J === Nojagg ? T[] : VectorOfVectors{eltype(T)}() + return new{T,J,typeof(_buffer)}(f, b, length(b), b.fBasketEntry, _buffer, 0:0) end end @@ -102,13 +110,13 @@ function Base.hash(lb::LazyBranch, h::UInt) h = hash(lb.b.fClassName, h) h = hash(lb.L, h) h = hash(lb.buffer_range, h) - h + return h end Base.size(ba::LazyBranch) = (ba.L,) Base.length(ba::LazyBranch) = ba.L Base.firstindex(ba::LazyBranch) = 1 Base.lastindex(ba::LazyBranch) = ba.L -Base.eltype(ba::LazyBranch{T,J}) where {T,J} = T +Base.eltype(ba::LazyBranch{T,J,B}) where {T,J,B} = T function Base.show(io::IO, lb::LazyBranch) summary(io, lb) @@ -118,6 +126,7 @@ function Base.show(io::IO, lb::LazyBranch) println(" Description: $(lb.b.fTitle)") println(" NumEntry: $(lb.L)") print(" Entry Type: $(eltype(lb))") + nothing end """ @@ -133,24 +142,27 @@ and update buffer and buffer range accordingly. moment, access a `LazyBranch` from different threads at the same time can cause performance issue and incorrect event result. """ -function Base.getindex(ba::LazyBranch{T, J}, idx::Integer) where {T, J} +function Base.getindex(ba::LazyBranch{T,J,B}, idx::Integer) where {T,J,B} br = ba.buffer_range if idx ∉ br - seek_idx = findfirst(x -> x>(idx-1), ba.fEntry) - 1 #support 1.0 syntax - ba.buffer = basketarray(ba.f, ba.b, seek_idx) - br = ba.fEntry[seek_idx] + 1 : ba.fEntry[seek_idx+1] - 1 + seek_idx = findfirst(x -> x > (idx - 1), ba.fEntry) - 1 #support 1.0 syntax + bb = basketarray(ba.f, ba.b, seek_idx) + @assert typeof(bb) === B + ba.buffer = bb + br = (ba.fEntry[seek_idx] + 1):(ba.fEntry[seek_idx + 1] - 1) ba.buffer_range = br end localidx = idx - br.start + 1 return ba.buffer[localidx] end -function Base.iterate(ba::LazyBranch{T, J}, idx=1) where {T, J} - idx>ba.L && return nothing - return (ba[idx], idx+1) +function Base.iterate(ba::LazyBranch{T,J,B}, idx=1) where {T,J,B} + idx > ba.L && return nothing + return (ba[idx], idx + 1) end -const _LazyTreeType = TypedTables.Table{<:NamedTuple, 1, NamedTuple{S, N}} where {S, N <: Tuple{Vararg{LazyBranch}}} +const _LazyTreeType = + TypedTables.Table{<:NamedTuple,1,NamedTuple{S,N}} where {S,N<:Tuple{Vararg{LazyBranch}}} struct LazyTree{T} <: DataFrames.AbstractDataFrame treetable::T @@ -160,7 +172,9 @@ end # a specific branch Base.getindex(lt::LazyTree, row::Int) = innertable(lt)[row] -Base.getindex(lt::LazyTree, rang::UnitRange) = LazyTree(innertable(lt)[rang], Core.getfield(lt, :colidx)) +function Base.getindex(lt::LazyTree, rang::UnitRange) + return LazyTree(innertable(lt)[rang], Core.getfield(lt, :colidx)) +end Base.getindex(lt::LazyTree, ::typeof(!), s::Symbol) = lt[:, s] Base.getindex(lt::LazyTree, ::Colon, i::Int) = lt[:, propertynames(lt)[i]] Base.getindex(lt::LazyTree, ::typeof(!), i::Int) = lt[:, propertynames(lt)[i]] @@ -187,7 +201,7 @@ function getbranchnamesrecursive(obj) for b in obj.fBranches.elements push!(out, b.fName) for subname in getbranchnamesrecursive(b) - push!(out,"$(b.fName)/$(subname)") + push!(out, "$(b.fName)/$(subname)") end end return out @@ -224,39 +238,43 @@ function LazyTree(f::ROOTFile, s::AbstractString, branches) if length(branches) > 30 @warn "Your tree is quite wide, with $(length(branches)) branches, this will take compiler a moment." end - d = Dict{Symbol, LazyBranch}() - d_colidx = Dict{Symbol, Int}() + d = Dict{Symbol,LazyBranch}() + d_colidx = Dict{Symbol,Int}() _m(s::AbstractString) = isequal(s) _m(r::Regex) = Base.Fix1(occursin, r) - branches = mapreduce(b->filter(_m(b), getbranchnamesrecursive(tree)), ∪, branches) + branches = mapreduce(b -> filter(_m(b), getbranchnamesrecursive(tree)), ∪, branches) SB = Symbol.(branches) - for (i,b) in enumerate(SB) + for (i, b) in enumerate(SB) d[b] = f["$s/$b"] d_colidx[b] = i end - LazyTree( TypedTables.Table(d), DataFrames.Index(d_colidx, SB) ) + return LazyTree(TypedTables.Table(d), DataFrames.Index(d_colidx, SB)) end function LazyTree(f::ROOTFile, s::AbstractString) - LazyTree(f, s, keys(f[s])) + return LazyTree(f, s, keys(f[s])) end -LazyTree(f::ROOTFile, s::AbstractString, branch::Union{AbstractString, Regex}) = LazyTree(f, s, [branch]) +function LazyTree(f::ROOTFile, s::AbstractString, branch::Union{AbstractString,Regex}) + return LazyTree(f, s, [branch]) +end struct LazyEvent{T<:TypedTables.Table} tree::T idx::Int64 end Base.show(io::IO, evt::LazyEvent) = show(io, "LazyEvent with: $(propertynames(evt))") -Base.getproperty(evt::LazyEvent, s::Symbol) = @inbounds getproperty(Core.getfield(evt, :tree),s)[Core.getfield(evt, :idx)] +function Base.getproperty(evt::LazyEvent, s::Symbol) + @inbounds getproperty(Core.getfield(evt, :tree), s)[Core.getfield(evt, :idx)] +end Base.collect(evt::LazyEvent) = Core.getfield(evt, :tree)[Core.getfield(evt, :idx)] -function Base.iterate(tree::T, idx=1) where T <: LazyTree +function Base.iterate(tree::T, idx=1) where {T<:LazyTree} idx > length(tree) && return nothing - LazyEvent(Core.getfield(tree, :treetable), idx), idx+1 + return LazyEvent(Core.getfield(tree, :treetable), idx), idx + 1 end # TODO this is not terribly slow, but we can get faster implementation still ;) -function Base.getindex(ba::LazyBranch{T, J}, rang::UnitRange) where {T, J} - [ba[i] for i in rang] +function Base.getindex(ba::LazyBranch{T,J,B}, rang::UnitRange) where {T,J,B} + return [ba[i] for i in rang] end diff --git a/src/root.jl b/src/root.jl index 1fc9c85d..a9465d5b 100644 --- a/src/root.jl +++ b/src/root.jl @@ -189,19 +189,19 @@ function interped_data(rawdata, rawoffsets, ::Type{T}, ::Type{J}) where {T, J<:J # the other is where we need to auto detector T bsaed on class name # we want the fundamental type as `reinterpret` will create vector if J == Nojagg - return ntoh.(reinterpret(T, rawdata)) + return map(ntoh, reinterpret(T, rawdata)) elseif J == Offsetjaggjagg # the branch is doubly jagged jagg_offset = 10 subT = eltype(eltype(T)) - out = Vector{Vector{Vector{subT}}}() + out = VectorOfVectors{Vector{subT}}() @views for i in 1:(length(rawoffsets)-1) flat = rawdata[(rawoffsets[i]+1+jagg_offset:rawoffsets[i+1])] - row = Vector{Vector{subT}}() + row = VectorOfVectors{subT}() cursor = 1 while cursor < length(flat) n = ntoh(reinterpret(Int32, flat[cursor:cursor+sizeof(Int32)-1])[1]) cursor += sizeof(Int32) - b = ntoh.(reinterpret(subT, flat[cursor:cursor+n*sizeof(subT)-1])) + b = map(ntoh, reinterpret(subT, flat[cursor:cursor+n*sizeof(subT)-1])) cursor += n*sizeof(subT) push!(row, b) end @@ -215,13 +215,17 @@ function interped_data(rawdata, rawoffsets, ::Type{T}, ::Type{J}) where {T, J<:J # this is why we need to append `rawoffsets` in the `readbranchraw()` call # when you use this range to index `rawdata`, you will get raw bytes belong to each event # Say your real data is Int32 and you see 8 bytes after indexing, then this event has [num1, num2] as real data - @views [ - ntoh.(reinterpret( - T, rawdata[ (rawoffsets[i]+jagg_offset+1):rawoffsets[i+1] ] - )) for i in 1:(length(rawoffsets) - 1) - ] + _size = sizeof(eltype(T)) + data = UInt8[] + offset = Int64[0] # god damn 0-based index + @views @inbounds for i in 1:(length(rawoffsets) - 1) + rg = (rawoffsets[i]+jagg_offset+1) : rawoffsets[i+1] + append!(data, rawdata[rg]) + push!(offset, last(offset) + length(rg)) + end + real_data = map(ntoh, reinterpret(T, data)) + return VectorOfVectors(real_data, offset .÷ _size .+ 1) end - end function _normalize_ftype(fType) @@ -276,7 +280,7 @@ function auto_T_JaggT(f::ROOTFile, branch; customstructs::Dict{String, Type}) # this will call a customize routine if defined by user # see custom.jl _custom = customstructs[classname] - return _custom, _jaggtype + return _custom, Nojagg catch end m = match(r"vector<(.*)>", classname) @@ -372,7 +376,7 @@ See also: [`auto_T_JaggT`](@ref), [`basketarray`](@ref) """ readbasket(f::ROOTFile, branch, ith) = readbasketseek(f, branch, branch.fBasketSeek[ith]) -@memoize LRU(; maxsize=3 * 1024^3, by=x -> sum(sizeof, x)) function readbasketseek( +@memoize LRU(; maxsize=1024^3, by=x -> sum(sizeof, x)) function readbasketseek( # function readbasketseek( f::ROOTFile, branch::Union{TBranch, TBranchElement}, seek_pos::Int )::Tuple{Vector{UInt8},Vector{Int32},Int32} # just being extra careful diff --git a/src/types.jl b/src/types.jl index 4397e06b..e6f52b6d 100644 --- a/src/types.jl +++ b/src/types.jl @@ -46,8 +46,8 @@ end fDatime::UInt32 fKeylen::Int16 fCycle::Int16 - fSeekKey::Integer - fSeekPdir::Integer + fSeekKey::Int64 + fSeekPdir::Int64 fClassName::String fName::String fTitle::String