From 281e11471782bda40a464f768ce3bd7dbfc7cb3b Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Sat, 9 Dec 2017 15:38:23 -0500 Subject: [PATCH 1/5] add clobber and escape to stop LLVM from over-optimizing the benchmarking loop --- src/BenchmarkTools.jl | 8 ++++++ src/lowlevel.jl | 57 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) create mode 100644 src/lowlevel.jl diff --git a/src/BenchmarkTools.jl b/src/BenchmarkTools.jl index c373d3b0..4539dce0 100644 --- a/src/BenchmarkTools.jl +++ b/src/BenchmarkTools.jl @@ -49,6 +49,14 @@ export BenchmarkGroup, addgroup!, leaves +########################## +# Low-level benchmarking # +########################## + +include("lowlevel.jl") +export clobber, + escape + ###################### # Execution Strategy # ###################### diff --git a/src/lowlevel.jl b/src/lowlevel.jl new file mode 100644 index 00000000..99159d5c --- /dev/null +++ b/src/lowlevel.jl @@ -0,0 +1,57 @@ +########################## +# Low-level benchmarking # +########################## +import Base: llvmcall + +""" + clobber() + +Force the compiler to flush pending writes to global memory. +Acts as an effective read/write barrier. +""" +@inline function clobber() + llvmcall(""" + call void asm sideeffect "", "~{memory}"() + ret void + """, Void, Tuple{}) +end + +""" + _llvmname(typ::Type) + +Produce the string name of the LLVM type equivalent to the Julia type `typ`. +Oh my. The preferable way would be to use LLVM.jl to do this for us. +""" +function _llvmname(typ::Type) + isboxed_ref = Ref{Bool}() + llvmtyp = ccall(:julia_type_to_llvm, Ptr{Void}, + (Any, Ptr{Bool}), typ, isboxed_ref) + name = unsafe_string( + ccall(:LLVMPrintTypeToString, Cstring, (Ptr{Void},), llvmtyp)) + if isboxed_ref[] + return name * "*" + else + return name + end +end + +""" + escape(val) + +The `escape` function can be used to prevent a value or +expression from being optimized away by the compiler. This function is +intended to add little to no overhead.
+See: https://youtu.be/nXaxk27zwlk?t=2441 +""" +@generated function escape(val::T) where T + # We need to get the darn LLVMName to be able to issue a + # fake call. There is probably a better way to do this. + name = _llvmname(T) + ir = """ + call void asm sideeffect "", "X,~{memory}"($name %0) + ret void + """ + quote + llvmcall($ir, Void, Tuple{T}, val) + end +end From 0802ed926f284611ccafdf1e0f41525f2f8710d3 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Sat, 9 Dec 2017 16:51:05 -0500 Subject: [PATCH 2/5] make escape a bit more robust --- src/lowlevel.jl | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/src/lowlevel.jl b/src/lowlevel.jl index 99159d5c..a57806f7 100644 --- a/src/lowlevel.jl +++ b/src/lowlevel.jl @@ -28,11 +28,7 @@ function _llvmname(typ::Type) (Any, Ptr{Bool}), typ, isboxed_ref) name = unsafe_string( ccall(:LLVMPrintTypeToString, Cstring, (Ptr{Void},), llvmtyp)) - if isboxed_ref[] - return name * "*" - else - return name - end + return (isboxed_ref[], name) end """ @@ -44,14 +40,25 @@ intended to add little to no overhead. See: https://youtu.be/nXaxk27zwlk?t=2441 """ @generated function escape(val::T) where T - # We need to get the darn LLVMName to be able to issue a - # fake call. There is probably a better way to do this. - name = _llvmname(T) - ir = """ - call void asm sideeffect "", "X,~{memory}"($name %0) - ret void - """ - quote - llvmcall($ir, Void, Tuple{T}, val) + # If the value is `nothing` then a memory clobber + # should have the same effect. + if T == Void + return :(clobber()) + end + # We need to get the string representation of the LLVM type to be able to issue a + # fake call. + isboxed, name = _llvmname(T) + if isboxed + # name will be `jl_value_t*` which we can't use since string based llvmcall can't handle named structs... + # Ideally we would issue a `bitcast jl_value_t* %0 to i8*` + Base.warn_once("Trying to escape a boxed value. 
Don't know how to handle that.") + else + ir = """ + call void asm sideeffect "", "X,~{memory}"($name %0) + ret void + """ + quote + llvmcall($ir, Void, Tuple{T}, val) + end end end From 1be663705a89a00635ccf931c3e124e630eb891a Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Sat, 9 Dec 2017 16:51:27 -0500 Subject: [PATCH 3/5] add cycle counting infrastructure --- src/lowlevel.jl | 54 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/src/lowlevel.jl b/src/lowlevel.jl index a57806f7..96faba6b 100644 --- a/src/lowlevel.jl +++ b/src/lowlevel.jl @@ -62,3 +62,57 @@ See: https://youtu.be/nXaxk27zwlk?t=2441 end end end + +################ +# Count cycles # +################ + +# Only implemented on x86_64 and needs cpuflags: +# rdtscp, tsc, nonstop_tsc, tsc_known_freq, constant_tsc +# See https://github.com/dterei/gotsc for a good discussion. + +""" + bench_start() + +Issues the instructions `cpuid,rdtsc` to get a precise cycle counter at the beginning of a code segment. +""" +@inline function bench_start() + llvmcall(""" + %a = call {i32, i32} asm sideeffect "CPUID\nRDTSC\nMOV %edx, \$0\nMOV %eax, \$1", "=r,=r,~{rax},~{rbx},~{rcx},~{rdx}"() + %a.0 = extractvalue { i32, i32 } %a, 0 + %a.1 = extractvalue { i32, i32 } %a, 1 + %b0 = insertvalue [2 x i32] undef, i32 %a.0, 0 + %b = insertvalue [2 x i32] %b0 , i32 %a.1, 1 + ret [2 x i32] %b + """, Tuple{UInt32, UInt32}, Tuple{}) +end + +""" + bench_end() + +Issues the instructions `rdtscp,cpuid` to get a precise cycle counter at the end of a code segment. 
+""" +@inline function bench_end() + llvmcall(""" + %a = call {i32, i32} asm sideeffect "RDTSCP\nMOV %edx, \$0\nMOV %eax, \$1\nCPUID", "=r,=r,~{rax},~{rbx},~{rcx},~{rdx}"() + %a.0 = extractvalue { i32, i32 } %a, 0 + %a.1 = extractvalue { i32, i32 } %a, 1 + %b0 = insertvalue [2 x i32] undef, i32 %a.0, 0 + %b = insertvalue [2 x i32] %b0 , i32 %a.1, 1 + ret [2 x i32] %b + """, Tuple{UInt32, UInt32}, Tuple{}) +end + +function cyc_convert(c::Tuple{UInt32, UInt32}) + a, b = c + ((a % UInt64) << 32) | b +end + +macro elapsed_cyc(ex) + quote + local c0 = bench_start() + escape($(esc(ex))) + local c1 = bench_end() + cyc_convert(c1)-cyc_convert(c0) + end +end From f9babf7a8388de2c407b8f980f41ee7b5e7f86d1 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Sat, 9 Dec 2017 17:27:35 -0500 Subject: [PATCH 4/5] add process and thread cpu time --- src/lowlevel.jl | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/lowlevel.jl b/src/lowlevel.jl index 96faba6b..c1e029aa 100644 --- a/src/lowlevel.jl +++ b/src/lowlevel.jl @@ -116,3 +116,39 @@ macro elapsed_cyc(ex) cyc_convert(c1)-cyc_convert(c0) end end + +########## +# Timers # +########## +struct TimeSpec + tv_sec :: UInt64 # time_t + tv_nsec :: UInt64 +end + +const CLOCK_PROCESS_CPUTIME_ID = Cint(2) +const CLOCK_THREAD_CPUTIME_ID = Cint(3) + +@inline function clock_gettime(cid) + ts = Ref{TimeSpec}() + ccall(:clock_gettime, Cint, (Cint, Ref{TimeSpec}), cid, ts) + return ts[].tv_nsec +end + +""" + getProcessTime() + +Per-process CPU-time clock (measures CPU time consumed by all +threads in the process). +""" +@inline function getProcessTime() + clock_gettime(CLOCK_PROCESS_CPUTIME_ID) +end + +""" + getThreadTime() + +Thread-specific CPU-time clock. 
+""" +@inline function getThreadTime() + clock_gettime(CLOCK_THREAD_CPUTIME_ID) +end From 55a938268ffb679588bcaecbdbaa1c5b0a86e7ee Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Sat, 9 Dec 2017 17:44:46 -0500 Subject: [PATCH 5/5] convert process and thread time into seconds --- src/lowlevel.jl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/lowlevel.jl b/src/lowlevel.jl index c1e029aa..6c4858fa 100644 --- a/src/lowlevel.jl +++ b/src/lowlevel.jl @@ -124,6 +124,7 @@ struct TimeSpec tv_sec :: UInt64 # time_t tv_nsec :: UInt64 end +maketime(ts) = ts.tv_sec + ts.tv_nsec * 1e-9 const CLOCK_PROCESS_CPUTIME_ID = Cint(2) const CLOCK_THREAD_CPUTIME_ID = Cint(3) @@ -131,7 +132,7 @@ const CLOCK_THREAD_CPUTIME_ID = Cint(3) @inline function clock_gettime(cid) ts = Ref{TimeSpec}() ccall(:clock_gettime, Cint, (Cint, Ref{TimeSpec}), cid, ts) - return ts[].tv_nsec + return ts[] end """ @@ -141,7 +142,7 @@ Per-process CPU-time clock (measures CPU time consumed by all threads in the process). """ @inline function getProcessTime() - clock_gettime(CLOCK_PROCESS_CPUTIME_ID) + maketime(clock_gettime(CLOCK_PROCESS_CPUTIME_ID)) end """ @@ -150,5 +151,5 @@ end Thread-specific CPU-time clock. """ @inline function getThreadTime() - clock_gettime(CLOCK_THREAD_CPUTIME_ID) + maketime(clock_gettime(CLOCK_THREAD_CPUTIME_ID)) end