Skip to content

Commit fadefca

Browse files
authored
Take fewer locks (#705)
1 parent ba79add commit fadefca

File tree

5 files changed

+47
-22
lines changed

5 files changed

+47
-22
lines changed

src/blas/highlevel.jl

+10
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,16 @@ end
9090
# BLAS 2
9191
#
9292

93+
const ROCUpperOrUnitUpperTriangular = LinearAlgebra.UpperOrUnitUpperTriangular{
94+
<:Any,<:Union{<:ROCArray, Adjoint{<:Any, <:ROCArray}, Transpose{<:Any, <:ROCArray}}}
95+
const ROCLowerOrUnitLowerTriangular = LinearAlgebra.LowerOrUnitLowerTriangular{
96+
<:Any,<:Union{<:ROCArray, Adjoint{<:Any, <:ROCArray}, Transpose{<:Any, <:ROCArray}}}
97+
98+
LinearAlgebra.istriu(::ROCUpperOrUnitUpperTriangular) = true
99+
LinearAlgebra.istril(::ROCUpperOrUnitUpperTriangular) = false
100+
LinearAlgebra.istriu(::ROCLowerOrUnitLowerTriangular) = false
101+
LinearAlgebra.istril(::ROCLowerOrUnitLowerTriangular) = true
102+
93103
# multiplication
94104
LinearAlgebra.generic_trimatmul!(
95105
c::StridedROCVector{T}, uploc, isunitc, tfun::Function,

src/dnn/convolution.jl

+4-4
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,10 @@ get_conv_cache_type(::Type{miopenConvBwdDataAlgorithm_t}) = CONV_BWD_DATA_BENCHM
3838
get_conv_cache_type(::Type{miopenConvBwdWeightsAlgorithm_t}) = CONV_BWD_WEIGHT_BENCHMARK_CACHE
3939

4040
function get_benchmark_cache(conv_type::C, conv_args) where C <: CONV_ALGOS
41-
perf_results = lock(get_conv_cache_type(conv_type)) do cache
42-
get(cache, conv_args, nothing)
43-
end
44-
isnothing(perf_results) && return nothing
41+
cache = get_conv_cache_type(conv_type).payload
42+
perf_results = get(cache, conv_args, nothing)
43+
perf_results nothing && return nothing
44+
4545
workspace = ROCArray{UInt8}(undef, perf_results.memory)
4646
perf_results, workspace
4747
end

src/hip/HIP.jl

+10-5
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,18 @@ end
3535
const CONTEXTS = AMDGPU.LockedObject(Dict{HIPDevice,HIPContext}())
3636

3737
function HIPContext(device::HIPDevice)
38-
lock(CONTEXTS) do contexts
38+
contexts = CONTEXTS.payload
39+
40+
ctx = get(contexts, device, nothing)
41+
ctx nothing || return ctx
42+
43+
Base.@lock CONTEXTS.lock begin
3944
get!(contexts, device) do
40-
context_ref = Ref{hipContext_t}()
41-
hipCtxCreate(context_ref, Cuint(0), device.device)
42-
context = HIPContext(context_ref[], true)
45+
ctx_ref = Ref{hipContext_t}()
46+
hipCtxCreate(ctx_ref, Cuint(0), device.device)
47+
ctx = HIPContext(ctx_ref[], true)
4348
device!(device)
44-
return context
49+
return ctx
4550
end
4651
end
4752
end

src/memory.jl

+14-9
Original file line numberDiff line numberDiff line change
@@ -119,8 +119,14 @@ MemoryStats() = MemoryStats(0, 0.0, 0, 0.0, 0.0, 0)
119119
const MEMORY_STATS = AMDGPU.LockedObject(Dict{Int, MemoryStats}())
120120

121121
function memory_stats(dev::HIPDevice = AMDGPU.device())
122-
Base.lock(MEMORY_STATS) do ms
123-
get!(() -> MemoryStats(), ms, HIP.device_id(dev))
122+
ms = MEMORY_STATS.payload
123+
did = HIP.device_id(dev)
124+
125+
stats = get(ms, did, nothing)
126+
stats nothing || return stats
127+
128+
Base.@lock MEMORY_STATS.lock begin
129+
get!(() -> MemoryStats(), ms, did)
124130
end
125131
end
126132

@@ -158,7 +164,7 @@ function maybe_collect(; blocking::Bool = false)
158164

159165
# Check if we are under memory pressure.
160166
pressure = stats.live / stats.size
161-
min_pressure = 0.5
167+
min_pressure = blocking ? 0.5 : 0.75
162168
pressure < min_pressure && return
163169

164170
# TODO take allocations into account
@@ -175,22 +181,21 @@ function maybe_collect(; blocking::Bool = false)
175181
blocking && (max_gc_rate *= 2;)
176182

177183
# And even more if the pressure is high.
178-
pressure > 0.6 && (max_gc_rate *= 2;)
179-
pressure > 0.8 && (max_gc_rate *= 2;)
180-
# Always try to collect if pressure ≥ 0.9.
181-
gc_rate > max_gc_rate && pressure < 0.9 && return
184+
pressure > 0.9 && (max_gc_rate *= 2;)
185+
pressure > 0.95 && (max_gc_rate *= 2;)
186+
gc_rate > max_gc_rate && return
182187

183188
Base.@atomic stats.last_time = current_time
184189

185190
# Call the GC.
186191
pre_gc_live = stats.live
187-
gc_time = Base.@elapsed GC.gc(pressure > 0.8 ? true : false)
192+
gc_time = Base.@elapsed GC.gc(false)
188193
post_gc_live = stats.live
189194

190195
# Update stats.
191196
freed = pre_gc_live - post_gc_live
192197
Base.@atomic stats.last_freed = freed
193-
Base.@atomic stats.last_gc_time = gc_time
198+
Base.@atomic stats.last_gc_time = 0.75 * stats.last_gc_time + 0.25 * gc_time
194199
return
195200
end
196201

src/runtime/memory/hip.jl

+9-4
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
11
# Device ID => HIPMemoryPool
2-
const MEMORY_POOLS = AMDGPU.LockedObject(
3-
Dict{Int64, HIP.HIPMemoryPool}())
2+
const MEMORY_POOLS = AMDGPU.LockedObject(Dict{Int64, HIP.HIPMemoryPool}())
43

54
function pool_create(dev::HIPDevice)
6-
Base.lock(MEMORY_POOLS) do pools
7-
get!(pools, HIP.device_id(dev)) do
5+
mp = MEMORY_POOLS.payload
6+
did = HIP.device_id(dev)
7+
8+
pool = get(mp, did, nothing)
9+
pool nothing || return pool
10+
11+
Base.@lock MEMORY_POOLS.lock begin
12+
get!(mp, HIP.device_id(dev)) do
813
max_size::UInt64 = AMDGPU.hard_memory_limit()
914
max_size = max_size != typemax(UInt64) ? max_size : UInt64(0)
1015

0 commit comments

Comments
 (0)