@@ -62,14 +62,50 @@ const Boolean = Union{Bit,Bool}
62
62
63
63
# Abstract supertype of the SIMD types declared in this file (`AbstractSIMDVector`, `VecUnroll`);
# it is itself a subtype of `Real`.
# `T` is constrained to a subtype of `StaticInt` or to one of the `NativeTypes`.
# NOTE(review): `W` appears to be the vector width (e.g. the `8` in `Vec{8,Float32}`) — confirm.
abstract type AbstractSIMD{W,T <: Union{<:StaticInt,NativeTypes} } <: Real end
64
64
# Abstract subtype of `AbstractSIMD{W,T}` carrying the same `W` and `T` parameters.
# NOTE(review): presumably the supertype of single-vector types such as `Vec{W,T}` — confirm.
abstract type AbstractSIMDVector{W,T} <: AbstractSIMD{W,T} end
65
+ """
66
+ VecUnroll{N,W,T,V<:Union{NativeTypes,AbstractSIMD{W,T}}} <: AbstractSIMD{W,T}
67
+
68
+ `VecUnroll` supports optimizations when interleaving instructions across different memory storage schemes.
69
+ `VecUnroll{N,W,T}` is typically a tuple of `N+1` `AbstractSIMDVector{W,T}`s. For example, a `VecUnroll{3,8,Float32}`
70
+ is a collection of 4× `Vec{8,Float32}`.
71
+
72
+ # Examples
73
+
74
+ ```jldoctest; setup=:(using VectorizationBase)
75
+ julia> rgbs = [(R = Float32(i)/255, G = Float32(i+100)/255, B = Float32(i+200)/255) for i in 0:7:49]
76
+ 8-element Vector{NamedTuple{(:R, :G, :B), Tuple{Float32, Float32, Float32}}}:
77
+ (R = 0.0, G = 0.39215687, B = 0.78431374)
78
+ (R = 0.02745098, G = 0.41960785, B = 0.8117647)
79
+ (R = 0.05490196, G = 0.44705883, B = 0.8392157)
80
+ (R = 0.08235294, G = 0.4745098, B = 0.8666667)
81
+ (R = 0.10980392, G = 0.5019608, B = 0.89411765)
82
+ (R = 0.13725491, G = 0.5294118, B = 0.92156863)
83
+ (R = 0.16470589, G = 0.5568628, B = 0.9490196)
84
+ (R = 0.19215687, G = 0.58431375, B = 0.9764706)
85
+
86
+ julia> ret = vload(stridedpointer(reinterpret(reshape, Float32, rgbs)), Unroll{1,1,3,2,8,zero(UInt),1}((1,1)))
87
+ 3 x Vec{8, Float32}
88
+ Vec{8, Float32}<0.0f0, 0.02745098f0, 0.05490196f0, 0.08235294f0, 0.10980392f0, 0.13725491f0, 0.16470589f0, 0.19215687f0>
89
+ Vec{8, Float32}<0.39215687f0, 0.41960785f0, 0.44705883f0, 0.4745098f0, 0.5019608f0, 0.5294118f0, 0.5568628f0, 0.58431375f0>
90
+ Vec{8, Float32}<0.78431374f0, 0.8117647f0, 0.8392157f0, 0.8666667f0, 0.89411765f0, 0.92156863f0, 0.9490196f0, 0.9764706f0>
91
+
92
+ julia> typeof(ret)
93
+ VecUnroll{2, 8, Float32, Vec{8, Float32}}
94
+ ```
95
+ While the `R`, `G`, and `B` are interleaved in `rgbs`, they have effectively been split out in `ret`
96
+ (the first contains all 8 `R` values, with `G` and `B` in the second and third, respectively).
97
+
98
+ To optimize for the user's CPU, in real code it would typically be better to use `Int(pick_vector_width(Float32))`
99
+ in place of `8` (`W`) in the `Unroll` construction.
100
+ """
65
101
struct VecUnroll{N,W,T,V<: Union{NativeTypes,AbstractSIMD{W,T}} } <: AbstractSIMD{W,T}
66
102
data:: Tuple{V,Vararg{V,N}}
67
103
@inline (VecUnroll (data:: Tuple{V,Vararg{V,N}} ):: VecUnroll{N,W,T,V} ) where {N,W,T,V<: AbstractSIMD{W,T} } = new {N,W,T,V} (data)
68
104
@inline (VecUnroll (data:: Tuple{T,Vararg{T,N}} ):: VecUnroll{N,T,T} ) where {N,T<: NativeTypes } = new {N,1,T,T} (data)
69
105
# # following two definitions are for checking that you aren't accidentally creating `VecUnroll{0}`s.
70
106
# @inline (VecUnroll(data::Tuple{V,Vararg{V,N}})::VecUnroll{N,W,T,V}) where {N,W,T,V<:AbstractSIMD{W,T}} = (@assert(N > 0); new{N,W,T,V}(data))
71
107
# @inline (VecUnroll(data::Tuple{T,Vararg{T,N}})::VecUnroll{N,T,T}) where {N,T<:NativeTypes} = (@assert(N > 0); new{N,1,T,T}(data))
72
-
108
+
73
109
# @inline VecUnroll{N,W,T,V}(data::Tuple{V,Vararg{V,N}}) where {N,W,T,V<:AbstractSIMDVector{W,T}} = new{N,W,T,V}(data)
74
110
# @inline (VecUnroll(data::Tuple{V,Vararg{V,N}})::VecUnroll{N,W,T,Vec{W,T}}) where {N,W,T,V<:AbstractSIMDVector{W,T}} = new{N,W,T,V}(data)
75
111
# @inline (VecUnroll(data::Tuple{V,Vararg{V,N}})::VecUnroll{N,W,T,V}) where {N,W,T,V<:AbstractSIMDVector{W,T}} = new{N,W,T,V}(data)
0 commit comments