Skip to content

Commit bd71665

Browse files
authored
feat(batch): Add support for b4_SSE2 batched mode. (#1825)
Add support for b4_SSE2 batched mode, enabling batched execution for all x86-64 CPUs that don't support AVX. Quick tests were run to see that output with procedural and texture based materials looked ok and proper SSE2 batched code was being generated for wide/ functions. EDIT: Additionally, all tests pass now. --------- Signed-off-by: Tuomas Tonteri <[email protected]>
1 parent f5cfcc2 commit bd71665

21 files changed

+463
-53
lines changed

.github/workflows/ci.yml

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,18 @@ jobs:
5050
pybind11_ver: v2.7.0
5151
simd: sse4.2
5252
batched: b8_AVX2_noFMA
53-
setenvs: export ENABLE_OPENVDB=0
54-
- desc: gcc9/C++17 llvm13 py3.9 oiio-rel avx2
53+
- desc: gcc9/C++17 llvm11 py3.7 exr2.5 oiio2.3 sse2 batch-b4sse2
54+
nametag: linux-vfx2021
55+
runner: ubuntu-latest
56+
container: aswftesting/ci-osl:2021-clang11
57+
vfxyear: 2021
58+
cxx_std: 17
59+
openimageio_ver: v2.4.13.0
60+
python_ver: 3.7
61+
pybind11_ver: v2.7.0
62+
simd: sse2
63+
batched: b4_SSE2
64+
- desc: gcc9/C++17 llvm13 py3.9 exr3.1 oiio-rel avx2
5565
nametag: linux-vfx2022
5666
runner: ubuntu-latest
5767
container: aswftesting/ci-osl:2022-clang13

INSTALL.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@ and aarch64), and Windows (x86_64). It may build and run on other platforms as
99
well, but we don't officially support or test other than these platforms.
1010

1111
Shader execution is supported on the native architectures of those x86_64 and
12-
aarch64 platforms, a special batched 8- or 16-wide SIMD execution mode
13-
requiring x86_64 with AVX2 or AVX-512 instructions, as well as on NVIDIA GPUs
14-
using Cuda+OptiX.
12+
aarch64 platforms, a special batched 4-, 8- or 16-wide SIMD execution mode
13+
requiring x86_64 with SSE2, AVX/AVX2 or AVX-512 instructions, as well as on
14+
NVIDIA GPUs using Cuda+OptiX.
1515

1616
Dependencies
1717
------------

src/cmake/compiler.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -338,7 +338,7 @@ endif ()
338338
#
339339
# The USE_BATCHED option may be set to indicate that support for batched
340340
# SIMD shader execution be compiled along with target-specific libraries
341-
set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)")
341+
set (USE_BATCHED "" CACHE STRING "Build batched SIMD shader execution for (0, b4_SSE2, b8_AVX, b8_AVX2, b8_AVX2_noFMA, b8_AVX512, b8_AVX512_noFMA, b16_AVX512, b16_AVX512_noFMA)")
342342
option (VEC_REPORT "Enable compiler's reporting system for vectorization" OFF)
343343
set (BATCHED_SUPPORT_DEFINES "")
344344
set (BATCHED_TARGET_LIBS "")

src/include/OSL/batched_texture.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ static_assert(std::alignment_of<VaryingTextureOptions<16>>::value
4949
static_assert(std::alignment_of<VaryingTextureOptions<8>>::value
5050
== VecReg<8>::alignment,
5151
"Expect alignment of data member to set alignment of struct");
52+
static_assert(std::alignment_of<VaryingTextureOptions<4>>::value
53+
== VecReg<4>::alignment,
54+
"Expect alignment of data member to set alignment of struct");
5255

5356
template<int WidthT> struct BatchedTextureOptions {
5457
VaryingTextureOptions<WidthT> varying;
@@ -90,11 +93,15 @@ static_assert(std::alignment_of<BatchedTextureOptions<16>>::value
9093
static_assert(std::alignment_of<BatchedTextureOptions<8>>::value
9194
== VecReg<8>::alignment,
9295
"Expect alignment of data member to set alignment of struct");
96+
static_assert(std::alignment_of<BatchedTextureOptions<4>>::value
97+
== VecReg<4>::alignment,
98+
"Expect alignment of data member to set alignment of struct");
9399

94100
#ifdef OIIO_TEXTURE_SIMD_BATCH_WIDTH
95101
// Code here is to validate our OSL BatchedTextureOptions<WidthT> is binary compatible
96102
// and safe to reinterpret_cast<TextureOptBatch*>
97-
static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8),
103+
static_assert((OIIO::Tex::BatchWidth == 16) || (OIIO::Tex::BatchWidth == 8)
104+
|| (OIIO::Tex::BatchWidth == 4),
98105
"This validation requires OIIO_TEXTURE_SIMD_BATCH_WIDTH=16");
99106

100107
namespace validate_offsets {

src/include/OSL/llvm_util.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -693,6 +693,8 @@ class OSLEXECPUBLIC LLVM_Util {
693693
llvm::Constant* constant(uint32_t i);
694694

695695
/// Return an llvm::Constant holding the given integer constant.
696+
llvm::Constant* constant4(int8_t i);
697+
llvm::Constant* constant4(uint8_t i);
696698
llvm::Constant* constant8(int8_t i);
697699
llvm::Constant* constant8(uint8_t i);
698700
llvm::Constant* constant16(int16_t i);
@@ -1229,6 +1231,7 @@ class OSLEXECPUBLIC LLVM_Util {
12291231

12301232
llvm::Value* op_linearize_16x_indices(llvm::Value* wide_index);
12311233
llvm::Value* op_linearize_8x_indices(llvm::Value* wide_index);
1234+
llvm::Value* op_linearize_4x_indices(llvm::Value* wide_index);
12321235
std::array<llvm::Value*, 2> op_split_16x(llvm::Value* vector_val);
12331236
std::array<llvm::Value*, 2> op_split_8x(llvm::Value* vector_val);
12341237
std::array<llvm::Value*, 4> op_quarter_16x(llvm::Value* vector_val);

src/include/OSL/rendererservices.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -601,6 +601,7 @@ class OSLEXECPUBLIC RendererServices {
601601
/// Unless overridden, a nullptr is returned.
602602
virtual BatchedRendererServices<16>* batched(WidthOf<16>);
603603
virtual BatchedRendererServices<8>* batched(WidthOf<8>);
604+
virtual BatchedRendererServices<4>* batched(WidthOf<4>);
604605

605606
protected:
606607
TextureSystem* m_texturesys; // A place to hold a TextureSystem

src/liboslexec/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST})
379379
list (APPEND TARGET_CXX_OPTS "-march=core-avx2")
380380
elseif (${TARGET_OPT_ISA} STREQUAL "AVX")
381381
list (APPEND TARGET_CXX_OPTS "-march=corei7-avx")
382+
elseif (${TARGET_OPT_ISA} STREQUAL "SSE2")
383+
list (APPEND TARGET_CXX_OPTS "-march=x86-64")
382384
else ()
383385
message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}")
384386
endif ()
@@ -454,6 +456,8 @@ foreach(batched_target ${BATCHED_TARGET_LIST})
454456
list (APPEND TARGET_CXX_OPTS "-march=haswell")
455457
elseif (${TARGET_OPT_ISA} STREQUAL "AVX")
456458
list (APPEND TARGET_CXX_OPTS "-march=sandybridge")
459+
elseif (${TARGET_OPT_ISA} STREQUAL "SSE2")
460+
list (APPEND TARGET_CXX_OPTS "-march=x86-64")
457461
else ()
458462
message (FATAL_ERROR "Unknown ISA=${TARGET_OPT_ISA} extract from USE_BATCHED entry ${batched_target}")
459463
endif ()

src/liboslexec/batched_analysis.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1813,10 +1813,16 @@ struct Analyzer {
18131813
// specific BatchedRendererServices.
18141814
// Right here we don't know which width will be used,
18151815
// so we will just require all widths provide the same answer
1816+
auto rs4 = m_ba.renderer()->batched(WidthOf<4>());
18161817
auto rs8 = m_ba.renderer()->batched(WidthOf<8>());
18171818
auto rs16 = m_ba.renderer()->batched(WidthOf<16>());
1818-
if (rs8 || rs16) {
1819+
if (rs4 || rs8 || rs16) {
18191820
get_attr_is_uniform = true;
1821+
if (rs4) {
1822+
get_attr_is_uniform
1823+
&= rs4->is_attribute_uniform(obj_name,
1824+
attr_name);
1825+
}
18201826
if (rs8) {
18211827
get_attr_is_uniform
18221828
&= rs8->is_attribute_uniform(obj_name,

src/liboslexec/batched_backendllvm.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ BatchedBackendLLVM::BatchedBackendLLVM(ShadingSystemImpl& shadingsys,
141141
switch (vector_width()) {
142142
case 16: m_true_mask_value = Mask<16>(true).value(); break;
143143
case 8: m_true_mask_value = Mask<8>(true).value(); break;
144+
case 4: m_true_mask_value = Mask<4>(true).value(); break;
144145
default: OSL_ASSERT(0 && "unsupported vector width");
145146
}
146147
ll.dumpasm(shadingsys.m_llvm_dumpasm);

src/liboslexec/batched_llvm_instance.cpp

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -537,6 +537,33 @@ const char*
537537
= "b8_AVX_";
538538
#endif
539539

540+
#ifdef __OSL_SUPPORTS_b4_SSE2
541+
template<>
542+
const NameAndSignature
543+
ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_functions[]
544+
= {
545+
# define DECL_INDIRECT(name, signature) \
546+
NameAndSignature { #name, signature },
547+
# define DECL(name, signature) DECL_INDIRECT(name, signature)
548+
# define __OSL_WIDTH 4
549+
# define __OSL_TARGET_ISA SSE2
550+
// Don't allow order of xmacro includes be rearranged
551+
// clang-format off
552+
# include "wide/define_opname_macros.h"
553+
# include "builtindecl_wide_xmacro.h"
554+
# include "wide/undef_opname_macros.h"
555+
// clang-format on
556+
# undef __OSL_TARGET_ISA
557+
# undef __OSL_WIDTH
558+
# undef DECL
559+
# undef DECL_INDIRECT
560+
};
561+
template<>
562+
const char*
563+
ConcreteTargetLibraryHelper<4, TargetISA::x64>::library_selector_string
564+
= "b4_SSE2_";
565+
#endif
566+
540567

541568

542569
std::unique_ptr<BatchedBackendLLVM::TargetLibraryHelper>
@@ -592,6 +619,17 @@ BatchedBackendLLVM::TargetLibraryHelper::build(ShadingContext* context,
592619
default: break;
593620
}
594621
break;
622+
case 4:
623+
switch (target_isa) {
624+
#ifdef __OSL_SUPPORTS_b4_SSE2
625+
case TargetISA::x64:
626+
return RetType(
627+
new ConcreteTargetLibraryHelper<4, TargetISA::x64>());
628+
#endif
629+
default: break;
630+
}
631+
break;
632+
595633
default: OSL_ASSERT(0 && "unsupported vector width");
596634
}
597635
std::cerr << "Build is not configured to support TargetISA of "
@@ -735,6 +773,9 @@ BatchedBackendLLVM::llvm_type_batched_texture_options()
735773
{
736774
std::vector<unsigned int> offset_by_index;
737775
switch (m_width) {
776+
case 4:
777+
build_offsets_of_BatchedTextureOptions<4>(offset_by_index);
778+
break;
738779
case 8:
739780
build_offsets_of_BatchedTextureOptions<8>(offset_by_index);
740781
break;
@@ -2698,6 +2739,9 @@ BatchedBackendLLVM::run()
26982739
{
26992740
std::vector<unsigned int> offset_by_index;
27002741
switch (m_width) {
2742+
case 4:
2743+
build_offsets_of_BatchedShaderGlobals<4>(offset_by_index);
2744+
break;
27012745
case 8:
27022746
build_offsets_of_BatchedShaderGlobals<8>(offset_by_index);
27032747
break;

0 commit comments

Comments
 (0)