From 1f5f12687f69d1b4fd19d51b82d8579ea7d599b4 Mon Sep 17 00:00:00 2001 From: Dylan Baker Date: Wed, 30 Jan 2019 09:19:39 -0800 Subject: [PATCH 001/378] configure: Bump SWR LLVM requirement to 7 It is currently impossible to build a dist tarball that works when SWR requires LLVM 6. To generate the tarball we'd need to configure with LLVM 6, which is fine. But to build the dist check we need LLVM 7, as RadeonSI and RadV require that version. Unfortunately the headers generated with LLVM 6 don't compile with LLVM 7, the API has changed between the two versions. I weighed a couple of options here. One would be to ship an unbootstrapped tarball generated with meson. This would fix the issue by not bootstrapping, so whatever version of LLVM used would work because the SWR headers would be generated at compile time. Unfortunately this would involve some heavy modifications to the infrastructure used to upload the tarballs, and I've decided not to pursue this. --- configure.ac | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configure.ac b/configure.ac index 858da79f4d0..d169223094f 100644 --- a/configure.ac +++ b/configure.ac @@ -122,7 +122,7 @@ LLVM_REQUIRED_OPENCL=3.9.0 LLVM_REQUIRED_R600=3.9.0 LLVM_REQUIRED_RADEONSI=7.0.0 LLVM_REQUIRED_RADV=7.0.0 -LLVM_REQUIRED_SWR=6.0.0 +LLVM_REQUIRED_SWR=7.0.0 dnl Check for progs AC_PROG_CPP @@ -2845,8 +2845,8 @@ if test -n "$with_gallium_drivers"; then fi # XXX: Keep in sync with LLVM_REQUIRED_SWR -AM_CONDITIONAL(SWR_INVALID_LLVM_VERSION, test "x$LLVM_VERSION" != x6.0.0 -a \ - "x$LLVM_VERSION" != x6.0.1) +AM_CONDITIONAL(SWR_INVALID_LLVM_VERSION, test "x$LLVM_VERSION" != x7.0.0 -a \ + "x$LLVM_VERSION" != x7.0.1) if test "x$enable_llvm" = "xyes" -a "$with_gallium_drivers"; then llvm_require_version $LLVM_REQUIRED_GALLIUM "gallium" From e7f6a5d17f6075589c570c050c0f04a94a65d9a2 Mon Sep 17 00:00:00 2001 From: Dylan Baker Date: Wed, 30 Jan 2019 09:44:24 -0800 Subject: [PATCH 002/378] automake: Add
--enable-autotools to distcheck flags Fixes: e68777c87ceed02ab199b32f941778c3cf97c794 ("autotools: Deprecate the use of autotools") --- Makefile.am | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.am b/Makefile.am index e7e14f5b3cd..6d3c8cc19b4 100644 --- a/Makefile.am +++ b/Makefile.am @@ -22,6 +22,7 @@ SUBDIRS = src AM_DISTCHECK_CONFIGURE_FLAGS = \ + --enable-autotools \ --enable-dri \ --enable-dri3 \ --enable-egl \ From 2b603ee4f1f68c7d34a2139d67e996de14bb40ef Mon Sep 17 00:00:00 2001 From: Dylan Baker Date: Wed, 30 Jan 2019 10:02:41 -0800 Subject: [PATCH 003/378] android,autotools,i965: Fix location of float64_glsl.h Android.mk and autotools disagree about where generated files should go, which wasn't a problem until we wanted to build a dist tarball. This corrects the problem by changing the output and include paths to be the same on android and autotools (meson already has the correct include path). Fixes: 7d7b30835cfb9eb89beca9fb8593d0954f79b84d ("automake: Fix path to generated source") --- src/compiler/Android.glsl.gen.mk | 2 +- src/mesa/drivers/dri/i965/Makefile.am | 2 ++ src/mesa/drivers/dri/i965/brw_program.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/compiler/Android.glsl.gen.mk b/src/compiler/Android.glsl.gen.mk index e31eb6f101f..3b94ea7bd2f 100644 --- a/src/compiler/Android.glsl.gen.mk +++ b/src/compiler/Android.glsl.gen.mk @@ -104,6 +104,6 @@ $(intermediates)/glsl/ir_expression_operation_strings.h: $(LOCAL_PATH)/glsl/ir_e @mkdir -p $(dir $@) $(hide) $(MESA_PYTHON2) $< strings > $@ -$(intermediates)/compiler/glsl/float64_glsl.h: $(LOCAL_PATH)/glsl/xxd.py +$(intermediates)/glsl/float64_glsl.h: $(LOCAL_PATH)/glsl/xxd.py @mkdir -p $(dir $@) $(hide) $(MESA_PYTHON2) $< $(MESA_TOP)/src/compiler/glsl/float64.glsl $@ -n float64_source > $@ diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am index b562c6ea21c..0bda2897e8e 100644 --- a/src/mesa/drivers/dri/i965/Makefile.am +++
b/src/mesa/drivers/dri/i965/Makefile.am @@ -34,6 +34,8 @@ AM_CFLAGS = \ -I$(top_builddir)/src/util \ -I$(top_srcdir)/src/mesa/drivers/dri/common \ -I$(top_srcdir)/src/gtest/include \ + -I$(top_builddir)/src/compiler \ + -I$(top_srcdir)/src/compiler \ -I$(top_builddir)/src/compiler/glsl \ -I$(top_builddir)/src/compiler/nir \ -I$(top_srcdir)/src/compiler/nir \ diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c index 9ab25cf664c..1038d9a47a0 100644 --- a/src/mesa/drivers/dri/i965/brw_program.c +++ b/src/mesa/drivers/dri/i965/brw_program.c @@ -42,7 +42,7 @@ #include "compiler/glsl/ir.h" #include "compiler/glsl/program.h" #include "compiler/glsl/glsl_to_nir.h" -#include "compiler/glsl/float64_glsl.h" +#include "glsl/float64_glsl.h" #include "brw_program.h" #include "brw_context.h" From 2fddad9e3f7259eb26930a99bfc145c7c0d7c851 Mon Sep 17 00:00:00 2001 From: Dylan Baker Date: Tue, 29 Jan 2019 15:36:30 -0800 Subject: [PATCH 004/378] VERSION: bump to 19.0.0-rc1 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 5bd94c44a5c..e17116a15f1 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -19.0.0-devel +19.0.0-rc1 From 45d1aa2f6cf2230a0e9f54ac7a20a56e832859a1 Mon Sep 17 00:00:00 2001 From: Emil Velikov Date: Tue, 29 Jan 2019 17:25:17 +0000 Subject: [PATCH 005/378] vc4: Declare the last cpu pointer as being modified in NEON asm. Earlier commit addressed 7 of the 8 instances available. 
v2: Rebase patch back to master (by anholt) Cc: Carsten Haitzler (Rasterman) Cc: Eric Anholt Fixes: 300d3ae8b14 ("vc4: Declare the cpu pointers as being modified in NEON asm.") Signed-off-by: Emil Velikov (cherry picked from commit 385843ac3ce1b868d9e24fcb2dbc0c8d5f5a7c99) --- src/broadcom/common/v3d_cpu_tiling.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/broadcom/common/v3d_cpu_tiling.h b/src/broadcom/common/v3d_cpu_tiling.h index e10b4586609..cb1ee7c96f4 100644 --- a/src/broadcom/common/v3d_cpu_tiling.h +++ b/src/broadcom/common/v3d_cpu_tiling.h @@ -159,9 +159,8 @@ v3d_store_utile(void *gpu, uint32_t gpu_stride, * d0-d7. */ "vstm %[gpu], {q0, q1, q2, q3}\n" - : + : [cpu] "+r"(cpu) : [gpu] "r"(gpu), - [cpu] "r"(cpu), [cpu_stride] "r"(cpu_stride) : "q0", "q1", "q2", "q3"); return; From 4d1dd3b0cdad7f3b5308440bed2c3cbfbee2756c Mon Sep 17 00:00:00 2001 From: Ernestas Kulik Date: Thu, 30 Aug 2018 19:02:47 +0300 Subject: [PATCH 006/378] vc4: Fix leak in HW queries error path Reported by Coverity: in the case where there exist hardware and non-hardware queries, the code does not jump to err_free_query and leaks the query. CID: 1430194 Signed-off-by: Ernestas Kulik Fixes: 9ea90ffb98fb ("broadcom/vc4: Add support for HW perfmon") (cherry picked from commit f6e49d5ad0fde19a074644491475470d684dd721) --- src/gallium/drivers/vc4/vc4_query.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/drivers/vc4/vc4_query.c b/src/gallium/drivers/vc4/vc4_query.c index 6e4681e93cc..f08785f457f 100644 --- a/src/gallium/drivers/vc4/vc4_query.c +++ b/src/gallium/drivers/vc4/vc4_query.c @@ -132,7 +132,7 @@ vc4_create_batch_query(struct pipe_context *pctx, unsigned num_queries, /* We can't mix HW and non-HW queries. 
*/ if (nhwqueries && nhwqueries != num_queries) - return NULL; + goto err_free_query; if (!nhwqueries) return (struct pipe_query *)query; From 31d0079a202be21e77ec15a35860f69cd9aa139c Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Wed, 30 Jan 2019 12:07:29 +0100 Subject: [PATCH 007/378] radv/winsys: fix hash when adding internal buffers This fixes serious stuttering in Shadow Of The Tomb Raider. Fixes: 50fd253bd6e ("radv/winsys: Add priority handling during submit.") Signed-off-by: Samuel Pitoiset Reviewed-by: Bas Nieuwenhuizen (cherry picked from commit 9c762c01c8f69e8209935d902648cb174de8c8bf) --- src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c index d3b1e2cd4c6..438ed594ede 100644 --- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c @@ -543,7 +543,7 @@ static void radv_amdgpu_cs_add_buffer_internal(struct radv_amdgpu_cs *cs, cs->handles[cs->num_buffers].bo_handle = bo; cs->handles[cs->num_buffers].bo_priority = priority; - hash = ((uintptr_t)bo >> 6) & (ARRAY_SIZE(cs->buffer_hash_table) - 1); + hash = bo & (ARRAY_SIZE(cs->buffer_hash_table) - 1); cs->buffer_hash_table[hash] = cs->num_buffers; ++cs->num_buffers; From 0a72505a9e2a7ad368b2d17457b91873cb413b58 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Thu, 31 Jan 2019 08:03:43 -0500 Subject: [PATCH 008/378] freedreno: fix release tarball Fixes: b4476138d5a freedreno: move drm to common location Reviewed-by: Eric Engestrom Signed-off-by: Rob Clark (cherry picked from commit e252656d1481e5bbc6bf34beb01076b329073ac7) --- src/freedreno/Makefile.am | 1 + 1 file changed, 1 insertion(+) diff --git a/src/freedreno/Makefile.am b/src/freedreno/Makefile.am index 460fb87fb46..342f03d644c 100644 --- a/src/freedreno/Makefile.am +++ b/src/freedreno/Makefile.am @@ -45,6 +45,7 @@ TESTS = BUILT_SOURCES = CLEANFILES = 
EXTRA_DIST = \ + meson.build \ drm/meson.build \ ir3/ir3_nir_trig.py \ ir3/meson.build From 7f91ae20b9dc33bb609ab5e631963af67e53c84a Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Thu, 31 Jan 2019 09:56:19 -0500 Subject: [PATCH 009/378] freedreno: more fixing release tarball Fixes: aa0fed10d35 freedreno: move ir3 to common location Signed-off-by: Rob Clark (cherry picked from commit 39cfdf9930659b01cd89f0fbc29c43c623e17d2d) --- src/gallium/drivers/freedreno/Makefile.am | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/freedreno/Makefile.am b/src/gallium/drivers/freedreno/Makefile.am index fe409fa5f52..dbc15f40389 100644 --- a/src/gallium/drivers/freedreno/Makefile.am +++ b/src/gallium/drivers/freedreno/Makefile.am @@ -23,4 +23,6 @@ libfreedreno_la_SOURCES = \ $(a6xx_SOURCES) \ $(ir3_SOURCES) -EXTRA_DIST = meson.build +EXTRA_DIST = \ + ir3/ir3_cmdline.c \ + meson.build From 535cc4f1d511c147c4c9525e2f7b9fc742ea83ae Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 30 Jan 2019 09:33:53 -0800 Subject: [PATCH 010/378] mesa: Skip partial InvalidateFramebuffer of packed depth/stencil. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit One of the CTS cases tries to invalidate just stencil of packed depth/stencil, and we incorrectly lost the depth contents. 
Fixes dEQP-GLES3.functional.fbo.invalidate.whole.unbind_read_stencil Fixes: 0c42b5f3cb90 ("mesa: wire up InvalidateFramebuffer") Reviewed-by: Marek Olšák (cherry picked from commit db2ae51121067b66d4ee8313ba7f74cecb201a03) --- src/mesa/main/fbobject.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c index 8290ea94dfc..87c33be7854 100644 --- a/src/mesa/main/fbobject.c +++ b/src/mesa/main/fbobject.c @@ -4691,6 +4691,29 @@ discard_framebuffer(struct gl_context *ctx, struct gl_framebuffer *fb, if (!att) continue; + /* If we're asked to invalidate just depth or just stencil, but the + * attachment is packed depth/stencil, then we can only use + * Driver.DiscardFramebuffer if the attachments list includes both depth + * and stencil and they both point at the same renderbuffer. + */ + if ((attachments[i] == GL_DEPTH_ATTACHMENT || + attachments[i] == GL_STENCIL_ATTACHMENT) && + (!att->Renderbuffer || + att->Renderbuffer->_BaseFormat == GL_DEPTH_STENCIL)) { + GLenum other_format = (attachments[i] == GL_DEPTH_ATTACHMENT ? + GL_STENCIL_ATTACHMENT : GL_DEPTH_ATTACHMENT); + bool has_both = false; + for (int j = 0; j < numAttachments; j++) { + if (attachments[j] == other_format) + has_both = true; + break; + } + + if (fb->Attachment[BUFFER_DEPTH].Renderbuffer != + fb->Attachment[BUFFER_STENCIL].Renderbuffer || !has_both) + continue; + } + ctx->Driver.DiscardFramebuffer(ctx, fb, att); } } From 7fdb08375f7091284778668f6ecf0d9304e0dc24 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 30 Jan 2019 11:17:35 -0800 Subject: [PATCH 011/378] v3d: Fix image_load_store clamping of signed integer stores. This was copy-and-paste fail, that oddly showed up in the CTS's reinterprets of r32f, rgba8, and srgba8 to rgba8i, but not r32ui and r32i to rgba8i or reinterprets to other signed int formats. 
Fixes: 6281f26f064a ("v3d: Add support for shader_image_load_store.") (cherry picked from commit ab4d5775b0decad7df56245cecad63912ed62b4c) --- src/broadcom/compiler/v3d_nir_lower_image_load_store.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c index e74206b3949..2aa3cbad495 100644 --- a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c +++ b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c @@ -156,7 +156,7 @@ pack_sint(nir_builder *b, nir_ssa_def *color, const unsigned *bits, int num_components) { color = nir_channels(b, color, (1 << num_components) - 1); - color = nir_format_clamp_uint(b, color, bits); + color = nir_format_clamp_sint(b, color, bits); return pack_bits(b, color, bits, num_components, true); } From c824f8031cee0f4be10943438ffa264fd09ac4e4 Mon Sep 17 00:00:00 2001 From: Ernestas Kulik Date: Thu, 30 Aug 2018 19:02:46 +0300 Subject: [PATCH 012/378] v3d: Fix leak in resource setup error path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reported by Coverity: in the case of unsupported modifier request, the code does not jump to the “fail” label to destroy the acquired resource. 
CID: 1435704 Signed-off-by: Ernestas Kulik Fixes: 45bb8f295710 ("broadcom: Add V3D 3.3 gallium driver called "vc5", for BCM7268.") (cherry picked from commit 90458bef544ac46a912f06e73f71c3cb20fdaaf6) --- src/gallium/drivers/v3d/v3d_resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/drivers/v3d/v3d_resource.c b/src/gallium/drivers/v3d/v3d_resource.c index 21c68942e14..84e86799d5e 100644 --- a/src/gallium/drivers/v3d/v3d_resource.c +++ b/src/gallium/drivers/v3d/v3d_resource.c @@ -780,7 +780,7 @@ v3d_resource_create_with_modifiers(struct pipe_screen *pscreen, rsc->tiled = false; } else { fprintf(stderr, "Unsupported modifier requested\n"); - return NULL; + goto fail; } rsc->internal_format = prsc->format; From 89f84f98e0451629e44dcb0a7cdba10b60515bf6 Mon Sep 17 00:00:00 2001 From: Neha Bhende Date: Tue, 29 Jan 2019 12:21:00 -0700 Subject: [PATCH 013/378] st/mesa: Fix topogun-1.06-orc-84k-resize.trace crash We need to initialize all fields in rs->prim explicitly while creating new rastpos stage. Fixes: bac8534267 ("st/mesa: allow glDrawElements to work with GL_SELECT feedback") v2: Initializing all fields in rs->prim as per Ilia. 
Reviewed-by: Brian Paul Reviewed-by: Ilia Mirkin (cherry picked from commit 69d736b17a96a4d7a21c3c88fd787091acc1def0) --- src/mesa/state_tracker/st_cb_rasterpos.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/mesa/state_tracker/st_cb_rasterpos.c b/src/mesa/state_tracker/st_cb_rasterpos.c index fecaaf77da8..c54b50dc754 100644 --- a/src/mesa/state_tracker/st_cb_rasterpos.c +++ b/src/mesa/state_tracker/st_cb_rasterpos.c @@ -208,6 +208,10 @@ new_draw_rastpos_stage(struct gl_context *ctx, struct draw_context *draw) rs->prim.end = 1; rs->prim.start = 0; rs->prim.count = 1; + rs->prim.pad = 0; + rs->prim.num_instances = 1; + rs->prim.base_instance = 0; + rs->prim.is_indirect = 0; return rs; } From c6649ca94d07daa605814b243706e9ba4ca29576 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Tue, 15 Jan 2019 10:53:44 -0600 Subject: [PATCH 014/378] intel/fs: Do the grf127 hack on SIMD8 instructions in SIMD16 mode Previously, we only applied the fix to shaders with a dispatch mode of SIMD8 but the code it relies on for SIMD16 mode only applies to SIMD16 instructions. If you have a SIMD8 instruction in a SIMD16 shader, neither would trigger and the restriction could still be hit. Fixes: 232ed8980217dd "i965/fs: Register allocator shoudn't use grf127..." Reviewed-by: Jose Maria Casanova Crespo Reviewed-by: Kenneth Graunke (cherry picked from commit b4f0d062cd12b4f675bac900ac41d1085a79239a) --- src/intel/compiler/brw_fs_reg_allocate.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/intel/compiler/brw_fs_reg_allocate.cpp b/src/intel/compiler/brw_fs_reg_allocate.cpp index 6961cb1caf4..b3825f1ef8c 100644 --- a/src/intel/compiler/brw_fs_reg_allocate.cpp +++ b/src/intel/compiler/brw_fs_reg_allocate.cpp @@ -667,15 +667,14 @@ fs_visitor::assign_regs(bool allow_spilling, bool spill_all) * messages adding a node interference to the grf127_send_hack_node. * This node has a fixed asignment to grf127. 
* - * We don't apply it to SIMD16 because previous code avoids any register - * overlap between sources and destination. + * We don't apply it to SIMD16 instructions because previous code avoids + * any register overlap between sources and destination. */ ra_set_node_reg(g, grf127_send_hack_node, 127); - if (dispatch_width == 8) { - foreach_block_and_inst(block, fs_inst, inst, cfg) { - if (inst->is_send_from_grf() && inst->dst.file == VGRF) - ra_add_node_interference(g, inst->dst.nr, grf127_send_hack_node); - } + foreach_block_and_inst(block, fs_inst, inst, cfg) { + if (inst->exec_size < 16 && inst->is_send_from_grf() && + inst->dst.file == VGRF) + ra_add_node_interference(g, inst->dst.nr, grf127_send_hack_node); } if (spilled_any_registers) { From 9667d89fe64426f0b1051cefd1af981afe1888f9 Mon Sep 17 00:00:00 2001 From: Danylo Piliaiev Date: Fri, 1 Feb 2019 12:21:38 +0200 Subject: [PATCH 015/378] anv: Fix VK_EXT_transform_feedback working with varyings packed in PSIZ Transform feedback did not set correct SO_DECL.ComponentMask for varyings packed in VARYING_SLOT_PSIZ: gl_Layer - VARYING_SLOT_LAYER in VARYING_SLOT_PSIZ.y gl_ViewportIndex - VARYING_SLOT_VIEWPORT in VARYING_SLOT_PSIZ.z gl_PointSize - VARYING_SLOT_PSIZ in VARYING_SLOT_PSIZ.w Fixes: 36ee2fd61c8f94 "anv: Implement the basic form of VK_EXT_transform_feedback" Signed-off-by: Danylo Piliaiev Reviewed-by: Jason Ekstrand (cherry picked from commit 64d3b148fe71453c296ba9525f49ffe728171582) --- src/intel/vulkan/genX_pipeline.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c index d2142ae42c2..2a7044a425e 100644 --- a/src/intel/vulkan/genX_pipeline.c +++ b/src/intel/vulkan/genX_pipeline.c @@ -1211,13 +1211,30 @@ emit_3dstate_streamout(struct anv_pipeline *pipeline, hole_dwords -= 4; } + int varying = output->location; + uint8_t component_mask = output->component_mask; + /* VARYING_SLOT_PSIZ contains 
three scalar fields packed together: + * - VARYING_SLOT_LAYER in VARYING_SLOT_PSIZ.y + * - VARYING_SLOT_VIEWPORT in VARYING_SLOT_PSIZ.z + * - VARYING_SLOT_PSIZ in VARYING_SLOT_PSIZ.w + */ + if (varying == VARYING_SLOT_LAYER) { + varying = VARYING_SLOT_PSIZ; + component_mask = 1 << 1; // SO_DECL_COMPMASK_Y + } else if (varying == VARYING_SLOT_VIEWPORT) { + varying = VARYING_SLOT_PSIZ; + component_mask = 1 << 2; // SO_DECL_COMPMASK_Z + } else if (varying == VARYING_SLOT_PSIZ) { + component_mask = 1 << 3; // SO_DECL_COMPMASK_W + } + next_offset[buffer] = output->offset + - __builtin_popcount(output->component_mask) * 4; + __builtin_popcount(component_mask) * 4; so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) { .OutputBufferSlot = buffer, - .RegisterIndex = vue_map->varying_to_slot[output->location], - .ComponentMask = output->component_mask, + .RegisterIndex = vue_map->varying_to_slot[varying], + .ComponentMask = component_mask, }; } From 3f5099180d0f76ebc875719b48a9d590f7aa653f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 25 Jan 2019 20:39:40 -0500 Subject: [PATCH 016/378] radeonsi: fix crashing performance counters (division by zero) Fixes: e2b9329f17 "radeonsi: move remaining perfcounter code into si_perfcounter.c" (cherry picked from commit 742d6cdb42e5570a3a74005f18bb89208069d01f) --- src/gallium/drivers/radeonsi/si_perfcounter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c b/src/gallium/drivers/radeonsi/si_perfcounter.c index 2da14f8868f..d55394f2cba 100644 --- a/src/gallium/drivers/radeonsi/si_perfcounter.c +++ b/src/gallium/drivers/radeonsi/si_perfcounter.c @@ -1333,7 +1333,7 @@ void si_init_perfcounters(struct si_screen *screen) for (i = 0; i < num_blocks; ++i) { struct si_pc_block *block = &pc->blocks[i]; block->b = &blocks[i]; - block->num_instances = block->b->instances; + block->num_instances = MAX2(1, block->b->instances); if 
(!strcmp(block->b->b->name, "CB") || !strcmp(block->b->b->name, "DB")) From 15e2fc16e9cbb0c8e02d30e6f419e01ce8bc675e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= Date: Mon, 4 Feb 2019 18:53:52 +0100 Subject: [PATCH 017/378] loader/dri3: Use strlen instead of sizeof for creating VRR property atom sizeof counts the terminating null character as well, so that also contributed to the ID computed for the X11 atom. But the convention is for only the non-null characters to contribute to the atom ID. Fixes: 2e12fe425fe3 "loader/dri3: Enable adaptive_sync via _VARIABLE_REFRESH property" Reviewed-by: Nicholas Kazlauskas Reviewed-by: Eric Anholt (cherry picked from commit c0a540f32067cc8cb126d9aa1eb12a11cf15373a) --- src/loader/loader_dri3_helper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/loader/loader_dri3_helper.c b/src/loader/loader_dri3_helper.c index ad9b9d87b05..7d61c1df4fc 100644 --- a/src/loader/loader_dri3_helper.c +++ b/src/loader/loader_dri3_helper.c @@ -111,7 +111,7 @@ set_adaptive_sync_property(xcb_connection_t *conn, xcb_drawable_t drawable, xcb_intern_atom_reply_t* reply; xcb_void_cookie_t check; - cookie = xcb_intern_atom(conn, 0, sizeof(name), name); + cookie = xcb_intern_atom(conn, 0, strlen(name), name); reply = xcb_intern_atom_reply(conn, cookie, NULL); if (reply == NULL) return; From f8f68c41a1fbb34624d53c211955c66e8c14768e Mon Sep 17 00:00:00 2001 From: Emil Velikov Date: Tue, 5 Feb 2019 12:09:45 +0000 Subject: [PATCH 018/378] anv: wire up the state_pool_padding test Cc: Jason Ekstrand Fixes: 927ba12b53c ("anv/tests: Adding test for the state_pool padding.") Signed-off-by: Emil Velikov Reviewed-by: Eric Engestrom Reviewed-by: Rafael Antognolli Reviewed-by: Dylan Baker (cherry picked from commit 8943eb8f03fe67710ce65fc0a54024751ff2b5bd) --- src/intel/Makefile.vulkan.am | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/intel/Makefile.vulkan.am b/src/intel/Makefile.vulkan.am index 
b315f10a01a..cad0a57bc7f 100644 --- a/src/intel/Makefile.vulkan.am +++ b/src/intel/Makefile.vulkan.am @@ -253,6 +253,7 @@ VULKAN_TESTS = \ vulkan/tests/block_pool_no_free \ vulkan/tests/state_pool_no_free \ vulkan/tests/state_pool_free_list_only \ + vulkan/tests/state_pool_padding \ vulkan/tests/state_pool VULKAN_TEST_LDADD = \ @@ -274,6 +275,10 @@ vulkan_tests_state_pool_free_list_only_CFLAGS = $(VULKAN_CFLAGS) vulkan_tests_state_pool_free_list_only_CPPFLAGS = $(VULKAN_CPPFLAGS) vulkan_tests_state_pool_free_list_only_LDADD = $(VULKAN_TEST_LDADD) +vulkan_tests_state_pool_padding_CFLAGS = $(VULKAN_CFLAGS) +vulkan_tests_state_pool_padding_CPPFLAGS = $(VULKAN_CPPFLAGS) +vulkan_tests_state_pool_padding_LDADD = $(VULKAN_TEST_LDADD) + vulkan_tests_state_pool_CFLAGS = $(VULKAN_CFLAGS) vulkan_tests_state_pool_CPPFLAGS = $(VULKAN_CPPFLAGS) vulkan_tests_state_pool_LDADD = $(VULKAN_TEST_LDADD) From 131f12d49fe7055c6626dde44d900738bfb5b627 Mon Sep 17 00:00:00 2001 From: Dylan Baker Date: Tue, 5 Feb 2019 11:49:03 -0800 Subject: [PATCH 019/378] Version: Bump for rc2 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index e17116a15f1..d28078a87d4 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -19.0.0-rc1 +19.0.0-rc2 From 452f9b9984d67ccac15b9532dd7f73493611824f Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Tue, 5 Feb 2019 14:08:12 -0500 Subject: [PATCH 020/378] freedreno: a2xx: fix fast clear Fixes: 912a9c8d Signed-off-by: Jonathan Marek Cc: 19.0 (cherry picked from commit 3361305f570505e0131c570041779496d0b9c663) --- src/gallium/drivers/freedreno/a2xx/fd2_draw.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c index 772127c7478..498c1eae1d7 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c @@ -339,7 +339,6 @@ clear_fast(struct fd_batch *batch, struct fd_ringbuffer *ring, 
OUT_PKT3(ring, CP_SET_CONSTANT, 2); OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_SCREEN_SCISSOR_BR)); OUT_RINGP(ring, patch_type, &batch->gmem_patches); - OUT_RING(ring, 0); OUT_PKT3(ring, CP_SET_CONSTANT, 4); OUT_RING(ring, CP_REG(REG_A2XX_RB_SURFACE_INFO)); From 262fd16b993de6a0248da0d69f14fefa732753d3 Mon Sep 17 00:00:00 2001 From: Eric Engestrom Date: Wed, 6 Feb 2019 16:21:08 +0000 Subject: [PATCH 021/378] xvmc: fix string comparison MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes: c7b65dcaffeb9d0760c8 "xvmc: Define some Xv attribs to allow users to specify color standard and procamp" Cc: Christian König Signed-off-by: Eric Engestrom (cherry picked from commit 110a6e1839bcf31e3592389ad55a7ba07b551965) --- src/gallium/state_trackers/xvmc/attributes.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/gallium/state_trackers/xvmc/attributes.c b/src/gallium/state_trackers/xvmc/attributes.c index 375705669b0..6e4d78a9a29 100644 --- a/src/gallium/state_trackers/xvmc/attributes.c +++ b/src/gallium/state_trackers/xvmc/attributes.c @@ -90,15 +90,15 @@ Status XvMCSetAttribute(Display *dpy, XvMCContext *context, Atom attribute, int if (!attr) return XvMCBadContext; - if (strcmp(attr, XV_BRIGHTNESS)) + if (strcmp(attr, XV_BRIGHTNESS) == 0) context_priv->procamp.brightness = value / 1000.0f; - else if (strcmp(attr, XV_CONTRAST)) + else if (strcmp(attr, XV_CONTRAST) == 0) context_priv->procamp.contrast = value / 1000.0f + 1.0f; - else if (strcmp(attr, XV_SATURATION)) + else if (strcmp(attr, XV_SATURATION) == 0) context_priv->procamp.saturation = value / 1000.0f + 1.0f; - else if (strcmp(attr, XV_HUE)) + else if (strcmp(attr, XV_HUE) == 0) context_priv->procamp.hue = value / 1000.0f; - else if (strcmp(attr, XV_COLORSPACE)) + else if (strcmp(attr, XV_COLORSPACE) == 0) context_priv->color_standard = value ? 
VL_CSC_COLOR_STANDARD_BT_601 : VL_CSC_COLOR_STANDARD_BT_709; @@ -134,15 +134,15 @@ Status XvMCGetAttribute(Display *dpy, XvMCContext *context, Atom attribute, int if (!attr) return XvMCBadContext; - if (strcmp(attr, XV_BRIGHTNESS)) + if (strcmp(attr, XV_BRIGHTNESS) == 0) *value = context_priv->procamp.brightness * 1000; - else if (strcmp(attr, XV_CONTRAST)) + else if (strcmp(attr, XV_CONTRAST) == 0) *value = context_priv->procamp.contrast * 1000 - 1000; - else if (strcmp(attr, XV_SATURATION)) + else if (strcmp(attr, XV_SATURATION) == 0) *value = context_priv->procamp.saturation * 1000 + 1000; - else if (strcmp(attr, XV_HUE)) + else if (strcmp(attr, XV_HUE) == 0) *value = context_priv->procamp.hue * 1000; - else if (strcmp(attr, XV_COLORSPACE)) + else if (strcmp(attr, XV_COLORSPACE) == 0) *value = context_priv->color_standard == VL_CSC_COLOR_STANDARD_BT_709; else return BadName; From dbc43e389777cc80ad5846fc2a57408646859f13 Mon Sep 17 00:00:00 2001 From: Eric Engestrom Date: Wed, 6 Feb 2019 16:28:12 +0000 Subject: [PATCH 022/378] xvmc: fix string comparison Fixes: 6fca18696d0e6a243f6f "g3dvl: Update XvMC unit tests." 
Cc: Younes Manton Signed-off-by: Eric Engestrom (cherry picked from commit 40b53a72033a601ab474c5f8e27eb5ca2c8bad6c) --- src/gallium/state_trackers/xvmc/tests/xvmc_bench.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gallium/state_trackers/xvmc/tests/xvmc_bench.c b/src/gallium/state_trackers/xvmc/tests/xvmc_bench.c index 3cd23173c7c..dbd705639f6 100644 --- a/src/gallium/state_trackers/xvmc/tests/xvmc_bench.c +++ b/src/gallium/state_trackers/xvmc/tests/xvmc_bench.c @@ -123,11 +123,11 @@ void ParseArgs(int argc, char **argv, struct Config *config) while (token && !fail) { - if (strcmp(token, "i")) + if (strcmp(token, "i") == 0) config->mb_types |= MB_TYPE_I; - else if (strcmp(token, "p")) + else if (strcmp(token, "p") == 0) config->mb_types |= MB_TYPE_P; - else if (strcmp(token, "b")) + else if (strcmp(token, "b") == 0) config->mb_types |= MB_TYPE_B; else fail = 1; From 7254d2f4a3f66d68e502e0b909eed8d34f5a6483 Mon Sep 17 00:00:00 2001 From: Bas Nieuwenhuizen Date: Wed, 23 Jan 2019 22:41:46 +0100 Subject: [PATCH 023/378] radv: Fix the shader info pass for not having the variable. For example with VK_EXT_buffer_device_address or VK_KHR_variable_pointers. 
Fixes: a2b5cc3c399 "radv: enable variable pointers" Reviewed-by: Samuel Pitoiset (cherry picked from commit 00253ab2c4983fc300e3c8d21629b69257995bcf) --- src/amd/vulkan/radv_shader_info.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c index 7e5a3789af2..e17b0e54e69 100644 --- a/src/amd/vulkan/radv_shader_info.c +++ b/src/amd/vulkan/radv_shader_info.c @@ -101,7 +101,7 @@ gather_intrinsic_load_deref_info(const nir_shader *nir, case MESA_SHADER_VERTEX: { nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); - if (var->data.mode == nir_var_shader_in) { + if (var && var->data.mode == nir_var_shader_in) { unsigned idx = var->data.location; uint8_t mask = nir_ssa_def_components_read(&instr->dest.ssa); @@ -150,7 +150,7 @@ gather_intrinsic_store_deref_info(const nir_shader *nir, { nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); - if (var->data.mode == nir_var_shader_out) { + if (var && var->data.mode == nir_var_shader_out) { unsigned idx = var->data.location; switch (nir->info.stage) { From ef6809ba8852df765fd06425c68db251141b7fc3 Mon Sep 17 00:00:00 2001 From: Bas Nieuwenhuizen Date: Wed, 23 Jan 2019 22:50:33 +0100 Subject: [PATCH 024/378] amd/common: Fix stores to derefs with unknown variable. 
Fixes: a2b5cc3c399 "radv: enable variable pointers" Reviewed-by: Samuel Pitoiset (cherry picked from commit dbdb44d5756cb98e15c40d0abf9efd4a7f250895) --- src/amd/common/ac_nir_to_llvm.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index efd3e260af1..73ac6e05a2d 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -2006,18 +2006,23 @@ static void visit_store_var(struct ac_nir_context *ctx, nir_intrinsic_instr *instr) { - nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); + nir_deref_instr *deref = nir_instr_as_deref(instr->src[0].ssa->parent_instr); + nir_variable *var = nir_deref_instr_get_variable(deref); LLVMValueRef temp_ptr, value; - int idx = var->data.driver_location; - unsigned comp = var->data.location_frac; + int idx = 0; + unsigned comp = 0; LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[1])); int writemask = instr->const_index[0]; LLVMValueRef indir_index; unsigned const_index; - get_deref_offset(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), false, - NULL, NULL, &const_index, &indir_index); + if (var) { + get_deref_offset(ctx, deref, false, + NULL, NULL, &const_index, &indir_index); + idx = var->data.driver_location; + comp = var->data.location_frac; + } if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src)) == 64) { @@ -2030,7 +2035,7 @@ visit_store_var(struct ac_nir_context *ctx, writemask = writemask << comp; - switch (var->data.mode) { + switch (deref->mode) { case nir_var_shader_out: if (ctx->stage == MESA_SHADER_TESS_CTRL) { @@ -2039,8 +2044,8 @@ visit_store_var(struct ac_nir_context *ctx, unsigned const_index = 0; const bool is_patch = var->data.patch; - get_deref_offset(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), - false, NULL, is_patch ? NULL : &vertex_index, + get_deref_offset(ctx, deref, false, NULL, + is_patch ? 
NULL : &vertex_index, &const_index, &indir_index); ctx->abi->store_tcs_outputs(ctx->abi, var, From b4e8a3294cb08ab22ba90895c3625d172addb92f Mon Sep 17 00:00:00 2001 From: Bas Nieuwenhuizen Date: Wed, 23 Jan 2019 01:53:59 +0100 Subject: [PATCH 025/378] amd/common: Add gep helper for pointer increment. Reviewed-by: Samuel Pitoiset (cherry picked from commit e00d9a9a728fe0c91bd295a5818fdb8303f321bf) --- src/amd/common/ac_llvm_build.c | 8 ++++++++ src/amd/common/ac_llvm_build.h | 5 +++++ 2 files changed, 13 insertions(+) diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 768364b2dc6..9aff2f8435d 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -923,6 +923,14 @@ ac_build_fs_interp_mov(struct ac_llvm_context *ctx, ctx->f32, args, 4, AC_FUNC_ATTR_READNONE); } +LLVMValueRef +ac_build_gep_ptr(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, + LLVMValueRef index) +{ + return LLVMBuildGEP(ctx->builder, base_ptr, &index, 1, ""); +} + LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h index e47893bbbe6..f218eaf2832 100644 --- a/src/amd/common/ac_llvm_build.h +++ b/src/amd/common/ac_llvm_build.h @@ -223,6 +223,11 @@ ac_build_fs_interp_mov(struct ac_llvm_context *ctx, LLVMValueRef attr_number, LLVMValueRef params); +LLVMValueRef +ac_build_gep_ptr(struct ac_llvm_context *ctx, + LLVMValueRef base_ptr, + LLVMValueRef index); + LLVMValueRef ac_build_gep0(struct ac_llvm_context *ctx, LLVMValueRef base_ptr, From 6f36d3bbc01def8f92a417ee9a1303509343625a Mon Sep 17 00:00:00 2001 From: Bas Nieuwenhuizen Date: Thu, 24 Jan 2019 01:21:28 +0100 Subject: [PATCH 026/378] amd/common: Handle nir_deref_type_ptr_as_array for shared memory. 
Fixes: a2b5cc3c399 "radv: enable variable pointers" Reviewed-by: Samuel Pitoiset (cherry picked from commit 830fd0efc1ae58d722d8efa4b95f708cf70b23ca) --- src/amd/common/ac_nir_to_llvm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 73ac6e05a2d..bc7623570a2 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -3844,6 +3844,10 @@ static void visit_deref(struct ac_nir_context *ctx, result = ac_build_gep0(&ctx->ac, get_src(ctx, instr->parent), get_src(ctx, instr->arr.index)); break; + case nir_deref_type_ptr_as_array: + result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), + get_src(ctx, instr->arr.index)); + break; case nir_deref_type_cast: result = get_src(ctx, instr->parent); break; From f880c74717d5e9b2f5a68efef7273f19377a7f6f Mon Sep 17 00:00:00 2001 From: Bas Nieuwenhuizen Date: Thu, 24 Jan 2019 01:25:50 +0100 Subject: [PATCH 027/378] amd/common: handle nir_deref_cast for shared memory from integers. Can happen e.g. after a phi. 
Fixes: a2b5cc3c399 "radv: enable variable pointers" Reviewed-by: Samuel Pitoiset (cherry picked from commit 8d1718590b643aea744748ae4eeb83e0c82aab0c) --- src/amd/common/ac_nir_to_llvm.c | 150 +++++++++++++++++--------------- 1 file changed, 82 insertions(+), 68 deletions(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index bc7623570a2..82ff5390352 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -3823,6 +3823,73 @@ static void visit_jump(struct ac_llvm_context *ctx, } } +static LLVMTypeRef +glsl_base_to_llvm_type(struct ac_llvm_context *ac, + enum glsl_base_type type) +{ + switch (type) { + case GLSL_TYPE_INT: + case GLSL_TYPE_UINT: + case GLSL_TYPE_BOOL: + case GLSL_TYPE_SUBROUTINE: + return ac->i32; + case GLSL_TYPE_INT16: + case GLSL_TYPE_UINT16: + return ac->i16; + case GLSL_TYPE_FLOAT: + return ac->f32; + case GLSL_TYPE_FLOAT16: + return ac->f16; + case GLSL_TYPE_INT64: + case GLSL_TYPE_UINT64: + return ac->i64; + case GLSL_TYPE_DOUBLE: + return ac->f64; + default: + unreachable("unknown GLSL type"); + } +} + +static LLVMTypeRef +glsl_to_llvm_type(struct ac_llvm_context *ac, + const struct glsl_type *type) +{ + if (glsl_type_is_scalar(type)) { + return glsl_base_to_llvm_type(ac, glsl_get_base_type(type)); + } + + if (glsl_type_is_vector(type)) { + return LLVMVectorType( + glsl_base_to_llvm_type(ac, glsl_get_base_type(type)), + glsl_get_vector_elements(type)); + } + + if (glsl_type_is_matrix(type)) { + return LLVMArrayType( + glsl_to_llvm_type(ac, glsl_get_column_type(type)), + glsl_get_matrix_columns(type)); + } + + if (glsl_type_is_array(type)) { + return LLVMArrayType( + glsl_to_llvm_type(ac, glsl_get_array_element(type)), + glsl_get_length(type)); + } + + assert(glsl_type_is_struct(type)); + + LLVMTypeRef member_types[glsl_get_length(type)]; + + for (unsigned i = 0; i < glsl_get_length(type); i++) { + member_types[i] = + glsl_to_llvm_type(ac, + glsl_get_struct_field(type, i)); + } + + 
return LLVMStructTypeInContext(ac->context, member_types, + glsl_get_length(type), false); +} + static void visit_deref(struct ac_nir_context *ctx, nir_deref_instr *instr) { @@ -3848,9 +3915,23 @@ static void visit_deref(struct ac_nir_context *ctx, result = ac_build_gep_ptr(&ctx->ac, get_src(ctx, instr->parent), get_src(ctx, instr->arr.index)); break; - case nir_deref_type_cast: + case nir_deref_type_cast: { result = get_src(ctx, instr->parent); + + LLVMTypeRef pointee_type = glsl_to_llvm_type(&ctx->ac, instr->type); + LLVMTypeRef type = LLVMPointerType(pointee_type, AC_ADDR_SPACE_LDS); + + if (LLVMTypeOf(result) != type) { + if (LLVMGetTypeKind(LLVMTypeOf(result)) == LLVMVectorTypeKind) { + result = LLVMBuildBitCast(ctx->ac.builder, result, + type, ""); + } else { + result = LLVMBuildIntToPtr(ctx->ac.builder, result, + type, ""); + } + } break; + } default: unreachable("Unhandled deref_instr deref type"); } @@ -3999,73 +4080,6 @@ ac_handle_shader_output_decl(struct ac_llvm_context *ctx, } } -static LLVMTypeRef -glsl_base_to_llvm_type(struct ac_llvm_context *ac, - enum glsl_base_type type) -{ - switch (type) { - case GLSL_TYPE_INT: - case GLSL_TYPE_UINT: - case GLSL_TYPE_BOOL: - case GLSL_TYPE_SUBROUTINE: - return ac->i32; - case GLSL_TYPE_INT16: - case GLSL_TYPE_UINT16: - return ac->i16; - case GLSL_TYPE_FLOAT: - return ac->f32; - case GLSL_TYPE_FLOAT16: - return ac->f16; - case GLSL_TYPE_INT64: - case GLSL_TYPE_UINT64: - return ac->i64; - case GLSL_TYPE_DOUBLE: - return ac->f64; - default: - unreachable("unknown GLSL type"); - } -} - -static LLVMTypeRef -glsl_to_llvm_type(struct ac_llvm_context *ac, - const struct glsl_type *type) -{ - if (glsl_type_is_scalar(type)) { - return glsl_base_to_llvm_type(ac, glsl_get_base_type(type)); - } - - if (glsl_type_is_vector(type)) { - return LLVMVectorType( - glsl_base_to_llvm_type(ac, glsl_get_base_type(type)), - glsl_get_vector_elements(type)); - } - - if (glsl_type_is_matrix(type)) { - return LLVMArrayType( - 
glsl_to_llvm_type(ac, glsl_get_column_type(type)), - glsl_get_matrix_columns(type)); - } - - if (glsl_type_is_array(type)) { - return LLVMArrayType( - glsl_to_llvm_type(ac, glsl_get_array_element(type)), - glsl_get_length(type)); - } - - assert(glsl_type_is_struct(type)); - - LLVMTypeRef member_types[glsl_get_length(type)]; - - for (unsigned i = 0; i < glsl_get_length(type); i++) { - member_types[i] = - glsl_to_llvm_type(ac, - glsl_get_struct_field(type, i)); - } - - return LLVMStructTypeInContext(ac->context, member_types, - glsl_get_length(type), false); -} - static void setup_locals(struct ac_nir_context *ctx, struct nir_function *func) From 94f0908216db0aa06fe49a53ecbb35840d855d8d Mon Sep 17 00:00:00 2001 From: "Kristian H. Kristensen" Date: Wed, 6 Feb 2019 13:47:32 -0800 Subject: [PATCH 028/378] freedreno/a6xx: Emit blitter dst with OUT_RELOCW We're writing to the bo and the kernel needs to know for fd_bo_cpu_prep() to work. Fixes: f93e43127252679b ("freedreno/a6xx: Enable blitter") Reviewed-by: Rob Clark Signed-off-by: Kristian H. 
Kristensen (cherry picked from commit 357ea7da51a2392eb1b7f464ff99cbe8e98378e2) --- src/gallium/drivers/freedreno/a6xx/fd6_blitter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c b/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c index 460255f748a..c8719636182 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c @@ -438,7 +438,7 @@ emit_blit_texture(struct fd_ringbuffer *ring, const struct pipe_blit_info *info) OUT_RING(ring, A6XX_RB_2D_DST_INFO_COLOR_FORMAT(dfmt) | A6XX_RB_2D_DST_INFO_TILE_MODE(dtile) | A6XX_RB_2D_DST_INFO_COLOR_SWAP(dswap)); - OUT_RELOC(ring, dst->bo, doff, 0, 0); /* RB_2D_DST_LO/HI */ + OUT_RELOCW(ring, dst->bo, doff, 0, 0); /* RB_2D_DST_LO/HI */ OUT_RING(ring, A6XX_RB_2D_DST_SIZE_PITCH(dpitch)); OUT_RING(ring, 0x00000000); OUT_RING(ring, 0x00000000); From 36d99d9ad0e13ca12e94d9dfaa510e2f3b0782c1 Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: Sat, 2 Feb 2019 02:56:48 -0500 Subject: [PATCH 029/378] nvc0/ir: fix second tex argument after levelZero optimization We used to pre-set a bunch of extra arguments to a texture instruction in order to force the RA to allocate a register at the boundary of 4. However with the levelZero optimization, which removes a LOD argument when it's uniformly equal to zero, we undid that logic by removing an extra argument. As a result, we could end up with insufficient alignment on the second wide texture argument. Instead we switch to a different method of achieving the same result. The logic runs during the constraint analysis of the RA, and adds unset sources as necessary right before being merged into a wide argument. Fixes MISALIGNED_REG errors in Hitman when run with bindless textures enabled on a GK208. 
Fixes: 9145873b152 ("nvc0/ir: use levelZero flag when the lod is set to 0") Signed-off-by: Ilia Mirkin Cc: 19.0 (cherry picked from commit 5de5beedf21306b01730085f8e03d8f424729016) --- .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 16 --------- .../drivers/nouveau/codegen/nv50_ir_ra.cpp | 33 ++++++++++++++----- 2 files changed, 24 insertions(+), 25 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index 295497be2f9..80a71ee8524 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -1063,22 +1063,6 @@ NVC0LoweringPass::handleTEX(TexInstruction *i) } } - if (chipset >= NVISA_GK104_CHIPSET) { - // - // If TEX requires more than 4 sources, the 2nd register tuple must be - // aligned to 4, even if it consists of just a single 4-byte register. - // - // XXX HACK: We insert 0 sources to avoid the 5 or 6 regs case. - // - int s = i->srcCount(0xff, true); - if (s > 4 && s < 7) { - if (i->srcExists(s)) // move potential predicate out of the way - i->moveSources(s, 7 - s); - while (s < 7) - i->setSrc(s++, bld.loadImm(NULL, 0)); - } - } - return true; } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp index f4379c137c5..f25bce00884 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp @@ -2341,9 +2341,19 @@ RegAlloc::InsertConstraintsPass::texConstraintGM107(TexInstruction *tex) if (!tex->tex.target.isArray() && tex->tex.useOffsets) s++; } - n = tex->srcCount(0xff) - s; + n = tex->srcCount(0xff, true) - s; + // TODO: Is this necessary? Perhaps just has to be aligned to the + // level that the first arg is, not necessarily to 4. This + // requirement has not been rigorously verified, as it has been on + // Kepler. 
+ if (n > 0 && n < 3) { + if (tex->srcExists(n + s)) // move potential predicate out of the way + tex->moveSources(n + s, 3 - n); + while (n < 3) + tex->setSrc(s + n++, new_LValue(func, FILE_GPR)); + } } else { - s = tex->srcCount(0xff); + s = tex->srcCount(0xff, true); n = 0; } @@ -2366,14 +2376,18 @@ RegAlloc::InsertConstraintsPass::texConstraintNVE0(TexInstruction *tex) } else if (isTextureOp(tex->op)) { int n = tex->srcCount(0xff, true); - if (n > 4) { - condenseSrcs(tex, 0, 3); - if (n > 5) // NOTE: first call modified positions already - condenseSrcs(tex, 4 - (4 - 1), n - 1 - (4 - 1)); - } else - if (n > 1) { - condenseSrcs(tex, 0, n - 1); + int s = n > 4 ? 4 : n; + if (n > 4 && n < 7) { + if (tex->srcExists(n)) // move potential predicate out of the way + tex->moveSources(n, 7 - n); + + while (n < 7) + tex->setSrc(n++, new_LValue(func, FILE_GPR)); } + if (s > 1) + condenseSrcs(tex, 0, s - 1); + if (n > 4) + condenseSrcs(tex, 1, n - s); } } @@ -2510,6 +2524,7 @@ RegAlloc::InsertConstraintsPass::insertConstraintMove(Instruction *cst, int s) assert(cst->getSrc(s)->defs.size() == 1); // still SSA Instruction *defi = cst->getSrc(s)->defs.front()->getInsn(); + bool imm = defi->op == OP_MOV && defi->src(0).getFile() == FILE_IMMEDIATE; bool load = defi->op == OP_LOAD && From 07e299a0a071c9880f3f7eb9e97a28a34bf16891 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Thu, 24 Jan 2019 14:37:16 -0800 Subject: [PATCH 030/378] nir: Silence zillions of unused parameter warnings in release builds Fixes: cd56d79b59f "nir: check NIR_SKIP to skip passes by name" Reviewed-by: Caio Marcelo de Oliveira Filho Reviewed-by: Timothy Arceri (cherry picked from commit 78169870e416fde51946f8295fa6e1c652305447) --- src/compiler/nir/nir.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index ff2c41faf27..8e0d285e2f2 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -2825,7 +2825,7 @@ should_print_nir(void) 
static inline void nir_validate_shader(nir_shader *shader, const char *when) { (void) shader; (void)when; } static inline void nir_metadata_set_validation_flag(nir_shader *shader) { (void) shader; } static inline void nir_metadata_check_validation_flag(nir_shader *shader) { (void) shader; } -static inline bool should_skip_nir(const char *pass_name) { return false; } +static inline bool should_skip_nir(UNUSED const char *pass_name) { return false; } static inline bool should_clone_nir(void) { return false; } static inline bool should_serialize_deserialize_nir(void) { return false; } static inline bool should_print_nir(void) { return false; } From ad2b712a56ef1ef85bf5b892092754cfa1727eae Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Sun, 10 Feb 2019 22:23:01 -0600 Subject: [PATCH 031/378] nir/deref: Rematerialize parents in rematerialize_derefs_in_use_blocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When nir_rematerialize_derefs_in_use_blocks_impl was first written, I attempted to optimize things a bit by not bothering to re-materialize the sources of deref instructions figuring that the final caller would take care of that. However, in the case of more complex deref chains where the first link or two lives in block A and then another link and the load/store_deref intrinsic live in block B it doesn't work. The code in rematerialize_deref_in_block looks at the tail of the chain, sees that it's already in block B and skips it, not realizing that part of the chain also lives in block A. The easy solution here is to just rematerialize deref sources of deref instructions as well. This may potentially lead to a few more deref instructions being created but the conditions required for that to actually happen are fairly unlikely and, thanks to the caching, it's all linear time regardless. 
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109603 Fixes: 7d1d1208c2b "nir: Add a small pass to rematerialize derefs per-block" Reviewed-by: Alejandro Piñeiro (cherry picked from commit 9e6a6ef0d45a5bb61a541c495fe12e54e646ecfe) --- src/compiler/nir/nir_deref.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/compiler/nir/nir_deref.c b/src/compiler/nir/nir_deref.c index 2f5fda643ca..1af45a45deb 100644 --- a/src/compiler/nir/nir_deref.c +++ b/src/compiler/nir/nir_deref.c @@ -574,10 +574,9 @@ nir_rematerialize_derefs_in_use_blocks_impl(nir_function_impl *impl) _mesa_hash_table_clear(state.cache, NULL); nir_foreach_instr_safe(instr, block) { - if (instr->type == nir_instr_type_deref) { - nir_deref_instr_remove_if_unused(nir_instr_as_deref(instr)); + if (instr->type == nir_instr_type_deref && + nir_deref_instr_remove_if_unused(nir_instr_as_deref(instr))) continue; - } state.builder.cursor = nir_before_instr(instr); nir_foreach_src(instr, rematerialize_deref_src, &state); From 61c22ba94bcad965cb397a1844cdf2b35aba2ba7 Mon Sep 17 00:00:00 2001 From: Dylan Baker Date: Mon, 11 Feb 2019 09:59:12 -0800 Subject: [PATCH 032/378] cherry-ignore: Add some patches --- bin/.cherry-ignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 bin/.cherry-ignore diff --git a/bin/.cherry-ignore b/bin/.cherry-ignore new file mode 100644 index 00000000000..4b8f048ed65 --- /dev/null +++ b/bin/.cherry-ignore @@ -0,0 +1,3 @@ +# Both of these were already merged with different shas +da48cba61ef6fefb799bf96e6364b70dbf4ec712 +c812c740e60c14060eb89db66039111881a0f42f \ No newline at end of file From f59c77ef8c1c524e8bed319432351d993a605189 Mon Sep 17 00:00:00 2001 From: Dylan Baker Date: Mon, 11 Feb 2019 16:26:01 -0800 Subject: [PATCH 033/378] Revert "intel/compiler: More peephole_select for pre-Gen6" This reverts commit af07141b33d0a58ed2cfe915b95f146481a4ffef. 
--- src/intel/compiler/brw_nir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index 9dbf06004a4..50416914f99 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -591,8 +591,8 @@ brw_nir_optimize(nir_shader *nir, const struct brw_compiler *compiler, (nir->info.stage == MESA_SHADER_TESS_CTRL || nir->info.stage == MESA_SHADER_TESS_EVAL); OPT(nir_opt_peephole_select, 0, !is_vec4_tessellation, false); - OPT(nir_opt_peephole_select, 1, !is_vec4_tessellation, - compiler->devinfo->gen >= 6); + if (compiler->devinfo->gen >= 6) + OPT(nir_opt_peephole_select, 1, !is_vec4_tessellation, true); OPT(nir_opt_intrinsics); OPT(nir_opt_idiv_const, 32); From 9dd433dfa72b9ce95881d97a3184c6db5e8b0629 Mon Sep 17 00:00:00 2001 From: Dylan Baker Date: Mon, 11 Feb 2019 16:26:20 -0800 Subject: [PATCH 034/378] Revert "nir/opt_peephole_select: Don't peephole_select expensive math instructions" This reverts commit 378f9967710e9145f2a4f8eee89d87badbe0e6ea. This also removes the default true argument from the a2xx nir backend, which was introduced after this commit. There should be no change in functionality. 
--- src/amd/vulkan/radv_shader.c | 2 +- src/broadcom/compiler/nir_to_vir.c | 2 +- src/compiler/nir/nir.h | 2 +- src/compiler/nir/nir_opt_peephole_select.c | 39 ++++---------------- src/freedreno/ir3/ir3_nir.c | 2 +- src/gallium/drivers/freedreno/a2xx/ir2_nir.c | 2 +- src/gallium/drivers/radeonsi/si_shader_nir.c | 2 +- src/gallium/drivers/vc4/vc4_program.c | 2 +- src/intel/compiler/brw_nir.c | 4 +- src/mesa/state_tracker/st_glsl_to_nir.cpp | 2 +- 10 files changed, 18 insertions(+), 41 deletions(-) diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 32cd9ae25e9..2b45576bd41 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -159,7 +159,7 @@ radv_optimize_nir(struct nir_shader *shader, bool optimize_conservatively, NIR_PASS(progress, shader, nir_opt_if); NIR_PASS(progress, shader, nir_opt_dead_cf); NIR_PASS(progress, shader, nir_opt_cse); - NIR_PASS(progress, shader, nir_opt_peephole_select, 8, true, true); + NIR_PASS(progress, shader, nir_opt_peephole_select, 8, true); NIR_PASS(progress, shader, nir_opt_algebraic); NIR_PASS(progress, shader, nir_opt_constant_folding); NIR_PASS(progress, shader, nir_opt_undef); diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index a5e75f650e8..a3ff2b0e8b4 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -1455,7 +1455,7 @@ v3d_optimize_nir(struct nir_shader *s) NIR_PASS(progress, s, nir_opt_dce); NIR_PASS(progress, s, nir_opt_dead_cf); NIR_PASS(progress, s, nir_opt_cse); - NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true); + NIR_PASS(progress, s, nir_opt_peephole_select, 8, true); NIR_PASS(progress, s, nir_opt_algebraic); NIR_PASS(progress, s, nir_opt_constant_folding); NIR_PASS(progress, s, nir_opt_undef); diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 8e0d285e2f2..c4c57c391de 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -3316,7 +3316,7 @@ bool 
nir_opt_move_comparisons(nir_shader *shader); bool nir_opt_move_load_ubo(nir_shader *shader); bool nir_opt_peephole_select(nir_shader *shader, unsigned limit, - bool indirect_load_ok, bool expensive_alu_ok); + bool indirect_load_ok); bool nir_opt_remove_phis(nir_shader *shader); diff --git a/src/compiler/nir/nir_opt_peephole_select.c b/src/compiler/nir/nir_opt_peephole_select.c index 1deb02a380e..32d337f99dd 100644 --- a/src/compiler/nir/nir_opt_peephole_select.c +++ b/src/compiler/nir/nir_opt_peephole_select.c @@ -59,8 +59,7 @@ static bool block_check_for_allowed_instrs(nir_block *block, unsigned *count, - bool alu_ok, bool indirect_load_ok, - bool expensive_alu_ok) + bool alu_ok, bool indirect_load_ok) { nir_foreach_instr(instr, block) { switch (instr->type) { @@ -118,25 +117,6 @@ block_check_for_allowed_instrs(nir_block *block, unsigned *count, case nir_op_vec3: case nir_op_vec4: break; - - case nir_op_fcos: - case nir_op_fdiv: - case nir_op_fexp2: - case nir_op_flog2: - case nir_op_fmod: - case nir_op_fpow: - case nir_op_frcp: - case nir_op_frem: - case nir_op_frsq: - case nir_op_fsin: - case nir_op_idiv: - case nir_op_irem: - case nir_op_udiv: - if (!alu_ok || !expensive_alu_ok) - return false; - - break; - default: if (!alu_ok) { /* It must be a move-like operation. */ @@ -180,8 +160,7 @@ block_check_for_allowed_instrs(nir_block *block, unsigned *count, static bool nir_opt_peephole_select_block(nir_block *block, nir_shader *shader, - unsigned limit, bool indirect_load_ok, - bool expensive_alu_ok) + unsigned limit, bool indirect_load_ok) { if (nir_cf_node_is_first(&block->cf_node)) return false; @@ -202,9 +181,9 @@ nir_opt_peephole_select_block(nir_block *block, nir_shader *shader, /* ... and those blocks must only contain "allowed" instructions. 
*/ unsigned count = 0; if (!block_check_for_allowed_instrs(then_block, &count, limit != 0, - indirect_load_ok, expensive_alu_ok) || + indirect_load_ok) || !block_check_for_allowed_instrs(else_block, &count, limit != 0, - indirect_load_ok, expensive_alu_ok)) + indirect_load_ok)) return false; if (count > limit) @@ -271,15 +250,14 @@ nir_opt_peephole_select_block(nir_block *block, nir_shader *shader, static bool nir_opt_peephole_select_impl(nir_function_impl *impl, unsigned limit, - bool indirect_load_ok, bool expensive_alu_ok) + bool indirect_load_ok) { nir_shader *shader = impl->function->shader; bool progress = false; nir_foreach_block_safe(block, impl) { progress |= nir_opt_peephole_select_block(block, shader, limit, - indirect_load_ok, - expensive_alu_ok); + indirect_load_ok); } if (progress) { @@ -295,15 +273,14 @@ nir_opt_peephole_select_impl(nir_function_impl *impl, unsigned limit, bool nir_opt_peephole_select(nir_shader *shader, unsigned limit, - bool indirect_load_ok, bool expensive_alu_ok) + bool indirect_load_ok) { bool progress = false; nir_foreach_function(function, shader) { if (function->impl) progress |= nir_opt_peephole_select_impl(function->impl, limit, - indirect_load_ok, - expensive_alu_ok); + indirect_load_ok); } return progress; diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c index d9fcf798b3d..68926c9553b 100644 --- a/src/freedreno/ir3/ir3_nir.c +++ b/src/freedreno/ir3/ir3_nir.c @@ -97,7 +97,7 @@ ir3_optimize_loop(nir_shader *s) progress |= OPT(s, nir_opt_gcm, true); else if (gcm == 2) progress |= OPT(s, nir_opt_gcm, false); - progress |= OPT(s, nir_opt_peephole_select, 16, true, true); + progress |= OPT(s, nir_opt_peephole_select, 16, true); progress |= OPT(s, nir_opt_intrinsics); progress |= OPT(s, nir_opt_algebraic); progress |= OPT(s, nir_opt_constant_folding); diff --git a/src/gallium/drivers/freedreno/a2xx/ir2_nir.c b/src/gallium/drivers/freedreno/a2xx/ir2_nir.c index 5d92f86befc..b206911270a 100644 --- 
a/src/gallium/drivers/freedreno/a2xx/ir2_nir.c +++ b/src/gallium/drivers/freedreno/a2xx/ir2_nir.c @@ -74,7 +74,7 @@ ir2_optimize_loop(nir_shader *s) progress |= OPT(s, nir_opt_dce); progress |= OPT(s, nir_opt_cse); /* progress |= OPT(s, nir_opt_gcm, true); */ - progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true); + progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true); progress |= OPT(s, nir_opt_intrinsics); progress |= OPT(s, nir_opt_algebraic); progress |= OPT(s, nir_opt_constant_folding); diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index 7554f5b9f8b..d7618b46eb0 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -834,7 +834,7 @@ si_lower_nir(struct si_shader_selector* sel) NIR_PASS(progress, sel->nir, nir_opt_if); NIR_PASS(progress, sel->nir, nir_opt_dead_cf); NIR_PASS(progress, sel->nir, nir_opt_cse); - NIR_PASS(progress, sel->nir, nir_opt_peephole_select, 8, true, true); + NIR_PASS(progress, sel->nir, nir_opt_peephole_select, 8, true); /* Needed for algebraic lowering */ NIR_PASS(progress, sel->nir, nir_opt_algebraic); diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index 2d0a52bb5fb..8f1e561c444 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -1591,7 +1591,7 @@ vc4_optimize_nir(struct nir_shader *s) NIR_PASS(progress, s, nir_opt_dce); NIR_PASS(progress, s, nir_opt_dead_cf); NIR_PASS(progress, s, nir_opt_cse); - NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true); + NIR_PASS(progress, s, nir_opt_peephole_select, 8, true); NIR_PASS(progress, s, nir_opt_algebraic); NIR_PASS(progress, s, nir_opt_constant_folding); NIR_PASS(progress, s, nir_opt_undef); diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index 50416914f99..d7243f35b3d 100644 --- a/src/intel/compiler/brw_nir.c +++ 
b/src/intel/compiler/brw_nir.c @@ -590,9 +590,9 @@ brw_nir_optimize(nir_shader *nir, const struct brw_compiler *compiler, const bool is_vec4_tessellation = !is_scalar && (nir->info.stage == MESA_SHADER_TESS_CTRL || nir->info.stage == MESA_SHADER_TESS_EVAL); - OPT(nir_opt_peephole_select, 0, !is_vec4_tessellation, false); + OPT(nir_opt_peephole_select, 0, !is_vec4_tessellation); if (compiler->devinfo->gen >= 6) - OPT(nir_opt_peephole_select, 1, !is_vec4_tessellation, true); + OPT(nir_opt_peephole_select, 1, !is_vec4_tessellation); OPT(nir_opt_intrinsics); OPT(nir_opt_idiv_const, 32); diff --git a/src/mesa/state_tracker/st_glsl_to_nir.cpp b/src/mesa/state_tracker/st_glsl_to_nir.cpp index d7f2e3e6eaa..a05ec0fa586 100644 --- a/src/mesa/state_tracker/st_glsl_to_nir.cpp +++ b/src/mesa/state_tracker/st_glsl_to_nir.cpp @@ -327,7 +327,7 @@ st_nir_opts(nir_shader *nir, bool scalar) NIR_PASS(progress, nir, nir_opt_if); NIR_PASS(progress, nir, nir_opt_dead_cf); NIR_PASS(progress, nir, nir_opt_cse); - NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true); + NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true); NIR_PASS(progress, nir, nir_opt_algebraic); NIR_PASS(progress, nir, nir_opt_constant_folding); From ca36eb12fdfb29c19379c666d24fd8c2d75064cd Mon Sep 17 00:00:00 2001 From: Dylan Baker Date: Mon, 11 Feb 2019 16:26:37 -0800 Subject: [PATCH 035/378] Revert "intel/compiler: More peephole select" This reverts commit 8fb8ebfbb05d3323481c8ba6d320b3a3580bad99. 
--- src/intel/compiler/brw_nir.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index d7243f35b3d..90fe7e7c85d 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -570,18 +570,7 @@ brw_nir_optimize(nir_shader *nir, const struct brw_compiler *compiler, OPT(nir_opt_dce); OPT(nir_opt_cse); - /* Passing 0 to the peephole select pass causes it to convert - * if-statements that contain only move instructions in the branches - * regardless of the count. - * - * Passing 1 to the peephole select pass causes it to convert - * if-statements that contain at most a single ALU instruction (total) - * in both branches. Before Gen6, some math instructions were - * prohibitively expensive and the results of compare operations need an - * extra resolve step. For these reasons, this pass is more harmful - * than good on those platforms. - * - * For indirect loads of uniforms (push constants), we assume that array + /* For indirect loads of uniforms (push constants), we assume that array * indices will nearly always be in bounds and the cost of the load is * low. Therefore there shouldn't be a performance benefit to avoid it. 
* However, in vec4 tessellation shaders, these loads operate by @@ -591,8 +580,6 @@ brw_nir_optimize(nir_shader *nir, const struct brw_compiler *compiler, (nir->info.stage == MESA_SHADER_TESS_CTRL || nir->info.stage == MESA_SHADER_TESS_EVAL); OPT(nir_opt_peephole_select, 0, !is_vec4_tessellation); - if (compiler->devinfo->gen >= 6) - OPT(nir_opt_peephole_select, 1, !is_vec4_tessellation); OPT(nir_opt_intrinsics); OPT(nir_opt_idiv_const, 32); From 56a47e342168e22353bc200f3498efb545bc6c07 Mon Sep 17 00:00:00 2001 From: Dylan Baker Date: Tue, 12 Feb 2019 12:39:36 -0800 Subject: [PATCH 036/378] Bump version for 19.0-rc3 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index d28078a87d4..07977171c97 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -19.0.0-rc2 +19.0.0-rc3 From e9dc4e252f85e84038639f9e080e72e1effb50db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 23 Jan 2019 19:48:26 -0500 Subject: [PATCH 037/378] meson: drop the xcb-xrandr version requirement autotools doesn't have any requirement. This fixes meson on Ubuntu 16.04. 
Cc: 18.3 19.0 Reviewed-by: Erik Faye-Lund (cherry picked from commit 1e85cfb91a08565f28f51cc19573324722765823) --- meson.build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meson.build b/meson.build index d975b0dbf4b..bfff862c3c8 100644 --- a/meson.build +++ b/meson.build @@ -1400,7 +1400,7 @@ if with_platform_x11 dep_xcb_xfixes = dependency('xcb-xfixes') endif if with_xlib_lease - dep_xcb_xrandr = dependency('xcb-randr', version : '>= 1.12') + dep_xcb_xrandr = dependency('xcb-randr') dep_xlib_xrandr = dependency('xrandr', version : '>= 1.3') endif endif From 5e85df1cfd53811149c99be446a1e8e103f377fa Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: Mon, 4 Feb 2019 23:33:09 -0500 Subject: [PATCH 038/378] nv50,nvc0: add explicit settings for recent caps Signed-off-by: Ilia Mirkin Cc: 19.0 (cherry picked from commit 38f542783faa360020b77fdd76b97f207a9e0068) --- src/gallium/drivers/nouveau/nv50/nv50_screen.c | 2 ++ src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c index 0c53b22eb3c..85cb016e3c2 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c @@ -215,6 +215,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TGSI_CLOCK: case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX: case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION: + case PIPE_CAP_DEST_SURFACE_SRGB_CONTROL: return 1; case PIPE_CAP_SEAMLESS_CUBE_MAP: return 1; /* class_3d >= NVA0_3D_CLASS; */ @@ -312,6 +313,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TGSI_ATOMFADD: case PIPE_CAP_QUERY_PIPELINE_STATISTICS_SINGLE: case PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND: + case PIPE_CAP_GLSL_TESS_LEVELS_AS_INPUTS: return 0; case PIPE_CAP_VENDOR_ID: diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c 
b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index 68b5869276a..216fba49d9e 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -266,6 +266,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX: case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION: case PIPE_CAP_QUERY_SO_OVERFLOW: + case PIPE_CAP_DEST_SURFACE_SRGB_CONTROL: return 1; case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: return nouveau_screen(pscreen)->vram_domain & NOUVEAU_BO_VRAM ? 1 : 0; @@ -336,6 +337,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_SURFACE_SAMPLE_COUNT: case PIPE_CAP_QUERY_PIPELINE_STATISTICS_SINGLE: case PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND: + case PIPE_CAP_GLSL_TESS_LEVELS_AS_INPUTS: return 0; case PIPE_CAP_VENDOR_ID: From 92fa6d695935e9755e6f5cdf00138750cf164aa7 Mon Sep 17 00:00:00 2001 From: Bart Oldeman Date: Sun, 3 Feb 2019 02:57:10 +0000 Subject: [PATCH 039/378] gallium-xlib: query MIT-SHM before using it. When Mesa is compiled for gallium-xlib using e.g. ./configure --enable-glx=gallium-xlib --disable-dri --disable-gbm --disable-egl and is used by an X server (usually remotely via SSH X11 forwarding) that does not support MIT-SHM such as XMing or MobaXterm, OpenGL clients report error messages such as Xlib: extension "MIT-SHM" missing on display "localhost:11.0". ad infinitum. The reason is that the code in src/gallium/winsys/sw/xlib uses MIT-SHM without checking for its existence, unlike the code in src/glx/drisw_glx.c and src/mesa/drivers/x11/xm_api.c. I copied the same check using XQueryExtension, and tested with glxgears on MobaXterm.
This issue was reported before here: https://lists.freedesktop.org/archives/mesa-users/2016-July/001183.html Reviewed-by: Eric Anholt Reviewed-by: Emil Velikov Cc: (cherry picked from commit a203eaa4f4fb672b95426289b8dad3a8998f92d7) --- src/gallium/winsys/sw/xlib/xlib_sw_winsys.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c b/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c index 8753139107c..a4c1d50453b 100644 --- a/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c +++ b/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c @@ -396,6 +396,7 @@ xlib_displaytarget_create(struct sw_winsys *winsys, { struct xlib_displaytarget *xlib_dt; unsigned nblocksy, size; + int ignore; xlib_dt = CALLOC_STRUCT(xlib_displaytarget); if (!xlib_dt) @@ -410,7 +411,8 @@ xlib_displaytarget_create(struct sw_winsys *winsys, xlib_dt->stride = align(util_format_get_stride(format, width), alignment); size = xlib_dt->stride * nblocksy; - if (!debug_get_option_xlib_no_shm()) { + if (!debug_get_option_xlib_no_shm() && + XQueryExtension(xlib_dt->display, "MIT-SHM", &ignore, &ignore, &ignore)) { xlib_dt->data = alloc_shm(xlib_dt, size); if (xlib_dt->data) { xlib_dt->shm = True; From 37ade3a566585df82abeb3bcf661624b695b79ac Mon Sep 17 00:00:00 2001 From: Bas Nieuwenhuizen Date: Tue, 22 Jan 2019 22:29:26 +0100 Subject: [PATCH 040/378] radv: Only look at pImmutableSamples if the descriptor has a sampler. 
Equivalent of ANV patch c7f4a2867ce492d78c1f8e2870c0a593d280572d CC: Reviewed-by: Samuel Pitoiset (cherry picked from commit 39ab4e12f71a640b43403a110d3d85565b59d16c) --- src/amd/vulkan/radv_descriptor_set.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/amd/vulkan/radv_descriptor_set.c b/src/amd/vulkan/radv_descriptor_set.c index cebe06aa078..e47ae6ad67a 100644 --- a/src/amd/vulkan/radv_descriptor_set.c +++ b/src/amd/vulkan/radv_descriptor_set.c @@ -84,7 +84,9 @@ VkResult radv_CreateDescriptorSetLayout( uint32_t immutable_sampler_count = 0; for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) { max_binding = MAX2(max_binding, pCreateInfo->pBindings[j].binding); - if (pCreateInfo->pBindings[j].pImmutableSamplers) + if ((pCreateInfo->pBindings[j].descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER || + pCreateInfo->pBindings[j].descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) && + pCreateInfo->pBindings[j].pImmutableSamplers) immutable_sampler_count += pCreateInfo->pBindings[j].descriptorCount; } @@ -182,7 +184,9 @@ VkResult radv_CreateDescriptorSetLayout( set_layout->has_variable_descriptors = true; } - if (binding->pImmutableSamplers) { + if ((binding->descriptorType == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER || + binding->descriptorType == VK_DESCRIPTOR_TYPE_SAMPLER) && + binding->pImmutableSamplers) { set_layout->binding[b].immutable_samplers_offset = samplers_offset; set_layout->binding[b].immutable_samplers_equal = has_equal_immutable_samplers(binding->pImmutableSamplers, binding->descriptorCount); From a1ae60e9a30ec315323f3c770b8e53161ad98738 Mon Sep 17 00:00:00 2001 From: Bas Nieuwenhuizen Date: Wed, 23 Jan 2019 22:44:05 +0100 Subject: [PATCH 041/378] amd/common: Use correct writemask for shared memory stores. The check was for 1 bit being set, which is clearly not what we want. 
CC: Reviewed-by: Samuel Pitoiset (cherry picked from commit 3c24fc64c7a4e564d84843fab7db25963d574d99) --- src/amd/common/ac_nir_to_llvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 82ff5390352..b24f2d59fde 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -2112,7 +2112,7 @@ visit_store_var(struct ac_nir_context *ctx, int writemask = instr->const_index[0]; LLVMValueRef address = get_src(ctx, instr->src[0]); LLVMValueRef val = get_src(ctx, instr->src[1]); - if (util_is_power_of_two_nonzero(writemask)) { + if (writemask == (1u << ac_get_llvm_num_components(val)) - 1) { val = LLVMBuildBitCast( ctx->ac.builder, val, LLVMGetElementType(LLVMTypeOf(address)), ""); From eb766a259e18cd40e03d8e998e4cf088c0a36615 Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: Sat, 2 Feb 2019 16:23:50 -0500 Subject: [PATCH 042/378] nvc0: add support for handling indirect draws with attrib conversion The hardware does not natively support FIXED and DOUBLE formats. If those are used in an indirect draw, they have to be converted. Our conversion tries to be clever about only converting the data that's needed. However for indirect, that won't work. Given that DOUBLE or FIXED are highly unlikely to ever be used with indirect draws, read the indirect buffer on the CPU and issue draws directly. Fixes the failing dEQP-GLES31.functional.draw_indirect.random.* tests. 
Signed-off-by: Ilia Mirkin Cc: 19.0 (cherry picked from commit 399215eb7a0517463e5757c598d6cff6ae2301d0) --- .../drivers/nouveau/nvc0/nvc0_context.h | 1 + src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c | 5 +- .../drivers/nouveau/nvc0/nvc0_vbo_translate.c | 77 +++++++++++++++++++ 3 files changed, 82 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h index eb057bf2489..c1351062676 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h @@ -434,6 +434,7 @@ nvc0_video_buffer_create(struct pipe_context *pipe, /* nvc0_push.c */ void nvc0_push_vbo(struct nvc0_context *, const struct pipe_draw_info *); +void nvc0_push_vbo_indirect(struct nvc0_context *, const struct pipe_draw_info *); /* nve4_compute.c */ void nve4_launch_grid(struct pipe_context *, const struct pipe_grid_info *); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c index 3fbe7614e52..7d6be9382d1 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c @@ -1040,7 +1040,10 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) } if (nvc0->state.vbo_mode) { - nvc0_push_vbo(nvc0, info); + if (info->indirect) + nvc0_push_vbo_indirect(nvc0, info); + else + nvc0_push_vbo(nvc0, info); goto cleanup; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c index 256e20df2e4..4333fb26d23 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c @@ -466,6 +466,83 @@ nvc0_prim_gl(unsigned prim) } } +typedef struct { + uint32_t count; + uint32_t primCount; + uint32_t first; + uint32_t baseInstance; +} DrawArraysIndirectCommand; + +typedef struct { + uint32_t count; + uint32_t primCount; + uint32_t firstIndex; + int32_t 
baseVertex; + uint32_t baseInstance; +} DrawElementsIndirectCommand; + +void +nvc0_push_vbo_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info) +{ + /* The strategy here is to just read the commands from the indirect buffer + * and do the draws. This is suboptimal, but will only happen in the case + * that conversion is required for FIXED or DOUBLE inputs. + */ + struct nvc0_screen *screen = nvc0->screen; + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nv04_resource *buf = nv04_resource(info->indirect->buffer); + struct nv04_resource *buf_count = nv04_resource(info->indirect->indirect_draw_count); + unsigned i; + + unsigned draw_count = info->indirect->draw_count; + if (buf_count) { + uint32_t *count = nouveau_resource_map_offset( + &nvc0->base, buf_count, info->indirect->indirect_draw_count_offset, + NOUVEAU_BO_RD); + draw_count = *count; + } + + uint8_t *buf_data = nouveau_resource_map_offset( + &nvc0->base, buf, info->indirect->offset, NOUVEAU_BO_RD); + struct pipe_draw_info single = *info; + single.indirect = NULL; + for (i = 0; i < draw_count; i++, buf_data += info->indirect->stride) { + if (info->index_size) { + DrawElementsIndirectCommand *cmd = (void *)buf_data; + single.start = info->start + cmd->firstIndex; + single.count = cmd->count; + single.start_instance = cmd->baseInstance; + single.instance_count = cmd->primCount; + single.index_bias = cmd->baseVertex; + } else { + DrawArraysIndirectCommand *cmd = (void *)buf_data; + single.start = cmd->first; + single.count = cmd->count; + single.start_instance = cmd->baseInstance; + single.instance_count = cmd->primCount; + } + + if (nvc0->vertprog->vp.need_draw_parameters) { + PUSH_SPACE(push, 9); + BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); + PUSH_DATA (push, NVC0_CB_AUX_SIZE); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0)); + BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 3); + PUSH_DATA (push, 
NVC0_CB_AUX_DRAW_INFO); + PUSH_DATA (push, single.index_bias); + PUSH_DATA (push, single.start_instance); + PUSH_DATA (push, single.drawid + i); + } + + nvc0_push_vbo(nvc0, &single); + } + + nouveau_resource_unmap(buf); + if (buf_count) + nouveau_resource_unmap(buf_count); +} + void nvc0_push_vbo(struct nvc0_context *nvc0, const struct pipe_draw_info *info) { From f305135e0bde236d3e76a1aaff1279890dbb595a Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: Sun, 3 Feb 2019 10:06:24 -0500 Subject: [PATCH 043/378] nvc0/ir: always use CG mode for loads from atomic-only buffers Atomic operations don't update the local cache, which means that we would have to issue CCTL operations in order to get the updated values. When we know that a buffer is primarily used for atomic operations, it's easier to just avoid the caching at that level entirely. The same issue persists for non-atomic buffers, which will have to be fixed separately. Fixes the failing dEQP-GLES31.functional.atomic_counter.* tests. Signed-off-by: Ilia Mirkin Reviewed-by: Karol Herbst Cc: 19.0 (cherry picked from commit 4443b6ddf2e08d06f3d0457cf20a2e04244cde37) --- .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index afd7916a321..335e708c5cb 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -1087,6 +1087,8 @@ class Source }; std::vector memoryFiles; + std::vector bufferAtomics; + private: int inferSysValDirection(unsigned sn) const; bool scanDeclaration(const struct tgsi_full_declaration *); @@ -1137,6 +1139,7 @@ bool Source::scanSource() //resources.resize(scan.file_max[TGSI_FILE_RESOURCE] + 1); tempArrayId.resize(scan.file_max[TGSI_FILE_TEMPORARY] + 1); memoryFiles.resize(scan.file_max[TGSI_FILE_MEMORY] + 1); + 
bufferAtomics.resize(scan.file_max[TGSI_FILE_BUFFER] + 1); info->immd.bufSize = 0; @@ -1483,11 +1486,14 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl) tempArrayInfo.insert(std::make_pair(arrayId, std::make_pair( first, last - first + 1))); break; + case TGSI_FILE_BUFFER: + for (i = first; i <= last; ++i) + bufferAtomics[i] = decl->Declaration.Atomic; + break; case TGSI_FILE_ADDRESS: case TGSI_FILE_CONSTANT: case TGSI_FILE_IMMEDIATE: case TGSI_FILE_SAMPLER: - case TGSI_FILE_BUFFER: case TGSI_FILE_IMAGE: break; default: @@ -2720,7 +2726,11 @@ Converter::handleLOAD(Value *dst0[4]) } Instruction *ld = mkLoad(TYPE_U32, dst0[c], sym, off); - ld->cache = tgsi.getCacheMode(); + if (tgsi.getSrc(0).getFile() == TGSI_FILE_BUFFER && + code->bufferAtomics[r]) + ld->cache = nv50_ir::CACHE_CG; + else + ld->cache = tgsi.getCacheMode(); if (ind) ld->setIndirect(0, 1, ind); } From b9e5e15f877beb63af7020e145791cdbd77d9de9 Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: Sun, 3 Feb 2019 21:55:12 -0500 Subject: [PATCH 044/378] nvc0: fix 3d images on kepler Looks like SUBFM.3D and SUEAU are perfectly capable of dealing with 3d tiling, they just need the correct inputs. Supply them. We also have to deal with the case where a 2d "layer" of a 3d image is bound. In this case, we supply the z coordinate separately to the shader, which has to optionally treat every 2d case as if it could be a slice of a 3d texture. 
Signed-off-by: Ilia Mirkin Cc: 19.0 (cherry picked from commit 162352e6711b3ceab114686f7a3248074339e7f7) --- .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 46 +++++++++++-------- src/gallium/drivers/nouveau/nvc0/nvc0_tex.c | 23 ++++------ 2 files changed, 34 insertions(+), 35 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index 80a71ee8524..f95c4363beb 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -1871,7 +1871,8 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su) su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB; const int slot = su->tex.r; const int dim = su->tex.target.getDim(); - const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube()); + const bool array = su->tex.target.isArray() || su->tex.target.isCube(); + const int arg = dim + array; int c; Value *zero = bld.mkImm(0); Value *p1 = NULL; @@ -1880,6 +1881,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su) Value *bf, *eau, *off; Value *addr, *pred; Value *ind = su->getIndirectR(); + Value *y, *z; off = bld.getScratch(4); bf = bld.getScratch(4); @@ -1910,34 +1912,42 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su) for (; c < 3; ++c) src[c] = zero; + if (dim == 2 && !array) { + v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless); + src[2] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), + v, bld.loadImm(NULL, 16)); + + v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(2), su->tex.bindless); + bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[2], src[2], v, zero) + ->subOp = NV50_IR_SUBOP_SUCLAMP_SD(0, 2); + } + // set predicate output if (su->tex.target == TEX_TARGET_BUFFER) { src[0]->getInsn()->setFlagsDef(1, pred); } else - if (su->tex.target.isArray() || su->tex.target.isCube()) { + if (array) { p1 = bld.getSSA(1, FILE_PREDICATE); 
src[dim]->getInsn()->setFlagsDef(1, p1); } // calculate pixel offset if (dim == 1) { + y = z = zero; if (su->tex.target != TEX_TARGET_BUFFER) bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff)); - } else - if (dim == 3) { + } else { + y = src[1]; + z = src[2]; + v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless); bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1]) - ->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l + ->subOp = NV50_IR_SUBOP_MADSP(4,4,8); // u16l u16l u16l v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH, su->tex.bindless); bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0]) - ->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l - } else { - assert(dim == 2); - v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH, su->tex.bindless); - bld.mkOp3(OP_MADSP, TYPE_U32, off, src[1], v, src[0]) - ->subOp = (su->tex.target.isArray() || su->tex.target.isCube()) ? - NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l + ->subOp = array ? 
+ NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l } // calculate effective address part 1 @@ -1950,19 +1960,15 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su) ->subOp = NV50_IR_SUBOP_V1(7,6,8|2); } } else { - Value *y = src[1]; - Value *z = src[2]; uint16_t subOp = 0; switch (dim) { case 1: - y = zero; - z = zero; break; case 2: - z = off; - if (!su->tex.target.isArray() && !su->tex.target.isCube()) { - z = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless); + if (array) { + z = off; + } else { subOp = NV50_IR_SUBOP_SUBFM_3D; } break; @@ -1985,7 +1991,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su) eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v); } // add array layer offset - if (su->tex.target.isArray() || su->tex.target.isCube()) { + if (array) { v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY, su->tex.bindless); if (dim == 1) bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c index 04f0a0d55da..8820b5aac66 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c @@ -1051,21 +1051,13 @@ nve4_set_surface_info(struct nouveau_pushbuf *push, } else { struct nv50_miptree *mt = nv50_miptree(&res->base); struct nv50_miptree_level *lvl = &mt->level[view->u.tex.level]; - const unsigned z = view->u.tex.first_layer; - - if (z) { - if (mt->layout_3d) { - address += nvc0_mt_zslice_offset(mt, view->u.tex.level, z); - /* doesn't work if z passes z-tile boundary */ - if (depth > 1) { - pipe_debug_message(&nvc0->base.debug, CONFORMANCE, - "3D images are not really supported!"); - debug_printf("3D images are not really supported!\n"); - } - } else { - address += mt->layer_stride * z; - } + unsigned z = view->u.tex.first_layer; + + if (!mt->layout_3d) { + address += mt->layer_stride * z; + z = 0; } + address += lvl->offset; info[0] = address 
>> 8; @@ -1080,7 +1072,8 @@ nve4_set_surface_info(struct nouveau_pushbuf *push, info[6] = depth - 1; info[6] |= (lvl->tile_mode & 0xf00) << 21; info[6] |= NVC0_TILE_SHIFT_Z(lvl->tile_mode) << 22; - info[7] = 0; + info[7] = mt->layout_3d ? 1 : 0; + info[7] |= z << 16; info[14] = mt->ms_x; info[15] = mt->ms_y; } From 5a9b7bce9cee5563e94e75c93fffe462405dfcb1 Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: Mon, 4 Feb 2019 22:57:06 -0500 Subject: [PATCH 045/378] nv50,nvc0: use condition for occlusion queries when already complete For the NO_WAIT variants, we would jump into the ALWAYS case for both nested and inverted occlusion queries. However if the query had previously completed, the application could reasonably expect that the render condition would follow that result. To resolve this, we remove the nesting distinction which unnecessarily created an imbalance between the regular and inverted cases (since there's no "zero" condition mode). We also use the proper comparison if we know that the query has completed (which could happen as a result of an earlier get_query_result call). 
Fixes KHR-GL45.conditional_render_inverted.functional Signed-off-by: Ilia Mirkin Cc: 19.0 (cherry picked from commit e00799d3dc0595dc3998dbf199ceec8b1eece966) --- src/gallium/drivers/nouveau/nv50/nv50_query.c | 10 ++++------ src/gallium/drivers/nouveau/nv50/nv50_query_hw.c | 8 +------- src/gallium/drivers/nouveau/nv50/nv50_query_hw.h | 6 +++++- src/gallium/drivers/nouveau/nvc0/nvc0_query.c | 10 ++++------ src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c | 13 ++++++------- src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h | 6 +++++- 6 files changed, 25 insertions(+), 28 deletions(-) diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.c b/src/gallium/drivers/nouveau/nv50/nv50_query.c index e30380cd84d..13088ebb5fa 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_query.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_query.c @@ -98,12 +98,10 @@ nv50_render_condition(struct pipe_context *pipe, case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + if (hq->state == NV50_HW_QUERY_STATE_READY) + wait = true; if (likely(!condition)) { - if (unlikely(hq->nesting)) - cond = wait ? NV50_3D_COND_MODE_NOT_EQUAL : - NV50_3D_COND_MODE_ALWAYS; - else - cond = NV50_3D_COND_MODE_RES_NON_ZERO; + cond = wait ? NV50_3D_COND_MODE_NOT_EQUAL : NV50_3D_COND_MODE_ALWAYS; } else { cond = wait ? 
NV50_3D_COND_MODE_EQUAL : NV50_3D_COND_MODE_ALWAYS; } @@ -129,7 +127,7 @@ nv50_render_condition(struct pipe_context *pipe, PUSH_SPACE(push, 9); - if (wait) { + if (wait && hq->state != NV50_HW_QUERY_STATE_READY) { BEGIN_NV04(push, SUBC_3D(NV50_GRAPH_SERIALIZE), 1); PUSH_DATA (push, 0); } diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c index ac3e409b2d5..4e74c462235 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c @@ -29,11 +29,6 @@ #include "nv50/nv50_query_hw_sm.h" #include "nv_object.xml.h" -#define NV50_HW_QUERY_STATE_READY 0 -#define NV50_HW_QUERY_STATE_ACTIVE 1 -#define NV50_HW_QUERY_STATE_ENDED 2 -#define NV50_HW_QUERY_STATE_FLUSHED 3 - /* XXX: Nested queries, and simultaneous queries on multiple gallium contexts * (since we use only a single GPU channel per screen) will not work properly. * @@ -158,8 +153,7 @@ nv50_hw_begin_query(struct nv50_context *nv50, struct nv50_query *q) case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: - hq->nesting = nv50->screen->num_occlusion_queries_active++; - if (hq->nesting) { + if (nv50->screen->num_occlusion_queries_active++) { nv50_hw_query_get(push, q, 0x10, 0x0100f002); } else { PUSH_SPACE(push, 4); diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h index 82ec6bd2d96..a89a66cec4f 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h @@ -6,6 +6,11 @@ #include "nv50_query.h" +#define NV50_HW_QUERY_STATE_READY 0 +#define NV50_HW_QUERY_STATE_ACTIVE 1 +#define NV50_HW_QUERY_STATE_ENDED 2 +#define NV50_HW_QUERY_STATE_FLUSHED 3 + #define NVA0_HW_QUERY_STREAM_OUTPUT_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0) struct nv50_hw_query; @@ -29,7 +34,6 @@ struct nv50_hw_query { uint8_t state; bool is64bit; uint8_t 
rotate; - int nesting; /* only used for occlusion queries */ struct nouveau_mm_allocation *mm; struct nouveau_fence *fence; }; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c index 1a3e4e794c0..40af9936859 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c @@ -121,12 +121,10 @@ nvc0_render_condition(struct pipe_context *pipe, case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + if (hq->state == NVC0_HW_QUERY_STATE_READY) + wait = true; if (likely(!condition)) { - if (unlikely(hq->nesting)) - cond = wait ? NVC0_3D_COND_MODE_NOT_EQUAL : - NVC0_3D_COND_MODE_ALWAYS; - else - cond = NVC0_3D_COND_MODE_RES_NON_ZERO; + cond = wait ? NVC0_3D_COND_MODE_NOT_EQUAL : NVC0_3D_COND_MODE_ALWAYS; } else { cond = wait ? NVC0_3D_COND_MODE_EQUAL : NVC0_3D_COND_MODE_ALWAYS; } @@ -151,7 +149,7 @@ nvc0_render_condition(struct pipe_context *pipe, return; } - if (wait) + if (wait && hq->state != NVC0_HW_QUERY_STATE_READY) nvc0_hw_query_fifo_wait(nvc0, q); PUSH_SPACE(push, 10); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c index a420ed4ac0d..4e34216caf0 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c @@ -28,11 +28,6 @@ #include "nvc0/nvc0_query_hw_metric.h" #include "nvc0/nvc0_query_hw_sm.h" -#define NVC0_HW_QUERY_STATE_READY 0 -#define NVC0_HW_QUERY_STATE_ACTIVE 1 -#define NVC0_HW_QUERY_STATE_ENDED 2 -#define NVC0_HW_QUERY_STATE_FLUSHED 3 - #define NVC0_HW_QUERY_ALLOC_SPACE 256 bool @@ -158,14 +153,18 @@ nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q) case PIPE_QUERY_OCCLUSION_COUNTER: case PIPE_QUERY_OCCLUSION_PREDICATE: case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: - hq->nesting = nvc0->screen->num_occlusion_queries_active++; - if 
(hq->nesting) { + if (nvc0->screen->num_occlusion_queries_active++) { nvc0_hw_query_get(push, q, 0x10, 0x0100f002); } else { PUSH_SPACE(push, 3); BEGIN_NVC0(push, NVC0_3D(COUNTER_RESET), 1); PUSH_DATA (push, NVC0_3D_COUNTER_RESET_SAMPLECNT); IMMED_NVC0(push, NVC0_3D(SAMPLECNT_ENABLE), 1); + /* Given that the counter is reset, the contents at 0x10 are + * equivalent to doing the query -- we would get hq->sequence as the + * payload and 0 as the reported value. This is already set up above + * as in the hq->rotate case. + */ } break; case PIPE_QUERY_PRIMITIVES_GENERATED: diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h index 8225755d85e..5c8ad5eb2d0 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.h @@ -6,6 +6,11 @@ #include "nvc0_query.h" +#define NVC0_HW_QUERY_STATE_READY 0 +#define NVC0_HW_QUERY_STATE_ACTIVE 1 +#define NVC0_HW_QUERY_STATE_ENDED 2 +#define NVC0_HW_QUERY_STATE_FLUSHED 3 + #define NVC0_HW_QUERY_TFB_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0) struct nvc0_hw_query; @@ -29,7 +34,6 @@ struct nvc0_hw_query { uint8_t state; boolean is64bit; uint8_t rotate; - int nesting; /* only used for occlusion queries */ struct nouveau_mm_allocation *mm; struct nouveau_fence *fence; }; From d278b3c187d426fea5ded7e8d97022efc9e9d7e3 Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: Tue, 5 Feb 2019 03:05:33 -0500 Subject: [PATCH 046/378] nvc0: stick zero values for the compute invocation counts Not quite perfect, but at least we don't end up with random values in the query buffer. 
Fixes KHR-GL45.pipeline_statistics_query_tests_ARB.functional_default_qo_values Signed-off-by: Ilia Mirkin Cc: 19.0 (cherry picked from commit 6adb9b38bfb1f6ee4c94596bf0744225aa8e967a) --- src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c index 4e34216caf0..f6d5d0f5602 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c @@ -198,6 +198,7 @@ nvc0_hw_begin_query(struct nvc0_context *nvc0, struct nvc0_query *q) nvc0_hw_query_get(push, q, 0xc0 + 0x70, 0x0980a002); /* ROP, PIXELS */ nvc0_hw_query_get(push, q, 0xc0 + 0x80, 0x0d808002); /* TCP, LAUNCHES */ nvc0_hw_query_get(push, q, 0xc0 + 0x90, 0x0e809002); /* TEP, LAUNCHES */ + ((uint64_t *)hq->data)[(12 + 10) * 2] = 0; break; default: break; @@ -270,6 +271,7 @@ nvc0_hw_end_query(struct nvc0_context *nvc0, struct nvc0_query *q) nvc0_hw_query_get(push, q, 0x70, 0x0980a002); /* ROP, PIXELS */ nvc0_hw_query_get(push, q, 0x80, 0x0d808002); /* TCP, LAUNCHES */ nvc0_hw_query_get(push, q, 0x90, 0x0e809002); /* TEP, LAUNCHES */ + ((uint64_t *)hq->data)[10 * 2] = 0; break; case PIPE_QUERY_TIMESTAMP_DISJOINT: /* This query is not issued on GPU because disjoint is forced to false */ From a08aba86da19f1ab642272fb517a28919e482536 Mon Sep 17 00:00:00 2001 From: Boyan Ding Date: Thu, 9 Mar 2017 13:55:17 +0800 Subject: [PATCH 047/378] gk110/ir: Add rcp f64 implementation Signed-off-by: Boyan Ding Acked-by: Ilia Mirkin Cc: 19.0 (cherry picked from commit 04593d9a73ea257a36cc3b9fb5cd41427beaaea5) --- .../drivers/nouveau/codegen/lib/gk110.asm | 152 +++++++++++++++++- .../drivers/nouveau/codegen/lib/gk110.asm.h | 87 +++++++++- 2 files changed, 235 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm index b9c05a04b9a..c33dd2158c9 100644 
--- a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm +++ b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm @@ -83,11 +83,161 @@ gk110_div_s32: $p0 sub b32 $r1 $r1 $r2 $p0 add b32 $r0 $r0 0x1 $p3 cvt s32 $r0 neg s32 $r0 - sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c + sched 0x04 0x2e 0x28 0x04 0x28 0x28 0x28 $p2 cvt s32 $r1 neg s32 $r1 ret +// RCP F64 +// +// INPUT: $r0d +// OUTPUT: $r0d +// CLOBBER: $r2 - $r9, $p0 +// +// The core of RCP and RSQ implementation is Newton-Raphson step, which is +// used to find successively better approximation from an imprecise initial +// value (single precision rcp in RCP and rsqrt64h in RSQ). +// gk110_rcp_f64: + // Step 1: classify input according to exponent and value, and calculate + // result for 0/inf/nan. $r2 holds the exponent value, which starts at + // bit 52 (bit 20 of the upper half) and is 11 bits in length + ext u32 $r2 $r1 0xb14 + add b32 $r3 $r2 0xffffffff + joinat #rcp_rejoin + // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf, + // denorm, or 0). Do this by subtracting 1 from the exponent, which will + // mean that it's > 0x7fd in those cases when doing unsigned comparison + set b32 $p0 0x1 gt u32 $r3 0x7fd + // $r3: 0 for norms, 0x36 for denorms, -1 for others + mov b32 $r3 0x0 + sched 0x2f 0x04 0x2d 0x2b 0x2f 0x28 0x28 + join (not $p0) nop + // Process all special values: NaN, inf, denorm, 0 + mov b32 $r3 0xffffffff + // A number is NaN if its abs value is greater than or unordered with inf + set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000 + (not $p0) bra #rcp_inf_or_denorm_or_zero + // NaN -> NaN, the next line sets the "quiet" bit of the result.
This + // behavior is both seen on the CPU and the blob + join or b32 $r1 $r1 0x80000 +rcp_inf_or_denorm_or_zero: + and b32 $r4 $r1 0x7ff00000 + // Other values with nonzero in exponent field should be inf + set b32 $p0 0x1 eq s32 $r4 0x0 + sched 0x2b 0x04 0x2f 0x2d 0x2b 0x2f 0x20 + $p0 bra #rcp_denorm_or_zero + // +/-Inf -> +/-0 + xor b32 $r1 $r1 0x7ff00000 + join mov b32 $r0 0x0 +rcp_denorm_or_zero: + set $p0 0x1 gtu f64 abs $r0d 0x0 + $p0 bra #rcp_denorm + // +/-0 -> +/-Inf + join or b32 $r1 $r1 0x7ff00000 +rcp_denorm: + // non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms + mul rn f64 $r0d $r0d 0x4350000000000000 + sched 0x2f 0x28 0x2b 0x28 0x28 0x04 0x28 + join mov b32 $r3 0x36 +rcp_rejoin: + // All numbers with -1 in $r3 have their result ready in $r0d, return them + // others need further calculation + set b32 $p0 0x1 lt s32 $r3 0x0 + $p0 bra #rcp_end + // Step 2: Before the real calculation goes on, renormalize the values to + // range [1, 2) by setting exponent field to 0x3ff (the exponent of 1) + // result in $r6d. The exponent will be recovered later. 
+ ext u32 $r2 $r1 0xb14 + and b32 $r7 $r1 0x800fffff + add b32 $r7 $r7 0x3ff00000 + mov b32 $r6 $r0 + sched 0x2b 0x04 0x28 0x28 0x2a 0x2b 0x2e + // Step 3: Convert new value to float (no overflow will occur due to step + // 2), calculate rcp and do newton-raphson step once + cvt rz f32 $r5 f64 $r6d + rcp f32 $r4 $r5 + mov b32 $r0 0xbf800000 + fma rn f32 $r5 $r4 $r5 $r0 + fma rn f32 $r0 neg $r4 $r5 $r4 + // Step 4: convert result $r0 back to double, do newton-raphson steps + cvt f64 $r0d f32 $r0 + cvt f64 $r6d f64 neg $r6d + sched 0x2e 0x29 0x29 0x29 0x29 0x29 0x29 + cvt f64 $r8d f32 0x3f800000 + // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d + // The formula used here (and above) is: + // RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n} + // The following code uses 2 FMAs for each step, and it will basically + // look like: + // tmp = -src * RCP_{n} + 1 + // RCP_{n + 1} = RCP_{n} * tmp + RCP_{n} + fma rn f64 $r4d $r6d $r0d $r8d + fma rn f64 $r0d $r0d $r4d $r0d + fma rn f64 $r4d $r6d $r0d $r8d + fma rn f64 $r0d $r0d $r4d $r0d + fma rn f64 $r4d $r6d $r0d $r8d + fma rn f64 $r0d $r0d $r4d $r0d + sched 0x29 0x20 0x28 0x28 0x28 0x28 0x28 + fma rn f64 $r4d $r6d $r0d $r8d + fma rn f64 $r0d $r0d $r4d $r0d + // Step 5: Exponent recovery and final processing + // The exponent is recovered by adding what we added to the exponent. + // Suppose we want to calculate rcp(x), but we have rcp(cx), then + // rcp(x) = c * rcp(cx) + // The delta in exponent comes from two sources: + // 1) The renormalization in step 2. The delta is: + // 0x3ff - $r2 + // 2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored + // in $r3 + // These 2 sources are calculated in the first two lines below, and then + // added to the exponent extracted from the result above. + // Note that after processing, the new exponent may be >= 0x7ff (inf) + // or <= 0 (denorm).
Those cases will be handled respectively below + subr b32 $r2 $r2 0x3ff + add b32 $r4 $r2 $r3 + ext u32 $r3 $r1 0xb14 + // New exponent in $r3 + add b32 $r3 $r3 $r4 + add b32 $r2 $r3 0xffffffff + sched 0x28 0x2b 0x28 0x2b 0x28 0x28 0x2b + // (exponent-1) < 0x7fe (unsigned) means the result is in norm range + // (same logic as in step 1) + set b32 $p0 0x1 lt u32 $r2 0x7fe + (not $p0) bra #rcp_result_inf_or_denorm + // Norms: convert exponents back and return + shl b32 $r4 $r4 clamp 0x14 + add b32 $r1 $r4 $r1 + bra #rcp_end +rcp_result_inf_or_denorm: + // New exponent >= 0x7ff means that result is inf + set b32 $p0 0x1 ge s32 $r3 0x7ff + (not $p0) bra #rcp_result_denorm + sched 0x20 0x25 0x28 0x2b 0x23 0x25 0x2f + // Infinity + and b32 $r1 $r1 0x80000000 + mov b32 $r0 0x0 + add b32 $r1 $r1 0x7ff00000 + bra #rcp_end +rcp_result_denorm: + // Denorm result comes from huge input. The greatest possible fp64, i.e. + // 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest + // normal value. Other rcp result should be greater than that. If we + // set the exponent field to 1, we can recover the result by multiplying + // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise + // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies + // the logic here. 
+ set b32 $p0 0x1 ne u32 $r3 0x0 + and b32 $r1 $r1 0x800fffff + // 0x3e800000: 1/4 + $p0 cvt f64 $r6d f32 0x3e800000 + sched 0x2f 0x28 0x2c 0x2e 0x2e 0x00 0x00 + // 0x3f000000: 1/2 + (not $p0) cvt f64 $r6d f32 0x3f000000 + add b32 $r1 $r1 0x00100000 + mul rn f64 $r0d $r0d $r6d +rcp_end: + ret + gk110_rsq_f64: ret diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h index 8d00e2a2245..d41f135a26a 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h +++ b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h @@ -65,11 +65,92 @@ uint64_t gk110_builtin_code[] = { 0xe088000001000406, 0x4000000000800001, 0xe6010000000ce802, - 0x08b08010a010b810, + 0x08a0a0a010a0b810, 0xe60100000088e806, 0x19000000001c003c, /* 0x0218: gk110_rcp_f64 */ -/* 0x0218: gk110_rsq_f64 */ + 0xc00000058a1c0409, + 0x407fffffff9c080d, + 0x1480000050000000, + 0xb3401c03fe9c0c1d, + 0xe4c03c007f9c000e, + 0x08a0a0bcacb410bc, + 0x8580000000603c02, + 0x747fffffff9fc00e, + 0xb4601fff801c021d, + 0x120000000420003c, + 0x21000400005c0404, +/* 0x0270: rcp_inf_or_denorm_or_zero */ + 0x203ff800001c0410, + 0xb3281c00001c101d, + 0x0880bcacb4bc10ac, + 0x120000000800003c, + 0x223ff800001c0404, + 0xe4c03c007fdc0002, +/* 0x02a0: rcp_denorm_or_zero */ + 0xb4601c00001c021d, + 0x120000000400003c, + 0x213ff800005c0404, +/* 0x02b8: rcp_denorm */ + 0xc400021a801c0001, + 0x08a010a0a0aca0bc, + 0x740000001b5fc00e, +/* 0x02d0: rcp_rejoin */ + 0xb3181c00001c0c1d, + 0x12000000c000003c, + 0xc00000058a1c0409, + 0x204007ffff9c041c, + 0x401ff800001c1c1d, + 0xe4c03c00001c001a, + 0x08b8aca8a0a010ac, + 0xe5400c00031c3816, + 0x84000000021c1412, + 0x745fc000001fc002, + 0xcc000000029c1016, + 0xcc081000029c1002, + 0xe5400000001c2c02, + 0xe5410000031c3c1a, + 0x08a4a4a4a4a4a4b8, + 0xc54001fc001c2c21, + 0xdb802000001c1812, + 0xdb800000021c0002, + 0xdb802000001c1812, + 0xdb800000021c0002, + 0xdb802000001c1812, + 0xdb800000021c0002, + 0x08a0a0a0a0a080a4, + 
0xdb802000001c1812, + 0xdb800000021c0002, + 0x48000001ff9c0809, + 0xe0800000019c0812, + 0xc00000058a1c040d, + 0xe0800000021c0c0e, + 0x407fffffff9c0c09, + 0x08aca0a0aca0aca0, + 0xb3101c03ff1c081d, + 0x120000000c20003c, + 0xc24000000a1c1011, + 0xe0800000009c1006, + 0x12000000381c003c, +/* 0x03f0: rcp_result_inf_or_denorm */ + 0xb3681c03ff9c0c1d, + 0x120000001420003c, + 0x08bc948caca09480, + 0x20400000001c0404, + 0xe4c03c007f9c0002, + 0x403ff800001c0405, + 0x120000001c1c003c, +/* 0x0428: rcp_result_denorm */ + 0xb3501c00001c0c1d, + 0x204007ffff9c0404, + 0xc54001f400002c19, + 0x080000b8b8b0a0bc, + 0xc54001f800202c19, + 0x40000800001c0405, + 0xe4000000031c0002, +/* 0x0460: rcp_end */ + 0x19000000001c003c, +/* 0x0468: gk110_rsq_f64 */ 0x19000000001c003c, }; @@ -77,5 +158,5 @@ uint64_t gk110_builtin_offsets[] = { 0x0000000000000000, 0x00000000000000f0, 0x0000000000000218, - 0x0000000000000218, + 0x0000000000000468, }; From c5b9774eb468fa772681807a4c939275937ca560 Mon Sep 17 00:00:00 2001 From: Boyan Ding Date: Thu, 9 Mar 2017 13:55:18 +0800 Subject: [PATCH 048/378] gk110/ir: Add rsq f64 implementation Signed-off-by: Boyan Ding Acked-by: Ilia Mirkin Cc: 19.0 (cherry picked from commit 7937408052a1896f0b08b0110bb8a1790eeee351) --- .../drivers/nouveau/codegen/lib/gk110.asm | 69 ++++++++++++++++++- .../drivers/nouveau/codegen/lib/gk110.asm.h | 42 ++++++++++- 2 files changed, 109 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm index c33dd2158c9..4047a565a9f 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm +++ b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm @@ -230,7 +230,7 @@ rcp_result_denorm: and b32 $r1 $r1 0x800fffff // 0x3e800000: 1/4 $p0 cvt f64 $r6d f32 0x3e800000 - sched 0x2f 0x28 0x2c 0x2e 0x2e 0x00 0x00 + sched 0x2f 0x28 0x2c 0x2e 0x2a 0x20 0x27 // 0x3f000000: 1/2 (not $p0) cvt f64 $r6d f32 0x3f000000 add b32 $r1 $r1 0x00100000 @@ -238,7 +238,74 @@ 
rcp_result_denorm: rcp_end: ret +// RSQ F64 +// +// INPUT: $r0d +// OUTPUT: $r0d +// CLOBBER: $r2 - $r9, $p0 - $p1 +// gk110_rsq_f64: + // Before getting initial result rsqrt64h, two special cases should be + // handled first. + // 1. NaN: set the highest bit in mantissa so it'll be surely recognized + // as NaN in rsqrt64h + set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000 + $p0 or b32 $r1 $r1 0x00080000 + and b32 $r2 $r1 0x7fffffff + sched 0x27 0x20 0x28 0x2c 0x25 0x28 0x28 + // 2. denorms and small normal values: using their original value will + // lose precision either at rsqrt64h or the first step in newton-raphson + // steps below. Take 2 as a threshold in exponent field, and multiply + // with 2^54 if the exponent is smaller or equal. (will multiply 2^27 + // to recover in the end) + ext u32 $r3 $r1 0xb14 + set b32 $p1 0x1 le u32 $r3 0x2 + or b32 $r2 $r0 $r2 + $p1 mul rn f64 $r0d $r0d 0x4350000000000000 + rsqrt64h f32 $r5 $r1 + // rsqrt64h will give correct result for 0/inf/nan, the following logic + // checks whether the input is one of those (exponent is 0x7ff or all 0 + // except for the sign bit) + set b32 $r6 ne u32 $r3 0x7ff + and b32 $r2 $r2 $r6 + sched 0x28 0x2b 0x20 0x27 0x28 0x2e 0x28 + set b32 $p0 0x1 ne u32 $r2 0x0 + $p0 bra #rsq_norm + // For 0/inf/nan, make sure the sign bit agrees with input and return + and b32 $r1 $r1 0x80000000 + mov b32 $r0 0x0 + or b32 $r1 $r1 $r5 + ret +rsq_norm: + // For others, do 4 Newton-Raphson steps with the formula: + // RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n}) + // In the code below, each step is written as: + // tmp1 = 0.5 * x * RSQ_{n} + // tmp2 = -RSQ_{n} * tmp1 + 0.5 + // RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n} + mov b32 $r4 0x0 + sched 0x2f 0x29 0x29 0x29 0x29 0x29 0x29 + // 0x3f000000: 1/2 + cvt f64 $r8d f32 0x3f000000 + mul rn f64 $r2d $r0d $r8d + mul rn f64 $r0d $r2d $r4d + fma rn f64 $r6d neg $r4d $r0d $r8d + fma rn f64 $r4d $r4d $r6d $r4d + mul rn f64 $r0d $r2d $r4d + fma rn f64 $r6d 
neg $r4d $r0d $r8d + sched 0x29 0x29 0x29 0x29 0x29 0x29 0x29 + fma rn f64 $r4d $r4d $r6d $r4d + mul rn f64 $r0d $r2d $r4d + fma rn f64 $r6d neg $r4d $r0d $r8d + fma rn f64 $r4d $r4d $r6d $r4d + mul rn f64 $r0d $r2d $r4d + fma rn f64 $r6d neg $r4d $r0d $r8d + fma rn f64 $r4d $r4d $r6d $r4d + sched 0x29 0x20 0x28 0x2e 0x00 0x00 0x00 + // Multiply 2^27 to result for small inputs to recover + $p1 mul rn f64 $r4d $r4d 0x41a0000000000000 + mov b32 $r1 $r5 + mov b32 $r0 $r4 ret .section #gk110_builtin_offsets diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h index d41f135a26a..3d1523f2fdd 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h +++ b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm.h @@ -144,13 +144,53 @@ uint64_t gk110_builtin_code[] = { 0xb3501c00001c0c1d, 0x204007ffff9c0404, 0xc54001f400002c19, - 0x080000b8b8b0a0bc, + 0x089c80a8b8b0a0bc, 0xc54001f800202c19, 0x40000800001c0405, 0xe4000000031c0002, /* 0x0460: rcp_end */ 0x19000000001c003c, /* 0x0468: gk110_rsq_f64 */ + 0xb4601fff801c021d, + 0x2100040000000404, + 0x203fffffff9c0408, + 0x08a0a094b0a0809c, + 0xc00000058a1c040d, + 0xb3301c00011c0c3d, + 0xe2001000011c000a, + 0xc400021a80040001, + 0x84000000039c0416, + 0xb2d01c03ff9c0c19, + 0xe2000000031c080a, + 0x08a0b8a09c80aca0, + 0xb3501c00001c081d, + 0x120000001000003c, + 0x20400000001c0404, + 0xe4c03c007f9c0002, + 0xe2001000029c0406, + 0x19000000001c003c, +/* 0x04f8: rsq_norm */ + 0xe4c03c007f9c0012, + 0x08a4a4a4a4a4a4bc, + 0xc54001f8001c2c21, + 0xe4000000041c000a, + 0xe4000000021c0802, + 0xdb882000001c101a, + 0xdb801000031c1012, + 0xe4000000021c0802, + 0xdb882000001c101a, + 0x08a4a4a4a4a4a4a4, + 0xdb801000031c1012, + 0xe4000000021c0802, + 0xdb882000001c101a, + 0xdb801000031c1012, + 0xe4000000021c0802, + 0xdb882000001c101a, + 0xdb801000031c1012, + 0x08000000b8a080a4, + 0xc400020d00041011, + 0xe4c03c00029c0006, + 0xe4c03c00021c0002, 0x19000000001c003c, }; From 
81810fa5db5e86e510a55f9c69b8d7759124b098 Mon Sep 17 00:00:00 2001 From: Boyan Ding Date: Thu, 9 Mar 2017 13:55:19 +0800 Subject: [PATCH 049/378] gk110/ir: Use the new rcp/rsq in library v2: (Karol Herbst * fix Value setup for the builtins Signed-off-by: Boyan Ding [imirkin: track the fp64 flag when switching ops to calls] Signed-off-by: Ilia Mirkin Cc: 19.0 (cherry picked from commit 656ad060518d067a3b311db8c2de2a396fb41898) --- .../drivers/nouveau/codegen/nv50_ir.cpp | 1 + src/gallium/drivers/nouveau/codegen/nv50_ir.h | 1 + .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 38 +++++++++++++++++++ .../nouveau/codegen/nv50_ir_lowering_nvc0.h | 1 + .../nouveau/codegen/nv50_ir_target.cpp | 1 + 5 files changed, 42 insertions(+) diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp index 49425b98b91..993d01c1e44 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp @@ -1119,6 +1119,7 @@ Program::Program(Type type, Target *arch) binSize = 0; maxGPR = -1; + fp64 = false; main = new Function(this, "MAIN", ~0); calls.insert(&main->call); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h index 8085bb2f542..8d32a25ec23 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h @@ -1311,6 +1311,7 @@ class Program uint32_t tlsSize; // size required for FILE_MEMORY_LOCAL int maxGPR; + bool fp64; MemoryPool mem_Instruction; MemoryPool mem_CmpInstruction; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index f95c4363beb..9d2c81db9dd 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -83,6 +83,38 @@ NVC0LegalizeSSA::handleDIV(Instruction *i) delete_Instruction(prog, i); } +void 
+NVC0LegalizeSSA::handleRCPRSQLib(Instruction *i, Value *src[]) +{ + FlowInstruction *call; + Value *def[2]; + int builtin; + + def[0] = bld.mkMovToReg(0, src[0])->getDef(0); + def[1] = bld.mkMovToReg(1, src[1])->getDef(0); + + if (i->op == OP_RCP) + builtin = NVC0_BUILTIN_RCP_F64; + else + builtin = NVC0_BUILTIN_RSQ_F64; + + call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL); + def[0] = bld.getSSA(); + def[1] = bld.getSSA(); + bld.mkMovFromReg(def[0], 0); + bld.mkMovFromReg(def[1], 1); + bld.mkClobber(FILE_GPR, 0x3fc, 2); + bld.mkClobber(FILE_PREDICATE, i->op == OP_RSQ ? 0x3 : 0x1, 0); + bld.mkOp2(OP_MERGE, TYPE_U64, i->getDef(0), def[0], def[1]); + + call->fixed = 1; + call->absolute = call->builtin = 1; + call->target.builtin = builtin; + delete_Instruction(prog, i); + + prog->fp64 = true; +} + void NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) { @@ -96,6 +128,12 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) Value *src[2], *dst[2], *def = i->getDef(0); bld.mkSplit(src, 4, i->getSrc(0)); + int chip = prog->getTarget()->getChipset(); + if (chip >= NVISA_GK20A_CHIPSET && chip < NVISA_GM107_CHIPSET) { + handleRCPRSQLib(i, src); + return; + } + // 2. We don't care about the low 32 bits of the destination. Stick a 0 in. 
dst[0] = bld.loadImm(NULL, 0); dst[1] = bld.getSSA(); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h index e0f50ab0904..99809726602 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h @@ -62,6 +62,7 @@ class NVC0LegalizeSSA : public Pass // we want to insert calls to the builtin library only after optimization void handleDIV(Instruction *); // integer division, modulus + void handleRCPRSQLib(Instruction *, Value *[]); void handleRCPRSQ(Instruction *); // double precision float recip/rsqrt void handleFTZ(Instruction *); void handleSET(CmpInstruction *); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp index 9193a01f189..5c6d0570ae2 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp @@ -399,6 +399,7 @@ Program::emitBinary(struct nv50_ir_prog_info *info) } } } + info->io.fp64 |= fp64; info->bin.relocData = emit->getRelocInfo(); info->bin.fixupData = emit->getFixupInfo(); From c96d4331051388fa489f486d53c06200b8aee7cf Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Mon, 14 Aug 2017 23:55:25 +0200 Subject: [PATCH 050/378] gk104/ir: Use the new rcp/rsq in library [imirkin: add a few more "long" prefixes to safen things up] Acked-by: Ilia Mirkin Cc: 19.0 (cherry picked from commit 12669d29705a26478aa691cb454149628be65f17) --- .../drivers/nouveau/codegen/lib/gk104.asm | 203 +++++++++++++++++- .../drivers/nouveau/codegen/lib/gk104.asm.h | 144 +++++++++++-- .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 2 +- 3 files changed, 334 insertions(+), 15 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk104.asm b/src/gallium/drivers/nouveau/codegen/lib/gk104.asm index cd65b547279..576da1bab60 100644 --- 
a/src/gallium/drivers/nouveau/codegen/lib/gk104.asm +++ b/src/gallium/drivers/nouveau/codegen/lib/gk104.asm @@ -543,6 +543,8 @@ $p2 suldgb b32 $r3 cg zero u8 g[$r4d] $r2 $p0 $p1 suldgb b32 $r3 cv zero u8 g[$r4d] $r2 $p0 long mov b32 $r3 0x3f800000 long nop +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +long nop long ret @@ -554,7 +556,144 @@ long ret // SIZE: 9 * 8 bytes // gk104_rcp_f64: - long nop + // Step 1: classify input according to exponent and value, and calculate + // result for 0/inf/nan. $r2 holds the exponent value, which starts at + // bit 52 (bit 20 of the upper half) and is 11 bits in length + ext u32 $r2 $r1 0xb14 + add b32 $r3 $r2 0xffffffff + joinat #rcp_rejoin + // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf, + // denorm, or 0). Do this by substracting 1 from the exponent, which will + // mean that it's > 0x7fd in those cases when doing unsigned comparison + set $p0 0x1 gt u32 $r3 0x7fd + // $r3: 0 for norms, 0x36 for denorms, -1 for others + long mov b32 $r3 0x0 + sched 0x2f 0x04 0x2d 0x2b 0x2f 0x28 0x28 + join (not $p0) nop + // Process all special values: NaN, inf, denorm, 0 + mov b32 $r3 0xffffffff + // A number is NaN if its abs value is greater than or unordered with inf + set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000 + (not $p0) bra #rcp_inf_or_denorm_or_zero + // NaN -> NaN, the next line sets the "quiet" bit of the result. 
This + // behavior is both seen on the CPU and the blob + join or b32 $r1 $r1 0x80000 +rcp_inf_or_denorm_or_zero: + and b32 $r4 $r1 0x7ff00000 + // Other values with nonzero in exponent field should be inf + set $p0 0x1 eq s32 $r4 0x0 + sched 0x2b 0x04 0x2f 0x2d 0x2b 0x2f 0x20 + $p0 bra #rcp_denorm_or_zero + // +/-Inf -> +/-0 + xor b32 $r1 $r1 0x7ff00000 + join mov b32 $r0 0x0 +rcp_denorm_or_zero: + set $p0 0x1 gtu f64 abs $r0d 0x0 + $p0 bra #rcp_denorm + // +/-0 -> +/-Inf + join or b32 $r1 $r1 0x7ff00000 +rcp_denorm: + // non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms + mul rn f64 $r0d $r0d 0x4350000000000000 + sched 0x2f 0x28 0x2b 0x28 0x28 0x04 0x28 + join mov b32 $r3 0x36 +rcp_rejoin: + // All numbers with -1 in $r3 have their result ready in $r0d, return them + // others need further calculation + set $p0 0x1 lt s32 $r3 0x0 + $p0 bra #rcp_end + // Step 2: Before the real calculation goes on, renormalize the values to + // range [1, 2) by setting exponent field to 0x3ff (the exponent of 1) + // result in $r6d. The exponent will be recovered later. 
+ ext u32 $r2 $r1 0xb14 + and b32 $r7 $r1 0x800fffff + add b32 $r7 $r7 0x3ff00000 + long mov b32 $r6 $r0 + sched 0x2b 0x04 0x28 0x28 0x2a 0x2b 0x2e + // Step 3: Convert new value to float (no overflow will occur due to step + // 2), calculate rcp and do newton-raphson step once + cvt rz f32 $r5 f64 $r6d + long rcp f32 $r4 $r5 + mov b32 $r0 0xbf800000 + fma rn f32 $r5 $r4 $r5 $r0 + fma rn f32 $r0 neg $r4 $r5 $r4 + // Step 4: convert result $r0 back to double, do newton-raphson steps + cvt f64 $r0d f32 $r0 + cvt f64 $r6d neg f64 $r6d + sched 0x2e 0x29 0x29 0x29 0x29 0x29 0x29 + cvt f64 $r8d f32 0x3f800000 + // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d + // The formula used here (and above) is: + // RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n} + // The following code uses 2 FMAs for each step, and it will basically + // looks like: + // tmp = -src * RCP_{n} + 1 + // RCP_{n + 1} = RCP_{n} * tmp + RCP_{n} + fma rn f64 $r4d $r6d $r0d $r8d + fma rn f64 $r0d $r0d $r4d $r0d + fma rn f64 $r4d $r6d $r0d $r8d + fma rn f64 $r0d $r0d $r4d $r0d + fma rn f64 $r4d $r6d $r0d $r8d + fma rn f64 $r0d $r0d $r4d $r0d + sched 0x29 0x20 0x28 0x28 0x28 0x28 0x28 + fma rn f64 $r4d $r6d $r0d $r8d + fma rn f64 $r0d $r0d $r4d $r0d + // Step 5: Exponent recovery and final processing + // The exponent is recovered by adding what we added to the exponent. + // Suppose we want to calculate rcp(x), but we have rcp(cx), then + // rcp(x) = c * rcp(cx) + // The delta in exponent comes from two sources: + // 1) The renormalization in step 2. The delta is: + // 0x3ff - $r2 + // 2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored + // in $r3 + // These 2 sources are calculated in the first two lines below, and then + // added to the exponent extracted from the result above. + // Note that after processing, the new exponent may >= 0x7ff (inf) + // or <= 0 (denorm). 
Those cases will be handled respectively below + subr b32 $r2 $r2 0x3ff + long add b32 $r4 $r2 $r3 + ext u32 $r3 $r1 0xb14 + // New exponent in $r3 + long add b32 $r3 $r3 $r4 + add b32 $r2 $r3 0xffffffff + sched 0x28 0x2b 0x28 0x2b 0x28 0x28 0x2b + // (exponent-1) < 0x7fe (unsigned) means the result is in norm range + // (same logic as in step 1) + set $p0 0x1 lt u32 $r2 0x7fe + (not $p0) bra #rcp_result_inf_or_denorm + // Norms: convert exponents back and return + shl b32 $r4 $r4 clamp 0x14 + long add b32 $r1 $r4 $r1 + bra #rcp_end +rcp_result_inf_or_denorm: + // New exponent >= 0x7ff means that result is inf + set $p0 0x1 ge s32 $r3 0x7ff + (not $p0) bra #rcp_result_denorm + sched 0x20 0x25 0x28 0x2b 0x23 0x25 0x2f + // Infinity + and b32 $r1 $r1 0x80000000 + long mov b32 $r0 0x0 + add b32 $r1 $r1 0x7ff00000 + bra #rcp_end +rcp_result_denorm: + // Denorm result comes from huge input. The greatest possible fp64, i.e. + // 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest + // normal value. Other rcp result should be greater than that. If we + // set the exponent field to 1, we can recover the result by multiplying + // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise + // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies + // the logic here. + set $p0 0x1 ne u32 $r3 0x0 + and b32 $r1 $r1 0x800fffff + // 0x3e800000: 1/4 + $p0 cvt f64 $r6d f32 0x3e800000 + sched 0x2f 0x28 0x2c 0x2e 0x2a 0x20 0x27 + // 0x3f000000: 1/2 + (not $p0) cvt f64 $r6d f32 0x3f000000 + add b32 $r1 $r1 0x00100000 + mul rn f64 $r0d $r0d $r6d +rcp_end: long ret // RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i) @@ -565,7 +704,67 @@ gk104_rcp_f64: // SIZE: 14 * 8 bytes // gk104_rsq_f64: - long nop + // Before getting initial result rsqrt64h, two special cases should be + // handled first. + // 1. 
NaN: set the highest bit in mantissa so it'll be surely recognized + // as NaN in rsqrt64h + set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000 + $p0 or b32 $r1 $r1 0x00080000 + and b32 $r2 $r1 0x7fffffff + sched 0x27 0x20 0x28 0x2c 0x25 0x28 0x28 + // 2. denorms and small normal values: using their original value will + // lose precision either at rsqrt64h or the first step in newton-raphson + // steps below. Take 2 as a threshold in exponent field, and multiply + // with 2^54 if the exponent is smaller or equal. (will multiply 2^27 + // to recover in the end) + ext u32 $r3 $r1 0xb14 + set $p1 0x1 le u32 $r3 0x2 + long or b32 $r2 $r0 $r2 + $p1 mul rn f64 $r0d $r0d 0x4350000000000000 + rsqrt64h $r5 $r1 + // rsqrt64h will give correct result for 0/inf/nan, the following logic + // checks whether the input is one of those (exponent is 0x7ff or all 0 + // except for the sign bit) + set b32 $r6 ne u32 $r3 0x7ff + long and b32 $r2 $r2 $r6 + sched 0x28 0x2b 0x20 0x27 0x28 0x2e 0x28 + set $p0 0x1 ne u32 $r2 0x0 + $p0 bra #rsq_norm + // For 0/inf/nan, make sure the sign bit agrees with input and return + and b32 $r1 $r1 0x80000000 + long mov b32 $r0 0x0 + long or b32 $r1 $r1 $r5 + long ret +rsq_norm: + // For others, do 4 Newton-Raphson steps with the formula: + // RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n}) + // In the code below, each step is written as: + // tmp1 = 0.5 * x * RSQ_{n} + // tmp2 = -RSQ_{n} * tmp1 + 0.5 + // RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n} + long mov b32 $r4 0x0 + sched 0x2f 0x29 0x29 0x29 0x29 0x29 0x29 + // 0x3f000000: 1/2 + cvt f64 $r8d f32 0x3f000000 + mul rn f64 $r2d $r0d $r8d + mul rn f64 $r0d $r2d $r4d + fma rn f64 $r6d neg $r4d $r0d $r8d + fma rn f64 $r4d $r4d $r6d $r4d + mul rn f64 $r0d $r2d $r4d + fma rn f64 $r6d neg $r4d $r0d $r8d + sched 0x29 0x29 0x29 0x29 0x29 0x29 0x29 + fma rn f64 $r4d $r4d $r6d $r4d + mul rn f64 $r0d $r2d $r4d + fma rn f64 $r6d neg $r4d $r0d $r8d + fma rn f64 $r4d $r4d $r6d $r4d + mul rn f64 $r0d $r2d 
$r4d + fma rn f64 $r6d neg $r4d $r0d $r8d + fma rn f64 $r4d $r4d $r6d $r4d + sched 0x29 0x20 0x28 0x2e 0x00 0x00 0x00 + // Multiply 2^27 to result for small inputs to recover + $p1 mul rn f64 $r4d $r4d 0x41a0000000000000 + long mov b32 $r1 $r5 + long mov b32 $r0 $r4 long ret // diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h index 37998768efe..ed948dee471 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h +++ b/src/gallium/drivers/nouveau/codegen/lib/gk104.asm.h @@ -481,12 +481,132 @@ uint64_t gk104_builtin_code[] = { 0xd40040000840c785, 0x18fe00000000dde2, 0x4000000000001de4, - 0x9000000000001de7, -/* 0x0f08: gk104_rcp_f64 */ + 0x2000000000000007, 0x4000000000001de4, 0x9000000000001de7, -/* 0x0f18: gk104_rsq_f64 */ - 0x4000000000001de4, +/* 0x0f18: gk104_rcp_f64 */ + 0x7000c02c50109c03, + 0x0bfffffffc20dc02, + 0x6000000280000007, + 0x1a0ec01ff431dc03, + 0x180000000000dde2, + 0x228282f2b2d042f7, + 0x40000000000021f4, + 0x1bfffffffc00dde2, + 0x1e0edffc0001dc81, + 0x40000000200021e7, + 0x3800200000105c52, +/* 0x0f70: rcp_inf_or_denorm_or_zero */ + 0x39ffc00000111c02, + 0x190e0000fc41dc23, + 0x2202f2b2d2f042b7, + 0x40000000400001e7, + 0x39ffc00000105c82, + 0x1800000000001df2, +/* 0x0fa0: rcp_denorm_or_zero */ + 0x1e0ec0000001dc81, + 0x40000000200001e7, + 0x39ffc00000105c52, +/* 0x0fb8: rcp_denorm */ + 0x5000d0d400001c01, + 0x2280428282b282f7, + 0x18000000d800ddf2, +/* 0x0fd0: rcp_rejoin */ + 0x188e0000fc31dc23, + 0x40000006000001e7, + 0x7000c02c50109c03, + 0x3a003ffffc11dc02, + 0x08ffc0000071dc02, + 0x2800000000019de4, + 0x22e2b2a2828042b7, + 0x1006000019a15c04, + 0xc800000010511c00, + 0x1afe000000001de2, + 0x3000000014415c00, + 0x3008000014401e00, + 0x1000000001301c04, + 0x1000000019b19d04, + 0x22929292929292e7, + 0x1000cfe001321c04, + 0x2010000000611c01, + 0x2000000010001c01, + 0x2010000000611c01, + 0x2000000010001c01, + 0x2010000000611c01, + 0x2000000010001c01, + 
0x2282828282820297, + 0x2010000000611c01, + 0x2000000010001c01, + 0x0800000ffc209e02, + 0x480000000c211c03, + 0x7000c02c5010dc03, + 0x480000001030dc03, + 0x0bfffffffc309c02, + 0x22b28282b282b287, + 0x188ec01ff821dc03, + 0x40000000600021e7, + 0x6000c00050411c03, + 0x4800000004405c03, + 0x40000001c0001de7, +/* 0x10f0: rcp_result_inf_or_denorm */ + 0x1b0ec01ffc31dc23, + 0x40000000a00021e7, + 0x22f25232b2825207, + 0x3a00000000105c02, + 0x1800000000001de2, + 0x09ffc00000105c02, + 0x40000000e0001de7, +/* 0x1128: rcp_result_denorm */ + 0x1a8e0000fc31dc03, + 0x3a003ffffc105c02, + 0x1000cfa001318004, + 0x227202a2e2c282f7, + 0x1000cfc00131a004, + 0x0800400000105c02, + 0x5000000018001c01, +/* 0x1160: rcp_end */ + 0x9000000000001de7, +/* 0x1168: gk104_rsq_f64 */ + 0x1e0edffc0001dc81, + 0x3800200000104042, + 0x39fffffffc109c02, + 0x22828252c2820277, + 0x7000c02c5010dc03, + 0x198ec0000833dc03, + 0x6800000008009c43, + 0x5000d0d400000401, + 0xc80000001c115c00, + 0x128ec01ffc319c03, + 0x6800000018209c03, + 0x2282e2827202b287, + 0x1a8e0000fc21dc03, + 0x40000000800001e7, + 0x3a00000000105c02, + 0x1800000000001de2, + 0x6800000014105c43, + 0x9000000000001de7, +/* 0x11f8: rsq_norm */ + 0x1800000000011de2, + 0x22929292929292f7, + 0x1000cfc001321c04, + 0x5000000020009c01, + 0x5000000010201c01, + 0x2010000000419e01, + 0x2008000018411c01, + 0x5000000010201c01, + 0x2010000000419e01, + 0x2292929292929297, + 0x2008000018411c01, + 0x5000000010201c01, + 0x2010000000419e01, + 0x2008000018411c01, + 0x5000000010201c01, + 0x2010000000419e01, + 0x2008000018411c01, + 0x20000002e2820297, + 0x5000d06800410401, + 0x2800000014005de4, + 0x2800000010001de4, 0x9000000000001de7, 0xc800000003f01cc5, 0x2c00000100005c04, @@ -495,7 +615,7 @@ uint64_t gk104_builtin_code[] = { 0x680100000c1fdc03, 0x4000000a60001c47, 0x180000004000dde2, -/* 0x0f60: spill_cfstack */ +/* 0x12e0: spill_cfstack */ 0x78000009c0000007, 0x0c0000000430dd02, 0x4003ffffa0001ca7, @@ -543,14 +663,14 @@ uint64_t gk104_builtin_code[] = { 
0x4000000100001ea7, 0x480100000c001c03, 0x0800000000105c42, -/* 0x10d8: shared_loop */ +/* 0x1458: shared_loop */ 0xc100000000309c85, 0x9400000500009c85, 0x0c00000010001d02, 0x0800000000105d42, 0x0c0000001030dd02, 0x4003ffff40001ca7, -/* 0x1108: shared_done */ +/* 0x1488: shared_done */ 0x2800406420001de4, 0x2800406430005de4, 0xe000000000001c45, @@ -564,7 +684,7 @@ uint64_t gk104_builtin_code[] = { 0x480000000c209c03, 0x4801000008001c03, 0x0800000000105c42, -/* 0x1170: search_cstack */ +/* 0x14f0: search_cstack */ 0x280040646000dde4, 0x8400000020009f05, 0x190ec0002821dc03, @@ -573,17 +693,17 @@ uint64_t gk104_builtin_code[] = { 0x0800000000105c42, 0x0c0000004030dd02, 0x00029dff0ffc5cbf, -/* 0x11b0: entry_found */ +/* 0x1530: entry_found */ 0x8400000000009f85, 0x2800406400001de4, 0x2800406410005de4, 0x9400000010009c85, 0x4000000000001df4, -/* 0x11d8: end_exit */ +/* 0x1558: end_exit */ 0x9800000003ffdcc5, 0xd000000000008007, 0xa000000000004007, -/* 0x11f0: end_cont */ +/* 0x1570: end_cont */ 0xd000000000008007, 0x3400c3fffc201c04, 0xc000000003f01ec5, @@ -593,6 +713,6 @@ uint64_t gk104_builtin_code[] = { uint64_t gk104_builtin_offsets[] = { 0x0000000000000000, 0x00000000000000f0, - 0x0000000000000f08, 0x0000000000000f18, + 0x0000000000001168, }; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index 9d2c81db9dd..948db73f427 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -129,7 +129,7 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) bld.mkSplit(src, 4, i->getSrc(0)); int chip = prog->getTarget()->getChipset(); - if (chip >= NVISA_GK20A_CHIPSET && chip < NVISA_GM107_CHIPSET) { + if (chip >= NVISA_GK104_CHIPSET && chip < NVISA_GM107_CHIPSET) { handleRCPRSQLib(i, src); return; } From 77102d015147b4a2495d0dec52d4e395af710aad Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Sat, 30 Jun 
2018 23:09:27 +0200 Subject: [PATCH 051/378] gm107/ir: add fp64 rcp Acked-by: Ilia Mirkin Cc: 19.0 (cherry picked from commit 815a8e59c6d462a7008653ea9e3010d40b6ba589) --- .../drivers/nouveau/codegen/lib/gm107.asm | 169 +++++++++++++++++- .../drivers/nouveau/codegen/lib/gm107.asm.h | 103 ++++++++++- .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 2 +- 3 files changed, 270 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm index 7ee5f8fc65b..595d9dc5d41 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm +++ b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm @@ -100,8 +100,175 @@ gm107_div_s32: ret nop 0 -// STUB +// RCP F64 +// +// INPUT: $r0d +// OUTPUT: $r0d +// CLOBBER: $r2 - $r9, $p0 +// +// The core of RCP and RSQ implementation is Newton-Raphson step, which is +// used to find successively better approximation from an imprecise initial +// value (single precision rcp in RCP and rsqrt64h in RSQ). +// gm107_rcp_f64: + // Step 1: classify input according to exponent and value, and calculate + // result for 0/inf/nan. $r2 holds the exponent value, which starts at + // bit 52 (bit 20 of the upper half) and is 11 bits in length + sched (st 0x0) (st 0x0) (st 0x0) + bfe u32 $r2 $r1 0xb14 + iadd32i $r3 $r2 -1 + ssy #rcp_rejoin + // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf, + // denorm, or 0). 
Do this by substracting 1 from the exponent, which will + // mean that it's > 0x7fd in those cases when doing unsigned comparison + sched (st 0x0) (st 0x0) (st 0x0) + isetp gt u32 and $p0 1 $r3 0x7fd 1 + // $r3: 0 for norms, 0x36 for denorms, -1 for others + mov $r3 0x0 0xf + not $p0 sync + // Process all special values: NaN, inf, denorm, 0 + sched (st 0x0) (st 0x0) (st 0x0) + mov32i $r3 0xffffffff 0xf + // A number is NaN if its abs value is greater than or unordered with inf + dsetp gtu and $p0 1 abs $r0 0x7ff0000000000000 1 + not $p0 bra #rcp_inf_or_denorm_or_zero + // NaN -> NaN, the next line sets the "quiet" bit of the result. This + // behavior is both seen on the CPU and the blob + sched (st 0x0) (st 0x0) (st 0x0) + lop32i or $r1 $r1 0x80000 + sync +rcp_inf_or_denorm_or_zero: + lop32i and $r4 $r1 0x7ff00000 + sched (st 0x0) (st 0x0) (st 0x0) + // Other values with nonzero in exponent field should be inf + isetp eq and $p0 1 $r4 0x0 1 + $p0 bra #rcp_denorm_or_zero + // +/-Inf -> +/-0 + lop32i xor $r1 $r1 0x7ff00000 + sched (st 0x0) (st 0x0) (st 0x0) + mov $r0 0x0 0xf + sync +rcp_denorm_or_zero: + dsetp gtu and $p0 1 abs $r0 0x0 1 + sched (st 0x0) (st 0x0) (st 0x0) + $p0 bra #rcp_denorm + // +/-0 -> +/-Inf + lop32i or $r1 $r1 0x7ff00000 + sync +rcp_denorm: + // non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms + sched (st 0x0) (st 0x0) (st 0x0) + dmul $r0 $r0 0x4350000000000000 + mov $r3 0x36 0xf + sync +rcp_rejoin: + // All numbers with -1 in $r3 have their result ready in $r0d, return them + // others need further calculation + sched (st 0x0) (st 0x0) (st 0x0) + isetp lt and $p0 1 $r3 0x0 1 + $p0 bra #rcp_end + // Step 2: Before the real calculation goes on, renormalize the values to + // range [1, 2) by setting exponent field to 0x3ff (the exponent of 1) + // result in $r6d. The exponent will be recovered later. 
+ bfe u32 $r2 $r1 0xb14 + sched (st 0x0) (st 0x0) (st 0x0) + lop32i and $r7 $r1 0x800fffff + iadd32i $r7 $r7 0x3ff00000 + mov $r6 $r0 0xf + // Step 3: Convert new value to float (no overflow will occur due to step + // 2), calculate rcp and do newton-raphson step once + sched (st 0x0) (st 0x0) (st 0x0) + f2f ftz f64 f32 $r5 $r6 + mufu rcp $r4 $r5 + mov32i $r0 0xbf800000 0xf + sched (st 0x0) (st 0x0) (st 0x0) + ffma $r5 $r4 $r5 $r0 + ffma $r0 $r5 neg $r4 $r4 + // Step 4: convert result $r0 back to double, do newton-raphson steps + f2f f32 f64 $r0 $r0 + sched (st 0x0) (st 0x0) (st 0x0) + f2f f64 f64 $r6 neg $r6 + f2f f32 f64 $r8 0x3f800000 + // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d + // The formula used here (and above) is: + // RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n} + // The following code uses 2 FMAs for each step, and it will basically + // look like: + // tmp = -src * RCP_{n} + 1 + // RCP_{n + 1} = RCP_{n} * tmp + RCP_{n} + dfma $r4 $r6 $r0 $r8 + sched (st 0x0) (st 0x0) (st 0x0) + dfma $r0 $r0 $r4 $r0 + dfma $r4 $r6 $r0 $r8 + dfma $r0 $r0 $r4 $r0 + sched (st 0x0) (st 0x0) (st 0x0) + dfma $r4 $r6 $r0 $r8 + dfma $r0 $r0 $r4 $r0 + dfma $r4 $r6 $r0 $r8 + sched (st 0x0) (st 0x0) (st 0x0) + dfma $r0 $r0 $r4 $r0 + // Step 5: Exponent recovery and final processing + // The exponent is recovered by adding what we added to the exponent. + // Suppose we want to calculate rcp(x), but we have rcp(cx), then + // rcp(x) = c * rcp(cx) + // The delta in exponent comes from two sources: + // 1) The renormalization in step 2. The delta is: + // 0x3ff - $r2 + // 2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored + // in $r3 + // These 2 sources are calculated in the first two lines below, and then + // added to the exponent extracted from the result above. + // Note that after processing, the new exponent may be >= 0x7ff (inf) + // or <= 0 (denorm).
Those cases will be handled respectively below + iadd $r2 neg $r2 0x3ff + iadd $r4 $r2 $r3 + sched (st 0x0) (st 0x0) (st 0x0) + bfe u32 $r3 $r1 0xb14 + // New exponent in $r3 + iadd $r3 $r3 $r4 + iadd32i $r2 $r3 -1 + // (exponent-1) < 0x7fe (unsigned) means the result is in norm range + // (same logic as in step 1) + sched (st 0x0) (st 0x0) (st 0x0) + isetp lt u32 and $p0 1 $r2 0x7fe 1 + not $p0 bra #rcp_result_inf_or_denorm + // Norms: convert exponents back and return + shl $r4 $r4 0x14 + sched (st 0x0) (st 0x0) (st 0x0) + iadd $r1 $r4 $r1 + bra #rcp_end +rcp_result_inf_or_denorm: + // New exponent >= 0x7ff means that result is inf + isetp ge and $p0 1 $r3 0x7ff 1 + sched (st 0x0) (st 0x0) (st 0x0) + not $p0 bra #rcp_result_denorm + // Infinity + lop32i and $r1 $r1 0x80000000 + mov $r0 0x0 0xf + sched (st 0x0) (st 0x0) (st 0x0) + iadd32i $r1 $r1 0x7ff00000 + bra #rcp_end +rcp_result_denorm: + // Denorm result comes from huge input. The greatest possible fp64, i.e. + // 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest + // normal value. Other rcp result should be greater than that. If we + // set the exponent field to 1, we can recover the result by multiplying + // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise + // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies + // the logic here. 
+ isetp ne u32 and $p0 1 $r3 0x0 1 + sched (st 0x0) (st 0x0) (st 0x0) + lop32i and $r1 $r1 0x800fffff + // 0x3e800000: 1/4 + $p0 f2f f32 f64 $r6 0x3e800000 + // 0x3f000000: 1/2 + not $p0 f2f f32 f64 $r6 0x3f000000 + sched (st 0x0) (st 0x0) (st 0x0) + iadd32i $r1 $r1 0x00100000 + dmul $r0 $r0 $r6 +rcp_end: + ret + gm107_rsq_f64: sched (st 0x0) (st 0x0) (st 0x0) ret diff --git a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h index 65c93f7ae89..206d01bde83 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h +++ b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h @@ -82,7 +82,106 @@ uint64_t gm107_builtin_code[] = { 0xe32000000007000f, 0x50b0000000070f00, /* 0x0280: gm107_rcp_f64 */ -/* 0x0280: gm107_rsq_f64 */ + 0x001f8000fc0007e0, + 0x38000000b1470102, + 0x1c0ffffffff70203, + 0xe29000000e000000, + 0x001f8000fc0007e0, + 0x366803807fd70307, + 0x5c9807800ff70003, + 0xf0f800000008000f, + 0x001f8000fc0007e0, + 0x010ffffffff7f003, + 0x368c03fff0070087, + 0xe24000000188000f, + 0x001f8000fc0007e0, + 0x0420008000070101, + 0xf0f800000007000f, +/* 0x02f8: rcp_inf_or_denorm_or_zero */ + 0x0407ff0000070104, + 0x001f8000fc0007e0, + 0x5b6503800ff70407, + 0xe24000000200000f, + 0x0447ff0000070101, + 0x001f8000fc0007e0, + 0x5c9807800ff70000, + 0xf0f800000007000f, +/* 0x0338: rcp_denorm_or_zero */ + 0x5b8c03800ff70087, + 0x001f8000fc0007e0, + 0xe24000000100000f, + 0x0427ff0000070101, + 0xf0f800000007000f, +/* 0x0360: rcp_denorm */ + 0x001f8000fc0007e0, + 0x3880004350070000, + 0x3898078003670003, + 0xf0f800000007000f, +/* 0x0380: rcp_rejoin */ + 0x001f8000fc0007e0, + 0x5b6303800ff70307, + 0xe24000001c00000f, + 0x38000000b1470102, + 0x001f8000fc0007e0, + 0x040800fffff70107, + 0x1c03ff0000070707, + 0x5c98078000070006, + 0x001f8000fc0007e0, + 0x5ca8100000670e05, + 0x5080000000470504, + 0x010bf8000007f000, + 0x001f8000fc0007e0, + 0x5980000000570405, + 0x5981020000470500, + 0x5ca8000000070b00, + 0x001f8000fc0007e0, + 
0x5ca8200000670f06, + 0x38a8003f80070b08, + 0x5b70040000070604, + 0x001f8000fc0007e0, + 0x5b70000000470000, + 0x5b70040000070604, + 0x5b70000000470000, + 0x001f8000fc0007e0, + 0x5b70040000070604, + 0x5b70000000470000, + 0x5b70040000070604, + 0x001f8000fc0007e0, + 0x5b70000000470000, + 0x381200003ff70202, + 0x5c10000000370204, + 0x001f8000fc0007e0, + 0x38000000b1470103, + 0x5c10000000470303, + 0x1c0ffffffff70302, + 0x001f8000fc0007e0, + 0x366203807fe70207, + 0xe24000000208000f, + 0x3848000001470404, + 0x001f8000fc0007e0, + 0x5c10000000170401, + 0xe24000000807000f, +/* 0x04d8: rcp_result_inf_or_denorm */ + 0x366d03807ff70307, + 0x001f8000fc0007e0, + 0xe24000000288000f, + 0x0408000000070101, + 0x5c9807800ff70000, + 0x001f8000fc0007e0, + 0x1c07ff0000070101, + 0xe24000000407000f, +/* 0x0518: rcp_result_denorm */ + 0x5b6a03800ff70307, + 0x001f8000fc0007e0, + 0x040800fffff70101, + 0x38a8003e80000b06, + 0x38a8003f00080b06, + 0x001f8000fc0007e0, + 0x1c00010000070101, + 0x5c80000000670000, +/* 0x0558: rcp_end */ + 0xe32000000007000f, +/* 0x0560: gm107_rsq_f64 */ 0x001f8000fc0007e0, 0xe32000000007000f, 0x50b0000000070f00, @@ -93,5 +192,5 @@ uint64_t gm107_builtin_offsets[] = { 0x0000000000000000, 0x0000000000000120, 0x0000000000000280, - 0x0000000000000280, + 0x0000000000000560, }; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index 948db73f427..2ec9af2d6a3 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -129,7 +129,7 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) bld.mkSplit(src, 4, i->getSrc(0)); int chip = prog->getTarget()->getChipset(); - if (chip >= NVISA_GK104_CHIPSET && chip < NVISA_GM107_CHIPSET) { + if (chip >= NVISA_GK104_CHIPSET && (i->op == OP_RCP || chip < NVISA_GM107_CHIPSET)) { handleRCPRSQLib(i, src); return; } From 7b5e0f8316e7294617887e8df072f6b8009b849a Mon Sep 17 
00:00:00 2001 From: Karol Herbst Date: Sun, 1 Jul 2018 09:44:48 +0200 Subject: [PATCH 052/378] gm107/ir: add fp64 rsq Acked-by: Ilia Mirkin Cc: 19.0 (cherry picked from commit cce495572136a606dd2a35e79f45080c3796e2cc) --- .../drivers/nouveau/codegen/lib/gm107.asm | 78 ++++++++++++++++++- .../drivers/nouveau/codegen/lib/gm107.asm.h | 51 +++++++++++- .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 2 +- 3 files changed, 128 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm index 595d9dc5d41..faee0218d18 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm +++ b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm @@ -269,8 +269,84 @@ rcp_result_denorm: rcp_end: ret +// RSQ F64 +// +// INPUT: $r0d +// OUTPUT: $r0d +// CLOBBER: $r2 - $r9, $p0 - $p1 +// gm107_rsq_f64: - sched (st 0x0) (st 0x0) (st 0x0) + // Before getting initial result rsqrt64h, two special cases should be + // handled first. + // 1. NaN: set the highest bit in mantissa so it'll be surely recognized + // as NaN in rsqrt64h + sched (st 0xd wr 0x0 wt 0x3f) (st 0xd wt 0x1) (st 0xd) + dsetp gtu and $p0 1 abs $r0 0x7ff0000000000000 1 + $p0 lop32i or $r1 $r1 0x00080000 + lop32i and $r2 $r1 0x7fffffff + // 2. denorms and small normal values: using their original value will + // lose precision either at rsqrt64h or the first step in newton-raphson + // steps below. Take 2 as a threshold in exponent field, and multiply + // with 2^54 if the exponent is smaller or equal. 
(will multiply 2^27 + // to recover in the end) + sched (st 0xd) (st 0xd) (st 0xd) + bfe u32 $r3 $r1 0xb14 + isetp le u32 and $p1 1 $r3 0x2 1 + lop or 1 $r2 $r0 $r2 + sched (st 0xd wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xd) + $p1 dmul $r0 $r0 0x4350000000000000 + mufu rsq64h $r5 $r1 + // rsqrt64h will give correct result for 0/inf/nan, the following logic + // checks whether the input is one of those (exponent is 0x7ff or all 0 + // except for the sign bit) + iset ne u32 and $r6 $r3 0x7ff 1 + sched (st 0xd) (st 0xd) (st 0xd) + lop and 1 $r2 $r2 $r6 + isetp ne u32 and $p0 1 $r2 0x0 1 + $p0 bra #rsq_norm + // For 0/inf/nan, make sure the sign bit agrees with input and return + sched (st 0xd) (st 0xd) (st 0xd wt 0x1) + lop32i and $r1 $r1 0x80000000 + mov $r0 0x0 0xf + lop or 1 $r1 $r1 $r5 + sched (st 0xd) (st 0xf) (st 0xf) + ret + nop 0 + nop 0 +rsq_norm: + // For others, do 4 Newton-Raphson steps with the formula: + // RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n}) + // In the code below, each step is written as: + // tmp1 = 0.5 * x * RSQ_{n} + // tmp2 = -RSQ_{n} * tmp1 + 0.5 + // RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n} + sched (st 0xd) (st 0xd wr 0x1) (st 0xd wr 0x1 rd 0x0 wt 0x3) + mov $r4 0x0 0xf + // 0x3f000000: 1/2 + f2f f32 f64 $r8 0x3f000000 + dmul $r2 $r0 $r8 + sched (st 0xd wr 0x0 wt 0x3) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) + dmul $r0 $r2 $r4 + dfma $r6 $r0 neg $r4 $r8 + dfma $r4 $r4 $r6 $r4 + sched (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) + dmul $r0 $r2 $r4 + dfma $r6 $r0 neg $r4 $r8 + dfma $r4 $r4 $r6 $r4 + sched (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) + dmul $r0 $r2 $r4 + dfma $r6 $r0 neg $r4 $r8 + dfma $r4 $r4 $r6 $r4 + sched (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) (st 0xd wr 0x0 wt 0x1) + dmul $r0 $r2 $r4 + dfma $r6 $r0 neg $r4 $r8 + dfma $r4 $r4 $r6 $r4 + // Multiply 2^27 to result for small inputs to recover + sched (st 0xd wr 0x0 wt 0x1) (st 0xd wt 0x1) (st 0xd) + $p1 
dmul $r4 $r4 0x41a0000000000000 + mov $r1 $r5 0xf + mov $r0 $r4 0xf + sched (st 0xd) (st 0xf) (st 0xf) ret nop 0 nop 0 diff --git a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h index 206d01bde83..8eb27bbac99 100644 --- a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h +++ b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h @@ -182,7 +182,56 @@ uint64_t gm107_builtin_code[] = { /* 0x0558: rcp_end */ 0xe32000000007000f, /* 0x0560: gm107_rsq_f64 */ - 0x001f8000fc0007e0, + 0x001fb401fda1ff0d, + 0x368c03fff0070087, + 0x0420008000000101, + 0x0407fffffff70102, + 0x001fb400fda007ed, + 0x38000000b1470103, + 0x366603800027030f, + 0x5c47020000270002, + 0x001fb401e1a0070d, + 0x3880004350010000, + 0x5080000000770105, + 0x365a03807ff70306, + 0x001fb400fda007ed, + 0x5c47000000670202, + 0x5b6a03800ff70207, + 0xe24000000400000f, + 0x003fb400fda007ed, + 0x0408000000070101, + 0x5c9807800ff70000, + 0x5c47020000570101, + 0x001fbc00fde007ed, + 0xe32000000007000f, + 0x50b0000000070f00, + 0x50b0000000070f00, +/* 0x0620: rsq_norm */ + 0x0060b400e5a007ed, + 0x5c9807800ff70004, + 0x38a8003f00070b08, + 0x5c80000000870002, + 0x003c3401e1a01f0d, + 0x5c80000000470200, + 0x5b71040000470006, + 0x5b70020000670404, + 0x003c3401e1a00f0d, + 0x5c80000000470200, + 0x5b71040000470006, + 0x5b70020000670404, + 0x003c3401e1a00f0d, + 0x5c80000000470200, + 0x5b71040000470006, + 0x5b70020000670404, + 0x003c3401e1a00f0d, + 0x5c80000000470200, + 0x5b71040000470006, + 0x5b70020000670404, + 0x001fb401fda00f0d, + 0x38800041a0010404, + 0x5c98078000570001, + 0x5c98078000470000, + 0x001fbc00fde007ed, 0xe32000000007000f, 0x50b0000000070f00, 0x50b0000000070f00, diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index 2ec9af2d6a3..346a98228bd 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -129,7 +129,7 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i) bld.mkSplit(src, 4, i->getSrc(0)); int chip = prog->getTarget()->getChipset(); - if (chip >= NVISA_GK104_CHIPSET && (i->op == OP_RCP || chip < NVISA_GM107_CHIPSET)) { + if (chip >= NVISA_GK104_CHIPSET) { handleRCPRSQLib(i, src); return; } From 24bb2771b648340011ec3913bf15b2785c4c2ee2 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Fri, 25 Aug 2017 19:22:03 +0200 Subject: [PATCH 053/378] gallium: add PIPE_CAP_MAX_VARYINGS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some NVIDIA hardware can accept 128 fragment shader input components, but only have up to 124 varying-interpolated input components. We add a new cap to express this cleanly. For most drivers, this will have the same value as PIPE_SHADER_CAP_MAX_INPUTS for the fragment shader. Fixes KHR-GL45.limits.max_fragment_input_components Conflicts resolved by Dylan Signed-off-by: Karol Herbst [imirkin: rebased, improved docs/commit message] Signed-off-by: Ilia Mirkin Acked-by: Rob Clark Acked-by: Eric Anholt Reviewed-by: Marek Olšák Cc: 19.0 (cherry picked from commit 6010d7b8e8bee1bcea2b329cf6d3b44c5fc3ca66) --- src/gallium/docs/source/screen.rst | 4 ++++ src/gallium/drivers/etnaviv/etnaviv_screen.c | 3 +++ .../drivers/freedreno/freedreno_screen.c | 3 +++ src/gallium/drivers/i915/i915_screen.c | 2 ++ src/gallium/drivers/llvmpipe/lp_screen.c | 2 ++ .../drivers/nouveau/nv30/nv30_screen.c | 3 +++ .../drivers/nouveau/nv50/nv50_screen.c | 2 ++ .../drivers/nouveau/nvc0/nvc0_screen.c | 19 +++++++------------ src/gallium/drivers/r300/r300_screen.c | 3 +++ src/gallium/drivers/r600/r600_pipe.c | 3 +++ src/gallium/drivers/radeonsi/si_get.c | 3 +++ src/gallium/drivers/softpipe/sp_screen.c | 2 ++ src/gallium/drivers/svga/svga_screen.c | 2 ++ src/gallium/drivers/v3d/v3d_screen.c | 3 +++ src/gallium/drivers/vc4/vc4_screen.c | 3 +++ 
src/gallium/drivers/virgl/virgl_screen.c | 4 ++++ src/gallium/include/pipe/p_defines.h | 1 + src/mesa/state_tracker/st_extensions.c | 5 +---- 18 files changed, 51 insertions(+), 16 deletions(-) diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst index eaf492ce8b0..b927d014179 100644 --- a/src/gallium/docs/source/screen.rst +++ b/src/gallium/docs/source/screen.rst @@ -487,6 +487,10 @@ The integer capabilities: * ``PIPE_CAP_DEST_SURFACE_SRGB_CONTROL``: Indicates whether the drivers supports switching the format between sRGB and linear for a surface that is used as destination in draw and blit calls. +* ``PIPE_CAP_MAX_VARYINGS``: The maximum number of fragment shader + varyings. This will generally correspond to + ``PIPE_SHADER_CAP_MAX_INPUTS`` for the fragment shader, but in some + cases may be a smaller number. .. _pipe_capf: diff --git a/src/gallium/drivers/etnaviv/etnaviv_screen.c b/src/gallium/drivers/etnaviv/etnaviv_screen.c index fd320232528..35dcac1409b 100644 --- a/src/gallium/drivers/etnaviv/etnaviv_screen.c +++ b/src/gallium/drivers/etnaviv/etnaviv_screen.c @@ -360,6 +360,9 @@ etna_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: return 0; + case PIPE_CAP_MAX_VARYINGS: + return screen->specs.max_varyings; + case PIPE_CAP_PCI_GROUP: case PIPE_CAP_PCI_BUS: case PIPE_CAP_PCI_DEVICE: diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index e596a4e8462..c3b08ab0e0f 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -317,6 +317,9 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MAX_VIEWPORTS: return 1; + case PIPE_CAP_MAX_VARYINGS: + return 16; + case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY: /* manage the variants for these ourself, to avoid breaking precompile: */ 
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c index a7b4a43c015..78707c66e62 100644 --- a/src/gallium/drivers/i915/i915_screen.c +++ b/src/gallium/drivers/i915/i915_screen.c @@ -402,6 +402,8 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap) return 0; case PIPE_CAP_ENDIANNESS: return PIPE_ENDIAN_LITTLE; + case PIPE_CAP_MAX_VARYINGS: + return 10; case PIPE_CAP_VENDOR_ID: return 0x8086; diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c index c95016a6cbe..b55b4a3c4fe 100644 --- a/src/gallium/drivers/llvmpipe/lp_screen.c +++ b/src/gallium/drivers/llvmpipe/lp_screen.c @@ -310,6 +310,8 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) return 1; case PIPE_CAP_CLEAR_TEXTURE: return 1; + case PIPE_CAP_MAX_VARYINGS: + return 32; case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c index 2b69a8f6968..53551ebc037 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c @@ -79,6 +79,9 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return 2048; case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET: return 8 * 1024 * 1024; + case PIPE_CAP_MAX_VARYINGS: + return 8; + /* supported capabilities */ case PIPE_CAP_ANISOTROPIC_FILTER: case PIPE_CAP_POINT_SPRITE: diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c index 85cb016e3c2..8e65eaf50b1 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c @@ -156,6 +156,8 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return NV50_MAX_WINDOW_RECTANGLES; case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET: return 16 * 1024 * 1024; + case 
PIPE_CAP_MAX_VARYINGS: + return 15; /* supported caps */ case PIPE_CAP_TEXTURE_MIRROR_CLAMP: diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index 216fba49d9e..5a94865175d 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -182,6 +182,13 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return class_3d >= GM200_3D_CLASS ? 8 : 0; case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET: return 64 * 1024 * 1024; + case PIPE_CAP_MAX_VARYINGS: + /* NOTE: These only count our slots for GENERIC varyings. + * The address space may be larger, but the actual hard limit seems to be + * less than what the address space layout permits, so don't add TEXCOORD, + * COLOR, etc. here. + */ + return 0x1f0 / 16; /* supported caps */ case PIPE_CAP_TEXTURE_MIRROR_CLAMP: @@ -394,18 +401,6 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH: return 16; case PIPE_SHADER_CAP_MAX_INPUTS: - if (shader == PIPE_SHADER_VERTEX) - return 32; - /* NOTE: These only count our slots for GENERIC varyings. - * The address space may be larger, but the actual hard limit seems to be - * less than what the address space layout permits, so don't add TEXCOORD, - * COLOR, etc. here. - */ - if (shader == PIPE_SHADER_FRAGMENT) - return 0x1f0 / 16; - /* Actually this counts CLIPVERTEX, which occupies the last generic slot, - * and excludes 0x60 per-patch inputs. 
- */ return 0x200 / 16; case PIPE_SHADER_CAP_MAX_OUTPUTS: return 32; diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c index 19d3a1bae30..be0b475e5ef 100644 --- a/src/gallium/drivers/r300/r300_screen.c +++ b/src/gallium/drivers/r300/r300_screen.c @@ -304,6 +304,9 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: return 2048; + case PIPE_CAP_MAX_VARYINGS: + return 10; + case PIPE_CAP_VENDOR_ID: return 0x1002; case PIPE_CAP_DEVICE_ID: diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index ade1a94ab32..41a878ab9d2 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -536,6 +536,9 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_MAX_TEXEL_OFFSET: return 7; + case PIPE_CAP_MAX_VARYINGS: + return 32; + case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK: return PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600; case PIPE_CAP_ENDIANNESS: diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c index bb2d8c09eb1..f8ca02d4fcf 100644 --- a/src/gallium/drivers/radeonsi/si_get.c +++ b/src/gallium/drivers/radeonsi/si_get.c @@ -254,6 +254,9 @@ static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: return 30; + case PIPE_CAP_MAX_VARYINGS: + return 32; + case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK: return sscreen->info.chip_class <= VI ? 
PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_R600 : 0; diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c index 44e48cc7ee4..6931b52dc9f 100644 --- a/src/gallium/drivers/softpipe/sp_screen.c +++ b/src/gallium/drivers/softpipe/sp_screen.c @@ -265,6 +265,8 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) return 1; case PIPE_CAP_CLEAR_TEXTURE: return 1; + case PIPE_CAP_MAX_VARYINGS: + return TGSI_EXEC_MAX_INPUT_ATTRIBS; case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c index 95dde8b0897..6cb5a14f5b0 100644 --- a/src/gallium/drivers/svga/svga_screen.c +++ b/src/gallium/drivers/svga/svga_screen.c @@ -350,6 +350,8 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: return sws->have_sm4_1 ? 1 : 0; /* only single-channel textures */ + case PIPE_CAP_MAX_VARYINGS: + return sws->have_vgpu10 ? VGPU10_MAX_FS_INPUTS : 10; /* Unsupported features */ case PIPE_CAP_TEXTURE_MIRROR_CLAMP: diff --git a/src/gallium/drivers/v3d/v3d_screen.c b/src/gallium/drivers/v3d/v3d_screen.c index bed2c63a64d..c539daf02b9 100644 --- a/src/gallium/drivers/v3d/v3d_screen.c +++ b/src/gallium/drivers/v3d/v3d_screen.c @@ -177,6 +177,9 @@ v3d_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: return 4; + case PIPE_CAP_MAX_VARYINGS: + return V3D_MAX_FS_INPUTS / 4; + /* Texturing. 
*/ case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c index e7f7c82c271..acb4a1feb0d 100644 --- a/src/gallium/drivers/vc4/vc4_screen.c +++ b/src/gallium/drivers/vc4/vc4_screen.c @@ -178,6 +178,9 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) /* Note: Not supported in hardware, just faking it. */ return 5; + case PIPE_CAP_MAX_VARYINGS: + return 8; + case PIPE_CAP_VENDOR_ID: return 0x14E4; case PIPE_CAP_ACCELERATED: diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c index 42e0987e0c9..17fa5fc51cc 100644 --- a/src/gallium/drivers/virgl/virgl_screen.c +++ b/src/gallium/drivers/virgl/virgl_screen.c @@ -258,6 +258,10 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_TEXTURE_FLOAT_LINEAR: case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: return 1; /* TODO: need to introduce a hw-cap for this */ + case PIPE_CAP_MAX_VARYINGS: + if (vscreen->caps.caps.v1.glsl_level < 150) + return vscreen->caps.caps.v2.max_vertex_attribs; + return 32; case PIPE_CAP_TEXTURE_GATHER_SM5: case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: case PIPE_CAP_FAKE_SW_MSAA: diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h index 867d0cb5d74..96e8fbed1be 100644 --- a/src/gallium/include/pipe/p_defines.h +++ b/src/gallium/include/pipe/p_defines.h @@ -856,6 +856,7 @@ enum pipe_cap PIPE_CAP_QUERY_PIPELINE_STATISTICS_SINGLE, PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND, PIPE_CAP_DEST_SURFACE_SRGB_CONTROL, + PIPE_CAP_MAX_VARYINGS, }; /** diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c index 1e456d019d0..1de81487e67 100644 --- a/src/mesa/state_tracker/st_extensions.c +++ b/src/mesa/state_tracker/st_extensions.c @@ -362,10 +362,7 @@ void st_init_limits(struct pipe_screen *screen, c->Program[MESA_SHADER_VERTEX].MaxAttribs = 
MIN2(c->Program[MESA_SHADER_VERTEX].MaxAttribs, 16); - /* PIPE_SHADER_CAP_MAX_INPUTS for the FS specifies the maximum number - * of inputs. It's always 2 colors + N generic inputs. */ - c->MaxVarying = screen->get_shader_param(screen, PIPE_SHADER_FRAGMENT, - PIPE_SHADER_CAP_MAX_INPUTS); + c->MaxVarying = screen->get_param(screen, PIPE_CAP_MAX_VARYINGS); c->MaxVarying = MIN2(c->MaxVarying, MAX_VARYING); c->MaxGeometryOutputVertices = screen->get_param(screen, PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES); From ab70eccc75ecf90f82d5e69c4db39226774ddfa5 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Wed, 16 Jan 2019 15:17:31 +0100 Subject: [PATCH 054/378] st/mesa: require RGBA2, RGB4, and RGBA4 to be renderable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the driver does not support rendering to these formats but does support texturing, we can end up in incompatibilities between textures and renderbuffers that are then copied to. Fixes KHR-GL45.copy_image.functional on nvc0 Reviewed-by: Ilia Mirkin Reviewed-by: Marek Olšák Cc: 19.0 (cherry picked from commit cbd1ad6165f0aea7fb7c6fd1b36ad5317dd65cb7) --- src/mesa/state_tracker/st_format.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/mesa/state_tracker/st_format.c b/src/mesa/state_tracker/st_format.c index aacb8788287..febde1a5e97 100644 --- a/src/mesa/state_tracker/st_format.c +++ b/src/mesa/state_tracker/st_format.c @@ -2356,6 +2356,8 @@ st_ChooseTextureFormat(struct gl_context *ctx, GLenum target, bindings |= PIPE_BIND_DEPTH_STENCIL; else if (is_renderbuffer || internalFormat == 3 || internalFormat == 4 || internalFormat == GL_RGB || internalFormat == GL_RGBA || + internalFormat == GL_RGBA2 || + internalFormat == GL_RGB4 || internalFormat == GL_RGBA4 || internalFormat == GL_RGB8 || internalFormat == GL_RGBA8 || internalFormat == GL_BGRA || internalFormat == GL_RGB16F || From 2a97a3a8e708ec104c7ac18fff7c518be7487900 Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: 
Sat, 9 Feb 2019 22:36:49 -0500 Subject: [PATCH 055/378] nvc0: we have 16k-sized framebuffers, fix default scissors For some reason we don't use view volume clipping by default, and use scissors instead. These scissors were set to an 8k max fb size, while the driver advertises 16k-sized framebuffers. Signed-off-by: Ilia Mirkin Cc: (cherry picked from commit cc79a1483ffb7b91edc97c9870eadcab1e83b8f3) --- src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index 5a94865175d..553fe324bc7 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -1283,8 +1283,8 @@ nvc0_screen_create(struct nouveau_device *dev) for (i = 0; i < NVC0_MAX_VIEWPORTS; i++) { BEGIN_NVC0(push, NVC0_3D(SCISSOR_ENABLE(i)), 3); PUSH_DATA (push, 1); - PUSH_DATA (push, 8192 << 16); - PUSH_DATA (push, 8192 << 16); + PUSH_DATA (push, 16384 << 16); + PUSH_DATA (push, 16384 << 16); } #define MK_MACRO(m, n) i = nvc0_graph_set_macro(screen, m, i, sizeof(n), n); From fb3485bc9248a12f47b07b593f0a81d58cbb3155 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 1 Feb 2019 17:10:46 -0500 Subject: [PATCH 056/378] gallium/u_threaded: fix EXPLICIT_FLUSH for flush offsets > 0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: 18.3 19.0 Reviewed-by: Nicolai Hähnle (cherry picked from commit 4522f01d4ef9e8a1bb945ca28fba7649aae2171b) --- src/gallium/auxiliary/util/u_threaded_context.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c index 8e3bceae18d..b596c322918 100644 --- a/src/gallium/auxiliary/util/u_threaded_context.c +++ b/src/gallium/auxiliary/util/u_threaded_context.c @@ -1524,7 +1524,8 @@ 
tc_buffer_do_flush_region(struct threaded_context *tc, if (ttrans->staging) { struct pipe_box src_box; - u_box_1d(ttrans->offset + box->x % tc->map_buffer_alignment, + u_box_1d(ttrans->offset + ttrans->b.box.x % tc->map_buffer_alignment + + (box->x - ttrans->b.box.x), box->width, &src_box); /* Copy the staging buffer into the original one. */ From 62b3bd8cd1dfe95a0d7a874a7df25aa09215999a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 1 Feb 2019 17:10:46 -0500 Subject: [PATCH 057/378] radeonsi: fix EXPLICIT_FLUSH for flush offsets > 0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: 18.3 19.0 Reviewed-by: Nicolai Hähnle (cherry picked from commit 61c678d4bc91d2ff9ca8c9b183e08daf07d8d24c) --- src/gallium/drivers/radeonsi/si_buffer.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_buffer.c b/src/gallium/drivers/radeonsi/si_buffer.c index 03c11cb7013..3845e56a4b3 100644 --- a/src/gallium/drivers/radeonsi/si_buffer.c +++ b/src/gallium/drivers/radeonsi/si_buffer.c @@ -521,10 +521,13 @@ static void si_buffer_do_flush_region(struct pipe_context *ctx, struct si_resource *buf = si_resource(transfer->resource); if (stransfer->staging) { + unsigned src_offset = stransfer->offset + + transfer->box.x % SI_MAP_BUFFER_ALIGNMENT + + (box->x - transfer->box.x); + /* Copy the staging buffer into the original one. 
*/ si_copy_buffer((struct si_context*)ctx, transfer->resource, - &stransfer->staging->b.b, box->x, - stransfer->offset + box->x % SI_MAP_BUFFER_ALIGNMENT, + &stransfer->staging->b.b, box->x, src_offset, box->width); } From 75bec50c2a3cba0bb1dd2ac18599d94e81422609 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 4 Feb 2019 15:12:17 -0500 Subject: [PATCH 058/378] winsys/amdgpu: don't drop manually added fence dependencies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit wow, it's hard to believe that fence and syncobjs dependencies were ignored. Cc: 18.3 19.0 Reviewed-by: Nicolai Hähnle (cherry picked from commit ddfe209a0d61917e7b08100eeac82f4c20ca59e8) --- src/gallium/winsys/amdgpu/drm/amdgpu_cs.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c index b4e62acbae4..397e41c05f0 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c @@ -1219,8 +1219,6 @@ static void amdgpu_add_fence_dependencies_bo_lists(struct amdgpu_cs *acs) { struct amdgpu_cs_context *cs = acs->csc; - cs->num_fence_dependencies = 0; - amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_real_buffers, cs->real_buffers); amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_slab_buffers, cs->slab_buffers); amdgpu_add_fence_dependencies_bo_list(acs, cs->fence, cs->num_sparse_buffers, cs->sparse_buffers); From ab585817e6a34893674b65aba77252d7bff43cc5 Mon Sep 17 00:00:00 2001 From: Leo Liu Date: Fri, 8 Feb 2019 08:56:53 -0500 Subject: [PATCH 059/378] st/va: fix the incorrect max profiles report Add "PIPE_VIDEO_PROFILE_MAX" to enum, so it will make sure here will be correct when adding more profiles in the future. 
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109107 Signed-off-by: Leo Liu Reviewed-by: Boyuan Zhang Cc: 19.0 (cherry picked from commit 21cdb828a3f4d1e2f140fc7c81a4bc305b2f6b04) --- src/gallium/include/pipe/p_video_enums.h | 3 ++- src/gallium/state_trackers/va/context.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/gallium/include/pipe/p_video_enums.h b/src/gallium/include/pipe/p_video_enums.h index b5b8b062285..adbe7858d0f 100644 --- a/src/gallium/include/pipe/p_video_enums.h +++ b/src/gallium/include/pipe/p_video_enums.h @@ -70,7 +70,8 @@ enum pipe_video_profile PIPE_VIDEO_PROFILE_HEVC_MAIN_444, PIPE_VIDEO_PROFILE_JPEG_BASELINE, PIPE_VIDEO_PROFILE_VP9_PROFILE0, - PIPE_VIDEO_PROFILE_VP9_PROFILE2 + PIPE_VIDEO_PROFILE_VP9_PROFILE2, + PIPE_VIDEO_PROFILE_MAX }; /* Video caps, can be different for each codec/profile */ diff --git a/src/gallium/state_trackers/va/context.c b/src/gallium/state_trackers/va/context.c index 14e904ee490..47a5e7be230 100644 --- a/src/gallium/state_trackers/va/context.c +++ b/src/gallium/state_trackers/va/context.c @@ -175,7 +175,7 @@ VA_DRIVER_INIT_FUNC(VADriverContextP ctx) ctx->version_minor = 1; *ctx->vtable = vtable; *ctx->vtable_vpp = vtable_vpp; - ctx->max_profiles = PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH - PIPE_VIDEO_PROFILE_UNKNOWN; + ctx->max_profiles = PIPE_VIDEO_PROFILE_MAX - PIPE_VIDEO_PROFILE_UNKNOWN - 1; ctx->max_entrypoints = 2; ctx->max_attributes = 1; ctx->max_image_formats = VL_VA_MAX_IMAGE_FORMATS; From c55008e5a04e91ea03b2047c68121ee009704366 Mon Sep 17 00:00:00 2001 From: Leo Liu Date: Fri, 8 Feb 2019 09:48:23 -0500 Subject: [PATCH 060/378] st/va/vp9: set max reference as default of VP9 reference number If there is no information about number of render targets Signed-off-by: Leo Liu Reviewed-by: Boyuan Zhang Cc: 19.0 (cherry picked from commit a0a52a036708dbf5989778795fd67a79e3226289) --- src/gallium/state_trackers/va/picture_vp9.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) 
diff --git a/src/gallium/state_trackers/va/picture_vp9.c b/src/gallium/state_trackers/va/picture_vp9.c index c1ca54cd008..b5aca9a513c 100644 --- a/src/gallium/state_trackers/va/picture_vp9.c +++ b/src/gallium/state_trackers/va/picture_vp9.c @@ -28,6 +28,8 @@ #include "vl/vl_vlc.h" #include "va_private.h" +#define NUM_VP9_REFS 8 + void vlVaHandlePictureParameterBufferVP9(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf) { VADecPictureParameterBufferVP9 *vp9 = buf->data; @@ -79,8 +81,11 @@ void vlVaHandlePictureParameterBufferVP9(vlVaDriver *drv, vlVaContext *context, context->desc.vp9.picture_parameter.bit_depth = vp9->bit_depth; - for (i = 0 ; i < 8 ; i++) + for (i = 0 ; i < NUM_VP9_REFS ; i++) vlVaGetReferenceFrame(drv, vp9->reference_frames[i], &context->desc.vp9.ref[i]); + + if (!context->decoder && !context->templat.max_references) + context->templat.max_references = NUM_VP9_REFS; } void vlVaHandleSliceParameterBufferVP9(vlVaContext *context, vlVaBuffer *buf) From 9a5c8d2aab26eec839a554cc199798c61ca54788 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Sun, 10 Feb 2019 22:49:20 -0800 Subject: [PATCH 061/378] st/mesa: Limit GL_MAX_[NATIVE_]PROGRAM_PARAMETERS_ARB to 2048 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Piglit's vp-max-array test creates a vertex program containing a uniform array sized to the value of GL_MAX_NATIVE_PROGRAM_PARAMETERS_ARB. Mesa will then add additional state-var parameters for things like the MVP matrix. radeonsi currently exposes a value of 4096, derived from constant buffer upload size. This means the array will have 4096 elements, and the extra MVP state-vars would get a prog_src_register::Index of over 4096. Unfortunately, prog_src_register::Index is a signed 13-bit integer, so values beyond 4096 end up turning into negative numbers. Negative source indexes are only valid for relative addressing, so this ends up generating illegal IR. 
In prog_to_nir, this would cause an out of bounds array access. st_mesa_to_tgsi checks for a negative value, assumes it's bogus, and remaps it to parameter 0 in order to get something in-range. This isn't right - instead of reading the MVP matrix, it would read the first element of the vertex program's large array. But the test only checks that the program compiles, so we never noticed that it was broken. This patch limits the size of the program limits, with the understanding that we may need to generate additional state-vars internally. i965 has exposed 1024 for this limit for years, so I don't expect lowering it to 2048 will cause any practical problems for radeonsi or other drivers. Fixes vp-max-array with prog_to_nir.c. Cc: "19.0" Reviewed-by: Marek Olšák Reviewed-by: Eric Anholt (cherry picked from commit f45dd6d31b2ff46a082931386ccd0bf043cfad59) --- src/mesa/state_tracker/st_extensions.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c index 1de81487e67..92e512a0f1c 100644 --- a/src/mesa/state_tracker/st_extensions.c +++ b/src/mesa/state_tracker/st_extensions.c @@ -223,8 +223,13 @@ void st_init_limits(struct pipe_screen *screen, pc->MaxUniformComponents = MIN2(pc->MaxUniformComponents, MAX_UNIFORMS * 4); + /* For ARB programs, prog_src_register::Index is a signed 13-bit number. + * This gives us a limit of 4096 values - but we may need to generate + * internal values in addition to what the source program uses. So, we + * drop the limit one step lower, to 2048, to be safe. 
+ */ pc->MaxParameters = - pc->MaxNativeParameters = pc->MaxUniformComponents / 4; + pc->MaxNativeParameters = MIN2(pc->MaxUniformComponents / 4, 2048); pc->MaxInputComponents = screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_INPUTS) * 4; pc->MaxOutputComponents = From fbcd1ad42cf0ea64b08045d4d9fc932ea1073a8b Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Mon, 11 Feb 2019 10:17:52 +0100 Subject: [PATCH 062/378] radv: fix compiler issues with GCC 9 "The C standard says that compound literals which occur inside of the body of a function have automatic storage duration associated with the enclosing block. Older GCC releases were putting such compound literals into the scope of the whole function, so their lifetime actually ended at the end of containing function. This has been fixed in GCC 9. Code that relied on this extended lifetime needs to be fixed, move the compound literals to whatever scope they need to accessible in." Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109543 Cc: Signed-off-by: Samuel Pitoiset Reviewed-by: Gustaw Smolarczyk Reviewed-by: Bas Nieuwenhuizen (cherry picked from commit 129a9f4937b8f2adb4d37999677d748d816d611c) --- src/amd/vulkan/radv_meta_blit.c | 90 ++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 42 deletions(-) diff --git a/src/amd/vulkan/radv_meta_blit.c b/src/amd/vulkan/radv_meta_blit.c index ef690edb471..f3a8f6464b8 100644 --- a/src/amd/vulkan/radv_meta_blit.c +++ b/src/amd/vulkan/radv_meta_blit.c @@ -849,54 +849,60 @@ build_pipeline(struct radv_device *device, .subpass = 0, }; - switch(aspect) { - case VK_IMAGE_ASPECT_COLOR_BIT: - vk_pipeline_info.pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) { - .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, - .attachmentCount = 1, - .pAttachments = (VkPipelineColorBlendAttachmentState []) { - { .colorWriteMask = - VK_COLOR_COMPONENT_A_BIT | - VK_COLOR_COMPONENT_R_BIT | - VK_COLOR_COMPONENT_G_BIT | - 
VK_COLOR_COMPONENT_B_BIT }, + VkPipelineColorBlendStateCreateInfo color_blend_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = (VkPipelineColorBlendAttachmentState []) { + { + .colorWriteMask = VK_COLOR_COMPONENT_A_BIT | + VK_COLOR_COMPONENT_R_BIT | + VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT }, } }; + + VkPipelineDepthStencilStateCreateInfo depth_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, + .depthTestEnable = true, + .depthWriteEnable = true, + .depthCompareOp = VK_COMPARE_OP_ALWAYS, + }; + + VkPipelineDepthStencilStateCreateInfo stencil_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, + .depthTestEnable = false, + .depthWriteEnable = false, + .stencilTestEnable = true, + .front = { + .failOp = VK_STENCIL_OP_REPLACE, + .passOp = VK_STENCIL_OP_REPLACE, + .depthFailOp = VK_STENCIL_OP_REPLACE, + .compareOp = VK_COMPARE_OP_ALWAYS, + .compareMask = 0xff, + .writeMask = 0xff, + .reference = 0 + }, + .back = { + .failOp = VK_STENCIL_OP_REPLACE, + .passOp = VK_STENCIL_OP_REPLACE, + .depthFailOp = VK_STENCIL_OP_REPLACE, + .compareOp = VK_COMPARE_OP_ALWAYS, + .compareMask = 0xff, + .writeMask = 0xff, + .reference = 0 + }, + .depthCompareOp = VK_COMPARE_OP_ALWAYS, + }; + + switch(aspect) { + case VK_IMAGE_ASPECT_COLOR_BIT: + vk_pipeline_info.pColorBlendState = &color_blend_info; break; case VK_IMAGE_ASPECT_DEPTH_BIT: - vk_pipeline_info.pDepthStencilState = &(VkPipelineDepthStencilStateCreateInfo) { - .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, - .depthTestEnable = true, - .depthWriteEnable = true, - .depthCompareOp = VK_COMPARE_OP_ALWAYS, - }; + vk_pipeline_info.pDepthStencilState = &depth_info; break; case VK_IMAGE_ASPECT_STENCIL_BIT: - vk_pipeline_info.pDepthStencilState = &(VkPipelineDepthStencilStateCreateInfo) { - .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO, - 
.depthTestEnable = false, - .depthWriteEnable = false, - .stencilTestEnable = true, - .front = { - .failOp = VK_STENCIL_OP_REPLACE, - .passOp = VK_STENCIL_OP_REPLACE, - .depthFailOp = VK_STENCIL_OP_REPLACE, - .compareOp = VK_COMPARE_OP_ALWAYS, - .compareMask = 0xff, - .writeMask = 0xff, - .reference = 0 - }, - .back = { - .failOp = VK_STENCIL_OP_REPLACE, - .passOp = VK_STENCIL_OP_REPLACE, - .depthFailOp = VK_STENCIL_OP_REPLACE, - .compareOp = VK_COMPARE_OP_ALWAYS, - .compareMask = 0xff, - .writeMask = 0xff, - .reference = 0 - }, - .depthCompareOp = VK_COMPARE_OP_ALWAYS, - }; + vk_pipeline_info.pDepthStencilState = &stencil_info; break; default: unreachable("Unhandled aspect"); From 1f33f3cf3a4441d76708b18d124477f099c0d491 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Tue, 12 Feb 2019 09:01:50 +0100 Subject: [PATCH 063/378] radv: fix using LOAD_CONTEXT_REG with old GFX ME firmwares on GFX8 This fixes a critical issue. Cc: Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109575 Signed-off-by: Samuel Pitoiset Reviewed-by: Bas Nieuwenhuizen (cherry picked from commit 1b8983c25be19073c02fe9630e949be55f8280fa) --- src/amd/vulkan/radv_cmd_buffer.c | 5 ++--- src/amd/vulkan/radv_device.c | 5 +++++ src/amd/vulkan/radv_private.h | 3 +++ 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index 7f7f052986e..89d75da369e 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -1356,7 +1356,7 @@ radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset; - if (cmd_buffer->device->physical_device->rad_info.chip_class >= VI) { + if (cmd_buffer->device->physical_device->has_load_ctx_reg_pkt) { radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG, 3, 0)); radeon_emit(cs, va); radeon_emit(cs, va >> 32); @@ -1518,14 +1518,13 @@ radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, uint32_t reg = 
R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c; - if (cmd_buffer->device->physical_device->rad_info.chip_class >= VI) { + if (cmd_buffer->device->physical_device->has_load_ctx_reg_pkt) { radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG, 3, cmd_buffer->state.predicating)); radeon_emit(cs, va); radeon_emit(cs, va >> 32); radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2); radeon_emit(cs, 2); } else { - /* TODO: Figure out how to use LOAD_CONTEXT_REG on SI/CIK. */ radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating)); radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 34d93b262f8..68c6b647718 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -369,6 +369,11 @@ radv_physical_device_init(struct radv_physical_device *device, device->dcc_msaa_allowed = (device->instance->perftest_flags & RADV_PERFTEST_DCC_MSAA); + /* TODO: Figure out how to use LOAD_CONTEXT_REG on SI/CIK. */ + device->has_load_ctx_reg_pkt = device->rad_info.chip_class >= GFX9 || + (device->rad_info.chip_class >= VI && + device->rad_info.me_fw_feature >= 41); + radv_physical_device_init_mem_types(device); radv_fill_device_extension_table(device, &device->supported_extensions); diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 85c18906f84..8714052e4a0 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -306,6 +306,9 @@ struct radv_physical_device { /* Whether DCC should be enabled for MSAA textures. */ bool dcc_msaa_allowed; + /* Whether LOAD_CONTEXT_REG packets are supported. */ + bool has_load_ctx_reg_pkt; + /* This is the drivers on-disk cache used as a fallback as opposed to * the pipeline cache defined by apps. */ From d8534f931c22463b1fab20248c114d1feea2a48b Mon Sep 17 00:00:00 2001 From: "Juan A. 
Suarez Romero" Date: Tue, 12 Feb 2019 19:19:13 +0100 Subject: [PATCH 064/378] anv/cmd_buffer: check for NULL framebuffer This can happen when we record a VkCmdDraw in a secondary buffer that was created inheriting from the primary buffer, but with the framebuffer set to NULL in the VkCommandBufferInheritanceInfo. Vulkan 1.1.81 spec says that "the application must ensure (using scissor if necessary) that all rendering is contained in the render area [...] [which] must be contained within the framebuffer dimensions". While this should be done by the application, commit 465e5a86 added the clamp to the framebuffer size, in case the application does not do it. But this requires knowing the framebuffer dimensions. If we do not have a framebuffer at that moment, the best compromise we can do is to just apply the scissor as it is, and let the application ensure the rendering is contained in the render area. v2: do not clamp to framebuffer if there isn't a framebuffer v3 (Jason): - clamp earlier in the conditional - clamp to render area if command buffer is primary v4: clamp also x and y to render area (Jason) v5: rename used variables (Jason) Fixes: 465e5a86 ("anv: Clamp scissors to the framebuffer boundary") CC: Jason Ekstrand Reviewed-by: Jason Ekstrand (cherry picked from commit 1ad26f941792f07f226c054811be78b0c0ac9fce) --- src/intel/vulkan/gen7_cmd_buffer.c | 34 +++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/src/intel/vulkan/gen7_cmd_buffer.c b/src/intel/vulkan/gen7_cmd_buffer.c index 352892aee33..380283bdd56 100644 --- a/src/intel/vulkan/gen7_cmd_buffer.c +++ b/src/intel/vulkan/gen7_cmd_buffer.c @@ -70,12 +70,36 @@ gen7_cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer) }; const int max = 0xffff; + + uint32_t y_min = s->offset.y; + uint32_t x_min = s->offset.x; + uint32_t y_max = s->offset.y + s->extent.height - 1; + uint32_t x_max = s->offset.x + s->extent.width - 1; + + /* Do this math using int64_t so overflow
gets clamped correctly. */ + if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) { + y_min = clamp_int64((uint64_t) y_min, + cmd_buffer->state.render_area.offset.y, max); + x_min = clamp_int64((uint64_t) x_min, + cmd_buffer->state.render_area.offset.x, max); + y_max = clamp_int64((uint64_t) y_max, 0, + cmd_buffer->state.render_area.offset.y + + cmd_buffer->state.render_area.extent.height - 1); + x_max = clamp_int64((uint64_t) x_max, 0, + cmd_buffer->state.render_area.offset.x + + cmd_buffer->state.render_area.extent.width - 1); + } else if (fb) { + y_min = clamp_int64((uint64_t) y_min, 0, max); + x_min = clamp_int64((uint64_t) x_min, 0, max); + y_max = clamp_int64((uint64_t) y_max, 0, fb->height - 1); + x_max = clamp_int64((uint64_t) x_max, 0, fb->width - 1); + } + struct GEN7_SCISSOR_RECT scissor = { - /* Do this math using int64_t so overflow gets clamped correctly. */ - .ScissorRectangleYMin = clamp_int64(s->offset.y, 0, max), - .ScissorRectangleXMin = clamp_int64(s->offset.x, 0, max), - .ScissorRectangleYMax = clamp_int64((uint64_t) s->offset.y + s->extent.height - 1, 0, fb->height - 1), - .ScissorRectangleXMax = clamp_int64((uint64_t) s->offset.x + s->extent.width - 1, 0, fb->width - 1) + .ScissorRectangleYMin = y_min, + .ScissorRectangleXMin = x_min, + .ScissorRectangleYMax = y_max, + .ScissorRectangleXMax = x_max }; if (s->extent.width <= 0 || s->extent.height <= 0) { From 838baab47263389704c0e5bd635447f9ac3439c3 Mon Sep 17 00:00:00 2001 From: Dylan Baker Date: Wed, 13 Feb 2019 09:11:02 -0800 Subject: [PATCH 065/378] version: bump for 19.0-rc4 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 07977171c97..b61e5f2523a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -19.0.0-rc3 +19.0.0-rc4 From 6b484511104c5bddfd552cb83b1c7ee600004000 Mon Sep 17 00:00:00 2001 From: Oscar Blumberg Date: Tue, 12 Feb 2019 21:52:51 +0100 Subject: [PATCH 066/378] radeonsi: Fix guardband computation for large render targets 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stop using 12.12 quantization for viewports that are not contained in the lower 4k corner of the render target as the hardware needs to keep both absolute and relative coordinates representable. Signed-off-by: Marek Olšák Cc: 18.3 19.0 (cherry picked from commit 3c540e0a748844258e77254fc4f864f3b875fe18) --- .../drivers/radeonsi/si_state_viewport.c | 30 +++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c index dac90df1c4f..64bb956b200 100644 --- a/src/gallium/drivers/radeonsi/si_state_viewport.c +++ b/src/gallium/drivers/radeonsi/si_state_viewport.c @@ -185,6 +185,16 @@ static void si_emit_guardband(struct si_context *ctx) const unsigned hw_screen_offset_alignment = ctx->chip_class >= VI ? 16 : MAX2(ctx->screen->se_tile_repeat, 16); + /* Indexed by quantization modes */ + static unsigned max_viewport_size[] = {65535, 16383, 4095}; + + /* Ensure that the whole viewport stays representable in + * absolute coordinates. + * See comment in si_set_viewport_states. + */ + assert(vp_as_scissor.maxx <= max_viewport_size[vp_as_scissor.quant_mode] && + vp_as_scissor.maxy <= max_viewport_size[vp_as_scissor.quant_mode]); + hw_screen_offset_x = CLAMP(hw_screen_offset_x, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET); hw_screen_offset_y = CLAMP(hw_screen_offset_y, 0, MAX_PA_SU_HARDWARE_SCREEN_OFFSET); @@ -219,7 +229,6 @@ static void si_emit_guardband(struct si_context *ctx) * * The viewport range is [-max_viewport_size/2, max_viewport_size/2]. 
*/ - static unsigned max_viewport_size[] = {65535, 16383, 4095}; assert(vp_as_scissor.quant_mode < ARRAY_SIZE(max_viewport_size)); max_range = max_viewport_size[vp_as_scissor.quant_mode] / 2; left = (-max_range - vp.translate[0]) / vp.scale[0]; @@ -333,6 +342,8 @@ static void si_set_viewport_states(struct pipe_context *pctx, unsigned h = scissor->maxy - scissor->miny; unsigned max_extent = MAX2(w, h); + int max_corner = MAX2(scissor->maxx, scissor->maxy); + unsigned center_x = (scissor->maxx + scissor->minx) / 2; unsigned center_y = (scissor->maxy + scissor->miny) / 2; unsigned max_center = MAX2(center_x, center_y); @@ -358,7 +369,22 @@ static void si_set_viewport_states(struct pipe_context *pctx, if (ctx->family == CHIP_RAVEN) max_extent = 16384; /* Use QUANT_MODE == 16_8. */ - if (max_extent <= 1024) /* 4K scanline area for guardband */ + /* Another constraint is that all coordinates in the viewport + * are representable in fixed point with respect to the + * surface origin. + * + * It means that PA_SU_HARDWARE_SCREEN_OFFSET can't be given + * an offset that would make the upper corner of the viewport + * greater than the maximum representable number post + * quantization, ie 2^quant_bits. + * + * This does not matter for 14.10 and 16.8 formats since the + * offset is already limited at 8k, but it means we can't use + * 12.12 if we are drawing to some pixels outside the lower + * 4k x 4k of the render target. + */ + + if (max_extent <= 1024 && max_corner < 4096) /* 4K scanline area for guardband */ scissor->quant_mode = SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH; else if (max_extent <= 4096) /* 16K scanline area for guardband */ scissor->quant_mode = SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH; From 7ac15d9e4248ddafba8189e5dfcacddc1af367e2 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Tue, 12 Feb 2019 20:59:35 +0100 Subject: [PATCH 067/378] nir/opt_if: don't mark progress if nothing changes if we have something like this: loop { ... 
if x { break; } else { continue; } } opt_if_loop_last_continue returns true marking progress although nothing changes. Fixes: 5921a19d4b0c6 "nir: add if opt opt_if_loop_last_continue()" Signed-off-by: Karol Herbst Reviewed-by: Jason Ekstrand (cherry picked from commit 7e08f22a72cfc379902feeca3673db6aa344f782) --- src/compiler/nir/nir_opt_if.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/compiler/nir/nir_opt_if.c b/src/compiler/nir/nir_opt_if.c index c2f945d4d59..ba94807bb20 100644 --- a/src/compiler/nir/nir_opt_if.c +++ b/src/compiler/nir/nir_opt_if.c @@ -313,6 +313,13 @@ opt_if_loop_last_continue(nir_loop *loop) if (!then_ends_in_continue && !else_ends_in_continue) return false; + /* if the block after the if/else is empty we bail, otherwise we might end + * up looping forever + */ + if (&nif->cf_node == nir_cf_node_prev(&last_block->cf_node) && + exec_list_is_empty(&last_block->instr_list)) + return false; + /* Move the last block of the loop inside the last if-statement */ nir_cf_list tmp; nir_cf_extract(&tmp, nir_after_cf_node(if_node), From b4419fdba50856b5963f2728d819433130b30888 Mon Sep 17 00:00:00 2001 From: Dylan Baker Date: Tue, 12 Feb 2019 14:03:21 -0800 Subject: [PATCH 068/378] get-pick-list: Add --pretty=medium to the arguments for Cc patches Because none of them have been picked up for 19.0 due to this bug being reintroduced.
v2: - Fix fixes tags Fixes: e6b3a3b2014413366110f6deeced8095e7262b1d ("bin/get-pick-list.sh: handle "typod" usecase.") Fixes: fac10169bbad2da918ef07a62c01e0b321508cfe ("bin/get-pick-list.sh: prefix output with "[stable] "") Reviewed-by: Andres Gomez Reviewed-by: Emil Velikov (cherry picked from commit aff52dd2c61eb8d1b03cebbcca7e070ffa48afdf) --- bin/get-pick-list.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/get-pick-list.sh b/bin/get-pick-list.sh index 15f0e7d4a34..8fa4f438771 100755 --- a/bin/get-pick-list.sh +++ b/bin/get-pick-list.sh @@ -13,12 +13,12 @@ is_stable_nomination() { - git show --summary "$1" | grep -q -i -o "CC:.*mesa-stable" + git show --pretty=medium --summary "$1" | grep -q -i -o "CC:.*mesa-stable" } is_typod_nomination() { - git show --summary "$1" | grep -q -i -o "CC:.*mesa-dev" + git show --pretty=medium --summary "$1" | grep -q -i -o "CC:.*mesa-dev" } fixes= From e304007d87ad99968dfded7048a81c947ce4ffdc Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Wed, 13 Feb 2019 18:51:23 +0100 Subject: [PATCH 069/378] radv/winsys: fix BO list creation when RADV_DEBUG=allbos is set Fixes: 50fd253bd6e ("radv/winsys: Add priority handling during submit.") Signed-off-by: Samuel Pitoiset Reviewed-by: Bas Nieuwenhuizen (cherry picked from commit 5e18000d1b070ecf627138b7bff47ff8fef81576) --- src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c index 438ed594ede..49a86a72c31 100644 --- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c @@ -665,6 +665,7 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws, assert(num < ws->num_buffers); handles[num].bo_handle = bo->bo_handle; handles[num].bo_priority = bo->priority; + num++; } r = amdgpu_bo_list_create_raw(ws->dev, ws->num_buffers, From eba57c29b09ef8d6770d2ba71d22a3e1cca08146 
Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Tue, 12 Feb 2019 09:50:15 +0100 Subject: [PATCH 070/378] radv: always export gl_SampleMask when the fragment shader uses it For some reasons, this breaks trees rendering in Project Cars. Fixes: 85010585cde ("radv: only enable gl_SampleMask if MSAA is enabled too") Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109401 Signed-off-by: Samuel Pitoiset Reviewed-by: Bas Nieuwenhuizen (cherry picked from commit 334da034d8d91ca5a0a1bff8deaefd8ca762c42e) --- src/amd/vulkan/radv_pipeline.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 138e153f9a4..64090f6e3f2 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -3192,11 +3192,11 @@ radv_compute_db_shader_control(const struct radv_device *device, bool disable_rbplus = device->physical_device->has_rbplus && !device->physical_device->rbplus_allowed; - /* Do not enable the gl_SampleMask fragment shader output if MSAA is - * disabled. + /* It shouldn't be needed to export gl_SampleMask when MSAA is disabled + * but this appears to break Project Cars (DXVK). See + * https://bugs.freedesktop.org/show_bug.cgi?id=109401 */ - bool mask_export_enable = ms->num_samples > 1 && - ps->info.info.ps.writes_sample_mask; + bool mask_export_enable = ps->info.info.ps.writes_sample_mask; return S_02880C_Z_EXPORT_ENABLE(ps->info.info.ps.writes_z) | S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(ps->info.info.ps.writes_stencil) | From c19ce6e5e26b44e13376fba1a5ea4fd60ad9dd5b Mon Sep 17 00:00:00 2001 From: Dylan Baker Date: Wed, 13 Feb 2019 09:26:16 -0800 Subject: [PATCH 071/378] meson: Add dependency on genxml to anvil Currently the Intel "anvil" driver races with the generation of genxml files, while i965 has an explicit dependency. This patch adds the same dependency to anvil. 
Fixes: d1992255bb29054fa51763376d125183a9f602f ("meson: Add build Intel "anv" vulkan driver") Acked-by: Jason Ekstrand Acked-by: Lionel Landwerlin Reviewed-by: Eric Engestrom (cherry picked from commit 279060cd32dd673c6a5bf302ceac852f51a6c17c) --- src/intel/vulkan/meson.build | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/intel/vulkan/meson.build b/src/intel/vulkan/meson.build index 05fdeca8c25..ffd1985f251 100644 --- a/src/intel/vulkan/meson.build +++ b/src/intel/vulkan/meson.build @@ -1,4 +1,4 @@ -# Copyright © 2017-2018 Intel Corporation +# Copyright © 2017-2019 Intel Corporation # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -178,7 +178,10 @@ endif libanv_common = static_library( 'anv_common', - [libanv_files, anv_entrypoints, anv_extensions_c, anv_extensions_h, sha1_h], + [ + libanv_files, anv_entrypoints, anv_extensions_c, anv_extensions_h, sha1_h, + gen_xml_pack, + ], include_directories : [ inc_common, inc_intel, inc_compiler, inc_drm_uapi, inc_vulkan_util, inc_vulkan_wsi, From 59812ac38dfeb6a1e92d0c333aeb4f1ea0f89bd9 Mon Sep 17 00:00:00 2001 From: Ian Romanick Date: Wed, 13 Feb 2019 15:01:16 -0800 Subject: [PATCH 072/378] spirv: Add missing break Reviewed-by: Caio Marcelo de Oliveira Filho Reviewed-by: Jason Ekstrand Fixes: c6465fec0c5 ("spirv: add SpvCapabilityInt64Atomics") CID: 1442555 (cherry picked from commit 9a918050e0886d8c6d6adc0c687ffd30d8f70b40) --- src/compiler/spirv/spirv_to_nir.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/compiler/spirv/spirv_to_nir.c b/src/compiler/spirv/spirv_to_nir.c index 9bfe5805919..670bd1bcb89 100644 --- a/src/compiler/spirv/spirv_to_nir.c +++ b/src/compiler/spirv/spirv_to_nir.c @@ -3595,6 +3595,7 @@ vtn_handle_preamble_instruction(struct vtn_builder *b, SpvOp opcode, case SpvCapabilityInt64Atomics: spv_check_supported(int64_atomics, cap); + break; case 
SpvCapabilityInt8: spv_check_supported(int8, cap); From 103928528883dde66e48bc68f1cd740f65de69a7 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Thu, 14 Feb 2019 08:55:37 -0800 Subject: [PATCH 073/378] anv: Put MOCS in the correct location My patch to switch from struct-based MOCS to numeric MOCS accidentally divided all MOCS entries by 2 in the Vulkan driver. MOCS on Gen9+ is just an array index into a table. But in the hardware packets, the index starts at bit 1. So we need to shift it. Fixes: 0b44644ca68 (genxml: Consistently use a numeric "MOCS" field) Reviewed-by: Jason Ekstrand (cherry picked from commit 39aee57523a02552e7eae7df5da488e535aeb1eb) --- src/intel/vulkan/anv_private.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 110b2ccf023..5f6973ebe81 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1449,10 +1449,10 @@ _anv_combine_address(struct anv_batch *batch, void *location, */ /* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */ -#define GEN9_MOCS 2 +#define GEN9_MOCS (2 << 1) /* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */ -#define GEN9_EXTERNAL_MOCS 1 +#define GEN9_EXTERNAL_MOCS (1 << 1) /* Cannonlake MOCS defines are duplicates of Skylake MOCS defines. */ #define GEN10_MOCS GEN9_MOCS From f30fb27665d8341b7c71f601dafe091856b97874 Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: Wed, 13 Feb 2019 22:32:25 -0500 Subject: [PATCH 074/378] swr: set PIPE_CAP_MAX_VARYINGS correctly Unfortunately swr was missed in the original commit. The number of varyings should generally match up to what's reported as the shader caps for fragment inputs. 
Fixes: 6010d7b8e8be (gallium: add PIPE_CAP_MAX_VARYINGS) Signed-off-by: Ilia Mirkin Reviewed-by: Alok Hota Cc: 19.0 (cherry picked from commit 8c859367df95b74e7596f7fefffbdbf08bb8f8c7) --- src/gallium/drivers/swr/swr_screen.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/gallium/drivers/swr/swr_screen.cpp b/src/gallium/drivers/swr/swr_screen.cpp index de9008ddf6a..f390f9219c2 100644 --- a/src/gallium/drivers/swr/swr_screen.cpp +++ b/src/gallium/drivers/swr/swr_screen.cpp @@ -369,6 +369,8 @@ swr_get_param(struct pipe_screen *screen, enum pipe_cap param) return 32; case PIPE_CAP_MAX_SHADER_BUFFER_SIZE: return 1 << 27; + case PIPE_CAP_MAX_VARYINGS: + return 32; case PIPE_CAP_VENDOR_ID: return 0xFFFFFFFF; From 81e053b7578e540588e897b4d7670d0dfd8b0ab8 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Mon, 11 Feb 2019 22:39:45 -0600 Subject: [PATCH 075/378] intel/fs: Bail in optimize_extract_to_float if we have modifiers This fixes a bug in RuneScape where we were optimizing x >> 16 to an extract and then negating and converting to float. The NIR to fs pass was dropping the negate on the floor, breaking a geometry shader and causing it to render nothing. Fixes: 1f862e923cb "i965/fs: Optimize float conversions of byte/word..." Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109601 Tested-by: Lionel Landwerlin Reviewed-by: Matt Turner (cherry picked from commit 367b0ede4d9115aba772d6e46ec73642761f7ff6) --- src/intel/compiler/brw_fs_nir.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index f16627b8a64..d9cfaeb0631 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -512,6 +512,15 @@ fs_visitor::optimize_extract_to_float(nir_alu_instr *instr, src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16) return false; + /* If either opcode has source modifiers, bail.
+ * + * TODO: We can potentially handle source modifiers if both of the opcodes + * we're combining are signed integers. + */ + if (instr->src[0].abs || instr->src[0].negate || + src0->src[0].abs || src0->src[0].negate) + return false; + unsigned element = nir_src_as_uint(src0->src[1].src); /* Element type to extract.*/ From 4cf1a40f9a33517d9fc1bb4439528af49699661b Mon Sep 17 00:00:00 2001 From: Matt Turner Date: Mon, 11 Feb 2019 16:02:15 -0800 Subject: [PATCH 076/378] intel/compiler: Avoid propagating inequality cmods if types are different MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v2: Fix silly bug in logic. s/||/&&/ All but one of the affected shaders is in an Unreal4 demo. The other is in Tomb Raider. All of the cases that Ian investigated appear to be sequences like the following if (int(uint(some_float)) < 0) /* other relations too */ ... At least in Tomb Raider, it's not obvious that this sequence came from the original shader. In some of the Unreal demos, the shader contains code like if (int(uint(textureLod(...))) > 0) ... which explicitly generates the offending sequence. All Gen6+ platforms had similar results (Skylake shown): total instructions in shared programs: 15437170 -> 15437187 (<.01%) instructions in affected programs: 4492 -> 4509 (0.38%) helped: 0 HURT: 17 HURT stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1 HURT stats (rel) min: 0.05% max: 0.73% x̄: 0.66% x̃: 0.73% 95% mean confidence interval for instructions value: 1.00 1.00 95% mean confidence interval for instructions %-change: 0.57% 0.75% Instructions are HURT. 
total cycles in shared programs: 383007996 -> 383007992 (<.01%) cycles in affected programs: 20542 -> 20538 (-0.02%) helped: 6 HURT: 7 helped stats (abs) min: 2 max: 6 x̄: 5.33 x̃: 6 helped stats (rel) min: 0.11% max: 0.36% x̄: 0.32% x̃: 0.36% HURT stats (abs) min: 4 max: 4 x̄: 4.00 x̃: 4 HURT stats (rel) min: 0.27% max: 0.27% x̄: 0.27% x̃: 0.27% 95% mean confidence interval for cycles value: -3.30 2.69 95% mean confidence interval for cycles %-change: -0.19% 0.19% Inconclusive result (value mean confidence interval includes 0). No changes on Iron Lake or GM45. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109404 Reviewed-by: Ian Romanick Reviewed-by: Jason Ekstrand Tested-by: nagrigoriadis@gmail.com Tested-by: Danylo Piliaiev (cherry picked from commit 2dff9a66b629834bffad47e7a9025e0f1de5ffc3) --- src/intel/compiler/brw_fs_cmod_propagation.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/intel/compiler/brw_fs_cmod_propagation.cpp b/src/intel/compiler/brw_fs_cmod_propagation.cpp index 5fb522f810f..b58730fbbe5 100644 --- a/src/intel/compiler/brw_fs_cmod_propagation.cpp +++ b/src/intel/compiler/brw_fs_cmod_propagation.cpp @@ -255,6 +255,13 @@ opt_cmod_propagation_local(const gen_device_info *devinfo, bblock_t *block) if (inst->opcode == BRW_OPCODE_AND) break; + /* Not safe to use inequality operators if the types are different + */ + if (scan_inst->dst.type != inst->src[0].type && + inst->conditional_mod != BRW_CONDITIONAL_Z && + inst->conditional_mod != BRW_CONDITIONAL_NZ) + break; + /* Comparisons operate differently for ints and floats */ if (scan_inst->dst.type != inst->dst.type && (scan_inst->dst.type == BRW_REGISTER_TYPE_F || From 385b7362385b4f648508768c20b057eed2022409 Mon Sep 17 00:00:00 2001 From: Matt Turner Date: Mon, 11 Feb 2019 13:41:32 -0800 Subject: [PATCH 077/378] intel/compiler/test: Add unit test for mismatched signedness comparison v2 (idr): Move adding the test to after adding the fix. 
Reordering the two commits prevents possible headaches for git-bisect with scripts that always do 'ninja check'. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109404 Reviewed-by: Ian Romanick (cherry picked from commit ac21dd4aee450b2a4bc63adb05356b07abba2ff6) --- .../compiler/test_fs_cmod_propagation.cpp | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/intel/compiler/test_fs_cmod_propagation.cpp b/src/intel/compiler/test_fs_cmod_propagation.cpp index 659fbb2d1bc..4215af1fb02 100644 --- a/src/intel/compiler/test_fs_cmod_propagation.cpp +++ b/src/intel/compiler/test_fs_cmod_propagation.cpp @@ -889,3 +889,35 @@ TEST_F(cmod_propagation_test, subtract_delete_compare_derp) EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 1)->opcode); EXPECT_EQ(BRW_PREDICATE_NORMAL, instruction(block0, 1)->predicate); } + +TEST_F(cmod_propagation_test, signed_unsigned_comparison_mismatch) +{ + const fs_builder &bld = v->bld; + fs_reg dest0 = v->vgrf(glsl_type::int_type); + fs_reg src0 = v->vgrf(glsl_type::int_type); + src0.type = BRW_REGISTER_TYPE_W; + + bld.ASR(dest0, negate(src0), brw_imm_d(15)); + bld.CMP(bld.null_reg_ud(), retype(dest0, BRW_REGISTER_TYPE_UD), + brw_imm_ud(0u), BRW_CONDITIONAL_LE); + + /* = Before = + * 0: asr(8) dest:D -src0:W 15D + * 1: cmp.le.f0(8) null:UD dest:UD 0UD + * + * = After = + * (no changes) + */ + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_FALSE(cmod_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + EXPECT_EQ(BRW_OPCODE_ASR, instruction(block0, 0)->opcode); + EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); + EXPECT_EQ(BRW_CONDITIONAL_LE, instruction(block0, 1)->conditional_mod); +} From 69ebf4569ae0d2c7d9b1dc04de145329a1684727 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Fri, 15 Feb 2019 14:52:20 -0800 Subject: [PATCH 078/378] nir: Don't reassociate add/mul chains containing 
only constants The idea here is to reassociate a * (b * c) into (a * c) * b, when b is a non-constant value, but a and c are constants, allowing them to be combined. But nothing was enforcing that 'b' must be non-constant, which meant that running opt_algebraic in a loop would never terminate if the IR contained non-folded constant expressions like 256 * 0.5 * 2. Normally, we call constant folding in such a loop too, but IMO it's better for nir_opt_algebraic to be robust and not rely on that. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109581 Fixes: 32e266a9a58 i965: Compile fp64 funcs only if we do not have 64-bit hardware support Reviewed-by: Ian Romanick (cherry picked from commit 535251487ba56c4fd98465c4682881c2b9734242) --- src/compiler/nir/nir_opt_algebraic.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index 75a3d2ad238..636f3672708 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -618,11 +618,11 @@ # Reassociate constants in add/mul chains so they can be folded together. # For now, we mostly only handle cases where the constants are separated by # a single non-constant. We could do better eventually. 
- (('~fmul', '#a', ('fmul', b, '#c')), ('fmul', ('fmul', a, c), b)), - (('imul', '#a', ('imul', b, '#c')), ('imul', ('imul', a, c), b)), - (('~fadd', '#a', ('fadd', b, '#c')), ('fadd', ('fadd', a, c), b)), - (('~fadd', '#a', ('fneg', ('fadd', b, '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))), - (('iadd', '#a', ('iadd', b, '#c')), ('iadd', ('iadd', a, c), b)), + (('~fmul', '#a', ('fmul', 'b(is_not_const)', '#c')), ('fmul', ('fmul', a, c), b)), + (('imul', '#a', ('imul', 'b(is_not_const)', '#c')), ('imul', ('imul', a, c), b)), + (('~fadd', '#a', ('fadd', 'b(is_not_const)', '#c')), ('fadd', ('fadd', a, c), b)), + (('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))), + (('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)), # By definition... (('bcsel', ('ige', ('find_lsb', a), 0), ('find_lsb', a), -1), ('find_lsb', a)), From 0b9f6ebfbbb792e6b592044a1577a3e3fae6e50c Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Fri, 15 Feb 2019 18:02:52 +0100 Subject: [PATCH 079/378] radv: write the alpha channel of MRT0 when alpha coverage is enabled Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109597 Cc: 18.3 19.0 Signed-off-by: Samuel Pitoiset Reviewed-by: Bas Nieuwenhuizen (cherry picked from commit 0d8f09629377da9cf48ab4315574d69fdef5369d) --- src/amd/vulkan/radv_pipeline.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 64090f6e3f2..efd675c4dc4 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -511,6 +511,13 @@ radv_pipeline_compute_spi_color_formats(struct radv_pipeline *pipeline, if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) { cf = V_028714_SPI_SHADER_ZERO; + + if (blend->need_src_alpha & (1 << i)) { + /* Write the alpha channel of MRT0 when alpha coverage is + * enabled because the depth attachment needs it. 
+ */ + col_format |= V_028714_SPI_SHADER_32_ABGR; + } } else { struct radv_render_pass_attachment *attachment = pass->attachments + subpass->color_attachments[i].attachment; bool blend_enable = @@ -689,6 +696,7 @@ radv_pipeline_init_blend_state(struct radv_pipeline *pipeline, if (vkms && vkms->alphaToCoverageEnable) { blend.db_alpha_to_mask |= S_028B70_ALPHA_TO_MASK_ENABLE(1); + blend.need_src_alpha |= 0x1; } blend.cb_target_mask = 0; From 110500cc8ab5dfc2b0d5c464155f85b1bccba779 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Mon, 18 Feb 2019 17:42:10 +0100 Subject: [PATCH 080/378] radv: fix writing the alpha channel of MRT0 when alpha coverage is enabled This version is better and safer. Cc: 18.3 19.0 Signed-off-by: Samuel Pitoiset Reviewed-by: Bas Nieuwenhuizen (cherry picked from commit 47616810ed7cfce21d239391131ad9a5ef558b52) --- src/amd/vulkan/radv_pipeline.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index efd675c4dc4..03e9369e094 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -511,13 +511,6 @@ radv_pipeline_compute_spi_color_formats(struct radv_pipeline *pipeline, if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) { cf = V_028714_SPI_SHADER_ZERO; - - if (blend->need_src_alpha & (1 << i)) { - /* Write the alpha channel of MRT0 when alpha coverage is - * enabled because the depth attachment needs it. 
- */ - col_format |= V_028714_SPI_SHADER_32_ABGR; - } } else { struct radv_render_pass_attachment *attachment = pass->attachments + subpass->color_attachments[i].attachment; bool blend_enable = @@ -531,6 +524,14 @@ radv_pipeline_compute_spi_color_formats(struct radv_pipeline *pipeline, col_format |= cf << (4 * i); } + if (!col_format && blend->need_src_alpha & (1 << 0)) { + /* When a subpass doesn't have any color attachments, write the + * alpha channel of MRT0 when alpha coverage is enabled because + * the depth attachment needs it. + */ + col_format |= V_028714_SPI_SHADER_32_ABGR; + } + /* If the i-th target format is set, all previous target formats must * be non-zero to avoid hangs. */ From ba24ca67f6d5532037c6c4a4528cf22a05872109 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Tue, 12 Feb 2019 14:39:40 -0800 Subject: [PATCH 081/378] v3d: Use the early_fragment_tests flag for the shader's disable-EZ field. Apparently we need disable-EZ flagged, not just "does Z writes". Fixes dEQP-GLES31.functional.image_load_store.early_fragment_tests.no_early_fragment_tests_depth_fbo on 7278, even though it passed in simulation. 
Signed-off-by: Eric Anholt Fixes: 051a41d3d56e ("v3d: Add support for the early_fragment_tests flag.") (cherry picked from commit cd5e0b272919a654079620adecd2abe24ff51233) --- src/broadcom/compiler/nir_to_vir.c | 3 +++ src/broadcom/compiler/v3d_compiler.h | 3 ++- src/broadcom/compiler/vir.c | 25 +++++++++++-------------- src/gallium/drivers/v3d/v3dx_draw.c | 9 +++++++-- 4 files changed, 23 insertions(+), 17 deletions(-) diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index a3ff2b0e8b4..a6d3cad21a5 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -1158,7 +1158,9 @@ emit_frag_end(struct v3d_compile *c) inst->src[vir_get_implicit_uniform_src(inst)] = vir_uniform_ui(c, tlb_specifier | 0xffffff00); + c->writes_z = true; } else if (c->s->info.fs.uses_discard || + !c->s->info.fs.early_fragment_tests || c->fs_key->sample_alpha_to_coverage || !has_any_tlb_color_write) { /* Emit passthrough Z if it needed to be delayed until shader @@ -1188,6 +1190,7 @@ emit_frag_end(struct v3d_compile *c) inst->src[vir_get_implicit_uniform_src(inst)] = vir_uniform_ui(c, tlb_specifier | 0xffffff00); + c->writes_z = true; } /* XXX: Performance improvement: Merge Z write and color writes TLB diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h index 127b04136d1..3f811f2b808 100644 --- a/src/broadcom/compiler/v3d_compiler.h +++ b/src/broadcom/compiler/v3d_compiler.h @@ -519,6 +519,7 @@ struct v3d_compile { uint32_t centroid_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)]; bool uses_center_w; + bool writes_z; struct v3d_ubo_range *ubo_ranges; bool *ubo_range_used; @@ -716,7 +717,7 @@ struct v3d_fs_prog_data { uint32_t centroid_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1]; bool writes_z; - bool discard; + bool disable_ez; bool uses_center_w; }; diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c index 10105fbd861..20f7004149c 100644 --- a/src/broadcom/compiler/vir.c +++ 
b/src/broadcom/compiler/vir.c @@ -777,21 +777,9 @@ v3d_fs_set_prog_data(struct v3d_compile *c, struct v3d_fs_prog_data *prog_data) { v3d_set_fs_prog_data_inputs(c, prog_data); - prog_data->writes_z = (c->s->info.outputs_written & - (1 << FRAG_RESULT_DEPTH)); - prog_data->discard = (c->s->info.fs.uses_discard || - c->fs_key->sample_alpha_to_coverage); + prog_data->writes_z = c->writes_z; + prog_data->disable_ez = !c->s->info.fs.early_fragment_tests; prog_data->uses_center_w = c->uses_center_w; - - /* If the shader has some side effects and hasn't allowed early - * fragment tests, disable them. - */ - if (!c->s->info.fs.early_fragment_tests && - (c->s->info.num_images || - c->s->info.num_ssbos || - c->s->info.num_abos)) { - prog_data->discard = true; - } } static void @@ -888,6 +876,15 @@ v3d_nir_lower_fs_early(struct v3d_compile *c) { if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb) v3d_fixup_fs_output_types(c); + + /* If the shader has no non-TLB side effects, we can promote it to + * enabling early_fragment_tests even if the user didn't. + */ + if (!(c->s->info.num_images || + c->s->info.num_ssbos || + c->s->info.num_abos)) { + c->s->info.fs.early_fragment_tests = true; + } } static void diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c index 2700208e388..fba0c6733ea 100644 --- a/src/gallium/drivers/v3d/v3dx_draw.c +++ b/src/gallium/drivers/v3d/v3dx_draw.c @@ -203,8 +203,13 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, * shader needs to write the Z value (even just discards). */ shader.fragment_shader_does_z_writes = - (v3d->prog.fs->prog_data.fs->writes_z || - v3d->prog.fs->prog_data.fs->discard); + v3d->prog.fs->prog_data.fs->writes_z; + /* Set if the EZ test must be disabled (due to shader side + * effects and the early_z flag not being present in the + * shader). 
+ */ + shader.turn_off_early_z_test = + v3d->prog.fs->prog_data.fs->disable_ez; shader.fragment_shader_uses_real_pixel_centre_w_in_addition_to_centroid_w2 = v3d->prog.fs->prog_data.fs->uses_center_w; From d73e48b63ff6011b39da7c3be139f1e4bff9dd76 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Thu, 14 Feb 2019 09:42:38 -0800 Subject: [PATCH 082/378] v3d: Fix the check for "is the last thrsw inside control flow" The execute.file check used to be good enough, until I stopped setting up the execute mask for uniform ifs. No known tests fixed, noticed while doing a refactor. Fixes: 080506057310 ("v3d: Handle dynamically uniform IF statements with uniform control flow.") (cherry picked from commit 441294962cd65d44febdbe9ef0b0d99b5d27cec8) --- src/broadcom/compiler/nir_to_vir.c | 24 ++++++++++++++++-------- src/broadcom/compiler/v3d_compiler.h | 1 + 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c index a6d3cad21a5..bd19bb9b0b6 100644 --- a/src/broadcom/compiler/nir_to_vir.c +++ b/src/broadcom/compiler/nir_to_vir.c @@ -121,7 +121,7 @@ vir_emit_thrsw(struct v3d_compile *c) */ c->last_thrsw = vir_NOP(c); c->last_thrsw->qpu.sig.thrsw = true; - c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL); + c->last_thrsw_at_top_level = !c->in_control_flow; } static uint32_t @@ -2106,10 +2106,10 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) else else_block = vir_new_block(c); - bool was_top_level = false; + bool was_uniform_control_flow = false; if (c->execute.file == QFILE_NULL) { c->execute = vir_MOV(c, vir_uniform_ui(c, 0)); - was_top_level = true; + was_uniform_control_flow = true; } /* Set up the flags for the IF condition (taking the THEN branch). */ @@ -2125,7 +2125,7 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) /* Update the flags+cond to mean "Taking the ELSE branch (!cond) and * was previously active (execute Z) for updating the exec flags. 
*/ - if (was_top_level) { + if (was_uniform_control_flow) { cond = v3d_qpu_cond_invert(cond); } else { struct qinst *inst = vir_MOV_dest(c, vir_reg(QFILE_NULL, 0), @@ -2179,7 +2179,7 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) vir_link_blocks(c->cur_block, after_block); vir_set_emit_block(c, after_block); - if (was_top_level) + if (was_uniform_control_flow) c->execute = c->undef; else ntq_activate_execute_for_block(c); @@ -2188,12 +2188,15 @@ ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) static void ntq_emit_if(struct v3d_compile *c, nir_if *nif) { + bool was_in_control_flow = c->in_control_flow; + c->in_control_flow = true; if (c->execute.file == QFILE_NULL && nir_src_is_dynamically_uniform(nif->condition)) { ntq_emit_uniform_if(c, nif); } else { ntq_emit_nonuniform_if(c, nif); } + c->in_control_flow = was_in_control_flow; } static void @@ -2270,10 +2273,13 @@ static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list); static void ntq_emit_loop(struct v3d_compile *c, nir_loop *loop) { - bool was_top_level = false; + bool was_in_control_flow = c->in_control_flow; + c->in_control_flow = true; + + bool was_uniform_control_flow = false; if (c->execute.file == QFILE_NULL) { c->execute = vir_MOV(c, vir_uniform_ui(c, 0)); - was_top_level = true; + was_uniform_control_flow = true; } struct qblock *save_loop_cont_block = c->loop_cont_block; @@ -2310,7 +2316,7 @@ ntq_emit_loop(struct v3d_compile *c, nir_loop *loop) vir_link_blocks(c->cur_block, c->loop_break_block); vir_set_emit_block(c, c->loop_break_block); - if (was_top_level) + if (was_uniform_control_flow) c->execute = c->undef; else ntq_activate_execute_for_block(c); @@ -2319,6 +2325,8 @@ ntq_emit_loop(struct v3d_compile *c, nir_loop *loop) c->loop_cont_block = save_loop_cont_block; c->loops++; + + c->in_control_flow = was_in_control_flow; } static void diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h index 
3f811f2b808..671aba3c551 100644 --- a/src/broadcom/compiler/v3d_compiler.h +++ b/src/broadcom/compiler/v3d_compiler.h @@ -532,6 +532,7 @@ struct v3d_compile { * yes, otherwise a block number + 1 that the channel jumped to. */ struct qreg execute; + bool in_control_flow; struct qreg line_x, point_x, point_y; From 1b093b567f8aa0dc24530006cc0aeffa90ecd3ab Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Thu, 6 Dec 2018 12:10:41 +0000 Subject: [PATCH 083/378] radv: bitcast 16-bit outputs to integers 16-bit outputs are stored as 16-bit floats in the outputs array, so they have to be bitcast. Fixes: b722b29f10d ('radv: add support for 16bit input/output') Signed-off-by: Rhys Perry Reviewed-by: Samuel Pitoiset (cherry picked from commit 64065aa504c4872a15f7b0894b6037a6b2bcae65) --- src/amd/vulkan/radv_nir_to_llvm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index e80938527e5..fa6bfe6b750 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -2365,7 +2365,7 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, if (is_16bit) { for (unsigned chan = 0; chan < 4; chan++) values[chan] = LLVMBuildZExt(ctx->ac.builder, - values[chan], + ac_to_integer(&ctx->ac, values[chan]), ctx->ac.i32, ""); } break; @@ -2376,7 +2376,7 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, if (is_16bit) { for (unsigned chan = 0; chan < 4; chan++) values[chan] = LLVMBuildSExt(ctx->ac.builder, - values[chan], + ac_to_integer(&ctx->ac, values[chan]), ctx->ac.i32, ""); } break; From c7fc61d15b5db5340dfe3eca633c8fc86d81b0d4 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Thu, 6 Dec 2018 12:11:00 +0000 Subject: [PATCH 084/378] radv: ensure export arguments are always float So that the signature is correct and consistent, the inputs to a export intrinsic should always be 32-bit floats. 
This and the previous commit fix a large number of crashes from dEQP-VK.spirv_assembly.instruction.graphics.16bit_storage.input_output_int_* tests Fixes: b722b29f10d ('radv: add support for 16bit input/output') Signed-off-by: Rhys Perry Reviewed-by: Samuel Pitoiset (cherry picked from commit 0ca550e01ac55c67c2deef50f5cb750a0181352b) --- src/amd/vulkan/radv_nir_to_llvm.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index fa6bfe6b750..cfaeaabf539 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -2429,12 +2429,8 @@ si_llvm_init_export_args(struct radv_shader_context *ctx, } else memcpy(&args->out[0], values, sizeof(values[0]) * 4); - for (unsigned i = 0; i < 4; ++i) { - if (!(args->enabled_channels & (1 << i))) - continue; - + for (unsigned i = 0; i < 4; ++i) args->out[i] = ac_to_float(&ctx->ac, args->out[i]); - } } static void From 0a2e4b02ca6f0bf4b198b10a2752fb33064f44ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tapani=20P=C3=A4lli?= Date: Thu, 14 Feb 2019 09:02:31 +0200 Subject: [PATCH 085/378] mesa: return NULL if we exceed MaxColorAttachments in get_fb_attachment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes invalid access to Attachment array which would occur if caller would exceed MaxColorAttachments. In practice this should not ever happen because DiscardFramebufferEXT specifies only GL_COLOR_ATTACHMENT0 to be valid and InvalidateFramebuffer will error out before but this should make coverity happy.
v2: const, remove _EXT (Ian) CID: 1442559 Fixes: 0c42b5f3cb9 "mesa: wire up InvalidateFramebuffer" Signed-off-by: Tapani Pälli Reviewed-by: Ian Romanick (cherry picked from commit 9762a9f89380a8070654a80e73d927297c29da35) --- src/mesa/main/fbobject.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c index 87c33be7854..341fd93efc6 100644 --- a/src/mesa/main/fbobject.c +++ b/src/mesa/main/fbobject.c @@ -4663,8 +4663,12 @@ get_fb_attachment(struct gl_context *ctx, struct gl_framebuffer *fb, case GL_COLOR_ATTACHMENT12: case GL_COLOR_ATTACHMENT13: case GL_COLOR_ATTACHMENT14: - case GL_COLOR_ATTACHMENT15: - return &fb->Attachment[BUFFER_COLOR0 + attachment - GL_COLOR_ATTACHMENT0]; + case GL_COLOR_ATTACHMENT15: { + const unsigned i = attachment - GL_COLOR_ATTACHMENT0; + if (i >= ctx->Const.MaxColorAttachments) + return NULL; + return &fb->Attachment[BUFFER_COLOR0 + i]; + } case GL_DEPTH: case GL_DEPTH_ATTACHMENT: case GL_DEPTH_STENCIL_ATTACHMENT: From 2e7833ad916c493969d00871cdf56db4407b80eb Mon Sep 17 00:00:00 2001 From: Dylan Baker Date: Tue, 19 Feb 2019 11:15:18 -0800 Subject: [PATCH 086/378] Version: update to 19.0-rc5 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index b61e5f2523a..f1b47395643 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -19.0.0-rc4 +19.0.0-rc5 From a5f16a42a53932514763a73004cb133a490546f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 19 Feb 2019 17:20:01 -0500 Subject: [PATCH 087/378] radeonsi: add driconf option radeonsi_enable_nir Cc: 18.3 19.0 Reviewed-by: Timothy Arceri (cherry picked from commit ccbfe44e5ff88a19451701561f752c6046677122) --- src/gallium/drivers/radeonsi/driinfo_radeonsi.h | 1 + src/gallium/drivers/radeonsi/si_pipe.c | 3 ++- src/util/xmlpool/t_options.h | 5 +++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/radeonsi/driinfo_radeonsi.h 
b/src/gallium/drivers/radeonsi/driinfo_radeonsi.h index cbf3bb01fb3..edf8edba035 100644 --- a/src/gallium/drivers/radeonsi/driinfo_radeonsi.h +++ b/src/gallium/drivers/radeonsi/driinfo_radeonsi.h @@ -12,4 +12,5 @@ DRI_CONF_SECTION_END DRI_CONF_SECTION_DEBUG DRI_CONF_RADEONSI_CLEAR_DB_CACHE_BEFORE_CLEAR("false") + DRI_CONF_RADEONSI_ENABLE_NIR("false") DRI_CONF_SECTION_END diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 41d395d7d3f..2656bdc2068 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -865,7 +865,8 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws, sscreen->debug_flags |= DBG(FS_CORRECT_DERIVS_AFTER_KILL); if (driQueryOptionb(config->options, "radeonsi_enable_sisched")) sscreen->debug_flags |= DBG(SI_SCHED); - + if (driQueryOptionb(config->options, "radeonsi_enable_nir")) + sscreen->debug_flags |= DBG(NIR); if (sscreen->debug_flags & DBG(INFO)) ac_print_gpu_info(&sscreen->info); diff --git a/src/util/xmlpool/t_options.h b/src/util/xmlpool/t_options.h index 80ddf0e203e..f0ef84ab69b 100644 --- a/src/util/xmlpool/t_options.h +++ b/src/util/xmlpool/t_options.h @@ -347,3 +347,8 @@ DRI_CONF_OPT_END DRI_CONF_OPT_BEGIN_B(radeonsi_zerovram, def) \ DRI_CONF_DESC(en,"Zero all vram allocations") \ DRI_CONF_OPT_END + +#define DRI_CONF_RADEONSI_ENABLE_NIR(def) \ +DRI_CONF_OPT_BEGIN_B(radeonsi_enable_nir, def) \ + DRI_CONF_DESC(en,gettext("Enable NIR")) \ +DRI_CONF_OPT_END From c837cd6546c50c212378b48fefcd30fc6c22df05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 19 Feb 2019 17:21:20 -0500 Subject: [PATCH 088/378] radeonsi: always enable NIR for Civilization 6 to fix corruption Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104602 Cc: 18.3 19.0 Reviewed-by: Timothy Arceri (cherry picked from commit ae21bdf47cacafdf69b904cbf3e433cbe0cccb84) --- src/util/00-mesa-defaults.conf | 3 +++ 1 file changed, 3 insertions(+) 
diff --git a/src/util/00-mesa-defaults.conf b/src/util/00-mesa-defaults.conf index cb0e6e659e2..81f23c97941 100644 --- a/src/util/00-mesa-defaults.conf +++ b/src/util/00-mesa-defaults.conf @@ -426,5 +426,8 @@ TODO: document the other workarounds. + + From dd03e1d5defe315f2457e22a9bf5a46fe313d8a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 19 Feb 2019 17:29:52 -0500 Subject: [PATCH 089/378] driconf: add Civ6Sub executable for Civilization 6 I'm getting Civ6Sub instead of Civ6. Cc: 18.3 19.0 Reviewed-by: Timothy Arceri (cherry picked from commit bff8da6c591e55e4b5f04aea1fef29e6230e9222) --- src/util/00-mesa-defaults.conf | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/util/00-mesa-defaults.conf b/src/util/00-mesa-defaults.conf index 81f23c97941..8abc50c9f26 100644 --- a/src/util/00-mesa-defaults.conf +++ b/src/util/00-mesa-defaults.conf @@ -229,6 +229,9 @@ TODO: document the other workarounds. + +