Skip to content
This repository was archived by the owner on Jan 7, 2023. It is now read-only.

Commit bf0992e

Browse files
Toni Lönnberg authored and renchenglei committed
FROMLIST: SQUASH: i965: SIMD32 selection heuristics
(cover letter https://patchwork.freedesktop.org/series/51006/)

FROMLIST: i965: SIMD32 heuristics debug flag
Added a new DEBUG_HEUR32 flag to INTEL_DEBUG flags for enabling SIMD32 selection heuristics.
(am from https://patchwork.freedesktop.org/patch/256764/)

FROMLIST: i965: SIMD32 heuristics control data
Added a new structure for holding SIMD32 heuristics control data. The control data itself will be fetched from drirc.
(am from https://patchwork.freedesktop.org/patch/256806/)

FROMLIST: i965: SIMD32 heuristics control data from drirc
To be able to test the heuristics with different parameters, they can be controlled via environment variables through drirc.
(am from https://patchwork.freedesktop.org/patch/256788/)

FROMLIST: mesa: Helper functions for counting set bits in a mask
(am from https://patchwork.freedesktop.org/patch/256765/)

FROMLIST: i965/fs: Save the instruction count of each dispatch width
The SIMD32 selection heuristics will use this information for deciding whether SIMD32 shaders should be used.
(am from https://patchwork.freedesktop.org/patch/256793/)

FROMLIST: i965/fs: SIMD32 selection heuristic based on grouped texture fetches
The function goes through the compiled shader and checks how many grouped texture fetches there are. This is a simple heuristic which gets rid of most of the regressions when enabling SIMD32 shaders but still retains some of the benefits.
(am from https://patchwork.freedesktop.org/patch/256798/)

FROMLIST: i965/fs: Enable all SIMD32 heuristics
There are three simple heuristics for SIMD32 shader enabling:
- How many MRTs does the shader write into?
- How many grouped texture fetches does the shader have?
- How many instructions does the SIMD32 shader have compared to the SIMD16 shader?

For testing purposes, the heuristics can be controlled via these environment variables:
- simd32_heuristic_mrt_check - Enables MRT write check - Default: true
- simd32_heuristic_max_mrts - How many MRT writes the heuristic allows - Default: 1
- simd32_heuristic_grouped_check - Enables grouped texture fetch check - Default: true
- simd32_heuristic_grouped_sends - How many grouped texture fetches the heuristic allows - Default: 6
- simd32_heuristic_inst_check - Enables SIMD32 vs. SIMD16 instruction count check - Default: true
- simd32_heuristic_inst_ratio - SIMD32 vs. SIMD16 instruction count ratio the heuristic allows - Default: 2.3

SIMD32 shaders are also not compiled when SIMD16 compilation fails or spills.
(am from https://patchwork.freedesktop.org/patch/256766/)
1 parent 7d3cc77 commit bf0992e

File tree

15 files changed

+244
-9
lines changed

15 files changed

+244
-9
lines changed

src/intel/compiler/brw_compiler.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,15 @@ struct ra_regs;
3838
struct nir_shader;
3939
struct brw_program;
4040

41+
/**
 * Tunables for the SIMD32 fragment shader selection heuristics.
 *
 * Every heuristic is described by an enable flag followed by the limit the
 * measured value is compared against.
 */
struct brw_simd32_heuristics_control {
   /* Grouped texture fetch heuristic: when enabled, the longest run of
    * consecutive texture instructions must not exceed max_grouped_sends.
    */
   bool grouped_sends_check;
   int max_grouped_sends;

   /* Instruction count heuristic: when enabled, the SIMD32 instruction count
    * divided by the SIMD16 instruction count must not exceed
    * inst_count_ratio.
    */
   bool inst_count_check;
   float inst_count_ratio;

   /* MRT heuristic: when enabled, the number of set bits in the shader's
    * render target output mask must not exceed max_mrts.
    */
   bool mrt_check;
   int max_mrts;
};
49+
4150
struct brw_compiler {
4251
const struct gen_device_info *devinfo;
4352

@@ -119,6 +128,8 @@ struct brw_compiler {
119128
* whether nir_opt_large_constants will be run.
120129
*/
121130
bool supports_shader_constants;
131+
132+
struct brw_simd32_heuristics_control simd32_heuristics_control;
122133
};
123134

124135
/**

src/intel/compiler/brw_fs.cpp

Lines changed: 59 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8240,6 +8240,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
82408240
char **error_str)
82418241
{
82428242
const struct gen_device_info *devinfo = compiler->devinfo;
8243+
bool simd16_failed = false;
8244+
bool simd16_spilled = false;
82438245

82448246
unsigned max_subgroup_size = unlikely(INTEL_DEBUG & DEBUG_DO32) ? 32 : 16;
82458247

@@ -8323,20 +8325,30 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
83238325
shader_time_index16);
83248326
v16.import_uniforms(&v8);
83258327
if (!v16.run_fs(allow_spilling, use_rep_send)) {
8328+
simd16_failed = true;
83268329
compiler->shader_perf_log(log_data,
83278330
"SIMD16 shader failed to compile: %s",
83288331
v16.fail_msg);
83298332
} else {
8333+
simd16_spilled = v16.spilled_any_registers;
83308334
simd16_cfg = v16.cfg;
83318335
prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs;
83328336
prog_data->reg_blocks_16 = brw_register_blocks(v16.grf_used);
83338337
}
83348338
}
83358339

83368340
/* Currently, the compiler only supports SIMD32 on SNB+ */
8341+
const brw_simd32_heuristics_control *ctrl = &compiler->simd32_heuristics_control;
8342+
uint64_t mrts = shader->info.outputs_written << FRAG_RESULT_DATA0;
8343+
83378344
if (v8.max_dispatch_width >= 32 && !use_rep_send &&
83388345
compiler->devinfo->gen >= 6 &&
8339-
unlikely(INTEL_DEBUG & DEBUG_DO32)) {
8346+
(unlikely(INTEL_DEBUG & DEBUG_DO32) ||
8347+
(unlikely(INTEL_DEBUG & DEBUG_HEUR32) &&
8348+
!simd16_failed && !simd16_spilled &&
8349+
(!ctrl->mrt_check ||
8350+
(ctrl->mrt_check &&
8351+
u_count_bits64(&mrts) <= ctrl->max_mrts))))) {
83408352
/* Try a SIMD32 compile */
83418353
fs_visitor v32(compiler, log_data, mem_ctx, &key->base,
83428354
&prog_data->base, shader, 32,
@@ -8347,9 +8359,12 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
83478359
"SIMD32 shader failed to compile: %s",
83488360
v32.fail_msg);
83498361
} else {
8350-
simd32_cfg = v32.cfg;
8351-
prog_data->dispatch_grf_start_reg_32 = v32.payload.num_regs;
8352-
prog_data->reg_blocks_32 = brw_register_blocks(v32.grf_used);
8362+
if (likely(!(INTEL_DEBUG & DEBUG_HEUR32)) ||
8363+
v32.run_heuristic(ctrl)) {
8364+
simd32_cfg = v32.cfg;
8365+
prog_data->dispatch_grf_start_reg_32 = v32.payload.num_regs;
8366+
prog_data->reg_blocks_32 = brw_register_blocks(v32.grf_used);
8367+
}
83538368
}
83548369
}
83558370

@@ -8424,14 +8439,51 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
84248439
}
84258440

84268441
if (simd32_cfg) {
8427-
prog_data->dispatch_32 = true;
8428-
prog_data->prog_offset_32 = g.generate_code(simd32_cfg, 32, stats);
8429-
stats = stats ? stats + 1 : NULL;
8442+
uint32_t offset = g.generate_code(simd32_cfg, 32);
8443+
8444+
if (unlikely(INTEL_DEBUG & DEBUG_DO32) ||
8445+
(unlikely(INTEL_DEBUG & DEBUG_HEUR32) &&
8446+
(!simd16_cfg ||
8447+
(simd16_cfg &&
8448+
(!ctrl->inst_count_check ||
8449+
(ctrl->inst_count_check &&
8450+
(float)g.get_inst_count(32) / (float)g.get_inst_count(16) <= ctrl->inst_count_ratio)))))) {
8451+
prog_data->dispatch_32 = true;
8452+
prog_data->prog_offset_32 = offset;
8453+
uint32_t offset = g.generate_code(simd32_cfg, 32);
8454+
}
8455+
84308456
}
84318457

84328458
return g.get_assembly();
84338459
}
84348460

8461+
bool
8462+
fs_visitor::run_heuristic(const struct brw_simd32_heuristics_control *ctrl) {
8463+
int grouped_sends = 0;
8464+
int max_grouped_sends = 0;
8465+
bool pass = true;
8466+
8467+
foreach_block_and_inst(block, fs_inst, inst, cfg) {
8468+
if (inst->opcode >= SHADER_OPCODE_TEX && inst->opcode <= SHADER_OPCODE_SAMPLEINFO_LOGICAL) {
8469+
++grouped_sends;
8470+
} else if (grouped_sends > 0) {
8471+
if (grouped_sends > max_grouped_sends) {
8472+
max_grouped_sends = grouped_sends;
8473+
}
8474+
grouped_sends = 0;
8475+
}
8476+
}
8477+
8478+
if (ctrl->grouped_sends_check) {
8479+
if (max_grouped_sends > ctrl->max_grouped_sends) {
8480+
pass = false;
8481+
}
8482+
}
8483+
8484+
return pass;
8485+
}
8486+
84358487
fs_reg *
84368488
fs_visitor::emit_cs_work_group_id_setup()
84378489
{

src/intel/compiler/brw_fs.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,8 @@ class fs_visitor : public backend_shader
306306
virtual void dump_instructions(const char *name);
307307
void dump_instruction(backend_instruction *inst);
308308
void dump_instruction(backend_instruction *inst, FILE *file);
309+
310+
bool run_heuristic(const struct brw_simd32_heuristics_control *ctrl);
309311

310312
const brw_base_prog_key *const key;
311313
const struct brw_sampler_prog_key_data *key_tex;
@@ -430,6 +432,7 @@ class fs_generator
430432
void enable_debug(const char *shader_name);
431433
int generate_code(const cfg_t *cfg, int dispatch_width,
432434
struct brw_compile_stats *stats);
435+
int get_inst_count(int dispatch_width);
433436
const unsigned *get_assembly();
434437

435438
private:
@@ -525,6 +528,7 @@ class fs_generator
525528
struct brw_stage_prog_data * const prog_data;
526529

527530
unsigned dispatch_width; /**< 8, 16 or 32 */
531+
int inst_count[3]; /* for 8, 16 and 32 */
528532

529533
exec_list discard_halt_patches;
530534
struct shader_stats shader_stats;

src/intel/compiler/brw_fs.h.rej

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
diff a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h (rejected hunks)
2+
@@ -300,6 +300,8 @@ public:
3+
void dump_instruction(backend_instruction *inst);
4+
void dump_instruction(backend_instruction *inst, FILE *file);
5+
6+
+ bool run_heuristic(const struct brw_simd32_heuristics_control *ctrl);
7+
+
8+
const void *const key;
9+
const struct brw_sampler_prog_key_data *key_tex;
10+
11+
@@ -420,6 +422,7 @@ public:
12+
13+
void enable_debug(const char *shader_name);
14+
int generate_code(const cfg_t *cfg, int dispatch_width);
15+
+ int get_inst_count(int dispatch_width);
16+
const unsigned *get_assembly();
17+
18+
private:
19+
@@ -515,6 +518,7 @@ private:
20+
struct brw_stage_prog_data * const prog_data;
21+
22+
unsigned dispatch_width; /**< 8, 16 or 32 */
23+
+ int inst_count[3]; /* for 8, 16 and 32 */
24+
25+
exec_list discard_halt_patches;
26+
unsigned promoted_constants;

src/intel/compiler/brw_fs_generator.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2430,6 +2430,8 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
24302430
stats->fills = fill_count;
24312431
}
24322432

2433+
inst_count[ffs(dispatch_width) - 4] = before_size / 16;
2434+
24332435
return start_offset;
24342436
}
24352437

@@ -2438,3 +2440,13 @@ fs_generator::get_assembly()
24382440
{
24392441
return brw_get_program(p, &prog_data->program_size);
24402442
}
2443+
2444+
int
2445+
fs_generator::get_inst_count(int dispatch_width)
2446+
{
2447+
if (dispatch_width == 8 || dispatch_width == 16 || dispatch_width == 32) {
2448+
return inst_count[ffs(dispatch_width) - 4];
2449+
} else {
2450+
return 0;
2451+
}
2452+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
diff a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp (rejected hunks)
2+
@@ -2297,6 +2297,8 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
3+
fill_count, promoted_constants, before_size,
4+
after_size);
5+
6+
+ inst_count[ffs(dispatch_width) - 4] = before_size / 16;
7+
+
8+
return start_offset;
9+
}
10+

src/intel/dev/gen_debug.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ static const struct debug_control debug_control[] = {
8686
{ "color", DEBUG_COLOR },
8787
{ "reemit", DEBUG_REEMIT },
8888
{ "soft64", DEBUG_SOFT64 },
89+
{ "heur32", DEBUG_HEUR32 },
8990
{ "tcs8", DEBUG_TCS_EIGHT_PATCH },
9091
{ "bt", DEBUG_BT },
9192
{ "pc", DEBUG_PIPE_CONTROL },

src/intel/dev/gen_debug.c.rej

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
diff a/src/intel/dev/gen_debug.c b/src/intel/dev/gen_debug.c (rejected hunks)
2+
@@ -86,6 +86,7 @@ static const struct debug_control debug_control[] = {
3+
{ "color", DEBUG_COLOR },
4+
{ "reemit", DEBUG_REEMIT },
5+
{ "soft64", DEBUG_SOFT64 },
6+
+ { "heur32", DEBUG_HEUR32 },
7+
{ NULL, 0 }
8+
};
9+

src/intel/dev/gen_debug.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ extern uint64_t INTEL_DEBUG;
8888
#define DEBUG_BT (1ull << 44)
8989
#define DEBUG_PIPE_CONTROL (1ull << 45)
9090
#define DEBUG_NO_FAST_CLEAR (1ull << 46)
91+
#define DEBUG_HEUR32 (1ull << 47)
9192

9293
/* These flags are not compatible with the disk shader cache */
9394
#define DEBUG_DISK_CACHE_DISABLE_MASK DEBUG_SHADER_TIME
@@ -96,7 +97,7 @@ extern uint64_t INTEL_DEBUG;
9697
#define DEBUG_DISK_CACHE_MASK \
9798
(DEBUG_NO16 | DEBUG_NO_DUAL_OBJECT_GS | DEBUG_NO8 | DEBUG_SPILL_FS | \
9899
DEBUG_SPILL_VEC4 | DEBUG_NO_COMPACTION | DEBUG_DO32 | DEBUG_SOFT64 | \
99-
DEBUG_TCS_EIGHT_PATCH)
100+
DEBUG_TCS_EIGHT_PATCH | DEBUG_HEUR32)
100101

101102
#ifdef HAVE_ANDROID_PLATFORM
102103
#define LOG_TAG "INTEL-MESA"

src/intel/dev/gen_debug.h.rej

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
diff a/src/intel/dev/gen_debug.h b/src/intel/dev/gen_debug.h (rejected hunks)
2+
@@ -84,6 +84,7 @@ extern uint64_t INTEL_DEBUG;
3+
#define DEBUG_COLOR (1ull << 40)
4+
#define DEBUG_REEMIT (1ull << 41)
5+
#define DEBUG_SOFT64 (1ull << 42)
6+
+#define DEBUG_HEUR32 (1ull << 43)
7+
8+
/* These flags are not compatible with the disk shader cache */
9+
#define DEBUG_DISK_CACHE_DISABLE_MASK DEBUG_SHADER_TIME
10+
@@ -91,7 +92,7 @@ extern uint64_t INTEL_DEBUG;
11+
/* These flags may affect program generation */
12+
#define DEBUG_DISK_CACHE_MASK \
13+
(DEBUG_NO16 | DEBUG_NO_DUAL_OBJECT_GS | DEBUG_NO8 | DEBUG_SPILL_FS | \
14+
- DEBUG_SPILL_VEC4 | DEBUG_NO_COMPACTION | DEBUG_DO32 | DEBUG_SOFT64)
15+
+ DEBUG_SPILL_VEC4 | DEBUG_NO_COMPACTION | DEBUG_DO32 | DEBUG_SOFT64 | DEBUG_HEUR32)
16+
17+
#ifdef HAVE_ANDROID_PLATFORM
18+
#define LOG_TAG "INTEL-MESA"

0 commit comments

Comments
 (0)