/*
 * Copyright (C) 2021 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include <stdio.h>
#include "pan_bo.h"
#include "pan_shader.h"
#include "pan_scoreboard.h"
#include "pan_encoder.h"
#include "pan_indirect_draw.h"
#include "pan_pool.h"
#include "pan_util.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_memory.h"
#include "util/macros.h"

#define WORD(x) ((x) * 4)

#define LOOP \
        for (nir_loop *l = nir_push_loop(b); l != NULL; \
             nir_pop_loop(b, l), l = NULL)
#define BREAK nir_jump(b, nir_jump_break)
#define CONTINUE nir_jump(b, nir_jump_continue)

#define IF(cond) nir_push_if(b, cond);
#define ELSE nir_push_else(b, NULL);
#define ENDIF nir_pop_if(b, NULL);

#define MIN_MAX_JOBS 128

struct draw_data {
        nir_ssa_def *draw_buf;
        nir_ssa_def *draw_buf_stride;
        nir_ssa_def *index_buf;
        nir_ssa_def *restart_index;
        nir_ssa_def *vertex_count;
        nir_ssa_def *start_instance;
        nir_ssa_def *instance_count;
        nir_ssa_def *vertex_start;
        nir_ssa_def *index_bias;
        nir_ssa_def *draw_ctx;
        nir_ssa_def *min_max_ctx;
};

struct instance_size {
        nir_ssa_def *raw;
        nir_ssa_def *padded;
        nir_ssa_def *packed;
};

struct jobs_data {
        nir_ssa_def *vertex_job;
        nir_ssa_def *tiler_job;
        nir_ssa_def *base_vertex_offset;
        nir_ssa_def *first_vertex_sysval;
        nir_ssa_def *base_vertex_sysval;
        nir_ssa_def *base_instance_sysval;
        nir_ssa_def *offset_start;
        nir_ssa_def *invocation;
};

struct varyings_data {
        nir_ssa_def *varying_bufs;
        nir_ssa_def *pos_ptr;
        nir_ssa_def *psiz_ptr;
        nir_variable *mem_ptr;
};

struct attribs_data {
        nir_ssa_def *attrib_count;
        nir_ssa_def *attrib_bufs;
        nir_ssa_def *attribs;
};

struct indirect_draw_shader_builder {
        nir_builder b;
        const struct panfrost_device *dev;
        unsigned flags;
        bool index_min_max_search;
        unsigned index_size;
        struct draw_data draw;
        struct instance_size instance_size;
        struct jobs_data jobs;
        struct varyings_data varyings;
        struct attribs_data attribs;
};

/* Describes an indirect draw (see glDrawArraysIndirect()) */

struct indirect_draw_info {
        uint32_t count;
        uint32_t instance_count;
        uint32_t start;
        uint32_t start_instance;
};

struct indirect_indexed_draw_info {
        uint32_t count;
        uint32_t instance_count;
        uint32_t start;
        int32_t index_bias;
        uint32_t start_instance;
};

/* Store the min/max index in a separate context. This is not supported yet, but
 * the DDK seems to put all min/max search jobs at the beginning of the job chain
 * when multiple indirect draws are issued to avoid the serialization caused by
 * the draw patching jobs which have the suppress_prefetch flag set. Merging the
 * min/max and draw contexts would prevent such optimizations (draw contexts are
 * shared by all indirect draws in a batch).
 */

struct min_max_context {
        uint32_t min;
        uint32_t max;
};

/* Per-batch context shared by all indirect draws queued to a given batch. */

struct indirect_draw_context {
        /* Pointer to the top of the varying heap. */
        mali_ptr varying_mem;
};

/* Indirect draw shader inputs. Those are stored in FAU. */

struct indirect_draw_inputs {
        /* indirect_draw_context pointer */
        mali_ptr draw_ctx;

        /* min_max_context pointer */
        mali_ptr min_max_ctx;

        /* Pointer to an array of indirect_draw_info objects */
        mali_ptr draw_buf;

        /* Pointer to a uint32_t containing the number of draws to issue */
        mali_ptr draw_count_ptr;

        /* index buffer */
        mali_ptr index_buf;

        /* {base,first}_{vertex,instance} sysvals */
        mali_ptr first_vertex_sysval;
        mali_ptr base_vertex_sysval;
        mali_ptr base_instance_sysval;

        /* Pointers to various cmdstream structs that need to be patched */
        mali_ptr vertex_job;
        mali_ptr tiler_job;
        mali_ptr attrib_bufs;
        mali_ptr attribs;
        mali_ptr varying_bufs;
        uint32_t draw_count;
        uint32_t draw_buf_stride;
        uint32_t restart_index;
        uint32_t attrib_count;
} PACKED;

#define get_input_field(b, name) \
        nir_load_push_constant(b, \
                               1, sizeof(((struct indirect_draw_inputs *)0)->name) * 8, \
                               nir_imm_int(b, 0), \
                               .base = offsetof(struct indirect_draw_inputs, name))

static nir_ssa_def *
get_address(nir_builder *b, nir_ssa_def *base, nir_ssa_def *offset)
{
        return nir_iadd(b, base, nir_u2u64(b, offset));
}

static nir_ssa_def *
get_address_imm(nir_builder *b, nir_ssa_def *base, unsigned offset)
{
        return get_address(b, base, nir_imm_int(b, offset));
}

static nir_ssa_def *
load_global(nir_builder *b, nir_ssa_def *addr, unsigned ncomps, unsigned bit_size)
{
        return nir_load_global(b, addr, 4, ncomps, bit_size);
}

static void
store_global(nir_builder *b, nir_ssa_def *addr,
             nir_ssa_def *value, unsigned ncomps)
{
        nir_store_global(b, addr, 4, value, (1 << ncomps) - 1);
}

static nir_ssa_def *
get_draw_ctx_data(struct indirect_draw_shader_builder *builder,
                  unsigned offset, unsigned size)
{
        nir_builder *b = &builder->b;
        return load_global(b,
                           get_address_imm(b, builder->draw.draw_ctx, offset),
                           1, size);
}

static void
set_draw_ctx_data(struct indirect_draw_shader_builder *builder,
                  unsigned offset, nir_ssa_def *value, unsigned size)
{
        nir_builder *b = &builder->b;
        store_global(b,
                     get_address_imm(b, builder->draw.draw_ctx, offset),
                     value, 1);
}

#define get_draw_ctx_field(builder, name) \
        get_draw_ctx_data(builder, \
                          offsetof(struct indirect_draw_context, name), \
                          sizeof(((struct indirect_draw_context *)0)->name) * 8)

#define set_draw_ctx_field(builder, name, val) \
        set_draw_ctx_data(builder, \
                          offsetof(struct indirect_draw_context, name), \
                          val, \
                          sizeof(((struct indirect_draw_context *)0)->name) * 8)

static nir_ssa_def *
get_min_max_ctx_data(struct indirect_draw_shader_builder *builder,
                     unsigned offset, unsigned size)
{
        nir_builder *b = &builder->b;
        return load_global(b,
                           get_address_imm(b, builder->draw.min_max_ctx, offset),
                           1, size);
}

#define get_min_max_ctx_field(builder, name) \
        get_min_max_ctx_data(builder, \
                             offsetof(struct min_max_context, name), \
                             sizeof(((struct min_max_context *)0)->name) * 8)

static void
update_min(struct indirect_draw_shader_builder *builder, nir_ssa_def *val)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *addr =
                get_address_imm(b,
                                builder->draw.min_max_ctx,
                                offsetof(struct min_max_context, min));
        nir_global_atomic_umin(b, 32, addr, val);
}

static void
update_max(struct indirect_draw_shader_builder *builder, nir_ssa_def *val)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *addr =
                get_address_imm(b,
                                builder->draw.min_max_ctx,
                                offsetof(struct min_max_context, max));
        nir_global_atomic_umax(b, 32, addr, val);
}

#define get_draw_field(b, draw_ptr, field) \
        load_global(b, \
                    get_address_imm(b, draw_ptr, \
                                    offsetof(struct indirect_draw_info, field)), \
                    1, sizeof(((struct indirect_draw_info *)0)->field) * 8)

#define get_indexed_draw_field(b, draw_ptr, field) \
        load_global(b, \
                    get_address_imm(b, draw_ptr, \
                                    offsetof(struct indirect_indexed_draw_info, field)), \
                    1, sizeof(((struct indirect_indexed_draw_info *)0)->field) * 8)

static void
extract_inputs(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;

        builder->draw.draw_ctx = get_input_field(b, draw_ctx);
        builder->draw.draw_buf = get_input_field(b, draw_buf);
        builder->draw.draw_buf_stride = get_input_field(b, draw_buf_stride);

        if (builder->index_size) {
                builder->draw.index_buf = get_input_field(b, index_buf);
                builder->draw.min_max_ctx = get_input_field(b, min_max_ctx);
                if (builder->flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART) {
                        builder->draw.restart_index =
                                get_input_field(b, restart_index);
                }
        }

        if (builder->index_min_max_search)
                return;

        builder->jobs.first_vertex_sysval = get_input_field(b, first_vertex_sysval);
        builder->jobs.base_vertex_sysval = get_input_field(b, base_vertex_sysval);
        builder->jobs.base_instance_sysval = get_input_field(b, base_instance_sysval);
        builder->jobs.vertex_job = get_input_field(b, vertex_job);
        builder->jobs.tiler_job = get_input_field(b, tiler_job);
        builder->attribs.attrib_bufs = get_input_field(b, attrib_bufs);
        builder->attribs.attribs = get_input_field(b, attribs);
        builder->attribs.attrib_count = get_input_field(b, attrib_count);
        builder->varyings.varying_bufs = get_input_field(b, varying_bufs);
        builder->varyings.mem_ptr =
                nir_local_variable_create(b->impl,
                                          glsl_uint64_t_type(),
                                          "var_mem_ptr");
        nir_store_var(b, builder->varyings.mem_ptr,
                      get_draw_ctx_field(builder, varying_mem), 3);
}

static void
init_shader_builder(struct indirect_draw_shader_builder *builder,
                    const struct panfrost_device *dev,
                    unsigned flags, unsigned index_size,
                    bool index_min_max_search)
{
        memset(builder, 0, sizeof(*builder));
        builder->dev = dev;
        builder->flags = flags;
        builder->index_size = index_size;

        builder->index_min_max_search = index_min_max_search;
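
        /* The same builder serves two shader variants: the index min/max
         * search shader and the draw-patching shader. The flag set above only
         * selects which variant gets built (create_indirect_draw_shader()
         * later calls get_index_min_max() or patch() accordingly). */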
        if (index_min_max_search) {
                builder->b =
                        nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
                                                       GENX(pan_shader_get_compiler_options)(),
                                                       "indirect_draw_min_max_index(index_size=%d)",
                                                       builder->index_size);
        } else {
                builder->b =
                        nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
                                                       GENX(pan_shader_get_compiler_options)(),
                                                       "indirect_draw(index_size=%d%s%s%s%s)",
                                                       builder->index_size,
                                                       flags & PAN_INDIRECT_DRAW_HAS_PSIZ ?
                                                       ",psiz" : "",
                                                       flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART ?
                                                       ",primitive_restart" : "",
                                                       flags & PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE ?
                                                       ",update_primitive_size" : "",
                                                       flags & PAN_INDIRECT_DRAW_IDVS ?
                                                       ",idvs" : "");
        }

        extract_inputs(builder);
}

static void
update_dcd(struct indirect_draw_shader_builder *builder,
           nir_ssa_def *job_ptr,
           unsigned draw_offset)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *draw_w01 =
                load_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)), 2, 32);
        nir_ssa_def *draw_w0 = nir_channel(b, draw_w01, 0);

        /* Update DRAW.{instance_size,offset_start} */
        nir_ssa_def *instance_size =
                nir_bcsel(b,
                          nir_ult(b, builder->draw.instance_count, nir_imm_int(b, 2)),
                          nir_imm_int(b, 0), builder->instance_size.packed);
        draw_w01 = nir_vec2(b,
                            nir_ior(b, nir_iand_imm(b, draw_w0, 0xffff),
                                    nir_ishl(b, instance_size, nir_imm_int(b, 16))),
                            builder->jobs.offset_start);
        store_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)),
                     draw_w01, 2);
}

static void
update_job(struct indirect_draw_shader_builder *builder, enum mali_job_type type)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *job_ptr =
                type == MALI_JOB_TYPE_VERTEX ?
                builder->jobs.vertex_job : builder->jobs.tiler_job;

        /* Update the invocation words. */
        store_global(b, get_address_imm(b, job_ptr, WORD(8)),
                     builder->jobs.invocation, 2);

        unsigned draw_offset =
                type == MALI_JOB_TYPE_VERTEX ?
                pan_section_offset(COMPUTE_JOB, DRAW) :
                pan_section_offset(TILER_JOB, DRAW);
        unsigned prim_offset = pan_section_offset(TILER_JOB, PRIMITIVE);
        unsigned psiz_offset = pan_section_offset(TILER_JOB, PRIMITIVE_SIZE);
        unsigned index_size = builder->index_size;

        if (type == MALI_JOB_TYPE_TILER) {
                /* Update PRIMITIVE.{base_vertex_offset,count} */
                store_global(b,
                             get_address_imm(b, job_ptr, prim_offset + WORD(1)),
                             builder->jobs.base_vertex_offset, 1);
                store_global(b,
                             get_address_imm(b, job_ptr, prim_offset + WORD(3)),
                             nir_iadd_imm(b, builder->draw.vertex_count, -1), 1);

                if (index_size) {
                        nir_ssa_def *addr =
                                get_address_imm(b, job_ptr, prim_offset + WORD(4));
                        nir_ssa_def *indices = load_global(b, addr, 1, 64);
                        nir_ssa_def *offset =
                                nir_imul_imm(b, builder->draw.vertex_start, index_size);

                        indices = get_address(b, indices, offset);
                        store_global(b, addr, indices, 2);
                }

                /* Update PRIMITIVE_SIZE.size_array */
                if ((builder->flags & PAN_INDIRECT_DRAW_HAS_PSIZ) &&
                    (builder->flags & PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE)) {
                        store_global(b,
                                     get_address_imm(b, job_ptr, psiz_offset + WORD(0)),
                                     builder->varyings.psiz_ptr, 2);
                }

                /* Update DRAW.position */
                store_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(4)),
                             builder->varyings.pos_ptr, 2);
        }

        update_dcd(builder, job_ptr, draw_offset);

        if (builder->flags & PAN_INDIRECT_DRAW_IDVS) {
                assert(type == MALI_JOB_TYPE_TILER);

                update_dcd(builder, job_ptr,
                           pan_section_offset(INDEXED_VERTEX_JOB, VERTEX_DRAW));
        }
}

static void
split_div(nir_builder *b, nir_ssa_def *div, nir_ssa_def **r_e, nir_ssa_def **d)
{
        /* TODO: Lower this 64bit div to something GPU-friendly */
        nir_ssa_def *r = nir_imax(b, nir_ufind_msb(b, div), nir_imm_int(b, 0));
        nir_ssa_def *div64 = nir_u2u64(b, div);
        nir_ssa_def *half_div64 = nir_u2u64(b, nir_ushr_imm(b, div, 1));
        nir_ssa_def *f0 = nir_iadd(b,
                                   nir_ishl(b, nir_imm_int64(b, 1),
                                            nir_iadd_imm(b, r, 32)),
                                   half_div64);
        nir_ssa_def *fi = nir_idiv(b, f0, div64);
        nir_ssa_def *ff = nir_isub(b, f0, nir_imul(b, fi, div64));
        nir_ssa_def *e = nir_bcsel(b, nir_ult(b, half_div64, ff),
                                   nir_imm_int(b, 1 << 5), nir_imm_int(b, 0));
        *d = nir_iand_imm(b, nir_u2u32(b, fi), ~(1 << 31));
        *r_e = nir_ior(b, r, e);
}

static void
update_vertex_attrib_buf(struct indirect_draw_shader_builder *builder,
                         nir_ssa_def *attrib_buf_ptr,
                         enum mali_attribute_type type,
                         nir_ssa_def *div1,
                         nir_ssa_def *div2)
{
        nir_builder *b = &builder->b;
        unsigned type_mask = BITFIELD_MASK(6);
        nir_ssa_def *w01 = load_global(b, attrib_buf_ptr, 2, 32);
        nir_ssa_def *w0 = nir_channel(b, w01, 0);
        nir_ssa_def *w1 = nir_channel(b, w01, 1);

        /* Words 0 and 1 of the attribute descriptor contain the type,
         * pointer and the divisor exponent.
         */
        w0 = nir_iand_imm(b, nir_channel(b, w01, 0), ~type_mask);
        w0 = nir_ior(b, w0, nir_imm_int(b, type));
        w1 = nir_ior(b, w1, nir_ishl(b, div1, nir_imm_int(b, 24)));

        store_global(b, attrib_buf_ptr, nir_vec2(b, w0, w1), 2);

        if (type == MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR) {
                /* If the divisor is not a power of two, the divisor numerator
                 * is passed in word 1 of the continuation attribute (word 5
                 * if we consider the attribute and its continuation as a
                 * single attribute).
                 */
                assert(div2);
                store_global(b, get_address_imm(b, attrib_buf_ptr, WORD(5)),
                             div2, 1);
        }
}

static void
zero_attrib_buf_stride(struct indirect_draw_shader_builder *builder,
                       nir_ssa_def *attrib_buf_ptr)
{
        /* Stride is an unadorned 32-bit uint at word 2 */
        nir_builder *b = &builder->b;
        store_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)),
                     nir_imm_int(b, 0), 1);
}

static void
adjust_attrib_offset(struct indirect_draw_shader_builder *builder,
                     nir_ssa_def *attrib_ptr, nir_ssa_def *attrib_buf_ptr,
                     nir_ssa_def *instance_div)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *zero = nir_imm_int(b, 0);
        nir_ssa_def *two = nir_imm_int(b, 2);
        nir_ssa_def *sub_cur_offset =
                nir_iand(b, nir_ine(b, builder->jobs.offset_start, zero),
                         nir_uge(b, builder->draw.instance_count, two));

        nir_ssa_def *add_base_inst_offset =
                nir_iand(b, nir_ine(b, builder->draw.start_instance, zero),
                         nir_ine(b, instance_div, zero));

        IF (nir_ior(b, sub_cur_offset, add_base_inst_offset)) {
                nir_ssa_def *offset =
                        load_global(b, get_address_imm(b, attrib_ptr, WORD(1)), 1, 32);
                nir_ssa_def *stride =
                        load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)), 1, 32);

                /* Per-instance data needs to be offset in response to a
                 * delayed start in an indexed draw.
                 */

                IF (add_base_inst_offset) {
                        offset = nir_iadd(b, offset,
                                          nir_idiv(b,
                                                   nir_imul(b, stride,
                                                            builder->draw.start_instance),
                                                   instance_div));
                } ENDIF

                IF (sub_cur_offset) {
                        offset = nir_isub(b, offset,
                                          nir_imul(b, stride,
                                                   builder->jobs.offset_start));
                } ENDIF

                store_global(b, get_address_imm(b, attrib_ptr, WORD(1)),
                             offset, 1);
        } ENDIF
}

/* x is a power of two or zero <===> x has 0 (zero) or 1 (POT) bits set */

static nir_ssa_def *
nir_is_power_of_two_or_zero(nir_builder *b, nir_ssa_def *x)
{
        return nir_ult(b, nir_bit_count(b, x), nir_imm_int(b, 2));
}

/* Based on panfrost_emit_vertex_data() */

static void
update_vertex_attribs(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;
        nir_variable *attrib_idx_var =
                nir_local_variable_create(b->impl, glsl_uint_type(),
                                          "attrib_idx");
        nir_store_var(b, attrib_idx_var, nir_imm_int(b, 0), 1);

#if PAN_ARCH <= 5
        nir_ssa_def *single_instance =
                nir_ult(b, builder->draw.instance_count, nir_imm_int(b, 2));
#endif

        LOOP {
                nir_ssa_def *attrib_idx = nir_load_var(b, attrib_idx_var);
                IF (nir_uge(b, attrib_idx, builder->attribs.attrib_count))
                        BREAK;
                ENDIF

                nir_ssa_def *attrib_buf_ptr =
                        get_address(b, builder->attribs.attrib_bufs,
                                    nir_imul_imm(b, attrib_idx,
                                                 2 * pan_size(ATTRIBUTE_BUFFER)));
                nir_ssa_def *attrib_ptr =
                        get_address(b, builder->attribs.attribs,
                                    nir_imul_imm(b, attrib_idx,
                                                 pan_size(ATTRIBUTE)));

                nir_ssa_def *r_e, *d;

#if PAN_ARCH <= 5
                IF (nir_ieq_imm(b, attrib_idx, PAN_VERTEX_ID)) {
                        nir_ssa_def *r_p =
                                nir_bcsel(b, single_instance,
                                          nir_imm_int(b, 0x9f),
                                          builder->instance_size.packed);

                        store_global(b,
                                     get_address_imm(b, attrib_buf_ptr, WORD(4)),
                                     nir_ishl(b, r_p, nir_imm_int(b, 24)), 1);

                        nir_store_var(b, attrib_idx_var,
                                      nir_iadd_imm(b, attrib_idx, 1), 1);
                        CONTINUE;
                } ENDIF

                IF (nir_ieq_imm(b, attrib_idx, PAN_INSTANCE_ID)) {
                        split_div(b, builder->instance_size.padded,
                                  &r_e, &d);
                        nir_ssa_def *default_div =
                                nir_ior(b, single_instance,
                                        nir_ult(b,
                                                builder->instance_size.padded,
                                                nir_imm_int(b, 2)));
                        r_e = nir_bcsel(b, default_div,
                                        nir_imm_int(b, 0x3f), r_e);
                        d = nir_bcsel(b, default_div,
                                      nir_imm_int(b, (1u << 31) - 1), d);
                        store_global(b,
                                     get_address_imm(b, attrib_buf_ptr, WORD(1)),
                                     nir_vec2(b, nir_ishl(b, r_e, nir_imm_int(b, 24)), d),
                                     2);
                        nir_store_var(b, attrib_idx_var,
                                      nir_iadd_imm(b, attrib_idx, 1), 1);
                        CONTINUE;
                } ENDIF
#endif

                nir_ssa_def *instance_div =
                        load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(7)), 1, 32);

                nir_ssa_def *div = nir_imul(b, instance_div, builder->instance_size.padded);

                nir_ssa_def *multi_instance =
                        nir_uge(b, builder->draw.instance_count, nir_imm_int(b, 2));

                IF (nir_ine(b, div, nir_imm_int(b, 0))) {
                        IF (multi_instance) {
                                IF (nir_is_power_of_two_or_zero(b, div)) {
                                        nir_ssa_def *exp =
                                                nir_imax(b, nir_ufind_msb(b, div),
                                                         nir_imm_int(b, 0));
                                        update_vertex_attrib_buf(builder, attrib_buf_ptr,
                                                                 MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR,
                                                                 exp, NULL);
                                } ELSE {
                                        split_div(b, div, &r_e, &d);
                                        update_vertex_attrib_buf(builder, attrib_buf_ptr,
                                                                 MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR,
                                                                 r_e, d);
                                } ENDIF
                        } ELSE {
                                /* Single instance with a non-0 divisor: all
                                 * accesses should point to attribute 0 */
                                zero_attrib_buf_stride(builder, attrib_buf_ptr);
                        } ENDIF

                        adjust_attrib_offset(builder, attrib_ptr, attrib_buf_ptr, instance_div);
                } ELSE IF (multi_instance) {
                        update_vertex_attrib_buf(builder, attrib_buf_ptr,
                                                 MALI_ATTRIBUTE_TYPE_1D_MODULUS,
                                                 builder->instance_size.packed, NULL);
                } ENDIF ENDIF

                nir_store_var(b, attrib_idx_var, nir_iadd_imm(b, attrib_idx, 1), 1);
        }
}

static nir_ssa_def *
update_varying_buf(struct indirect_draw_shader_builder *builder,
                   nir_ssa_def *varying_buf_ptr,
                   nir_ssa_def *vertex_count)
{
        nir_builder *b = &builder->b;

        nir_ssa_def *stride =
                load_global(b, get_address_imm(b, varying_buf_ptr, WORD(2)), 1, 32);
        nir_ssa_def *size = nir_imul(b, stride, vertex_count);
        nir_ssa_def *aligned_size =
                nir_iand_imm(b, nir_iadd_imm(b, size, 63), ~63);
        nir_ssa_def *var_mem_ptr =
                nir_load_var(b, builder->varyings.mem_ptr);
        nir_ssa_def *w0 =
                nir_ior(b, nir_unpack_64_2x32_split_x(b, var_mem_ptr),
                        nir_imm_int(b, MALI_ATTRIBUTE_TYPE_1D));
        nir_ssa_def *w1 = nir_unpack_64_2x32_split_y(b, var_mem_ptr);
        store_global(b, get_address_imm(b, varying_buf_ptr, WORD(0)),
                     nir_vec4(b, w0, w1, stride, size), 4);

        nir_store_var(b, builder->varyings.mem_ptr,
                      get_address(b, var_mem_ptr, aligned_size), 3);

        return var_mem_ptr;
}

/* Based on panfrost_emit_varying_descriptor() */

static void
update_varyings(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *vertex_count =
                nir_imul(b, builder->instance_size.padded,
                         builder->draw.instance_count);
        nir_ssa_def *buf_ptr =
                get_address_imm(b, builder->varyings.varying_bufs,
                                PAN_VARY_GENERAL *
                                pan_size(ATTRIBUTE_BUFFER));
        update_varying_buf(builder, buf_ptr, vertex_count);

        buf_ptr = get_address_imm(b, builder->varyings.varying_bufs,
                                  PAN_VARY_POSITION *
                                  pan_size(ATTRIBUTE_BUFFER));
        builder->varyings.pos_ptr =
                update_varying_buf(builder, buf_ptr, vertex_count);

        if (builder->flags & PAN_INDIRECT_DRAW_HAS_PSIZ) {
                buf_ptr = get_address_imm(b, builder->varyings.varying_bufs,
                                          PAN_VARY_PSIZ *
                                          pan_size(ATTRIBUTE_BUFFER));
                builder->varyings.psiz_ptr =
                        update_varying_buf(builder, buf_ptr, vertex_count);
        }

        set_draw_ctx_field(builder, varying_mem,
                           nir_load_var(b, builder->varyings.mem_ptr));
}

/* Based on panfrost_pack_work_groups_compute() */

static void
get_invocation(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *one = nir_imm_int(b, 1);
        nir_ssa_def *max_vertex =
                nir_usub_sat(b, builder->instance_size.raw, one);
        nir_ssa_def *max_instance =
                nir_usub_sat(b, builder->draw.instance_count, one);
        nir_ssa_def *split =
                nir_bcsel(b, nir_ieq_imm(b, max_instance, 0),
                          nir_imm_int(b, 32),
                          nir_iadd_imm(b, nir_ufind_msb(b, max_vertex), 1));

        builder->jobs.invocation =
                nir_vec2(b,
                         nir_ior(b, max_vertex,
                                 nir_ishl(b, max_instance, split)),
                         nir_ior(b, nir_ishl(b, split, nir_imm_int(b, 22)),
                                 nir_imm_int(b, 2 << 28)));
}

static nir_ssa_def *
nir_align_pot(nir_builder *b, nir_ssa_def *val, unsigned pot)
{
        assert(pot != 0 && util_is_power_of_two_or_zero(pot));

        return nir_iand_imm(b, nir_iadd_imm(b, val, pot - 1), ~(pot - 1));
}

/* Based on panfrost_padded_vertex_count() */

static nir_ssa_def *
get_padded_count(nir_builder *b, nir_ssa_def *val, nir_ssa_def **packed)
{
        nir_ssa_def *one = nir_imm_int(b, 1);
        nir_ssa_def *zero = nir_imm_int(b, 0);
        nir_ssa_def *eleven = nir_imm_int(b, 11);
        nir_ssa_def *four = nir_imm_int(b, 4);

        nir_ssa_def *exp =
                nir_usub_sat(b, nir_imax(b, nir_ufind_msb(b, val), zero), four);
        nir_ssa_def *base = nir_ushr(b, val, exp);

        base = nir_iadd(b, base,
                        nir_bcsel(b, nir_ine(b, val, nir_ishl(b, base, exp)), one, zero));

        nir_ssa_def *rshift = nir_imax(b, nir_find_lsb(b, base), zero);
        exp = nir_iadd(b, exp, rshift);
        base = nir_ushr(b, base, rshift);
        base = nir_iadd(b, base, nir_bcsel(b, nir_uge(b, base, eleven), one, zero));
        rshift = nir_imax(b, nir_find_lsb(b, base), zero);
        exp = nir_iadd(b, exp, rshift);
        base = nir_ushr(b, base, rshift);

        *packed = nir_ior(b, exp,
                          nir_ishl(b, nir_ushr_imm(b, base, 1), nir_imm_int(b, 5)));
        return nir_ishl(b, base, exp);
}

static void
update_jobs(struct indirect_draw_shader_builder *builder)
{
        get_invocation(builder);

        if (!(builder->flags & PAN_INDIRECT_DRAW_IDVS))
                update_job(builder, MALI_JOB_TYPE_VERTEX);

        update_job(builder, MALI_JOB_TYPE_TILER);
}


static void
set_null_job(struct indirect_draw_shader_builder *builder,
             nir_ssa_def *job_ptr)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *w4 = get_address_imm(b, job_ptr, WORD(4));
        nir_ssa_def *val = load_global(b, w4, 1, 32);

        /* Set job type to NULL (AKA NOOP) */
        val = nir_ior(b, nir_iand_imm(b, val, 0xffffff01),
                      nir_imm_int(b, MALI_JOB_TYPE_NULL << 1));
        store_global(b, w4, val, 1);
}

static void
get_instance_size(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;

        if (!builder->index_size) {
                builder->jobs.base_vertex_offset = nir_imm_int(b, 0);
                builder->jobs.offset_start = builder->draw.vertex_start;
                builder->instance_size.raw = builder->draw.vertex_count;
                return;
        }

        unsigned index_size = builder->index_size;
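
        /* Start from the min/max indices computed by the min/max search job(s)
         * scheduled before this patching job (see
         * panfrost_emit_index_min_max_search()). */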
        nir_ssa_def *min = get_min_max_ctx_field(builder, min);
        nir_ssa_def *max = get_min_max_ctx_field(builder, max);

        /* We handle unaligned indices here to avoid the extra complexity in
         * the min/max search job.
         */
        if (builder->index_size < 4) {
                nir_variable *min_var =
                        nir_local_variable_create(b->impl, glsl_uint_type(), "min");
                nir_store_var(b, min_var, min, 1);
                nir_variable *max_var =
                        nir_local_variable_create(b->impl, glsl_uint_type(), "max");
                nir_store_var(b, max_var, max, 1);

                nir_ssa_def *base =
                        get_address(b, builder->draw.index_buf,
                                    nir_imul_imm(b, builder->draw.vertex_start, index_size));
                nir_ssa_def *offset = nir_iand_imm(b, nir_unpack_64_2x32_split_x(b, base), 3);
                nir_ssa_def *end =
                        nir_iadd(b, offset,
                                 nir_imul_imm(b, builder->draw.vertex_count, index_size));
                nir_ssa_def *aligned_end = nir_iand_imm(b, end, ~3);
                unsigned shift = index_size * 8;
                unsigned mask = (1 << shift) - 1;

                base = nir_iand(b, base, nir_imm_int64(b, ~3ULL));

                /* Unaligned start offset: we need to ignore any data that's
                 * outside the requested range. We also handle ranges covering
                 * less than 2 words here.
                 */
                IF (nir_ior(b, nir_ine(b, offset, nir_imm_int(b, 0)), nir_ieq(b, aligned_end, nir_imm_int(b, 0)))) {
                        min = nir_load_var(b, min_var);
                        max = nir_load_var(b, max_var);

                        nir_ssa_def *val = load_global(b, base, 1, 32);
                        for (unsigned i = 0; i < sizeof(uint32_t); i += index_size) {
                                nir_ssa_def *oob =
                                        nir_ior(b,
                                                nir_ult(b, nir_imm_int(b, i), offset),
                                                nir_uge(b, nir_imm_int(b, i), end));
                                nir_ssa_def *data = nir_iand_imm(b, val, mask);

                                min = nir_umin(b, min,
                                               nir_bcsel(b, oob, nir_imm_int(b, UINT32_MAX), data));
                                max = nir_umax(b, max,
                                               nir_bcsel(b, oob, nir_imm_int(b, 0), data));
                                val = nir_ushr_imm(b, val, shift);
                        }

                        nir_store_var(b, min_var, min, 1);
                        nir_store_var(b, max_var, max, 1);
                } ENDIF

                nir_ssa_def *remaining = nir_isub(b, end, aligned_end);

                /* The last word contains less than 4 bytes of data; we need to
                 * discard anything falling outside the requested range.
                 */
                IF (nir_iand(b, nir_ine(b, end, aligned_end), nir_ine(b, aligned_end, nir_imm_int(b, 0)))) {
                        min = nir_load_var(b, min_var);
                        max = nir_load_var(b, max_var);

                        nir_ssa_def *val = load_global(b, get_address(b, base, aligned_end), 1, 32);
                        for (unsigned i = 0; i < sizeof(uint32_t); i += index_size) {
                                nir_ssa_def *oob = nir_uge(b, nir_imm_int(b, i), remaining);
                                nir_ssa_def *data = nir_iand_imm(b, val, mask);

                                min = nir_umin(b, min,
                                               nir_bcsel(b, oob, nir_imm_int(b, UINT32_MAX), data));
                                max = nir_umax(b, max,
                                               nir_bcsel(b, oob, nir_imm_int(b, 0), data));
                                val = nir_ushr_imm(b, val, shift);
                        }

                        nir_store_var(b, min_var, min, 1);
                        nir_store_var(b, max_var, max, 1);
                } ENDIF

                min = nir_load_var(b, min_var);
                max = nir_load_var(b, max_var);
        }

        builder->jobs.base_vertex_offset = nir_ineg(b, min);
        builder->jobs.offset_start = nir_iadd(b, min, builder->draw.index_bias);
        builder->instance_size.raw = nir_iadd_imm(b, nir_usub_sat(b, max, min), 1);
}

/* Patch a draw sequence */

static void
patch(struct indirect_draw_shader_builder *builder)
{
        unsigned index_size = builder->index_size;
        nir_builder *b = &builder->b;

        nir_ssa_def *draw_ptr = builder->draw.draw_buf;

        if (index_size) {
                builder->draw.vertex_count = get_indexed_draw_field(b, draw_ptr, count);
                builder->draw.start_instance = get_indexed_draw_field(b, draw_ptr, start_instance);
                builder->draw.instance_count =
                        get_indexed_draw_field(b, draw_ptr, instance_count);
                builder->draw.vertex_start = get_indexed_draw_field(b, draw_ptr, start);
                builder->draw.index_bias = get_indexed_draw_field(b, draw_ptr, index_bias);
        } else {
                builder->draw.vertex_count = get_draw_field(b, draw_ptr, count);
                builder->draw.start_instance = get_draw_field(b, draw_ptr, start_instance);
                builder->draw.instance_count = get_draw_field(b, draw_ptr, instance_count);
                builder->draw.vertex_start = get_draw_field(b, draw_ptr, start);
        }

        assert(builder->draw.vertex_count->num_components);

        nir_ssa_def *num_vertices =
                nir_imul(b, builder->draw.vertex_count, builder->draw.instance_count);

        IF (nir_ieq(b, num_vertices, nir_imm_int(b, 0))) {
                /* If there's nothing to draw, turn the vertex/tiler jobs into
                 * null jobs.
                 */
                if (!(builder->flags & PAN_INDIRECT_DRAW_IDVS))
                        set_null_job(builder, builder->jobs.vertex_job);

                set_null_job(builder, builder->jobs.tiler_job);
        } ELSE {
                get_instance_size(builder);

                nir_ssa_def *count = builder->instance_size.raw;

                /* IDVS requires padding to a multiple of 4 */
                if (builder->flags & PAN_INDIRECT_DRAW_IDVS)
                        count = nir_align_pot(b, count, 4);

                builder->instance_size.padded =
                        get_padded_count(b, count,
                                         &builder->instance_size.packed);

                update_varyings(builder);
                update_jobs(builder);
                update_vertex_attribs(builder);

                IF (nir_ine(b, builder->jobs.first_vertex_sysval, nir_imm_int64(b, 0))) {
                        store_global(b, builder->jobs.first_vertex_sysval,
                                     builder->jobs.offset_start, 1);
                } ENDIF

                IF (nir_ine(b, builder->jobs.base_vertex_sysval, nir_imm_int64(b, 0))) {
                        store_global(b, builder->jobs.base_vertex_sysval,
                                     index_size ?
                                     builder->draw.index_bias :
                                     nir_imm_int(b, 0),
                                     1);
                } ENDIF

                IF (nir_ine(b, builder->jobs.base_instance_sysval, nir_imm_int64(b, 0))) {
                        store_global(b, builder->jobs.base_instance_sysval,
                                     builder->draw.start_instance, 1);
                } ENDIF
        } ENDIF
}

/* Search the min/max index in the range covered by the indirect draw call */

static void
get_index_min_max(struct indirect_draw_shader_builder *builder)
{
        nir_ssa_def *restart_index = builder->draw.restart_index;
        unsigned index_size = builder->index_size;
        nir_builder *b = &builder->b;

        nir_ssa_def *draw_ptr = builder->draw.draw_buf;

        builder->draw.vertex_count = get_draw_field(b, draw_ptr, count);
        builder->draw.vertex_start = get_draw_field(b, draw_ptr, start);

        nir_ssa_def *thread_id = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);
        nir_variable *min_var =
                nir_local_variable_create(b->impl, glsl_uint_type(), "min");
        nir_store_var(b, min_var, nir_imm_int(b, UINT32_MAX), 1);
        nir_variable *max_var =
                nir_local_variable_create(b->impl, glsl_uint_type(), "max");
        nir_store_var(b, max_var, nir_imm_int(b, 0), 1);

        nir_ssa_def *base =
                get_address(b, builder->draw.index_buf,
                            nir_imul_imm(b, builder->draw.vertex_start, index_size));

        nir_ssa_def *start = nir_iand_imm(b, nir_unpack_64_2x32_split_x(b, base), 3);
        nir_ssa_def *end =
                nir_iadd(b, start, nir_imul_imm(b, builder->draw.vertex_count, index_size));

        base = nir_iand(b, base, nir_imm_int64(b, ~3ULL));

        /* Align on 4 bytes; non-aligned indices are handled in the indirect draw job. */
        start = nir_iand_imm(b, nir_iadd_imm(b, start, 3), ~3);
        end = nir_iand_imm(b, end, ~3);

        /* Add the job offset. */
        start = nir_iadd(b, start, nir_imul_imm(b, thread_id, sizeof(uint32_t)));

        nir_variable *offset_var =
                nir_local_variable_create(b->impl, glsl_uint_type(), "offset");
        nir_store_var(b, offset_var, start, 1);

        LOOP {
                nir_ssa_def *offset = nir_load_var(b, offset_var);
                IF (nir_uge(b, offset, end))
                        BREAK;
                ENDIF

                nir_ssa_def *val = load_global(b, get_address(b, base, offset), 1, 32);
                nir_ssa_def *old_min = nir_load_var(b, min_var);
                nir_ssa_def *old_max = nir_load_var(b, max_var);
                nir_ssa_def *new_min;
                nir_ssa_def *new_max;

                /* TODO: use 8/16 bit arithmetic when index_size < 4. */
                for (unsigned i = 0; i < 4; i += index_size) {
                        nir_ssa_def *data = nir_ushr_imm(b, val, i * 8);
                        data = nir_iand_imm(b, data, (1ULL << (index_size * 8)) - 1);
                        new_min = nir_umin(b, old_min, data);
                        new_max = nir_umax(b, old_max, data);
                        if (restart_index) {
                                new_min = nir_bcsel(b, nir_ine(b, restart_index, data), new_min, old_min);
                                new_max = nir_bcsel(b, nir_ine(b, restart_index, data), new_max, old_max);
                        }
                        old_min = new_min;
                        old_max = new_max;
                }

                nir_store_var(b, min_var, new_min, 1);
                nir_store_var(b, max_var, new_max, 1);
                nir_store_var(b, offset_var,
                              nir_iadd_imm(b, offset, MIN_MAX_JOBS * sizeof(uint32_t)), 1);
        }

        IF (nir_ult(b, start, end))
                update_min(builder, nir_load_var(b, min_var));
                update_max(builder, nir_load_var(b, max_var));
        ENDIF
}

static unsigned
get_shader_id(unsigned flags, unsigned index_size, bool index_min_max_search)
{
        if (!index_min_max_search) {
                flags &= PAN_INDIRECT_DRAW_FLAGS_MASK;
                flags &= ~PAN_INDIRECT_DRAW_INDEX_SIZE_MASK;
                if (index_size)
                        flags |= (util_logbase2(index_size) + 1);
                return flags;
        }

        return ((flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART) ?
                PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX_PRIM_RESTART :
                PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX) +
               util_logbase2(index_size);
}

static void
create_indirect_draw_shader(struct panfrost_device *dev,
                            unsigned flags, unsigned index_size,
                            bool index_min_max_search)
{
        assert(flags < PAN_INDIRECT_DRAW_NUM_SHADERS);
        struct indirect_draw_shader_builder builder;
        init_shader_builder(&builder, dev, flags, index_size, index_min_max_search);

        nir_builder *b = &builder.b;

        if (index_min_max_search)
                get_index_min_max(&builder);
        else
                patch(&builder);

        struct panfrost_compile_inputs inputs = {
                .gpu_id = dev->gpu_id,
                .fixed_sysval_ubo = -1,
                .no_ubo_to_push = true,
        };
        struct pan_shader_info shader_info;
        struct util_dynarray binary;

        util_dynarray_init(&binary, NULL);
        GENX(pan_shader_compile)(b->shader, &inputs, &binary, &shader_info);

        assert(!shader_info.tls_size);
        assert(!shader_info.wls_size);
        assert(!shader_info.sysvals.sysval_count);

        shader_info.push.count =
                DIV_ROUND_UP(sizeof(struct indirect_draw_inputs), 4);

        unsigned shader_id = get_shader_id(flags, index_size, index_min_max_search);
        struct pan_indirect_draw_shader *draw_shader =
                &dev->indirect_draw_shaders.shaders[shader_id];
        void *state = dev->indirect_draw_shaders.states->ptr.cpu +
                      (shader_id * pan_size(RENDERER_STATE));

        pthread_mutex_lock(&dev->indirect_draw_shaders.lock);
        if (!draw_shader->rsd) {
                mali_ptr address =
                        pan_pool_upload_aligned(dev->indirect_draw_shaders.bin_pool,
                                                binary.data, binary.size,
                                                PAN_ARCH >= 6 ? 128 : 64);

                util_dynarray_fini(&binary);

                pan_pack(state, RENDERER_STATE, cfg) {
                        pan_shader_prepare_rsd(&shader_info, address, &cfg);
                }

                draw_shader->push = shader_info.push;
                draw_shader->rsd = dev->indirect_draw_shaders.states->ptr.gpu +
                                   (shader_id * pan_size(RENDERER_STATE));
        }
        pthread_mutex_unlock(&dev->indirect_draw_shaders.lock);

        ralloc_free(b->shader);
}

static mali_ptr
get_renderer_state(struct panfrost_device *dev, unsigned flags,
                   unsigned index_size, bool index_min_max_search)
{
        unsigned shader_id = get_shader_id(flags, index_size, index_min_max_search);
        struct pan_indirect_draw_shader *info =
                &dev->indirect_draw_shaders.shaders[shader_id];

        if (!info->rsd) {
                create_indirect_draw_shader(dev, flags, index_size,
                                            index_min_max_search);
                assert(info->rsd);
        }

        return info->rsd;
}

static mali_ptr
get_tls(const struct panfrost_device *dev)
{
        return dev->indirect_draw_shaders.states->ptr.gpu +
               (PAN_INDIRECT_DRAW_NUM_SHADERS * pan_size(RENDERER_STATE));
}

static void
panfrost_indirect_draw_alloc_deps(struct panfrost_device *dev)
{
        pthread_mutex_lock(&dev->indirect_draw_shaders.lock);
        if (dev->indirect_draw_shaders.states)
                goto out;

        unsigned state_bo_size = (PAN_INDIRECT_DRAW_NUM_SHADERS *
                                  pan_size(RENDERER_STATE)) +
                                 pan_size(LOCAL_STORAGE);

        dev->indirect_draw_shaders.states =
                panfrost_bo_create(dev, state_bo_size, 0, "Indirect draw states");

        /* Prepare the thread storage descriptor now since it's invariant. */
        void *tsd = dev->indirect_draw_shaders.states->ptr.cpu +
                    (PAN_INDIRECT_DRAW_NUM_SHADERS * pan_size(RENDERER_STATE));
        pan_pack(tsd, LOCAL_STORAGE, ls) {
                ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM;
        };

        /* FIXME: Currently allocating 512M of growable memory, meaning that we
         * only allocate what we really use. The problem is:
         * - allocation happens 2M at a time, which might be more than we
         *   actually need
         * - the memory is attached to the device to speed up subsequent
         *   indirect draws, but that also means it's never shrunk
         */
        dev->indirect_draw_shaders.varying_heap =
                panfrost_bo_create(dev, 512 * 1024 * 1024,
                                   PAN_BO_INVISIBLE | PAN_BO_GROWABLE,
                                   "Indirect draw varying heap");

out:
        pthread_mutex_unlock(&dev->indirect_draw_shaders.lock);
}

static unsigned
panfrost_emit_index_min_max_search(struct pan_pool *pool,
                                   struct pan_scoreboard *scoreboard,
                                   const struct pan_indirect_draw_info *draw_info,
                                   const struct indirect_draw_inputs *inputs,
                                   struct indirect_draw_context *draw_ctx)
{
        struct panfrost_device *dev = pool->dev;
        unsigned index_size = draw_info->index_size;

        if (!index_size)
                return 0;

        mali_ptr rsd =
                get_renderer_state(dev, draw_info->flags,
                                   draw_info->index_size, true);
        struct panfrost_ptr job =
                pan_pool_alloc_desc(pool, COMPUTE_JOB);
        void *invocation =
                pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);
        panfrost_pack_work_groups_compute(invocation,
                                          1, 1, 1, MIN_MAX_JOBS, 1, 1,
                                          false, false);

        pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
                cfg.job_task_split = 7;
        }

        pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
                cfg.state = rsd;
                cfg.thread_storage = get_tls(pool->dev);
                cfg.push_uniforms =
                        pan_pool_upload_aligned(pool, inputs, sizeof(*inputs), 16);
        }

        return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
                                false, false, 0, 0, &job, false);
}

unsigned
GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool,
                                  struct pan_scoreboard *scoreboard,
                                  const struct pan_indirect_draw_info *draw_info,
                                  struct panfrost_ptr *ctx)
{
        struct panfrost_device *dev = pool->dev;

        /* Currently only tested on Bifrost, but the logic should be the same
         * on Midgard.
         */
        assert(pan_is_bifrost(dev));

        panfrost_indirect_draw_alloc_deps(dev);

        struct panfrost_ptr job =
                pan_pool_alloc_desc(pool, COMPUTE_JOB);
        mali_ptr rsd =
                get_renderer_state(dev, draw_info->flags,
                                   draw_info->index_size, false);

        struct indirect_draw_context draw_ctx = {
                .varying_mem = dev->indirect_draw_shaders.varying_heap->ptr.gpu,
        };

        struct panfrost_ptr draw_ctx_ptr = *ctx;
        if (!draw_ctx_ptr.cpu) {
                draw_ctx_ptr = pan_pool_alloc_aligned(pool,
                                                      sizeof(draw_ctx),
                                                      sizeof(mali_ptr));
        }

        struct indirect_draw_inputs inputs = {
                .draw_ctx = draw_ctx_ptr.gpu,
                .draw_buf = draw_info->draw_buf,
                .index_buf = draw_info->index_buf,
                .first_vertex_sysval = draw_info->first_vertex_sysval,
                .base_vertex_sysval = draw_info->base_vertex_sysval,
                .base_instance_sysval = draw_info->base_instance_sysval,
                .vertex_job = draw_info->vertex_job,
                .tiler_job = draw_info->tiler_job,
                .attrib_bufs = draw_info->attrib_bufs,
                .attribs = draw_info->attribs,
                .varying_bufs = draw_info->varying_bufs,
                .attrib_count = draw_info->attrib_count,
        };

        if (draw_info->index_size) {
                inputs.restart_index = draw_info->restart_index;

                struct panfrost_ptr min_max_ctx_ptr =
                        pan_pool_alloc_aligned(pool,
                                               sizeof(struct min_max_context),
                                               4);
                struct min_max_context *ctx = min_max_ctx_ptr.cpu;

                ctx->min = UINT32_MAX;
                ctx->max = 0;
                inputs.min_max_ctx = min_max_ctx_ptr.gpu;
        }

        void *invocation =
                pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);
        panfrost_pack_work_groups_compute(invocation,
                                          1, 1, 1, 1, 1, 1,
                                          false, false);

        pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
                cfg.job_task_split = 2;
        }

        pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
                cfg.state = rsd;
                cfg.thread_storage = get_tls(pool->dev);
                cfg.push_uniforms =
                        pan_pool_upload_aligned(pool, &inputs, sizeof(inputs), 16);
        }

        unsigned global_dep = draw_info->last_indirect_draw;
        unsigned local_dep =
                panfrost_emit_index_min_max_search(pool, scoreboard, draw_info,
                                                   &inputs, &draw_ctx);

        if (!ctx->cpu) {
                *ctx = draw_ctx_ptr;
                memcpy(ctx->cpu, &draw_ctx, sizeof(draw_ctx));
        }

        return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
                                false, true, local_dep, global_dep,
                                &job, false);
}

void
GENX(panfrost_init_indirect_draw_shaders)(struct panfrost_device *dev,
                                          struct pan_pool *bin_pool)
{
        /* We allocate the states and varying_heap BO lazily to avoid
         * reserving memory when indirect draws are not used.
         */
        pthread_mutex_init(&dev->indirect_draw_shaders.lock, NULL);
        dev->indirect_draw_shaders.bin_pool = bin_pool;
}

void
GENX(panfrost_cleanup_indirect_draw_shaders)(struct panfrost_device *dev)
{
        panfrost_bo_unreference(dev->indirect_draw_shaders.states);
        panfrost_bo_unreference(dev->indirect_draw_shaders.varying_heap);
        pthread_mutex_destroy(&dev->indirect_draw_shaders.lock);
}