1/* 2 * Copyright © 2014 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * This code is based on original work by Ilia Mirkin. 24 */ 25 26/** 27 * \file gfx6_gs_visitor.cpp 28 * 29 * Gfx6 geometry shader implementation 30 */ 31 32#include "gfx6_gs_visitor.h" 33#include "brw_eu.h" 34#include "brw_prim.h" 35 36namespace brw { 37 38void 39gfx6_gs_visitor::emit_prolog() 40{ 41 vec4_gs_visitor::emit_prolog(); 42 43 /* Gfx6 geometry shaders require to allocate an initial VUE handle via 44 * FF_SYNC message, however the documentation remarks that only one thread 45 * can write to the URB simultaneously and the FF_SYNC message provides the 46 * synchronization mechanism for this, so using this message effectively 47 * stalls the thread until it is its turn to write to the URB. Because of 48 * this, the best way to implement geometry shader algorithms in gfx6 is to 49 * execute the algorithm before the FF_SYNC message to maximize parallelism. 50 * 51 * To achieve this we buffer the geometry shader outputs for each emitted 52 * vertex in vertex_output during operation. Then, when we have processed 53 * the last vertex (that is, at thread end time), we send the FF_SYNC 54 * message to allocate the initial VUE handle and write all buffered vertex 55 * data to the URB in one go. 56 * 57 * For each emitted vertex, vertex_output will hold vue_map.num_slots 58 * data items plus one additional item to hold required flags 59 * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message) 60 * which come right after the data items for that vertex. Vertex data and 61 * flags for the next vertex come right after the data items and flags for 62 * the previous vertex. 63 */ 64 this->current_annotation = "gfx6 prolog"; 65 this->vertex_output = src_reg(this, 66 glsl_type::uint_type, 67 (prog_data->vue_map.num_slots + 1) * 68 nir->info.gs.vertices_out); 69 this->vertex_output_offset = src_reg(this, glsl_type::uint_type); 70 emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u))); 71 72 /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES), 73 * so initialize it once to R0. 74 */ 75 vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1), 76 retype(brw_vec8_grf(0, 0), 77 BRW_REGISTER_TYPE_UD))); 78 inst->force_writemask_all = true; 79 80 /* This will be used as a temporary to store writeback data of FF_SYNC 81 * and URB_WRITE messages. 82 */ 83 this->temp = src_reg(this, glsl_type::uint_type); 84 85 /* This will be used to know when we are processing the first vertex of 86 * a primitive. We will set this to URB_WRITE_PRIM_START only when we know 87 * that we are processing the first vertex in the primitive and to zero 88 * otherwise. This way we can use its value directly in the URB write 89 * headers. 90 */ 91 this->first_vertex = src_reg(this, glsl_type::uint_type); 92 emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(URB_WRITE_PRIM_START))); 93 94 /* The FF_SYNC message requires to know the number of primitives generated, 95 * so keep a counter for this. 96 */ 97 this->prim_count = src_reg(this, glsl_type::uint_type); 98 emit(MOV(dst_reg(this->prim_count), brw_imm_ud(0u))); 99 100 if (gs_prog_data->num_transform_feedback_bindings) { 101 /* Create a virtual register to hold destination indices in SOL */ 102 this->destination_indices = src_reg(this, glsl_type::uvec4_type); 103 /* Create a virtual register to hold number of written primitives */ 104 this->sol_prim_written = src_reg(this, glsl_type::uint_type); 105 /* Create a virtual register to hold Streamed Vertex Buffer Indices */ 106 this->svbi = src_reg(this, glsl_type::uvec4_type); 107 /* Create a virtual register to hold max values of SVBI */ 108 this->max_svbi = src_reg(this, glsl_type::uvec4_type); 109 emit(MOV(dst_reg(this->max_svbi), 110 src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD)))); 111 } 112 113 /* PrimitveID is delivered in r0.1 of the thread payload. If the program 114 * needs it we have to move it to a separate register where we can map 115 * the attribute. 116 * 117 * Notice that we cannot use a virtual register for this, because we need to 118 * map all input attributes to hardware registers in setup_payload(), 119 * which happens before virtual registers are mapped to hardware registers. 120 * We could work around that issue if we were able to compute the first 121 * non-payload register here and move the PrimitiveID information to that 122 * register, but we can't because at this point we don't know the final 123 * number uniforms that will be included in the payload. 124 * 125 * So, what we do is to place PrimitiveID information in r1, which is always 126 * delivered as part of the payload, but its only populated with data 127 * relevant for transform feedback when we set GFX6_GS_SVBI_PAYLOAD_ENABLE 128 * in the 3DSTATE_GS state packet. That information can be obtained by other 129 * means though, so we can safely use r1 for this purpose. 130 */ 131 if (gs_prog_data->include_primitive_id) { 132 this->primitive_id = 133 src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); 134 emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id)); 135 } 136} 137 138void 139gfx6_gs_visitor::gs_emit_vertex(int stream_id) 140{ 141 this->current_annotation = "gfx6 emit vertex"; 142 143 /* Buffer all output slots for this vertex in vertex_output */ 144 for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) { 145 int varying = prog_data->vue_map.slot_to_varying[slot]; 146 if (varying != VARYING_SLOT_PSIZ) { 147 dst_reg dst(this->vertex_output); 148 dst.reladdr = ralloc(mem_ctx, src_reg); 149 memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg)); 150 emit_urb_slot(dst, varying); 151 } else { 152 /* The PSIZ slot can pack multiple varyings in different channels 153 * and emit_urb_slot() will produce a MOV instruction for each of 154 * them. Since we are writing to an array, that will translate to 155 * possibly multiple MOV instructions with an array destination and 156 * each will generate a scratch write with the same offset into 157 * scratch space (thus, each one overwriting the previous). This is 158 * not what we want. What we will do instead is emit PSIZ to a 159 * a regular temporary register, then move that register into the 160 * array. This way we only have one instruction with an array 161 * destination and we only produce a single scratch write. 162 */ 163 dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type)); 164 emit_urb_slot(tmp, varying); 165 dst_reg dst(this->vertex_output); 166 dst.reladdr = ralloc(mem_ctx, src_reg); 167 memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg)); 168 vec4_instruction *inst = emit(MOV(dst, src_reg(tmp))); 169 inst->force_writemask_all = true; 170 } 171 172 emit(ADD(dst_reg(this->vertex_output_offset), 173 this->vertex_output_offset, brw_imm_ud(1u))); 174 } 175 176 /* Now buffer flags for this vertex */ 177 dst_reg dst(this->vertex_output); 178 dst.reladdr = ralloc(mem_ctx, src_reg); 179 memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg)); 180 if (nir->info.gs.output_primitive == GL_POINTS) { 181 /* If we are outputting points, then every vertex has PrimStart and 182 * PrimEnd set. 183 */ 184 emit(MOV(dst, brw_imm_d((_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) | 185 URB_WRITE_PRIM_START | URB_WRITE_PRIM_END))); 186 emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u))); 187 } else { 188 /* Otherwise, we can only set the PrimStart flag, which we have stored 189 * in the first_vertex register. We will have to wait until we execute 190 * EndPrimitive() or we end the thread to set the PrimEnd flag on a 191 * vertex. 192 */ 193 emit(OR(dst, this->first_vertex, 194 brw_imm_ud(gs_prog_data->output_topology << 195 URB_WRITE_PRIM_TYPE_SHIFT))); 196 emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(0u))); 197 } 198 emit(ADD(dst_reg(this->vertex_output_offset), 199 this->vertex_output_offset, brw_imm_ud(1u))); 200} 201 202void 203gfx6_gs_visitor::gs_end_primitive() 204{ 205 this->current_annotation = "gfx6 end primitive"; 206 /* Calling EndPrimitive() is optional for point output. In this case we set 207 * the PrimEnd flag when we process EmitVertex(). 208 */ 209 if (nir->info.gs.output_primitive == GL_POINTS) 210 return; 211 212 /* Otherwise we know that the last vertex we have processed was the last 213 * vertex in the primitive and we need to set its PrimEnd flag, so do this 214 * unless we haven't emitted that vertex at all (vertex_count != 0). 215 * 216 * Notice that we have already incremented vertex_count when we processed 217 * the last emit_vertex, so we need to take that into account in the 218 * comparison below (hence the num_output_vertices + 1 in the comparison 219 * below). 220 */ 221 unsigned num_output_vertices = nir->info.gs.vertices_out; 222 emit(CMP(dst_null_ud(), this->vertex_count, 223 brw_imm_ud(num_output_vertices + 1), BRW_CONDITIONAL_L)); 224 vec4_instruction *inst = emit(CMP(dst_null_ud(), 225 this->vertex_count, brw_imm_ud(0u), 226 BRW_CONDITIONAL_NEQ)); 227 inst->predicate = BRW_PREDICATE_NORMAL; 228 emit(IF(BRW_PREDICATE_NORMAL)); 229 { 230 /* vertex_output_offset is already pointing at the first entry of the 231 * next vertex. So subtract 1 to modify the flags for the previous 232 * vertex. 233 */ 234 src_reg offset(this, glsl_type::uint_type); 235 emit(ADD(dst_reg(offset), this->vertex_output_offset, brw_imm_d(-1))); 236 237 src_reg dst(this->vertex_output); 238 dst.reladdr = ralloc(mem_ctx, src_reg); 239 memcpy(dst.reladdr, &offset, sizeof(src_reg)); 240 241 emit(OR(dst_reg(dst), dst, brw_imm_d(URB_WRITE_PRIM_END))); 242 emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u))); 243 244 /* Set the first vertex flag to indicate that the next vertex will start 245 * a primitive. 246 */ 247 emit(MOV(dst_reg(this->first_vertex), brw_imm_d(URB_WRITE_PRIM_START))); 248 } 249 emit(BRW_OPCODE_ENDIF); 250} 251 252void 253gfx6_gs_visitor::emit_urb_write_header(int mrf) 254{ 255 this->current_annotation = "gfx6 urb header"; 256 /* Compute offset of the flags for the current vertex in vertex_output and 257 * write them in dw2 of the message header. 258 * 259 * Notice that by the time that emit_thread_end() calls here 260 * vertex_output_offset should point to the first data item of the current 261 * vertex in vertex_output, thus we only need to add the number of output 262 * slots per vertex to that offset to obtain the flags data offset. 263 */ 264 src_reg flags_offset(this, glsl_type::uint_type); 265 emit(ADD(dst_reg(flags_offset), 266 this->vertex_output_offset, 267 brw_imm_d(prog_data->vue_map.num_slots))); 268 269 src_reg flags_data(this->vertex_output); 270 flags_data.reladdr = ralloc(mem_ctx, src_reg); 271 memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg)); 272 273 emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data); 274} 275 276static unsigned 277align_interleaved_urb_mlen(unsigned mlen) 278{ 279 /* URB data written (does not include the message header reg) must 280 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5, 281 * section 5.4.3.2.2: URB_INTERLEAVED. 282 */ 283 if ((mlen % 2) != 1) 284 mlen++; 285 return mlen; 286} 287 288void 289gfx6_gs_visitor::emit_snb_gs_urb_write_opcode(bool complete, int base_mrf, 290 int last_mrf, int urb_offset) 291{ 292 vec4_instruction *inst = NULL; 293 294 if (!complete) { 295 /* If the vertex is not complete we don't have to do anything special */ 296 inst = emit(VEC4_GS_OPCODE_URB_WRITE); 297 inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS; 298 } else { 299 /* Otherwise we always request to allocate a new VUE handle. If this is 300 * the last write before the EOT message and the new handle never gets 301 * used it will be dereferenced when we send the EOT message. This is 302 * necessary to avoid different setups for the EOT message (one for the 303 * case when there is no output and another for the case when there is) 304 * which would require to end the program with an IF/ELSE/ENDIF block, 305 * something we do not want. 306 */ 307 inst = emit(VEC4_GS_OPCODE_URB_WRITE_ALLOCATE); 308 inst->urb_write_flags = BRW_URB_WRITE_COMPLETE; 309 inst->dst = dst_reg(MRF, base_mrf); 310 inst->src[0] = this->temp; 311 } 312 313 inst->base_mrf = base_mrf; 314 inst->mlen = align_interleaved_urb_mlen(last_mrf - base_mrf); 315 inst->offset = urb_offset; 316} 317 318void 319gfx6_gs_visitor::emit_thread_end() 320{ 321 /* Make sure the current primitive is ended: we know it is not ended when 322 * first_vertex is not zero. This is only relevant for outputs other than 323 * points because in the point case we set PrimEnd on all vertices. 324 */ 325 if (nir->info.gs.output_primitive != GL_POINTS) { 326 emit(CMP(dst_null_ud(), this->first_vertex, brw_imm_ud(0u), BRW_CONDITIONAL_Z)); 327 emit(IF(BRW_PREDICATE_NORMAL)); 328 gs_end_primitive(); 329 emit(BRW_OPCODE_ENDIF); 330 } 331 332 /* Here we have to: 333 * 1) Emit an FF_SYNC message to obtain an initial VUE handle. 334 * 2) Loop over all buffered vertex data and write it to corresponding 335 * URB entries. 336 * 3) Allocate new VUE handles for all vertices other than the first. 337 * 4) Send a final EOT message. 338 */ 339 340 /* MRF 0 is reserved for the debugger, so start with message header 341 * in MRF 1. 342 */ 343 int base_mrf = 1; 344 345 /* In the process of generating our URB write message contents, we 346 * may need to unspill a register or load from an array. Those 347 * reads would use MRFs 21..23 348 */ 349 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->ver); 350 351 /* Issue the FF_SYNC message and obtain the initial VUE handle. */ 352 this->current_annotation = "gfx6 thread end: ff_sync"; 353 354 vec4_instruction *inst = NULL; 355 if (gs_prog_data->num_transform_feedback_bindings) { 356 src_reg sol_temp(this, glsl_type::uvec4_type); 357 emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES, 358 dst_reg(this->svbi), 359 this->vertex_count, 360 this->prim_count, 361 sol_temp); 362 inst = emit(GS_OPCODE_FF_SYNC, 363 dst_reg(this->temp), this->prim_count, this->svbi); 364 } else { 365 inst = emit(GS_OPCODE_FF_SYNC, 366 dst_reg(this->temp), this->prim_count, brw_imm_ud(0u)); 367 } 368 inst->base_mrf = base_mrf; 369 370 emit(CMP(dst_null_ud(), this->vertex_count, brw_imm_ud(0u), BRW_CONDITIONAL_G)); 371 emit(IF(BRW_PREDICATE_NORMAL)); 372 { 373 /* Loop over all buffered vertices and emit URB write messages */ 374 this->current_annotation = "gfx6 thread end: urb writes init"; 375 src_reg vertex(this, glsl_type::uint_type); 376 emit(MOV(dst_reg(vertex), brw_imm_ud(0u))); 377 emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u))); 378 379 this->current_annotation = "gfx6 thread end: urb writes"; 380 emit(BRW_OPCODE_DO); 381 { 382 emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE)); 383 inst = emit(BRW_OPCODE_BREAK); 384 inst->predicate = BRW_PREDICATE_NORMAL; 385 386 /* First we prepare the message header */ 387 emit_urb_write_header(base_mrf); 388 389 /* Then add vertex data to the message in interleaved fashion */ 390 int slot = 0; 391 bool complete = false; 392 do { 393 int mrf = base_mrf + 1; 394 395 /* URB offset is in URB row increments, and each of our MRFs is half 396 * of one of those, since we're doing interleaved writes. 397 */ 398 int urb_offset = slot / 2; 399 400 for (; slot < prog_data->vue_map.num_slots; ++slot) { 401 int varying = prog_data->vue_map.slot_to_varying[slot]; 402 current_annotation = output_reg_annotation[varying]; 403 404 /* Compute offset of this slot for the current vertex 405 * in vertex_output 406 */ 407 src_reg data(this->vertex_output); 408 data.reladdr = ralloc(mem_ctx, src_reg); 409 memcpy(data.reladdr, &this->vertex_output_offset, 410 sizeof(src_reg)); 411 412 /* Copy this slot to the appropriate message register */ 413 dst_reg reg = dst_reg(MRF, mrf); 414 reg.type = output_reg[varying][0].type; 415 data.type = reg.type; 416 inst = emit(MOV(reg, data)); 417 inst->force_writemask_all = true; 418 419 mrf++; 420 emit(ADD(dst_reg(this->vertex_output_offset), 421 this->vertex_output_offset, brw_imm_ud(1u))); 422 423 /* If this was max_usable_mrf, we can't fit anything more into 424 * this URB WRITE. Same if we reached the max. message length. 425 */ 426 if (mrf > max_usable_mrf || 427 align_interleaved_urb_mlen(mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) { 428 slot++; 429 break; 430 } 431 } 432 433 complete = slot >= prog_data->vue_map.num_slots; 434 emit_snb_gs_urb_write_opcode(complete, base_mrf, mrf, urb_offset); 435 } while (!complete); 436 437 /* Skip over the flags data item so that vertex_output_offset points 438 * to the first data item of the next vertex, so that we can start 439 * writing the next vertex. 440 */ 441 emit(ADD(dst_reg(this->vertex_output_offset), 442 this->vertex_output_offset, brw_imm_ud(1u))); 443 444 emit(ADD(dst_reg(vertex), vertex, brw_imm_ud(1u))); 445 } 446 emit(BRW_OPCODE_WHILE); 447 448 if (gs_prog_data->num_transform_feedback_bindings) 449 xfb_write(); 450 } 451 emit(BRW_OPCODE_ENDIF); 452 453 /* Finally, emit EOT message. 454 * 455 * In gfx6 we need to end the thread differently depending on whether we have 456 * emitted at least one vertex or not. In case we did, the EOT message must 457 * always include the COMPLETE flag or else the GPU hangs. If we have not 458 * produced any output we can't use the COMPLETE flag. 459 * 460 * However, this would lead us to end the program with an ENDIF opcode, 461 * which we want to avoid, so what we do is that we always request a new 462 * VUE handle every time, even if GS produces no output. 463 * With this we make sure that whether we have emitted at least one vertex 464 * or none at all, we have to finish the thread without writing to the URB, 465 * which works for both cases by setting the COMPLETE and UNUSED flags in 466 * the EOT message. 467 */ 468 this->current_annotation = "gfx6 thread end: EOT"; 469 470 if (gs_prog_data->num_transform_feedback_bindings) { 471 /* When emitting EOT, set SONumPrimsWritten Increment Value. */ 472 src_reg data(this, glsl_type::uint_type); 473 emit(AND(dst_reg(data), this->sol_prim_written, brw_imm_ud(0xffffu))); 474 emit(SHL(dst_reg(data), data, brw_imm_ud(16u))); 475 emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data); 476 } 477 478 inst = emit(GS_OPCODE_THREAD_END); 479 inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED; 480 inst->base_mrf = base_mrf; 481 inst->mlen = 1; 482} 483 484void 485gfx6_gs_visitor::setup_payload() 486{ 487 int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES]; 488 489 /* Attributes are going to be interleaved, so one register contains two 490 * attribute slots. 491 */ 492 int attributes_per_reg = 2; 493 494 /* If a geometry shader tries to read from an input that wasn't written by 495 * the vertex shader, that produces undefined results, but it shouldn't 496 * crash anything. So initialize attribute_map to zeros--that ensures that 497 * these undefined results are read from r0. 498 */ 499 memset(attribute_map, 0, sizeof(attribute_map)); 500 501 int reg = 0; 502 503 /* The payload always contains important data in r0. */ 504 reg++; 505 506 /* r1 is always part of the payload and it holds information relevant 507 * for transform feedback when we set the GFX6_GS_SVBI_PAYLOAD_ENABLE bit in 508 * the 3DSTATE_GS packet. We will overwrite it with the PrimitiveID 509 * information (and move the original value to a virtual register if 510 * necessary). 511 */ 512 if (gs_prog_data->include_primitive_id) 513 attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg; 514 reg++; 515 516 reg = setup_uniforms(reg); 517 518 reg = setup_varying_inputs(reg, attributes_per_reg); 519 520 this->first_non_payload_grf = reg; 521} 522 523void 524gfx6_gs_visitor::xfb_write() 525{ 526 unsigned num_verts; 527 528 switch (gs_prog_data->output_topology) { 529 case _3DPRIM_POINTLIST: 530 num_verts = 1; 531 break; 532 case _3DPRIM_LINELIST: 533 case _3DPRIM_LINESTRIP: 534 case _3DPRIM_LINELOOP: 535 num_verts = 2; 536 break; 537 case _3DPRIM_TRILIST: 538 case _3DPRIM_TRIFAN: 539 case _3DPRIM_TRISTRIP: 540 case _3DPRIM_RECTLIST: 541 num_verts = 3; 542 break; 543 case _3DPRIM_QUADLIST: 544 case _3DPRIM_QUADSTRIP: 545 case _3DPRIM_POLYGON: 546 num_verts = 3; 547 break; 548 default: 549 unreachable("Unexpected primitive type in Gfx6 SOL program."); 550 } 551 552 this->current_annotation = "gfx6 thread end: svb writes init"; 553 554 emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u))); 555 emit(MOV(dst_reg(this->sol_prim_written), brw_imm_ud(0u))); 556 557 /* Check that at least one primitive can be written 558 * 559 * Note: since we use the binding table to keep track of buffer offsets 560 * and stride, the GS doesn't need to keep track of a separate pointer 561 * into each buffer; it uses a single pointer which increments by 1 for 562 * each vertex. So we use SVBI0 for this pointer, regardless of whether 563 * transform feedback is in interleaved or separate attribs mode. 564 */ 565 src_reg sol_temp(this, glsl_type::uvec4_type); 566 emit(ADD(dst_reg(sol_temp), this->svbi, brw_imm_ud(num_verts))); 567 568 /* Compare SVBI calculated number with the maximum value, which is 569 * in R1.4 (previously saved in this->max_svbi) for gfx6. 570 */ 571 emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE)); 572 emit(IF(BRW_PREDICATE_NORMAL)); 573 { 574 vec4_instruction *inst = emit(MOV(dst_reg(destination_indices), 575 brw_imm_vf4(brw_float_to_vf(0.0), 576 brw_float_to_vf(1.0), 577 brw_float_to_vf(2.0), 578 brw_float_to_vf(0.0)))); 579 inst->force_writemask_all = true; 580 581 emit(ADD(dst_reg(this->destination_indices), 582 this->destination_indices, 583 this->svbi)); 584 } 585 emit(BRW_OPCODE_ENDIF); 586 587 /* Write transform feedback data for all processed vertices. */ 588 for (int i = 0; i < (int)nir->info.gs.vertices_out; i++) { 589 emit(MOV(dst_reg(sol_temp), brw_imm_d(i))); 590 emit(CMP(dst_null_d(), sol_temp, this->vertex_count, 591 BRW_CONDITIONAL_L)); 592 emit(IF(BRW_PREDICATE_NORMAL)); 593 { 594 xfb_program(i, num_verts); 595 } 596 emit(BRW_OPCODE_ENDIF); 597 } 598} 599 600void 601gfx6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts) 602{ 603 unsigned binding; 604 unsigned num_bindings = gs_prog_data->num_transform_feedback_bindings; 605 src_reg sol_temp(this, glsl_type::uvec4_type); 606 607 /* Check for buffer overflow: we need room to write the complete primitive 608 * (all vertices). Otherwise, avoid writing any vertices for it 609 */ 610 emit(ADD(dst_reg(sol_temp), this->sol_prim_written, brw_imm_ud(1u))); 611 emit(MUL(dst_reg(sol_temp), sol_temp, brw_imm_ud(num_verts))); 612 emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi)); 613 emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE)); 614 emit(IF(BRW_PREDICATE_NORMAL)); 615 { 616 /* Avoid overwriting MRF 1 as it is used as URB write message header */ 617 dst_reg mrf_reg(MRF, 2); 618 619 this->current_annotation = "gfx6: emit SOL vertex data"; 620 /* For each vertex, generate code to output each varying using the 621 * appropriate binding table entry. 622 */ 623 for (binding = 0; binding < num_bindings; ++binding) { 624 unsigned char varying = 625 gs_prog_data->transform_feedback_bindings[binding]; 626 627 /* Set up the correct destination index for this vertex */ 628 vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX, 629 mrf_reg, 630 this->destination_indices); 631 inst->sol_vertex = vertex % num_verts; 632 633 /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1: 634 * 635 * "Prior to End of Thread with a URB_WRITE, the kernel must 636 * ensure that all writes are complete by sending the final 637 * write as a committed write." 638 */ 639 bool final_write = binding == (unsigned) num_bindings - 1 && 640 inst->sol_vertex == num_verts - 1; 641 642 /* Compute offset of this varying for the current vertex 643 * in vertex_output 644 */ 645 this->current_annotation = output_reg_annotation[varying]; 646 src_reg data(this->vertex_output); 647 data.reladdr = ralloc(mem_ctx, src_reg); 648 int offset = get_vertex_output_offset_for_varying(vertex, varying); 649 emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_d(offset))); 650 memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg)); 651 data.type = output_reg[varying][0].type; 652 data.swizzle = gs_prog_data->transform_feedback_swizzles[binding]; 653 654 /* Write data */ 655 inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp); 656 inst->sol_binding = binding; 657 inst->sol_final_write = final_write; 658 659 if (final_write) { 660 /* This is the last vertex of the primitive, then increment 661 * SO num primitive counter and destination indices. 662 */ 663 emit(ADD(dst_reg(this->destination_indices), 664 this->destination_indices, 665 brw_imm_ud(num_verts))); 666 emit(ADD(dst_reg(this->sol_prim_written), 667 this->sol_prim_written, brw_imm_ud(1u))); 668 } 669 670 } 671 this->current_annotation = NULL; 672 } 673 emit(BRW_OPCODE_ENDIF); 674} 675 676int 677gfx6_gs_visitor::get_vertex_output_offset_for_varying(int vertex, int varying) 678{ 679 /* Find the output slot assigned to this varying. 680 * 681 * VARYING_SLOT_LAYER and VARYING_SLOT_VIEWPORT are packed in the same slot 682 * as VARYING_SLOT_PSIZ. 683 */ 684 if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT) 685 varying = VARYING_SLOT_PSIZ; 686 int slot = prog_data->vue_map.varying_to_slot[varying]; 687 688 if (slot < 0) { 689 /* This varying does not exist in the VUE so we are not writing to it 690 * and its value is undefined. We still want to return a valid offset 691 * into vertex_output though, to prevent any out-of-bound accesses into 692 * the vertex_output array. Since the value for this varying is undefined 693 * we don't really care for the value we assign to it, so any offset 694 * within the limits of vertex_output will do. 695 */ 696 slot = 0; 697 } 698 699 return vertex * (prog_data->vue_map.num_slots + 1) + slot; 700} 701 702} /* namespace brw */ 703