1/* 2 * Copyright © 2018 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include "nir_xfb_info.h" 25 26#include "util/u_dynarray.h" 27#include <util/u_math.h> 28 29static void 30add_var_xfb_varying(nir_xfb_info *xfb, 31 nir_xfb_varyings_info *varyings, 32 unsigned buffer, 33 unsigned offset, 34 const struct glsl_type *type) 35{ 36 if (varyings == NULL) 37 return; 38 39 nir_xfb_varying_info *varying = &varyings->varyings[varyings->varying_count++]; 40 41 varying->type = type; 42 varying->buffer = buffer; 43 varying->offset = offset; 44 xfb->buffers[buffer].varying_count++; 45} 46 47 48static nir_xfb_info * 49nir_xfb_info_create(void *mem_ctx, uint16_t output_count) 50{ 51 return rzalloc_size(mem_ctx, nir_xfb_info_size(output_count)); 52} 53 54static size_t 55nir_xfb_varyings_info_size(uint16_t varying_count) 56{ 57 return sizeof(nir_xfb_info) + sizeof(nir_xfb_varying_info) * varying_count; 58} 59 60static nir_xfb_varyings_info * 61nir_xfb_varyings_info_create(void *mem_ctx, uint16_t varying_count) 62{ 63 return rzalloc_size(mem_ctx, nir_xfb_varyings_info_size(varying_count)); 64} 65 66static void 67add_var_xfb_outputs(nir_xfb_info *xfb, 68 nir_xfb_varyings_info *varyings, 69 nir_variable *var, 70 unsigned buffer, 71 unsigned *location, 72 unsigned *offset, 73 const struct glsl_type *type, 74 bool varying_added) 75{ 76 /* If this type contains a 64-bit value, align to 8 bytes */ 77 if (glsl_type_contains_64bit(type)) 78 *offset = ALIGN_POT(*offset, 8); 79 80 if (glsl_type_is_array_or_matrix(type) && !var->data.compact) { 81 unsigned length = glsl_get_length(type); 82 83 const struct glsl_type *child_type = glsl_get_array_element(type); 84 if (!glsl_type_is_array(child_type) && 85 !glsl_type_is_struct(child_type)) { 86 87 add_var_xfb_varying(xfb, varyings, buffer, *offset, type); 88 varying_added = true; 89 } 90 91 for (unsigned i = 0; i < length; i++) 92 add_var_xfb_outputs(xfb, varyings, var, buffer, location, offset, 93 child_type, varying_added); 94 } else if (glsl_type_is_struct_or_ifc(type)) { 95 unsigned length = glsl_get_length(type); 96 for (unsigned i = 0; i < length; i++) { 97 const struct glsl_type *child_type = glsl_get_struct_field(type, i); 98 add_var_xfb_outputs(xfb, varyings, var, buffer, location, offset, 99 child_type, varying_added); 100 } 101 } else { 102 assert(buffer < NIR_MAX_XFB_BUFFERS); 103 if (xfb->buffers_written & (1 << buffer)) { 104 assert(xfb->buffers[buffer].stride == var->data.xfb.stride); 105 assert(xfb->buffer_to_stream[buffer] == var->data.stream); 106 } else { 107 xfb->buffers_written |= (1 << buffer); 108 xfb->buffers[buffer].stride = var->data.xfb.stride; 109 xfb->buffer_to_stream[buffer] = var->data.stream; 110 } 111 112 assert(var->data.stream < NIR_MAX_XFB_STREAMS); 113 xfb->streams_written |= (1 << var->data.stream); 114 115 unsigned comp_slots; 116 if (var->data.compact) { 117 /* This only happens for clip/cull which are float arrays */ 118 assert(glsl_without_array(type) == glsl_float_type()); 119 assert(var->data.location == VARYING_SLOT_CLIP_DIST0 || 120 var->data.location == VARYING_SLOT_CLIP_DIST1); 121 comp_slots = glsl_get_length(type); 122 } else { 123 comp_slots = glsl_get_component_slots(type); 124 125 UNUSED unsigned attrib_slots = DIV_ROUND_UP(comp_slots, 4); 126 assert(attrib_slots == glsl_count_attribute_slots(type, false)); 127 128 /* Ensure that we don't have, for instance, a dvec2 with a 129 * location_frac of 2 which would make it crass a location boundary 130 * even though it fits in a single slot. However, you can have a 131 * dvec3 which crosses the slot boundary with a location_frac of 2. 132 */ 133 assert(DIV_ROUND_UP(var->data.location_frac + comp_slots, 4) == 134 attrib_slots); 135 } 136 137 assert(var->data.location_frac + comp_slots <= 8); 138 uint8_t comp_mask = ((1 << comp_slots) - 1) << var->data.location_frac; 139 unsigned comp_offset = var->data.location_frac; 140 141 if (!varying_added) { 142 add_var_xfb_varying(xfb, varyings, buffer, *offset, type); 143 } 144 145 while (comp_mask) { 146 nir_xfb_output_info *output = &xfb->outputs[xfb->output_count++]; 147 148 output->buffer = buffer; 149 output->offset = *offset; 150 output->location = *location; 151 output->component_mask = comp_mask & 0xf; 152 output->component_offset = comp_offset; 153 154 *offset += util_bitcount(output->component_mask) * 4; 155 (*location)++; 156 comp_mask >>= 4; 157 comp_offset = 0; 158 } 159 } 160} 161 162static int 163compare_xfb_varying_offsets(const void *_a, const void *_b) 164{ 165 const nir_xfb_varying_info *a = _a, *b = _b; 166 167 if (a->buffer != b->buffer) 168 return a->buffer - b->buffer; 169 170 return a->offset - b->offset; 171} 172 173static int 174compare_xfb_output_offsets(const void *_a, const void *_b) 175{ 176 const nir_xfb_output_info *a = _a, *b = _b; 177 178 return a->offset - b->offset; 179} 180 181void 182nir_shader_gather_xfb_info(nir_shader *shader) 183{ 184 nir_gather_xfb_info_with_varyings(shader, NULL, NULL); 185} 186 187void 188nir_gather_xfb_info_with_varyings(nir_shader *shader, 189 void *mem_ctx, 190 nir_xfb_varyings_info **varyings_info_out) 191{ 192 assert(shader->info.stage == MESA_SHADER_VERTEX || 193 shader->info.stage == MESA_SHADER_TESS_EVAL || 194 shader->info.stage == MESA_SHADER_GEOMETRY); 195 196 /* Compute the number of outputs we have. This is simply the number of 197 * cumulative locations consumed by all the variables. If a location is 198 * represented by multiple variables, then they each count separately in 199 * number of outputs. This is only an estimate as some variables may have 200 * an xfb_buffer but not an output so it may end up larger than we need but 201 * it should be good enough for allocation. 202 */ 203 unsigned num_outputs = 0; 204 unsigned num_varyings = 0; 205 nir_xfb_varyings_info *varyings_info = NULL; 206 nir_foreach_shader_out_variable(var, shader) { 207 if (var->data.explicit_xfb_buffer) { 208 num_outputs += glsl_count_attribute_slots(var->type, false); 209 num_varyings += glsl_varying_count(var->type); 210 } 211 } 212 if (num_outputs == 0 || num_varyings == 0) 213 return; 214 215 nir_xfb_info *xfb = nir_xfb_info_create(shader, num_outputs); 216 if (varyings_info_out != NULL) { 217 *varyings_info_out = nir_xfb_varyings_info_create(mem_ctx, num_varyings); 218 varyings_info = *varyings_info_out; 219 } 220 221 /* Walk the list of outputs and add them to the array */ 222 nir_foreach_shader_out_variable(var, shader) { 223 if (!var->data.explicit_xfb_buffer) 224 continue; 225 226 unsigned location = var->data.location; 227 228 /* In order to know if we have a array of blocks can't be done just by 229 * checking if we have an interface type and is an array, because due 230 * splitting we could end on a case were we received a split struct 231 * that contains an array. 232 */ 233 bool is_array_block = var->interface_type != NULL && 234 glsl_type_is_array(var->type) && 235 glsl_without_array(var->type) == var->interface_type; 236 237 if (var->data.explicit_offset && !is_array_block) { 238 unsigned offset = var->data.offset; 239 add_var_xfb_outputs(xfb, varyings_info, var, var->data.xfb.buffer, 240 &location, &offset, var->type, false); 241 } else if (is_array_block) { 242 assert(glsl_type_is_struct_or_ifc(var->interface_type)); 243 244 unsigned aoa_size = glsl_get_aoa_size(var->type); 245 const struct glsl_type *itype = var->interface_type; 246 unsigned nfields = glsl_get_length(itype); 247 for (unsigned b = 0; b < aoa_size; b++) { 248 for (unsigned f = 0; f < nfields; f++) { 249 int foffset = glsl_get_struct_field_offset(itype, f); 250 const struct glsl_type *ftype = glsl_get_struct_field(itype, f); 251 if (foffset < 0) { 252 location += glsl_count_attribute_slots(ftype, false); 253 continue; 254 } 255 256 unsigned offset = foffset; 257 add_var_xfb_outputs(xfb, varyings_info, var, var->data.xfb.buffer + b, 258 &location, &offset, ftype, false); 259 } 260 } 261 } 262 } 263 264 /* Everything is easier in the state setup code if outputs and varyings are 265 * sorted in order of output offset (and buffer for varyings). 266 */ 267 qsort(xfb->outputs, xfb->output_count, sizeof(xfb->outputs[0]), 268 compare_xfb_output_offsets); 269 270 if (varyings_info != NULL) { 271 qsort(varyings_info->varyings, varyings_info->varying_count, 272 sizeof(varyings_info->varyings[0]), 273 compare_xfb_varying_offsets); 274 } 275 276#ifndef NDEBUG 277 /* Finally, do a sanity check */ 278 unsigned max_offset[NIR_MAX_XFB_BUFFERS] = {0}; 279 for (unsigned i = 0; i < xfb->output_count; i++) { 280 assert(xfb->outputs[i].offset >= max_offset[xfb->outputs[i].buffer]); 281 assert(xfb->outputs[i].component_mask != 0); 282 unsigned slots = util_bitcount(xfb->outputs[i].component_mask); 283 max_offset[xfb->outputs[i].buffer] = xfb->outputs[i].offset + slots * 4; 284 } 285#endif 286 287 ralloc_free(shader->xfb_info); 288 shader->xfb_info = xfb; 289} 290 291static int 292get_xfb_out_sort_index(const nir_xfb_output_info *a) 293{ 294 /* Return the maximum number to put dummy components at the end. */ 295 if (!a->component_mask) 296 return MAX_XFB_BUFFERS << 26; 297 298 return ((uint32_t)a->buffer << 26) | /* 2 bits for the buffer */ 299 /* 10 bits for the component location (256 * 4) */ 300 (((uint32_t)a->location * 4 + a->component_offset) << 16) | 301 /* 16 bits for the offset */ 302 a->offset; 303} 304 305static int 306compare_xfb_out(const void *pa, const void *pb) 307{ 308 const nir_xfb_output_info *a = (const nir_xfb_output_info *)pa; 309 const nir_xfb_output_info *b = (const nir_xfb_output_info *)pb; 310 311 return get_xfb_out_sort_index(a) - get_xfb_out_sort_index(b); 312} 313 314/** 315 * Gather transform feedback info from lowered IO intrinsics. 316 * 317 * Optionally return slot_to_register, an optional table to translate 318 * gl_varying_slot to "base" indices. 319 */ 320nir_xfb_info * 321nir_gather_xfb_info_from_intrinsics(nir_shader *nir, 322 int slot_to_register[NUM_TOTAL_VARYING_SLOTS]) 323{ 324 nir_function_impl *impl = nir_shader_get_entrypoint(nir); 325 uint8_t buffer_to_stream[MAX_XFB_BUFFERS] = {0}; 326 uint8_t buffer_mask = 0; 327 uint8_t stream_mask = 0; 328 329 if (slot_to_register) { 330 memset(slot_to_register, -1, 331 sizeof(slot_to_register[0] * NUM_TOTAL_VARYING_SLOTS)); 332 } 333 334 /* Gather xfb outputs. */ 335 struct util_dynarray array = {0}; 336 337 nir_foreach_block(block, impl) { 338 nir_foreach_instr(instr, block) { 339 if (instr->type != nir_instr_type_intrinsic || 340 !nir_instr_xfb_write_mask(nir_instr_as_intrinsic(instr))) 341 continue; 342 343 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 344 345 unsigned wr_mask = nir_intrinsic_write_mask(intr); 346 347 while (wr_mask) { 348 unsigned i = u_bit_scan(&wr_mask); 349 unsigned index = nir_intrinsic_component(intr) + i; 350 nir_io_xfb xfb = index < 2 ? nir_intrinsic_io_xfb(intr) : 351 nir_intrinsic_io_xfb2(intr); 352 353 if (xfb.out[index % 2].num_components) { 354 nir_io_semantics sem = nir_intrinsic_io_semantics(intr); 355 nir_xfb_output_info out; 356 357 out.component_offset = index; 358 out.component_mask = 359 BITFIELD_RANGE(index, xfb.out[index % 2].num_components); 360 out.location = sem.location; 361 out.buffer = xfb.out[index % 2].buffer; 362 out.offset = (uint32_t)xfb.out[index % 2].offset * 4; 363 util_dynarray_append(&array, nir_xfb_output_info, out); 364 365 uint8_t stream = (sem.gs_streams >> (i * 2)) & 0x3; 366 buffer_to_stream[out.buffer] = stream; 367 buffer_mask |= BITFIELD_BIT(out.buffer); 368 stream_mask |= BITFIELD_BIT(stream); 369 370 if (slot_to_register) 371 slot_to_register[sem.location] = nir_intrinsic_base(intr); 372 373 /* No elements before component_offset are allowed to be set. */ 374 assert(!(out.component_mask & BITFIELD_MASK(out.component_offset))); 375 } 376 } 377 } 378 } 379 380 nir_xfb_output_info *outputs = (nir_xfb_output_info *)array.data; 381 int count = util_dynarray_num_elements(&array, nir_xfb_output_info); 382 383 if (!count) 384 return NULL; 385 386 if (count > 1) { 387 /* Sort outputs by buffer, location, and component. */ 388 qsort(outputs, count, sizeof(nir_xfb_output_info), compare_xfb_out); 389 390 /* Merge outputs referencing the same slot. */ 391 for (int i = 0; i < count - 1; i++) { 392 nir_xfb_output_info *cur = &outputs[i]; 393 394 if (!cur->component_mask) 395 continue; 396 397 /* Outputs referencing the same buffer and location are contiguous. */ 398 for (int j = i + 1; 399 j < count && 400 cur->buffer == outputs[j].buffer && 401 cur->location == outputs[j].location; j++) { 402 if (outputs[j].component_mask && 403 outputs[j].offset - outputs[j].component_offset * 4 == 404 cur->offset - cur->component_offset * 4) { 405 unsigned merged_offset = MIN2(cur->component_offset, 406 outputs[j].component_offset); 407 /* component_mask is relative to 0, not component_offset */ 408 unsigned merged_mask = cur->component_mask | outputs[j].component_mask; 409 410 /* The component mask should have no holes after merging. */ 411 if (util_is_power_of_two_nonzero((merged_mask >> merged_offset) + 1)) { 412 /* Merge outputs. */ 413 cur->component_offset = merged_offset; 414 cur->component_mask = merged_mask; 415 cur->offset = (uint32_t)cur->offset - 416 (uint32_t)cur->component_offset * 4 + 417 (uint32_t)merged_offset * 4; 418 /* Disable the other output. */ 419 outputs[j].component_mask = 0; 420 } 421 } 422 } 423 } 424 425 /* Sort outputs again to put disabled outputs at the end. */ 426 qsort(outputs, count, sizeof(nir_xfb_output_info), compare_xfb_out); 427 428 /* Remove disabled outputs. */ 429 for (int i = count - 1; i >= 0 && !outputs[i].component_mask; i--) 430 count = i; 431 } 432 433 for (unsigned i = 0; i < count; i++) 434 assert(outputs[i].component_mask); 435 436 /* Create nir_xfb_info. */ 437 nir_xfb_info *info = calloc(1, nir_xfb_info_size(count)); 438 if (!info) { 439 util_dynarray_fini(&array); 440 return NULL; 441 } 442 443 /* Fill nir_xfb_info. */ 444 info->buffers_written = buffer_mask; 445 info->streams_written = stream_mask; 446 memcpy(info->buffer_to_stream, buffer_to_stream, sizeof(buffer_to_stream)); 447 info->output_count = count; 448 memcpy(info->outputs, outputs, count * sizeof(outputs[0])); 449 450 /* Set strides. */ 451 for (unsigned i = 0; i < MAX_XFB_BUFFERS; i++) { 452 if (buffer_mask & BITFIELD_BIT(i)) 453 info->buffers[i].stride = nir->info.xfb_stride[i]; 454 } 455 456 /* Set varying_count. */ 457 for (unsigned i = 0; i < count; i++) 458 info->buffers[outputs[i].buffer].varying_count++; 459 460 util_dynarray_fini(&array); 461 return info; 462} 463 464void 465nir_print_xfb_info(nir_xfb_info *info, FILE *fp) 466{ 467 fprintf(fp, "buffers_written: 0x%x\n", info->buffers_written); 468 fprintf(fp, "streams_written: 0x%x\n", info->streams_written); 469 470 for (unsigned i = 0; i < NIR_MAX_XFB_BUFFERS; i++) { 471 if (BITFIELD_BIT(i) & info->buffers_written) { 472 fprintf(fp, "buffer%u: stride=%u varying_count=%u stream=%u\n", i, 473 info->buffers[i].stride, 474 info->buffers[i].varying_count, 475 info->buffer_to_stream[i]); 476 } 477 } 478 479 fprintf(fp, "output_count: %u\n", info->output_count); 480 481 for (unsigned i = 0; i < info->output_count; i++) { 482 fprintf(fp, "output%u: buffer=%u, offset=%u, location=%u, " 483 "component_offset=%u, component_mask=0x%x\n", 484 i, info->outputs[i].buffer, 485 info->outputs[i].offset, 486 info->outputs[i].location, 487 info->outputs[i].component_offset, 488 info->outputs[i].component_mask); 489 } 490} 491