1/* 2 * Copyright 2010 Christoph Bumiller 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 */ 22 23#include "pipe/p_defines.h" 24 25#include "compiler/nir/nir.h" 26 27#include "nv50/nv50_context.h" 28#include "nv50/nv50_program.h" 29 30#include "nv50_ir_driver.h" 31 32static inline unsigned 33bitcount4(const uint32_t val) 34{ 35 static const uint8_t cnt[16] 36 = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; 37 return cnt[val & 0xf]; 38} 39 40static int 41nv50_vertprog_assign_slots(struct nv50_ir_prog_info_out *info) 42{ 43 struct nv50_program *prog = (struct nv50_program *)info->driverPriv; 44 unsigned i, n, c; 45 46 n = 0; 47 for (i = 0; i < info->numInputs; ++i) { 48 prog->in[i].id = i; 49 prog->in[i].sn = info->in[i].sn; 50 prog->in[i].si = info->in[i].si; 51 prog->in[i].hw = n; 52 prog->in[i].mask = info->in[i].mask; 53 54 prog->vp.attrs[(4 * i) / 32] |= info->in[i].mask << ((4 * i) % 32); 55 56 for (c = 0; c < 4; ++c) 57 if (info->in[i].mask & (1 << c)) 58 info->in[i].slot[c] = n++; 59 60 if (info->in[i].sn == TGSI_SEMANTIC_PRIMID) 61 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID; 62 } 63 prog->in_nr = info->numInputs; 64 65 for (i = 0; i < info->numSysVals; ++i) { 66 switch (info->sv[i].sn) { 67 case TGSI_SEMANTIC_INSTANCEID: 68 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_INSTANCE_ID; 69 continue; 70 case TGSI_SEMANTIC_VERTEXID: 71 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID; 72 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID_DRAW_ARRAYS_ADD_START; 73 continue; 74 case TGSI_SEMANTIC_PRIMID: 75 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID; 76 break; 77 default: 78 break; 79 } 80 } 81 82 /* 83 * Corner case: VP has no inputs, but we will still need to submit data to 84 * draw it. HW will shout at us and won't draw anything if we don't enable 85 * any input, so let's just pretend it's the first one. 86 */ 87 if (prog->vp.attrs[0] == 0 && 88 prog->vp.attrs[1] == 0 && 89 prog->vp.attrs[2] == 0) 90 prog->vp.attrs[0] |= 0xf; 91 92 /* VertexID before InstanceID */ 93 if (info->io.vertexId < info->numSysVals) 94 info->sv[info->io.vertexId].slot[0] = n++; 95 if (info->io.instanceId < info->numSysVals) 96 info->sv[info->io.instanceId].slot[0] = n++; 97 98 n = 0; 99 for (i = 0; i < info->numOutputs; ++i) { 100 switch (info->out[i].sn) { 101 case TGSI_SEMANTIC_PSIZE: 102 prog->vp.psiz = i; 103 break; 104 case TGSI_SEMANTIC_CLIPDIST: 105 prog->vp.clpd[info->out[i].si] = n; 106 break; 107 case TGSI_SEMANTIC_EDGEFLAG: 108 prog->vp.edgeflag = i; 109 break; 110 case TGSI_SEMANTIC_BCOLOR: 111 prog->vp.bfc[info->out[i].si] = i; 112 break; 113 case TGSI_SEMANTIC_LAYER: 114 prog->gp.has_layer = true; 115 prog->gp.layerid = n; 116 break; 117 case TGSI_SEMANTIC_VIEWPORT_INDEX: 118 prog->gp.has_viewport = true; 119 prog->gp.viewportid = n; 120 break; 121 default: 122 break; 123 } 124 prog->out[i].id = i; 125 prog->out[i].sn = info->out[i].sn; 126 prog->out[i].si = info->out[i].si; 127 prog->out[i].hw = n; 128 prog->out[i].mask = info->out[i].mask; 129 130 for (c = 0; c < 4; ++c) 131 if (info->out[i].mask & (1 << c)) 132 info->out[i].slot[c] = n++; 133 } 134 prog->out_nr = info->numOutputs; 135 prog->max_out = n; 136 if (!prog->max_out) 137 prog->max_out = 1; 138 139 if (prog->vp.psiz < info->numOutputs) 140 prog->vp.psiz = prog->out[prog->vp.psiz].hw; 141 142 return 0; 143} 144 145static int 146nv50_fragprog_assign_slots(struct nv50_ir_prog_info_out *info) 147{ 148 struct nv50_program *prog = (struct nv50_program *)info->driverPriv; 149 unsigned i, n, m, c; 150 unsigned nvary; 151 unsigned nflat; 152 unsigned nintp = 0; 153 154 /* count recorded non-flat inputs */ 155 for (m = 0, i = 0; i < info->numInputs; ++i) { 156 switch (info->in[i].sn) { 157 case TGSI_SEMANTIC_POSITION: 158 continue; 159 default: 160 m += info->in[i].flat ? 0 : 1; 161 break; 162 } 163 } 164 /* careful: id may be != i in info->in[prog->in[i].id] */ 165 166 /* Fill prog->in[] so that non-flat inputs are first and 167 * kick out special inputs that don't use the RESULT_MAP. 168 */ 169 for (n = 0, i = 0; i < info->numInputs; ++i) { 170 if (info->in[i].sn == TGSI_SEMANTIC_POSITION) { 171 prog->fp.interp |= info->in[i].mask << 24; 172 for (c = 0; c < 4; ++c) 173 if (info->in[i].mask & (1 << c)) 174 info->in[i].slot[c] = nintp++; 175 } else { 176 unsigned j = info->in[i].flat ? m++ : n++; 177 178 if (info->in[i].sn == TGSI_SEMANTIC_COLOR) 179 prog->vp.bfc[info->in[i].si] = j; 180 else if (info->in[i].sn == TGSI_SEMANTIC_PRIMID) 181 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID; 182 183 prog->in[j].id = i; 184 prog->in[j].mask = info->in[i].mask; 185 prog->in[j].sn = info->in[i].sn; 186 prog->in[j].si = info->in[i].si; 187 prog->in[j].linear = info->in[i].linear; 188 189 prog->in_nr++; 190 } 191 } 192 if (!(prog->fp.interp & (8 << 24))) { 193 ++nintp; 194 prog->fp.interp |= 8 << 24; 195 } 196 197 for (i = 0; i < prog->in_nr; ++i) { 198 int j = prog->in[i].id; 199 200 prog->in[i].hw = nintp; 201 for (c = 0; c < 4; ++c) 202 if (prog->in[i].mask & (1 << c)) 203 info->in[j].slot[c] = nintp++; 204 } 205 /* (n == m) if m never increased, i.e. no flat inputs */ 206 nflat = (n < m) ? (nintp - prog->in[n].hw) : 0; 207 nintp -= bitcount4(prog->fp.interp >> 24); /* subtract position inputs */ 208 nvary = nintp - nflat; 209 210 prog->fp.interp |= nvary << NV50_3D_FP_INTERPOLANT_CTRL_COUNT_NONFLAT__SHIFT; 211 prog->fp.interp |= nintp << NV50_3D_FP_INTERPOLANT_CTRL_COUNT__SHIFT; 212 213 /* put front/back colors right after HPOS */ 214 prog->fp.colors = 4 << NV50_3D_SEMANTIC_COLOR_FFC0_ID__SHIFT; 215 for (i = 0; i < 2; ++i) 216 if (prog->vp.bfc[i] < 0xff) 217 prog->fp.colors += bitcount4(prog->in[prog->vp.bfc[i]].mask) << 16; 218 219 /* FP outputs */ 220 221 if (info->prop.fp.numColourResults > 1) 222 prog->fp.flags[0] |= NV50_3D_FP_CONTROL_MULTIPLE_RESULTS; 223 224 for (i = 0; i < info->numOutputs; ++i) { 225 prog->out[i].id = i; 226 prog->out[i].sn = info->out[i].sn; 227 prog->out[i].si = info->out[i].si; 228 prog->out[i].mask = info->out[i].mask; 229 230 if (i == info->io.fragDepth || i == info->io.sampleMask) 231 continue; 232 prog->out[i].hw = info->out[i].si * 4; 233 234 for (c = 0; c < 4; ++c) 235 info->out[i].slot[c] = prog->out[i].hw + c; 236 237 prog->max_out = MAX2(prog->max_out, prog->out[i].hw + 4); 238 } 239 240 if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS) { 241 info->out[info->io.sampleMask].slot[0] = prog->max_out++; 242 prog->fp.has_samplemask = 1; 243 } 244 245 if (info->io.fragDepth < PIPE_MAX_SHADER_OUTPUTS) 246 info->out[info->io.fragDepth].slot[2] = prog->max_out++; 247 248 if (!prog->max_out) 249 prog->max_out = 4; 250 251 return 0; 252} 253 254static int 255nv50_program_assign_varying_slots(struct nv50_ir_prog_info_out *info) 256{ 257 switch (info->type) { 258 case PIPE_SHADER_VERTEX: 259 return nv50_vertprog_assign_slots(info); 260 case PIPE_SHADER_GEOMETRY: 261 return nv50_vertprog_assign_slots(info); 262 case PIPE_SHADER_FRAGMENT: 263 return nv50_fragprog_assign_slots(info); 264 case PIPE_SHADER_COMPUTE: 265 return 0; 266 default: 267 return -1; 268 } 269} 270 271static struct nv50_stream_output_state * 272nv50_program_create_strmout_state(const struct nv50_ir_prog_info_out *info, 273 const struct pipe_stream_output_info *pso) 274{ 275 struct nv50_stream_output_state *so; 276 unsigned b, i, c; 277 unsigned base[4]; 278 279 so = MALLOC_STRUCT(nv50_stream_output_state); 280 if (!so) 281 return NULL; 282 memset(so->map, 0xff, sizeof(so->map)); 283 284 for (b = 0; b < 4; ++b) 285 so->num_attribs[b] = 0; 286 for (i = 0; i < pso->num_outputs; ++i) { 287 unsigned end = pso->output[i].dst_offset + pso->output[i].num_components; 288 b = pso->output[i].output_buffer; 289 assert(b < 4); 290 so->num_attribs[b] = MAX2(so->num_attribs[b], end); 291 } 292 293 so->ctrl = NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED; 294 295 so->stride[0] = pso->stride[0] * 4; 296 base[0] = 0; 297 for (b = 1; b < 4; ++b) { 298 assert(!so->num_attribs[b] || so->num_attribs[b] == pso->stride[b]); 299 so->stride[b] = so->num_attribs[b] * 4; 300 if (so->num_attribs[b]) 301 so->ctrl = (b + 1) << NV50_3D_STRMOUT_BUFFERS_CTRL_SEPARATE__SHIFT; 302 base[b] = align(base[b - 1] + so->num_attribs[b - 1], 4); 303 } 304 if (so->ctrl & NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED) { 305 assert(so->stride[0] < NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__MAX); 306 so->ctrl |= so->stride[0] << NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__SHIFT; 307 } 308 309 so->map_size = base[3] + so->num_attribs[3]; 310 311 for (i = 0; i < pso->num_outputs; ++i) { 312 const unsigned s = pso->output[i].start_component; 313 const unsigned p = pso->output[i].dst_offset; 314 const unsigned r = pso->output[i].register_index; 315 b = pso->output[i].output_buffer; 316 317 if (r >= info->numOutputs) 318 continue; 319 320 for (c = 0; c < pso->output[i].num_components; ++c) 321 so->map[base[b] + p + c] = info->out[r].slot[s + c]; 322 } 323 324 return so; 325} 326 327bool 328nv50_program_translate(struct nv50_program *prog, uint16_t chipset, 329 struct util_debug_callback *debug) 330{ 331 struct nv50_ir_prog_info *info; 332 struct nv50_ir_prog_info_out info_out = {}; 333 int i, ret; 334 const uint8_t map_undef = (prog->type == PIPE_SHADER_VERTEX) ? 0x40 : 0x80; 335 336 info = CALLOC_STRUCT(nv50_ir_prog_info); 337 if (!info) 338 return false; 339 340 info->type = prog->type; 341 info->target = chipset; 342 343 info->bin.sourceRep = prog->pipe.type; 344 switch (prog->pipe.type) { 345 case PIPE_SHADER_IR_TGSI: 346 info->bin.source = (void *)prog->pipe.tokens; 347 break; 348 case PIPE_SHADER_IR_NIR: 349 info->bin.source = (void *)nir_shader_clone(NULL, prog->pipe.ir.nir); 350 break; 351 default: 352 assert(!"unsupported IR!"); 353 free(info); 354 return false; 355 } 356 357 info->bin.smemSize = prog->cp.smem_size; 358 info->io.auxCBSlot = 15; 359 info->io.ucpBase = NV50_CB_AUX_UCP_OFFSET; 360 info->io.genUserClip = prog->vp.clpd_nr; 361 if (prog->fp.alphatest) 362 info->io.alphaRefBase = NV50_CB_AUX_ALPHATEST_OFFSET; 363 364 info->io.suInfoBase = NV50_CB_AUX_TEX_MS_OFFSET; 365 info->io.bufInfoBase = NV50_CB_AUX_BUF_INFO(0); 366 info->io.sampleInfoBase = NV50_CB_AUX_SAMPLE_OFFSET; 367 info->io.msInfoCBSlot = 15; 368 info->io.msInfoBase = NV50_CB_AUX_MS_OFFSET; 369 370 info->io.membarOffset = NV50_CB_AUX_MEMBAR_OFFSET; 371 info->io.gmemMembar = 15; 372 373 info->assignSlots = nv50_program_assign_varying_slots; 374 375 prog->vp.bfc[0] = 0xff; 376 prog->vp.bfc[1] = 0xff; 377 prog->vp.edgeflag = 0xff; 378 prog->vp.clpd[0] = map_undef; 379 prog->vp.clpd[1] = map_undef; 380 prog->vp.psiz = map_undef; 381 prog->gp.has_layer = 0; 382 prog->gp.has_viewport = 0; 383 384 if (prog->type == PIPE_SHADER_COMPUTE) 385 info->prop.cp.inputOffset = 0x14; 386 387 info_out.driverPriv = prog; 388 389#ifndef NDEBUG 390 info->optLevel = debug_get_num_option("NV50_PROG_OPTIMIZE", 3); 391 info->dbgFlags = debug_get_num_option("NV50_PROG_DEBUG", 0); 392 info->omitLineNum = debug_get_num_option("NV50_PROG_DEBUG_OMIT_LINENUM", 0); 393#else 394 info->optLevel = 3; 395#endif 396 397 ret = nv50_ir_generate_code(info, &info_out); 398 if (ret) { 399 NOUVEAU_ERR("shader translation failed: %i\n", ret); 400 goto out; 401 } 402 403 prog->code = info_out.bin.code; 404 prog->code_size = info_out.bin.codeSize; 405 prog->fixups = info_out.bin.relocData; 406 prog->interps = info_out.bin.fixupData; 407 prog->max_gpr = MAX2(4, (info_out.bin.maxGPR >> 1) + 1); 408 prog->tls_space = info_out.bin.tlsSpace; 409 prog->cp.smem_size = info_out.bin.smemSize; 410 prog->mul_zero_wins = info->io.mul_zero_wins; 411 prog->vp.need_vertex_id = info_out.io.vertexId < PIPE_MAX_SHADER_INPUTS; 412 413 prog->vp.clip_enable = (1 << info_out.io.clipDistances) - 1; 414 prog->vp.cull_enable = 415 ((1 << info_out.io.cullDistances) - 1) << info_out.io.clipDistances; 416 prog->vp.clip_mode = 0; 417 for (i = 0; i < info_out.io.cullDistances; ++i) 418 prog->vp.clip_mode |= 1 << ((info_out.io.clipDistances + i) * 4); 419 420 if (prog->type == PIPE_SHADER_FRAGMENT) { 421 if (info_out.prop.fp.writesDepth) { 422 prog->fp.flags[0] |= NV50_3D_FP_CONTROL_EXPORTS_Z; 423 prog->fp.flags[1] = 0x11; 424 } 425 if (info_out.prop.fp.usesDiscard) 426 prog->fp.flags[0] |= NV50_3D_FP_CONTROL_USES_KIL; 427 } else 428 if (prog->type == PIPE_SHADER_GEOMETRY) { 429 switch (info_out.prop.gp.outputPrim) { 430 case PIPE_PRIM_LINE_STRIP: 431 prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_LINE_STRIP; 432 break; 433 case PIPE_PRIM_TRIANGLE_STRIP: 434 prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_TRIANGLE_STRIP; 435 break; 436 case PIPE_PRIM_POINTS: 437 default: 438 assert(info_out.prop.gp.outputPrim == PIPE_PRIM_POINTS); 439 prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_POINTS; 440 break; 441 } 442 prog->gp.vert_count = CLAMP(info_out.prop.gp.maxVertices, 1, 1024); 443 } else 444 if (prog->type == PIPE_SHADER_COMPUTE) { 445 for (i = 0; i < NV50_MAX_GLOBALS; i++) { 446 prog->cp.gmem[i] = (struct nv50_gmem_state){ 447 .valid = info_out.prop.cp.gmem[i].valid, 448 .image = info_out.prop.cp.gmem[i].image, 449 .slot = info_out.prop.cp.gmem[i].slot 450 }; 451 } 452 } 453 454 if (prog->pipe.stream_output.num_outputs) 455 prog->so = nv50_program_create_strmout_state(&info_out, 456 &prog->pipe.stream_output); 457 458 util_debug_message(debug, SHADER_INFO, 459 "type: %d, local: %d, shared: %d, gpr: %d, inst: %d, loops: %d, bytes: %d", 460 prog->type, info_out.bin.tlsSpace, info_out.bin.smemSize, 461 prog->max_gpr, info_out.bin.instructions, info_out.loops, 462 info_out.bin.codeSize); 463 464out: 465 if (info->bin.sourceRep == PIPE_SHADER_IR_NIR) 466 ralloc_free((void *)info->bin.source); 467 FREE(info); 468 return !ret; 469} 470 471bool 472nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog) 473{ 474 struct nouveau_heap *heap; 475 int ret; 476 uint32_t size = align(prog->code_size, 0x40); 477 uint8_t prog_type; 478 479 switch (prog->type) { 480 case PIPE_SHADER_VERTEX: heap = nv50->screen->vp_code_heap; break; 481 case PIPE_SHADER_GEOMETRY: heap = nv50->screen->gp_code_heap; break; 482 case PIPE_SHADER_FRAGMENT: heap = nv50->screen->fp_code_heap; break; 483 case PIPE_SHADER_COMPUTE: heap = nv50->screen->fp_code_heap; break; 484 default: 485 assert(!"invalid program type"); 486 return false; 487 } 488 489 ret = nouveau_heap_alloc(heap, size, prog, &prog->mem); 490 if (ret) { 491 /* Out of space: evict everything to compactify the code segment, hoping 492 * the working set is much smaller and drifts slowly. Improve me ! 493 */ 494 while (heap->next) { 495 struct nv50_program *evict = heap->next->priv; 496 if (evict) 497 nouveau_heap_free(&evict->mem); 498 } 499 debug_printf("WARNING: out of code space, evicting all shaders.\n"); 500 ret = nouveau_heap_alloc(heap, size, prog, &prog->mem); 501 if (ret) { 502 NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size); 503 return false; 504 } 505 } 506 507 if (prog->type == PIPE_SHADER_COMPUTE) { 508 /* CP code must be uploaded in FP code segment. */ 509 prog_type = 1; 510 } else { 511 prog->code_base = prog->mem->start; 512 prog_type = prog->type; 513 } 514 515 ret = nv50_tls_realloc(nv50->screen, prog->tls_space); 516 if (ret < 0) { 517 nouveau_heap_free(&prog->mem); 518 return false; 519 } 520 if (ret > 0) 521 nv50->state.new_tls_space = true; 522 523 if (prog->fixups) 524 nv50_ir_relocate_code(prog->fixups, prog->code, prog->code_base, 0, 0); 525 if (prog->interps) 526 nv50_ir_apply_fixups(prog->interps, prog->code, 527 prog->fp.force_persample_interp, 528 false /* flatshade */, 529 prog->fp.alphatest - 1, 530 false /* msaa */); 531 532 nv50_sifc_linear_u8(&nv50->base, nv50->screen->code, 533 (prog_type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base, 534 NOUVEAU_BO_VRAM, prog->code_size, prog->code); 535 536 BEGIN_NV04(nv50->base.pushbuf, NV50_3D(CODE_CB_FLUSH), 1); 537 PUSH_DATA (nv50->base.pushbuf, 0); 538 539 return true; 540} 541 542void 543nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p) 544{ 545 const struct pipe_shader_state pipe = p->pipe; 546 const ubyte type = p->type; 547 548 if (p->mem) 549 nouveau_heap_free(&p->mem); 550 551 FREE(p->code); 552 553 FREE(p->fixups); 554 FREE(p->interps); 555 FREE(p->so); 556 557 memset(p, 0, sizeof(*p)); 558 559 p->pipe = pipe; 560 p->type = type; 561} 562