/*
 * Copyright 2012 Nouveau Project
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Christoph Bumiller
 */

#include "nvc0/nvc0_context.h"
#include "nvc0/nve4_compute.h"

#include "nv50_ir_driver.h"

#include "drf.h"
#include "qmd.h"
#include "cla0c0qmd.h"
#include "clc0c0qmd.h"
#include "clc3c0qmd.h"

#define NVA0C0_QMDV00_06_VAL_SET(p,a...) NVVAL_MW_SET((p), NVA0C0, QMDV00_06, ##a)
#define NVA0C0_QMDV00_06_DEF_SET(p,a...) NVDEF_MW_SET((p), NVA0C0, QMDV00_06, ##a)
#define NVC0C0_QMDV02_01_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC0C0, QMDV02_01, ##a)
#define NVC0C0_QMDV02_01_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC0C0, QMDV02_01, ##a)
#define NVC3C0_QMDV02_02_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC3C0, QMDV02_02, ##a)
#define NVC3C0_QMDV02_02_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC3C0, QMDV02_02, ##a)

int
nve4_screen_compute_setup(struct nvc0_screen *screen,
                          struct nouveau_pushbuf *push)
{
   struct nouveau_device *dev = screen->base.device;
   struct nouveau_object *chan = screen->base.channel;
   int i;
   int ret;
   uint32_t obj_class;
   uint64_t address;

   switch (dev->chipset & ~0xf) {
   case 0x160:
      obj_class = TU102_COMPUTE_CLASS;
      break;
   case 0x140:
      obj_class = GV100_COMPUTE_CLASS;
      break;
   case 0x100:
   case 0xf0:
      obj_class = NVF0_COMPUTE_CLASS; /* GK110 */
      break;
   case 0xe0:
      obj_class = NVE4_COMPUTE_CLASS; /* GK104 */
      break;
   case 0x110:
      obj_class = GM107_COMPUTE_CLASS;
      break;
   case 0x120:
      obj_class = GM200_COMPUTE_CLASS;
      break;
   case 0x130:
      obj_class = (dev->chipset == 0x130 || dev->chipset == 0x13b) ?
                  GP100_COMPUTE_CLASS : GP104_COMPUTE_CLASS;
      break;
   default:
      NOUVEAU_ERR("unsupported chipset: NV%02x\n", dev->chipset);
      return -1;
   }

   ret = nouveau_object_new(chan, 0xbeef00c0, obj_class, NULL, 0,
                            &screen->compute);
   if (ret) {
      NOUVEAU_ERR("Failed to allocate compute object: %d\n", ret);
      return ret;
   }
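   /* Note: bind the compute object to the compute subchannel; every
    * NVE4_CP()/SUBC_CP() method below is issued against this engine. The
    * 0xbeef00c0 handle is just an arbitrary client-chosen name for the
    * object. */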
   BEGIN_NVC0(push, SUBC_CP(NV01_SUBCHAN_OBJECT), 1);
   PUSH_DATA (push, screen->compute->oclass);

   BEGIN_NVC0(push, NVE4_CP(TEMP_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, screen->tls->offset);
   PUSH_DATA (push, screen->tls->offset);
   /* No idea why there are 2. Divide size by 2 to be safe.
    * Actually this might be per-MP TEMP size and looks like I'm only using
    * 2 MPs instead of all 8.
    */
   BEGIN_NVC0(push, NVE4_CP(MP_TEMP_SIZE_HIGH(0)), 3);
   PUSH_DATAh(push, screen->tls->size / screen->mp_count);
   PUSH_DATA (push, (screen->tls->size / screen->mp_count) & ~0x7fff);
   PUSH_DATA (push, 0xff);
   if (obj_class < GV100_COMPUTE_CLASS) {
      BEGIN_NVC0(push, NVE4_CP(MP_TEMP_SIZE_HIGH(1)), 3);
      PUSH_DATAh(push, screen->tls->size / screen->mp_count);
      PUSH_DATA (push, (screen->tls->size / screen->mp_count) & ~0x7fff);
      PUSH_DATA (push, 0xff);
   }

   /* Unified address space? Who needs that? Certainly not OpenCL.
    *
    * FATAL: Buffers with addresses inside [0x1000000, 0x3000000] will NOT be
    * accessible. We cannot prevent that at the moment, so expect failure.
    */
   if (obj_class < GV100_COMPUTE_CLASS) {
      BEGIN_NVC0(push, NVE4_CP(LOCAL_BASE), 1);
      PUSH_DATA (push, 0xff << 24);
      BEGIN_NVC0(push, NVE4_CP(SHARED_BASE), 1);
      PUSH_DATA (push, 0xfe << 24);

      BEGIN_NVC0(push, NVE4_CP(CODE_ADDRESS_HIGH), 2);
      PUSH_DATAh(push, screen->text->offset);
      PUSH_DATA (push, screen->text->offset);
   } else {
      BEGIN_NVC0(push, SUBC_CP(0x2a0), 2);
      PUSH_DATAh(push, 0xfeULL << 24);
      PUSH_DATA (push, 0xfeULL << 24);
      BEGIN_NVC0(push, SUBC_CP(0x7b0), 2);
      PUSH_DATAh(push, 0xffULL << 24);
      PUSH_DATA (push, 0xffULL << 24);
   }

   BEGIN_NVC0(push, SUBC_CP(0x0310), 1);
   PUSH_DATA (push, (obj_class >= NVF0_COMPUTE_CLASS) ? 0x400 : 0x300);

   /* NOTE: these do not affect the state used by the 3D object */
   BEGIN_NVC0(push, NVE4_CP(TIC_ADDRESS_HIGH), 3);
   PUSH_DATAh(push, screen->txc->offset);
   PUSH_DATA (push, screen->txc->offset);
   PUSH_DATA (push, NVC0_TIC_MAX_ENTRIES - 1);
   BEGIN_NVC0(push, NVE4_CP(TSC_ADDRESS_HIGH), 3);
   PUSH_DATAh(push, screen->txc->offset + 65536);
   PUSH_DATA (push, screen->txc->offset + 65536);
   PUSH_DATA (push, NVC0_TSC_MAX_ENTRIES - 1);

   if (obj_class >= NVF0_COMPUTE_CLASS) {
      /* The blob calls GK110_COMPUTE.FIRMWARE[0x6], along with the args (0x1)
       * passed with GK110_COMPUTE.GRAPH.SCRATCH[0x2]. This is currently
       * disabled because our firmware doesn't support these commands and the
       * GPU hangs if they are used. */
      BEGIN_NIC0(push, SUBC_CP(0x0248), 64);
      for (i = 63; i >= 0; i--)
         PUSH_DATA(push, 0x38000 | i);
      IMMED_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 0);
   }

   BEGIN_NVC0(push, NVE4_CP(TEX_CB_INDEX), 1);
   PUSH_DATA (push, 7); /* does not interfere with 3D */

   /* Disabling this UNK command avoids a read fault when using texelFetch()
    * from a compute shader for weird reasons.
   if (obj_class == NVF0_COMPUTE_CLASS)
      IMMED_NVC0(push, SUBC_CP(0x02c4), 1);
   */

   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);
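   /* The constant-buffer writes below (and throughout this file) all use the
    * compute engine's inline-data path: UPLOAD_DST_ADDRESS_HIGH/LOW select
    * the destination, UPLOAD_LINE_LENGTH_IN gives the byte count (with a
    * line count of 1), and UPLOAD_EXEC with the LINEAR flag is followed by
    * the payload words themselves. */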
   /* MS sample coordinate offsets: these do not work with _ALT modes! */
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, address + NVC0_CB_AUX_MS_INFO);
   PUSH_DATA (push, address + NVC0_CB_AUX_MS_INFO);
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, 64);
   PUSH_DATA (push, 1);
   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 17);
   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
   PUSH_DATA (push, 0); /* 0 */
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 1); /* 1 */
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 0); /* 2 */
   PUSH_DATA (push, 1);
   PUSH_DATA (push, 1); /* 3 */
   PUSH_DATA (push, 1);
   PUSH_DATA (push, 2); /* 4 */
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 3); /* 5 */
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 2); /* 6 */
   PUSH_DATA (push, 1);
   PUSH_DATA (push, 3); /* 7 */
   PUSH_DATA (push, 1);

#ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR);
   PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR);
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, 28);
   PUSH_DATA (push, 1);
   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 8);
   PUSH_DATA (push, 1);
   PUSH_DATA (push, screen->parm->offset + NVE4_CP_PARAM_TRAP_INFO);
   PUSH_DATAh(push, screen->parm->offset + NVE4_CP_PARAM_TRAP_INFO);
   PUSH_DATA (push, screen->tls->offset);
   PUSH_DATAh(push, screen->tls->offset);
   PUSH_DATA (push, screen->tls->size / 2); /* MP TEMP block size */
   PUSH_DATA (push, screen->tls->size / 2 / 64); /* warp TEMP block size */
   PUSH_DATA (push, 0); /* warp cfstack size */
#endif

   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);

   return 0;
}

static void
gm107_compute_validate_surfaces(struct nvc0_context *nvc0,
                                struct pipe_image_view *view, int slot)
{
   struct nv04_resource *res = nv04_resource(view->resource);
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_bo *txc = nvc0->screen->txc;
   struct nv50_tic_entry *tic;
   uint64_t address;
   const int s = 5;

   tic = nv50_tic_entry(nvc0->images_tic[s][slot]);

   res = nv04_resource(tic->pipe.texture);
   nvc0_update_tic(nvc0, tic, res);

   if (tic->id < 0) {
      tic->id = nvc0_screen_tic_alloc(nvc0->screen, tic);

      /* upload the texture view */
      PUSH_SPACE(push, 16);
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
      PUSH_DATAh(push, txc->offset + (tic->id * 32));
      PUSH_DATA (push, txc->offset + (tic->id * 32));
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
      PUSH_DATA (push, 32);
      PUSH_DATA (push, 1);
      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 9);
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
      PUSH_DATAp(push, &tic->tic[0], 8);

      BEGIN_NIC0(push, NVE4_CP(TIC_FLUSH), 1);
      PUSH_DATA (push, (tic->id << 4) | 1);
   } else
   if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
      BEGIN_NIC0(push, NVE4_CP(TEX_CACHE_CTL), 1);
      PUSH_DATA (push, (tic->id << 4) | 1);
   }
   nvc0->screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32);

   res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
   res->status |= NOUVEAU_BUFFER_STATUS_GPU_READING;

   BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RD);

   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);
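   /* Write the TIC id into the driver's aux constant buffer so the shader
    * can address the image view; the +32 presumably keeps image handles
    * clear of the 32 regular texture slots. */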
   /* upload the texture handle */
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, address + NVC0_CB_AUX_TEX_INFO(slot + 32));
   PUSH_DATA (push, address + NVC0_CB_AUX_TEX_INFO(slot + 32));
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, 4);
   PUSH_DATA (push, 0x1);
   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 2);
   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
   PUSH_DATA (push, tic->id);

   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
}
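/* Each image gets a 16-word record at NVC0_CB_AUX_SU_INFO(i) in the aux
 * constant buffer, filled in by nve4_set_surface_info(); unbound slots are
 * zeroed out. */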
static void
nve4_compute_validate_surfaces(struct nvc0_context *nvc0)
{
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   uint64_t address;
   const int s = 5;
   int i, j;

   if (!nvc0->images_dirty[s])
      return;

   address = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);

   for (i = 0; i < NVC0_MAX_IMAGES; ++i) {
      struct pipe_image_view *view = &nvc0->images[s][i];

      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
      PUSH_DATAh(push, address + NVC0_CB_AUX_SU_INFO(i));
      PUSH_DATA (push, address + NVC0_CB_AUX_SU_INFO(i));
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
      PUSH_DATA (push, 16 * 4);
      PUSH_DATA (push, 0x1);
      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 16);
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));

      if (view->resource) {
         struct nv04_resource *res = nv04_resource(view->resource);

         if (res->base.target == PIPE_BUFFER) {
            if (view->access & PIPE_IMAGE_ACCESS_WRITE)
               nvc0_mark_image_range_valid(view);
         }

         nve4_set_surface_info(push, view, nvc0);
         BCTX_REFN(nvc0->bufctx_cp, CP_SUF, res, RDWR);

         if (nvc0->screen->base.class_3d >= GM107_3D_CLASS)
            gm107_compute_validate_surfaces(nvc0, view, i);
      } else {
         for (j = 0; j < 16; j++)
            PUSH_DATA(push, 0);
      }
   }
}

/* Thankfully, textures with samplers follow the normal rules. */
static void
nve4_compute_validate_samplers(struct nvc0_context *nvc0)
{
   bool need_flush = nve4_validate_tsc(nvc0, 5);
   if (need_flush) {
      BEGIN_NVC0(nvc0->base.pushbuf, NVE4_CP(TSC_FLUSH), 1);
      PUSH_DATA (nvc0->base.pushbuf, 0);
   }

   /* Invalidate all 3D samplers because they are aliased. */
   for (int s = 0; s < 5; s++)
      nvc0->samplers_dirty[s] = ~0;
   nvc0->dirty_3d |= NVC0_NEW_3D_SAMPLERS;
}

/* (Code duplicated at bottom for various non-convincing reasons.
 * E.g. we might want to use the COMPUTE subchannel to upload TIC/TSC
 * entries to avoid a subchannel switch.
 * Same for texture cache flushes.
 * Also, the bufctx differs, and more IFs in the 3D version look ugly.)
 */
static void nve4_compute_validate_textures(struct nvc0_context *);

static void
nve4_compute_set_tex_handles(struct nvc0_context *nvc0)
{
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_screen *screen = nvc0->screen;
   uint64_t address;
   const unsigned s = nvc0_shader_stage(PIPE_SHADER_COMPUTE);
   unsigned i, n;
   uint32_t dirty = nvc0->textures_dirty[s] | nvc0->samplers_dirty[s];

   if (!dirty)
      return;
   i = ffs(dirty) - 1;
   n = util_logbase2(dirty) + 1 - i;
   assert(n);

   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);

   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, address + NVC0_CB_AUX_TEX_INFO(i));
   PUSH_DATA (push, address + NVC0_CB_AUX_TEX_INFO(i));
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, n * 4);
   PUSH_DATA (push, 0x1);
   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + n);
   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
   PUSH_DATAp(push, &nvc0->tex_handles[s][i], n);

   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);

   nvc0->textures_dirty[s] = 0;
   nvc0->samplers_dirty[s] = 0;
}

static void
nve4_compute_validate_constbufs(struct nvc0_context *nvc0)
{
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   const int s = 5;

   while (nvc0->constbuf_dirty[s]) {
      int i = ffs(nvc0->constbuf_dirty[s]) - 1;
      nvc0->constbuf_dirty[s] &= ~(1 << i);

      if (nvc0->constbuf[s][i].user) {
         struct nouveau_bo *bo = nvc0->screen->uniform_bo;
         const unsigned base = NVC0_CB_USR_INFO(s);
         const unsigned size = nvc0->constbuf[s][0].size;
         assert(i == 0); /* we really only want OpenGL uniforms here */
         assert(nvc0->constbuf[s][0].u.data);

         BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
         PUSH_DATAh(push, bo->offset + base);
         PUSH_DATA (push, bo->offset + base);
         BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
         PUSH_DATA (push, size);
         PUSH_DATA (push, 0x1);
         BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (size / 4));
         PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
         PUSH_DATAp(push, nvc0->constbuf[s][0].u.data, size / 4);
      }
      else {
         struct nv04_resource *res =
            nv04_resource(nvc0->constbuf[s][i].u.buf);
         if (res) {
            uint64_t address
               = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);

            /* constbufs above 0 are fetched via ubo info in the shader */
            if (i > 0) {
               BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
               PUSH_DATAh(push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
               PUSH_DATA (push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
               BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
               PUSH_DATA (push, 4 * 4);
               PUSH_DATA (push, 0x1);
               BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4);
               PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));

               PUSH_DATA (push, res->address + nvc0->constbuf[s][i].offset);
               PUSH_DATAh(push, res->address + nvc0->constbuf[s][i].offset);
               PUSH_DATA (push, nvc0->constbuf[s][i].size);
               PUSH_DATA (push, 0);
            }

            BCTX_REFN(nvc0->bufctx_cp, CP_CB(i), res, RD);
            res->cb_bindings[s] |= 1 << i;
         }
      }
   }

   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
}
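/* Shader buffers (SSBOs): each of the NVC0_MAX_BUFFERS slots gets a 4-word
 * record in the aux constant buffer -- address low/high, size and a zero
 * pad -- which the shader reads to perform its global-memory accesses. */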
static void
nve4_compute_validate_buffers(struct nvc0_context *nvc0)
{
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   uint64_t address;
   const int s = 5;
   int i;

   address = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);

   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, address + NVC0_CB_AUX_BUF_INFO(0));
   PUSH_DATA (push, address + NVC0_CB_AUX_BUF_INFO(0));
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, 4 * NVC0_MAX_BUFFERS * 4);
   PUSH_DATA (push, 0x1);
   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4 * NVC0_MAX_BUFFERS);
   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));

   for (i = 0; i < NVC0_MAX_BUFFERS; i++) {
      if (nvc0->buffers[s][i].buffer) {
         struct nv04_resource *res =
            nv04_resource(nvc0->buffers[s][i].buffer);
         PUSH_DATA (push, res->address + nvc0->buffers[s][i].buffer_offset);
         PUSH_DATAh(push, res->address + nvc0->buffers[s][i].buffer_offset);
         PUSH_DATA (push, nvc0->buffers[s][i].buffer_size);
         PUSH_DATA (push, 0);
         BCTX_REFN(nvc0->bufctx_cp, CP_BUF, res, RDWR);
         util_range_add(&res->base, &res->valid_buffer_range,
                        nvc0->buffers[s][i].buffer_offset,
                        nvc0->buffers[s][i].buffer_offset +
                        nvc0->buffers[s][i].buffer_size);
      } else {
         PUSH_DATA (push, 0);
         PUSH_DATA (push, 0);
         PUSH_DATA (push, 0);
         PUSH_DATA (push, 0);
      }
   }
}

static struct nvc0_state_validate
validate_list_cp[] = {
   { nvc0_compprog_validate,          NVC0_NEW_CP_PROGRAM     },
   { nve4_compute_validate_textures,  NVC0_NEW_CP_TEXTURES    },
   { nve4_compute_validate_samplers,  NVC0_NEW_CP_SAMPLERS    },
   { nve4_compute_set_tex_handles,    NVC0_NEW_CP_TEXTURES |
                                      NVC0_NEW_CP_SAMPLERS    },
   { nve4_compute_validate_surfaces,  NVC0_NEW_CP_SURFACES    },
   { nvc0_compute_validate_globals,   NVC0_NEW_CP_GLOBALS     },
   { nve4_compute_validate_buffers,   NVC0_NEW_CP_BUFFERS     },
   { nve4_compute_validate_constbufs, NVC0_NEW_CP_CONSTBUF    },
};

static bool
nve4_state_validate_cp(struct nvc0_context *nvc0, uint32_t mask)
{
   bool ret;

   ret = nvc0_state_validate(nvc0, mask, validate_list_cp,
                             ARRAY_SIZE(validate_list_cp), &nvc0->dirty_cp,
                             nvc0->bufctx_cp);

   if (unlikely(nvc0->state.flushed))
      nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true);
   return ret;
}

static void
nve4_compute_upload_input(struct nvc0_context *nvc0,
                          const struct pipe_grid_info *info)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_program *cp = nvc0->compprog;
   uint64_t address;

   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);

   if (cp->parm_size) {
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
      PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_USR_INFO(5));
      PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_USR_INFO(5));
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
      PUSH_DATA (push, cp->parm_size);
      PUSH_DATA (push, 0x1);
      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + DIV_ROUND_UP(cp->parm_size, 4));
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
      PUSH_DATAb(push, info->input, cp->parm_size);
   }
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, address + NVC0_CB_AUX_GRID_INFO(0));
   PUSH_DATA (push, address + NVC0_CB_AUX_GRID_INFO(0));
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, 8 * 4);
   PUSH_DATA (push, 0x1);
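   /* The 8-word grid info is block[0..2], grid[0..2], a zero and work_dim.
    * For indirect launches the three grid words come straight from the
    * indirect buffer through the pushbuf (NO_PREFETCH, presumably because
    * the GPU may write them right up until submission), so the grid size
    * never has to be read back by the CPU. */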
   if (unlikely(info->indirect)) {
      struct nv04_resource *res = nv04_resource(info->indirect);
      uint32_t offset = res->offset + info->indirect_offset;

      nouveau_pushbuf_space(push, 32, 0, 1);
      PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);

      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 8);
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
      PUSH_DATAp(push, info->block, 3);
      nouveau_pushbuf_data(push, res->bo, offset,
                           NVC0_IB_ENTRY_1_NO_PREFETCH | 3 * 4);
   } else {
      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 8);
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
      PUSH_DATAp(push, info->block, 3);
      PUSH_DATAp(push, info->grid, 3);
   }
   PUSH_DATA (push, 0);
   PUSH_DATA (push, info->work_dim);

   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
}

static inline void
gp100_cp_launch_desc_set_cb(uint32_t *qmd, unsigned index,
                            struct nouveau_bo *bo, uint32_t base, uint32_t size)
{
   uint64_t address = bo->offset + base;

   assert(index < 8);
   assert(!(base & 0xff));

   NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_LOWER, index, address);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_UPPER, index, address >> 32);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CONSTANT_BUFFER_SIZE_SHIFTED4, index,
                            DIV_ROUND_UP(size, 16));
   NVC0C0_QMDV02_01_DEF_SET(qmd, CONSTANT_BUFFER_VALID, index, TRUE);
}

static inline void
nve4_cp_launch_desc_set_cb(uint32_t *qmd, unsigned index, struct nouveau_bo *bo,
                           uint32_t base, uint32_t size)
{
   uint64_t address = bo->offset + base;

   assert(index < 8);
   assert(!(base & 0xff));

   NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_LOWER, index, address);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_ADDR_UPPER, index, address >> 32);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CONSTANT_BUFFER_SIZE, index, size);
   NVA0C0_QMDV00_06_DEF_SET(qmd, CONSTANT_BUFFER_VALID, index, TRUE);
}

static void
nve4_compute_setup_buf_cb(struct nvc0_context *nvc0, bool gp100, void *desc)
{
   // only constant buffers 0-6 can be put in the descriptor, the rest are
   // loaded through global memory
   for (int i = 0; i <= 6; i++) {
      if (nvc0->constbuf[5][i].user || !nvc0->constbuf[5][i].u.buf)
         continue;

      struct nv04_resource *res =
         nv04_resource(nvc0->constbuf[5][i].u.buf);

      uint32_t base = res->offset + nvc0->constbuf[5][i].offset;
      uint32_t size = nvc0->constbuf[5][i].size;
      if (gp100)
         gp100_cp_launch_desc_set_cb(desc, i, res->bo, base, size);
      else
         nve4_cp_launch_desc_set_cb(desc, i, res->bo, base, size);
   }

   // there is no need to do FLUSH(NVE4_COMPUTE_FLUSH_CB) because
   // nve4_compute_upload_input() does it later
}
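/* A launch descriptor ("QMD", queue meta data) is a 256-byte structure in
 * GPU memory describing one dispatch: program offset, grid/block dimensions,
 * GPR/barrier counts and up to 8 constant buffers. Three layouts are used
 * here: V00_06 on Kepler (NVA0C0), V02_01 on Pascal (NVC0C0) and V02_02 on
 * Volta/Turing (NVC3C0). */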
static void
nve4_compute_setup_launch_desc(struct nvc0_context *nvc0, uint32_t *qmd,
                               const struct pipe_grid_info *info)
{
   const struct nvc0_screen *screen = nvc0->screen;
   const struct nvc0_program *cp = nvc0->compprog;

   NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_TEXTURE_HEADER_CACHE, TRUE);
   NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_TEXTURE_SAMPLER_CACHE, TRUE);
   NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_TEXTURE_DATA_CACHE, TRUE);
   NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_SHADER_DATA_CACHE, TRUE);
   NVA0C0_QMDV00_06_DEF_SET(qmd, INVALIDATE_SHADER_CONSTANT_CACHE, TRUE);
   NVA0C0_QMDV00_06_DEF_SET(qmd, RELEASE_MEMBAR_TYPE, FE_SYSMEMBAR);
   NVA0C0_QMDV00_06_DEF_SET(qmd, CWD_MEMBAR_TYPE, L1_SYSMEMBAR);
   NVA0C0_QMDV00_06_DEF_SET(qmd, API_VISIBLE_CALL_LIMIT, NO_CHECK);
   NVA0C0_QMDV00_06_VAL_SET(qmd, SASS_VERSION, 0x30);

   NVA0C0_QMDV00_06_VAL_SET(qmd, PROGRAM_OFFSET, cp->code_base);

   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_WIDTH, info->grid[0]);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_HEIGHT, info->grid[1]);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_RASTER_DEPTH, info->grid[2]);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_THREAD_DIMENSION0, info->block[0]);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_THREAD_DIMENSION1, info->block[1]);
   NVA0C0_QMDV00_06_VAL_SET(qmd, CTA_THREAD_DIMENSION2, info->block[2]);

   NVA0C0_QMDV00_06_VAL_SET(qmd, SHARED_MEMORY_SIZE,
                            align(cp->cp.smem_size, 0x100));
   NVA0C0_QMDV00_06_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE,
                            (cp->hdr[1] & 0xfffff0) +
                            align(cp->cp.lmem_size, 0x10));
   NVA0C0_QMDV00_06_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0);
   NVA0C0_QMDV00_06_VAL_SET(qmd, SHADER_LOCAL_MEMORY_CRS_SIZE, 0x800);

   if (cp->cp.smem_size > (32 << 10))
      NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION,
                               DIRECTLY_ADDRESSABLE_MEMORY_SIZE_48KB);
   else
   if (cp->cp.smem_size > (16 << 10))
      NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION,
                               DIRECTLY_ADDRESSABLE_MEMORY_SIZE_32KB);
   else
      NVA0C0_QMDV00_06_DEF_SET(qmd, L1_CONFIGURATION,
                               DIRECTLY_ADDRESSABLE_MEMORY_SIZE_16KB);

   NVA0C0_QMDV00_06_VAL_SET(qmd, REGISTER_COUNT, cp->num_gprs);
   NVA0C0_QMDV00_06_VAL_SET(qmd, BARRIER_COUNT, cp->num_barriers);

   // Only bind user uniforms and the driver constant buffer through the
   // launch descriptor, because UBOs go through the driver cb to avoid the
   // limitation of 8 CBs.
   if (nvc0->constbuf[5][0].user || cp->parm_size) {
      nve4_cp_launch_desc_set_cb(qmd, 0, screen->uniform_bo,
                                 NVC0_CB_USR_INFO(5), 1 << 16);

      // Later logic will attempt to bind a real buffer at position 0. That
      // should not happen if we've bound a user buffer.
      assert(nvc0->constbuf[5][0].user || !nvc0->constbuf[5][0].u.buf);
   }
   nve4_cp_launch_desc_set_cb(qmd, 7, screen->uniform_bo,
                              NVC0_CB_AUX_INFO(5), 1 << 11);

   nve4_compute_setup_buf_cb(nvc0, false, qmd);
}

static void
gp100_compute_setup_launch_desc(struct nvc0_context *nvc0, uint32_t *qmd,
                                const struct pipe_grid_info *info)
{
   const struct nvc0_screen *screen = nvc0->screen;
   const struct nvc0_program *cp = nvc0->compprog;

   NVC0C0_QMDV02_01_VAL_SET(qmd, SM_GLOBAL_CACHING_ENABLE, 1);
   NVC0C0_QMDV02_01_DEF_SET(qmd, RELEASE_MEMBAR_TYPE, FE_SYSMEMBAR);
   NVC0C0_QMDV02_01_DEF_SET(qmd, CWD_MEMBAR_TYPE, L1_SYSMEMBAR);
   NVC0C0_QMDV02_01_DEF_SET(qmd, API_VISIBLE_CALL_LIMIT, NO_CHECK);

   NVC0C0_QMDV02_01_VAL_SET(qmd, PROGRAM_OFFSET, cp->code_base);

   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_WIDTH, info->grid[0]);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_HEIGHT, info->grid[1]);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_RASTER_DEPTH, info->grid[2]);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_THREAD_DIMENSION0, info->block[0]);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_THREAD_DIMENSION1, info->block[1]);
   NVC0C0_QMDV02_01_VAL_SET(qmd, CTA_THREAD_DIMENSION2, info->block[2]);

   NVC0C0_QMDV02_01_VAL_SET(qmd, SHARED_MEMORY_SIZE,
                            align(cp->cp.smem_size, 0x100));
   NVC0C0_QMDV02_01_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE,
                            (cp->hdr[1] & 0xfffff0) +
                            align(cp->cp.lmem_size, 0x10));
   NVC0C0_QMDV02_01_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0);
   NVC0C0_QMDV02_01_VAL_SET(qmd, SHADER_LOCAL_MEMORY_CRS_SIZE, 0x800);

   NVC0C0_QMDV02_01_VAL_SET(qmd, REGISTER_COUNT, cp->num_gprs);
   NVC0C0_QMDV02_01_VAL_SET(qmd, BARRIER_COUNT, cp->num_barriers);

   // Only bind user uniforms and the driver constant buffer through the
   // launch descriptor, because UBOs go through the driver cb to avoid the
   // limitation of 8 CBs.
   if (nvc0->constbuf[5][0].user || cp->parm_size) {
      gp100_cp_launch_desc_set_cb(qmd, 0, screen->uniform_bo,
                                  NVC0_CB_USR_INFO(5), 1 << 16);

      // Later logic will attempt to bind a real buffer at position 0. That
      // should not happen if we've bound a user buffer.
      assert(nvc0->constbuf[5][0].user || !nvc0->constbuf[5][0].u.buf);
   }
   gp100_cp_launch_desc_set_cb(qmd, 7, screen->uniform_bo,
                               NVC0_CB_AUX_INFO(5), 1 << 11);

   nve4_compute_setup_buf_cb(nvc0, true, qmd);
}
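/* The SM shared-memory carveout is requested as an enumerated size: the
 * byte count is rounded up to 8/16/32/64/96 KiB and encoded as
 * (size / 4096) + 1. E.g. a 48 KiB request rounds up to 64 KiB and yields
 * 64 * 1024 / 4096 + 1 = 17. */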
static int
gv100_sm_config_smem_size(u32 size)
{
   if (size > 64 * 1024) size = 96 * 1024;
   else if (size > 32 * 1024) size = 64 * 1024;
   else if (size > 16 * 1024) size = 32 * 1024;
   else if (size > 8 * 1024) size = 16 * 1024;
   else size = 8 * 1024;
   return (size / 4096) + 1;
}

static void
gv100_compute_setup_launch_desc(struct nvc0_context *nvc0, u32 *qmd,
                                const struct pipe_grid_info *info)
{
   struct nvc0_program *cp = nvc0->compprog;
   struct nvc0_screen *screen = nvc0->screen;
   uint64_t entry = screen->text->offset + cp->code_base;

   NVC3C0_QMDV02_02_VAL_SET(qmd, SM_GLOBAL_CACHING_ENABLE, 1);
   NVC3C0_QMDV02_02_DEF_SET(qmd, API_VISIBLE_CALL_LIMIT, NO_CHECK);
   NVC3C0_QMDV02_02_DEF_SET(qmd, SAMPLER_INDEX, INDEPENDENTLY);
   NVC3C0_QMDV02_02_VAL_SET(qmd, SHARED_MEMORY_SIZE,
                            align(cp->cp.smem_size, 0x100));
   NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE,
                            (cp->hdr[1] & 0xfffff0) +
                            align(cp->cp.lmem_size, 0x10));
   NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0);
   NVC3C0_QMDV02_02_VAL_SET(qmd, MIN_SM_CONFIG_SHARED_MEM_SIZE,
                            gv100_sm_config_smem_size(8 * 1024));
   NVC3C0_QMDV02_02_VAL_SET(qmd, MAX_SM_CONFIG_SHARED_MEM_SIZE,
                            gv100_sm_config_smem_size(96 * 1024));
   NVC3C0_QMDV02_02_VAL_SET(qmd, QMD_VERSION, 2);
   NVC3C0_QMDV02_02_VAL_SET(qmd, QMD_MAJOR_VERSION, 2);
   NVC3C0_QMDV02_02_VAL_SET(qmd, TARGET_SM_CONFIG_SHARED_MEM_SIZE,
                            gv100_sm_config_smem_size(cp->cp.smem_size));

   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_WIDTH, info->grid[0]);
   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_HEIGHT, info->grid[1]);
   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_RASTER_DEPTH, info->grid[2]);
   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION0, info->block[0]);
   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION1, info->block[1]);
   NVC3C0_QMDV02_02_VAL_SET(qmd, CTA_THREAD_DIMENSION2, info->block[2]);
   NVC3C0_QMDV02_02_VAL_SET(qmd, REGISTER_COUNT_V, cp->num_gprs);
   NVC3C0_QMDV02_02_VAL_SET(qmd, BARRIER_COUNT, cp->num_barriers);

   // Only bind user uniforms and the driver constant buffer through the
   // launch descriptor, because UBOs go through the driver cb to avoid the
   // limitation of 8 CBs.
   if (nvc0->constbuf[5][0].user || cp->parm_size) {
      gp100_cp_launch_desc_set_cb(qmd, 0, screen->uniform_bo,
                                  NVC0_CB_USR_INFO(5), 1 << 16);

      // Later logic will attempt to bind a real buffer at position 0. That
      // should not happen if we've bound a user buffer.
      assert(nvc0->constbuf[5][0].user || !nvc0->constbuf[5][0].u.buf);
   }
   gp100_cp_launch_desc_set_cb(qmd, 7, screen->uniform_bo,
                               NVC0_CB_AUX_INFO(5), 1 << 11);

   nve4_compute_setup_buf_cb(nvc0, true, qmd);

   NVC3C0_QMDV02_02_VAL_SET(qmd, PROGRAM_ADDRESS_LOWER, entry & 0xffffffff);
   NVC3C0_QMDV02_02_VAL_SET(qmd, PROGRAM_ADDRESS_UPPER, entry >> 32);
}

static inline void *
nve4_compute_alloc_launch_desc(struct nouveau_context *nv,
                               struct nouveau_bo **pbo, uint64_t *pgpuaddr)
{
   uint8_t *ptr = nouveau_scratch_get(nv, 512, pgpuaddr, pbo);
   if (!ptr)
      return NULL;
   if (*pgpuaddr & 255) {
      unsigned adj = 256 - (*pgpuaddr & 255);
      ptr += adj;
      *pgpuaddr += adj;
   }
   memset(ptr, 0x00, 256);
   return ptr;
}

static void
nve4_upload_indirect_desc(struct nouveau_pushbuf *push,
                          struct nv04_resource *res, uint64_t gpuaddr,
                          uint32_t length, uint32_t bo_offset)
{
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, gpuaddr);
   PUSH_DATA (push, gpuaddr);
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, length);
   PUSH_DATA (push, 1);

   nouveau_pushbuf_space(push, 32, 0, 1);
   PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);

   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (length / 4));
   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
   nouveau_pushbuf_data(push, res->bo, bo_offset,
                        NVC0_IB_ENTRY_1_NO_PREFETCH | length);
}

void
nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
{
   struct nvc0_context *nvc0 = nvc0_context(pipe);
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   void *desc;
   uint64_t desc_gpuaddr;
   struct nouveau_bo *desc_bo;
   int ret;

   desc = nve4_compute_alloc_launch_desc(&nvc0->base, &desc_bo, &desc_gpuaddr);
   if (!desc) {
      ret = -1;
      goto out;
   }
   BCTX_REFN_bo(nvc0->bufctx_cp, CP_DESC, NOUVEAU_BO_GART | NOUVEAU_BO_RD,
                desc_bo);

   list_for_each_entry(struct nvc0_resident, resident, &nvc0->tex_head, list) {
      nvc0_add_resident(nvc0->bufctx_cp, NVC0_BIND_CP_BINDLESS, resident->buf,
                        resident->flags);
   }

   list_for_each_entry(struct nvc0_resident, resident, &nvc0->img_head, list) {
      nvc0_add_resident(nvc0->bufctx_cp, NVC0_BIND_CP_BINDLESS, resident->buf,
                        resident->flags);
   }

   ret = !nve4_state_validate_cp(nvc0, ~0);
   if (ret)
      goto out;

   if (nvc0->screen->compute->oclass >= GV100_COMPUTE_CLASS)
      gv100_compute_setup_launch_desc(nvc0, desc, info);
   else
   if (nvc0->screen->compute->oclass >= GP100_COMPUTE_CLASS)
      gp100_compute_setup_launch_desc(nvc0, desc, info);
   else
      nve4_compute_setup_launch_desc(nvc0, desc, info);

   nve4_compute_upload_input(nvc0, info);

#ifndef NDEBUG
   if (debug_get_num_option("NV50_PROG_DEBUG", 0)) {
      debug_printf("Queue Meta Data:\n");
      if (nvc0->screen->compute->oclass >= GV100_COMPUTE_CLASS)
         NVC3C0QmdDump_V02_02(desc);
      else
      if (nvc0->screen->compute->oclass >= GP100_COMPUTE_CLASS)
         NVC0C0QmdDump_V02_01(desc);
      else
         NVA0C0QmdDump_V00_06(desc);
   }
#endif
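   /* For indirect dispatch, the descriptor is uploaded as-is first, then the
    * grid dimension words (at byte offset 48 into the QMD) are patched in
    * place from the indirect buffer, so the grid size never round-trips
    * through the CPU. */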
   if (unlikely(info->indirect)) {
      struct nv04_resource *res = nv04_resource(info->indirect);
      uint32_t offset = res->offset + info->indirect_offset;

      /* upload the descriptor */
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
      PUSH_DATAh(push, desc_gpuaddr);
      PUSH_DATA (push, desc_gpuaddr);
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
      PUSH_DATA (push, 256);
      PUSH_DATA (push, 1);
      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (256 / 4));
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
      PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);

      if (nvc0->screen->compute->oclass >= GP100_COMPUTE_CLASS) {
         nve4_upload_indirect_desc(push, res, desc_gpuaddr + 48, 12, offset);
      } else {
         /* overwrite griddim_x and griddim_y as two 32-bit integers even
          * though griddim_y must be a 16-bit integer */
         nve4_upload_indirect_desc(push, res, desc_gpuaddr + 48, 8, offset);

         /* overwrite the 16 high bits of griddim_y with griddim_z because
          * we need (z << 16) | y */
         nve4_upload_indirect_desc(push, res, desc_gpuaddr + 54, 4, offset + 8);
      }
   }

   /* point the engine at the descriptor and launch */
   nouveau_pushbuf_space(push, 32, 1, 0);
   PUSH_REFN(push, screen->text, NV_VRAM_DOMAIN(&screen->base) | NOUVEAU_BO_RD);
   BEGIN_NVC0(push, NVE4_CP(LAUNCH_DESC_ADDRESS), 1);
   PUSH_DATA (push, desc_gpuaddr >> 8);
   BEGIN_NVC0(push, NVE4_CP(LAUNCH), 1);
   PUSH_DATA (push, 0x3);
   BEGIN_NVC0(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 1);
   PUSH_DATA (push, 0);

   nvc0_update_compute_invocations_counter(nvc0, info);

out:
   if (ret)
      NOUVEAU_ERR("Failed to launch grid!\n");
   nouveau_scratch_done(&nvc0->base);
   nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_DESC);
   nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_BINDLESS);
}


#define NVE4_TIC_ENTRY_INVALID 0x000fffff
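/* Compute-side counterpart of the 3D TIC validation (see the comment above
 * nve4_compute_set_tex_handles): new TIC entries are uploaded through the
 * inline-data path and flushed with TIC_FLUSH, while entries whose resource
 * was just written by the GPU only need a TEX_CACHE_CTL flush. */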
static void
nve4_compute_validate_textures(struct nvc0_context *nvc0)
{
   struct nouveau_bo *txc = nvc0->screen->txc;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   const unsigned s = 5;
   unsigned i;
   uint32_t commands[2][32];
   unsigned n[2] = { 0, 0 };

   for (i = 0; i < nvc0->num_textures[s]; ++i) {
      struct nv50_tic_entry *tic = nv50_tic_entry(nvc0->textures[s][i]);
      struct nv04_resource *res;
      const bool dirty = !!(nvc0->textures_dirty[s] & (1 << i));

      if (!tic) {
         nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID;
         continue;
      }
      res = nv04_resource(tic->pipe.texture);
      nvc0_update_tic(nvc0, tic, res);

      if (tic->id < 0) {
         tic->id = nvc0_screen_tic_alloc(nvc0->screen, tic);

         PUSH_SPACE(push, 16);
         BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
         PUSH_DATAh(push, txc->offset + (tic->id * 32));
         PUSH_DATA (push, txc->offset + (tic->id * 32));
         BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
         PUSH_DATA (push, 32);
         PUSH_DATA (push, 1);
         BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 9);
         PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
         PUSH_DATAp(push, &tic->tic[0], 8);

         commands[0][n[0]++] = (tic->id << 4) | 1;
      } else
      if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) {
         commands[1][n[1]++] = (tic->id << 4) | 1;
      }
      nvc0->screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32);

      res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
      res->status |= NOUVEAU_BUFFER_STATUS_GPU_READING;

      nvc0->tex_handles[s][i] &= ~NVE4_TIC_ENTRY_INVALID;
      nvc0->tex_handles[s][i] |= tic->id;
      if (dirty)
         BCTX_REFN(nvc0->bufctx_cp, CP_TEX(i), res, RD);
   }
   for (; i < nvc0->state.num_textures[s]; ++i) {
      nvc0->tex_handles[s][i] |= NVE4_TIC_ENTRY_INVALID;
      nvc0->textures_dirty[s] |= 1 << i;
   }

   if (n[0]) {
      BEGIN_NIC0(push, NVE4_CP(TIC_FLUSH), n[0]);
      PUSH_DATAp(push, commands[0], n[0]);
   }
   if (n[1]) {
      BEGIN_NIC0(push, NVE4_CP(TEX_CACHE_CTL), n[1]);
      PUSH_DATAp(push, commands[1], n[1]);
   }

   nvc0->state.num_textures[s] = nvc0->num_textures[s];

   /* Invalidate all 3D textures because they are aliased. */
   for (int s = 0; s < 5; s++) {
      for (int i = 0; i < nvc0->num_textures[s]; i++)
         nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TEX(s, i));
      nvc0->textures_dirty[s] = ~0;
   }
   nvc0->dirty_3d |= NVC0_NEW_3D_TEXTURES;
}

#ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER
static void
nve4_compute_trap_info(struct nvc0_context *nvc0)
{
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_bo *bo = screen->parm;
   int ret, i;
   volatile struct nve4_mp_trap_info *info;
   uint8_t *map;

   ret = nouveau_bo_map(bo, NOUVEAU_BO_RDWR, nvc0->base.client);
   if (ret)
      return;
   map = (uint8_t *)bo->map;
   info = (volatile struct nve4_mp_trap_info *)(map + NVE4_CP_PARAM_TRAP_INFO);

   if (info->lock) {
      debug_printf("trapstat = %08x\n", info->trapstat);
      debug_printf("warperr = %08x\n", info->warperr);
      debug_printf("PC = %x\n", info->pc);
      debug_printf("tid = %u %u %u\n",
                   info->tid[0], info->tid[1], info->tid[2]);
      debug_printf("ctaid = %u %u %u\n",
                   info->ctaid[0], info->ctaid[1], info->ctaid[2]);
      for (i = 0; i <= 63; ++i)
         debug_printf("$r%i = %08x\n", i, info->r[i]);
      for (i = 0; i <= 6; ++i)
         debug_printf("$p%i = %i\n", i, (info->flags >> i) & 1);
      debug_printf("$c = %x\n", info->flags >> 12);
   }
   info->lock = 0;
}
#endif