/*
 * Copyright © 2014-2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file v3dx_simulator.c
 *
 * Implements the actual HW interaction between the GL driver's V3D simulator
 * layer and the simulator.
 *
 * The register headers between V3D versions will have conflicting defines,
 * so all register interactions appear in this file, which is compiled once
 * per V3D version we support.
 */

#ifdef USE_V3D_SIMULATOR

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#include "v3d_simulator.h"
#include "v3d_simulator_wrapper.h"

#include "util/macros.h"
#include "util/bitscan.h"
#include "drm-uapi/v3d_drm.h"

#define HW_REGISTER_RO(x) (x)
#define HW_REGISTER_RW(x) (x)
#if V3D_VERSION >= 41
#include "libs/core/v3d/registers/4.1.35.0/v3d.h"
#else
#include "libs/core/v3d/registers/3.3.0.0/v3d.h"
#endif

#define V3D_WRITE(reg, val) v3d_hw_write_reg(v3d, reg, val)
#define V3D_READ(reg) v3d_hw_read_reg(v3d, reg)

/* Invalidates the L3 cache.  On pre-4.x hardware this is done through the
 * GCA; on newer versions this is a no-op.
 */
static void
v3d_invalidate_l3(struct v3d_hw *v3d)
{
#if V3D_VERSION < 40
        uint32_t gca_ctrl = V3D_READ(V3D_GCA_CACHE_CTRL);

        V3D_WRITE(V3D_GCA_CACHE_CTRL, gca_ctrl | V3D_GCA_CACHE_CTRL_FLUSH_SET);
        V3D_WRITE(V3D_GCA_CACHE_CTRL, gca_ctrl & ~V3D_GCA_CACHE_CTRL_FLUSH_SET);
#endif
}

/* Invalidates the L2C cache.  This is a read-only cache for uniforms and
 * instructions.
 */
static void
v3d_invalidate_l2c(struct v3d_hw *v3d)
{
        if (V3D_VERSION >= 33)
                return;

        V3D_WRITE(V3D_CTL_0_L2CACTL,
                  V3D_CTL_0_L2CACTL_L2CCLR_SET |
                  V3D_CTL_0_L2CACTL_L2CENA_SET);
}
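/* Modes for L2T cache operations, written into the L2TFLM field of
 * L2TCACTL.  Judging from how they are used below: FLUSH invalidates the
 * selected cachelines and CLEAN writes dirty ones back to memory; CLEAR is
 * unused here, presumably combining both.
 */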
enum v3d_l2t_cache_flush_mode {
        V3D_CACHE_FLUSH_MODE_FLUSH,
        V3D_CACHE_FLUSH_MODE_CLEAR,
        V3D_CACHE_FLUSH_MODE_CLEAN,
};

/* Invalidates texture L2 cachelines. */
static void
v3d_invalidate_l2t(struct v3d_hw *v3d)
{
        /* A start address of 0 and an end address of ~0 cover the entire
         * address space.
         */
        V3D_WRITE(V3D_CTL_0_L2TFLSTA, 0);
        V3D_WRITE(V3D_CTL_0_L2TFLEND, ~0);
        V3D_WRITE(V3D_CTL_0_L2TCACTL,
                  V3D_CTL_0_L2TCACTL_L2TFLS_SET |
                  (V3D_CACHE_FLUSH_MODE_FLUSH << V3D_CTL_0_L2TCACTL_L2TFLM_LSB));
}

/*
 * Waits for l2tcactl, used for flushes.
 *
 * FIXME: for a multicore scenario we should pass the core here.  The whole
 * wrapper assumes a single core, so it would be better to handle multicore
 * there.
 */
static UNUSED void v3d_core_wait_l2tcactl(struct v3d_hw *v3d,
                                          uint32_t ctrl)
{
        assert(!(ctrl & ~(V3D_CTL_0_L2TCACTL_TMUWCF_SET |
                          V3D_CTL_0_L2TCACTL_L2TFLS_SET)));

        while (V3D_READ(V3D_CTL_0_L2TCACTL) & ctrl) {
                v3d_hw_tick(v3d);
        }
}

/* Flushes dirty texture cachelines from the L1 write combiner. */
static void
v3d_flush_l1td(struct v3d_hw *v3d)
{
        V3D_WRITE(V3D_CTL_0_L2TCACTL,
                  V3D_CTL_0_L2TCACTL_TMUWCF_SET);

        /* Note: the kernel (and previous versions of the simulator wrapper)
         * waits on V3D_CTL_0_L2TCACTL_L2TFLS_SET here, as with l2t.  Waiting
         * on TMUWCF seems to make more sense, but we still need to confirm
         * which one is correct.  So far things work fine on the simulator
         * this way.
         */
        v3d_core_wait_l2tcactl(v3d, V3D_CTL_0_L2TCACTL_TMUWCF_SET);
}

/* Flushes dirty texture L2 cachelines. */
static void
v3d_flush_l2t(struct v3d_hw *v3d)
{
        V3D_WRITE(V3D_CTL_0_L2TFLSTA, 0);
        V3D_WRITE(V3D_CTL_0_L2TFLEND, ~0);
        V3D_WRITE(V3D_CTL_0_L2TCACTL,
                  V3D_CTL_0_L2TCACTL_L2TFLS_SET |
                  (V3D_CACHE_FLUSH_MODE_CLEAN << V3D_CTL_0_L2TCACTL_L2TFLM_LSB));

        v3d_core_wait_l2tcactl(v3d, V3D_CTL_0_L2TCACTL_L2TFLS_SET);
}

/* Invalidates the slice caches.  These are read-only caches. */
static void
v3d_invalidate_slices(struct v3d_hw *v3d)
{
        V3D_WRITE(V3D_CTL_0_SLCACTL, ~0);
}

static void
v3d_invalidate_caches(struct v3d_hw *v3d)
{
        v3d_invalidate_l3(v3d);
        v3d_invalidate_l2c(v3d);
        v3d_invalidate_l2t(v3d);
        v3d_invalidate_slices(v3d);
}

static uint32_t g_gmp_ofs;

/* Completely resets the GMP: enables protection, points the GMP at the
 * current protection table, reloads it, and waits for the configuration to
 * settle.
 */
static void
v3d_reload_gmp(struct v3d_hw *v3d)
{
        V3D_WRITE(V3D_GMP_CFG,
                  V3D_GMP_CFG_PROTENABLE_SET);
        V3D_WRITE(V3D_GMP_TABLE_ADDR, g_gmp_ofs);
        V3D_WRITE(V3D_GMP_CLEAR_LOAD, ~0);
        while (V3D_READ(V3D_GMP_STATUS) &
               V3D_GMP_STATUS_CFG_BUSY_SET) {
                ;
        }
}

static UNUSED void
v3d_flush_caches(struct v3d_hw *v3d)
{
        v3d_flush_l1td(v3d);
        v3d_flush_l2t(v3d);
}
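/* Implements the SUBMIT_TFU ioctl on the simulator: programs the TFU
 * (texture formatting unit) job registers, queues the job with the final
 * ICFG write, and ticks the simulator until the conversion count in
 * V3D_TFU_CS advances, meaning the job has completed.
 */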
int
v3dX(simulator_submit_tfu_ioctl)(struct v3d_hw *v3d,
                                 struct drm_v3d_submit_tfu *args)
{
        int last_vtct = V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET;

        V3D_WRITE(V3D_TFU_IIA, args->iia);
        V3D_WRITE(V3D_TFU_IIS, args->iis);
        V3D_WRITE(V3D_TFU_ICA, args->ica);
        V3D_WRITE(V3D_TFU_IUA, args->iua);
        V3D_WRITE(V3D_TFU_IOA, args->ioa);
        V3D_WRITE(V3D_TFU_IOS, args->ios);
        V3D_WRITE(V3D_TFU_COEF0, args->coef[0]);
        V3D_WRITE(V3D_TFU_COEF1, args->coef[1]);
        V3D_WRITE(V3D_TFU_COEF2, args->coef[2]);
        V3D_WRITE(V3D_TFU_COEF3, args->coef[3]);

        V3D_WRITE(V3D_TFU_ICFG, args->icfg);

        while ((V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET) == last_vtct) {
                v3d_hw_tick(v3d);
        }

        return 0;
}

#if V3D_VERSION >= 41
int
v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d,
                                 struct drm_v3d_submit_csd *args,
                                 uint32_t gmp_ofs)
{
        int last_completed_jobs = (V3D_READ(V3D_CSD_0_STATUS) &
                                   V3D_CSD_0_STATUS_NUM_COMPLETED_JOBS_SET);
        g_gmp_ofs = gmp_ofs;
        v3d_reload_gmp(v3d);

        v3d_invalidate_caches(v3d);

        V3D_WRITE(V3D_CSD_0_QUEUED_CFG1, args->cfg[1]);
        V3D_WRITE(V3D_CSD_0_QUEUED_CFG2, args->cfg[2]);
        V3D_WRITE(V3D_CSD_0_QUEUED_CFG3, args->cfg[3]);
        V3D_WRITE(V3D_CSD_0_QUEUED_CFG4, args->cfg[4]);
        V3D_WRITE(V3D_CSD_0_QUEUED_CFG5, args->cfg[5]);
        V3D_WRITE(V3D_CSD_0_QUEUED_CFG6, args->cfg[6]);
        /* CFG0 kicks off the job. */
        V3D_WRITE(V3D_CSD_0_QUEUED_CFG0, args->cfg[0]);

        /* Now we wait for the dispatch to finish.  The safest way is to
         * check if NUM_COMPLETED_JOBS has increased.  Note that, in spite of
         * its name, that register field counts completed dispatches.
         */
        while ((V3D_READ(V3D_CSD_0_STATUS) &
                V3D_CSD_0_STATUS_NUM_COMPLETED_JOBS_SET) == last_completed_jobs) {
                v3d_hw_tick(v3d);
        }

        v3d_flush_caches(v3d);

        return 0;
}
#endif

int
v3dX(simulator_get_param_ioctl)(struct v3d_hw *v3d,
                                struct drm_v3d_get_param *args)
{
        static const uint32_t reg_map[] = {
                [DRM_V3D_PARAM_V3D_UIFCFG] = V3D_HUB_CTL_UIFCFG,
                [DRM_V3D_PARAM_V3D_HUB_IDENT1] = V3D_HUB_CTL_IDENT1,
                [DRM_V3D_PARAM_V3D_HUB_IDENT2] = V3D_HUB_CTL_IDENT2,
                [DRM_V3D_PARAM_V3D_HUB_IDENT3] = V3D_HUB_CTL_IDENT3,
                [DRM_V3D_PARAM_V3D_CORE0_IDENT0] = V3D_CTL_0_IDENT0,
                [DRM_V3D_PARAM_V3D_CORE0_IDENT1] = V3D_CTL_0_IDENT1,
                [DRM_V3D_PARAM_V3D_CORE0_IDENT2] = V3D_CTL_0_IDENT2,
        };

        switch (args->param) {
        case DRM_V3D_PARAM_SUPPORTS_TFU:
                args->value = 1;
                return 0;
        case DRM_V3D_PARAM_SUPPORTS_CSD:
                args->value = V3D_VERSION >= 41;
                return 0;
        case DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH:
                args->value = 1;
                return 0;
        case DRM_V3D_PARAM_SUPPORTS_PERFMON:
                args->value = V3D_VERSION >= 41;
                return 0;
        case DRM_V3D_PARAM_SUPPORTS_MULTISYNC_EXT:
                args->value = 1;
                return 0;
        }

        if (args->param < ARRAY_SIZE(reg_map) && reg_map[args->param]) {
                args->value = V3D_READ(reg_map[args->param]);
                return 0;
        }

        fprintf(stderr, "Unknown DRM_IOCTL_V3D_GET_PARAM(%lld)\n",
                (long long)args->param);
        abort();
}

static struct v3d_hw *v3d_isr_hw;
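/* The simulator wrapper invokes a single ISR callback with a status word
 * containing one pending bit per core plus one for the hub itself;
 * v3d_isr() below demultiplexes it into the per-core and hub handlers.
 */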
static void
v3d_isr_core(struct v3d_hw *v3d,
             unsigned core)
{
        /* FIXME: so far we are assuming just one core, and using only the
         * _0_ registers.  If we add multicore support to the simulator, we
         * would need to pass the core as a parameter and choose the proper
         * registers.
         */
        assert(core == 0);
        uint32_t core_status = V3D_READ(V3D_CTL_0_INT_STS);
        V3D_WRITE(V3D_CTL_0_INT_CLR, core_status);

        if (core_status & V3D_CTL_0_INT_STS_INT_OUTOMEM_SET) {
                /* The binner ran out of memory: hand it a freshly allocated
                 * spill buffer through the PTB binner pool overflow
                 * registers.
                 */
                uint32_t size = 256 * 1024;
                uint32_t offset = v3d_simulator_get_spill(size);

                v3d_reload_gmp(v3d);

                V3D_WRITE(V3D_PTB_0_BPOA, offset);
                V3D_WRITE(V3D_PTB_0_BPOS, size);
                return;
        }

        if (core_status & V3D_CTL_0_INT_STS_INT_GMPV_SET) {
                fprintf(stderr, "GMP violation at 0x%08x\n",
                        V3D_READ(V3D_GMP_VIO_ADDR));
                abort();
        } else {
                fprintf(stderr,
                        "Unexpected ISR with core status 0x%08x\n",
                        core_status);
        }
        abort();
}

static void
handle_mmu_interruptions(struct v3d_hw *v3d,
                         uint32_t hub_status)
{
        bool wrv = hub_status & V3D_HUB_CTL_INT_STS_INT_MMU_WRV_SET;
        bool pti = hub_status & V3D_HUB_CTL_INT_STS_INT_MMU_PTI_SET;
        bool cap = hub_status & V3D_HUB_CTL_INT_STS_INT_MMU_CAP_SET;

        if (!(pti || cap || wrv))
                return;

        const char *client = "?";
        uint32_t axi_id = V3D_READ(V3D_MMU_VIO_ID);
        uint32_t va_width = 30;

#if V3D_VERSION >= 41
        static const char *const v3d41_axi_ids[] = {
                "L2T",
                "PTB",
                "PSE",
                "TLB",
                "CLE",
                "TFU",
                "MMU",
                "GMP",
        };

        axi_id = axi_id >> 5;
        if (axi_id < ARRAY_SIZE(v3d41_axi_ids))
                client = v3d41_axi_ids[axi_id];

        uint32_t mmu_debug = V3D_READ(V3D_MMU_DEBUG_INFO);

        va_width += ((mmu_debug & V3D_MMU_DEBUG_INFO_VA_WIDTH_SET)
                     >> V3D_MMU_DEBUG_INFO_VA_WIDTH_LSB);
#endif
        /* Only the top bits (the exact number depends on the gen) of the
         * virtual address are reported in the MMU VIO_ADDR register.
         */
        uint64_t vio_addr = ((uint64_t)V3D_READ(V3D_MMU_VIO_ADDR) <<
                             (va_width - 32));

        /* Difference from the kernel: here we are going to abort after
         * logging, so we don't bother with some things the kernel does,
         * like restoring the MMU ctrl bits.
         */

        fprintf(stderr, "MMU error from client %s (%d) at 0x%llx%s%s%s\n",
                client, axi_id, (long long)vio_addr,
                wrv ? ", write violation" : "",
                pti ? ", pte invalid" : "",
                cap ? ", cap exceeded" : "");

        abort();
}

static void
v3d_isr_hub(struct v3d_hw *v3d)
{
        uint32_t hub_status = V3D_READ(V3D_HUB_CTL_INT_STS);

        /* Acknowledge the interrupts we're handling here. */
        V3D_WRITE(V3D_HUB_CTL_INT_CLR, hub_status);

        if (hub_status & V3D_HUB_CTL_INT_STS_INT_TFUC_SET) {
                /* FIXME: we have not been able to trigger this interrupt so
                 * far.  We leave the unreachable here so we notice if it
                 * ever gets raised in the future.  In any case, for this
                 * interrupt we would only be logging debug output.
                 */
                unreachable("TFU Conversion Complete interrupt not handled");
        }

        handle_mmu_interruptions(v3d, hub_status);
}

static void
v3d_isr(uint32_t hub_status)
{
        struct v3d_hw *v3d = v3d_isr_hw;
        uint32_t mask = hub_status;

        /* Check the hub_status bits: one per core, plus one for the hub
         * itself.
         */
        while (mask) {
                unsigned core = u_bit_scan(&mask);

                if (core == v3d_hw_get_hub_core())
                        v3d_isr_hub(v3d);
                else
                        v3d_isr_core(v3d, core);
        }
}
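/* One-time register setup for a simulator instance: configures
 * version-dependent defaults, unmasks only the core and hub interrupts
 * handled above, and installs the ISR.
 */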
void
v3dX(simulator_init_regs)(struct v3d_hw *v3d)
{
#if V3D_VERSION == 33
        /* Set OVRTMUOUT to match kernel behavior.
         *
         * This means that the texture sampler uniform configuration's tmu
         * output type field is used, instead of using the hardware default
         * behavior based on the texture type.  If you want the default
         * behavior, you can still put "2" in the indirect texture state's
         * output_type field.
         */
        V3D_WRITE(V3D_CTL_0_MISCCFG, V3D_CTL_1_MISCCFG_OVRTMUOUT_SET);
#endif

        /* FIXME: the kernel captures some additional core interrupts here,
         * for tracing.  Perhaps we should evaluate doing the same here and
         * adding some debug options.
         */
        uint32_t core_interrupts = (V3D_CTL_0_INT_STS_INT_GMPV_SET |
                                    V3D_CTL_0_INT_STS_INT_OUTOMEM_SET);
        V3D_WRITE(V3D_CTL_0_INT_MSK_SET, ~core_interrupts);
        V3D_WRITE(V3D_CTL_0_INT_MSK_CLR, core_interrupts);

        uint32_t hub_interrupts =
                (V3D_HUB_CTL_INT_STS_INT_MMU_WRV_SET |  /* write violation */
                 V3D_HUB_CTL_INT_STS_INT_MMU_PTI_SET |  /* page table invalid */
                 V3D_HUB_CTL_INT_STS_INT_MMU_CAP_SET |  /* CAP exceeded */
                 V3D_HUB_CTL_INT_STS_INT_TFUC_SET);     /* TFU conversion */

        V3D_WRITE(V3D_HUB_CTL_INT_MSK_SET, ~hub_interrupts);
        V3D_WRITE(V3D_HUB_CTL_INT_MSK_CLR, hub_interrupts);

        v3d_isr_hw = v3d;
        v3d_hw_set_isr(v3d, v3d_isr);
}

/* Implements the SUBMIT_CL ioctl on the simulator: kicks the binner control
 * list, ticks the simulator until binning completes, then kicks the render
 * control list and waits again.
 */
void
v3dX(simulator_submit_cl_ioctl)(struct v3d_hw *v3d,
                                struct drm_v3d_submit_cl *submit,
                                uint32_t gmp_ofs)
{
        int last_bfc = (V3D_READ(V3D_CLE_0_BFC) &
                        V3D_CLE_0_BFC_BMFCT_SET);

        int last_rfc = (V3D_READ(V3D_CLE_0_RFC) &
                        V3D_CLE_0_RFC_RMFCT_SET);

        g_gmp_ofs = gmp_ofs;
        v3d_reload_gmp(v3d);

        v3d_invalidate_caches(v3d);

        if (submit->qma) {
                V3D_WRITE(V3D_CLE_0_CT0QMA, submit->qma);
                V3D_WRITE(V3D_CLE_0_CT0QMS, submit->qms);
        }
#if V3D_VERSION >= 41
        if (submit->qts) {
                V3D_WRITE(V3D_CLE_0_CT0QTS,
                          V3D_CLE_0_CT0QTS_CTQTSEN_SET |
                          submit->qts);
        }
#endif
        V3D_WRITE(V3D_CLE_0_CT0QBA, submit->bcl_start);
        V3D_WRITE(V3D_CLE_0_CT0QEA, submit->bcl_end);

        /* Wait for bin to complete before firing render.  The kernel's
         * scheduler implements this using the GPU scheduler blocking on the
         * bin fence completing.  (We don't use HW semaphores).
         */
        while ((V3D_READ(V3D_CLE_0_BFC) &
                V3D_CLE_0_BFC_BMFCT_SET) == last_bfc) {
                v3d_hw_tick(v3d);
        }

        v3d_invalidate_caches(v3d);

        V3D_WRITE(V3D_CLE_0_CT1QBA, submit->rcl_start);
        V3D_WRITE(V3D_CLE_0_CT1QEA, submit->rcl_end);

        while ((V3D_READ(V3D_CLE_0_RFC) &
                V3D_CLE_0_RFC_RMFCT_SET) == last_rfc) {
                v3d_hw_tick(v3d);
        }
}
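/* Performance counter register layout (4.1+): PCTR_N(x) addresses the x-th
 * counter value register, and each SRC register packs the event source
 * selectors for four consecutive counters, one byte each (of which the low
 * 7 bits are used, per the mask below).
 */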
#if V3D_VERSION >= 41
#define V3D_PCTR_0_PCTR_N(x) (V3D_PCTR_0_PCTR0 + 4 * (x))
#define V3D_PCTR_0_SRC_N(x) (V3D_PCTR_0_SRC_0_3 + 4 * (x))
#define V3D_PCTR_0_SRC_N_SHIFT(x) ((x) * 8)
#define V3D_PCTR_0_SRC_N_MASK(x) (BITFIELD_RANGE(V3D_PCTR_0_SRC_N_SHIFT(x), \
                                                 V3D_PCTR_0_SRC_N_SHIFT(x) + 6))
#endif

void
v3dX(simulator_perfmon_start)(struct v3d_hw *v3d,
                              uint32_t ncounters,
                              uint8_t *events)
{
#if V3D_VERSION >= 41
        int i, j;
        uint32_t source;
        uint32_t mask = BITFIELD_RANGE(0, ncounters);

        for (i = 0; i < ncounters; i += 4) {
                source = i / 4;
                uint32_t channels = 0;
                for (j = 0; j < 4 && (i + j) < ncounters; j++)
                        channels |= events[i + j] << V3D_PCTR_0_SRC_N_SHIFT(j);
                V3D_WRITE(V3D_PCTR_0_SRC_N(source), channels);
        }
        /* Clear the selected counters and (presumably write-1-to-clear)
         * their overflow flags, then enable them.
         */
        V3D_WRITE(V3D_PCTR_0_CLR, mask);
        V3D_WRITE(V3D_PCTR_0_OVERFLOW, mask);
        V3D_WRITE(V3D_PCTR_0_EN, mask);
#endif
}

void v3dX(simulator_perfmon_stop)(struct v3d_hw *v3d,
                                  uint32_t ncounters,
                                  uint64_t *values)
{
#if V3D_VERSION >= 41
        int i;

        for (i = 0; i < ncounters; i++)
                values[i] += V3D_READ(V3D_PCTR_0_PCTR_N(i));

        V3D_WRITE(V3D_PCTR_0_EN, 0);
#endif
}

#endif /* USE_V3D_SIMULATOR */