1/* 2 * Copyright 2018 Advanced Micro Devices, Inc. 3 * All Rights Reserved. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * on the rights to use, copy, modify, merge, publish, distribute, sub 9 * license, and/or sell copies of the Software, and to permit persons to whom 10 * the Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 22 * USE OR OTHER DEALINGS IN THE SOFTWARE. 23 */ 24 25#include "si_pipe.h" 26#include "tgsi/tgsi_text.h" 27#include "tgsi/tgsi_ureg.h" 28 29void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, unsigned num_layers) 30{ 31 unsigned vs_blit_property; 32 void **vs; 33 34 switch (type) { 35 case UTIL_BLITTER_ATTRIB_NONE: 36 vs = num_layers > 1 ? &sctx->vs_blit_pos_layered : &sctx->vs_blit_pos; 37 vs_blit_property = SI_VS_BLIT_SGPRS_POS; 38 break; 39 case UTIL_BLITTER_ATTRIB_COLOR: 40 vs = num_layers > 1 ? 
&sctx->vs_blit_color_layered : &sctx->vs_blit_color; 41 vs_blit_property = SI_VS_BLIT_SGPRS_POS_COLOR; 42 break; 43 case UTIL_BLITTER_ATTRIB_TEXCOORD_XY: 44 case UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW: 45 assert(num_layers == 1); 46 vs = &sctx->vs_blit_texcoord; 47 vs_blit_property = SI_VS_BLIT_SGPRS_POS_TEXCOORD; 48 break; 49 default: 50 assert(0); 51 return NULL; 52 } 53 if (*vs) 54 return *vs; 55 56 struct ureg_program *ureg = ureg_create(PIPE_SHADER_VERTEX); 57 if (!ureg) 58 return NULL; 59 60 /* Tell the shader to load VS inputs from SGPRs: */ 61 ureg_property(ureg, TGSI_PROPERTY_VS_BLIT_SGPRS_AMD, vs_blit_property); 62 ureg_property(ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, true); 63 64 /* This is just a pass-through shader with 1-3 MOV instructions. */ 65 ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0), ureg_DECL_vs_input(ureg, 0)); 66 67 if (type != UTIL_BLITTER_ATTRIB_NONE) { 68 ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 0), ureg_DECL_vs_input(ureg, 1)); 69 } 70 71 if (num_layers > 1) { 72 struct ureg_src instance_id = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_INSTANCEID, 0); 73 struct ureg_dst layer = ureg_DECL_output(ureg, TGSI_SEMANTIC_LAYER, 0); 74 75 ureg_MOV(ureg, ureg_writemask(layer, TGSI_WRITEMASK_X), 76 ureg_scalar(instance_id, TGSI_SWIZZLE_X)); 77 } 78 ureg_END(ureg); 79 80 *vs = ureg_create_shader_and_destroy(ureg, &sctx->b); 81 return *vs; 82} 83 84/* Create a compute shader implementing clear_buffer or copy_buffer. */ 85void *si_create_dma_compute_shader(struct pipe_context *ctx, unsigned num_dwords_per_thread, 86 bool dst_stream_cache_policy, bool is_copy) 87{ 88 struct si_screen *sscreen = (struct si_screen *)ctx->screen; 89 assert(util_is_power_of_two_nonzero(num_dwords_per_thread)); 90 91 unsigned store_qualifier = TGSI_MEMORY_COHERENT | TGSI_MEMORY_RESTRICT; 92 if (dst_stream_cache_policy) 93 store_qualifier |= TGSI_MEMORY_STREAM_CACHE_POLICY; 94 95 /* Don't cache loads, because there is no reuse. 
*/ 96 unsigned load_qualifier = store_qualifier | TGSI_MEMORY_STREAM_CACHE_POLICY; 97 98 unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4); 99 unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned)); 100 101 for (unsigned i = 0; i < num_mem_ops; i++) { 102 if (i * 4 < num_dwords_per_thread) 103 inst_dwords[i] = MIN2(4, num_dwords_per_thread - i * 4); 104 } 105 106 struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE); 107 if (!ureg) 108 return NULL; 109 110 unsigned default_wave_size = si_determine_wave_size(sscreen, NULL); 111 112 ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, default_wave_size); 113 ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1); 114 ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1); 115 116 struct ureg_src value; 117 if (!is_copy) { 118 ureg_property(ureg, TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD, inst_dwords[0]); 119 value = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_CS_USER_DATA_AMD, 0); 120 } 121 122 struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0); 123 struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0); 124 struct ureg_dst store_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X); 125 struct ureg_dst load_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X); 126 struct ureg_dst dstbuf = ureg_dst(ureg_DECL_buffer(ureg, 0, false)); 127 struct ureg_src srcbuf; 128 struct ureg_src *values = NULL; 129 130 if (is_copy) { 131 srcbuf = ureg_DECL_buffer(ureg, 1, false); 132 values = malloc(num_mem_ops * sizeof(struct ureg_src)); 133 } 134 135 /* If there are multiple stores, the first store writes into 0*wavesize+tid, 136 * the 2nd store writes into 1*wavesize+tid, the 3rd store writes into 2*wavesize+tid, etc. 137 */ 138 ureg_UMAD(ureg, store_addr, blk, ureg_imm1u(ureg, default_wave_size * num_mem_ops), 139 tid); 140 /* Convert from a "store size unit" into bytes. 
*/ 141 ureg_UMUL(ureg, store_addr, ureg_src(store_addr), ureg_imm1u(ureg, 4 * inst_dwords[0])); 142 ureg_MOV(ureg, load_addr, ureg_src(store_addr)); 143 144 /* Distance between a load and a store for latency hiding. */ 145 unsigned load_store_distance = is_copy ? 8 : 0; 146 147 for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) { 148 int d = i - load_store_distance; 149 150 if (is_copy && i < num_mem_ops) { 151 if (i) { 152 ureg_UADD(ureg, load_addr, ureg_src(load_addr), 153 ureg_imm1u(ureg, 4 * inst_dwords[i] * default_wave_size)); 154 } 155 156 values[i] = ureg_src(ureg_DECL_temporary(ureg)); 157 struct ureg_dst dst = 158 ureg_writemask(ureg_dst(values[i]), u_bit_consecutive(0, inst_dwords[i])); 159 struct ureg_src srcs[] = {srcbuf, ureg_src(load_addr)}; 160 ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dst, 1, srcs, 2, load_qualifier, 161 TGSI_TEXTURE_BUFFER, 0); 162 } 163 164 if (d >= 0) { 165 if (d) { 166 ureg_UADD(ureg, store_addr, ureg_src(store_addr), 167 ureg_imm1u(ureg, 4 * inst_dwords[d] * default_wave_size)); 168 } 169 170 struct ureg_dst dst = ureg_writemask(dstbuf, u_bit_consecutive(0, inst_dwords[d])); 171 struct ureg_src srcs[] = {ureg_src(store_addr), is_copy ? values[d] : value}; 172 ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst, 1, srcs, 2, store_qualifier, 173 TGSI_TEXTURE_BUFFER, 0); 174 } 175 } 176 ureg_END(ureg); 177 178 struct pipe_compute_state state = {}; 179 state.ir_type = PIPE_SHADER_IR_TGSI; 180 state.prog = ureg_get_tokens(ureg, NULL); 181 182 void *cs = ctx->create_compute_state(ctx, &state); 183 ureg_destroy(ureg); 184 ureg_free_tokens(state.prog); 185 186 free(values); 187 return cs; 188} 189 190/* Create the compute shader that is used to collect the results. 191 * 192 * One compute grid with a single thread is launched for every query result 193 * buffer. 
The thread (optionally) reads a previous summary buffer, then
 * accumulates data from the query result buffer, and writes the result either
 * to a summary buffer to be consumed by the next grid invocation or to the
 * user-supplied buffer.
 *
 * Data layout:
 *
 * CONST
 *  0.x = end_offset
 *  0.y = result_stride
 *  0.z = result_count
 *  0.w = bit field:
 *          1: read previously accumulated values
 *          2: write accumulated values for chaining
 *          4: write result available
 *          8: convert result to boolean (0/1)
 *          16: only read one dword and use that as result
 *          32: apply timestamp conversion
 *          64: store full 64 bits result
 *          128: store signed 32 bits result
 *          256: SO_OVERFLOW mode: take the difference of two successive half-pairs
 *  1.x = fence_offset
 *  1.y = pair_stride
 *  1.z = pair_count
 *
 * BUFFER[0] = query result buffer
 * BUFFER[1] = previous summary buffer
 * BUFFER[2] = next summary buffer or user-supplied buffer
 */
void *si_create_query_result_cs(struct si_context *sctx)
{
   /* TEMP[0].xy = accumulated result so far
    * TEMP[0].z = result not available
    *
    * TEMP[1].x = current result index
    * TEMP[1].y = current pair index
    */
   static const char text_tmpl[] =
      "COMP\n"
      "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
      "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
      "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
      "DCL BUFFER[0]\n"
      "DCL BUFFER[1]\n"
      "DCL BUFFER[2]\n"
      "DCL CONST[0][0..1]\n"
      "DCL TEMP[0..5]\n"
      "IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n"
      "IMM[1] UINT32 {1, 2, 4, 8}\n"
      "IMM[2] UINT32 {16, 32, 64, 128}\n"
      "IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */
      "IMM[4] UINT32 {256, 0, 0, 0}\n"

      /* Mode 16: single-dword result. */
      "AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n"
      "UIF TEMP[5]\n"
      /* Check result availability. */
      "LOAD TEMP[1].x, BUFFER[0], CONST[0][1].xxxx\n"
      "ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n"
      "MOV TEMP[1], TEMP[0].zzzz\n"
      "NOT TEMP[0].z, TEMP[0].zzzz\n"

      /* Load result if available. */
      "UIF TEMP[1]\n"
      "LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n"
      "ENDIF\n"
      "ELSE\n"
      /* Load previously accumulated result if requested. */
      "MOV TEMP[0], IMM[0].xxxx\n"
      "AND TEMP[4], CONST[0][0].wwww, IMM[1].xxxx\n"
      "UIF TEMP[4]\n"
      "LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n"
      "ENDIF\n"

      /* Outer loop over results. */
      "MOV TEMP[1].x, IMM[0].xxxx\n"
      "BGNLOOP\n"
      /* Break if accumulated result so far is not available. */
      "UIF TEMP[0].zzzz\n"
      "BRK\n"
      "ENDIF\n"

      /* Break if result_index >= result_count. */
      "USGE TEMP[5], TEMP[1].xxxx, CONST[0][0].zzzz\n"
      "UIF TEMP[5]\n"
      "BRK\n"
      "ENDIF\n"

      /* Load fence and check result availability */
      "UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n"
      "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
      "ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n"
      "NOT TEMP[0].z, TEMP[0].zzzz\n"
      "UIF TEMP[0].zzzz\n"
      "BRK\n"
      "ENDIF\n"

      /* Inner loop over start/end pairs. */
      "MOV TEMP[1].y, IMM[0].xxxx\n"
      "BGNLOOP\n"
      /* Load start and end. */
      "UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n"
      "UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n"
      "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"

      "UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n"
      "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"

      /* 64-bit end - start. */
      "U64ADD TEMP[4].xy, TEMP[3], -TEMP[2]\n"

      "AND TEMP[5].z, CONST[0][0].wwww, IMM[4].xxxx\n"
      "UIF TEMP[5].zzzz\n"
      /* Load second start/end half-pair and
       * take the difference
       */
      "UADD TEMP[5].xy, TEMP[5], IMM[1].wwww\n"
      "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
      "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"

      "U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n"
      "U64ADD TEMP[4].xy, TEMP[4], -TEMP[3]\n"
      "ENDIF\n"

      /* Accumulate into TEMP[0].xy. */
      "U64ADD TEMP[0].xy, TEMP[0], TEMP[4]\n"

      /* Increment pair index */
      "UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n"
      "USGE TEMP[5], TEMP[1].yyyy, CONST[0][1].zzzz\n"
      "UIF TEMP[5]\n"
      "BRK\n"
      "ENDIF\n"
      "ENDLOOP\n"

      /* Increment result index */
      "UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n"
      "ENDLOOP\n"
      "ENDIF\n"

      "AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n"
      "UIF TEMP[4]\n"
      /* Store accumulated data for chaining. */
      "STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n"
      "ELSE\n"
      "AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n"
      "UIF TEMP[4]\n"
      /* Store result availability. */
      "NOT TEMP[0].z, TEMP[0]\n"
      "AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n"
      "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n"

      "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
      "UIF TEMP[4]\n"
      "STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n"
      "ENDIF\n"
      "ELSE\n"
      /* Store result if it is available. */
      "NOT TEMP[4], TEMP[0].zzzz\n"
      "UIF TEMP[4]\n"
      /* Apply timestamp conversion */
      "AND TEMP[4], CONST[0][0].wwww, IMM[2].yyyy\n"
      "UIF TEMP[4]\n"
      "U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n"
      "U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n"
      "ENDIF\n"

      /* Convert to boolean */
      "AND TEMP[4], CONST[0][0].wwww, IMM[1].wwww\n"
      "UIF TEMP[4]\n"
      "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n"
      "AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n"
      "MOV TEMP[0].y, IMM[0].xxxx\n"
      "ENDIF\n"

      "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
      "UIF TEMP[4]\n"
      "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n"
      "ELSE\n"
      /* Clamping */
      "UIF TEMP[0].yyyy\n"
      "MOV TEMP[0].x, IMM[0].wwww\n"
      "ENDIF\n"

      "AND TEMP[4], CONST[0][0].wwww, IMM[2].wwww\n"
      "UIF TEMP[4]\n"
      "UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n"
      "ENDIF\n"

      "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
      "ENDIF\n"
      "ENDIF\n"
      "ENDIF\n"
      "ENDIF\n"

      "END\n";

   char text[sizeof(text_tmpl) + 32];
   struct tgsi_token tokens[1024];
   struct pipe_compute_state state = {};

   /* Hard code the frequency into the shader so that the backend can
    * use the full range of optimizations for divide-by-constant.
    */
   snprintf(text, sizeof(text), text_tmpl, sctx->screen->info.clock_crystal_freq);

   if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
      assert(false);
      return NULL;
   }

   state.ir_type = PIPE_SHADER_IR_TGSI;
   state.prog = tokens;

   return sctx->b.create_compute_state(&sctx->b, &state);
}

/* Compute shader that clears an 8x8 block per workgroup of a 2D_ARRAY image
 * with the color in CONST[0][1]; CONST[0][0].xyz is the destination offset.
 */
void *si_clear_render_target_shader(struct pipe_context *ctx)
{
   static const char text[] =
      "COMP\n"
      "PROPERTY CS_FIXED_BLOCK_WIDTH 8\n"
      "PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n"
      "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
      "DCL SV[0], THREAD_ID\n"
      "DCL SV[1], BLOCK_ID\n"
      "DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
      "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
      "DCL TEMP[0..3], LOCAL\n"
      "IMM[0] UINT32 {8, 1, 0, 0}\n"
      "MOV TEMP[0].xyz, CONST[0][0].xyzw\n"
      "UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n"
      "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n"
      "MOV TEMP[3].xyzw, CONST[0][1].xyzw\n"
      "STORE IMAGE[0], TEMP[2].xyzz, TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
      "END\n";

   struct tgsi_token tokens[1024];
   struct pipe_compute_state state = {0};

   if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
      assert(false);
      return NULL;
   }

   state.ir_type = PIPE_SHADER_IR_TGSI;
   state.prog = tokens;

   return ctx->create_compute_state(ctx, &state);
}

/* 1D_ARRAY variant of si_clear_render_target_shader: 64x1 block per
 * workgroup; x is the texel, y is the array layer.
 */
/* TODO: Didn't really test 1D_ARRAY */
void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx)
{
   static const char text[] =
      "COMP\n"
      "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"
      "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
      "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
      "DCL SV[0], THREAD_ID\n"
      "DCL SV[1], BLOCK_ID\n"
      "DCL IMAGE[0], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n"
      "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw
      "DCL TEMP[0..3], LOCAL\n"
      "IMM[0] UINT32 {64, 1, 0, 0}\n"
      "MOV TEMP[0].xy, CONST[0][0].xzzw\n"
      "UMAD TEMP[1].xy, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n"
      "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[0].xyzx\n"
      "MOV TEMP[3].xyzw, CONST[0][1].xyzw\n"
      "STORE IMAGE[0], TEMP[2].xyzz, TEMP[3], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n"
      "END\n";

   struct tgsi_token tokens[1024];
   struct pipe_compute_state state = {0};

   if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
      assert(false);
      return NULL;
   }

   state.ir_type = PIPE_SHADER_IR_TGSI;
   state.prog = tokens;

   return ctx->create_compute_state(ctx, &state);
}

/* Compute shader that writes a 12-byte (3-dword) pattern, taken from
 * user-data SGPRs, to consecutive 12-byte slots of BUFFER[0]; one slot per
 * thread, 64 threads per workgroup.
 */
void *si_clear_12bytes_buffer_shader(struct pipe_context *ctx)
{
   static const char text[] = "COMP\n"
                              "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n"
                              "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
                              "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
                              "PROPERTY CS_USER_DATA_COMPONENTS_AMD 3\n"
                              "DCL SV[0], THREAD_ID\n"
                              "DCL SV[1], BLOCK_ID\n"
                              "DCL SV[2], CS_USER_DATA_AMD\n"
                              "DCL BUFFER[0]\n"
                              "DCL TEMP[0..0]\n"
                              "IMM[0] UINT32 {64, 1, 12, 0}\n"
                              "UMAD TEMP[0].x, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n"
                              "UMUL TEMP[0].x, TEMP[0].xyzz, IMM[0].zzzz\n" // 12 bytes
                              "STORE BUFFER[0].xyz, TEMP[0].xxxx, SV[2].xyzz%s\n"
                              "END\n";
   char final_text[2048];
   struct tgsi_token tokens[1024];
   struct pipe_compute_state state = {0};

   /* Optionally bypass the L2 LRU for stores, matching the driver-wide
    * compute destination cache policy.
    */
   snprintf(final_text, sizeof(final_text), text,
            SI_COMPUTE_DST_CACHE_POLICY != L2_LRU ? ", STREAM_CACHE_POLICY" : "");

   if (!tgsi_text_translate(final_text, tokens, ARRAY_SIZE(tokens))) {
      assert(false);
      return NULL;
   }

   state.ir_type = PIPE_SHADER_IR_TGSI;
   state.prog = tokens;

   return ctx->create_compute_state(ctx, &state);
}

/* Load samples from the image, and copy them to the same image. This looks like
 * a no-op, but it's not. Loads use FMASK, while stores don't, so samples are
 * reordered to match expanded FMASK.
 *
 * After the shader finishes, FMASK should be cleared to identity.
 */
void *si_create_fmask_expand_cs(struct pipe_context *ctx, unsigned num_samples, bool is_array)
{
   enum tgsi_texture_type target = is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA;
   struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE);
   if (!ureg)
      return NULL;

   /* 8x8 texels per workgroup. */
   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 8);
   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 8);
   ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1);

   /* Compute the image coordinates. */
   struct ureg_src image = ureg_DECL_image(ureg, 0, target, 0, true, false);
   struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0);
   struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0);
   struct ureg_dst coord = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZW);
   ureg_UMAD(ureg, ureg_writemask(coord, TGSI_WRITEMASK_XY), ureg_swizzle(blk, 0, 1, 1, 1),
             ureg_imm2u(ureg, 8, 8), ureg_swizzle(tid, 0, 1, 1, 1));
   if (is_array) {
      /* Z = array layer, taken from the block ID. */
      ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_Z), ureg_scalar(blk, TGSI_SWIZZLE_Z));
   }

   /* Load samples, resolving FMASK. */
   struct ureg_dst sample[8];
   assert(num_samples <= ARRAY_SIZE(sample));

   for (unsigned i = 0; i < num_samples; i++) {
      sample[i] = ureg_DECL_temporary(ureg);

      /* W selects the sample index. */
      ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_W), ureg_imm1u(ureg, i));

      struct ureg_src srcs[] = {image, ureg_src(coord)};
      ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &sample[i], 1, srcs, 2, TGSI_MEMORY_RESTRICT, target,
                       0);
   }

   /* Store samples, ignoring FMASK. */
   for (unsigned i = 0; i < num_samples; i++) {
      ureg_MOV(ureg, ureg_writemask(coord, TGSI_WRITEMASK_W), ureg_imm1u(ureg, i));

      struct ureg_dst dst_image = ureg_dst(image);
      struct ureg_src srcs[] = {ureg_src(coord), ureg_src(sample[i])};
      ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst_image, 1, srcs, 2, TGSI_MEMORY_RESTRICT,
                       target, 0);
   }
   ureg_END(ureg);

   struct pipe_compute_state state = {};
   state.ir_type = PIPE_SHADER_IR_TGSI;
   state.prog = ureg_get_tokens(ureg, NULL);

   void *cs = ctx->create_compute_state(ctx, &state);
   ureg_destroy(ureg);
   return cs;
}

/* Create the compute shader that is used to collect the results of gfx10+
 * shader queries.
 *
 * One compute grid with a single thread is launched for every query result
 * buffer. The thread (optionally) reads a previous summary buffer, then
 * accumulates data from the query result buffer, and writes the result either
 * to a summary buffer to be consumed by the next grid invocation or to the
 * user-supplied buffer.
 *
 * Data layout:
 *
 * BUFFER[0] = query result buffer (layout is defined by gfx10_sh_query_buffer_mem)
 * BUFFER[1] = previous summary buffer
 * BUFFER[2] = next summary buffer or user-supplied buffer
 *
 * CONST
 *  0.x = config; the low 3 bits indicate the mode:
 *          0: sum up counts
 *          1: determine result availability and write it as a boolean
 *          2: SO_OVERFLOW
 *          3: SO_ANY_OVERFLOW
 *        the remaining bits form a bitfield:
 *          8: write result as a 64-bit value
 *  0.y = offset in bytes to counts or stream for SO_OVERFLOW mode
 *  0.z = chain bit field:
 *          1: have previous summary buffer
 *          2: write next summary buffer
 *  0.w = result_count
 */
void *gfx10_create_sh_query_result_cs(struct si_context *sctx)
{
   /* TEMP[0].x = accumulated result so far
    * TEMP[0].y = result missing
    * TEMP[0].z = whether we're in overflow mode
    */
   static const char text_tmpl[] = "COMP\n"
                                   "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
                                   "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
                                   "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
                                   "DCL BUFFER[0]\n"
                                   "DCL BUFFER[1]\n"
                                   "DCL BUFFER[2]\n"
                                   "DCL CONST[0][0..0]\n"
                                   "DCL TEMP[0..5]\n"
                                   "IMM[0] UINT32 {0, 7, 256, 4294967295}\n"
                                   "IMM[1] UINT32 {1, 2, 4, 8}\n"
                                   "IMM[2] UINT32 {16, 32, 64, 128}\n"

                                   /*
                                      acc_result = 0;
                                      acc_missing = 0;
                                      if (chain & 1) {
                                         acc_result = buffer[1][0];
                                         acc_missing = buffer[1][1];
                                      }
                                   */
                                   "MOV TEMP[0].xy, IMM[0].xxxx\n"
                                   "AND TEMP[5], CONST[0][0].zzzz, IMM[1].xxxx\n"
                                   "UIF TEMP[5]\n"
                                   "LOAD TEMP[0].xy, BUFFER[1], IMM[0].xxxx\n"
                                   "ENDIF\n"

                                   /*
                                      is_overflow (TEMP[0].z) = (config & 7) >= 2;
                                      result_remaining (TEMP[1].x) = (is_overflow && acc_result) ? 0 :
                                                                     result_count;
                                      base_offset (TEMP[1].y) = 0;
                                      for (;;) {
                                         if (!result_remaining)
                                            break;
                                         result_remaining--;
                                   */
                                   "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
                                   "USGE TEMP[0].z, TEMP[5].xxxx, IMM[1].yyyy\n"

                                   "AND TEMP[5].x, TEMP[0].zzzz, TEMP[0].xxxx\n"
                                   "UCMP TEMP[1].x, TEMP[5].xxxx, IMM[0].xxxx, CONST[0][0].wwww\n"
                                   "MOV TEMP[1].y, IMM[0].xxxx\n"

                                   "BGNLOOP\n"
                                   "USEQ TEMP[5], TEMP[1].xxxx, IMM[0].xxxx\n"
                                   "UIF TEMP[5]\n"
                                   "BRK\n"
                                   "ENDIF\n"
                                   "UADD TEMP[1].x, TEMP[1].xxxx, IMM[0].wwww\n"

                                   /*
                                      fence = buffer[0]@(base_offset + sizeof(gfx10_sh_query_buffer_mem.stream));
                                      if (!fence) {
                                         acc_missing = ~0u;
                                         break;
                                      }
                                   */
                                   "UADD TEMP[5].x, TEMP[1].yyyy, IMM[2].wwww\n"
                                   "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
                                   "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n"
                                   "UIF TEMP[5]\n"
                                   "MOV TEMP[0].y, TEMP[5].xxxx\n"
                                   "BRK\n"
                                   "ENDIF\n"

                                   /*
                                      stream_offset (TEMP[2].x) = base_offset + offset;

                                      if (!(config & 7)) {
                                         acc_result += buffer[0]@stream_offset;
                                      }
                                   */
                                   "UADD TEMP[2].x, TEMP[1].yyyy, CONST[0][0].yyyy\n"

                                   "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
                                   "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n"
                                   "UIF TEMP[5]\n"
                                   "LOAD TEMP[5].x, BUFFER[0], TEMP[2].xxxx\n"
                                   "UADD TEMP[0].x, TEMP[0].xxxx, TEMP[5].xxxx\n"
                                   "ENDIF\n"

                                   /*
                                      if ((config & 7) >= 2) {
                                         count (TEMP[2].y) = (config & 1) ? 4 : 1;
                                   */
                                   "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
                                   "USGE TEMP[5], TEMP[5].xxxx, IMM[1].yyyy\n"
                                   "UIF TEMP[5]\n"
                                   "AND TEMP[5].x, CONST[0][0].xxxx, IMM[1].xxxx\n"
                                   "UCMP TEMP[2].y, TEMP[5].xxxx, IMM[1].zzzz, IMM[1].xxxx\n"

                                   /*
                                      do {
                                         generated = buffer[0]@(stream_offset + 2 * sizeof(uint64_t));
                                         emitted = buffer[0]@(stream_offset + 3 * sizeof(uint64_t));
                                         if (generated != emitted) {
                                            acc_result = 1;
                                            result_remaining = 0;
                                            break;
                                         }

                                         stream_offset += sizeof(gfx10_sh_query_buffer_mem.stream[0]);
                                      } while (--count);
                                   */
                                   "BGNLOOP\n"
                                   "UADD TEMP[5].x, TEMP[2].xxxx, IMM[2].xxxx\n"
                                   "LOAD TEMP[4].xyzw, BUFFER[0], TEMP[5].xxxx\n"
                                   "USNE TEMP[5], TEMP[4].xyxy, TEMP[4].zwzw\n"
                                   "UIF TEMP[5]\n"
                                   "MOV TEMP[0].x, IMM[1].xxxx\n"
                                   "MOV TEMP[1].y, IMM[0].xxxx\n"
                                   "BRK\n"
                                   "ENDIF\n"

                                   "UADD TEMP[2].y, TEMP[2].yyyy, IMM[0].wwww\n"
                                   "USEQ TEMP[5], TEMP[2].yyyy, IMM[0].xxxx\n"
                                   "UIF TEMP[5]\n"
                                   "BRK\n"
                                   "ENDIF\n"
                                   "UADD TEMP[2].x, TEMP[2].xxxx, IMM[2].yyyy\n"
                                   "ENDLOOP\n"
                                   "ENDIF\n"

                                   /*
                                         base_offset += sizeof(gfx10_sh_query_buffer_mem);
                                      } // end outer loop
                                   */
                                   "UADD TEMP[1].y, TEMP[1].yyyy, IMM[0].zzzz\n"
                                   "ENDLOOP\n"

                                   /*
                                      if (chain & 2) {
                                         buffer[2][0] = acc_result;
                                         buffer[2][1] = acc_missing;
                                      } else {
                                   */
                                   "AND TEMP[5], CONST[0][0].zzzz, IMM[1].yyyy\n"
                                   "UIF TEMP[5]\n"
                                   "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0]\n"
                                   "ELSE\n"

                                   /*
                                      if ((config & 7) == 1) {
                                         acc_result = acc_missing ? 0 : 1;
                                         acc_missing = 0;
                                      }
                                   */
                                   "AND TEMP[5], CONST[0][0].xxxx, IMM[0].yyyy\n"
                                   "USEQ TEMP[5], TEMP[5].xxxx, IMM[1].xxxx\n"
                                   "UIF TEMP[5]\n"
                                   "UCMP TEMP[0].x, TEMP[0].yyyy, IMM[0].xxxx, IMM[1].xxxx\n"
                                   "MOV TEMP[0].y, IMM[0].xxxx\n"
                                   "ENDIF\n"

                                   /*
                                      if (!acc_missing) {
                                         buffer[2][0] = acc_result;
                                         if (config & 8)
                                            buffer[2][1] = 0;
                                      }
                                   */
                                   "USEQ TEMP[5], TEMP[0].yyyy, IMM[0].xxxx\n"
                                   "UIF TEMP[5]\n"
                                   "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"

                                   "AND TEMP[5], CONST[0][0].xxxx, IMM[1].wwww\n"
                                   "UIF TEMP[5]\n"
                                   "STORE BUFFER[2].x, IMM[1].zzzz, TEMP[0].yyyy\n"
                                   "ENDIF\n"
                                   "ENDIF\n"
                                   "ENDIF\n"

                                   "END\n";

   struct tgsi_token tokens[1024];
   struct pipe_compute_state state = {};

   if (!tgsi_text_translate(text_tmpl, tokens, ARRAY_SIZE(tokens))) {
      assert(false);
      return NULL;
   }

   state.ir_type = PIPE_SHADER_IR_TGSI;
   state.prog = tokens;

   return sctx->b.create_compute_state(&sctx->b, &state);
}