1/* 2 * Copyright © 2012 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include "blorp_nir_builder.h" 25#include "compiler/nir/nir_format_convert.h" 26 27#include "blorp_priv.h" 28#include "dev/intel_debug.h" 29 30#include "util/format_rgb9e5.h" 31/* header-only include needed for _mesa_unorm_to_float and friends. 
*/ 32#include "mesa/main/format_utils.h" 33#include "util/u_math.h" 34 35#define FILE_DEBUG_FLAG DEBUG_BLORP 36 37static const bool split_blorp_blit_debug = false; 38 39struct brw_blorp_blit_vars { 40 /* Input values from brw_blorp_wm_inputs */ 41 nir_variable *v_bounds_rect; 42 nir_variable *v_rect_grid; 43 nir_variable *v_coord_transform; 44 nir_variable *v_src_z; 45 nir_variable *v_src_offset; 46 nir_variable *v_dst_offset; 47 nir_variable *v_src_inv_size; 48}; 49 50static void 51brw_blorp_blit_vars_init(nir_builder *b, struct brw_blorp_blit_vars *v, 52 const struct brw_blorp_blit_prog_key *key) 53{ 54#define LOAD_INPUT(name, type)\ 55 v->v_##name = BLORP_CREATE_NIR_INPUT(b->shader, name, type); 56 57 LOAD_INPUT(bounds_rect, glsl_vec4_type()) 58 LOAD_INPUT(rect_grid, glsl_vec4_type()) 59 LOAD_INPUT(coord_transform, glsl_vec4_type()) 60 LOAD_INPUT(src_z, glsl_float_type()) 61 LOAD_INPUT(src_offset, glsl_vector_type(GLSL_TYPE_UINT, 2)) 62 LOAD_INPUT(dst_offset, glsl_vector_type(GLSL_TYPE_UINT, 2)) 63 LOAD_INPUT(src_inv_size, glsl_vector_type(GLSL_TYPE_FLOAT, 2)) 64 65#undef LOAD_INPUT 66} 67 68static nir_ssa_def * 69blorp_blit_get_frag_coords(nir_builder *b, 70 const struct brw_blorp_blit_prog_key *key, 71 struct brw_blorp_blit_vars *v) 72{ 73 nir_ssa_def *coord = nir_f2i32(b, nir_load_frag_coord(b)); 74 75 /* Account for destination surface intratile offset 76 * 77 * Transformation parameters giving translation from destination to source 78 * coordinates don't take into account possible intra-tile destination 79 * offset. Therefore it has to be first subtracted from the incoming 80 * coordinates. Vertices are set up based on coordinates containing the 81 * intra-tile offset. 
82 */ 83 if (key->need_dst_offset) 84 coord = nir_isub(b, coord, nir_load_var(b, v->v_dst_offset)); 85 86 if (key->persample_msaa_dispatch) { 87 b->shader->info.fs.uses_sample_shading = true; 88 return nir_vec3(b, nir_channel(b, coord, 0), nir_channel(b, coord, 1), 89 nir_load_sample_id(b)); 90 } else { 91 return nir_vec2(b, nir_channel(b, coord, 0), nir_channel(b, coord, 1)); 92 } 93} 94 95static nir_ssa_def * 96blorp_blit_get_cs_dst_coords(nir_builder *b, 97 const struct brw_blorp_blit_prog_key *key, 98 struct brw_blorp_blit_vars *v) 99{ 100 nir_ssa_def *coord = nir_load_global_invocation_id(b, 32); 101 102 /* Account for destination surface intratile offset 103 * 104 * Transformation parameters giving translation from destination to source 105 * coordinates don't take into account possible intra-tile destination 106 * offset. Therefore it has to be first subtracted from the incoming 107 * coordinates. Vertices are set up based on coordinates containing the 108 * intra-tile offset. 109 */ 110 if (key->need_dst_offset) 111 coord = nir_isub(b, coord, nir_load_var(b, v->v_dst_offset)); 112 113 assert(!key->persample_msaa_dispatch); 114 return nir_channels(b, coord, 0x3); 115} 116 117/** 118 * Emit code to translate from destination (X, Y) coordinates to source (X, Y) 119 * coordinates. 
120 */ 121static nir_ssa_def * 122blorp_blit_apply_transform(nir_builder *b, nir_ssa_def *src_pos, 123 struct brw_blorp_blit_vars *v) 124{ 125 nir_ssa_def *coord_transform = nir_load_var(b, v->v_coord_transform); 126 127 nir_ssa_def *offset = nir_vec2(b, nir_channel(b, coord_transform, 1), 128 nir_channel(b, coord_transform, 3)); 129 nir_ssa_def *mul = nir_vec2(b, nir_channel(b, coord_transform, 0), 130 nir_channel(b, coord_transform, 2)); 131 132 return nir_fadd(b, nir_fmul(b, src_pos, mul), offset); 133} 134 135static nir_tex_instr * 136blorp_create_nir_tex_instr(nir_builder *b, struct brw_blorp_blit_vars *v, 137 nir_texop op, nir_ssa_def *pos, unsigned num_srcs, 138 nir_alu_type dst_type) 139{ 140 nir_tex_instr *tex = nir_tex_instr_create(b->shader, num_srcs); 141 142 tex->op = op; 143 144 tex->dest_type = dst_type | 32; 145 tex->is_array = false; 146 tex->is_shadow = false; 147 148 tex->texture_index = BLORP_TEXTURE_BT_INDEX; 149 tex->sampler_index = BLORP_SAMPLER_INDEX; 150 151 /* To properly handle 3-D and 2-D array textures, we pull the Z component 152 * from an input. TODO: This is a bit magic; we should probably make this 153 * more explicit in the future. 
154 */ 155 assert(pos->num_components >= 2); 156 if (op == nir_texop_txf || op == nir_texop_txf_ms || 157 op == nir_texop_txf_ms_mcs_intel) { 158 pos = nir_vec3(b, nir_channel(b, pos, 0), nir_channel(b, pos, 1), 159 nir_f2i32(b, nir_load_var(b, v->v_src_z))); 160 } else { 161 pos = nir_vec3(b, nir_channel(b, pos, 0), nir_channel(b, pos, 1), 162 nir_load_var(b, v->v_src_z)); 163 } 164 165 tex->src[0].src_type = nir_tex_src_coord; 166 tex->src[0].src = nir_src_for_ssa(pos); 167 tex->coord_components = 3; 168 169 nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL); 170 171 return tex; 172} 173 174static nir_ssa_def * 175blorp_nir_tex(nir_builder *b, struct brw_blorp_blit_vars *v, 176 const struct brw_blorp_blit_prog_key *key, nir_ssa_def *pos) 177{ 178 if (key->need_src_offset) 179 pos = nir_fadd(b, pos, nir_i2f32(b, nir_load_var(b, v->v_src_offset))); 180 181 /* If the sampler requires normalized coordinates, we need to compensate. */ 182 if (key->src_coords_normalized) 183 pos = nir_fmul(b, pos, nir_load_var(b, v->v_src_inv_size)); 184 185 nir_tex_instr *tex = 186 blorp_create_nir_tex_instr(b, v, nir_texop_txl, pos, 2, 187 key->texture_data_type); 188 189 assert(pos->num_components == 2); 190 tex->sampler_dim = GLSL_SAMPLER_DIM_2D; 191 tex->src[1].src_type = nir_tex_src_lod; 192 tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0)); 193 194 nir_builder_instr_insert(b, &tex->instr); 195 196 return &tex->dest.ssa; 197} 198 199static nir_ssa_def * 200blorp_nir_txf(nir_builder *b, struct brw_blorp_blit_vars *v, 201 nir_ssa_def *pos, nir_alu_type dst_type) 202{ 203 nir_tex_instr *tex = 204 blorp_create_nir_tex_instr(b, v, nir_texop_txf, pos, 2, dst_type); 205 206 tex->sampler_dim = GLSL_SAMPLER_DIM_3D; 207 tex->src[1].src_type = nir_tex_src_lod; 208 tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0)); 209 210 nir_builder_instr_insert(b, &tex->instr); 211 212 return &tex->dest.ssa; 213} 214 215static nir_ssa_def * 216blorp_nir_txf_ms(nir_builder *b, struct 
brw_blorp_blit_vars *v, 217 nir_ssa_def *pos, nir_ssa_def *mcs, nir_alu_type dst_type) 218{ 219 nir_tex_instr *tex = 220 blorp_create_nir_tex_instr(b, v, nir_texop_txf_ms, pos, 221 mcs != NULL ? 3 : 2, dst_type); 222 223 tex->sampler_dim = GLSL_SAMPLER_DIM_MS; 224 225 tex->src[1].src_type = nir_tex_src_ms_index; 226 if (pos->num_components == 2) { 227 tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0)); 228 } else { 229 assert(pos->num_components == 3); 230 tex->src[1].src = nir_src_for_ssa(nir_channel(b, pos, 2)); 231 } 232 233 if (mcs) { 234 tex->src[2].src_type = nir_tex_src_ms_mcs_intel; 235 tex->src[2].src = nir_src_for_ssa(mcs); 236 } 237 238 nir_builder_instr_insert(b, &tex->instr); 239 240 return &tex->dest.ssa; 241} 242 243static nir_ssa_def * 244blorp_blit_txf_ms_mcs(nir_builder *b, struct brw_blorp_blit_vars *v, 245 nir_ssa_def *pos) 246{ 247 nir_tex_instr *tex = 248 blorp_create_nir_tex_instr(b, v, nir_texop_txf_ms_mcs_intel, 249 pos, 1, nir_type_int); 250 251 tex->sampler_dim = GLSL_SAMPLER_DIM_MS; 252 253 nir_builder_instr_insert(b, &tex->instr); 254 255 return &tex->dest.ssa; 256} 257 258/** 259 * Emit code to compensate for the difference between Y and W tiling. 260 * 261 * This code modifies the X and Y coordinates according to the formula: 262 * 263 * (X', Y', S') = detile(W-MAJOR, tile(Y-MAJOR, X, Y, S)) 264 * 265 * (See brw_blorp_build_nir_shader). 266 */ 267static inline nir_ssa_def * 268blorp_nir_retile_y_to_w(nir_builder *b, nir_ssa_def *pos) 269{ 270 assert(pos->num_components == 2); 271 nir_ssa_def *x_Y = nir_channel(b, pos, 0); 272 nir_ssa_def *y_Y = nir_channel(b, pos, 1); 273 274 /* Given X and Y coordinates that describe an address using Y tiling, 275 * translate to the X and Y coordinates that describe the same address 276 * using W tiling. 
277 * 278 * If we break down the low order bits of X and Y, using a 279 * single letter to represent each low-order bit: 280 * 281 * X = A << 7 | 0bBCDEFGH 282 * Y = J << 5 | 0bKLMNP (1) 283 * 284 * Then we can apply the Y tiling formula to see the memory offset being 285 * addressed: 286 * 287 * offset = (J * tile_pitch + A) << 12 | 0bBCDKLMNPEFGH (2) 288 * 289 * If we apply the W detiling formula to this memory location, that the 290 * corresponding X' and Y' coordinates are: 291 * 292 * X' = A << 6 | 0bBCDPFH (3) 293 * Y' = J << 6 | 0bKLMNEG 294 * 295 * Combining (1) and (3), we see that to transform (X, Y) to (X', Y'), 296 * we need to make the following computation: 297 * 298 * X' = (X & ~0b1011) >> 1 | (Y & 0b1) << 2 | X & 0b1 (4) 299 * Y' = (Y & ~0b1) << 1 | (X & 0b1000) >> 2 | (X & 0b10) >> 1 300 */ 301 nir_ssa_def *x_W = nir_imm_int(b, 0); 302 x_W = nir_mask_shift_or(b, x_W, x_Y, 0xfffffff4, -1); 303 x_W = nir_mask_shift_or(b, x_W, y_Y, 0x1, 2); 304 x_W = nir_mask_shift_or(b, x_W, x_Y, 0x1, 0); 305 306 nir_ssa_def *y_W = nir_imm_int(b, 0); 307 y_W = nir_mask_shift_or(b, y_W, y_Y, 0xfffffffe, 1); 308 y_W = nir_mask_shift_or(b, y_W, x_Y, 0x8, -2); 309 y_W = nir_mask_shift_or(b, y_W, x_Y, 0x2, -1); 310 311 return nir_vec2(b, x_W, y_W); 312} 313 314/** 315 * Emit code to compensate for the difference between Y and W tiling. 316 * 317 * This code modifies the X and Y coordinates according to the formula: 318 * 319 * (X', Y', S') = detile(Y-MAJOR, tile(W-MAJOR, X, Y, S)) 320 * 321 * (See brw_blorp_build_nir_shader). 
322 */ 323static inline nir_ssa_def * 324blorp_nir_retile_w_to_y(nir_builder *b, nir_ssa_def *pos) 325{ 326 assert(pos->num_components == 2); 327 nir_ssa_def *x_W = nir_channel(b, pos, 0); 328 nir_ssa_def *y_W = nir_channel(b, pos, 1); 329 330 /* Applying the same logic as above, but in reverse, we obtain the 331 * formulas: 332 * 333 * X' = (X & ~0b101) << 1 | (Y & 0b10) << 2 | (Y & 0b1) << 1 | X & 0b1 334 * Y' = (Y & ~0b11) >> 1 | (X & 0b100) >> 2 335 */ 336 nir_ssa_def *x_Y = nir_imm_int(b, 0); 337 x_Y = nir_mask_shift_or(b, x_Y, x_W, 0xfffffffa, 1); 338 x_Y = nir_mask_shift_or(b, x_Y, y_W, 0x2, 2); 339 x_Y = nir_mask_shift_or(b, x_Y, y_W, 0x1, 1); 340 x_Y = nir_mask_shift_or(b, x_Y, x_W, 0x1, 0); 341 342 nir_ssa_def *y_Y = nir_imm_int(b, 0); 343 y_Y = nir_mask_shift_or(b, y_Y, y_W, 0xfffffffc, -1); 344 y_Y = nir_mask_shift_or(b, y_Y, x_W, 0x4, -2); 345 346 return nir_vec2(b, x_Y, y_Y); 347} 348 349/** 350 * Emit code to compensate for the difference between MSAA and non-MSAA 351 * surfaces. 352 * 353 * This code modifies the X and Y coordinates according to the formula: 354 * 355 * (X', Y', S') = encode_msaa(num_samples, IMS, X, Y, S) 356 * 357 * (See brw_blorp_blit_program). 358 */ 359static inline nir_ssa_def * 360blorp_nir_encode_msaa(nir_builder *b, nir_ssa_def *pos, 361 unsigned num_samples, enum isl_msaa_layout layout) 362{ 363 assert(pos->num_components == 2 || pos->num_components == 3); 364 365 switch (layout) { 366 case ISL_MSAA_LAYOUT_NONE: 367 assert(pos->num_components == 2); 368 return pos; 369 case ISL_MSAA_LAYOUT_ARRAY: 370 /* No translation needed */ 371 return pos; 372 case ISL_MSAA_LAYOUT_INTERLEAVED: { 373 nir_ssa_def *x_in = nir_channel(b, pos, 0); 374 nir_ssa_def *y_in = nir_channel(b, pos, 1); 375 nir_ssa_def *s_in = pos->num_components == 2 ? 
nir_imm_int(b, 0) : 376 nir_channel(b, pos, 2); 377 378 nir_ssa_def *x_out = nir_imm_int(b, 0); 379 nir_ssa_def *y_out = nir_imm_int(b, 0); 380 switch (num_samples) { 381 case 2: 382 case 4: 383 /* encode_msaa(2, IMS, X, Y, S) = (X', Y', 0) 384 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1) 385 * Y' = Y 386 * 387 * encode_msaa(4, IMS, X, Y, S) = (X', Y', 0) 388 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1) 389 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1) 390 */ 391 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 1); 392 x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1); 393 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0); 394 if (num_samples == 2) { 395 y_out = y_in; 396 } else { 397 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 1); 398 y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0); 399 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0); 400 } 401 break; 402 403 case 8: 404 /* encode_msaa(8, IMS, X, Y, S) = (X', Y', 0) 405 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1 406 * | (X & 0b1) 407 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1) 408 */ 409 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 2); 410 x_out = nir_mask_shift_or(b, x_out, s_in, 0x4, 0); 411 x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1); 412 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0); 413 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 1); 414 y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0); 415 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0); 416 break; 417 418 case 16: 419 /* encode_msaa(16, IMS, X, Y, S) = (X', Y', 0) 420 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1 421 * | (X & 0b1) 422 * Y' = (Y & ~0b1) << 2 | (S & 0b1000) >> 1 (S & 0b10) 423 * | (Y & 0b1) 424 */ 425 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 2); 426 x_out = nir_mask_shift_or(b, x_out, s_in, 0x4, 0); 427 x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1); 428 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0); 429 y_out 
= nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 2); 430 y_out = nir_mask_shift_or(b, y_out, s_in, 0x8, -1); 431 y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0); 432 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0); 433 break; 434 435 default: 436 unreachable("Invalid number of samples for IMS layout"); 437 } 438 439 return nir_vec2(b, x_out, y_out); 440 } 441 442 default: 443 unreachable("Invalid MSAA layout"); 444 } 445} 446 447/** 448 * Emit code to compensate for the difference between MSAA and non-MSAA 449 * surfaces. 450 * 451 * This code modifies the X and Y coordinates according to the formula: 452 * 453 * (X', Y', S) = decode_msaa(num_samples, IMS, X, Y, S) 454 * 455 * (See brw_blorp_blit_program). 456 */ 457static inline nir_ssa_def * 458blorp_nir_decode_msaa(nir_builder *b, nir_ssa_def *pos, 459 unsigned num_samples, enum isl_msaa_layout layout) 460{ 461 assert(pos->num_components == 2 || pos->num_components == 3); 462 463 switch (layout) { 464 case ISL_MSAA_LAYOUT_NONE: 465 /* No translation necessary, and S should already be zero. */ 466 assert(pos->num_components == 2); 467 return pos; 468 case ISL_MSAA_LAYOUT_ARRAY: 469 /* No translation necessary. 
*/ 470 return pos; 471 case ISL_MSAA_LAYOUT_INTERLEAVED: { 472 assert(pos->num_components == 2); 473 474 nir_ssa_def *x_in = nir_channel(b, pos, 0); 475 nir_ssa_def *y_in = nir_channel(b, pos, 1); 476 477 nir_ssa_def *x_out = nir_imm_int(b, 0); 478 nir_ssa_def *y_out = nir_imm_int(b, 0); 479 nir_ssa_def *s_out = nir_imm_int(b, 0); 480 switch (num_samples) { 481 case 2: 482 case 4: 483 /* decode_msaa(2, IMS, X, Y, 0) = (X', Y', S) 484 * where X' = (X & ~0b11) >> 1 | (X & 0b1) 485 * S = (X & 0b10) >> 1 486 * 487 * decode_msaa(4, IMS, X, Y, 0) = (X', Y', S) 488 * where X' = (X & ~0b11) >> 1 | (X & 0b1) 489 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1) 490 * S = (Y & 0b10) | (X & 0b10) >> 1 491 */ 492 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffc, -1); 493 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0); 494 if (num_samples == 2) { 495 y_out = y_in; 496 s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1); 497 } else { 498 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffc, -1); 499 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0); 500 s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1); 501 s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0); 502 } 503 break; 504 505 case 8: 506 /* decode_msaa(8, IMS, X, Y, 0) = (X', Y', S) 507 * where X' = (X & ~0b111) >> 2 | (X & 0b1) 508 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1) 509 * S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1 510 */ 511 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffff8, -2); 512 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0); 513 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffc, -1); 514 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0); 515 s_out = nir_mask_shift_or(b, s_out, x_in, 0x4, 0); 516 s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0); 517 s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1); 518 break; 519 520 case 16: 521 /* decode_msaa(16, IMS, X, Y, 0) = (X', Y', S) 522 * where X' = (X & ~0b111) >> 2 | (X & 0b1) 523 * Y' = (Y & ~0b111) >> 2 | (Y & 0b1) 524 * S = (Y & 0b100) << 1 | (X & 0b100) | 
525 * (Y & 0b10) | (X & 0b10) >> 1 526 */ 527 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffff8, -2); 528 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0); 529 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffff8, -2); 530 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0); 531 s_out = nir_mask_shift_or(b, s_out, y_in, 0x4, 1); 532 s_out = nir_mask_shift_or(b, s_out, x_in, 0x4, 0); 533 s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0); 534 s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1); 535 break; 536 537 default: 538 unreachable("Invalid number of samples for IMS layout"); 539 } 540 541 return nir_vec3(b, x_out, y_out, s_out); 542 } 543 544 default: 545 unreachable("Invalid MSAA layout"); 546 } 547} 548 549/** 550 * Count the number of trailing 1 bits in the given value. For example: 551 * 552 * count_trailing_one_bits(0) == 0 553 * count_trailing_one_bits(7) == 3 554 * count_trailing_one_bits(11) == 2 555 */ 556static inline int count_trailing_one_bits(unsigned value) 557{ 558#ifdef HAVE___BUILTIN_CTZ 559 return __builtin_ctz(~value); 560#else 561 return util_bitcount(value & ~(value + 1)); 562#endif 563} 564 565static nir_ssa_def * 566blorp_nir_combine_samples(nir_builder *b, struct brw_blorp_blit_vars *v, 567 nir_ssa_def *pos, unsigned tex_samples, 568 enum isl_aux_usage tex_aux_usage, 569 nir_alu_type dst_type, 570 enum blorp_filter filter) 571{ 572 nir_variable *color = 573 nir_local_variable_create(b->impl, glsl_vec4_type(), "color"); 574 575 nir_ssa_def *mcs = NULL; 576 if (isl_aux_usage_has_mcs(tex_aux_usage)) 577 mcs = blorp_blit_txf_ms_mcs(b, v, pos); 578 579 nir_op combine_op; 580 switch (filter) { 581 case BLORP_FILTER_AVERAGE: 582 assert(dst_type == nir_type_float); 583 combine_op = nir_op_fadd; 584 break; 585 586 case BLORP_FILTER_MIN_SAMPLE: 587 switch (dst_type) { 588 case nir_type_int: combine_op = nir_op_imin; break; 589 case nir_type_uint: combine_op = nir_op_umin; break; 590 case nir_type_float: combine_op = nir_op_fmin; break; 591 
default: unreachable("Invalid dst_type"); 592 } 593 break; 594 595 case BLORP_FILTER_MAX_SAMPLE: 596 switch (dst_type) { 597 case nir_type_int: combine_op = nir_op_imax; break; 598 case nir_type_uint: combine_op = nir_op_umax; break; 599 case nir_type_float: combine_op = nir_op_fmax; break; 600 default: unreachable("Invalid dst_type"); 601 } 602 break; 603 604 default: 605 unreachable("Invalid filter"); 606 } 607 608 /* If true, we inserted an if statement that we need to pop at at the end. 609 */ 610 bool inserted_if = false; 611 612 /* We add together samples using a binary tree structure, e.g. for 4x MSAA: 613 * 614 * result = ((sample[0] + sample[1]) + (sample[2] + sample[3])) / 4 615 * 616 * This ensures that when all samples have the same value, no numerical 617 * precision is lost, since each addition operation always adds two equal 618 * values, and summing two equal floating point values does not lose 619 * precision. 620 * 621 * We perform this computation by treating the texture_data array as a 622 * stack and performing the following operations: 623 * 624 * - push sample 0 onto stack 625 * - push sample 1 onto stack 626 * - add top two stack entries 627 * - push sample 2 onto stack 628 * - push sample 3 onto stack 629 * - add top two stack entries 630 * - add top two stack entries 631 * - divide top stack entry by 4 632 * 633 * Note that after pushing sample i onto the stack, the number of add 634 * operations we do is equal to the number of trailing 1 bits in i. This 635 * works provided the total number of samples is a power of two, which it 636 * always is for i965. 637 * 638 * For integer formats, we replace the add operations with average 639 * operations and skip the final division. 
640 */ 641 nir_ssa_def *texture_data[5]; 642 texture_data[0] = NULL; /* Avoid maybe-uninitialized warning with GCC 10 */ 643 unsigned stack_depth = 0; 644 for (unsigned i = 0; i < tex_samples; ++i) { 645 assert(stack_depth == util_bitcount(i)); /* Loop invariant */ 646 647 /* Push sample i onto the stack */ 648 assert(stack_depth < ARRAY_SIZE(texture_data)); 649 650 nir_ssa_def *ms_pos = nir_vec3(b, nir_channel(b, pos, 0), 651 nir_channel(b, pos, 1), 652 nir_imm_int(b, i)); 653 texture_data[stack_depth++] = blorp_nir_txf_ms(b, v, ms_pos, mcs, dst_type); 654 655 if (i == 0 && isl_aux_usage_has_mcs(tex_aux_usage)) { 656 /* The Ivy Bridge PRM, Vol4 Part1 p27 (Multisample Control Surface) 657 * suggests an optimization: 658 * 659 * "A simple optimization with probable large return in 660 * performance is to compare the MCS value to zero (indicating 661 * all samples are on sample slice 0), and sample only from 662 * sample slice 0 using ld2dss if MCS is zero." 663 * 664 * Note that in the case where the MCS value is zero, sampling from 665 * sample slice 0 using ld2dss and sampling from sample 0 using 666 * ld2dms are equivalent (since all samples are on sample slice 0). 667 * Since we have already sampled from sample 0, all we need to do is 668 * skip the remaining fetches and averaging if MCS is zero. 669 * 670 * It's also trivial to detect when the MCS has the magic clear color 671 * value. In this case, the txf we did on sample 0 will return the 672 * clear color and we can skip the remaining fetches just like we do 673 * when MCS == 0. 
674 */ 675 nir_ssa_def *mcs_zero = nir_ieq_imm(b, nir_channel(b, mcs, 0), 0); 676 if (tex_samples == 16) { 677 mcs_zero = nir_iand(b, mcs_zero, 678 nir_ieq_imm(b, nir_channel(b, mcs, 1), 0)); 679 } 680 nir_ssa_def *mcs_clear = 681 blorp_nir_mcs_is_clear_color(b, mcs, tex_samples); 682 683 nir_push_if(b, nir_ior(b, mcs_zero, mcs_clear)); 684 nir_store_var(b, color, texture_data[0], 0xf); 685 686 nir_push_else(b, NULL); 687 inserted_if = true; 688 } 689 690 for (int j = 0; j < count_trailing_one_bits(i); j++) { 691 assert(stack_depth >= 2); 692 --stack_depth; 693 694 texture_data[stack_depth - 1] = 695 nir_build_alu(b, combine_op, 696 texture_data[stack_depth - 1], 697 texture_data[stack_depth], 698 NULL, NULL); 699 } 700 } 701 702 /* We should have just 1 sample on the stack now. */ 703 assert(stack_depth == 1); 704 705 if (filter == BLORP_FILTER_AVERAGE) { 706 assert(dst_type == nir_type_float); 707 texture_data[0] = nir_fmul(b, texture_data[0], 708 nir_imm_float(b, 1.0 / tex_samples)); 709 } 710 711 nir_store_var(b, color, texture_data[0], 0xf); 712 713 if (inserted_if) 714 nir_pop_if(b, NULL); 715 716 return nir_load_var(b, color); 717} 718 719static nir_ssa_def * 720blorp_nir_manual_blend_bilinear(nir_builder *b, nir_ssa_def *pos, 721 unsigned tex_samples, 722 const struct brw_blorp_blit_prog_key *key, 723 struct brw_blorp_blit_vars *v) 724{ 725 nir_ssa_def *pos_xy = nir_channels(b, pos, 0x3); 726 nir_ssa_def *rect_grid = nir_load_var(b, v->v_rect_grid); 727 nir_ssa_def *scale = nir_imm_vec2(b, key->x_scale, key->y_scale); 728 729 /* Translate coordinates to lay out the samples in a rectangular grid 730 * roughly corresponding to sample locations. 731 */ 732 pos_xy = nir_fmul(b, pos_xy, scale); 733 /* Adjust coordinates so that integers represent pixel centers rather 734 * than pixel edges. 735 */ 736 pos_xy = nir_fadd(b, pos_xy, nir_imm_float(b, -0.5)); 737 /* Clamp the X, Y texture coordinates to properly handle the sampling of 738 * texels on texture edges. 
739 */ 740 pos_xy = nir_fmin(b, nir_fmax(b, pos_xy, nir_imm_float(b, 0.0)), 741 nir_vec2(b, nir_channel(b, rect_grid, 0), 742 nir_channel(b, rect_grid, 1))); 743 744 /* Store the fractional parts to be used as bilinear interpolation 745 * coefficients. 746 */ 747 nir_ssa_def *frac_xy = nir_ffract(b, pos_xy); 748 /* Round the float coordinates down to nearest integer */ 749 pos_xy = nir_fdiv(b, nir_ftrunc(b, pos_xy), scale); 750 751 nir_ssa_def *tex_data[4]; 752 for (unsigned i = 0; i < 4; ++i) { 753 float sample_off_x = (float)(i & 0x1) / key->x_scale; 754 float sample_off_y = (float)((i >> 1) & 0x1) / key->y_scale; 755 nir_ssa_def *sample_off = nir_imm_vec2(b, sample_off_x, sample_off_y); 756 757 nir_ssa_def *sample_coords = nir_fadd(b, pos_xy, sample_off); 758 nir_ssa_def *sample_coords_int = nir_f2i32(b, sample_coords); 759 760 /* The MCS value we fetch has to match up with the pixel that we're 761 * sampling from. Since we sample from different pixels in each 762 * iteration of this "for" loop, the call to mcs_fetch() should be 763 * here inside the loop after computing the pixel coordinates. 764 */ 765 nir_ssa_def *mcs = NULL; 766 if (isl_aux_usage_has_mcs(key->tex_aux_usage)) 767 mcs = blorp_blit_txf_ms_mcs(b, v, sample_coords_int); 768 769 /* Compute sample index and map the sample index to a sample number. 770 * Sample index layout shows the numbering of slots in a rectangular 771 * grid of samples with in a pixel. Sample number layout shows the 772 * rectangular grid of samples roughly corresponding to the real sample 773 * locations with in a pixel. 
774 * 775 * In the case of 2x MSAA, the layout of sample indices is reversed from 776 * the layout of sample numbers: 777 * 778 * sample index layout : --------- sample number layout : --------- 779 * | 0 | 1 | | 1 | 0 | 780 * --------- --------- 781 * 782 * In case of 4x MSAA, layout of sample indices matches the layout of 783 * sample numbers: 784 * --------- 785 * | 0 | 1 | 786 * --------- 787 * | 2 | 3 | 788 * --------- 789 * 790 * In case of 8x MSAA the two layouts don't match. 791 * sample index layout : --------- sample number layout : --------- 792 * | 0 | 1 | | 3 | 7 | 793 * --------- --------- 794 * | 2 | 3 | | 5 | 0 | 795 * --------- --------- 796 * | 4 | 5 | | 1 | 2 | 797 * --------- --------- 798 * | 6 | 7 | | 4 | 6 | 799 * --------- --------- 800 * 801 * Fortunately, this can be done fairly easily as: 802 * S' = (0x17306425 >> (S * 4)) & 0xf 803 * 804 * In the case of 16x MSAA the two layouts don't match. 805 * Sample index layout: Sample number layout: 806 * --------------------- --------------------- 807 * | 0 | 1 | 2 | 3 | | 15 | 10 | 9 | 7 | 808 * --------------------- --------------------- 809 * | 4 | 5 | 6 | 7 | | 4 | 1 | 3 | 13 | 810 * --------------------- --------------------- 811 * | 8 | 9 | 10 | 11 | | 12 | 2 | 0 | 6 | 812 * --------------------- --------------------- 813 * | 12 | 13 | 14 | 15 | | 11 | 8 | 5 | 14 | 814 * --------------------- --------------------- 815 * 816 * This is equivalent to 817 * S' = (0xe58b602cd31479af >> (S * 4)) & 0xf 818 */ 819 nir_ssa_def *frac = nir_ffract(b, sample_coords); 820 nir_ssa_def *sample = 821 nir_fdot2(b, frac, nir_imm_vec2(b, key->x_scale, 822 key->x_scale * key->y_scale)); 823 sample = nir_f2i32(b, sample); 824 825 if (tex_samples == 2) { 826 sample = nir_isub(b, nir_imm_int(b, 1), sample); 827 } else if (tex_samples == 8) { 828 sample = nir_iand(b, nir_ishr(b, nir_imm_int(b, 0x64210573), 829 nir_ishl(b, sample, nir_imm_int(b, 2))), 830 nir_imm_int(b, 0xf)); 831 } else if (tex_samples == 16) { 
832 nir_ssa_def *sample_low = 833 nir_iand(b, nir_ishr(b, nir_imm_int(b, 0xd31479af), 834 nir_ishl(b, sample, nir_imm_int(b, 2))), 835 nir_imm_int(b, 0xf)); 836 nir_ssa_def *sample_high = 837 nir_iand(b, nir_ishr(b, nir_imm_int(b, 0xe58b602c), 838 nir_ishl(b, nir_iadd(b, sample, 839 nir_imm_int(b, -8)), 840 nir_imm_int(b, 2))), 841 nir_imm_int(b, 0xf)); 842 843 sample = nir_bcsel(b, nir_ilt(b, sample, nir_imm_int(b, 8)), 844 sample_low, sample_high); 845 } 846 nir_ssa_def *pos_ms = nir_vec3(b, nir_channel(b, sample_coords_int, 0), 847 nir_channel(b, sample_coords_int, 1), 848 sample); 849 tex_data[i] = blorp_nir_txf_ms(b, v, pos_ms, mcs, key->texture_data_type); 850 } 851 852 nir_ssa_def *frac_x = nir_channel(b, frac_xy, 0); 853 nir_ssa_def *frac_y = nir_channel(b, frac_xy, 1); 854 return nir_flrp(b, nir_flrp(b, tex_data[0], tex_data[1], frac_x), 855 nir_flrp(b, tex_data[2], tex_data[3], frac_x), 856 frac_y); 857} 858 859/** Perform a color bit-cast operation 860 * 861 * For copy operations involving CCS, we may need to use different formats for 862 * the source and destination surfaces. The two formats must both be UINT 863 * formats and must have the same size but may have different bit layouts. 864 * For instance, we may be copying from R8G8B8A8_UINT to R32_UINT or R32_UINT 865 * to R16G16_UINT. This function generates code to shuffle bits around to get 866 * us from one to the other. 
867 */ 868static nir_ssa_def * 869bit_cast_color(struct nir_builder *b, nir_ssa_def *color, 870 const struct brw_blorp_blit_prog_key *key) 871{ 872 if (key->src_format == key->dst_format) 873 return color; 874 875 const struct isl_format_layout *src_fmtl = 876 isl_format_get_layout(key->src_format); 877 const struct isl_format_layout *dst_fmtl = 878 isl_format_get_layout(key->dst_format); 879 880 /* They must be formats with the same bit size */ 881 assert(src_fmtl->bpb == dst_fmtl->bpb); 882 883 if (src_fmtl->bpb <= 32) { 884 assert(src_fmtl->channels.r.type == ISL_UINT || 885 src_fmtl->channels.r.type == ISL_UNORM); 886 assert(dst_fmtl->channels.r.type == ISL_UINT || 887 dst_fmtl->channels.r.type == ISL_UNORM); 888 889 nir_ssa_def *packed = nir_imm_int(b, 0); 890 for (unsigned c = 0; c < 4; c++) { 891 if (src_fmtl->channels_array[c].bits == 0) 892 continue; 893 894 const unsigned chan_start_bit = src_fmtl->channels_array[c].start_bit; 895 const unsigned chan_bits = src_fmtl->channels_array[c].bits; 896 897 nir_ssa_def *chan = nir_channel(b, color, c); 898 if (src_fmtl->channels_array[c].type == ISL_UNORM) 899 chan = nir_format_float_to_unorm(b, chan, &chan_bits); 900 901 packed = nir_ior(b, packed, nir_shift_imm(b, chan, chan_start_bit)); 902 } 903 904 nir_ssa_def *chans[4] = { }; 905 for (unsigned c = 0; c < 4; c++) { 906 if (dst_fmtl->channels_array[c].bits == 0) { 907 chans[c] = nir_imm_int(b, 0); 908 continue; 909 } 910 911 const unsigned chan_start_bit = dst_fmtl->channels_array[c].start_bit; 912 const unsigned chan_bits = dst_fmtl->channels_array[c].bits; 913 chans[c] = nir_iand(b, nir_shift_imm(b, packed, -(int)chan_start_bit), 914 nir_imm_int(b, BITFIELD_MASK(chan_bits))); 915 916 if (dst_fmtl->channels_array[c].type == ISL_UNORM) 917 chans[c] = nir_format_unorm_to_float(b, chans[c], &chan_bits); 918 } 919 color = nir_vec(b, chans, 4); 920 } else { 921 /* This path only supports UINT formats */ 922 assert(src_fmtl->channels.r.type == ISL_UINT); 923 
assert(dst_fmtl->channels.r.type == ISL_UINT); 924 925 const unsigned src_bpc = src_fmtl->channels.r.bits; 926 const unsigned dst_bpc = dst_fmtl->channels.r.bits; 927 928 assert(src_fmtl->channels.g.bits == 0 || 929 src_fmtl->channels.g.bits == src_fmtl->channels.r.bits); 930 assert(src_fmtl->channels.b.bits == 0 || 931 src_fmtl->channels.b.bits == src_fmtl->channels.r.bits); 932 assert(src_fmtl->channels.a.bits == 0 || 933 src_fmtl->channels.a.bits == src_fmtl->channels.r.bits); 934 assert(dst_fmtl->channels.g.bits == 0 || 935 dst_fmtl->channels.g.bits == dst_fmtl->channels.r.bits); 936 assert(dst_fmtl->channels.b.bits == 0 || 937 dst_fmtl->channels.b.bits == dst_fmtl->channels.r.bits); 938 assert(dst_fmtl->channels.a.bits == 0 || 939 dst_fmtl->channels.a.bits == dst_fmtl->channels.r.bits); 940 941 /* Restrict to only the channels we actually have */ 942 const unsigned src_channels = 943 isl_format_get_num_channels(key->src_format); 944 color = nir_trim_vector(b, color, src_channels); 945 946 color = nir_format_bitcast_uvec_unmasked(b, color, src_bpc, dst_bpc); 947 } 948 949 /* Blorp likes to assume that colors are vec4s */ 950 nir_ssa_def *u = nir_ssa_undef(b, 1, 32); 951 nir_ssa_def *chans[4] = { u, u, u, u }; 952 for (unsigned i = 0; i < color->num_components; i++) 953 chans[i] = nir_channel(b, color, i); 954 return nir_vec4(b, chans[0], chans[1], chans[2], chans[3]); 955} 956 957static nir_ssa_def * 958select_color_channel(struct nir_builder *b, nir_ssa_def *color, 959 nir_alu_type data_type, 960 enum isl_channel_select chan) 961{ 962 if (chan == ISL_CHANNEL_SELECT_ZERO) { 963 return nir_imm_int(b, 0); 964 } else if (chan == ISL_CHANNEL_SELECT_ONE) { 965 switch (data_type) { 966 case nir_type_int: 967 case nir_type_uint: 968 return nir_imm_int(b, 1); 969 case nir_type_float: 970 return nir_imm_float(b, 1); 971 default: 972 unreachable("Invalid data type"); 973 } 974 } else { 975 assert((unsigned)(chan - ISL_CHANNEL_SELECT_RED) < 4); 976 return nir_channel(b, 
color, chan - ISL_CHANNEL_SELECT_RED); 977 } 978} 979 980static nir_ssa_def * 981swizzle_color(struct nir_builder *b, nir_ssa_def *color, 982 struct isl_swizzle swizzle, nir_alu_type data_type) 983{ 984 return nir_vec4(b, 985 select_color_channel(b, color, data_type, swizzle.r), 986 select_color_channel(b, color, data_type, swizzle.g), 987 select_color_channel(b, color, data_type, swizzle.b), 988 select_color_channel(b, color, data_type, swizzle.a)); 989} 990 991static nir_ssa_def * 992convert_color(struct nir_builder *b, nir_ssa_def *color, 993 const struct brw_blorp_blit_prog_key *key) 994{ 995 /* All of our color conversions end up generating a single-channel color 996 * value that we need to write out. 997 */ 998 nir_ssa_def *value; 999 1000 if (key->dst_format == ISL_FORMAT_R24_UNORM_X8_TYPELESS) { 1001 /* The destination image is bound as R32_UINT but the data needs to be 1002 * in R24_UNORM_X8_TYPELESS. The bottom 24 are the actual data and the 1003 * top 8 need to be zero. We can accomplish this by simply multiplying 1004 * by a factor to scale things down. 
1005 */ 1006 unsigned factor = (1 << 24) - 1; 1007 value = nir_fsat(b, nir_channel(b, color, 0)); 1008 value = nir_f2i32(b, nir_fmul(b, value, nir_imm_float(b, factor))); 1009 } else if (key->dst_format == ISL_FORMAT_L8_UNORM_SRGB) { 1010 value = nir_format_linear_to_srgb(b, nir_channel(b, color, 0)); 1011 } else if (key->dst_format == ISL_FORMAT_R8G8B8_UNORM_SRGB) { 1012 value = nir_format_linear_to_srgb(b, color); 1013 } else if (key->dst_format == ISL_FORMAT_R9G9B9E5_SHAREDEXP) { 1014 value = nir_format_pack_r9g9b9e5(b, color); 1015 } else { 1016 unreachable("Unsupported format conversion"); 1017 } 1018 1019 nir_ssa_def *out_comps[4]; 1020 for (unsigned i = 0; i < 4; i++) { 1021 if (i < value->num_components) 1022 out_comps[i] = nir_channel(b, value, i); 1023 else 1024 out_comps[i] = nir_ssa_undef(b, 1, 32); 1025 } 1026 return nir_vec(b, out_comps, 4); 1027} 1028 1029/** 1030 * Generator for WM programs used in BLORP blits. 1031 * 1032 * The bulk of the work done by the WM program is to wrap and unwrap the 1033 * coordinate transformations used by the hardware to store surfaces in 1034 * memory. 
The hardware transforms a pixel location (X, Y, S) (where S is the
 * sample index for a multisampled surface) to a memory offset by the
 * following formulas:
 *
 *   offset = tile(tiling_format, encode_msaa(num_samples, layout, X, Y, S))
 *   (X, Y, S) = decode_msaa(num_samples, layout, detile(tiling_format, offset))
 *
 * For a single-sampled surface, or for a multisampled surface using
 * INTEL_MSAA_LAYOUT_UMS, encode_msaa() and decode_msaa are the identity
 * function:
 *
 *   encode_msaa(1, NONE, X, Y, 0) = (X, Y, 0)
 *   decode_msaa(1, NONE, X, Y, 0) = (X, Y, 0)
 *   encode_msaa(n, UMS, X, Y, S) = (X, Y, S)
 *   decode_msaa(n, UMS, X, Y, S) = (X, Y, S)
 *
 * For a 4x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa()
 * embeds the sample number into bit 1 of the X and Y coordinates:
 *
 *   encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)
 *     where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
 *           Y' = (Y & ~0b1 ) << 1 | (S & 0b10) | (Y & 0b1)
 *   decode_msaa(4, IMS, X, Y, 0) = (X', Y', S)
 *     where X' = (X & ~0b11) >> 1 | (X & 0b1)
 *           Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
 *           S = (Y & 0b10) | (X & 0b10) >> 1
 *
 * For an 8x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa()
 * embeds the sample number into bits 1 and 2 of the X coordinate and bit 1 of
 * the Y coordinate:
 *
 *   encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)
 *     where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1 | (X & 0b1)
 *           Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
 *   decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
 *     where X' = (X & ~0b111) >> 2 | (X & 0b1)
 *           Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
 *           S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1
 *
 * For X tiling, tile() combines together the low-order bits of the X and Y
 * coordinates in the pattern 0byyyxxxxxxxxx, creating 4k tiles that are 512
 * bytes wide and 8 rows high:
 *
 *   tile(x_tiled, X, Y, S) = A
 *     where A = tile_num << 12 | offset
 *           tile_num = (Y' >> 3) * tile_pitch + (X' >> 9)
 *           offset = (Y' & 0b111) << 9
 *                    | (X' & 0b111111111)
 *           X' = X * cpp
 *           Y' = Y + S * qpitch
 *   detile(x_tiled, A) = (X, Y, S)
 *     where X = X' / cpp
 *           Y = Y' % qpitch
 *           S = Y' / qpitch
 *           Y' = (tile_num / tile_pitch) << 3
 *                | (A & 0b111000000000) >> 9
 *           X' = (tile_num % tile_pitch) << 9
 *                | (A & 0b111111111)
 *
 * (In all tiling formulas, cpp is the number of bytes occupied by a single
 * sample ("chars per pixel"), tile_pitch is the number of 4k tiles required
 * to fill the width of the surface, and qpitch is the spacing (in rows)
 * between array slices).
 *
 * For Y tiling, tile() combines together the low-order bits of the X and Y
 * coordinates in the pattern 0bxxxyyyyyxxxx, creating 4k tiles that are 128
 * bytes wide and 32 rows high:
 *
 *   tile(y_tiled, X, Y, S) = A
 *     where A = tile_num << 12 | offset
 *           tile_num = (Y' >> 5) * tile_pitch + (X' >> 7)
 *           offset = (X' & 0b1110000) << 5
 *                    | (Y' & 0b11111) << 4
 *                    | (X' & 0b1111)
 *           X' = X * cpp
 *           Y' = Y + S * qpitch
 *   detile(y_tiled, A) = (X, Y, S)
 *     where X = X' / cpp
 *           Y = Y' % qpitch
 *           S = Y' / qpitch
 *           Y' = (tile_num / tile_pitch) << 5
 *                | (A & 0b111110000) >> 4
 *           X' = (tile_num % tile_pitch) << 7
 *                | (A & 0b111000000000) >> 5
 *                | (A & 0b1111)
 *
 * For W tiling, tile() combines together the low-order bits of the X and Y
 * coordinates in the pattern 0bxxxyyyyxyxyx, creating 4k tiles that are 64
 * bytes wide and 64 rows high (note that W tiling is only used for stencil
 * buffers, which always have cpp = 1 and S=0):
 *
 *   tile(w_tiled, X, Y, S) = A
 *     where A = tile_num << 12 | offset
 *           tile_num = (Y' >> 6) * tile_pitch + (X' >> 6)
 *           offset = (X' & 0b111000) << 6
 *                    | (Y' & 0b111100) << 3
 *                    | (X' & 0b100) << 2
 *                    | (Y' & 0b10) << 2
 *                    | (X' & 0b10) << 1
 *                    | (Y' & 0b1) << 1
 *                    | (X' & 0b1)
 *           X' = X * cpp = X
 *           Y' = Y + S * qpitch
 *   detile(w_tiled, A) = (X, Y, S)
 *     where X = X' / cpp = X'
 *           Y = Y' % qpitch = Y'
 *           S = Y' / qpitch = 0
 *           Y' = (tile_num / tile_pitch) << 6
 *                | (A & 0b111100000) >> 3
 *                | (A & 0b1000) >> 2
 *                | (A & 0b10) >> 1
 *           X' = (tile_num % tile_pitch) << 6
 *                | (A & 0b111000000000) >> 6
 *                | (A & 0b10000) >> 2
 *                | (A & 0b100) >> 1
 *                | (A & 0b1)
 *
 * Finally, for a non-tiled surface, tile() simply combines together the X and
 * Y coordinates in the natural way:
 *
 *   tile(untiled, X, Y, S) = A
 *     where A = Y' * pitch + X'
 *           X' = X * cpp
 *           Y' = Y + S * qpitch
 *   detile(untiled, A) = (X, Y, S)
 *     where X = X' / cpp
 *           Y = Y' % qpitch
 *           S = Y' / qpitch
 *           X' = A % pitch
 *           Y' = A / pitch
 *
 * (In these formulas, pitch is the number of bytes occupied by a single row
 * of samples).
 *
 * The shader built here is a compute shader when key->base.shader_pipeline
 * is BLORP_SHADER_PIPELINE_COMPUTE and a fragment shader otherwise.  It is
 * initialized through blorp_nir_init_shader() with mem_ctx and handed back
 * to the caller.
 */
static nir_shader *
brw_blorp_build_nir_shader(struct blorp_context *blorp,
                           struct blorp_batch *batch, void *mem_ctx,
                           const struct brw_blorp_blit_prog_key *key)
{
   const struct intel_device_info *devinfo = blorp->isl_dev->info;
   nir_ssa_def *src_pos, *dst_pos, *color;

   /* Sanity checks */
   if (key->dst_tiled_w && key->rt_samples > 1) {
      /* If the destination image is W tiled and multisampled, then the thread
       * must be dispatched once per sample, not once per pixel.  This is
       * necessary because after conversion between W and Y tiling, there's no
       * guarantee that all samples corresponding to a single pixel will still
       * be together.
       */
      assert(key->persample_msaa_dispatch);
   }

   if (key->persample_msaa_dispatch) {
      /* It only makes sense to do persample dispatch if the render target is
       * configured as multisampled.
       *
       * NOTE(review): the check below only requires rt_samples > 0, which a
       * single-sampled target also satisfies -- confirm whether > 1 was
       * intended.
       */
      assert(key->rt_samples > 0);
   }

   /* Make sure layout is consistent with sample count */
   assert((key->tex_layout == ISL_MSAA_LAYOUT_NONE) ==
          (key->tex_samples <= 1));
   assert((key->rt_layout == ISL_MSAA_LAYOUT_NONE) ==
          (key->rt_samples <= 1));
   assert((key->src_layout == ISL_MSAA_LAYOUT_NONE) ==
          (key->src_samples <= 1));
   assert((key->dst_layout == ISL_MSAA_LAYOUT_NONE) ==
          (key->dst_samples <= 1));

   nir_builder b;
   const bool compute =
      key->base.shader_pipeline == BLORP_SHADER_PIPELINE_COMPUTE;
   gl_shader_stage stage =
      compute ? MESA_SHADER_COMPUTE : MESA_SHADER_FRAGMENT;
   blorp_nir_init_shader(&b, mem_ctx, stage, NULL);

   struct brw_blorp_blit_vars v;
   brw_blorp_blit_vars_init(&b, &v, key);

   /* Destination coordinates come from a different helper per stage. */
   dst_pos = compute ?
      blorp_blit_get_cs_dst_coords(&b, key, &v) :
      blorp_blit_get_frag_coords(&b, key, &v);

   /* Render target and texture hardware don't support W tiling until Gfx8. */
   const bool rt_tiled_w = false;
   const bool tex_tiled_w = devinfo->ver >= 8 && key->src_tiled_w;

   /* The address that data will be written to is determined by the
    * coordinates supplied to the WM thread and the tiling and sample count of
    * the render target, according to the formula:
    *
    * (X, Y, S) = decode_msaa(rt_samples, detile(rt_tiling, offset))
    *
    * If the actual tiling and sample count of the destination surface are not
    * the same as the configuration of the render target, then these
    * coordinates are wrong and we have to adjust them to compensate for the
    * difference.
    */
   if (rt_tiled_w != key->dst_tiled_w ||
       key->rt_samples != key->dst_samples ||
       key->rt_layout != key->dst_layout) {
      dst_pos = blorp_nir_encode_msaa(&b, dst_pos, key->rt_samples,
                                      key->rt_layout);
      /* Now (X, Y, S) = detile(rt_tiling, offset) */
      if (rt_tiled_w != key->dst_tiled_w)
         dst_pos = blorp_nir_retile_y_to_w(&b, dst_pos);
      /* Now (X, Y, S) = detile(dst_tiling, offset) */
      dst_pos = blorp_nir_decode_msaa(&b, dst_pos, key->dst_samples,
                                      key->dst_layout);
   }

   /* Index (0..2) of the color component this invocation writes when the
    * destination is faked as a 3x-wide red surface; NULL otherwise.
    */
   nir_ssa_def *comp = NULL;
   if (key->dst_rgb) {
      /* The destination image is bound as a red texture three times as wide
       * as the actual image.  Our shader is effectively running one color
       * component at a time.  We need to save off the component and adjust
       * the destination position.
       */
      assert(dst_pos->num_components == 2);
      nir_ssa_def *dst_x = nir_channel(&b, dst_pos, 0);
      comp = nir_umod(&b, dst_x, nir_imm_int(&b, 3));
      dst_pos = nir_vec2(&b, nir_idiv(&b, dst_x, nir_imm_int(&b, 3)),
                             nir_channel(&b, dst_pos, 1));
   }

   /* Now (X, Y, S) = decode_msaa(dst_samples, detile(dst_tiling, offset)).
    *
    * That is: X, Y and S now contain the true coordinates and sample index of
    * the data that the WM thread should output.
    *
    * If we need to kill pixels that are outside the destination rectangle,
    * now is the time to do it.
    */
   nir_if *bounds_if = NULL;
   if (key->use_kill) {
      nir_ssa_def *bounds_rect = nir_load_var(&b, v.v_bounds_rect);
      nir_ssa_def *in_bounds = blorp_check_in_bounds(&b, bounds_rect,
                                                     dst_pos);
      /* Fragment shaders discard; compute shaders instead wrap the rest of
       * the program in an if that is popped once the store has been emitted.
       */
      if (!compute)
         nir_discard_if(&b, nir_inot(&b, in_bounds));
      else
         bounds_if = nir_push_if(&b, in_bounds);
   }

   src_pos = blorp_blit_apply_transform(&b, nir_i2f32(&b, dst_pos), &v);
   if (dst_pos->num_components == 3) {
      /* The sample coordinate is an integer that we want left alone but
       * blorp_blit_apply_transform() blindly applies the transform to all
       * three coordinates.  Grab the original sample index.
       */
      src_pos = nir_vec3(&b, nir_channel(&b, src_pos, 0),
                             nir_channel(&b, src_pos, 1),
                             nir_channel(&b, dst_pos, 2));
   }

   /* If the source image is not multisampled, then we want to fetch sample
    * number 0, because that's the only sample there is.
    */
   if (key->src_samples == 1)
      src_pos = nir_channels(&b, src_pos, 0x3);

   /* X, Y, and S are now the coordinates of the pixel in the source image
    * that we want to texture from.  Exception: if we are blending, then S is
    * irrelevant, because we are going to fetch all samples.
    */
   switch (key->filter) {
   case BLORP_FILTER_NONE:
   case BLORP_FILTER_NEAREST:
   case BLORP_FILTER_SAMPLE_0:
      /* We're going to use texelFetch, so we need integers */
      if (src_pos->num_components == 2) {
         src_pos = nir_f2i32(&b, src_pos);
      } else {
         assert(src_pos->num_components == 3);
         src_pos = nir_vec3(&b, nir_channel(&b, nir_f2i32(&b, src_pos), 0),
                                nir_channel(&b, nir_f2i32(&b, src_pos), 1),
                                nir_channel(&b, src_pos, 2));
      }

      /* We aren't blending, which means we just want to fetch a single
       * sample from the source surface.  The address that we want to fetch
       * from is related to the X, Y and S values according to the formula:
       *
       * (X, Y, S) = decode_msaa(src_samples, detile(src_tiling, offset)).
       *
       * If the actual tiling and sample count of the source surface are
       * not the same as the configuration of the texture, then we need to
       * adjust the coordinates to compensate for the difference.
       */
      if (tex_tiled_w != key->src_tiled_w ||
          key->tex_samples != key->src_samples ||
          key->tex_layout != key->src_layout) {
         src_pos = blorp_nir_encode_msaa(&b, src_pos, key->src_samples,
                                         key->src_layout);
         /* Now (X, Y, S) = detile(src_tiling, offset) */
         if (tex_tiled_w != key->src_tiled_w)
            src_pos = blorp_nir_retile_w_to_y(&b, src_pos);
         /* Now (X, Y, S) = detile(tex_tiling, offset) */
         src_pos = blorp_nir_decode_msaa(&b, src_pos, key->tex_samples,
                                         key->tex_layout);
      }

      if (key->need_src_offset)
         src_pos = nir_iadd(&b, src_pos, nir_load_var(&b, v.v_src_offset));

      /* Now (X, Y, S) = decode_msaa(tex_samples, detile(tex_tiling, offset)).
       *
       * In other words: X, Y, and S now contain values which, when passed to
       * the texturing unit, will cause data to be read from the correct
       * memory location.  So we can fetch the texel now.
       */
      if (key->src_samples == 1) {
         color = blorp_nir_txf(&b, &v, src_pos, key->texture_data_type);
      } else {
         /* The MCS value is only fetched when the aux usage has MCS. */
         nir_ssa_def *mcs = NULL;
         if (isl_aux_usage_has_mcs(key->tex_aux_usage))
            mcs = blorp_blit_txf_ms_mcs(&b, &v, src_pos);

         color = blorp_nir_txf_ms(&b, &v, src_pos, mcs, key->texture_data_type);
      }
      break;

   case BLORP_FILTER_BILINEAR:
      assert(!key->src_tiled_w);
      assert(key->tex_samples == key->src_samples);
      assert(key->tex_layout == key->src_layout);

      if (key->src_samples == 1) {
         color = blorp_nir_tex(&b, &v, key, src_pos);
      } else {
         assert(!key->use_kill);
         color = blorp_nir_manual_blend_bilinear(&b, src_pos, key->src_samples,
                                                 key, &v);
      }
      break;

   case BLORP_FILTER_AVERAGE:
   case BLORP_FILTER_MIN_SAMPLE:
   case BLORP_FILTER_MAX_SAMPLE:
      assert(!key->src_tiled_w);
      assert(key->tex_samples == key->src_samples);
      assert(key->tex_layout == key->src_layout);

      /* Resolves (effectively) use texelFetch, so we need integers and we
       * don't care about the sample index if we got one.
       */
      src_pos = nir_f2i32(&b, nir_channels(&b, src_pos, 0x3));

      if (devinfo->ver == 6) {
         /* Because gfx6 only supports 4x interleaved MSAA, we can do all the
          * blending we need with a single linear-interpolated texture lookup
          * at the center of the sample.  The texture coordinates need to be
          * odd integers so that they correspond to the center of a 2x2 block
          * representing the four samples that make up a pixel.  So we need
          * to multiply our X and Y coordinates each by 2 and then add 1.
          *
          * NOTE(review): the code below adds 0.5 to the pixel coordinate and
          * asserts normalized coordinates instead of literally computing
          * 2 * x + 1; presumably the factor of two is absorbed by the
          * normalization -- confirm.
          */
         assert(key->src_coords_normalized);
         assert(key->filter == BLORP_FILTER_AVERAGE);
         src_pos = nir_fadd(&b,
                            nir_i2f32(&b, src_pos),
                            nir_imm_float(&b, 0.5f));
         color = blorp_nir_tex(&b, &v, key, src_pos);
      } else {
         /* Gfx7+ hardware doesn't automatically blend. */
         color = blorp_nir_combine_samples(&b, &v, src_pos, key->src_samples,
                                           key->tex_aux_usage,
                                           key->texture_data_type,
                                           key->filter);
      }
      break;

   default:
      unreachable("Invalid blorp filter");
   }

   if (!isl_swizzle_is_identity(key->src_swizzle)) {
      color = swizzle_color(&b, color, key->src_swizzle,
                            key->texture_data_type);
   }

   /* The destination swizzle is applied inverted. */
   if (!isl_swizzle_is_identity(key->dst_swizzle)) {
      color = swizzle_color(&b, color, isl_swizzle_invert(key->dst_swizzle),
                            nir_type_int);
   }

   if (key->format_bit_cast) {
      assert(isl_swizzle_is_identity(key->src_swizzle));
      assert(isl_swizzle_is_identity(key->dst_swizzle));
      color = bit_cast_color(&b, color, key);
   } else if (key->dst_format) {
      color = convert_color(&b, color, key);
   } else if (key->uint32_to_sint) {
      /* Normally the hardware will take care of converting values from/to
       * the source and destination formats.  But a few cases need help.
       *
       * The Skylake PRM, volume 07, page 658 has a programming note:
       *
       *    "When using SINT or UINT rendertarget surface formats, Blending
       *     must be DISABLED. The Pre-Blend Color Clamp Enable and Color
       *     Clamp Range fields are ignored, and an implied clamp to the
       *     rendertarget surface format is performed."
       *
       * For UINT to SINT blits, our sample operation gives us a uint32_t,
       * but our render target write expects a signed int32_t number.  If we
       * simply passed the value along, the hardware would interpret a value
       * with bit 31 set as a negative value, clamping it to the largest
       * negative number the destination format could represent.  But the
       * actual source value is a positive number, so we want to clamp it
       * to INT_MAX.  To fix this, we explicitly take min(color, INT_MAX).
       */
      color = nir_umin(&b, color, nir_imm_int(&b, INT32_MAX));
   } else if (key->sint32_to_uint) {
      /* Similar to above, but clamping negative numbers to zero. */
      color = nir_imax(&b, color, nir_imm_int(&b, 0));
   }

   if (key->dst_rgb) {
      /* The destination image is bound as a red texture three times as wide
       * as the actual image.  Our shader is effectively running one color
       * component at a time.  We need to pick off the appropriate component
       * from the source color and write that to destination red.
       */
      assert(dst_pos->num_components == 2);

      nir_ssa_def *color_component =
         nir_bcsel(&b, nir_ieq_imm(&b, comp, 0),
                       nir_channel(&b, color, 0),
                       nir_bcsel(&b, nir_ieq_imm(&b, comp, 1),
                                     nir_channel(&b, color, 1),
                                     nir_channel(&b, color, 2)));

      nir_ssa_def *u = nir_ssa_undef(&b, 1, 32);
      color = nir_vec4(&b, color_component, u, u, u);
   }

   /* Emit the final write: an image store for compute, otherwise a shader
    * output chosen by the destination usage.
    */
   if (compute) {
      nir_ssa_def *store_pos = nir_load_global_invocation_id(&b, 32);
      nir_image_store(&b, nir_imm_int(&b, 0),
                      nir_pad_vector_imm_int(&b, store_pos, 0, 4),
                      nir_imm_int(&b, 0),
                      nir_pad_vector_imm_int(&b, color, 0, 4),
                      nir_imm_int(&b, 0),
                      .image_dim = GLSL_SAMPLER_DIM_2D,
                      .image_array = true,
                      .access = ACCESS_NON_READABLE);
   } else if (key->dst_usage == ISL_SURF_USAGE_RENDER_TARGET_BIT) {
      nir_variable *color_out =
         nir_variable_create(b.shader, nir_var_shader_out,
                             glsl_vec4_type(), "gl_FragColor");
      color_out->data.location = FRAG_RESULT_COLOR;
      nir_store_var(&b, color_out, color, 0xf);
   } else if (key->dst_usage == ISL_SURF_USAGE_DEPTH_BIT) {
      nir_variable *depth_out =
         nir_variable_create(b.shader, nir_var_shader_out,
                             glsl_float_type(), "gl_FragDepth");
      depth_out->data.location = FRAG_RESULT_DEPTH;
      nir_store_var(&b, depth_out, nir_channel(&b, color, 0), 0x1);
   } else if (key->dst_usage == ISL_SURF_USAGE_STENCIL_BIT) {
1493 nir_variable *stencil_out = 1494 nir_variable_create(b.shader, nir_var_shader_out, 1495 glsl_int_type(), "gl_FragStencilRef"); 1496 stencil_out->data.location = FRAG_RESULT_STENCIL; 1497 nir_store_var(&b, stencil_out, nir_channel(&b, color, 0), 0x1); 1498 } else { 1499 unreachable("Invalid destination usage"); 1500 } 1501 1502 if (bounds_if) 1503 nir_pop_if(&b, bounds_if); 1504 1505 return b.shader; 1506} 1507 1508static bool 1509brw_blorp_get_blit_kernel_fs(struct blorp_batch *batch, 1510 struct blorp_params *params, 1511 const struct brw_blorp_blit_prog_key *key) 1512{ 1513 struct blorp_context *blorp = batch->blorp; 1514 1515 if (blorp->lookup_shader(batch, key, sizeof(*key), 1516 ¶ms->wm_prog_kernel, ¶ms->wm_prog_data)) 1517 return true; 1518 1519 void *mem_ctx = ralloc_context(NULL); 1520 1521 const unsigned *program; 1522 struct brw_wm_prog_data prog_data; 1523 1524 nir_shader *nir = brw_blorp_build_nir_shader(blorp, batch, mem_ctx, key); 1525 nir->info.name = 1526 ralloc_strdup(nir, blorp_shader_type_to_name(key->base.shader_type)); 1527 1528 struct brw_wm_prog_key wm_key; 1529 brw_blorp_init_wm_prog_key(&wm_key); 1530 wm_key.base.tex.compressed_multisample_layout_mask = 1531 isl_aux_usage_has_mcs(key->tex_aux_usage); 1532 wm_key.base.tex.msaa_16 = key->tex_samples == 16; 1533 wm_key.multisample_fbo = key->rt_samples > 1; 1534 1535 program = blorp_compile_fs(blorp, mem_ctx, nir, &wm_key, false, 1536 &prog_data); 1537 1538 bool result = 1539 blorp->upload_shader(batch, MESA_SHADER_FRAGMENT, 1540 key, sizeof(*key), 1541 program, prog_data.base.program_size, 1542 &prog_data.base, sizeof(prog_data), 1543 ¶ms->wm_prog_kernel, ¶ms->wm_prog_data); 1544 1545 ralloc_free(mem_ctx); 1546 return result; 1547} 1548 1549static bool 1550brw_blorp_get_blit_kernel_cs(struct blorp_batch *batch, 1551 struct blorp_params *params, 1552 const struct brw_blorp_blit_prog_key *prog_key) 1553{ 1554 struct blorp_context *blorp = batch->blorp; 1555 1556 if 
(blorp->lookup_shader(batch, prog_key, sizeof(*prog_key), 1557 ¶ms->cs_prog_kernel, ¶ms->cs_prog_data)) 1558 return true; 1559 1560 void *mem_ctx = ralloc_context(NULL); 1561 1562 const unsigned *program; 1563 struct brw_cs_prog_data prog_data; 1564 1565 nir_shader *nir = brw_blorp_build_nir_shader(blorp, batch, mem_ctx, 1566 prog_key); 1567 nir->info.name = ralloc_strdup(nir, "BLORP-gpgpu-blit"); 1568 blorp_set_cs_dims(nir, prog_key->local_y); 1569 1570 struct brw_cs_prog_key cs_key; 1571 brw_blorp_init_cs_prog_key(&cs_key); 1572 cs_key.base.tex.compressed_multisample_layout_mask = 1573 prog_key->tex_aux_usage == ISL_AUX_USAGE_MCS; 1574 cs_key.base.tex.msaa_16 = prog_key->tex_samples == 16; 1575 assert(prog_key->rt_samples == 1); 1576 1577 program = blorp_compile_cs(blorp, mem_ctx, nir, &cs_key, &prog_data); 1578 1579 bool result = 1580 blorp->upload_shader(batch, MESA_SHADER_COMPUTE, 1581 prog_key, sizeof(*prog_key), 1582 program, prog_data.base.program_size, 1583 &prog_data.base, sizeof(prog_data), 1584 ¶ms->cs_prog_kernel, ¶ms->cs_prog_data); 1585 1586 ralloc_free(mem_ctx); 1587 return result; 1588} 1589 1590static void 1591brw_blorp_setup_coord_transform(struct brw_blorp_coord_transform *xform, 1592 GLfloat src0, GLfloat src1, 1593 GLfloat dst0, GLfloat dst1, 1594 bool mirror) 1595{ 1596 double scale = (double)(src1 - src0) / (double)(dst1 - dst0); 1597 if (!mirror) { 1598 /* When not mirroring a coordinate (say, X), we need: 1599 * src_x - src_x0 = (dst_x - dst_x0 + 0.5) * scale 1600 * Therefore: 1601 * src_x = src_x0 + (dst_x - dst_x0 + 0.5) * scale 1602 * 1603 * blorp program uses "round toward zero" to convert the 1604 * transformed floating point coordinates to integer coordinates, 1605 * whereas the behaviour we actually want is "round to nearest", 1606 * so 0.5 provides the necessary correction. 
1607 */ 1608 xform->multiplier = scale; 1609 xform->offset = src0 + (-(double)dst0 + 0.5) * scale; 1610 } else { 1611 /* When mirroring X we need: 1612 * src_x - src_x0 = dst_x1 - dst_x - 0.5 1613 * Therefore: 1614 * src_x = src_x0 + (dst_x1 -dst_x - 0.5) * scale 1615 */ 1616 xform->multiplier = -scale; 1617 xform->offset = src0 + ((double)dst1 - 0.5) * scale; 1618 } 1619} 1620 1621static inline void 1622surf_get_intratile_offset_px(struct brw_blorp_surface_info *info, 1623 uint32_t *tile_x_px, uint32_t *tile_y_px) 1624{ 1625 if (info->surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED) { 1626 struct isl_extent2d px_size_sa = 1627 isl_get_interleaved_msaa_px_size_sa(info->surf.samples); 1628 assert(info->tile_x_sa % px_size_sa.width == 0); 1629 assert(info->tile_y_sa % px_size_sa.height == 0); 1630 *tile_x_px = info->tile_x_sa / px_size_sa.width; 1631 *tile_y_px = info->tile_y_sa / px_size_sa.height; 1632 } else { 1633 *tile_x_px = info->tile_x_sa; 1634 *tile_y_px = info->tile_y_sa; 1635 } 1636} 1637 1638void 1639blorp_surf_convert_to_single_slice(const struct isl_device *isl_dev, 1640 struct brw_blorp_surface_info *info) 1641{ 1642 bool ok UNUSED; 1643 1644 /* It would be insane to try and do this on a compressed surface */ 1645 assert(info->aux_usage == ISL_AUX_USAGE_NONE); 1646 1647 /* Just bail if we have nothing to do. */ 1648 if (info->surf.dim == ISL_SURF_DIM_2D && 1649 info->view.base_level == 0 && info->view.base_array_layer == 0 && 1650 info->surf.levels == 1 && info->surf.logical_level0_px.array_len == 1) 1651 return; 1652 1653 /* If this gets triggered then we've gotten here twice which. This 1654 * shouldn't happen thanks to the above early return. 
1655 */ 1656 assert(info->tile_x_sa == 0 && info->tile_y_sa == 0); 1657 1658 uint32_t layer = 0, z = 0; 1659 if (info->surf.dim == ISL_SURF_DIM_3D) 1660 z = info->view.base_array_layer + info->z_offset; 1661 else 1662 layer = info->view.base_array_layer; 1663 1664 uint64_t offset_B; 1665 isl_surf_get_image_surf(isl_dev, &info->surf, 1666 info->view.base_level, layer, z, 1667 &info->surf, 1668 &offset_B, &info->tile_x_sa, &info->tile_y_sa); 1669 info->addr.offset += offset_B; 1670 1671 uint32_t tile_x_px, tile_y_px; 1672 surf_get_intratile_offset_px(info, &tile_x_px, &tile_y_px); 1673 1674 /* Instead of using the X/Y Offset fields in RENDER_SURFACE_STATE, we place 1675 * the image at the tile boundary and offset our sampling or rendering. 1676 * For this reason, we need to grow the image by the offset to ensure that 1677 * the hardware doesn't think we've gone past the edge. 1678 */ 1679 info->surf.logical_level0_px.w += tile_x_px; 1680 info->surf.logical_level0_px.h += tile_y_px; 1681 info->surf.phys_level0_sa.w += info->tile_x_sa; 1682 info->surf.phys_level0_sa.h += info->tile_y_sa; 1683 1684 /* The view is also different now. 
*/ 1685 info->view.base_level = 0; 1686 info->view.levels = 1; 1687 info->view.base_array_layer = 0; 1688 info->view.array_len = 1; 1689 info->z_offset = 0; 1690} 1691 1692void 1693blorp_surf_fake_interleaved_msaa(const struct isl_device *isl_dev, 1694 struct brw_blorp_surface_info *info) 1695{ 1696 assert(info->surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED); 1697 1698 /* First, we need to convert it to a simple 1-level 1-layer 2-D surface */ 1699 blorp_surf_convert_to_single_slice(isl_dev, info); 1700 1701 info->surf.logical_level0_px = info->surf.phys_level0_sa; 1702 info->surf.samples = 1; 1703 info->surf.msaa_layout = ISL_MSAA_LAYOUT_NONE; 1704} 1705 1706void 1707blorp_surf_retile_w_to_y(const struct isl_device *isl_dev, 1708 struct brw_blorp_surface_info *info) 1709{ 1710 assert(info->surf.tiling == ISL_TILING_W); 1711 1712 /* First, we need to convert it to a simple 1-level 1-layer 2-D surface */ 1713 blorp_surf_convert_to_single_slice(isl_dev, info); 1714 1715 /* On gfx7+, we don't have interleaved multisampling for color render 1716 * targets so we have to fake it. 1717 * 1718 * TODO: Are we sure we don't also need to fake it on gfx6? 1719 */ 1720 if (isl_dev->info->ver > 6 && 1721 info->surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED) { 1722 blorp_surf_fake_interleaved_msaa(isl_dev, info); 1723 } 1724 1725 if (isl_dev->info->ver == 6 || isl_dev->info->ver == 7) { 1726 /* Gfx6-7 stencil buffers have a very large alignment coming in from the 1727 * miptree. It's out-of-bounds for what the surface state can handle. 1728 * Since we have a single layer and level, it doesn't really matter as 1729 * long as we don't pass a bogus value into isl_surf_fill_state(). 1730 */ 1731 info->surf.image_alignment_el = isl_extent3d(4, 2, 1); 1732 } 1733 1734 /* Now that we've converted everything to a simple 2-D surface with only 1735 * one miplevel, we can go about retiling it. 1736 */ 1737 const unsigned x_align = 8, y_align = info->surf.samples != 0 ? 
8 : 4; 1738 info->surf.tiling = ISL_TILING_Y0; 1739 info->surf.logical_level0_px.width = 1740 ALIGN(info->surf.logical_level0_px.width, x_align) * 2; 1741 info->surf.logical_level0_px.height = 1742 ALIGN(info->surf.logical_level0_px.height, y_align) / 2; 1743 info->tile_x_sa *= 2; 1744 info->tile_y_sa /= 2; 1745} 1746 1747static bool 1748can_shrink_surface(const struct brw_blorp_surface_info *surf) 1749{ 1750 /* The current code doesn't support offsets into the aux buffers. This 1751 * should be possible, but we need to make sure the offset is page 1752 * aligned for both the surface and the aux buffer surface. Generally 1753 * this mean using the page aligned offset for the aux buffer. 1754 * 1755 * Currently the cases where we must split the blit are limited to cases 1756 * where we don't have a aux buffer. 1757 */ 1758 if (surf->aux_addr.buffer != NULL) 1759 return false; 1760 1761 /* We can't support splitting the blit for gen <= 7, because the qpitch 1762 * size is calculated by the hardware based on the surface height for 1763 * gen <= 7. In gen >= 8, the qpitch is controlled by the driver. 1764 */ 1765 if (surf->surf.msaa_layout == ISL_MSAA_LAYOUT_ARRAY) 1766 return false; 1767 1768 return true; 1769} 1770 1771static unsigned 1772get_max_surface_size(const struct intel_device_info *devinfo, 1773 const struct brw_blorp_surface_info *surf) 1774{ 1775 const unsigned max = devinfo->ver >= 7 ? 
16384 : 8192; 1776 if (split_blorp_blit_debug && can_shrink_surface(surf)) 1777 return max >> 4; /* A smaller restriction when debug is enabled */ 1778 else 1779 return max; 1780} 1781 1782struct blt_axis { 1783 double src0, src1, dst0, dst1; 1784 bool mirror; 1785}; 1786 1787struct blt_coords { 1788 struct blt_axis x, y; 1789}; 1790 1791static enum isl_format 1792get_red_format_for_rgb_format(enum isl_format format) 1793{ 1794 const struct isl_format_layout *fmtl = isl_format_get_layout(format); 1795 1796 switch (fmtl->channels.r.bits) { 1797 case 8: 1798 switch (fmtl->channels.r.type) { 1799 case ISL_UNORM: 1800 return ISL_FORMAT_R8_UNORM; 1801 case ISL_SNORM: 1802 return ISL_FORMAT_R8_SNORM; 1803 case ISL_UINT: 1804 return ISL_FORMAT_R8_UINT; 1805 case ISL_SINT: 1806 return ISL_FORMAT_R8_SINT; 1807 default: 1808 unreachable("Invalid 8-bit RGB channel type"); 1809 } 1810 case 16: 1811 switch (fmtl->channels.r.type) { 1812 case ISL_UNORM: 1813 return ISL_FORMAT_R16_UNORM; 1814 case ISL_SNORM: 1815 return ISL_FORMAT_R16_SNORM; 1816 case ISL_SFLOAT: 1817 return ISL_FORMAT_R16_FLOAT; 1818 case ISL_UINT: 1819 return ISL_FORMAT_R16_UINT; 1820 case ISL_SINT: 1821 return ISL_FORMAT_R16_SINT; 1822 default: 1823 unreachable("Invalid 8-bit RGB channel type"); 1824 } 1825 case 32: 1826 switch (fmtl->channels.r.type) { 1827 case ISL_SFLOAT: 1828 return ISL_FORMAT_R32_FLOAT; 1829 case ISL_UINT: 1830 return ISL_FORMAT_R32_UINT; 1831 case ISL_SINT: 1832 return ISL_FORMAT_R32_SINT; 1833 default: 1834 unreachable("Invalid 8-bit RGB channel type"); 1835 } 1836 default: 1837 unreachable("Invalid number of red channel bits"); 1838 } 1839} 1840 1841void 1842surf_fake_rgb_with_red(const struct isl_device *isl_dev, 1843 struct brw_blorp_surface_info *info) 1844{ 1845 blorp_surf_convert_to_single_slice(isl_dev, info); 1846 1847 info->surf.logical_level0_px.width *= 3; 1848 info->surf.phys_level0_sa.width *= 3; 1849 info->tile_x_sa *= 3; 1850 1851 enum isl_format red_format = 1852 
get_red_format_for_rgb_format(info->view.format); 1853 1854 assert(isl_format_get_layout(red_format)->channels.r.type == 1855 isl_format_get_layout(info->view.format)->channels.r.type); 1856 assert(isl_format_get_layout(red_format)->channels.r.bits == 1857 isl_format_get_layout(info->view.format)->channels.r.bits); 1858 1859 info->surf.format = info->view.format = red_format; 1860 1861 if (isl_dev->info->verx10 >= 125) { 1862 /* The horizontal alignment is in units of texels for NPOT formats, and 1863 * bytes for other formats. Since the only allowed alignment units are 1864 * powers of two, there's no way to convert the alignment. 1865 * 1866 * Thankfully, the value doesn't matter since we're only a single slice. 1867 * Pick one allowed by isl_gfx125_choose_image_alignment_el. 1868 */ 1869 info->surf.image_alignment_el.w = 1870 128 / (isl_format_get_layout(red_format)->bpb / 8); 1871 } 1872} 1873 1874enum blit_shrink_status { 1875 BLIT_NO_SHRINK = 0, 1876 BLIT_SRC_WIDTH_SHRINK = (1 << 0), 1877 BLIT_DST_WIDTH_SHRINK = (1 << 1), 1878 BLIT_SRC_HEIGHT_SHRINK = (1 << 2), 1879 BLIT_DST_HEIGHT_SHRINK = (1 << 3), 1880}; 1881 1882/* Try to blit. If the surface parameters exceed the size allowed by hardware, 1883 * then enum blit_shrink_status will be returned. If BLIT_NO_SHRINK is 1884 * returned, then the blit was successful. 1885 */ 1886static enum blit_shrink_status 1887try_blorp_blit(struct blorp_batch *batch, 1888 struct blorp_params *params, 1889 struct brw_blorp_blit_prog_key *key, 1890 struct blt_coords *coords) 1891{ 1892 const struct intel_device_info *devinfo = batch->blorp->isl_dev->info; 1893 1894 if (params->dst.surf.usage & ISL_SURF_USAGE_DEPTH_BIT) { 1895 if (devinfo->ver >= 7) { 1896 /* We can render as depth on Gfx5 but there's no real advantage since 1897 * it doesn't support MSAA or HiZ. On Gfx4, we can't always render 1898 * to depth due to issues with depth buffers and mip-mapping. 
On 1899 * Gfx6, we can do everything but we have weird offsetting for HiZ 1900 * and stencil. It's easier to just render using the color pipe 1901 * on those platforms. 1902 */ 1903 key->dst_usage = ISL_SURF_USAGE_DEPTH_BIT; 1904 } else { 1905 key->dst_usage = ISL_SURF_USAGE_RENDER_TARGET_BIT; 1906 } 1907 } else if (params->dst.surf.usage & ISL_SURF_USAGE_STENCIL_BIT) { 1908 assert(params->dst.surf.format == ISL_FORMAT_R8_UINT); 1909 if (devinfo->ver >= 9 && !(batch->flags & BLORP_BATCH_USE_COMPUTE)) { 1910 key->dst_usage = ISL_SURF_USAGE_STENCIL_BIT; 1911 } else { 1912 key->dst_usage = ISL_SURF_USAGE_RENDER_TARGET_BIT; 1913 } 1914 } else { 1915 key->dst_usage = ISL_SURF_USAGE_RENDER_TARGET_BIT; 1916 } 1917 1918 if (isl_format_has_sint_channel(params->src.view.format)) { 1919 key->texture_data_type = nir_type_int; 1920 } else if (isl_format_has_uint_channel(params->src.view.format)) { 1921 key->texture_data_type = nir_type_uint; 1922 } else { 1923 key->texture_data_type = nir_type_float; 1924 } 1925 1926 /* src_samples and dst_samples are the true sample counts */ 1927 key->src_samples = params->src.surf.samples; 1928 key->dst_samples = params->dst.surf.samples; 1929 1930 key->tex_aux_usage = params->src.aux_usage; 1931 1932 /* src_layout and dst_layout indicate the true MSAA layout used by src and 1933 * dst. 1934 */ 1935 key->src_layout = params->src.surf.msaa_layout; 1936 key->dst_layout = params->dst.surf.msaa_layout; 1937 1938 /* Round floating point values to nearest integer to avoid "off by one texel" 1939 * kind of errors when blitting. 
1940 */ 1941 params->x0 = params->wm_inputs.bounds_rect.x0 = round(coords->x.dst0); 1942 params->y0 = params->wm_inputs.bounds_rect.y0 = round(coords->y.dst0); 1943 params->x1 = params->wm_inputs.bounds_rect.x1 = round(coords->x.dst1); 1944 params->y1 = params->wm_inputs.bounds_rect.y1 = round(coords->y.dst1); 1945 1946 brw_blorp_setup_coord_transform(¶ms->wm_inputs.coord_transform[0], 1947 coords->x.src0, coords->x.src1, 1948 coords->x.dst0, coords->x.dst1, 1949 coords->x.mirror); 1950 brw_blorp_setup_coord_transform(¶ms->wm_inputs.coord_transform[1], 1951 coords->y.src0, coords->y.src1, 1952 coords->y.dst0, coords->y.dst1, 1953 coords->y.mirror); 1954 1955 1956 if (devinfo->ver == 4) { 1957 /* The MinLOD and MinimumArrayElement don't work properly for cube maps. 1958 * Convert them to a single slice on gfx4. 1959 */ 1960 if (params->dst.surf.usage & ISL_SURF_USAGE_CUBE_BIT) { 1961 blorp_surf_convert_to_single_slice(batch->blorp->isl_dev, ¶ms->dst); 1962 key->need_dst_offset = true; 1963 } 1964 1965 if (params->src.surf.usage & ISL_SURF_USAGE_CUBE_BIT) { 1966 blorp_surf_convert_to_single_slice(batch->blorp->isl_dev, ¶ms->src); 1967 key->need_src_offset = true; 1968 } 1969 } 1970 1971 if (devinfo->ver > 6 && 1972 !isl_surf_usage_is_depth_or_stencil(key->dst_usage) && 1973 params->dst.surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED) { 1974 assert(params->dst.surf.samples > 1); 1975 1976 /* We must expand the rectangle we send through the rendering pipeline, 1977 * to account for the fact that we are mapping the destination region as 1978 * single-sampled when it is in fact multisampled. We must also align 1979 * it to a multiple of the multisampling pattern, because the 1980 * differences between multisampled and single-sampled surface formats 1981 * will mean that pixels are scrambled within the multisampling pattern. 1982 * TODO: what if this makes the coordinates too large? 1983 * 1984 * Note: this only works if the destination surface uses the IMS layout. 
1985 * If it's UMS, then we have no choice but to set up the rendering 1986 * pipeline as multisampled. 1987 */ 1988 struct isl_extent2d px_size_sa = 1989 isl_get_interleaved_msaa_px_size_sa(params->dst.surf.samples); 1990 params->x0 = ROUND_DOWN_TO(params->x0, 2) * px_size_sa.width; 1991 params->y0 = ROUND_DOWN_TO(params->y0, 2) * px_size_sa.height; 1992 params->x1 = ALIGN(params->x1, 2) * px_size_sa.width; 1993 params->y1 = ALIGN(params->y1, 2) * px_size_sa.height; 1994 1995 blorp_surf_fake_interleaved_msaa(batch->blorp->isl_dev, ¶ms->dst); 1996 1997 key->use_kill = true; 1998 key->need_dst_offset = true; 1999 } 2000 2001 if (params->dst.surf.tiling == ISL_TILING_W && 2002 key->dst_usage != ISL_SURF_USAGE_STENCIL_BIT) { 2003 /* We must modify the rectangle we send through the rendering pipeline 2004 * (and the size and x/y offset of the destination surface), to account 2005 * for the fact that we are mapping it as Y-tiled when it is in fact 2006 * W-tiled. 2007 * 2008 * Both Y tiling and W tiling can be understood as organizations of 2009 * 32-byte sub-tiles; within each 32-byte sub-tile, the layout of pixels 2010 * is different, but the layout of the 32-byte sub-tiles within the 4k 2011 * tile is the same (8 sub-tiles across by 16 sub-tiles down, in 2012 * column-major order). In Y tiling, the sub-tiles are 16 bytes wide 2013 * and 2 rows high; in W tiling, they are 8 bytes wide and 4 rows high. 2014 * 2015 * Therefore, to account for the layout differences within the 32-byte 2016 * sub-tiles, we must expand the rectangle so the X coordinates of its 2017 * edges are multiples of 8 (the W sub-tile width), and its Y 2018 * coordinates of its edges are multiples of 4 (the W sub-tile height). 2019 * Then we need to scale the X and Y coordinates of the rectangle to 2020 * account for the differences in aspect ratio between the Y and W 2021 * sub-tiles. We need to modify the layer width and height similarly. 
2022 * 2023 * A correction needs to be applied when MSAA is in use: since 2024 * INTEL_MSAA_LAYOUT_IMS uses an interleaving pattern whose height is 4, 2025 * we need to align the Y coordinates to multiples of 8, so that when 2026 * they are divided by two they are still multiples of 4. 2027 * 2028 * Note: Since the x/y offset of the surface will be applied using the 2029 * SURFACE_STATE command packet, it will be invisible to the swizzling 2030 * code in the shader; therefore it needs to be in a multiple of the 2031 * 32-byte sub-tile size. Fortunately it is, since the sub-tile is 8 2032 * pixels wide and 4 pixels high (when viewed as a W-tiled stencil 2033 * buffer), and the miplevel alignment used for stencil buffers is 8 2034 * pixels horizontally and either 4 or 8 pixels vertically (see 2035 * intel_horizontal_texture_alignment_unit() and 2036 * intel_vertical_texture_alignment_unit()). 2037 * 2038 * Note: Also, since the SURFACE_STATE command packet can only apply 2039 * offsets that are multiples of 4 pixels horizontally and 2 pixels 2040 * vertically, it is important that the offsets will be multiples of 2041 * these sizes after they are converted into Y-tiled coordinates. 2042 * Fortunately they will be, since we know from above that the offsets 2043 * are a multiple of the 32-byte sub-tile size, and in Y-tiled 2044 * coordinates the sub-tile is 16 pixels wide and 2 pixels high. 2045 * 2046 * TODO: what if this makes the coordinates (or the texture size) too 2047 * large? 2048 */ 2049 const unsigned x_align = 8; 2050 const unsigned y_align = params->dst.surf.samples != 0 ? 
8 : 4; 2051 params->x0 = ROUND_DOWN_TO(params->x0, x_align) * 2; 2052 params->y0 = ROUND_DOWN_TO(params->y0, y_align) / 2; 2053 params->x1 = ALIGN(params->x1, x_align) * 2; 2054 params->y1 = ALIGN(params->y1, y_align) / 2; 2055 2056 /* Retile the surface to Y-tiled */ 2057 blorp_surf_retile_w_to_y(batch->blorp->isl_dev, ¶ms->dst); 2058 2059 key->dst_tiled_w = true; 2060 key->use_kill = true; 2061 key->need_dst_offset = true; 2062 2063 if (params->dst.surf.samples > 1) { 2064 /* If the destination surface is a W-tiled multisampled stencil 2065 * buffer that we're mapping as Y tiled, then we need to arrange for 2066 * the WM program to run once per sample rather than once per pixel, 2067 * because the memory layout of related samples doesn't match between 2068 * W and Y tiling. 2069 */ 2070 key->persample_msaa_dispatch = true; 2071 } 2072 } 2073 2074 if (devinfo->ver < 8 && params->src.surf.tiling == ISL_TILING_W) { 2075 /* On Haswell and earlier, we have to fake W-tiled sources as Y-tiled. 2076 * Broadwell adds support for sampling from stencil. 2077 * 2078 * See the comments above concerning x/y offset alignment for the 2079 * destination surface. 2080 * 2081 * TODO: what if this makes the texture size too large? 2082 */ 2083 blorp_surf_retile_w_to_y(batch->blorp->isl_dev, ¶ms->src); 2084 2085 key->src_tiled_w = true; 2086 key->need_src_offset = true; 2087 } 2088 2089 /* tex_samples and rt_samples are the sample counts that are set up in 2090 * SURFACE_STATE. 2091 */ 2092 key->tex_samples = params->src.surf.samples; 2093 key->rt_samples = params->dst.surf.samples; 2094 2095 /* tex_layout and rt_layout indicate the MSAA layout the GPU pipeline will 2096 * use to access the source and destination surfaces. 
2097 */ 2098 key->tex_layout = params->src.surf.msaa_layout; 2099 key->rt_layout = params->dst.surf.msaa_layout; 2100 2101 if (params->src.surf.samples > 0 && params->dst.surf.samples > 1) { 2102 /* We are blitting from a multisample buffer to a multisample buffer, so 2103 * we must preserve samples within a pixel. This means we have to 2104 * arrange for the WM program to run once per sample rather than once 2105 * per pixel. 2106 */ 2107 key->persample_msaa_dispatch = true; 2108 } 2109 2110 params->num_samples = params->dst.surf.samples; 2111 2112 if ((key->filter == BLORP_FILTER_AVERAGE || 2113 key->filter == BLORP_FILTER_BILINEAR) && 2114 batch->blorp->isl_dev->info->ver <= 6) { 2115 /* Gfx4-5 don't support non-normalized texture coordinates */ 2116 key->src_coords_normalized = true; 2117 params->wm_inputs.src_inv_size[0] = 2118 1.0f / u_minify(params->src.surf.logical_level0_px.width, 2119 params->src.view.base_level); 2120 params->wm_inputs.src_inv_size[1] = 2121 1.0f / u_minify(params->src.surf.logical_level0_px.height, 2122 params->src.view.base_level); 2123 } 2124 2125 if (isl_format_get_layout(params->dst.view.format)->bpb % 3 == 0) { 2126 /* We can't render to RGB formats natively because they aren't a 2127 * power-of-two size. Instead, we fake them by using a red format 2128 * with the same channel type and size and emitting shader code to 2129 * only write one channel at a time. 
2130 */ 2131 params->x0 *= 3; 2132 params->x1 *= 3; 2133 2134 /* If it happens to be sRGB, we need to force a conversion */ 2135 if (params->dst.view.format == ISL_FORMAT_R8G8B8_UNORM_SRGB) 2136 key->dst_format = ISL_FORMAT_R8G8B8_UNORM_SRGB; 2137 2138 surf_fake_rgb_with_red(batch->blorp->isl_dev, ¶ms->dst); 2139 2140 key->dst_rgb = true; 2141 key->need_dst_offset = true; 2142 } else if (isl_format_is_rgbx(params->dst.view.format)) { 2143 /* We can handle RGBX formats easily enough by treating them as RGBA */ 2144 params->dst.view.format = 2145 isl_format_rgbx_to_rgba(params->dst.view.format); 2146 } else if (params->dst.view.format == ISL_FORMAT_R24_UNORM_X8_TYPELESS && 2147 key->dst_usage != ISL_SURF_USAGE_DEPTH_BIT) { 2148 key->dst_format = params->dst.view.format; 2149 params->dst.view.format = ISL_FORMAT_R32_UINT; 2150 } else if (params->dst.view.format == ISL_FORMAT_A4B4G4R4_UNORM) { 2151 params->dst.view.swizzle = 2152 isl_swizzle_compose(params->dst.view.swizzle, 2153 ISL_SWIZZLE(ALPHA, RED, GREEN, BLUE)); 2154 params->dst.view.format = ISL_FORMAT_B4G4R4A4_UNORM; 2155 } else if (params->dst.view.format == ISL_FORMAT_L8_UNORM_SRGB) { 2156 key->dst_format = params->dst.view.format; 2157 params->dst.view.format = ISL_FORMAT_R8_UNORM; 2158 } else if (params->dst.view.format == ISL_FORMAT_R9G9B9E5_SHAREDEXP) { 2159 key->dst_format = params->dst.view.format; 2160 params->dst.view.format = ISL_FORMAT_R32_UINT; 2161 } 2162 2163 if (devinfo->verx10 <= 70 && 2164 !isl_swizzle_is_identity(params->src.view.swizzle)) { 2165 key->src_swizzle = params->src.view.swizzle; 2166 params->src.view.swizzle = ISL_SWIZZLE_IDENTITY; 2167 } else { 2168 key->src_swizzle = ISL_SWIZZLE_IDENTITY; 2169 } 2170 2171 if (!isl_swizzle_supports_rendering(devinfo, params->dst.view.swizzle)) { 2172 key->dst_swizzle = params->dst.view.swizzle; 2173 params->dst.view.swizzle = ISL_SWIZZLE_IDENTITY; 2174 } else { 2175 key->dst_swizzle = ISL_SWIZZLE_IDENTITY; 2176 } 2177 2178 if 
(params->src.tile_x_sa || params->src.tile_y_sa) { 2179 assert(key->need_src_offset); 2180 surf_get_intratile_offset_px(¶ms->src, 2181 ¶ms->wm_inputs.src_offset.x, 2182 ¶ms->wm_inputs.src_offset.y); 2183 } 2184 2185 if (params->dst.tile_x_sa || params->dst.tile_y_sa) { 2186 assert(key->need_dst_offset); 2187 surf_get_intratile_offset_px(¶ms->dst, 2188 ¶ms->wm_inputs.dst_offset.x, 2189 ¶ms->wm_inputs.dst_offset.y); 2190 params->x0 += params->wm_inputs.dst_offset.x; 2191 params->y0 += params->wm_inputs.dst_offset.y; 2192 params->x1 += params->wm_inputs.dst_offset.x; 2193 params->y1 += params->wm_inputs.dst_offset.y; 2194 } 2195 2196 /* For some texture types, we need to pass the layer through the sampler. */ 2197 params->wm_inputs.src_z = params->src.z_offset; 2198 2199 const bool compute = 2200 key->base.shader_pipeline == BLORP_SHADER_PIPELINE_COMPUTE; 2201 if (compute) { 2202 key->local_y = blorp_get_cs_local_y(params); 2203 2204 unsigned workgroup_width = 16 / key->local_y; 2205 unsigned workgroup_height = key->local_y; 2206 2207 /* If the rectangle being drawn isn't an exact multiple of the 2208 * workgroup size, we'll get extra invocations that should not 2209 * perform blits. We need to set use_kill to bounds check and 2210 * prevent those invocations from blitting. 
2211 */ 2212 if ((params->x0 % workgroup_width) != 0 || 2213 (params->x1 % workgroup_width) != 0 || 2214 (params->y0 % workgroup_height) != 0 || 2215 (params->y1 % workgroup_height) != 0) 2216 key->use_kill = true; 2217 } 2218 2219 if (compute) { 2220 if (!brw_blorp_get_blit_kernel_cs(batch, params, key)) 2221 return 0; 2222 } else { 2223 if (!brw_blorp_get_blit_kernel_fs(batch, params, key)) 2224 return 0; 2225 2226 if (!blorp_ensure_sf_program(batch, params)) 2227 return 0; 2228 } 2229 2230 unsigned result = 0; 2231 unsigned max_src_surface_size = get_max_surface_size(devinfo, ¶ms->src); 2232 if (params->src.surf.logical_level0_px.width > max_src_surface_size) 2233 result |= BLIT_SRC_WIDTH_SHRINK; 2234 if (params->src.surf.logical_level0_px.height > max_src_surface_size) 2235 result |= BLIT_SRC_HEIGHT_SHRINK; 2236 2237 unsigned max_dst_surface_size = get_max_surface_size(devinfo, ¶ms->dst); 2238 if (params->dst.surf.logical_level0_px.width > max_dst_surface_size) 2239 result |= BLIT_DST_WIDTH_SHRINK; 2240 if (params->dst.surf.logical_level0_px.height > max_dst_surface_size) 2241 result |= BLIT_DST_HEIGHT_SHRINK; 2242 2243 if (result == 0) { 2244 if (key->dst_usage == ISL_SURF_USAGE_DEPTH_BIT) { 2245 params->depth = params->dst; 2246 memset(¶ms->dst, 0, sizeof(params->dst)); 2247 } else if (key->dst_usage == ISL_SURF_USAGE_STENCIL_BIT) { 2248 params->stencil = params->dst; 2249 params->stencil_mask = 0xff; 2250 memset(¶ms->dst, 0, sizeof(params->dst)); 2251 } 2252 2253 batch->blorp->exec(batch, params); 2254 } 2255 2256 return result; 2257} 2258 2259/* Adjust split blit source coordinates for the current destination 2260 * coordinates. 2261 */ 2262static void 2263adjust_split_source_coords(const struct blt_axis *orig, 2264 struct blt_axis *split_coords, 2265 double scale) 2266{ 2267 /* When scale is greater than 0, then we are growing from the start, so 2268 * src0 uses delta0, and src1 uses delta1. 
When scale is less than 0, the 2269 * source range shrinks from the end. In that case src0 is adjusted by 2270 * delta1, and src1 is adjusted by delta0. 2271 */ 2272 double delta0 = scale * (split_coords->dst0 - orig->dst0); 2273 double delta1 = scale * (split_coords->dst1 - orig->dst1); 2274 split_coords->src0 = orig->src0 + (scale >= 0.0 ? delta0 : delta1); 2275 split_coords->src1 = orig->src1 + (scale >= 0.0 ? delta1 : delta0); 2276} 2277 2278static struct isl_extent2d 2279get_px_size_sa(const struct isl_surf *surf) 2280{ 2281 static const struct isl_extent2d one_to_one = { .w = 1, .h = 1 }; 2282 2283 if (surf->msaa_layout != ISL_MSAA_LAYOUT_INTERLEAVED) 2284 return one_to_one; 2285 else 2286 return isl_get_interleaved_msaa_px_size_sa(surf->samples); 2287} 2288 2289static void 2290shrink_surface_params(const struct isl_device *dev, 2291 struct brw_blorp_surface_info *info, 2292 double *x0, double *x1, double *y0, double *y1) 2293{ 2294 uint64_t offset_B; 2295 uint32_t x_offset_sa, y_offset_sa, size; 2296 struct isl_extent2d px_size_sa; 2297 int adjust; 2298 2299 blorp_surf_convert_to_single_slice(dev, info); 2300 2301 px_size_sa = get_px_size_sa(&info->surf); 2302 2303 /* Because this gets called after we lower compressed images, the tile 2304 * offsets may be non-zero and we need to incorporate them in our 2305 * calculations. 
2306 */ 2307 x_offset_sa = (uint32_t)*x0 * px_size_sa.w + info->tile_x_sa; 2308 y_offset_sa = (uint32_t)*y0 * px_size_sa.h + info->tile_y_sa; 2309 uint32_t tile_z_sa, tile_a; 2310 isl_tiling_get_intratile_offset_sa(info->surf.tiling, info->surf.dim, 2311 info->surf.msaa_layout, 2312 info->surf.format, info->surf.samples, 2313 info->surf.row_pitch_B, 2314 info->surf.array_pitch_el_rows, 2315 x_offset_sa, y_offset_sa, 0, 0, 2316 &offset_B, 2317 &info->tile_x_sa, &info->tile_y_sa, 2318 &tile_z_sa, &tile_a); 2319 assert(tile_z_sa == 0 && tile_a == 0); 2320 2321 info->addr.offset += offset_B; 2322 2323 adjust = (int)info->tile_x_sa / px_size_sa.w - (int)*x0; 2324 *x0 += adjust; 2325 *x1 += adjust; 2326 info->tile_x_sa = 0; 2327 2328 adjust = (int)info->tile_y_sa / px_size_sa.h - (int)*y0; 2329 *y0 += adjust; 2330 *y1 += adjust; 2331 info->tile_y_sa = 0; 2332 2333 size = MIN2((uint32_t)ceil(*x1), info->surf.logical_level0_px.width); 2334 info->surf.logical_level0_px.width = size; 2335 info->surf.phys_level0_sa.width = size * px_size_sa.w; 2336 2337 size = MIN2((uint32_t)ceil(*y1), info->surf.logical_level0_px.height); 2338 info->surf.logical_level0_px.height = size; 2339 info->surf.phys_level0_sa.height = size * px_size_sa.h; 2340} 2341 2342static void 2343do_blorp_blit(struct blorp_batch *batch, 2344 const struct blorp_params *orig_params, 2345 struct brw_blorp_blit_prog_key *key, 2346 const struct blt_coords *orig) 2347{ 2348 struct blorp_params params; 2349 struct blt_coords blit_coords; 2350 struct blt_coords split_coords = *orig; 2351 double w = orig->x.dst1 - orig->x.dst0; 2352 double h = orig->y.dst1 - orig->y.dst0; 2353 double x_scale = (orig->x.src1 - orig->x.src0) / w; 2354 double y_scale = (orig->y.src1 - orig->y.src0) / h; 2355 if (orig->x.mirror) 2356 x_scale = -x_scale; 2357 if (orig->y.mirror) 2358 y_scale = -y_scale; 2359 2360 enum blit_shrink_status shrink = BLIT_NO_SHRINK; 2361 if (split_blorp_blit_debug) { 2362 if 
(can_shrink_surface(&orig_params->src)) 2363 shrink |= BLIT_SRC_WIDTH_SHRINK | BLIT_SRC_HEIGHT_SHRINK; 2364 if (can_shrink_surface(&orig_params->dst)) 2365 shrink |= BLIT_DST_WIDTH_SHRINK | BLIT_DST_HEIGHT_SHRINK; 2366 } 2367 2368 bool x_done, y_done; 2369 do { 2370 params = *orig_params; 2371 blit_coords = split_coords; 2372 2373 if (shrink & (BLIT_SRC_WIDTH_SHRINK | BLIT_SRC_HEIGHT_SHRINK)) { 2374 shrink_surface_params(batch->blorp->isl_dev, ¶ms.src, 2375 &blit_coords.x.src0, &blit_coords.x.src1, 2376 &blit_coords.y.src0, &blit_coords.y.src1); 2377 key->need_src_offset = false; 2378 } 2379 2380 if (shrink & (BLIT_DST_WIDTH_SHRINK | BLIT_DST_HEIGHT_SHRINK)) { 2381 shrink_surface_params(batch->blorp->isl_dev, ¶ms.dst, 2382 &blit_coords.x.dst0, &blit_coords.x.dst1, 2383 &blit_coords.y.dst0, &blit_coords.y.dst1); 2384 key->need_dst_offset = false; 2385 } 2386 2387 enum blit_shrink_status result = 2388 try_blorp_blit(batch, ¶ms, key, &blit_coords); 2389 2390 if (result & (BLIT_SRC_WIDTH_SHRINK | BLIT_SRC_HEIGHT_SHRINK)) 2391 assert(can_shrink_surface(&orig_params->src)); 2392 2393 if (result & (BLIT_DST_WIDTH_SHRINK | BLIT_DST_HEIGHT_SHRINK)) 2394 assert(can_shrink_surface(&orig_params->dst)); 2395 2396 if (result & (BLIT_SRC_WIDTH_SHRINK | BLIT_DST_WIDTH_SHRINK)) { 2397 w /= 2.0; 2398 assert(w >= 1.0); 2399 split_coords.x.dst1 = MIN2(split_coords.x.dst0 + w, orig->x.dst1); 2400 adjust_split_source_coords(&orig->x, &split_coords.x, x_scale); 2401 } 2402 if (result & (BLIT_SRC_HEIGHT_SHRINK | BLIT_DST_HEIGHT_SHRINK)) { 2403 h /= 2.0; 2404 assert(h >= 1.0); 2405 split_coords.y.dst1 = MIN2(split_coords.y.dst0 + h, orig->y.dst1); 2406 adjust_split_source_coords(&orig->y, &split_coords.y, y_scale); 2407 } 2408 2409 if (result) { 2410 /* We may get less bits set on result than we had already, so make 2411 * sure we remember all the ways in which a resize is required. 
2412 */ 2413 shrink |= result; 2414 continue; 2415 } 2416 2417 y_done = (orig->y.dst1 - split_coords.y.dst1 < 0.5); 2418 x_done = y_done && (orig->x.dst1 - split_coords.x.dst1 < 0.5); 2419 if (x_done) { 2420 break; 2421 } else if (y_done) { 2422 split_coords.x.dst0 += w; 2423 split_coords.x.dst1 = MIN2(split_coords.x.dst0 + w, orig->x.dst1); 2424 split_coords.y.dst0 = orig->y.dst0; 2425 split_coords.y.dst1 = MIN2(split_coords.y.dst0 + h, orig->y.dst1); 2426 adjust_split_source_coords(&orig->x, &split_coords.x, x_scale); 2427 } else { 2428 split_coords.y.dst0 += h; 2429 split_coords.y.dst1 = MIN2(split_coords.y.dst0 + h, orig->y.dst1); 2430 adjust_split_source_coords(&orig->y, &split_coords.y, y_scale); 2431 } 2432 } while (true); 2433} 2434 2435bool 2436blorp_blit_supports_compute(struct blorp_context *blorp, 2437 const struct isl_surf *src_surf, 2438 const struct isl_surf *dst_surf, 2439 enum isl_aux_usage dst_aux_usage) 2440{ 2441 /* Our compiler doesn't currently support typed image writes with MSAA. 2442 * Also, our BLORP compute shaders don't handle multisampling cases. 
2443 */ 2444 if (dst_surf->samples > 1 || src_surf->samples > 1) 2445 return false; 2446 2447 if (blorp->isl_dev->info->ver >= 12) { 2448 return dst_aux_usage == ISL_AUX_USAGE_GFX12_CCS_E || 2449 dst_aux_usage == ISL_AUX_USAGE_CCS_E || 2450 dst_aux_usage == ISL_AUX_USAGE_NONE; 2451 } else if (blorp->isl_dev->info->ver >= 7) { 2452 return dst_aux_usage == ISL_AUX_USAGE_NONE; 2453 } else { 2454 /* No compute shader support */ 2455 return false; 2456 } 2457} 2458 2459static bool 2460blitter_supports_aux(const struct intel_device_info *devinfo, 2461 enum isl_aux_usage aux_usage) 2462{ 2463 switch (aux_usage) { 2464 case ISL_AUX_USAGE_NONE: 2465 return true; 2466 case ISL_AUX_USAGE_CCS_E: 2467 case ISL_AUX_USAGE_GFX12_CCS_E: 2468 return devinfo->verx10 >= 125; 2469 default: 2470 return false; 2471 } 2472} 2473 2474bool 2475blorp_copy_supports_blitter(struct blorp_context *blorp, 2476 const struct isl_surf *src_surf, 2477 const struct isl_surf *dst_surf, 2478 enum isl_aux_usage src_aux_usage, 2479 enum isl_aux_usage dst_aux_usage) 2480{ 2481 const struct intel_device_info *devinfo = blorp->isl_dev->info; 2482 2483 if (devinfo->ver < 12) 2484 return false; 2485 2486 if (dst_surf->samples > 1 || src_surf->samples > 1) 2487 return false; 2488 2489 if (!blitter_supports_aux(devinfo, dst_aux_usage)) 2490 return false; 2491 2492 if (!blitter_supports_aux(devinfo, src_aux_usage)) 2493 return false; 2494 2495 const struct isl_format_layout *fmtl = 2496 isl_format_get_layout(dst_surf->format); 2497 2498 if (fmtl->bpb == 96) { 2499 /* XY_BLOCK_COPY_BLT mentions it doesn't support clear colors for 96bpp 2500 * formats, but none of them support CCS anyway, so it's a moot point. 2501 */ 2502 assert(src_aux_usage == ISL_AUX_USAGE_NONE); 2503 assert(dst_aux_usage == ISL_AUX_USAGE_NONE); 2504 2505 /* We can only support linear mode for 96bpp. 
*/ 2506 if (src_surf->tiling != ISL_TILING_LINEAR || 2507 dst_surf->tiling != ISL_TILING_LINEAR) 2508 return false; 2509 } 2510 2511 return true; 2512} 2513 2514void 2515blorp_blit(struct blorp_batch *batch, 2516 const struct blorp_surf *src_surf, 2517 unsigned src_level, float src_layer, 2518 enum isl_format src_format, struct isl_swizzle src_swizzle, 2519 const struct blorp_surf *dst_surf, 2520 unsigned dst_level, unsigned dst_layer, 2521 enum isl_format dst_format, struct isl_swizzle dst_swizzle, 2522 float src_x0, float src_y0, 2523 float src_x1, float src_y1, 2524 float dst_x0, float dst_y0, 2525 float dst_x1, float dst_y1, 2526 enum blorp_filter filter, 2527 bool mirror_x, bool mirror_y) 2528{ 2529 struct blorp_params params; 2530 blorp_params_init(¶ms); 2531 params.snapshot_type = INTEL_SNAPSHOT_BLIT; 2532 const bool compute = batch->flags & BLORP_BATCH_USE_COMPUTE; 2533 if (compute) { 2534 assert(blorp_blit_supports_compute(batch->blorp, 2535 src_surf->surf, dst_surf->surf, 2536 dst_surf->aux_usage)); 2537 } 2538 2539 /* We cannot handle combined depth and stencil. 
*/ 2540 if (src_surf->surf->usage & ISL_SURF_USAGE_STENCIL_BIT) 2541 assert(src_surf->surf->format == ISL_FORMAT_R8_UINT); 2542 if (dst_surf->surf->usage & ISL_SURF_USAGE_STENCIL_BIT) 2543 assert(dst_surf->surf->format == ISL_FORMAT_R8_UINT); 2544 2545 if (dst_surf->surf->usage & ISL_SURF_USAGE_STENCIL_BIT) { 2546 assert(src_surf->surf->usage & ISL_SURF_USAGE_STENCIL_BIT); 2547 /* Prior to Broadwell, we can't render to R8_UINT */ 2548 if (batch->blorp->isl_dev->info->ver < 8) { 2549 src_format = ISL_FORMAT_R8_UNORM; 2550 dst_format = ISL_FORMAT_R8_UNORM; 2551 } 2552 } 2553 2554 brw_blorp_surface_info_init(batch, ¶ms.src, src_surf, src_level, 2555 src_layer, src_format, false); 2556 brw_blorp_surface_info_init(batch, ¶ms.dst, dst_surf, dst_level, 2557 dst_layer, dst_format, true); 2558 2559 params.src.view.swizzle = src_swizzle; 2560 params.dst.view.swizzle = dst_swizzle; 2561 2562 const struct isl_format_layout *src_fmtl = 2563 isl_format_get_layout(params.src.view.format); 2564 2565 struct brw_blorp_blit_prog_key key = { 2566 .base = BRW_BLORP_BASE_KEY_INIT(BLORP_SHADER_TYPE_BLIT), 2567 .base.shader_pipeline = compute ? BLORP_SHADER_PIPELINE_COMPUTE : 2568 BLORP_SHADER_PIPELINE_RENDER, 2569 .filter = filter, 2570 .sint32_to_uint = src_fmtl->channels.r.bits == 32 && 2571 isl_format_has_sint_channel(params.src.view.format) && 2572 isl_format_has_uint_channel(params.dst.view.format), 2573 .uint32_to_sint = src_fmtl->channels.r.bits == 32 && 2574 isl_format_has_uint_channel(params.src.view.format) && 2575 isl_format_has_sint_channel(params.dst.view.format), 2576 }; 2577 2578 params.shader_type = key.base.shader_type; 2579 params.shader_pipeline = key.base.shader_pipeline; 2580 2581 /* Scaling factors used for bilinear filtering in multisample scaled 2582 * blits. 
2583 */ 2584 if (params.src.surf.samples == 16) 2585 key.x_scale = 4.0f; 2586 else 2587 key.x_scale = 2.0f; 2588 key.y_scale = params.src.surf.samples / key.x_scale; 2589 2590 params.wm_inputs.rect_grid.x1 = 2591 u_minify(params.src.surf.logical_level0_px.width, src_level) * 2592 key.x_scale - 1.0f; 2593 params.wm_inputs.rect_grid.y1 = 2594 u_minify(params.src.surf.logical_level0_px.height, src_level) * 2595 key.y_scale - 1.0f; 2596 2597 struct blt_coords coords = { 2598 .x = { 2599 .src0 = src_x0, 2600 .src1 = src_x1, 2601 .dst0 = dst_x0, 2602 .dst1 = dst_x1, 2603 .mirror = mirror_x 2604 }, 2605 .y = { 2606 .src0 = src_y0, 2607 .src1 = src_y1, 2608 .dst0 = dst_y0, 2609 .dst1 = dst_y1, 2610 .mirror = mirror_y 2611 } 2612 }; 2613 2614 do_blorp_blit(batch, ¶ms, &key, &coords); 2615} 2616 2617static enum isl_format 2618get_copy_format_for_bpb(const struct isl_device *isl_dev, unsigned bpb) 2619{ 2620 /* The choice of UNORM and UINT formats is very intentional here. Most 2621 * of the time, we want to use a UINT format to avoid any rounding error 2622 * in the blit. For stencil blits, R8_UINT is required by the hardware. 2623 * (It's the only format allowed in conjunction with W-tiling.) Also we 2624 * intentionally use the 4-channel formats whenever we can. This is so 2625 * that, when we do a RGB <-> RGBX copy, the two formats will line up 2626 * even though one of them is 3/4 the size of the other. The choice of 2627 * UNORM vs. UINT is also very intentional because we don't have 8 or 2628 * 16-bit RGB UINT formats until Sky Lake so we have to use UNORM there. 2629 * Fortunately, the only time we should ever use two different formats in 2630 * the table below is for RGB -> RGBA blits and so we will never have any 2631 * UNORM/UINT mismatch. 
2632 */ 2633 if (ISL_GFX_VER(isl_dev) >= 9) { 2634 switch (bpb) { 2635 case 8: return ISL_FORMAT_R8_UINT; 2636 case 16: return ISL_FORMAT_R8G8_UINT; 2637 case 24: return ISL_FORMAT_R8G8B8_UINT; 2638 case 32: return ISL_FORMAT_R8G8B8A8_UINT; 2639 case 48: return ISL_FORMAT_R16G16B16_UINT; 2640 case 64: return ISL_FORMAT_R16G16B16A16_UINT; 2641 case 96: return ISL_FORMAT_R32G32B32_UINT; 2642 case 128:return ISL_FORMAT_R32G32B32A32_UINT; 2643 default: 2644 unreachable("Unknown format bpb"); 2645 } 2646 } else { 2647 switch (bpb) { 2648 case 8: return ISL_FORMAT_R8_UINT; 2649 case 16: return ISL_FORMAT_R8G8_UINT; 2650 case 24: return ISL_FORMAT_R8G8B8_UNORM; 2651 case 32: return ISL_FORMAT_R8G8B8A8_UNORM; 2652 case 48: return ISL_FORMAT_R16G16B16_UNORM; 2653 case 64: return ISL_FORMAT_R16G16B16A16_UNORM; 2654 case 96: return ISL_FORMAT_R32G32B32_UINT; 2655 case 128:return ISL_FORMAT_R32G32B32A32_UINT; 2656 default: 2657 unreachable("Unknown format bpb"); 2658 } 2659 } 2660} 2661 2662/** Returns a UINT format that is CCS-compatible with the given format 2663 * 2664 * The PRM's say absolutely nothing about how render compression works. The 2665 * only thing they provide is a list of formats on which it is and is not 2666 * supported. Empirical testing indicates that the compression is only based 2667 * on the bit-layout of the format and the channel encoding doesn't matter. 2668 * So, while texture views don't work in general, you can create a view as 2669 * long as the bit-layout of the formats are the same. 2670 * 2671 * Fortunately, for every render compression capable format, the UINT format 2672 * with the same bit layout also supports render compression. This means that 2673 * we only need to handle UINT formats for copy operations. In order to do 2674 * copies between formats with different bit layouts, we attach both with a 2675 * UINT format and use bit_cast_color() to generate code to do the bit-cast 2676 * operation between the two bit layouts. 
2677 */ 2678static enum isl_format 2679get_ccs_compatible_copy_format(const struct isl_format_layout *fmtl) 2680{ 2681 switch (fmtl->format) { 2682 case ISL_FORMAT_R32G32B32A32_FLOAT: 2683 case ISL_FORMAT_R32G32B32A32_SINT: 2684 case ISL_FORMAT_R32G32B32A32_UINT: 2685 case ISL_FORMAT_R32G32B32A32_UNORM: 2686 case ISL_FORMAT_R32G32B32A32_SNORM: 2687 case ISL_FORMAT_R32G32B32X32_FLOAT: 2688 return ISL_FORMAT_R32G32B32A32_UINT; 2689 2690 case ISL_FORMAT_R16G16B16A16_UNORM: 2691 case ISL_FORMAT_R16G16B16A16_SNORM: 2692 case ISL_FORMAT_R16G16B16A16_SINT: 2693 case ISL_FORMAT_R16G16B16A16_UINT: 2694 case ISL_FORMAT_R16G16B16A16_FLOAT: 2695 case ISL_FORMAT_R16G16B16X16_UNORM: 2696 case ISL_FORMAT_R16G16B16X16_FLOAT: 2697 return ISL_FORMAT_R16G16B16A16_UINT; 2698 2699 case ISL_FORMAT_R32G32_FLOAT: 2700 case ISL_FORMAT_R32G32_SINT: 2701 case ISL_FORMAT_R32G32_UINT: 2702 case ISL_FORMAT_R32G32_UNORM: 2703 case ISL_FORMAT_R32G32_SNORM: 2704 return ISL_FORMAT_R32G32_UINT; 2705 2706 case ISL_FORMAT_B8G8R8A8_UNORM: 2707 case ISL_FORMAT_B8G8R8A8_UNORM_SRGB: 2708 case ISL_FORMAT_R8G8B8A8_UNORM: 2709 case ISL_FORMAT_R8G8B8A8_UNORM_SRGB: 2710 case ISL_FORMAT_R8G8B8A8_SNORM: 2711 case ISL_FORMAT_R8G8B8A8_SINT: 2712 case ISL_FORMAT_R8G8B8A8_UINT: 2713 case ISL_FORMAT_B8G8R8X8_UNORM: 2714 case ISL_FORMAT_B8G8R8X8_UNORM_SRGB: 2715 case ISL_FORMAT_R8G8B8X8_UNORM: 2716 case ISL_FORMAT_R8G8B8X8_UNORM_SRGB: 2717 return ISL_FORMAT_R8G8B8A8_UINT; 2718 2719 case ISL_FORMAT_R16G16_UNORM: 2720 case ISL_FORMAT_R16G16_SNORM: 2721 case ISL_FORMAT_R16G16_SINT: 2722 case ISL_FORMAT_R16G16_UINT: 2723 case ISL_FORMAT_R16G16_FLOAT: 2724 return ISL_FORMAT_R16G16_UINT; 2725 2726 case ISL_FORMAT_R32_SINT: 2727 case ISL_FORMAT_R32_UINT: 2728 case ISL_FORMAT_R32_FLOAT: 2729 case ISL_FORMAT_R32_UNORM: 2730 case ISL_FORMAT_R32_SNORM: 2731 return ISL_FORMAT_R32_UINT; 2732 2733 case ISL_FORMAT_B10G10R10A2_UNORM: 2734 case ISL_FORMAT_B10G10R10A2_UNORM_SRGB: 2735 case ISL_FORMAT_R10G10B10A2_UNORM: 2736 case 
ISL_FORMAT_R10G10B10A2_UNORM_SRGB: 2737 case ISL_FORMAT_R10G10B10_FLOAT_A2_UNORM: 2738 case ISL_FORMAT_R10G10B10A2_UINT: 2739 return ISL_FORMAT_R10G10B10A2_UINT; 2740 2741 case ISL_FORMAT_R16_UNORM: 2742 case ISL_FORMAT_R16_SNORM: 2743 case ISL_FORMAT_R16_SINT: 2744 case ISL_FORMAT_R16_UINT: 2745 case ISL_FORMAT_R16_FLOAT: 2746 return ISL_FORMAT_R16_UINT; 2747 2748 case ISL_FORMAT_R8G8_UNORM: 2749 case ISL_FORMAT_R8G8_SNORM: 2750 case ISL_FORMAT_R8G8_SINT: 2751 case ISL_FORMAT_R8G8_UINT: 2752 return ISL_FORMAT_R8G8_UINT; 2753 2754 case ISL_FORMAT_B5G5R5X1_UNORM: 2755 case ISL_FORMAT_B5G5R5X1_UNORM_SRGB: 2756 case ISL_FORMAT_B5G5R5A1_UNORM: 2757 case ISL_FORMAT_B5G5R5A1_UNORM_SRGB: 2758 return ISL_FORMAT_B5G5R5A1_UNORM; 2759 2760 case ISL_FORMAT_A4B4G4R4_UNORM: 2761 case ISL_FORMAT_B4G4R4A4_UNORM: 2762 case ISL_FORMAT_B4G4R4A4_UNORM_SRGB: 2763 return ISL_FORMAT_B4G4R4A4_UNORM; 2764 2765 case ISL_FORMAT_B5G6R5_UNORM: 2766 case ISL_FORMAT_B5G6R5_UNORM_SRGB: 2767 return ISL_FORMAT_B5G6R5_UNORM; 2768 2769 case ISL_FORMAT_A1B5G5R5_UNORM: 2770 return ISL_FORMAT_A1B5G5R5_UNORM; 2771 2772 case ISL_FORMAT_A8_UNORM: 2773 case ISL_FORMAT_R8_UNORM: 2774 case ISL_FORMAT_R8_SNORM: 2775 case ISL_FORMAT_R8_SINT: 2776 case ISL_FORMAT_R8_UINT: 2777 return ISL_FORMAT_R8_UINT; 2778 2779 default: 2780 unreachable("Not a compressible format"); 2781 } 2782} 2783 2784void 2785blorp_surf_convert_to_uncompressed(const struct isl_device *isl_dev, 2786 struct brw_blorp_surface_info *info, 2787 uint32_t *x, uint32_t *y, 2788 uint32_t *width, uint32_t *height) 2789{ 2790 const struct isl_format_layout *fmtl = 2791 isl_format_get_layout(info->surf.format); 2792 2793 assert(fmtl->bw > 1 || fmtl->bh > 1); 2794 2795 /* This should be the first modification made to the surface */ 2796 assert(info->tile_x_sa == 0 && info->tile_y_sa == 0); 2797 2798 if (width && height) { 2799 ASSERTED const uint32_t level_width = 2800 u_minify(info->surf.logical_level0_px.width, info->view.base_level); 2801 ASSERTED 
const uint32_t level_height = 2802 u_minify(info->surf.logical_level0_px.height, info->view.base_level); 2803 assert(*width % fmtl->bw == 0 || *x + *width == level_width); 2804 assert(*height % fmtl->bh == 0 || *y + *height == level_height); 2805 *width = DIV_ROUND_UP(*width, fmtl->bw); 2806 *height = DIV_ROUND_UP(*height, fmtl->bh); 2807 } 2808 2809 if (x && y) { 2810 assert(*x % fmtl->bw == 0); 2811 assert(*y % fmtl->bh == 0); 2812 *x /= fmtl->bw; 2813 *y /= fmtl->bh; 2814 } 2815 2816 /* We only want one level and slice */ 2817 info->view.levels = 1; 2818 info->view.array_len = 1; 2819 2820 if (info->surf.dim == ISL_SURF_DIM_3D) { 2821 /* Roll the Z offset into the image view */ 2822 info->view.base_array_layer += info->z_offset; 2823 info->z_offset = 0; 2824 } 2825 2826 uint64_t offset_B; 2827 ASSERTED bool ok = 2828 isl_surf_get_uncompressed_surf(isl_dev, &info->surf, &info->view, 2829 &info->surf, &info->view, &offset_B, 2830 &info->tile_x_sa, &info->tile_y_sa); 2831 assert(ok); 2832 info->addr.offset += offset_B; 2833 2834 /* BLORP doesn't use the actual intratile offsets. Instead, it needs the 2835 * surface to be a bit bigger and we offset the vertices instead. 
2836 */ 2837 assert(info->surf.dim == ISL_SURF_DIM_2D); 2838 assert(info->surf.logical_level0_px.array_len == 1); 2839 info->surf.logical_level0_px.w += info->tile_x_sa; 2840 info->surf.logical_level0_px.h += info->tile_y_sa; 2841 info->surf.phys_level0_sa.w += info->tile_x_sa; 2842 info->surf.phys_level0_sa.h += info->tile_y_sa; 2843} 2844 2845bool 2846blorp_copy_supports_compute(struct blorp_context *blorp, 2847 const struct isl_surf *src_surf, 2848 const struct isl_surf *dst_surf, 2849 enum isl_aux_usage dst_aux_usage) 2850{ 2851 return blorp_blit_supports_compute(blorp, src_surf, dst_surf, dst_aux_usage); 2852} 2853 2854void 2855blorp_copy(struct blorp_batch *batch, 2856 const struct blorp_surf *src_surf, 2857 unsigned src_level, unsigned src_layer, 2858 const struct blorp_surf *dst_surf, 2859 unsigned dst_level, unsigned dst_layer, 2860 uint32_t src_x, uint32_t src_y, 2861 uint32_t dst_x, uint32_t dst_y, 2862 uint32_t src_width, uint32_t src_height) 2863{ 2864 const struct isl_device *isl_dev = batch->blorp->isl_dev; 2865 const struct intel_device_info *devinfo = isl_dev->info; 2866 struct blorp_params params; 2867 2868 if (src_width == 0 || src_height == 0) 2869 return; 2870 2871 blorp_params_init(¶ms); 2872 params.snapshot_type = INTEL_SNAPSHOT_COPY; 2873 2874 const bool compute = batch->flags & BLORP_BATCH_USE_COMPUTE; 2875 if (compute) { 2876 assert(blorp_copy_supports_compute(batch->blorp, 2877 src_surf->surf, dst_surf->surf, 2878 dst_surf->aux_usage)); 2879 } else if (batch->flags & BLORP_BATCH_USE_BLITTER) { 2880 assert(blorp_copy_supports_blitter(batch->blorp, 2881 src_surf->surf, dst_surf->surf, 2882 src_surf->aux_usage, 2883 dst_surf->aux_usage)); 2884 } 2885 2886 brw_blorp_surface_info_init(batch, ¶ms.src, src_surf, src_level, 2887 src_layer, ISL_FORMAT_UNSUPPORTED, false); 2888 brw_blorp_surface_info_init(batch, ¶ms.dst, dst_surf, dst_level, 2889 dst_layer, ISL_FORMAT_UNSUPPORTED, true); 2890 2891 struct brw_blorp_blit_prog_key key = { 2892 .base = 
BRW_BLORP_BASE_KEY_INIT(BLORP_SHADER_TYPE_COPY), 2893 .base.shader_pipeline = compute ? BLORP_SHADER_PIPELINE_COMPUTE : 2894 BLORP_SHADER_PIPELINE_RENDER, 2895 .filter = BLORP_FILTER_NONE, 2896 .need_src_offset = src_surf->tile_x_sa || src_surf->tile_y_sa, 2897 .need_dst_offset = dst_surf->tile_x_sa || dst_surf->tile_y_sa, 2898 }; 2899 2900 params.shader_type = key.base.shader_type; 2901 params.shader_pipeline = key.base.shader_pipeline; 2902 2903 const struct isl_format_layout *src_fmtl = 2904 isl_format_get_layout(params.src.surf.format); 2905 const struct isl_format_layout *dst_fmtl = 2906 isl_format_get_layout(params.dst.surf.format); 2907 2908 assert(params.src.aux_usage == ISL_AUX_USAGE_NONE || 2909 params.src.aux_usage == ISL_AUX_USAGE_HIZ || 2910 params.src.aux_usage == ISL_AUX_USAGE_HIZ_CCS_WT || 2911 params.src.aux_usage == ISL_AUX_USAGE_MCS || 2912 params.src.aux_usage == ISL_AUX_USAGE_MCS_CCS || 2913 params.src.aux_usage == ISL_AUX_USAGE_CCS_E || 2914 params.src.aux_usage == ISL_AUX_USAGE_GFX12_CCS_E || 2915 params.src.aux_usage == ISL_AUX_USAGE_STC_CCS); 2916 2917 if (isl_aux_usage_has_hiz(params.src.aux_usage)) { 2918 /* In order to use HiZ, we have to use the real format for the source. 2919 * Depth <-> Color copies are not allowed. 2920 */ 2921 params.src.view.format = params.src.surf.format; 2922 params.dst.view.format = params.src.surf.format; 2923 } else if ((params.dst.surf.usage & ISL_SURF_USAGE_DEPTH_BIT) && 2924 isl_dev->info->ver >= 7) { 2925 /* On Gfx7 and higher, we use actual depth writes for blits into depth 2926 * buffers so we need the real format. 
2927 */ 2928 params.src.view.format = params.dst.surf.format; 2929 params.dst.view.format = params.dst.surf.format; 2930 } else if (params.dst.aux_usage == ISL_AUX_USAGE_CCS_E || 2931 params.dst.aux_usage == ISL_AUX_USAGE_GFX12_CCS_E) { 2932 params.dst.view.format = get_ccs_compatible_copy_format(dst_fmtl); 2933 if (params.src.aux_usage == ISL_AUX_USAGE_CCS_E || 2934 params.src.aux_usage == ISL_AUX_USAGE_GFX12_CCS_E) { 2935 params.src.view.format = get_ccs_compatible_copy_format(src_fmtl); 2936 } else if (src_fmtl->bpb == dst_fmtl->bpb) { 2937 params.src.view.format = params.dst.view.format; 2938 } else { 2939 params.src.view.format = 2940 get_copy_format_for_bpb(isl_dev, src_fmtl->bpb); 2941 } 2942 } else if (params.src.aux_usage == ISL_AUX_USAGE_CCS_E || 2943 params.src.aux_usage == ISL_AUX_USAGE_GFX12_CCS_E) { 2944 params.src.view.format = get_ccs_compatible_copy_format(src_fmtl); 2945 if (src_fmtl->bpb == dst_fmtl->bpb) { 2946 params.dst.view.format = params.src.view.format; 2947 } else { 2948 params.dst.view.format = 2949 get_copy_format_for_bpb(isl_dev, dst_fmtl->bpb); 2950 } 2951 } else { 2952 params.dst.view.format = get_copy_format_for_bpb(isl_dev, dst_fmtl->bpb); 2953 params.src.view.format = get_copy_format_for_bpb(isl_dev, src_fmtl->bpb); 2954 } 2955 2956 if (params.src.view.format != params.dst.view.format) { 2957 enum isl_format src_cast_format = params.src.view.format; 2958 enum isl_format dst_cast_format = params.dst.view.format; 2959 2960 /* The BLORP bitcast code gets confused by RGB formats. Just treat them 2961 * as RGBA and then everything will be happy. This is perfectly safe 2962 * because BLORP likes to treat things as if they have vec4 colors all 2963 * the time anyway. 
2964 */ 2965 if (isl_format_get_layout(src_cast_format)->bpb % 3 == 0) 2966 src_cast_format = isl_format_rgb_to_rgba(src_cast_format); 2967 if (isl_format_get_layout(dst_cast_format)->bpb % 3 == 0) 2968 dst_cast_format = isl_format_rgb_to_rgba(dst_cast_format); 2969 2970 if (src_cast_format != dst_cast_format) { 2971 key.format_bit_cast = true; 2972 key.src_format = src_cast_format; 2973 key.dst_format = dst_cast_format; 2974 } 2975 } 2976 2977 if (src_fmtl->bw > 1 || src_fmtl->bh > 1) { 2978 blorp_surf_convert_to_uncompressed(batch->blorp->isl_dev, ¶ms.src, 2979 &src_x, &src_y, 2980 &src_width, &src_height); 2981 key.need_src_offset = true; 2982 } 2983 2984 if (dst_fmtl->bw > 1 || dst_fmtl->bh > 1) { 2985 blorp_surf_convert_to_uncompressed(batch->blorp->isl_dev, ¶ms.dst, 2986 &dst_x, &dst_y, NULL, NULL); 2987 key.need_dst_offset = true; 2988 } 2989 2990 /* Once both surfaces are stompped to uncompressed as needed, the 2991 * destination size is the same as the source size. 2992 */ 2993 uint32_t dst_width = src_width; 2994 uint32_t dst_height = src_height; 2995 2996 if (batch->flags & BLORP_BATCH_USE_BLITTER) { 2997 if (devinfo->verx10 < 125) { 2998 blorp_surf_convert_to_single_slice(isl_dev, ¶ms.dst); 2999 blorp_surf_convert_to_single_slice(isl_dev, ¶ms.src); 3000 } 3001 3002 params.x0 = dst_x; 3003 params.x1 = dst_x + dst_width; 3004 params.y0 = dst_y; 3005 params.y1 = dst_y + dst_height; 3006 params.wm_inputs.coord_transform[0].offset = dst_x - (float)src_x; 3007 params.wm_inputs.coord_transform[1].offset = dst_y - (float)src_y; 3008 params.wm_inputs.coord_transform[0].multiplier = 1.0f; 3009 params.wm_inputs.coord_transform[1].multiplier = 1.0f; 3010 3011 batch->blorp->exec(batch, ¶ms); 3012 return; 3013 } 3014 3015 struct blt_coords coords = { 3016 .x = { 3017 .src0 = src_x, 3018 .src1 = src_x + src_width, 3019 .dst0 = dst_x, 3020 .dst1 = dst_x + dst_width, 3021 .mirror = false 3022 }, 3023 .y = { 3024 .src0 = src_y, 3025 .src1 = src_y + src_height, 3026 .dst0 
= dst_y, 3027 .dst1 = dst_y + dst_height, 3028 .mirror = false 3029 } 3030 }; 3031 3032 do_blorp_blit(batch, ¶ms, &key, &coords); 3033} 3034 3035static enum isl_format 3036isl_format_for_size(unsigned size_B) 3037{ 3038 switch (size_B) { 3039 case 1: return ISL_FORMAT_R8_UINT; 3040 case 2: return ISL_FORMAT_R8G8_UINT; 3041 case 4: return ISL_FORMAT_R8G8B8A8_UINT; 3042 case 8: return ISL_FORMAT_R16G16B16A16_UINT; 3043 case 16: return ISL_FORMAT_R32G32B32A32_UINT; 3044 default: 3045 unreachable("Not a power-of-two format size"); 3046 } 3047} 3048 3049/** 3050 * Returns the greatest common divisor of a and b that is a power of two. 3051 */ 3052static uint64_t 3053gcd_pow2_u64(uint64_t a, uint64_t b) 3054{ 3055 assert(a > 0 || b > 0); 3056 3057 unsigned a_log2 = ffsll(a) - 1; 3058 unsigned b_log2 = ffsll(b) - 1; 3059 3060 /* If either a or b is 0, then a_log2 or b_log2 till be UINT_MAX in which 3061 * case, the MIN2() will take the other one. If both are 0 then we will 3062 * hit the assert above. 3063 */ 3064 return 1 << MIN2(a_log2, b_log2); 3065} 3066 3067static void 3068do_buffer_copy(struct blorp_batch *batch, 3069 struct blorp_address *src, 3070 struct blorp_address *dst, 3071 int width, int height, int block_size) 3072{ 3073 /* The actual format we pick doesn't matter as blorp will throw it away. 3074 * The only thing that actually matters is the size. 
3075 */ 3076 enum isl_format format = isl_format_for_size(block_size); 3077 3078 UNUSED bool ok; 3079 struct isl_surf surf; 3080 ok = isl_surf_init(batch->blorp->isl_dev, &surf, 3081 .dim = ISL_SURF_DIM_2D, 3082 .format = format, 3083 .width = width, 3084 .height = height, 3085 .depth = 1, 3086 .levels = 1, 3087 .array_len = 1, 3088 .samples = 1, 3089 .row_pitch_B = width * block_size, 3090 .usage = ISL_SURF_USAGE_TEXTURE_BIT | 3091 ISL_SURF_USAGE_RENDER_TARGET_BIT, 3092 .tiling_flags = ISL_TILING_LINEAR_BIT); 3093 assert(ok); 3094 3095 struct blorp_surf src_blorp_surf = { 3096 .surf = &surf, 3097 .addr = *src, 3098 }; 3099 3100 struct blorp_surf dst_blorp_surf = { 3101 .surf = &surf, 3102 .addr = *dst, 3103 }; 3104 3105 blorp_copy(batch, &src_blorp_surf, 0, 0, &dst_blorp_surf, 0, 0, 3106 0, 0, 0, 0, width, height); 3107} 3108 3109void 3110blorp_buffer_copy(struct blorp_batch *batch, 3111 struct blorp_address src, 3112 struct blorp_address dst, 3113 uint64_t size) 3114{ 3115 const struct intel_device_info *devinfo = batch->blorp->isl_dev->info; 3116 uint64_t copy_size = size; 3117 3118 /* This is maximum possible width/height our HW can handle */ 3119 uint64_t max_surface_dim = 1 << (devinfo->ver >= 7 ? 14 : 13); 3120 3121 /* First, we compute the biggest format that can be used with the 3122 * given offsets and size. 
3123 */ 3124 int bs = 16; 3125 bs = gcd_pow2_u64(bs, src.offset); 3126 bs = gcd_pow2_u64(bs, dst.offset); 3127 bs = gcd_pow2_u64(bs, size); 3128 3129 /* First, we make a bunch of max-sized copies */ 3130 uint64_t max_copy_size = max_surface_dim * max_surface_dim * bs; 3131 while (copy_size >= max_copy_size) { 3132 do_buffer_copy(batch, &src, &dst, max_surface_dim, max_surface_dim, bs); 3133 copy_size -= max_copy_size; 3134 src.offset += max_copy_size; 3135 dst.offset += max_copy_size; 3136 } 3137 3138 /* Now make a max-width copy */ 3139 uint64_t height = copy_size / (max_surface_dim * bs); 3140 assert(height < max_surface_dim); 3141 if (height != 0) { 3142 uint64_t rect_copy_size = height * max_surface_dim * bs; 3143 do_buffer_copy(batch, &src, &dst, max_surface_dim, height, bs); 3144 copy_size -= rect_copy_size; 3145 src.offset += rect_copy_size; 3146 dst.offset += rect_copy_size; 3147 } 3148 3149 /* Finally, make a small copy to finish it off */ 3150 if (copy_size != 0) { 3151 do_buffer_copy(batch, &src, &dst, copy_size / bs, 1, bs); 3152 } 3153} 3154