1bf215546Sopenharmony_ci/* 2bf215546Sopenharmony_ci * Copyright © 2016-2018 Broadcom 3bf215546Sopenharmony_ci * 4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a 5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"), 6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation 7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the 9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions: 10bf215546Sopenharmony_ci * 11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next 12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the 13bf215546Sopenharmony_ci * Software. 14bf215546Sopenharmony_ci * 15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21bf215546Sopenharmony_ci * IN THE SOFTWARE. 22bf215546Sopenharmony_ci */ 23bf215546Sopenharmony_ci 24bf215546Sopenharmony_ci#include "v3d_compiler.h" 25bf215546Sopenharmony_ci 26bf215546Sopenharmony_ci/* We don't do any address packing. */ 27bf215546Sopenharmony_ci#define __gen_user_data void 28bf215546Sopenharmony_ci#define __gen_address_type uint32_t 29bf215546Sopenharmony_ci#define __gen_address_offset(reloc) (*reloc) 30bf215546Sopenharmony_ci#define __gen_emit_reloc(cl, reloc) 31bf215546Sopenharmony_ci#include "cle/v3d_packet_v41_pack.h" 32bf215546Sopenharmony_ci 33bf215546Sopenharmony_cistatic inline void 34bf215546Sopenharmony_civir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val) 35bf215546Sopenharmony_ci{ 36bf215546Sopenharmony_ci /* XXX perf: We should figure out how to merge ALU operations 37bf215546Sopenharmony_ci * producing the val with this MOV, when possible. 38bf215546Sopenharmony_ci */ 39bf215546Sopenharmony_ci vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val); 40bf215546Sopenharmony_ci} 41bf215546Sopenharmony_ci 42bf215546Sopenharmony_cistatic inline void 43bf215546Sopenharmony_civir_TMU_WRITE_or_count(struct v3d_compile *c, 44bf215546Sopenharmony_ci enum v3d_qpu_waddr waddr, 45bf215546Sopenharmony_ci struct qreg val, 46bf215546Sopenharmony_ci uint32_t *tmu_writes) 47bf215546Sopenharmony_ci{ 48bf215546Sopenharmony_ci if (tmu_writes) 49bf215546Sopenharmony_ci (*tmu_writes)++; 50bf215546Sopenharmony_ci else 51bf215546Sopenharmony_ci vir_TMU_WRITE(c, waddr, val); 52bf215546Sopenharmony_ci} 53bf215546Sopenharmony_ci 54bf215546Sopenharmony_cistatic void 55bf215546Sopenharmony_civir_WRTMUC(struct v3d_compile *c, enum quniform_contents contents, uint32_t data) 56bf215546Sopenharmony_ci{ 57bf215546Sopenharmony_ci struct qinst *inst = vir_NOP(c); 58bf215546Sopenharmony_ci inst->qpu.sig.wrtmuc = true; 59bf215546Sopenharmony_ci inst->uniform = vir_get_uniform_index(c, contents, data); 60bf215546Sopenharmony_ci} 61bf215546Sopenharmony_ci 62bf215546Sopenharmony_cistatic const struct V3D41_TMU_CONFIG_PARAMETER_1 p1_unpacked_default = { 63bf215546Sopenharmony_ci .per_pixel_mask_enable = true, 64bf215546Sopenharmony_ci}; 65bf215546Sopenharmony_ci 66bf215546Sopenharmony_cistatic const struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked_default = { 67bf215546Sopenharmony_ci .op = V3D_TMU_OP_REGULAR, 68bf215546Sopenharmony_ci}; 69bf215546Sopenharmony_ci 70bf215546Sopenharmony_ci/** 71bf215546Sopenharmony_ci * If 'tmu_writes' is not NULL, then it just counts required register writes, 72bf215546Sopenharmony_ci * otherwise, it emits the actual register writes. 73bf215546Sopenharmony_ci * 74bf215546Sopenharmony_ci * It is important to notice that emitting register writes for the current 75bf215546Sopenharmony_ci * TMU operation may trigger a TMU flush, since it is possible that any 76bf215546Sopenharmony_ci * of the inputs required for the register writes is the result of a pending 77bf215546Sopenharmony_ci * TMU operation. If that happens we need to make sure that it doesn't happen 78bf215546Sopenharmony_ci * in the middle of the TMU register writes for the current TMU operation, 79bf215546Sopenharmony_ci * which is why we always call ntq_get_src() even if we are only interested in 80bf215546Sopenharmony_ci * register write counts. 81bf215546Sopenharmony_ci */ 82bf215546Sopenharmony_cistatic void 83bf215546Sopenharmony_cihandle_tex_src(struct v3d_compile *c, 84bf215546Sopenharmony_ci nir_tex_instr *instr, 85bf215546Sopenharmony_ci unsigned src_idx, 86bf215546Sopenharmony_ci unsigned non_array_components, 87bf215546Sopenharmony_ci struct V3D41_TMU_CONFIG_PARAMETER_2 *p2_unpacked, 88bf215546Sopenharmony_ci struct qreg *s_out, 89bf215546Sopenharmony_ci unsigned *tmu_writes) 90bf215546Sopenharmony_ci{ 91bf215546Sopenharmony_ci /* Either we are calling this just to count required TMU writes, or we 92bf215546Sopenharmony_ci * are calling this to emit the actual TMU writes. 93bf215546Sopenharmony_ci */ 94bf215546Sopenharmony_ci assert(tmu_writes || (s_out && p2_unpacked)); 95bf215546Sopenharmony_ci 96bf215546Sopenharmony_ci struct qreg s; 97bf215546Sopenharmony_ci switch (instr->src[src_idx].src_type) { 98bf215546Sopenharmony_ci case nir_tex_src_coord: 99bf215546Sopenharmony_ci /* S triggers the lookup, so save it for the end. */ 100bf215546Sopenharmony_ci s = ntq_get_src(c, instr->src[src_idx].src, 0); 101bf215546Sopenharmony_ci if (tmu_writes) 102bf215546Sopenharmony_ci (*tmu_writes)++; 103bf215546Sopenharmony_ci else 104bf215546Sopenharmony_ci *s_out = s; 105bf215546Sopenharmony_ci 106bf215546Sopenharmony_ci if (non_array_components > 1) { 107bf215546Sopenharmony_ci struct qreg src = 108bf215546Sopenharmony_ci ntq_get_src(c, instr->src[src_idx].src, 1); 109bf215546Sopenharmony_ci vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUT, src, 110bf215546Sopenharmony_ci tmu_writes); 111bf215546Sopenharmony_ci } 112bf215546Sopenharmony_ci 113bf215546Sopenharmony_ci if (non_array_components > 2) { 114bf215546Sopenharmony_ci struct qreg src = 115bf215546Sopenharmony_ci ntq_get_src(c, instr->src[src_idx].src, 2); 116bf215546Sopenharmony_ci vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUR, src, 117bf215546Sopenharmony_ci tmu_writes); 118bf215546Sopenharmony_ci } 119bf215546Sopenharmony_ci 120bf215546Sopenharmony_ci if (instr->is_array) { 121bf215546Sopenharmony_ci struct qreg src = 122bf215546Sopenharmony_ci ntq_get_src(c, instr->src[src_idx].src, 123bf215546Sopenharmony_ci instr->coord_components - 1); 124bf215546Sopenharmony_ci vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUI, src, 125bf215546Sopenharmony_ci tmu_writes); 126bf215546Sopenharmony_ci } 127bf215546Sopenharmony_ci break; 128bf215546Sopenharmony_ci 129bf215546Sopenharmony_ci case nir_tex_src_bias: { 130bf215546Sopenharmony_ci struct qreg src = ntq_get_src(c, instr->src[src_idx].src, 0); 131bf215546Sopenharmony_ci vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUB, src, tmu_writes); 132bf215546Sopenharmony_ci break; 133bf215546Sopenharmony_ci } 134bf215546Sopenharmony_ci 135bf215546Sopenharmony_ci case nir_tex_src_lod: { 136bf215546Sopenharmony_ci struct qreg src = ntq_get_src(c, instr->src[src_idx].src, 0); 137bf215546Sopenharmony_ci vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUB, src, tmu_writes); 138bf215546Sopenharmony_ci if (!tmu_writes) { 139bf215546Sopenharmony_ci /* With texel fetch automatic LOD is already disabled, 140bf215546Sopenharmony_ci * and disable_autolod must not be enabled. For 141bf215546Sopenharmony_ci * non-cubes we can use the register TMUSLOD, that 142bf215546Sopenharmony_ci * implicitly sets disable_autolod. 143bf215546Sopenharmony_ci */ 144bf215546Sopenharmony_ci assert(p2_unpacked); 145bf215546Sopenharmony_ci if (instr->op != nir_texop_txf && 146bf215546Sopenharmony_ci instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { 147bf215546Sopenharmony_ci p2_unpacked->disable_autolod = true; 148bf215546Sopenharmony_ci } 149bf215546Sopenharmony_ci } 150bf215546Sopenharmony_ci break; 151bf215546Sopenharmony_ci } 152bf215546Sopenharmony_ci 153bf215546Sopenharmony_ci case nir_tex_src_comparator: { 154bf215546Sopenharmony_ci struct qreg src = ntq_get_src(c, instr->src[src_idx].src, 0); 155bf215546Sopenharmony_ci vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUDREF, src, tmu_writes); 156bf215546Sopenharmony_ci break; 157bf215546Sopenharmony_ci } 158bf215546Sopenharmony_ci 159bf215546Sopenharmony_ci case nir_tex_src_offset: { 160bf215546Sopenharmony_ci bool is_const_offset = nir_src_is_const(instr->src[src_idx].src); 161bf215546Sopenharmony_ci if (is_const_offset) { 162bf215546Sopenharmony_ci if (!tmu_writes) { 163bf215546Sopenharmony_ci p2_unpacked->offset_s = 164bf215546Sopenharmony_ci nir_src_comp_as_int(instr->src[src_idx].src, 0); 165bf215546Sopenharmony_ci if (non_array_components >= 2) 166bf215546Sopenharmony_ci p2_unpacked->offset_t = 167bf215546Sopenharmony_ci nir_src_comp_as_int(instr->src[src_idx].src, 1); 168bf215546Sopenharmony_ci if (non_array_components >= 3) 169bf215546Sopenharmony_ci p2_unpacked->offset_r = 170bf215546Sopenharmony_ci nir_src_comp_as_int(instr->src[src_idx].src, 2); 171bf215546Sopenharmony_ci } 172bf215546Sopenharmony_ci } else { 173bf215546Sopenharmony_ci struct qreg src_0 = 174bf215546Sopenharmony_ci ntq_get_src(c, instr->src[src_idx].src, 0); 175bf215546Sopenharmony_ci struct qreg src_1 = 176bf215546Sopenharmony_ci ntq_get_src(c, instr->src[src_idx].src, 1); 177bf215546Sopenharmony_ci if (!tmu_writes) { 178bf215546Sopenharmony_ci struct qreg mask = vir_uniform_ui(c, 0xf); 179bf215546Sopenharmony_ci struct qreg x, y, offset; 180bf215546Sopenharmony_ci 181bf215546Sopenharmony_ci x = vir_AND(c, src_0, mask); 182bf215546Sopenharmony_ci y = vir_AND(c, src_1, mask); 183bf215546Sopenharmony_ci offset = vir_OR(c, x, 184bf215546Sopenharmony_ci vir_SHL(c, y, vir_uniform_ui(c, 4))); 185bf215546Sopenharmony_ci 186bf215546Sopenharmony_ci vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUOFF, offset); 187bf215546Sopenharmony_ci } else { 188bf215546Sopenharmony_ci (*tmu_writes)++; 189bf215546Sopenharmony_ci } 190bf215546Sopenharmony_ci } 191bf215546Sopenharmony_ci break; 192bf215546Sopenharmony_ci } 193bf215546Sopenharmony_ci 194bf215546Sopenharmony_ci default: 195bf215546Sopenharmony_ci unreachable("unknown texture source"); 196bf215546Sopenharmony_ci } 197bf215546Sopenharmony_ci} 198bf215546Sopenharmony_ci 199bf215546Sopenharmony_cistatic void 200bf215546Sopenharmony_civir_tex_handle_srcs(struct v3d_compile *c, 201bf215546Sopenharmony_ci nir_tex_instr *instr, 202bf215546Sopenharmony_ci struct V3D41_TMU_CONFIG_PARAMETER_2 *p2_unpacked, 203bf215546Sopenharmony_ci struct qreg *s, 204bf215546Sopenharmony_ci unsigned *tmu_writes) 205bf215546Sopenharmony_ci{ 206bf215546Sopenharmony_ci unsigned non_array_components = instr->op != nir_texop_lod ? 207bf215546Sopenharmony_ci instr->coord_components - instr->is_array : 208bf215546Sopenharmony_ci instr->coord_components; 209bf215546Sopenharmony_ci 210bf215546Sopenharmony_ci for (unsigned i = 0; i < instr->num_srcs; i++) { 211bf215546Sopenharmony_ci handle_tex_src(c, instr, i, non_array_components, 212bf215546Sopenharmony_ci p2_unpacked, s, tmu_writes); 213bf215546Sopenharmony_ci } 214bf215546Sopenharmony_ci} 215bf215546Sopenharmony_ci 216bf215546Sopenharmony_cistatic unsigned 217bf215546Sopenharmony_ciget_required_tex_tmu_writes(struct v3d_compile *c, nir_tex_instr *instr) 218bf215546Sopenharmony_ci{ 219bf215546Sopenharmony_ci unsigned tmu_writes = 0; 220bf215546Sopenharmony_ci vir_tex_handle_srcs(c, instr, NULL, NULL, &tmu_writes); 221bf215546Sopenharmony_ci return tmu_writes; 222bf215546Sopenharmony_ci} 223bf215546Sopenharmony_ci 224bf215546Sopenharmony_civoid 225bf215546Sopenharmony_civ3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) 226bf215546Sopenharmony_ci{ 227bf215546Sopenharmony_ci assert(instr->op != nir_texop_lod || c->devinfo->ver >= 42); 228bf215546Sopenharmony_ci 229bf215546Sopenharmony_ci unsigned texture_idx = instr->texture_index; 230bf215546Sopenharmony_ci unsigned sampler_idx = instr->sampler_index; 231bf215546Sopenharmony_ci 232bf215546Sopenharmony_ci struct V3D41_TMU_CONFIG_PARAMETER_0 p0_unpacked = { 233bf215546Sopenharmony_ci }; 234bf215546Sopenharmony_ci 235bf215546Sopenharmony_ci /* Limit the number of channels returned to both how many the NIR 236bf215546Sopenharmony_ci * instruction writes and how many the instruction could produce. 237bf215546Sopenharmony_ci */ 238bf215546Sopenharmony_ci p0_unpacked.return_words_of_texture_data = 239bf215546Sopenharmony_ci instr->dest.is_ssa ? 240bf215546Sopenharmony_ci nir_ssa_def_components_read(&instr->dest.ssa) : 241bf215546Sopenharmony_ci (1 << instr->dest.reg.reg->num_components) - 1; 242bf215546Sopenharmony_ci assert(p0_unpacked.return_words_of_texture_data != 0); 243bf215546Sopenharmony_ci 244bf215546Sopenharmony_ci struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked = { 245bf215546Sopenharmony_ci .op = V3D_TMU_OP_REGULAR, 246bf215546Sopenharmony_ci .gather_mode = instr->op == nir_texop_tg4, 247bf215546Sopenharmony_ci .gather_component = instr->component, 248bf215546Sopenharmony_ci .coefficient_mode = instr->op == nir_texop_txd, 249bf215546Sopenharmony_ci .disable_autolod = instr->op == nir_texop_tg4 250bf215546Sopenharmony_ci }; 251bf215546Sopenharmony_ci 252bf215546Sopenharmony_ci const unsigned tmu_writes = get_required_tex_tmu_writes(c, instr); 253bf215546Sopenharmony_ci 254bf215546Sopenharmony_ci /* The input FIFO has 16 slots across all threads so if we require 255bf215546Sopenharmony_ci * more than that we need to lower thread count. 256bf215546Sopenharmony_ci */ 257bf215546Sopenharmony_ci while (tmu_writes > 16 / c->threads) 258bf215546Sopenharmony_ci c->threads /= 2; 259bf215546Sopenharmony_ci 260bf215546Sopenharmony_ci /* If pipelining this TMU operation would overflow TMU fifos, we need 261bf215546Sopenharmony_ci * to flush any outstanding TMU operations. 262bf215546Sopenharmony_ci */ 263bf215546Sopenharmony_ci const unsigned dest_components = 264bf215546Sopenharmony_ci util_bitcount(p0_unpacked.return_words_of_texture_data); 265bf215546Sopenharmony_ci if (ntq_tmu_fifo_overflow(c, dest_components)) 266bf215546Sopenharmony_ci ntq_flush_tmu(c); 267bf215546Sopenharmony_ci 268bf215546Sopenharmony_ci /* Process tex sources emitting corresponding TMU writes */ 269bf215546Sopenharmony_ci struct qreg s = { }; 270bf215546Sopenharmony_ci vir_tex_handle_srcs(c, instr, &p2_unpacked, &s, NULL); 271bf215546Sopenharmony_ci 272bf215546Sopenharmony_ci uint32_t p0_packed; 273bf215546Sopenharmony_ci V3D41_TMU_CONFIG_PARAMETER_0_pack(NULL, 274bf215546Sopenharmony_ci (uint8_t *)&p0_packed, 275bf215546Sopenharmony_ci &p0_unpacked); 276bf215546Sopenharmony_ci 277bf215546Sopenharmony_ci uint32_t p2_packed; 278bf215546Sopenharmony_ci V3D41_TMU_CONFIG_PARAMETER_2_pack(NULL, 279bf215546Sopenharmony_ci (uint8_t *)&p2_packed, 280bf215546Sopenharmony_ci &p2_unpacked); 281bf215546Sopenharmony_ci 282bf215546Sopenharmony_ci /* We manually set the LOD Query bit (see 283bf215546Sopenharmony_ci * V3D42_TMU_CONFIG_PARAMETER_2) as right now is the only V42 specific 284bf215546Sopenharmony_ci * feature over V41 we are using 285bf215546Sopenharmony_ci */ 286bf215546Sopenharmony_ci if (instr->op == nir_texop_lod) 287bf215546Sopenharmony_ci p2_packed |= 1UL << 24; 288bf215546Sopenharmony_ci 289bf215546Sopenharmony_ci /* Load texture_idx number into the high bits of the texture address field, 290bf215546Sopenharmony_ci * which will be be used by the driver to decide which texture to put 291bf215546Sopenharmony_ci * in the actual address field. 292bf215546Sopenharmony_ci */ 293bf215546Sopenharmony_ci p0_packed |= texture_idx << 24; 294bf215546Sopenharmony_ci 295bf215546Sopenharmony_ci vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P0, p0_packed); 296bf215546Sopenharmony_ci 297bf215546Sopenharmony_ci /* Even if the texture operation doesn't need a sampler by 298bf215546Sopenharmony_ci * itself, we still need to add the sampler configuration 299bf215546Sopenharmony_ci * parameter if the output is 32 bit 300bf215546Sopenharmony_ci */ 301bf215546Sopenharmony_ci bool output_type_32_bit = 302bf215546Sopenharmony_ci c->key->sampler[sampler_idx].return_size == 32 && 303bf215546Sopenharmony_ci !instr->is_shadow; 304bf215546Sopenharmony_ci 305bf215546Sopenharmony_ci /* p1 is optional, but we can skip it only if p2 can be skipped too */ 306bf215546Sopenharmony_ci bool needs_p2_config = 307bf215546Sopenharmony_ci (instr->op == nir_texop_lod || 308bf215546Sopenharmony_ci memcmp(&p2_unpacked, &p2_unpacked_default, 309bf215546Sopenharmony_ci sizeof(p2_unpacked)) != 0); 310bf215546Sopenharmony_ci 311bf215546Sopenharmony_ci /* To handle the cases were we can't just use p1_unpacked_default */ 312bf215546Sopenharmony_ci bool non_default_p1_config = nir_tex_instr_need_sampler(instr) || 313bf215546Sopenharmony_ci output_type_32_bit; 314bf215546Sopenharmony_ci 315bf215546Sopenharmony_ci if (non_default_p1_config) { 316bf215546Sopenharmony_ci struct V3D41_TMU_CONFIG_PARAMETER_1 p1_unpacked = { 317bf215546Sopenharmony_ci .output_type_32_bit = output_type_32_bit, 318bf215546Sopenharmony_ci 319bf215546Sopenharmony_ci .unnormalized_coordinates = (instr->sampler_dim == 320bf215546Sopenharmony_ci GLSL_SAMPLER_DIM_RECT), 321bf215546Sopenharmony_ci }; 322bf215546Sopenharmony_ci 323bf215546Sopenharmony_ci /* Word enables can't ask for more channels than the 324bf215546Sopenharmony_ci * output type could provide (2 for f16, 4 for 325bf215546Sopenharmony_ci * 32-bit). 326bf215546Sopenharmony_ci */ 327bf215546Sopenharmony_ci assert(!p1_unpacked.output_type_32_bit || 328bf215546Sopenharmony_ci p0_unpacked.return_words_of_texture_data < (1 << 4)); 329bf215546Sopenharmony_ci assert(p1_unpacked.output_type_32_bit || 330bf215546Sopenharmony_ci p0_unpacked.return_words_of_texture_data < (1 << 2)); 331bf215546Sopenharmony_ci 332bf215546Sopenharmony_ci uint32_t p1_packed; 333bf215546Sopenharmony_ci V3D41_TMU_CONFIG_PARAMETER_1_pack(NULL, 334bf215546Sopenharmony_ci (uint8_t *)&p1_packed, 335bf215546Sopenharmony_ci &p1_unpacked); 336bf215546Sopenharmony_ci 337bf215546Sopenharmony_ci if (nir_tex_instr_need_sampler(instr)) { 338bf215546Sopenharmony_ci /* Load sampler_idx number into the high bits of the 339bf215546Sopenharmony_ci * sampler address field, which will be be used by the 340bf215546Sopenharmony_ci * driver to decide which sampler to put in the actual 341bf215546Sopenharmony_ci * address field. 342bf215546Sopenharmony_ci */ 343bf215546Sopenharmony_ci p1_packed |= sampler_idx << 24; 344bf215546Sopenharmony_ci 345bf215546Sopenharmony_ci vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P1, p1_packed); 346bf215546Sopenharmony_ci } else { 347bf215546Sopenharmony_ci /* In this case, we don't need to merge in any 348bf215546Sopenharmony_ci * sampler state from the API and can just use 349bf215546Sopenharmony_ci * our packed bits */ 350bf215546Sopenharmony_ci vir_WRTMUC(c, QUNIFORM_CONSTANT, p1_packed); 351bf215546Sopenharmony_ci } 352bf215546Sopenharmony_ci } else if (needs_p2_config) { 353bf215546Sopenharmony_ci /* Configuration parameters need to be set up in 354bf215546Sopenharmony_ci * order, and if P2 is needed, you need to set up P1 355bf215546Sopenharmony_ci * too even if sampler info is not needed by the 356bf215546Sopenharmony_ci * texture operation. But we can set up default info, 357bf215546Sopenharmony_ci * and avoid asking the driver for the sampler state 358bf215546Sopenharmony_ci * address 359bf215546Sopenharmony_ci */ 360bf215546Sopenharmony_ci uint32_t p1_packed_default; 361bf215546Sopenharmony_ci V3D41_TMU_CONFIG_PARAMETER_1_pack(NULL, 362bf215546Sopenharmony_ci (uint8_t *)&p1_packed_default, 363bf215546Sopenharmony_ci &p1_unpacked_default); 364bf215546Sopenharmony_ci vir_WRTMUC(c, QUNIFORM_CONSTANT, p1_packed_default); 365bf215546Sopenharmony_ci } 366bf215546Sopenharmony_ci 367bf215546Sopenharmony_ci if (needs_p2_config) 368bf215546Sopenharmony_ci vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed); 369bf215546Sopenharmony_ci 370bf215546Sopenharmony_ci /* Emit retiring TMU write */ 371bf215546Sopenharmony_ci if (instr->op == nir_texop_txf) { 372bf215546Sopenharmony_ci assert(instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE); 373bf215546Sopenharmony_ci vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s); 374bf215546Sopenharmony_ci } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { 375bf215546Sopenharmony_ci vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s); 376bf215546Sopenharmony_ci } else if (instr->op == nir_texop_txl) { 377bf215546Sopenharmony_ci vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s); 378bf215546Sopenharmony_ci } else { 379bf215546Sopenharmony_ci vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s); 380bf215546Sopenharmony_ci } 381bf215546Sopenharmony_ci 382bf215546Sopenharmony_ci ntq_add_pending_tmu_flush(c, &instr->dest, 383bf215546Sopenharmony_ci p0_unpacked.return_words_of_texture_data); 384bf215546Sopenharmony_ci} 385bf215546Sopenharmony_ci 386bf215546Sopenharmony_cistatic uint32_t 387bf215546Sopenharmony_civ3d40_image_load_store_tmu_op(nir_intrinsic_instr *instr) 388bf215546Sopenharmony_ci{ 389bf215546Sopenharmony_ci switch (instr->intrinsic) { 390bf215546Sopenharmony_ci case nir_intrinsic_image_load: 391bf215546Sopenharmony_ci case nir_intrinsic_image_store: 392bf215546Sopenharmony_ci return V3D_TMU_OP_REGULAR; 393bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_add: 394bf215546Sopenharmony_ci return v3d_get_op_for_atomic_add(instr, 3); 395bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_imin: 396bf215546Sopenharmony_ci return V3D_TMU_OP_WRITE_SMIN; 397bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_umin: 398bf215546Sopenharmony_ci return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; 399bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_imax: 400bf215546Sopenharmony_ci return V3D_TMU_OP_WRITE_SMAX; 401bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_umax: 402bf215546Sopenharmony_ci return V3D_TMU_OP_WRITE_UMAX; 403bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_and: 404bf215546Sopenharmony_ci return V3D_TMU_OP_WRITE_AND_READ_INC; 405bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_or: 406bf215546Sopenharmony_ci return V3D_TMU_OP_WRITE_OR_READ_DEC; 407bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_xor: 408bf215546Sopenharmony_ci return V3D_TMU_OP_WRITE_XOR_READ_NOT; 409bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_exchange: 410bf215546Sopenharmony_ci return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH; 411bf215546Sopenharmony_ci case nir_intrinsic_image_atomic_comp_swap: 412bf215546Sopenharmony_ci return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH; 413bf215546Sopenharmony_ci default: 414bf215546Sopenharmony_ci unreachable("unknown image intrinsic"); 415bf215546Sopenharmony_ci }; 416bf215546Sopenharmony_ci} 417bf215546Sopenharmony_ci 418bf215546Sopenharmony_ci/** 419bf215546Sopenharmony_ci * If 'tmu_writes' is not NULL, then it just counts required register writes, 420bf215546Sopenharmony_ci * otherwise, it emits the actual register writes. 421bf215546Sopenharmony_ci * 422bf215546Sopenharmony_ci * It is important to notice that emitting register writes for the current 423bf215546Sopenharmony_ci * TMU operation may trigger a TMU flush, since it is possible that any 424bf215546Sopenharmony_ci * of the inputs required for the register writes is the result of a pending 425bf215546Sopenharmony_ci * TMU operation. If that happens we need to make sure that it doesn't happen 426bf215546Sopenharmony_ci * in the middle of the TMU register writes for the current TMU operation, 427bf215546Sopenharmony_ci * which is why we always call ntq_get_src() even if we are only interested in 428bf215546Sopenharmony_ci * register write counts. 429bf215546Sopenharmony_ci */ 430bf215546Sopenharmony_cistatic void 431bf215546Sopenharmony_civir_image_emit_register_writes(struct v3d_compile *c, 432bf215546Sopenharmony_ci nir_intrinsic_instr *instr, 433bf215546Sopenharmony_ci bool atomic_add_replaced, 434bf215546Sopenharmony_ci uint32_t *tmu_writes) 435bf215546Sopenharmony_ci{ 436bf215546Sopenharmony_ci if (tmu_writes) 437bf215546Sopenharmony_ci *tmu_writes = 0; 438bf215546Sopenharmony_ci 439bf215546Sopenharmony_ci bool is_1d = false; 440bf215546Sopenharmony_ci switch (nir_intrinsic_image_dim(instr)) { 441bf215546Sopenharmony_ci case GLSL_SAMPLER_DIM_1D: 442bf215546Sopenharmony_ci is_1d = true; 443bf215546Sopenharmony_ci break; 444bf215546Sopenharmony_ci case GLSL_SAMPLER_DIM_BUF: 445bf215546Sopenharmony_ci break; 446bf215546Sopenharmony_ci case GLSL_SAMPLER_DIM_2D: 447bf215546Sopenharmony_ci case GLSL_SAMPLER_DIM_RECT: 448bf215546Sopenharmony_ci case GLSL_SAMPLER_DIM_CUBE: { 449bf215546Sopenharmony_ci struct qreg src = ntq_get_src(c, instr->src[1], 1); 450bf215546Sopenharmony_ci vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUT, src, tmu_writes); 451bf215546Sopenharmony_ci break; 452bf215546Sopenharmony_ci } 453bf215546Sopenharmony_ci case GLSL_SAMPLER_DIM_3D: { 454bf215546Sopenharmony_ci struct qreg src_1_1 = ntq_get_src(c, instr->src[1], 1); 455bf215546Sopenharmony_ci struct qreg src_1_2 = ntq_get_src(c, instr->src[1], 2); 456bf215546Sopenharmony_ci vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUT, src_1_1, tmu_writes); 457bf215546Sopenharmony_ci vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUR, src_1_2, tmu_writes); 458bf215546Sopenharmony_ci break; 459bf215546Sopenharmony_ci } 460bf215546Sopenharmony_ci default: 461bf215546Sopenharmony_ci unreachable("bad image sampler dim"); 462bf215546Sopenharmony_ci } 463bf215546Sopenharmony_ci 464bf215546Sopenharmony_ci /* In order to fetch on a cube map, we need to interpret it as 465bf215546Sopenharmony_ci * 2D arrays, where the third coord would be the face index. 466bf215546Sopenharmony_ci */ 467bf215546Sopenharmony_ci if (nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_CUBE || 468bf215546Sopenharmony_ci nir_intrinsic_image_array(instr)) { 469bf215546Sopenharmony_ci struct qreg src = ntq_get_src(c, instr->src[1], is_1d ? 1 : 2); 470bf215546Sopenharmony_ci vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUI, src, tmu_writes); 471bf215546Sopenharmony_ci } 472bf215546Sopenharmony_ci 473bf215546Sopenharmony_ci /* Emit the data writes for atomics or image store. */ 474bf215546Sopenharmony_ci if (instr->intrinsic != nir_intrinsic_image_load && 475bf215546Sopenharmony_ci !atomic_add_replaced) { 476bf215546Sopenharmony_ci for (int i = 0; i < nir_intrinsic_src_components(instr, 3); i++) { 477bf215546Sopenharmony_ci struct qreg src_3_i = ntq_get_src(c, instr->src[3], i); 478bf215546Sopenharmony_ci vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUD, src_3_i, 479bf215546Sopenharmony_ci tmu_writes); 480bf215546Sopenharmony_ci } 481bf215546Sopenharmony_ci 482bf215546Sopenharmony_ci /* Second atomic argument */ 483bf215546Sopenharmony_ci if (instr->intrinsic == nir_intrinsic_image_atomic_comp_swap) { 484bf215546Sopenharmony_ci struct qreg src_4_0 = ntq_get_src(c, instr->src[4], 0); 485bf215546Sopenharmony_ci vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUD, src_4_0, 486bf215546Sopenharmony_ci tmu_writes); 487bf215546Sopenharmony_ci } 488bf215546Sopenharmony_ci } 489bf215546Sopenharmony_ci 490bf215546Sopenharmony_ci struct qreg src_1_0 = ntq_get_src(c, instr->src[1], 0); 491bf215546Sopenharmony_ci if (!tmu_writes && vir_in_nonuniform_control_flow(c) && 492bf215546Sopenharmony_ci instr->intrinsic != nir_intrinsic_image_load) { 493bf215546Sopenharmony_ci vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), 494bf215546Sopenharmony_ci V3D_QPU_PF_PUSHZ); 495bf215546Sopenharmony_ci } 496bf215546Sopenharmony_ci 497bf215546Sopenharmony_ci vir_TMU_WRITE_or_count(c, V3D_QPU_WADDR_TMUSF, src_1_0, tmu_writes); 498bf215546Sopenharmony_ci 499bf215546Sopenharmony_ci if (!tmu_writes && vir_in_nonuniform_control_flow(c) && 500bf215546Sopenharmony_ci instr->intrinsic != nir_intrinsic_image_load) { 501bf215546Sopenharmony_ci struct qinst *last_inst = 502bf215546Sopenharmony_ci (struct qinst *)c->cur_block->instructions.prev; 503bf215546Sopenharmony_ci vir_set_cond(last_inst, V3D_QPU_COND_IFA); 504bf215546Sopenharmony_ci } 505bf215546Sopenharmony_ci} 506bf215546Sopenharmony_ci 507bf215546Sopenharmony_cistatic unsigned 508bf215546Sopenharmony_ciget_required_image_tmu_writes(struct v3d_compile *c, 509bf215546Sopenharmony_ci nir_intrinsic_instr *instr, 510bf215546Sopenharmony_ci bool atomic_add_replaced) 511bf215546Sopenharmony_ci{ 512bf215546Sopenharmony_ci unsigned tmu_writes; 513bf215546Sopenharmony_ci vir_image_emit_register_writes(c, instr, atomic_add_replaced, 514bf215546Sopenharmony_ci &tmu_writes); 515bf215546Sopenharmony_ci return tmu_writes; 516bf215546Sopenharmony_ci} 517bf215546Sopenharmony_ci 518bf215546Sopenharmony_civoid 519bf215546Sopenharmony_civ3d40_vir_emit_image_load_store(struct v3d_compile *c, 520bf215546Sopenharmony_ci nir_intrinsic_instr *instr) 521bf215546Sopenharmony_ci{ 522bf215546Sopenharmony_ci unsigned format = nir_intrinsic_format(instr); 523bf215546Sopenharmony_ci unsigned unit = nir_src_as_uint(instr->src[0]); 524bf215546Sopenharmony_ci 525bf215546Sopenharmony_ci struct V3D41_TMU_CONFIG_PARAMETER_0 p0_unpacked = { 526bf215546Sopenharmony_ci }; 527bf215546Sopenharmony_ci 528bf215546Sopenharmony_ci struct V3D41_TMU_CONFIG_PARAMETER_1 p1_unpacked = { 529bf215546Sopenharmony_ci .per_pixel_mask_enable = true, 530bf215546Sopenharmony_ci .output_type_32_bit = v3d_gl_format_is_return_32(format), 531bf215546Sopenharmony_ci }; 532bf215546Sopenharmony_ci 533bf215546Sopenharmony_ci struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked = { 0 }; 534bf215546Sopenharmony_ci 535bf215546Sopenharmony_ci /* Limit the number of channels returned to both how many the NIR 536bf215546Sopenharmony_ci * instruction writes and how many the instruction could produce. 537bf215546Sopenharmony_ci */ 538bf215546Sopenharmony_ci uint32_t instr_return_channels = nir_intrinsic_dest_components(instr); 539bf215546Sopenharmony_ci if (!p1_unpacked.output_type_32_bit) 540bf215546Sopenharmony_ci instr_return_channels = (instr_return_channels + 1) / 2; 541bf215546Sopenharmony_ci 542bf215546Sopenharmony_ci p0_unpacked.return_words_of_texture_data = 543bf215546Sopenharmony_ci (1 << instr_return_channels) - 1; 544bf215546Sopenharmony_ci 545bf215546Sopenharmony_ci p2_unpacked.op = v3d40_image_load_store_tmu_op(instr); 546bf215546Sopenharmony_ci 547bf215546Sopenharmony_ci /* If we were able to replace atomic_add for an inc/dec, then we 548bf215546Sopenharmony_ci * need/can to do things slightly different, like not loading the 549bf215546Sopenharmony_ci * amount to add/sub, as that is implicit. 550bf215546Sopenharmony_ci */ 551bf215546Sopenharmony_ci bool atomic_add_replaced = 552bf215546Sopenharmony_ci (instr->intrinsic == nir_intrinsic_image_atomic_add && 553bf215546Sopenharmony_ci (p2_unpacked.op == V3D_TMU_OP_WRITE_AND_READ_INC || 554bf215546Sopenharmony_ci p2_unpacked.op == V3D_TMU_OP_WRITE_OR_READ_DEC)); 555bf215546Sopenharmony_ci 556bf215546Sopenharmony_ci uint32_t p0_packed; 557bf215546Sopenharmony_ci V3D41_TMU_CONFIG_PARAMETER_0_pack(NULL, 558bf215546Sopenharmony_ci (uint8_t *)&p0_packed, 559bf215546Sopenharmony_ci &p0_unpacked); 560bf215546Sopenharmony_ci 561bf215546Sopenharmony_ci /* Load unit number into the high bits of the texture or sampler 562bf215546Sopenharmony_ci * address field, which will be be used by the driver to decide which 563bf215546Sopenharmony_ci * texture to put in the actual address field. 564bf215546Sopenharmony_ci */ 565bf215546Sopenharmony_ci p0_packed |= unit << 24; 566bf215546Sopenharmony_ci 567bf215546Sopenharmony_ci uint32_t p1_packed; 568bf215546Sopenharmony_ci V3D41_TMU_CONFIG_PARAMETER_1_pack(NULL, 569bf215546Sopenharmony_ci (uint8_t *)&p1_packed, 570bf215546Sopenharmony_ci &p1_unpacked); 571bf215546Sopenharmony_ci 572bf215546Sopenharmony_ci uint32_t p2_packed; 573bf215546Sopenharmony_ci V3D41_TMU_CONFIG_PARAMETER_2_pack(NULL, 574bf215546Sopenharmony_ci (uint8_t *)&p2_packed, 575bf215546Sopenharmony_ci &p2_unpacked); 576bf215546Sopenharmony_ci 577bf215546Sopenharmony_ci if (instr->intrinsic != nir_intrinsic_image_load) 578bf215546Sopenharmony_ci c->tmu_dirty_rcl = true; 579bf215546Sopenharmony_ci 580bf215546Sopenharmony_ci 581bf215546Sopenharmony_ci const uint32_t tmu_writes = 582bf215546Sopenharmony_ci get_required_image_tmu_writes(c, instr, atomic_add_replaced); 583bf215546Sopenharmony_ci 584bf215546Sopenharmony_ci /* The input FIFO has 16 slots across all threads so if we require 585bf215546Sopenharmony_ci * more than that we need to lower thread count. 586bf215546Sopenharmony_ci */ 587bf215546Sopenharmony_ci while (tmu_writes > 16 / c->threads) 588bf215546Sopenharmony_ci c->threads /= 2; 589bf215546Sopenharmony_ci 590bf215546Sopenharmony_ci /* If pipelining this TMU operation would overflow TMU fifos, we need 591bf215546Sopenharmony_ci * to flush any outstanding TMU operations. 592bf215546Sopenharmony_ci */ 593bf215546Sopenharmony_ci if (ntq_tmu_fifo_overflow(c, instr_return_channels)) 594bf215546Sopenharmony_ci ntq_flush_tmu(c); 595bf215546Sopenharmony_ci 596bf215546Sopenharmony_ci vir_WRTMUC(c, QUNIFORM_IMAGE_TMU_CONFIG_P0, p0_packed); 597bf215546Sopenharmony_ci if (memcmp(&p1_unpacked, &p1_unpacked_default, sizeof(p1_unpacked))) 598bf215546Sopenharmony_ci vir_WRTMUC(c, QUNIFORM_CONSTANT, p1_packed); 599bf215546Sopenharmony_ci if (memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked))) 600bf215546Sopenharmony_ci vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed); 601bf215546Sopenharmony_ci 602bf215546Sopenharmony_ci vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL); 603bf215546Sopenharmony_ci 604bf215546Sopenharmony_ci ntq_add_pending_tmu_flush(c, &instr->dest, 605bf215546Sopenharmony_ci p0_unpacked.return_words_of_texture_data); 606bf215546Sopenharmony_ci} 607