1/* 2 * Copyright © 2016 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include "anv_private.h" 25 26#include "genxml/gen_macros.h" 27#include "genxml/genX_pack.h" 28 29#include "common/intel_l3_config.h" 30 31/** 32 * This file implements some lightweight memcpy/memset operations on the GPU 33 * using a vertex buffer and streamout. 34 */ 35 36/** 37 * Returns the greatest common divisor of a and b that is a power of two. 38 */ 39static uint64_t 40gcd_pow2_u64(uint64_t a, uint64_t b) 41{ 42 assert(a > 0 || b > 0); 43 44 unsigned a_log2 = ffsll(a) - 1; 45 unsigned b_log2 = ffsll(b) - 1; 46 47 /* If either a or b is 0, then a_log2 or b_log2 will be UINT_MAX in which 48 * case, the MIN2() will take the other one. If both are 0 then we will 49 * hit the assert above. 50 */ 51 return 1 << MIN2(a_log2, b_log2); 52} 53 54static void 55emit_common_so_memcpy(struct anv_batch *batch, struct anv_device *device, 56 const struct intel_l3_config *l3_config) 57{ 58#if GFX_VER >= 8 59 anv_batch_emit(batch, GENX(3DSTATE_VF_INSTANCING), vfi) { 60 vfi.InstancingEnable = false; 61 vfi.VertexElementIndex = 0; 62 } 63 anv_batch_emit(batch, GENX(3DSTATE_VF_SGVS), sgvs); 64#endif 65 66 /* Disable all shader stages */ 67 anv_batch_emit(batch, GENX(3DSTATE_VS), vs); 68 anv_batch_emit(batch, GENX(3DSTATE_HS), hs); 69 anv_batch_emit(batch, GENX(3DSTATE_TE), te); 70 anv_batch_emit(batch, GENX(3DSTATE_DS), DS); 71 anv_batch_emit(batch, GENX(3DSTATE_GS), gs); 72 anv_batch_emit(batch, GENX(3DSTATE_PS), gs); 73 74 anv_batch_emit(batch, GENX(3DSTATE_SBE), sbe) { 75 sbe.VertexURBEntryReadOffset = 1; 76 sbe.NumberofSFOutputAttributes = 1; 77 sbe.VertexURBEntryReadLength = 1; 78#if GFX_VER >= 8 79 sbe.ForceVertexURBEntryReadLength = true; 80 sbe.ForceVertexURBEntryReadOffset = true; 81#endif 82 83#if GFX_VER >= 9 84 for (unsigned i = 0; i < 32; i++) 85 sbe.AttributeActiveComponentFormat[i] = ACF_XYZW; 86#endif 87 } 88 89 /* Emit URB setup. We tell it that the VS is active because we want it to 90 * allocate space for the VS. Even though one isn't run, we need VUEs to 91 * store the data that VF is going to pass to SOL. 92 */ 93 const unsigned entry_size[4] = { DIV_ROUND_UP(32, 64), 1, 1, 1 }; 94 95 genX(emit_urb_setup)(device, batch, l3_config, 96 VK_SHADER_STAGE_VERTEX_BIT, entry_size, NULL); 97 98#if GFX_VER >= 12 99 /* Disable Primitive Replication. */ 100 anv_batch_emit(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr); 101#endif 102 103#if GFX_VER >= 8 104 anv_batch_emit(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) { 105 topo.PrimitiveTopologyType = _3DPRIM_POINTLIST; 106 } 107#endif 108 109 anv_batch_emit(batch, GENX(3DSTATE_VF_STATISTICS), vf) { 110 vf.StatisticsEnable = false; 111 } 112} 113 114static void 115emit_so_memcpy(struct anv_batch *batch, struct anv_device *device, 116 struct anv_address dst, struct anv_address src, 117 uint32_t size) 118{ 119 /* The maximum copy block size is 4 32-bit components at a time. */ 120 assert(size % 4 == 0); 121 unsigned bs = gcd_pow2_u64(16, size); 122 123 enum isl_format format; 124 switch (bs) { 125 case 4: format = ISL_FORMAT_R32_UINT; break; 126 case 8: format = ISL_FORMAT_R32G32_UINT; break; 127 case 16: format = ISL_FORMAT_R32G32B32A32_UINT; break; 128 default: 129 unreachable("Invalid size"); 130 } 131 132 uint32_t *dw; 133 dw = anv_batch_emitn(batch, 5, GENX(3DSTATE_VERTEX_BUFFERS)); 134 GENX(VERTEX_BUFFER_STATE_pack)(batch, dw + 1, 135 &(struct GENX(VERTEX_BUFFER_STATE)) { 136 .VertexBufferIndex = 32, /* Reserved for this */ 137 .AddressModifyEnable = true, 138 .BufferStartingAddress = src, 139 .BufferPitch = bs, 140 .MOCS = anv_mocs(device, src.bo, 0), 141#if GFX_VER >= 12 142 .L3BypassDisable = true, 143#endif 144#if (GFX_VER >= 8) 145 .BufferSize = size, 146#else 147 .EndAddress = anv_address_add(src, size - 1), 148#endif 149 }); 150 151 dw = anv_batch_emitn(batch, 3, GENX(3DSTATE_VERTEX_ELEMENTS)); 152 GENX(VERTEX_ELEMENT_STATE_pack)(batch, dw + 1, 153 &(struct GENX(VERTEX_ELEMENT_STATE)) { 154 .VertexBufferIndex = 32, 155 .Valid = true, 156 .SourceElementFormat = format, 157 .SourceElementOffset = 0, 158 .Component0Control = (bs >= 4) ? VFCOMP_STORE_SRC : VFCOMP_STORE_0, 159 .Component1Control = (bs >= 8) ? VFCOMP_STORE_SRC : VFCOMP_STORE_0, 160 .Component2Control = (bs >= 12) ? VFCOMP_STORE_SRC : VFCOMP_STORE_0, 161 .Component3Control = (bs >= 16) ? VFCOMP_STORE_SRC : VFCOMP_STORE_0, 162 }); 163 164 165 anv_batch_emit(batch, GENX(3DSTATE_SO_BUFFER), sob) { 166#if GFX_VER < 12 167 sob.SOBufferIndex = 0; 168#else 169 sob._3DCommandOpcode = 0; 170 sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD; 171#endif 172 sob.MOCS = anv_mocs(device, dst.bo, 0), 173 sob.SurfaceBaseAddress = dst; 174 175#if GFX_VER >= 8 176 sob.SOBufferEnable = true; 177 sob.SurfaceSize = size / 4 - 1; 178#else 179 sob.SurfacePitch = bs; 180 sob.SurfaceEndAddress = anv_address_add(dst, size); 181#endif 182 183#if GFX_VER >= 8 184 /* As SOL writes out data, it updates the SO_WRITE_OFFSET registers with 185 * the end position of the stream. We need to reset this value to 0 at 186 * the beginning of the run or else SOL will start at the offset from 187 * the previous draw. 188 */ 189 sob.StreamOffsetWriteEnable = true; 190 sob.StreamOffset = 0; 191#endif 192 } 193 194#if GFX_VER <= 7 195 /* The hardware can do this for us on BDW+ (see above) */ 196 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), load) { 197 load.RegisterOffset = GENX(SO_WRITE_OFFSET0_num); 198 load.DataDWord = 0; 199 } 200#endif 201 202 dw = anv_batch_emitn(batch, 5, GENX(3DSTATE_SO_DECL_LIST), 203 .StreamtoBufferSelects0 = (1 << 0), 204 .NumEntries0 = 1); 205 GENX(SO_DECL_ENTRY_pack)(batch, dw + 3, 206 &(struct GENX(SO_DECL_ENTRY)) { 207 .Stream0Decl = { 208 .OutputBufferSlot = 0, 209 .RegisterIndex = 0, 210 .ComponentMask = (1 << (bs / 4)) - 1, 211 }, 212 }); 213 214 anv_batch_emit(batch, GENX(3DSTATE_STREAMOUT), so) { 215 so.SOFunctionEnable = true; 216 so.RenderingDisable = true; 217 so.Stream0VertexReadOffset = 0; 218 so.Stream0VertexReadLength = DIV_ROUND_UP(32, 64); 219#if GFX_VER >= 8 220 so.Buffer0SurfacePitch = bs; 221#else 222 so.SOBufferEnable0 = true; 223#endif 224 } 225 226 anv_batch_emit(batch, GENX(3DPRIMITIVE), prim) { 227 prim.VertexAccessType = SEQUENTIAL; 228 prim.PrimitiveTopologyType = _3DPRIM_POINTLIST; 229 prim.VertexCountPerInstance = size / bs; 230 prim.StartVertexLocation = 0; 231 prim.InstanceCount = 1; 232 prim.StartInstanceLocation = 0; 233 prim.BaseVertexLocation = 0; 234 } 235} 236 237void 238genX(emit_so_memcpy_init)(struct anv_memcpy_state *state, 239 struct anv_device *device, 240 struct anv_batch *batch) 241{ 242 memset(state, 0, sizeof(*state)); 243 244 state->batch = batch; 245 state->device = device; 246 247 const struct intel_l3_config *cfg = intel_get_default_l3_config(&device->info); 248 genX(emit_l3_config)(batch, device, cfg); 249 250 anv_batch_emit(batch, GENX(PIPELINE_SELECT), ps) { 251#if GFX_VER >= 9 252 ps.MaskBits = GFX_VER >= 12 ? 0x13 : 3; 253 ps.MediaSamplerDOPClockGateEnable = GFX_VER >= 12; 254#endif 255 ps.PipelineSelection = _3D; 256 } 257 258 emit_common_so_memcpy(batch, device, cfg); 259} 260 261void 262genX(emit_so_memcpy_fini)(struct anv_memcpy_state *state) 263{ 264 genX(emit_apply_pipe_flushes)(state->batch, state->device, _3D, 265 ANV_PIPE_END_OF_PIPE_SYNC_BIT); 266 267 anv_batch_emit(state->batch, GENX(MI_BATCH_BUFFER_END), end); 268 269 if ((state->batch->next - state->batch->start) & 4) 270 anv_batch_emit(state->batch, GENX(MI_NOOP), noop); 271} 272 273void 274genX(emit_so_memcpy)(struct anv_memcpy_state *state, 275 struct anv_address dst, struct anv_address src, 276 uint32_t size) 277{ 278 if (GFX_VER >= 8 && GFX_VER <= 9 && 279 !anv_use_relocations(state->device->physical) && 280 anv_gfx8_9_vb_cache_range_needs_workaround(&state->vb_bound, 281 &state->vb_dirty, 282 src, size)) { 283 genX(emit_apply_pipe_flushes)(state->batch, state->device, _3D, 284 ANV_PIPE_CS_STALL_BIT | 285 ANV_PIPE_VF_CACHE_INVALIDATE_BIT); 286 memset(&state->vb_dirty, 0, sizeof(state->vb_dirty)); 287 } 288 289 emit_so_memcpy(state->batch, state->device, dst, src, size); 290} 291 292void 293genX(cmd_buffer_so_memcpy)(struct anv_cmd_buffer *cmd_buffer, 294 struct anv_address dst, struct anv_address src, 295 uint32_t size) 296{ 297 if (size == 0) 298 return; 299 300 if (!cmd_buffer->state.current_l3_config) { 301 const struct intel_l3_config *cfg = 302 intel_get_default_l3_config(&cmd_buffer->device->info); 303 genX(cmd_buffer_config_l3)(cmd_buffer, cfg); 304 } 305 306 genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, 32, src, size); 307 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 308 309 genX(flush_pipeline_select_3d)(cmd_buffer); 310 311 emit_common_so_memcpy(&cmd_buffer->batch, cmd_buffer->device, 312 cmd_buffer->state.current_l3_config); 313 emit_so_memcpy(&cmd_buffer->batch, cmd_buffer->device, dst, src, size); 314 315 genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer, SEQUENTIAL, 316 1ull << 32); 317 318 /* Invalidate pipeline & raster discard since we touch 319 * 3DSTATE_STREAMOUT. 320 */ 321 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_PIPELINE; 322 BITSET_SET(cmd_buffer->vk.dynamic_graphics_state.dirty, 323 MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE); 324} 325