/*
 * Copyright © 2020 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_RT_H
#define BRW_RT_H

#include <assert.h>
#include <stdint.h>

#include "compiler/shader_enums.h"
#include "dev/intel_device_info.h"

#ifdef __cplusplus
extern "C" {
#endif

/** Vulkan defines shaderGroupHandleSize = 32 */
#define BRW_RT_SBT_HANDLE_SIZE 32

/** RT_DISPATCH_GLOBALS size (see gen_rt.xml) */
#define BRW_RT_DISPATCH_GLOBALS_SIZE 80

/** Offset after the RT dispatch globals at which "push" constants live */
#define BRW_RT_PUSH_CONST_OFFSET 128

/** Stride of the resume SBT */
#define BRW_BTD_RESUME_SBT_STRIDE 8

/* Vulkan always uses exactly two levels of BVH: world and object.  At the
 * API level, these are referred to as top and bottom.
 */
enum brw_rt_bvh_level {
   BRW_RT_BVH_LEVEL_WORLD = 0,
   BRW_RT_BVH_LEVEL_OBJECT = 1,
};
#define BRW_RT_MAX_BVH_LEVELS 2

enum brw_rt_bvh_node_type {
   BRW_RT_BVH_NODE_TYPE_INTERNAL = 0,
   BRW_RT_BVH_NODE_TYPE_INSTANCE = 1,
   BRW_RT_BVH_NODE_TYPE_PROCEDURAL = 3,
   BRW_RT_BVH_NODE_TYPE_QUAD = 4,
};

/** HitKind values returned for triangle geometry
 *
 * This enum must match the SPIR-V enum.
 */
enum brw_rt_hit_kind {
   BRW_RT_HIT_KIND_FRONT_FACE = 0xfe,
   BRW_RT_HIT_KIND_BACK_FACE = 0xff,
};

/** Ray flags
 *
 * This enum must match the SPIR-V RayFlags enum.
 */
enum brw_rt_ray_flags {
   BRW_RT_RAY_FLAG_FORCE_OPAQUE                = 0x01,
   BRW_RT_RAY_FLAG_FORCE_NON_OPAQUE            = 0x02,
   BRW_RT_RAY_FLAG_TERMINATE_ON_FIRST_HIT      = 0x04,
   BRW_RT_RAY_FLAG_SKIP_CLOSEST_HIT_SHADER     = 0x08,
   BRW_RT_RAY_FLAG_CULL_BACK_FACING_TRIANGLES  = 0x10,
   BRW_RT_RAY_FLAG_CULL_FRONT_FACING_TRIANGLES = 0x20,
   BRW_RT_RAY_FLAG_CULL_OPAQUE                 = 0x40,
   BRW_RT_RAY_FLAG_CULL_NON_OPAQUE             = 0x80,
   BRW_RT_RAY_FLAG_SKIP_TRIANGLES              = 0x100,
   BRW_RT_RAY_FLAG_SKIP_AABBS                  = 0x200,
};

struct brw_rt_scratch_layout {
   /** Number of stack IDs per DSS */
   uint32_t stack_ids_per_dss;

   /** Start offset (in bytes) of the hardware MemRay stack */
   uint32_t ray_stack_start;

   /** Stride (in bytes) of the hardware MemRay stack */
   uint32_t ray_stack_stride;

   /** Start offset (in bytes) of the SW stacks */
   uint64_t sw_stack_start;

   /** Size (in bytes) of the SW stack for a single shader invocation */
   uint32_t sw_stack_size;

   /** Total size (in bytes) of the RT scratch memory area */
   uint64_t total_size;
};

/** Parameters passed to the raygen trampoline shader
 *
 * This struct is carefully constructed to be 32B and must be passed to the
 * raygen trampoline shader as inline constant data.
 */
struct brw_rt_raygen_trampoline_params {
   /** The GPU address of the RT_DISPATCH_GLOBALS */
   uint64_t rt_disp_globals_addr;

   /** The GPU address of the BINDLESS_SHADER_RECORD for the raygen shader */
   uint64_t raygen_bsr_addr;

   /** 1 if this is an indirect dispatch, 0 otherwise */
   uint8_t is_indirect;

   /** The integer log2 of the local group size
    *
    * Ray-tracing shaders don't have a concept of local vs. global workgroup
    * size.  They only have a single 3D launch size.  The raygen trampoline
    * shader is always dispatched with a local workgroup size equal to the
    * SIMD width, but the shape of the local workgroup is determined at
    * dispatch time based on the shape of the launch and passed to the
    * trampoline via this field.  (There's no sense having a Z dimension on
    * the local workgroup if the launch is 2D.)
    *
    * We use the integer log2 of the size because there's no point in
    * non-power-of-two sizes and shifts are cheaper than division.
    */
   uint8_t local_group_size_log2[3];

   uint32_t pad[3];
};

/** Size of the "hot zone" in bytes
 *
 * The hot zone is a SW-defined data structure which is a single uvec4
 * containing two pieces of information:
 *
 *  - hotzone.x: Stack offset (in bytes)
 *
 *    This is the offset (in bytes) into the per-thread scratch space at
 *    which the current shader's stack starts.  This is incremented by the
 *    calling shader prior to any shader call type instructions and gets
 *    decremented by the resume shader as part of completing the return
 *    operation.
 *
 *  - hotzone.yzw: The launch ID associated with the current thread
 *
 *    Inside a bindless shader, the only information we have is the DSS ID
 *    from the hardware EU and a per-DSS stack ID.  In particular, the
 *    three-dimensional launch ID is lost the moment we leave the raygen
 *    trampoline.
 */
#define BRW_RT_SIZEOF_HOTZONE 16
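
/* A minimal, illustrative C view of the hot zone described above.  This
 * struct is hypothetical and not part of the driver ABI; it simply mirrors
 * the .x/.yzw components of the uvec4 and matches BRW_RT_SIZEOF_HOTZONE
 * (4 * 4 = 16 bytes).
 */
struct brw_rt_hotzone_example {
   uint32_t stack_offset;   /* hotzone.x: current SW stack offset in bytes */
   uint32_t launch_id[3];   /* hotzone.yzw: 3D launch ID of this invocation */
};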

/* From the BSpec "Address Computation for Memory Based Data Structures:
 * Ray and TraversalStack (Async Ray Tracing)":
 *
 *    sizeof(Ray) = 64B, sizeof(HitInfo) = 32B, sizeof(TravStack) = 32B.
 */
#define BRW_RT_SIZEOF_RAY 64
#define BRW_RT_SIZEOF_HIT_INFO 32
#define BRW_RT_SIZEOF_TRAV_STACK 32

/* From the BSpec:
 *
 *    syncStackSize = (maxBVHLevels % 2 == 1) ?
 *       (sizeof(HitInfo) * 2 +
 *        (sizeof(Ray) + sizeof(TravStack)) * maxBVHLevels + 32B) :
 *       (sizeof(HitInfo) * 2 +
 *        (sizeof(Ray) + sizeof(TravStack)) * maxBVHLevels);
 *
 * The select is just to align to 64B.
 */
#define BRW_RT_SIZEOF_RAY_QUERY \
   (BRW_RT_SIZEOF_HIT_INFO * 2 + \
    (BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS + \
    (BRW_RT_MAX_BVH_LEVELS % 2 ? 32 : 0))

#define BRW_RT_SIZEOF_SHADOW_RAY_QUERY \
   (BRW_RT_SIZEOF_HIT_INFO * 2 + \
    (BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS)

#define BRW_RT_SIZEOF_HW_STACK \
   (BRW_RT_SIZEOF_HIT_INFO * 2 + \
    BRW_RT_SIZEOF_RAY * BRW_RT_MAX_BVH_LEVELS + \
    BRW_RT_SIZEOF_TRAV_STACK * BRW_RT_MAX_BVH_LEVELS)

/* This is a mesa-defined region for hit attribute data */
#define BRW_RT_SIZEOF_HIT_ATTRIB_DATA 64
#define BRW_RT_OFFSETOF_HIT_ATTRIB_DATA BRW_RT_SIZEOF_HW_STACK

#define BRW_RT_ASYNC_STACK_STRIDE \
   ALIGN(BRW_RT_OFFSETOF_HIT_ATTRIB_DATA + \
         BRW_RT_SIZEOF_HIT_ATTRIB_DATA, 64)

static inline void
brw_rt_compute_scratch_layout(struct brw_rt_scratch_layout *layout,
                              const struct intel_device_info *devinfo,
                              uint32_t stack_ids_per_dss,
                              uint32_t sw_stack_size)
{
   layout->stack_ids_per_dss = stack_ids_per_dss;

   const uint32_t dss_count = intel_device_info_num_dual_subslices(devinfo);
   const uint32_t num_stack_ids = dss_count * stack_ids_per_dss;

   uint64_t size = 0;

   /* The first thing in our scratch area is an array of "hot zones" which
    * store the stack offset as well as the launch IDs for each active
    * invocation.
    */
   size += BRW_RT_SIZEOF_HOTZONE * num_stack_ids;

   /* Next, we place the HW ray stacks */
   assert(size % 64 == 0); /* Cache-line aligned */
   assert(size < UINT32_MAX);
   layout->ray_stack_start = size;
   layout->ray_stack_stride = BRW_RT_ASYNC_STACK_STRIDE;
   size += num_stack_ids * layout->ray_stack_stride;

   /* Finally, we place the SW stacks for the individual ray-tracing shader
    * invocations.  We align these to 64B to ensure that we don't have any
    * shared cache lines which could hurt performance.
    */
   assert(size % 64 == 0);
   layout->sw_stack_start = size;
   layout->sw_stack_size = ALIGN(sw_stack_size, 64);
   size += num_stack_ids * layout->sw_stack_size;

   layout->total_size = size;
}
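
/* Purely illustrative: a sketch of how the offsets computed by
 * brw_rt_compute_scratch_layout() might be turned into per-invocation
 * addresses, given the base address of the RT scratch buffer and a flat
 * stack ID (e.g., DSS ID * stack_ids_per_dss + the per-DSS stack ID).  This
 * helper is hypothetical and not part of the driver; it only restates the
 * layout rules above.
 */
static inline void
brw_rt_example_invocation_addresses(const struct brw_rt_scratch_layout *layout,
                                    uint64_t scratch_base_addr,
                                    uint32_t flat_stack_id,
                                    uint64_t *hotzone_addr,
                                    uint64_t *ray_stack_addr,
                                    uint64_t *sw_stack_addr)
{
   /* Hot zones form a tightly packed array at the start of the scratch area */
   *hotzone_addr = scratch_base_addr +
                   (uint64_t)flat_stack_id * BRW_RT_SIZEOF_HOTZONE;

   /* One HW MemRay stack per stack ID, each ray_stack_stride bytes apart */
   *ray_stack_addr = scratch_base_addr + layout->ray_stack_start +
                     (uint64_t)flat_stack_id * layout->ray_stack_stride;

   /* One 64B-aligned SW stack per stack ID */
   *sw_stack_addr = scratch_base_addr + layout->sw_stack_start +
                    (uint64_t)flat_stack_id * layout->sw_stack_size;
}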

static inline uint32_t
brw_rt_ray_queries_hw_stacks_size(const struct intel_device_info *devinfo)
{
   /* The maximum slice/subslice/EU ID can be computed from max_scratch_ids,
    * which includes all the threads.
    */
   uint32_t max_eu_id = devinfo->max_scratch_ids[MESA_SHADER_COMPUTE];
   uint32_t max_simd_size = 16; /* Cannot run in SIMD32 with ray queries */
   return max_eu_id * max_simd_size * BRW_RT_SIZEOF_RAY_QUERY;
}

static inline uint32_t
brw_rt_ray_queries_shadow_stack_size(const struct intel_device_info *devinfo)
{
   /* The maximum slice/subslice/EU ID can be computed from max_scratch_ids,
    * which includes all the threads.
    */
   uint32_t max_eu_id = devinfo->max_scratch_ids[MESA_SHADER_COMPUTE];
   uint32_t max_simd_size = 16; /* Cannot run in SIMD32 with ray queries */
   return max_eu_id * max_simd_size * BRW_RT_SIZEOF_SHADOW_RAY_QUERY;
}

static inline uint32_t
brw_rt_ray_queries_shadow_stacks_size(const struct intel_device_info *devinfo,
                                      uint32_t ray_queries)
{
   /* Don't bother with a shadow stack if we only have a single query; we can
    * write directly into the HW buffer.
    */
   return (ray_queries > 1 ? ray_queries : 0) *
          brw_rt_ray_queries_shadow_stack_size(devinfo) +
          ray_queries * 4; /* Ctrl + Level data */
}

#ifdef __cplusplus
}
#endif

#endif /* BRW_RT_H */