xref: /third_party/mesa3d/src/intel/compiler/brw_rt.h (revision bf215546)
/*
 * Copyright © 2020 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
23
24#ifndef BRW_RT_H
25#define BRW_RT_H
26
27#ifdef __cplusplus
28extern "C" {
29#endif
30
/** Size (in bytes) of a shader binding table handle
 *
 * Vulkan defines shaderGroupHandleSize = 32.
 */
#define BRW_RT_SBT_HANDLE_SIZE 32

/** RT_DISPATCH_GLOBALS size in bytes (see gen_rt.xml) */
#define BRW_RT_DISPATCH_GLOBALS_SIZE 80

/** Offset after the RT dispatch globals at which "push" constants live */
#define BRW_RT_PUSH_CONST_OFFSET 128

/** Stride of the resume SBT */
#define BRW_BTD_RESUME_SBT_STRIDE 8
42
/* Vulkan always uses exactly two levels of BVH: world and object.  At the API
 * level, these are referred to as top and bottom.
 */
enum brw_rt_bvh_level {
   BRW_RT_BVH_LEVEL_WORLD = 0,
   BRW_RT_BVH_LEVEL_OBJECT = 1,
};

/* Number of BVH levels, i.e. the number of brw_rt_bvh_level values above */
#define BRW_RT_MAX_BVH_LEVELS 2
51
/** BVH node types
 *
 * The values are not contiguous: 2 is not assigned a name here.
 * NOTE(review): presumably these are the hardware node-type encodings, so
 * the numeric values must not be changed -- confirm against the BSpec.
 */
enum brw_rt_bvh_node_type {
   BRW_RT_BVH_NODE_TYPE_INTERNAL = 0,
   BRW_RT_BVH_NODE_TYPE_INSTANCE = 1,
   BRW_RT_BVH_NODE_TYPE_PROCEDURAL = 3,
   BRW_RT_BVH_NODE_TYPE_QUAD = 4,
};
58
/** HitKind values returned for triangle geometry
 *
 * This enum must match the SPIR-V enum.
 */
enum brw_rt_hit_kind {
   BRW_RT_HIT_KIND_FRONT_FACE = 0xfe,
   BRW_RT_HIT_KIND_BACK_FACE = 0xff,
};
67
/** Ray flags
 *
 * This enum must match the SPIR-V RayFlags enum.  Each value is a single
 * bit so that flags can be OR'd together.
 */
enum brw_rt_ray_flags {
   BRW_RT_RAY_FLAG_FORCE_OPAQUE                    = 0x01,
   BRW_RT_RAY_FLAG_FORCE_NON_OPAQUE                = 0x02,
   BRW_RT_RAY_FLAG_TERMINATE_ON_FIRST_HIT          = 0x04,
   BRW_RT_RAY_FLAG_SKIP_CLOSEST_HIT_SHADER         = 0x08,
   BRW_RT_RAY_FLAG_CULL_BACK_FACING_TRIANGLES      = 0x10,
   BRW_RT_RAY_FLAG_CULL_FRONT_FACING_TRIANGLES     = 0x20,
   BRW_RT_RAY_FLAG_CULL_OPAQUE                     = 0x40,
   BRW_RT_RAY_FLAG_CULL_NON_OPAQUE                 = 0x80,
   BRW_RT_RAY_FLAG_SKIP_TRIANGLES                  = 0x100,
   BRW_RT_RAY_FLAG_SKIP_AABBS                      = 0x200,
};
84
/** Layout of the ray-tracing scratch memory area
 *
 * All offsets and sizes are in bytes.  The start offsets and the total size
 * of the SW stack region are 64-bit; the HW ray stack offset and the
 * per-stack sizes are 32-bit.
 */
struct brw_rt_scratch_layout {
   /** Number of stack IDs per DSS */
   uint32_t stack_ids_per_dss;

   /** Start offset (in bytes) of the hardware MemRay stack */
   uint32_t ray_stack_start;

   /** Stride (in bytes) of the hardware MemRay stack */
   uint32_t ray_stack_stride;

   /** Start offset (in bytes) of the SW stacks */
   uint64_t sw_stack_start;

   /** Size (in bytes) of the SW stack for a single shader invocation */
   uint32_t sw_stack_size;

   /** Total size (in bytes) of the RT scratch memory area */
   uint64_t total_size;
};
104
/** Parameters passed to the raygen trampoline shader
 *
 * This struct is carefully constructed to be 32B and must be passed to the
 * raygen trampoline shader as inline constant data.
 */
struct brw_rt_raygen_trampoline_params {
   /** The GPU address of the RT_DISPATCH_GLOBALS */
   uint64_t rt_disp_globals_addr;

   /** The GPU address of the BINDLESS_SHADER_RECORD for the raygen shader */
   uint64_t raygen_bsr_addr;

   /** 1 if this is an indirect dispatch, 0 otherwise */
   uint8_t is_indirect;

   /** The integer log2 of the local group size
    *
    * Ray-tracing shaders don't have a concept of local vs. global workgroup
    * size.  They only have a single 3D launch size.  The raygen trampoline
    * shader is always dispatched with a local workgroup size equal to the
    * SIMD width but the shape of the local workgroup is determined at
    * dispatch time based on the shape of the launch and passed to the
    * trampoline via this field.  (There's no sense having a Z dimension on
    * the local workgroup if the launch is 2D.)
    *
    * We use the integer log2 of the size because there's no point in
    * non-power-of-two sizes and shifts are cheaper than division.
    */
   uint8_t local_group_size_log2[3];

   /** Explicit padding which brings the struct up to 32B */
   uint32_t pad[3];
};
137
/** Size of the "hot zone" in bytes
 *
 * The hot zone is a SW-defined data structure which is a single uvec4
 * containing two pieces of information:
 *
 *  - hotzone.x: Stack offset (in bytes)
 *
 *    This is the offset (in bytes) into the per-thread scratch space at which
 *    the current shader's stack starts.  This is incremented by the calling
 *    shader prior to any shader call type instructions and gets decremented
 *    by the resume shader as part of completing the return operation.
 *
 *
 *  - hotzone.yzw: The launch ID associated with the current thread
 *
 *    Inside a bindless shader, the only information we have is the DSS ID
 *    from the hardware EU and a per-DSS stack ID.  In particular, the three-
 *    dimensional launch ID is lost the moment we leave the raygen trampoline.
 */
#define BRW_RT_SIZEOF_HOTZONE 16
158
/* From the BSpec "Address Computation for Memory Based Data Structures:
 * Ray and TraversalStack (Async Ray Tracing)":
 *
 *    "sizeof(Ray) = 64B, sizeof(HitInfo) = 32B, sizeof(TravStack) = 32B."
 *
 * All three sizes are in bytes.
 */
#define BRW_RT_SIZEOF_RAY 64
#define BRW_RT_SIZEOF_HIT_INFO 32
#define BRW_RT_SIZEOF_TRAV_STACK 32
167
/* From the BSpec:
 *
 *    syncStackSize = (maxBVHLevels % 2 == 1) ?
 *       (sizeof(HitInfo) * 2 +
 *          (sizeof(Ray) + sizeof(TravStack)) * maxBVHLevels + 32B) :
 *       (sizeof(HitInfo) * 2 +
 *          (sizeof(Ray) + sizeof(TravStack)) * maxBVHLevels);
 *
 * The select is just to align to 64B.
 */
#define BRW_RT_SIZEOF_RAY_QUERY \
   (BRW_RT_SIZEOF_HIT_INFO * 2 + \
    (BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS + \
    (BRW_RT_MAX_BVH_LEVELS % 2 ? 32 : 0))

/* Size (in bytes) of one shadow copy of a ray query.  This is the same sum
 * as BRW_RT_SIZEOF_RAY_QUERY but without the extra 32B of alignment padding
 * for an odd number of BVH levels.
 */
#define BRW_RT_SIZEOF_SHADOW_RAY_QUERY  \
   (BRW_RT_SIZEOF_HIT_INFO * 2 + \
    (BRW_RT_SIZEOF_RAY + BRW_RT_SIZEOF_TRAV_STACK) * BRW_RT_MAX_BVH_LEVELS)

/* Size (in bytes) of one HW stack: two HitInfo structures followed by one
 * Ray and one TravStack per BVH level.
 */
#define BRW_RT_SIZEOF_HW_STACK \
   (BRW_RT_SIZEOF_HIT_INFO * 2 + \
    BRW_RT_SIZEOF_RAY * BRW_RT_MAX_BVH_LEVELS + \
    BRW_RT_SIZEOF_TRAV_STACK * BRW_RT_MAX_BVH_LEVELS)
191
/* This is a mesa-defined region for hit attribute data.  It is placed
 * immediately after the HW stack.
 */
#define BRW_RT_SIZEOF_HIT_ATTRIB_DATA 64
#define BRW_RT_OFFSETOF_HIT_ATTRIB_DATA BRW_RT_SIZEOF_HW_STACK

/* Stride (in bytes) between consecutive async stacks: the HW stack plus the
 * hit attribute region, rounded up to a multiple of 64B.
 */
#define BRW_RT_ASYNC_STACK_STRIDE \
   ALIGN(BRW_RT_OFFSETOF_HIT_ATTRIB_DATA + \
         BRW_RT_SIZEOF_HIT_ATTRIB_DATA, 64)
199
200static inline void
201brw_rt_compute_scratch_layout(struct brw_rt_scratch_layout *layout,
202                              const struct intel_device_info *devinfo,
203                              uint32_t stack_ids_per_dss,
204                              uint32_t sw_stack_size)
205{
206   layout->stack_ids_per_dss = stack_ids_per_dss;
207
208   const uint32_t dss_count = intel_device_info_num_dual_subslices(devinfo);
209   const uint32_t num_stack_ids = dss_count * stack_ids_per_dss;
210
211   uint64_t size = 0;
212
213   /* The first thing in our scratch area is an array of "hot zones" which
214    * store the stack offset as well as the launch IDs for each active
215    * invocation.
216    */
217   size += BRW_RT_SIZEOF_HOTZONE * num_stack_ids;
218
219   /* Next, we place the HW ray stacks */
220   assert(size % 64 == 0); /* Cache-line aligned */
221   assert(size < UINT32_MAX);
222   layout->ray_stack_start = size;
223   layout->ray_stack_stride = BRW_RT_ASYNC_STACK_STRIDE;
224   size += num_stack_ids * layout->ray_stack_stride;
225
226   /* Finally, we place the SW stacks for the individual ray-tracing shader
227    * invocations.  We align these to 64B to ensure that we don't have any
228    * shared cache lines which could hurt performance.
229    */
230   assert(size % 64 == 0);
231   layout->sw_stack_start = size;
232   layout->sw_stack_size = ALIGN(sw_stack_size, 64);
233   size += num_stack_ids * layout->sw_stack_size;
234
235   layout->total_size = size;
236}
237
238static inline uint32_t
239brw_rt_ray_queries_hw_stacks_size(const struct intel_device_info *devinfo)
240{
241   /* Maximum slice/subslice/EU ID can be computed from the max_scratch_ids
242    * which includes all the threads.
243    */
244   uint32_t max_eu_id = devinfo->max_scratch_ids[MESA_SHADER_COMPUTE];
245   uint32_t max_simd_size = 16; /* Cannot run in SIMD32 with ray queries */
246   return max_eu_id * max_simd_size * BRW_RT_SIZEOF_RAY_QUERY;
247}
248
249static inline uint32_t
250brw_rt_ray_queries_shadow_stack_size(const struct intel_device_info *devinfo)
251{
252   /* Maximum slice/subslice/EU ID can be computed from the max_scratch_ids
253    * which includes all the threads.
254    */
255   uint32_t max_eu_id = devinfo->max_scratch_ids[MESA_SHADER_COMPUTE];
256   uint32_t max_simd_size = 16; /* Cannot run in SIMD32 with ray queries */
257   return max_eu_id * max_simd_size * BRW_RT_SIZEOF_SHADOW_RAY_QUERY;
258}
259
/** Total size (in bytes) of the shadow stacks for a number of ray queries
 *
 * With more than one ray query, the result is one shadow stack (see
 * brw_rt_ray_queries_shadow_stack_size()) per query plus 4 bytes of
 * Ctrl + Level data per query.  With zero or one query, only the 4 bytes per
 * query are counted.
 */
static inline uint32_t
brw_rt_ray_queries_shadow_stacks_size(const struct intel_device_info *devinfo,
                                      uint32_t ray_queries)
{
   /* Don't bother with a shadow stack if we only have a single query. We can
    * directly write in the HW buffer.
    */
   return (ray_queries > 1 ? ray_queries : 0) * brw_rt_ray_queries_shadow_stack_size(devinfo) +
          ray_queries * 4; /* Ctrl + Level data */
}
270
271#ifdef __cplusplus
272}
273#endif
274
275#endif /* BRW_RT_H */
276