1bf215546Sopenharmony_ci/*
2bf215546Sopenharmony_ci * Copyright © 2017 Broadcom
3bf215546Sopenharmony_ci *
4bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
5bf215546Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
6bf215546Sopenharmony_ci * to deal in the Software without restriction, including without limitation
7bf215546Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8bf215546Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the
9bf215546Sopenharmony_ci * Software is furnished to do so, subject to the following conditions:
10bf215546Sopenharmony_ci *
11bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the next
12bf215546Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the
13bf215546Sopenharmony_ci * Software.
14bf215546Sopenharmony_ci *
15bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16bf215546Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17bf215546Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18bf215546Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19bf215546Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20bf215546Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21bf215546Sopenharmony_ci * IN THE SOFTWARE.
22bf215546Sopenharmony_ci */
23bf215546Sopenharmony_ci
24bf215546Sopenharmony_ci/** @file v3d_cpu_tiling.h
25bf215546Sopenharmony_ci *
26bf215546Sopenharmony_ci * Contains load/store functions common to both v3d and vc4.  The utile layout
27bf215546Sopenharmony_ci * stayed the same, though the way utiles get laid out has changed.
28bf215546Sopenharmony_ci */
29bf215546Sopenharmony_ci
/**
 * Copies one 64-byte utile from GPU memory into a CPU-side image.
 *
 * The utile is read as 64 contiguous bytes starting at gpu and written out
 * as (64 / gpu_stride) rows of gpu_stride bytes each, with consecutive rows
 * placed cpu_stride bytes apart starting at cpu.
 *
 * When the NEON (32-bit ARM) or AArch64 paths are compiled in and gpu_stride
 * is 8 or 16, the copy is done with a single 64-byte vector load followed by
 * eight 8-byte stores.  Every other case uses the memcpy() loop at the end.
 *
 * @param cpu         Destination address of the first row.
 * @param cpu_stride  Distance in bytes between consecutive destination rows.
 * @param gpu         Source utile (64 contiguous bytes are read).
 * @param gpu_stride  Bytes per utile row.  Must be non-zero (the fallback
 *                    loop advances by it) and should divide 64 evenly,
 *                    otherwise the last row copy reads past the 64 bytes.
 */
static inline void
v3d_load_utile(void *cpu, uint32_t cpu_stride,
               void *gpu, uint32_t gpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d7, [%[cpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        /* "memory": the asm reads *gpu and writes *cpu
                         * behind the compiler's back.
                         */
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        } else if (gpu_stride == 16) {
                /* Second pointer tracks the upper 8 bytes of each line. */
                void *cpu2 = (uint8_t *)cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (Each vst1 here writes a single
                         * 8-byte d-register.)
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]]\n"
                        "vst1.8 d7, [%[cpu2]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[1], [%[cpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          /* The post-index register is a 64-bit Xm, so the
                           * stride is widened explicitly rather than
                           * relying on the upper half of a 32-bit operand.
                           */
                          [cpu_stride]  "r"((uint64_t)cpu_stride)
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        } else if (gpu_stride == 16) {
                /* Second pointer tracks the upper 8 bytes of each line. */
                void *cpu2 = (uint8_t *)cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (Each st1 here writes a single
                         * 8-byte lane.)
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]]\n"
                        "st1 {v3.D}[1], [%[cpu2]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"((uint64_t)cpu_stride)
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        }
#endif

        /* Generic path: copy one utile row per iteration.  Byte pointers
         * are used because arithmetic on void * is a GNU extension.
         */
        uint8_t *cpu_row = cpu;
        const uint8_t *gpu_bytes = gpu;

        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(cpu_row, gpu_bytes + gpu_offset, gpu_stride);
                cpu_row += cpu_stride;
        }
}
139bf215546Sopenharmony_ci
/**
 * Copies one 64-byte utile from a CPU-side image into GPU memory.
 *
 * (64 / gpu_stride) rows of gpu_stride bytes each are read starting at cpu,
 * with consecutive rows cpu_stride bytes apart, and written as 64 contiguous
 * bytes starting at gpu.
 *
 * When the NEON (32-bit ARM) or AArch64 paths are compiled in and gpu_stride
 * is 8 or 16, the copy is done with eight 8-byte loads followed by a single
 * 64-byte vector store.  Every other case uses the memcpy() loop at the end.
 *
 * @param gpu         Destination utile (64 contiguous bytes are written).
 * @param gpu_stride  Bytes per utile row.  Must be non-zero (the fallback
 *                    loop advances by it) and should divide 64 evenly,
 *                    otherwise the last row copy writes past the 64 bytes.
 * @param cpu         Source address of the first row.
 * @param cpu_stride  Distance in bytes between consecutive source rows.
 */
static inline void
v3d_store_utile(void *gpu, uint32_t gpu_stride,
                void *cpu, uint32_t cpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d7, [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave, from
                         * d0-d7.
                         */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        /* "memory": the asm reads *cpu and writes *gpu
                         * behind the compiler's back.
                         */
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        } else if (gpu_stride == 16) {
                /* Second pointer tracks the upper 8 bytes of each line. */
                void *cpu2 = (uint8_t *)cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (Each vld1 here reads a single 8-byte
                         * d-register.)
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]]\n"
                        "vld1.8 d7, [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[1], [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          /* The post-index register is a 64-bit Xm, so the
                           * stride is widened explicitly rather than
                           * relying on the upper half of a 32-bit operand.
                           */
                          [cpu_stride]  "r"((uint64_t)cpu_stride)
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        } else if (gpu_stride == 16) {
                /* Second pointer tracks the upper 8 bytes of each line. */
                void *cpu2 = (uint8_t *)cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (Each ld1 here reads a single 8-byte
                         * lane.)
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]]\n"
                        "ld1 {v3.D}[1], [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"((uint64_t)cpu_stride)
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        }
#endif

        /* Generic path: copy one utile row per iteration.  Byte pointers
         * are used because arithmetic on void * is a GNU extension.
         */
        uint8_t *gpu_bytes = gpu;
        const uint8_t *cpu_row = cpu;

        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(gpu_bytes + gpu_offset, cpu_row, gpu_stride);
                cpu_row += cpu_stride;
        }
}
243