/*
 * Copyright © 2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file v3d_cpu_tiling.h
 *
 * Contains load/store functions common to both v3d and vc4.  The utile layout
 * stayed the same, though the way utiles get laid out has changed.
 */

/**
 * Copies one utile (64 contiguous bytes at \p gpu) out to a strided CPU-side
 * buffer.
 *
 * The utile is treated as 64 / gpu_stride rows of gpu_stride bytes each.  Row
 * N is read from gpu + N * gpu_stride and written to cpu + N * cpu_stride.
 *
 * \param cpu         Destination of the first row.
 * \param cpu_stride  Byte distance between consecutive destination rows.
 * \param gpu         Source utile; rows are packed back to back.
 * \param gpu_stride  Bytes per row.  8 and 16 take the SIMD paths when they
 *                    are compiled in; every other value uses the memcpy loop.
 *                    Must be non-zero (the loop would never advance), and the
 *                    loop copies gpu_stride bytes for every offset below 64,
 *                    so a value that does not divide 64 reads past byte 63.
 */
static inline void
v3d_load_utile(void *cpu, uint32_t cpu_stride,
               void *gpu, uint32_t gpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d7, [%[cpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        /* "memory": the asm reads *gpu and writes *cpu, and
                         * neither buffer is described by an operand.
                         */
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        } else if (gpu_stride == 16) {
                /* Second pointer tracks the upper 8 bytes of each row. */
                void *cpu2 = (uint8_t *)cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (vst1 can only store one d-register
                         * at a time).
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]]\n"
                        "vst1.8 d7, [%[cpu2]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        /* The post-index form consumes a full 64-bit register, so widen the
         * stride explicitly instead of relying on the upper half of a 32-bit
         * operand's register being zero.
         */
        const uint64_t stride64 = cpu_stride;

        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[1], [%[cpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(stride64)
                        /* "memory": the asm reads *gpu and writes *cpu, and
                         * neither buffer is described by an operand.
                         */
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        } else if (gpu_stride == 16) {
                /* Second pointer tracks the upper 8 bytes of each row. */
                void *cpu2 = (uint8_t *)cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination, one 64-bit lane per st1.
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]]\n"
                        "st1 {v3.D}[1], [%[cpu2]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(stride64)
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        }
#endif

        /* Portable path: one memcpy per row.  Byte pointers are used because
         * arithmetic on void * is a compiler extension, not ISO C.
         */
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(cpu, (uint8_t *)gpu + gpu_offset, gpu_stride);
                cpu = (uint8_t *)cpu + cpu_stride;
        }
}

/**
 * Copies rows from a strided CPU-side buffer into one utile (64 contiguous
 * bytes at \p gpu).  This is the inverse of v3d_load_utile().
 *
 * Row N is read from cpu + N * cpu_stride and written to
 * gpu + N * gpu_stride, for 64 / gpu_stride rows of gpu_stride bytes each.
 *
 * \param gpu         Destination utile; rows are packed back to back.
 * \param gpu_stride  Bytes per row.  8 and 16 take the SIMD paths when they
 *                    are compiled in; every other value uses the memcpy loop.
 *                    Must be non-zero (the loop would never advance), and the
 *                    loop copies gpu_stride bytes for every offset below 64,
 *                    so a value that does not divide 64 writes past byte 63.
 * \param cpu         Source of the first row.
 * \param cpu_stride  Byte distance between consecutive source rows.
 */
static inline void
v3d_store_utile(void *gpu, uint32_t gpu_stride,
                void *cpu, uint32_t cpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d7, [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave, from
                         * d0-d7.
                         */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        /* "memory": the asm reads *cpu and writes *gpu, and
                         * neither buffer is described by an operand.
                         */
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        } else if (gpu_stride == 16) {
                /* Second pointer tracks the upper 8 bytes of each row. */
                void *cpu2 = (uint8_t *)cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (vld1 can only load one d-register
                         * at a time).
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]]\n"
                        "vld1.8 d7, [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        /* The post-index form consumes a full 64-bit register, so widen the
         * stride explicitly instead of relying on the upper half of a 32-bit
         * operand's register being zero.
         */
        const uint64_t stride64 = cpu_stride;

        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[1], [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(stride64)
                        /* "memory": the asm reads *cpu and writes *gpu, and
                         * neither buffer is described by an operand.
                         */
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        } else if (gpu_stride == 16) {
                /* Second pointer tracks the upper 8 bytes of each row. */
                void *cpu2 = (uint8_t *)cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source, one 64-bit lane per ld1.
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]]\n"
                        "ld1 {v3.D}[1], [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(stride64)
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        }
#endif

        /* Portable path: one memcpy per row.  Byte pointers are used because
         * arithmetic on void * is a compiler extension, not ISO C.
         */
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy((uint8_t *)gpu + gpu_offset, cpu, gpu_stride);
                cpu = (uint8_t *)cpu + cpu_stride;
        }
}