1cabdff1aSopenharmony_ci/* 2cabdff1aSopenharmony_ci * Minimum CUDA compatibility definitions header 3cabdff1aSopenharmony_ci * 4cabdff1aSopenharmony_ci * Copyright (c) 2019 rcombs 5cabdff1aSopenharmony_ci * 6cabdff1aSopenharmony_ci * This file is part of FFmpeg. 7cabdff1aSopenharmony_ci * 8cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or 9cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public 10cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either 11cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version. 12cabdff1aSopenharmony_ci * 13cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful, 14cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of 15cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16cabdff1aSopenharmony_ci * Lesser General Public License for more details. 17cabdff1aSopenharmony_ci * 18cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public 19cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software 20cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 21cabdff1aSopenharmony_ci */ 22cabdff1aSopenharmony_ci 23cabdff1aSopenharmony_ci#ifndef COMPAT_CUDA_CUDA_RUNTIME_H 24cabdff1aSopenharmony_ci#define COMPAT_CUDA_CUDA_RUNTIME_H 25cabdff1aSopenharmony_ci 26cabdff1aSopenharmony_ci// Common macros 27cabdff1aSopenharmony_ci#define __global__ __attribute__((global)) 28cabdff1aSopenharmony_ci#define __device__ __attribute__((device)) 29cabdff1aSopenharmony_ci#define __device_builtin__ __attribute__((device_builtin)) 30cabdff1aSopenharmony_ci#define __align__(N) __attribute__((aligned(N))) 31cabdff1aSopenharmony_ci#define __inline__ __inline__ __attribute__((always_inline)) 32cabdff1aSopenharmony_ci 33cabdff1aSopenharmony_ci#define max(a, b) ((a) > (b) ? (a) : (b)) 34cabdff1aSopenharmony_ci#define min(a, b) ((a) < (b) ? (a) : (b)) 35cabdff1aSopenharmony_ci#define abs(x) ((x) < 0 ? -(x) : (x)) 36cabdff1aSopenharmony_ci 37cabdff1aSopenharmony_ci#define atomicAdd(a, b) (__atomic_fetch_add(a, b, __ATOMIC_SEQ_CST)) 38cabdff1aSopenharmony_ci 39cabdff1aSopenharmony_ci// Basic typedefs 40cabdff1aSopenharmony_citypedef __device_builtin__ unsigned long long cudaTextureObject_t; 41cabdff1aSopenharmony_ci 42cabdff1aSopenharmony_citypedef struct __device_builtin__ __align__(2) uchar2 43cabdff1aSopenharmony_ci{ 44cabdff1aSopenharmony_ci unsigned char x, y; 45cabdff1aSopenharmony_ci} uchar2; 46cabdff1aSopenharmony_ci 47cabdff1aSopenharmony_citypedef struct __device_builtin__ __align__(4) ushort2 48cabdff1aSopenharmony_ci{ 49cabdff1aSopenharmony_ci unsigned short x, y; 50cabdff1aSopenharmony_ci} ushort2; 51cabdff1aSopenharmony_ci 52cabdff1aSopenharmony_citypedef struct __device_builtin__ __align__(8) float2 53cabdff1aSopenharmony_ci{ 54cabdff1aSopenharmony_ci float x, y; 55cabdff1aSopenharmony_ci} float2; 56cabdff1aSopenharmony_ci 57cabdff1aSopenharmony_citypedef struct __device_builtin__ __align__(8) int2 58cabdff1aSopenharmony_ci{ 59cabdff1aSopenharmony_ci int x, y; 60cabdff1aSopenharmony_ci} int2; 61cabdff1aSopenharmony_ci 62cabdff1aSopenharmony_citypedef struct __device_builtin__ uint3 63cabdff1aSopenharmony_ci{ 64cabdff1aSopenharmony_ci unsigned int x, y, z; 65cabdff1aSopenharmony_ci} uint3; 66cabdff1aSopenharmony_ci 67cabdff1aSopenharmony_citypedef struct uint3 dim3; 68cabdff1aSopenharmony_ci 69cabdff1aSopenharmony_citypedef struct __device_builtin__ __align__(4) uchar4 70cabdff1aSopenharmony_ci{ 71cabdff1aSopenharmony_ci unsigned char x, y, z, w; 72cabdff1aSopenharmony_ci} uchar4; 73cabdff1aSopenharmony_ci 74cabdff1aSopenharmony_citypedef struct __device_builtin__ __align__(8) ushort4 75cabdff1aSopenharmony_ci{ 76cabdff1aSopenharmony_ci unsigned short x, y, z, w; 77cabdff1aSopenharmony_ci} ushort4; 78cabdff1aSopenharmony_ci 79cabdff1aSopenharmony_citypedef struct __device_builtin__ __align__(16) int4 80cabdff1aSopenharmony_ci{ 81cabdff1aSopenharmony_ci int x, y, z, w; 82cabdff1aSopenharmony_ci} int4; 83cabdff1aSopenharmony_ci 84cabdff1aSopenharmony_citypedef struct __device_builtin__ __align__(16) float4 85cabdff1aSopenharmony_ci{ 86cabdff1aSopenharmony_ci float x, y, z, w; 87cabdff1aSopenharmony_ci} float4; 88cabdff1aSopenharmony_ci 89cabdff1aSopenharmony_ci// Accessors for special registers 90cabdff1aSopenharmony_ci#define GETCOMP(reg, comp) \ 91cabdff1aSopenharmony_ci asm("mov.u32 %0, %%" #reg "." #comp ";" : "=r"(tmp)); \ 92cabdff1aSopenharmony_ci ret.comp = tmp; 93cabdff1aSopenharmony_ci 94cabdff1aSopenharmony_ci#define GET(name, reg) static inline __device__ uint3 name() {\ 95cabdff1aSopenharmony_ci uint3 ret; \ 96cabdff1aSopenharmony_ci unsigned tmp; \ 97cabdff1aSopenharmony_ci GETCOMP(reg, x) \ 98cabdff1aSopenharmony_ci GETCOMP(reg, y) \ 99cabdff1aSopenharmony_ci GETCOMP(reg, z) \ 100cabdff1aSopenharmony_ci return ret; \ 101cabdff1aSopenharmony_ci} 102cabdff1aSopenharmony_ci 103cabdff1aSopenharmony_ciGET(getBlockIdx, ctaid) 104cabdff1aSopenharmony_ciGET(getBlockDim, ntid) 105cabdff1aSopenharmony_ciGET(getThreadIdx, tid) 106cabdff1aSopenharmony_ci 107cabdff1aSopenharmony_ci// Instead of externs for these registers, we turn access to them into calls into trivial ASM 108cabdff1aSopenharmony_ci#define blockIdx (getBlockIdx()) 109cabdff1aSopenharmony_ci#define blockDim (getBlockDim()) 110cabdff1aSopenharmony_ci#define threadIdx (getThreadIdx()) 111cabdff1aSopenharmony_ci 112cabdff1aSopenharmony_ci// Basic initializers (simple macros rather than inline functions) 113cabdff1aSopenharmony_ci#define make_int2(a, b) ((int2){.x = a, .y = b}) 114cabdff1aSopenharmony_ci#define make_uchar2(a, b) ((uchar2){.x = a, .y = b}) 115cabdff1aSopenharmony_ci#define make_ushort2(a, b) ((ushort2){.x = a, .y = b}) 116cabdff1aSopenharmony_ci#define make_float2(a, b) ((float2){.x = a, .y = b}) 117cabdff1aSopenharmony_ci#define make_int4(a, b, c, d) ((int4){.x = a, .y = b, .z = c, .w = d}) 118cabdff1aSopenharmony_ci#define make_uchar4(a, b, c, d) ((uchar4){.x = a, .y = b, .z = c, .w = d}) 119cabdff1aSopenharmony_ci#define make_ushort4(a, b, c, d) ((ushort4){.x = a, .y = b, .z = c, .w = d}) 120cabdff1aSopenharmony_ci#define make_float4(a, b, c, d) ((float4){.x = a, .y = b, .z = c, .w = d}) 121cabdff1aSopenharmony_ci 122cabdff1aSopenharmony_ci// Conversions from the tex instruction's 4-register output to various types 123cabdff1aSopenharmony_ci#define TEX2D(type, ret) static inline __device__ void conv(type* out, unsigned a, unsigned b, unsigned c, unsigned d) {*out = (ret);} 124cabdff1aSopenharmony_ci 125cabdff1aSopenharmony_ciTEX2D(unsigned char, a & 0xFF) 126cabdff1aSopenharmony_ciTEX2D(unsigned short, a & 0xFFFF) 127cabdff1aSopenharmony_ciTEX2D(float, a) 128cabdff1aSopenharmony_ciTEX2D(uchar2, make_uchar2(a & 0xFF, b & 0xFF)) 129cabdff1aSopenharmony_ciTEX2D(ushort2, make_ushort2(a & 0xFFFF, b & 0xFFFF)) 130cabdff1aSopenharmony_ciTEX2D(float2, make_float2(a, b)) 131cabdff1aSopenharmony_ciTEX2D(uchar4, make_uchar4(a & 0xFF, b & 0xFF, c & 0xFF, d & 0xFF)) 132cabdff1aSopenharmony_ciTEX2D(ushort4, make_ushort4(a & 0xFFFF, b & 0xFFFF, c & 0xFFFF, d & 0xFFFF)) 133cabdff1aSopenharmony_ciTEX2D(float4, make_float4(a, b, c, d)) 134cabdff1aSopenharmony_ci 135cabdff1aSopenharmony_ci// Template calling tex instruction and converting the output to the selected type 136cabdff1aSopenharmony_citemplate<typename T> 137cabdff1aSopenharmony_ciinline __device__ T tex2D(cudaTextureObject_t texObject, float x, float y) 138cabdff1aSopenharmony_ci{ 139cabdff1aSopenharmony_ci T ret; 140cabdff1aSopenharmony_ci unsigned ret1, ret2, ret3, ret4; 141cabdff1aSopenharmony_ci asm("tex.2d.v4.u32.f32 {%0, %1, %2, %3}, [%4, {%5, %6}];" : 142cabdff1aSopenharmony_ci "=r"(ret1), "=r"(ret2), "=r"(ret3), "=r"(ret4) : 143cabdff1aSopenharmony_ci "l"(texObject), "f"(x), "f"(y)); 144cabdff1aSopenharmony_ci conv(&ret, ret1, ret2, ret3, ret4); 145cabdff1aSopenharmony_ci return ret; 146cabdff1aSopenharmony_ci} 147cabdff1aSopenharmony_ci 148cabdff1aSopenharmony_citemplate<> 149cabdff1aSopenharmony_ciinline __device__ float4 tex2D<float4>(cudaTextureObject_t texObject, float x, float y) 150cabdff1aSopenharmony_ci{ 151cabdff1aSopenharmony_ci float4 ret; 152cabdff1aSopenharmony_ci asm("tex.2d.v4.f32.f32 {%0, %1, %2, %3}, [%4, {%5, %6}];" : 153cabdff1aSopenharmony_ci "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : 154cabdff1aSopenharmony_ci "l"(texObject), "f"(x), "f"(y)); 155cabdff1aSopenharmony_ci return ret; 156cabdff1aSopenharmony_ci} 157cabdff1aSopenharmony_ci 158cabdff1aSopenharmony_citemplate<> 159cabdff1aSopenharmony_ciinline __device__ float tex2D<float>(cudaTextureObject_t texObject, float x, float y) 160cabdff1aSopenharmony_ci{ 161cabdff1aSopenharmony_ci return tex2D<float4>(texObject, x, y).x; 162cabdff1aSopenharmony_ci} 163cabdff1aSopenharmony_ci 164cabdff1aSopenharmony_citemplate<> 165cabdff1aSopenharmony_ciinline __device__ float2 tex2D<float2>(cudaTextureObject_t texObject, float x, float y) 166cabdff1aSopenharmony_ci{ 167cabdff1aSopenharmony_ci float4 ret = tex2D<float4>(texObject, x, y); 168cabdff1aSopenharmony_ci return make_float2(ret.x, ret.y); 169cabdff1aSopenharmony_ci} 170cabdff1aSopenharmony_ci 171cabdff1aSopenharmony_ci// Math helper functions 172cabdff1aSopenharmony_cistatic inline __device__ float floorf(float a) { return __builtin_floorf(a); } 173cabdff1aSopenharmony_cistatic inline __device__ float floor(float a) { return __builtin_floorf(a); } 174cabdff1aSopenharmony_cistatic inline __device__ double floor(double a) { return __builtin_floor(a); } 175cabdff1aSopenharmony_cistatic inline __device__ float ceilf(float a) { return __builtin_ceilf(a); } 176cabdff1aSopenharmony_cistatic inline __device__ float ceil(float a) { return __builtin_ceilf(a); } 177cabdff1aSopenharmony_cistatic inline __device__ double ceil(double a) { return __builtin_ceil(a); } 178cabdff1aSopenharmony_cistatic inline __device__ float truncf(float a) { return __builtin_truncf(a); } 179cabdff1aSopenharmony_cistatic inline __device__ float trunc(float a) { return __builtin_truncf(a); } 180cabdff1aSopenharmony_cistatic inline __device__ double trunc(double a) { return __builtin_trunc(a); } 181cabdff1aSopenharmony_cistatic inline __device__ float fabsf(float a) { return __builtin_fabsf(a); } 182cabdff1aSopenharmony_cistatic inline __device__ float fabs(float a) { return __builtin_fabsf(a); } 183cabdff1aSopenharmony_cistatic inline __device__ double fabs(double a) { return __builtin_fabs(a); } 184cabdff1aSopenharmony_cistatic inline __device__ float sqrtf(float a) { return __builtin_sqrtf(a); } 185cabdff1aSopenharmony_ci 186cabdff1aSopenharmony_cistatic inline __device__ float __saturatef(float a) { return __nvvm_saturate_f(a); } 187cabdff1aSopenharmony_cistatic inline __device__ float __sinf(float a) { return __nvvm_sin_approx_f(a); } 188cabdff1aSopenharmony_cistatic inline __device__ float __cosf(float a) { return __nvvm_cos_approx_f(a); } 189cabdff1aSopenharmony_cistatic inline __device__ float __expf(float a) { return __nvvm_ex2_approx_f(a * (float)__builtin_log2(__builtin_exp(1))); } 190cabdff1aSopenharmony_ci 191cabdff1aSopenharmony_ci#endif /* COMPAT_CUDA_CUDA_RUNTIME_H */ 192