1cabdff1aSopenharmony_ci/*
2cabdff1aSopenharmony_ci * Minimum CUDA compatibility definitions header
3cabdff1aSopenharmony_ci *
4cabdff1aSopenharmony_ci * Copyright (c) 2019 rcombs
5cabdff1aSopenharmony_ci *
6cabdff1aSopenharmony_ci * This file is part of FFmpeg.
7cabdff1aSopenharmony_ci *
8cabdff1aSopenharmony_ci * FFmpeg is free software; you can redistribute it and/or
9cabdff1aSopenharmony_ci * modify it under the terms of the GNU Lesser General Public
10cabdff1aSopenharmony_ci * License as published by the Free Software Foundation; either
11cabdff1aSopenharmony_ci * version 2.1 of the License, or (at your option) any later version.
12cabdff1aSopenharmony_ci *
13cabdff1aSopenharmony_ci * FFmpeg is distributed in the hope that it will be useful,
14cabdff1aSopenharmony_ci * but WITHOUT ANY WARRANTY; without even the implied warranty of
15cabdff1aSopenharmony_ci * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16cabdff1aSopenharmony_ci * Lesser General Public License for more details.
17cabdff1aSopenharmony_ci *
18cabdff1aSopenharmony_ci * You should have received a copy of the GNU Lesser General Public
19cabdff1aSopenharmony_ci * License along with FFmpeg; if not, write to the Free Software
20cabdff1aSopenharmony_ci * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21cabdff1aSopenharmony_ci */
22cabdff1aSopenharmony_ci
23cabdff1aSopenharmony_ci#ifndef COMPAT_CUDA_CUDA_RUNTIME_H
24cabdff1aSopenharmony_ci#define COMPAT_CUDA_CUDA_RUNTIME_H
25cabdff1aSopenharmony_ci
26cabdff1aSopenharmony_ci// Common macros
27cabdff1aSopenharmony_ci#define __global__ __attribute__((global))
28cabdff1aSopenharmony_ci#define __device__ __attribute__((device))
29cabdff1aSopenharmony_ci#define __device_builtin__ __attribute__((device_builtin))
30cabdff1aSopenharmony_ci#define __align__(N) __attribute__((aligned(N)))
31cabdff1aSopenharmony_ci#define __inline__ __inline__ __attribute__((always_inline))
32cabdff1aSopenharmony_ci
33cabdff1aSopenharmony_ci#define max(a, b) ((a) > (b) ? (a) : (b))
34cabdff1aSopenharmony_ci#define min(a, b) ((a) < (b) ? (a) : (b))
35cabdff1aSopenharmony_ci#define abs(x) ((x) < 0 ? -(x) : (x))
36cabdff1aSopenharmony_ci
37cabdff1aSopenharmony_ci#define atomicAdd(a, b) (__atomic_fetch_add(a, b, __ATOMIC_SEQ_CST))
38cabdff1aSopenharmony_ci
39cabdff1aSopenharmony_ci// Basic typedefs
40cabdff1aSopenharmony_citypedef __device_builtin__ unsigned long long cudaTextureObject_t;
41cabdff1aSopenharmony_ci
42cabdff1aSopenharmony_citypedef struct __device_builtin__ __align__(2) uchar2
43cabdff1aSopenharmony_ci{
44cabdff1aSopenharmony_ci    unsigned char x, y;
45cabdff1aSopenharmony_ci} uchar2;
46cabdff1aSopenharmony_ci
47cabdff1aSopenharmony_citypedef struct __device_builtin__ __align__(4) ushort2
48cabdff1aSopenharmony_ci{
49cabdff1aSopenharmony_ci    unsigned short x, y;
50cabdff1aSopenharmony_ci} ushort2;
51cabdff1aSopenharmony_ci
52cabdff1aSopenharmony_citypedef struct __device_builtin__ __align__(8) float2
53cabdff1aSopenharmony_ci{
54cabdff1aSopenharmony_ci    float x, y;
55cabdff1aSopenharmony_ci} float2;
56cabdff1aSopenharmony_ci
57cabdff1aSopenharmony_citypedef struct __device_builtin__ __align__(8) int2
58cabdff1aSopenharmony_ci{
59cabdff1aSopenharmony_ci    int x, y;
60cabdff1aSopenharmony_ci} int2;
61cabdff1aSopenharmony_ci
62cabdff1aSopenharmony_citypedef struct __device_builtin__ uint3
63cabdff1aSopenharmony_ci{
64cabdff1aSopenharmony_ci    unsigned int x, y, z;
65cabdff1aSopenharmony_ci} uint3;
66cabdff1aSopenharmony_ci
67cabdff1aSopenharmony_citypedef struct uint3 dim3;
68cabdff1aSopenharmony_ci
69cabdff1aSopenharmony_citypedef struct __device_builtin__ __align__(4) uchar4
70cabdff1aSopenharmony_ci{
71cabdff1aSopenharmony_ci    unsigned char x, y, z, w;
72cabdff1aSopenharmony_ci} uchar4;
73cabdff1aSopenharmony_ci
74cabdff1aSopenharmony_citypedef struct __device_builtin__ __align__(8) ushort4
75cabdff1aSopenharmony_ci{
76cabdff1aSopenharmony_ci    unsigned short x, y, z, w;
77cabdff1aSopenharmony_ci} ushort4;
78cabdff1aSopenharmony_ci
79cabdff1aSopenharmony_citypedef struct __device_builtin__ __align__(16) int4
80cabdff1aSopenharmony_ci{
81cabdff1aSopenharmony_ci    int x, y, z, w;
82cabdff1aSopenharmony_ci} int4;
83cabdff1aSopenharmony_ci
84cabdff1aSopenharmony_citypedef struct __device_builtin__ __align__(16) float4
85cabdff1aSopenharmony_ci{
86cabdff1aSopenharmony_ci    float x, y, z, w;
87cabdff1aSopenharmony_ci} float4;
88cabdff1aSopenharmony_ci
89cabdff1aSopenharmony_ci// Accessors for special registers
90cabdff1aSopenharmony_ci#define GETCOMP(reg, comp) \
91cabdff1aSopenharmony_ci    asm("mov.u32 %0, %%" #reg "." #comp ";" : "=r"(tmp)); \
92cabdff1aSopenharmony_ci    ret.comp = tmp;
93cabdff1aSopenharmony_ci
94cabdff1aSopenharmony_ci#define GET(name, reg) static inline __device__ uint3 name() {\
95cabdff1aSopenharmony_ci    uint3 ret; \
96cabdff1aSopenharmony_ci    unsigned tmp; \
97cabdff1aSopenharmony_ci    GETCOMP(reg, x) \
98cabdff1aSopenharmony_ci    GETCOMP(reg, y) \
99cabdff1aSopenharmony_ci    GETCOMP(reg, z) \
100cabdff1aSopenharmony_ci    return ret; \
101cabdff1aSopenharmony_ci}
102cabdff1aSopenharmony_ci
103cabdff1aSopenharmony_ciGET(getBlockIdx, ctaid)
104cabdff1aSopenharmony_ciGET(getBlockDim, ntid)
105cabdff1aSopenharmony_ciGET(getThreadIdx, tid)
106cabdff1aSopenharmony_ci
107cabdff1aSopenharmony_ci// Instead of externs for these registers, we turn access to them into calls into trivial ASM
108cabdff1aSopenharmony_ci#define blockIdx (getBlockIdx())
109cabdff1aSopenharmony_ci#define blockDim (getBlockDim())
110cabdff1aSopenharmony_ci#define threadIdx (getThreadIdx())
111cabdff1aSopenharmony_ci
112cabdff1aSopenharmony_ci// Basic initializers (simple macros rather than inline functions)
113cabdff1aSopenharmony_ci#define make_int2(a, b) ((int2){.x = a, .y = b})
114cabdff1aSopenharmony_ci#define make_uchar2(a, b) ((uchar2){.x = a, .y = b})
115cabdff1aSopenharmony_ci#define make_ushort2(a, b) ((ushort2){.x = a, .y = b})
116cabdff1aSopenharmony_ci#define make_float2(a, b) ((float2){.x = a, .y = b})
117cabdff1aSopenharmony_ci#define make_int4(a, b, c, d) ((int4){.x = a, .y = b, .z = c, .w = d})
118cabdff1aSopenharmony_ci#define make_uchar4(a, b, c, d) ((uchar4){.x = a, .y = b, .z = c, .w = d})
119cabdff1aSopenharmony_ci#define make_ushort4(a, b, c, d) ((ushort4){.x = a, .y = b, .z = c, .w = d})
120cabdff1aSopenharmony_ci#define make_float4(a, b, c, d) ((float4){.x = a, .y = b, .z = c, .w = d})
121cabdff1aSopenharmony_ci
122cabdff1aSopenharmony_ci// Conversions from the tex instruction's 4-register output to various types
123cabdff1aSopenharmony_ci#define TEX2D(type, ret) static inline __device__ void conv(type* out, unsigned a, unsigned b, unsigned c, unsigned d) {*out = (ret);}
124cabdff1aSopenharmony_ci
125cabdff1aSopenharmony_ciTEX2D(unsigned char, a & 0xFF)
126cabdff1aSopenharmony_ciTEX2D(unsigned short, a & 0xFFFF)
127cabdff1aSopenharmony_ciTEX2D(float, a)
128cabdff1aSopenharmony_ciTEX2D(uchar2, make_uchar2(a & 0xFF, b & 0xFF))
129cabdff1aSopenharmony_ciTEX2D(ushort2, make_ushort2(a & 0xFFFF, b & 0xFFFF))
130cabdff1aSopenharmony_ciTEX2D(float2, make_float2(a, b))
131cabdff1aSopenharmony_ciTEX2D(uchar4, make_uchar4(a & 0xFF, b & 0xFF, c & 0xFF, d & 0xFF))
132cabdff1aSopenharmony_ciTEX2D(ushort4, make_ushort4(a & 0xFFFF, b & 0xFFFF, c & 0xFFFF, d & 0xFFFF))
133cabdff1aSopenharmony_ciTEX2D(float4, make_float4(a, b, c, d))
134cabdff1aSopenharmony_ci
135cabdff1aSopenharmony_ci// Template calling tex instruction and converting the output to the selected type
136cabdff1aSopenharmony_citemplate<typename T>
137cabdff1aSopenharmony_ciinline __device__ T tex2D(cudaTextureObject_t texObject, float x, float y)
138cabdff1aSopenharmony_ci{
139cabdff1aSopenharmony_ci  T ret;
140cabdff1aSopenharmony_ci  unsigned ret1, ret2, ret3, ret4;
141cabdff1aSopenharmony_ci  asm("tex.2d.v4.u32.f32 {%0, %1, %2, %3}, [%4, {%5, %6}];" :
142cabdff1aSopenharmony_ci      "=r"(ret1), "=r"(ret2), "=r"(ret3), "=r"(ret4) :
143cabdff1aSopenharmony_ci      "l"(texObject), "f"(x), "f"(y));
144cabdff1aSopenharmony_ci  conv(&ret, ret1, ret2, ret3, ret4);
145cabdff1aSopenharmony_ci  return ret;
146cabdff1aSopenharmony_ci}
147cabdff1aSopenharmony_ci
148cabdff1aSopenharmony_citemplate<>
149cabdff1aSopenharmony_ciinline __device__ float4 tex2D<float4>(cudaTextureObject_t texObject, float x, float y)
150cabdff1aSopenharmony_ci{
151cabdff1aSopenharmony_ci    float4 ret;
152cabdff1aSopenharmony_ci    asm("tex.2d.v4.f32.f32 {%0, %1, %2, %3}, [%4, {%5, %6}];" :
153cabdff1aSopenharmony_ci        "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) :
154cabdff1aSopenharmony_ci        "l"(texObject), "f"(x), "f"(y));
155cabdff1aSopenharmony_ci    return ret;
156cabdff1aSopenharmony_ci}
157cabdff1aSopenharmony_ci
158cabdff1aSopenharmony_citemplate<>
159cabdff1aSopenharmony_ciinline __device__ float tex2D<float>(cudaTextureObject_t texObject, float x, float y)
160cabdff1aSopenharmony_ci{
161cabdff1aSopenharmony_ci    return tex2D<float4>(texObject, x, y).x;
162cabdff1aSopenharmony_ci}
163cabdff1aSopenharmony_ci
164cabdff1aSopenharmony_citemplate<>
165cabdff1aSopenharmony_ciinline __device__ float2 tex2D<float2>(cudaTextureObject_t texObject, float x, float y)
166cabdff1aSopenharmony_ci{
167cabdff1aSopenharmony_ci    float4 ret = tex2D<float4>(texObject, x, y);
168cabdff1aSopenharmony_ci    return make_float2(ret.x, ret.y);
169cabdff1aSopenharmony_ci}
170cabdff1aSopenharmony_ci
171cabdff1aSopenharmony_ci// Math helper functions
172cabdff1aSopenharmony_cistatic inline __device__ float floorf(float a) { return __builtin_floorf(a); }
173cabdff1aSopenharmony_cistatic inline __device__ float floor(float a) { return __builtin_floorf(a); }
174cabdff1aSopenharmony_cistatic inline __device__ double floor(double a) { return __builtin_floor(a); }
175cabdff1aSopenharmony_cistatic inline __device__ float ceilf(float a) { return __builtin_ceilf(a); }
176cabdff1aSopenharmony_cistatic inline __device__ float ceil(float a) { return __builtin_ceilf(a); }
177cabdff1aSopenharmony_cistatic inline __device__ double ceil(double a) { return __builtin_ceil(a); }
178cabdff1aSopenharmony_cistatic inline __device__ float truncf(float a) { return __builtin_truncf(a); }
179cabdff1aSopenharmony_cistatic inline __device__ float trunc(float a) { return __builtin_truncf(a); }
180cabdff1aSopenharmony_cistatic inline __device__ double trunc(double a) { return __builtin_trunc(a); }
181cabdff1aSopenharmony_cistatic inline __device__ float fabsf(float a) { return __builtin_fabsf(a); }
182cabdff1aSopenharmony_cistatic inline __device__ float fabs(float a) { return __builtin_fabsf(a); }
183cabdff1aSopenharmony_cistatic inline __device__ double fabs(double a) { return __builtin_fabs(a); }
184cabdff1aSopenharmony_cistatic inline __device__ float sqrtf(float a) { return __builtin_sqrtf(a); }
185cabdff1aSopenharmony_ci
186cabdff1aSopenharmony_cistatic inline __device__ float __saturatef(float a) { return __nvvm_saturate_f(a); }
187cabdff1aSopenharmony_cistatic inline __device__ float __sinf(float a) { return __nvvm_sin_approx_f(a); }
188cabdff1aSopenharmony_cistatic inline __device__ float __cosf(float a) { return __nvvm_cos_approx_f(a); }
189cabdff1aSopenharmony_cistatic inline __device__ float __expf(float a) { return __nvvm_ex2_approx_f(a * (float)__builtin_log2(__builtin_exp(1))); }
190cabdff1aSopenharmony_ci
191cabdff1aSopenharmony_ci#endif /* COMPAT_CUDA_CUDA_RUNTIME_H */
192