1/*
2 * Copyright © Microsoft Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24#include <cmath>
25#include <stdio.h>
26#include <stdint.h>
27#include <stdexcept>
28#include <vector>
29
30#include <unknwn.h>
31#include <directx/d3d12.h>
32#include <dxgi1_4.h>
33#include <gtest/gtest.h>
34#include <wrl.h>
35#include <dxguids/dxguids.h>
36
37#include "compute_test.h"
38
39using std::vector;
40
41TEST_F(ComputeTest, runtime_memcpy)
42{
43   struct shift { uint8_t val; uint8_t shift; uint16_t ret; };
44   const char *kernel_source =
45   "struct shift { uchar val; uchar shift; ushort ret; };\n\
46   __kernel void main_test(__global struct shift *inout)\n\
47   {\n\
48      uint id = get_global_id(0);\n\
49      uint id2 = id + get_global_id(1);\n\
50      struct shift lc[4] = { { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }, { 0, 0, 0 }};\n\
51      lc[id] = inout[id];\n\
52      inout[id2].ret = (ushort) lc[id2].val << (ushort) lc[id2].shift;\n\
53   }\n";
54
55   auto inout = ShaderArg<struct shift>({
56         { 0x10, 1, 0xffff },
57         { 0x20, 2, 0xffff },
58         { 0x30, 3, 0xffff },
59         { 0x40, 4, 0xffff },
60      },
61      SHADER_ARG_INOUT);
62   const uint16_t expected[] = { 0x20, 0x80, 0x180, 0x400 };
63   run_shader(kernel_source, inout.size(), 1, 1, inout);
64   for (int i = 0; i < inout.size(); ++i)
65      EXPECT_EQ(inout[i].ret, expected[i]);
66}
67
68TEST_F(ComputeTest, two_global_arrays)
69{
70   const char *kernel_source =
71   "__kernel void main_test(__global uint *g1, __global uint *g2)\n\
72   {\n\
73       uint idx = get_global_id(0);\n\
74       g1[idx] -= g2[idx];\n\
75   }\n";
76   auto g1 = ShaderArg<uint32_t>({ 10, 20, 30, 40 }, SHADER_ARG_INOUT);
77   auto g2 = ShaderArg<uint32_t>({ 1, 2, 3, 4 }, SHADER_ARG_INPUT);
78   const uint32_t expected[] = {
79      9, 18, 27, 36
80   };
81
82   run_shader(kernel_source, g1.size(), 1, 1, g1, g2);
83   for (int i = 0; i < g1.size(); ++i)
84      EXPECT_EQ(g1[i], expected[i]);
85}
86
87/* Disabled until saturated conversions from f32->i64 fixed (mesa/mesa#3824) */
88TEST_F(ComputeTest, DISABLED_i64tof32)
89{
90   const char *kernel_source =
91   "__kernel void main_test(__global long *out, __constant long *in)\n\
92   {\n\
93       __local float tmp[12];\n\
94       uint idx = get_global_id(0);\n\
95       tmp[idx] = in[idx];\n\
96       barrier(CLK_LOCAL_MEM_FENCE);\n\
97       out[idx] = tmp[idx + get_global_id(1)];\n\
98   }\n";
99   auto in = ShaderArg<int64_t>({ 0x100000000LL,
100                                  -0x100000000LL,
101                                  0x7fffffffffffffffLL,
102                                  0x4000004000000000LL,
103                                  0x4000003fffffffffLL,
104                                  0x4000004000000001LL,
105                                  -1,
106                                  -0x4000004000000000LL,
107                                  -0x4000003fffffffffLL,
108                                  -0x4000004000000001LL,
109                                  0,
110                                  INT64_MIN },
111                                SHADER_ARG_INPUT);
112   auto out = ShaderArg<int64_t>(std::vector<int64_t>(12, 0xdeadbeed), SHADER_ARG_OUTPUT);
113   const int64_t expected[] = {
114      0x100000000LL,
115      -0x100000000LL,
116      0x7fffffffffffffffLL,
117      0x4000000000000000LL,
118      0x4000000000000000LL,
119      0x4000008000000000LL,
120      -1,
121      -0x4000000000000000LL,
122      -0x4000000000000000LL,
123      -0x4000008000000000LL,
124      0,
125      INT64_MIN,
126   };
127
128   run_shader(kernel_source, out.size(), 1, 1, out, in);
129   for (int i = 0; i < out.size(); ++i) {
130      EXPECT_EQ((int64_t)out[i], expected[i]);
131   }
132}
133TEST_F(ComputeTest, two_constant_arrays)
134{
135   const char *kernel_source =
136   "__kernel void main_test(__constant uint *c1, __global uint *g1, __constant uint *c2)\n\
137   {\n\
138       uint idx = get_global_id(0);\n\
139       g1[idx] -= c1[idx] + c2[idx];\n\
140   }\n";
141   auto g1 = ShaderArg<uint32_t>({ 10, 20, 30, 40 }, SHADER_ARG_INOUT);
142   auto c1 = ShaderArg<uint32_t>({ 1, 2, 3, 4 }, SHADER_ARG_INPUT);
143   auto c2 = ShaderArg<uint32_t>(std::vector<uint32_t>(16384, 5), SHADER_ARG_INPUT);
144   const uint32_t expected[] = {
145      4, 13, 22, 31
146   };
147
148   run_shader(kernel_source, g1.size(), 1, 1, c1, g1, c2);
149   for (int i = 0; i < g1.size(); ++i)
150      EXPECT_EQ(g1[i], expected[i]);
151}
152
153TEST_F(ComputeTest, null_constant_ptr)
154{
155   const char *kernel_source =
156   "__kernel void main_test(__global uint *g1, __constant uint *c1)\n\
157   {\n\
158       __constant uint fallback[] = {2, 3, 4, 5};\n\
159       __constant uint *c = c1 ? c1 : fallback;\n\
160       uint idx = get_global_id(0);\n\
161       g1[idx] -= c[idx];\n\
162   }\n";
163   auto g1 = ShaderArg<uint32_t>({ 10, 20, 30, 40 }, SHADER_ARG_INOUT);
164   auto c1 = ShaderArg<uint32_t>({ 1, 2, 3, 4 }, SHADER_ARG_INPUT);
165   const uint32_t expected1[] = {
166      9, 18, 27, 36
167   };
168
169   run_shader(kernel_source, g1.size(), 1, 1, g1, c1);
170   for (int i = 0; i < g1.size(); ++i)
171      EXPECT_EQ(g1[i], expected1[i]);
172
173   const uint32_t expected2[] = {
174      8, 17, 26, 35
175   };
176
177   g1 = ShaderArg<uint32_t>({ 10, 20, 30, 40 }, SHADER_ARG_INOUT);
178   auto c2 = NullShaderArg();
179   run_shader(kernel_source, g1.size(), 1, 1, g1, c2);
180   for (int i = 0; i < g1.size(); ++i)
181      EXPECT_EQ(g1[i], expected2[i]);
182}
183
184TEST_F(ComputeTest, null_global_ptr)
185{
186   const char *kernel_source =
187   "__kernel void main_test(__global uint *g1, __global uint *g2)\n\
188   {\n\
189       __constant uint fallback[] = {2, 3, 4, 5};\n\
190       uint idx = get_global_id(0);\n\
191       g1[idx] -= g2 ? g2[idx] : fallback[idx];\n\
192   }\n";
193   auto g1 = ShaderArg<uint32_t>({ 10, 20, 30, 40 }, SHADER_ARG_INOUT);
194   auto g2 = ShaderArg<uint32_t>({ 1, 2, 3, 4 }, SHADER_ARG_INPUT);
195   const uint32_t expected1[] = {
196      9, 18, 27, 36
197   };
198
199   run_shader(kernel_source, g1.size(), 1, 1, g1, g2);
200   for (int i = 0; i < g1.size(); ++i)
201      EXPECT_EQ(g1[i], expected1[i]);
202
203   const uint32_t expected2[] = {
204      8, 17, 26, 35
205   };
206
207   g1 = ShaderArg<uint32_t>({ 10, 20, 30, 40 }, SHADER_ARG_INOUT);
208   auto g2null = NullShaderArg();
209   run_shader(kernel_source, g1.size(), 1, 1, g1, g2null);
210   for (int i = 0; i < g1.size(); ++i)
211      EXPECT_EQ(g1[i], expected2[i]);
212}
213
214TEST_F(ComputeTest, ret_constant_ptr)
215{
216   struct s { uint64_t ptr; uint32_t val; };
217   const char *kernel_source =
218   "struct s { __constant uint *ptr; uint val; };\n\
219   __kernel void main_test(__global struct s *out, __constant uint *in)\n\
220   {\n\
221       __constant uint foo[] = { 1, 2 };\n\
222       uint idx = get_global_id(0);\n\
223       if (idx == 0)\n\
224          out[idx].ptr = foo;\n\
225       else\n\
226          out[idx].ptr = in;\n\
227       out[idx].val = out[idx].ptr[idx];\n\
228   }\n";
229   auto out = ShaderArg<struct s>(std::vector<struct s>(2, {0xdeadbeefdeadbeef, 0}), SHADER_ARG_OUTPUT);
230   auto in = ShaderArg<uint32_t>({ 3, 4 }, SHADER_ARG_INPUT);
231   const uint32_t expected_val[] = {
232      1, 4
233   };
234   const uint64_t expected_ptr[] = {
235      2ull << 32, 1ull << 32
236   };
237
238   run_shader(kernel_source, out.size(), 1, 1, out, in);
239   for (int i = 0; i < out.size(); ++i) {
240      EXPECT_EQ(out[i].val, expected_val[i]);
241      EXPECT_EQ(out[i].ptr, expected_ptr[i]);
242   }
243}
244
245TEST_F(ComputeTest, ret_global_ptr)
246{
247   struct s { uint64_t ptr; uint32_t val; };
248   const char *kernel_source =
249   "struct s { __global uint *ptr; uint val; };\n\
250   __kernel void main_test(__global struct s *out, __global uint *in1, __global uint *in2)\n\
251   {\n\
252       uint idx = get_global_id(0);\n\
253       out[idx].ptr = idx ? in2 : in1;\n\
254       out[idx].val = out[idx].ptr[idx];\n\
255   }\n";
256   auto out = ShaderArg<struct s>(std::vector<struct s>(2, {0xdeadbeefdeadbeef, 0}), SHADER_ARG_OUTPUT);
257   auto in1 = ShaderArg<uint32_t>({ 1, 2 }, SHADER_ARG_INPUT);
258   auto in2 = ShaderArg<uint32_t>({ 3, 4 }, SHADER_ARG_INPUT);
259   const uint32_t expected_val[] = {
260      1, 4
261   };
262   const uint64_t expected_ptr[] = {
263      1ull << 32, 2ull << 32
264   };
265
266   run_shader(kernel_source, out.size(), 1, 1, out, in1, in2);
267   for (int i = 0; i < out.size(); ++i) {
268      EXPECT_EQ(out[i].val, expected_val[i]);
269      EXPECT_EQ(out[i].ptr, expected_ptr[i]);
270   }
271}
272
273TEST_F(ComputeTest, ret_local_ptr)
274{
275   struct s { uint64_t ptr; };
276   const char *kernel_source =
277   "struct s { __local uint *ptr; };\n\
278   __kernel void main_test(__global struct s *out)\n\
279   {\n\
280       __local uint tmp[2];\n\
281       uint idx = get_global_id(0);\n\
282       tmp[idx] = idx;\n\
283       out[idx].ptr = &tmp[idx];\n\
284   }\n";
285   auto out = ShaderArg<struct s>(std::vector<struct s>(2, { 0xdeadbeefdeadbeef }), SHADER_ARG_OUTPUT);
286   const uint64_t expected_ptr[] = {
287      0, 4,
288   };
289
290   run_shader(kernel_source, out.size(), 1, 1, out);
291   for (int i = 0; i < out.size(); ++i) {
292      EXPECT_EQ(out[i].ptr, expected_ptr[i]);
293   }
294}
295
296TEST_F(ComputeTest, ret_private_ptr)
297{
298   struct s { uint64_t ptr; uint32_t value; };
299   const char *kernel_source =
300   "struct s { __private uint *ptr; uint value; };\n\
301   __kernel void main_test(__global struct s *out)\n\
302   {\n\
303       uint tmp[2] = {1, 2};\n\
304       uint idx = get_global_id(0);\n\
305       out[idx].ptr = &tmp[idx];\n\
306       out[idx].value = *out[idx].ptr;\n\
307   }\n";
308   auto out = ShaderArg<struct s>(std::vector<struct s>(2, { 0xdeadbeefdeadbeef }), SHADER_ARG_OUTPUT);
309   const uint64_t expected_ptr[] = {
310      0, 4,
311   };
312   const uint32_t expected_value[] = {
313      1, 2
314   };
315
316   run_shader(kernel_source, out.size(), 1, 1, out);
317   for (int i = 0; i < out.size(); ++i) {
318      EXPECT_EQ(out[i].ptr, expected_ptr[i]);
319   }
320}
321
322TEST_F(ComputeTest, globals_8bit)
323{
324   const char *kernel_source =
325   "__kernel void main_test(__global unsigned char *inout)\n\
326   {\n\
327       uint idx = get_global_id(0);\n\
328       inout[idx] = inout[idx] + 1;\n\
329   }\n";
330   auto inout = ShaderArg<uint8_t> ({ 100, 110, 120, 130 }, SHADER_ARG_INOUT);
331   const uint8_t expected[] = {
332      101, 111, 121, 131
333   };
334   run_shader(kernel_source, inout.size(), 1, 1, inout);
335   for (int i = 0; i < inout.size(); ++i)
336      EXPECT_EQ(inout[i], expected[i]);
337}
338
339TEST_F(ComputeTest, globals_16bit)
340{
341   const char *kernel_source =
342   "__kernel void main_test(__global unsigned short *inout)\n\
343   {\n\
344       uint idx = get_global_id(0);\n\
345       inout[idx] = inout[idx] + 1;\n\
346   }\n";
347   auto inout = ShaderArg<uint16_t> ({ 10000, 10010, 10020, 10030 }, SHADER_ARG_INOUT);
348   const uint16_t expected[] = {
349      10001, 10011, 10021, 10031
350   };
351   run_shader(kernel_source, inout.size(), 1, 1, inout);
352   for (int i = 0; i < inout.size(); ++i)
353      EXPECT_EQ(inout[i], expected[i]);
354}
355
356TEST_F(ComputeTest, globals_64bit)
357{
358   const char *kernel_source =
359   "__kernel void main_test(__global unsigned long *inout)\n\
360   {\n\
361       uint idx = get_global_id(0);\n\
362       inout[idx] = inout[idx] + 1;\n\
363   }\n";
364   uint64_t base = 1ull << 50;
365   auto inout = ShaderArg<uint64_t>({ base, base + 10, base + 20, base + 30 },
366                                    SHADER_ARG_INOUT);
367   const uint64_t expected[] = {
368      base + 1, base + 11, base + 21, base + 31
369   };
370   run_shader(kernel_source, inout.size(), 1, 1, inout);
371   for (int i = 0; i < inout.size(); ++i)
372      EXPECT_EQ(inout[i], expected[i]);
373}
374
375TEST_F(ComputeTest, built_ins_global_id)
376{
377   const char *kernel_source =
378   "__kernel void main_test(__global uint *output)\n\
379   {\n\
380       output[get_global_id(0)] = get_global_id(0);\n\
381   }\n";
382   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
383                                     SHADER_ARG_OUTPUT);
384   const uint32_t expected[] = {
385      0, 1, 2, 3
386   };
387
388   run_shader(kernel_source, output.size(), 1, 1, output);
389   for (int i = 0; i < output.size(); ++i)
390      EXPECT_EQ(output[i], expected[i]);
391}
392
393TEST_F(ComputeTest, built_ins_global_id_rmw)
394{
395   const char *kernel_source =
396   "__kernel void main_test(__global uint *output)\n\
397   {\n\
398       uint id = get_global_id(0);\n\
399       output[id] = output[id] * (id + 1);\n\
400   }\n";
401   auto inout = ShaderArg<uint32_t>({0x00000001, 0x10000001, 0x00020002, 0x04010203},
402                                    SHADER_ARG_INOUT);
403   const uint32_t expected[] = {
404      0x00000001, 0x20000002, 0x00060006, 0x1004080c
405   };
406   run_shader(kernel_source, inout.size(), 1, 1, inout);
407   for (int i = 0; i < inout.size(); ++i)
408      EXPECT_EQ(inout[i], expected[i]);
409}
410
411TEST_F(ComputeTest, types_float_basics)
412{
413   const char *kernel_source =
414   "__kernel void main_test(__global uint *output)\n\
415   {\n\
416       output[get_global_id(0)] = (uint)((float)get_global_id(0) + 1.5f);\n\
417   }\n";
418   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
419                                     SHADER_ARG_OUTPUT);
420   const uint32_t expected[] = {
421      1, 2, 3, 4
422   };
423   run_shader(kernel_source, output.size(), 1, 1, output);
424   for (int i = 0; i < output.size(); ++i)
425      EXPECT_EQ(output[i], expected[i]);
426}
427
428TEST_F(ComputeTest, DISABLED_types_double_basics)
429{
430   /* Disabled because doubles are unsupported */
431   const char *kernel_source =
432   "__kernel void main_test(__global uint *output)\n\
433   {\n\
434       output[get_global_id(0)] = (uint)((double)get_global_id(0) + 1.5);\n\
435   }\n";
436   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
437                                     SHADER_ARG_OUTPUT);
438   const uint32_t expected[] = {
439      1, 2, 3, 4
440   };
441   run_shader(kernel_source, output.size(), 1, 1, output);
442   for (int i = 0; i < output.size(); ++i)
443      EXPECT_EQ(output[i], expected[i]);
444}
445
446TEST_F(ComputeTest, types_short_basics)
447{
448   const char *kernel_source =
449   "__kernel void main_test(__global uint *output)\n\
450   {\n\
451       output[get_global_id(0)] = (uint)((short)get_global_id(0) + (short)1);\n\
452   }\n";
453   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
454                                     SHADER_ARG_OUTPUT);
455   const uint32_t expected[] = {
456      1, 2, 3, 4
457   };
458   run_shader(kernel_source, output.size(), 1, 1, output);
459   for (int i = 0; i < output.size(); ++i)
460      EXPECT_EQ(output[i], expected[i]);
461}
462
463TEST_F(ComputeTest, types_char_basics)
464{
465   const char *kernel_source =
466   "__kernel void main_test(__global uint *output)\n\
467   {\n\
468       output[get_global_id(0)] = (uint)((char)get_global_id(0) + (char)1);\n\
469   }\n";
470   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
471                                     SHADER_ARG_OUTPUT);
472   const uint32_t expected[] = {
473      1, 2, 3, 4
474   };
475   run_shader(kernel_source, output.size(), 1, 1, output);
476   for (int i = 0; i < output.size(); ++i)
477      EXPECT_EQ(output[i], expected[i]);
478}
479
480TEST_F(ComputeTest, types_if_statement)
481{
482   const char *kernel_source =
483   "__kernel void main_test(__global uint *output)\n\
484   {\n\
485       int idx = get_global_id(0);\n\
486       if (idx > 0)\n\
487           output[idx] = ~idx;\n\
488       else\n\
489           output[0] = 0xff;\n\
490   }\n";
491   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
492                                     SHADER_ARG_OUTPUT);
493   const uint32_t expected[] = {
494      0xff, ~1u, ~2u, ~3u
495   };
496   run_shader(kernel_source, output.size(), 1, 1, output);
497   for (int i = 0; i < output.size(); ++i)
498      EXPECT_EQ(output[i], expected[i]);
499}
500
501TEST_F(ComputeTest, types_do_while_loop)
502{
503   const char *kernel_source =
504   "__kernel void main_test(__global uint *output)\n\
505   {\n\
506       int value = 1;\n\
507       int i = 1, n = get_global_id(0);\n\
508       do {\n\
509          value *= i++;\n\
510       } while (i <= n);\n\
511       output[n] = value;\n\
512   }\n";
513   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(5, 0xdeadbeef),
514                                     SHADER_ARG_OUTPUT);
515   const uint32_t expected[] = {
516      1, 1, 1*2, 1*2*3, 1*2*3*4
517   };
518   run_shader(kernel_source, output.size(), 1, 1, output);
519   for (int i = 0; i < output.size(); ++i)
520      EXPECT_EQ(output[i], expected[i]);
521}
522
523TEST_F(ComputeTest, types_for_loop)
524{
525   const char *kernel_source =
526   "__kernel void main_test(__global uint *output)\n\
527   {\n\
528       int value = 1;\n\
529       int n = get_global_id(0);\n\
530       for (int i = 1; i <= n; ++i)\n\
531          value *= i;\n\
532       output[n] = value;\n\
533   }\n";
534   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(5, 0xdeadbeef),
535                                     SHADER_ARG_OUTPUT);
536   const uint32_t expected[] = {
537      1, 1, 1*2, 1*2*3, 1*2*3*4
538   };
539   run_shader(kernel_source, output.size(), 1, 1, output);
540   for (int i = 0; i < output.size(); ++i)
541      EXPECT_EQ(output[i], expected[i]);
542}
543
544TEST_F(ComputeTest, complex_types_local_array_long)
545{
546   const char *kernel_source =
547   "__kernel void main_test(__global ulong *inout)\n\
548   {\n\
549      ulong tmp[] = {\n\
550         get_global_id(1) + 0x00000000,\n\
551         get_global_id(1) + 0x10000001,\n\
552         get_global_id(1) + 0x20000020,\n\
553         get_global_id(1) + 0x30000300,\n\
554      };\n\
555      uint idx = get_global_id(0);\n\
556      inout[idx] = tmp[idx];\n\
557   }\n";
558   auto inout = ShaderArg<uint64_t>({ 0, 0, 0, 0 }, SHADER_ARG_INOUT);
559   const uint64_t expected[] = {
560      0x00000000, 0x10000001, 0x20000020, 0x30000300,
561   };
562   run_shader(kernel_source, inout.size(), 1, 1, inout);
563   for (int i = 0; i < inout.size(); ++i)
564      EXPECT_EQ(inout[i], expected[i]);
565}
566
567TEST_F(ComputeTest, complex_types_local_array_short)
568{
569   const char *kernel_source =
570   "__kernel void main_test(__global ushort *inout)\n\
571   {\n\
572      ushort tmp[] = {\n\
573         get_global_id(1) + 0x00,\n\
574         get_global_id(1) + 0x10,\n\
575         get_global_id(1) + 0x20,\n\
576         get_global_id(1) + 0x30,\n\
577      };\n\
578      uint idx = get_global_id(0);\n\
579      inout[idx] = tmp[idx];\n\
580   }\n";
581   auto inout = ShaderArg<uint16_t>({ 0, 0, 0, 0 }, SHADER_ARG_INOUT);
582   const uint16_t expected[] = {
583      0x00, 0x10, 0x20, 0x30,
584   };
585   run_shader(kernel_source, inout.size(), 1, 1, inout);
586   for (int i = 0; i < inout.size(); ++i)
587      EXPECT_EQ(inout[i], expected[i]);
588}
589
590TEST_F(ComputeTest, complex_types_local_array_struct_vec_float_misaligned)
591{
592   const char *kernel_source =
593   "struct has_vecs { uchar c; ushort s; float2 f; };\n\
594   __kernel void main_test(__global uint *inout)\n\
595   {\n\
596      struct has_vecs tmp[] = {\n\
597         { 10 + get_global_id(0), get_global_id(1), { 10.0f, 1.0f } },\n\
598         { 19 + get_global_id(0), get_global_id(1), { 20.0f, 4.0f } },\n\
599         { 28 + get_global_id(0), get_global_id(1), { 30.0f, 9.0f } },\n\
600         { 37 + get_global_id(0), get_global_id(1), { 40.0f, 16.0f } },\n\
601      };\n\
602      uint idx = get_global_id(0);\n\
603      uint mul = (tmp[idx].c + tmp[idx].s) * trunc(tmp[idx].f[0]);\n\
604      inout[idx] = mul + trunc(tmp[idx].f[1]);\n\
605   }\n";
606   auto inout = ShaderArg<uint32_t>({ 0, 0, 0, 0 }, SHADER_ARG_INOUT);
607   const uint16_t expected[] = { 101, 404, 909, 1616 };
608   run_shader(kernel_source, inout.size(), 1, 1, inout);
609   for (int i = 0; i < inout.size(); ++i)
610      EXPECT_EQ(inout[i], expected[i]);
611}
612
613TEST_F(ComputeTest, complex_types_local_array)
614{
615   const char *kernel_source =
616   "__kernel void main_test(__global uint *inout)\n\
617   {\n\
618      uint tmp[] = {\n\
619         get_global_id(1) + 0x00,\n\
620         get_global_id(1) + 0x10,\n\
621         get_global_id(1) + 0x20,\n\
622         get_global_id(1) + 0x30,\n\
623      };\n\
624      uint idx = get_global_id(0);\n\
625      inout[idx] = tmp[idx];\n\
626   }\n";
627   auto inout = ShaderArg<uint32_t>({ 0, 0, 0, 0 }, SHADER_ARG_INOUT);
628   const uint32_t expected[] = {
629      0x00, 0x10, 0x20, 0x30,
630   };
631   run_shader(kernel_source, inout.size(), 1, 1, inout);
632   for (int i = 0; i < inout.size(); ++i)
633      EXPECT_EQ(inout[i], expected[i]);
634}
635
636TEST_F(ComputeTest, complex_types_global_struct_array)
637{
638   struct two_vals { uint32_t add; uint32_t mul; };
639   const char *kernel_source =
640   "struct two_vals { uint add; uint mul; };\n\
641   __kernel void main_test(__global struct two_vals *in_out)\n\
642   {\n\
643      uint id = get_global_id(0);\n\
644      in_out[id].add = in_out[id].add + id;\n\
645      in_out[id].mul = in_out[id].mul * id;\n\
646   }\n";
647   auto inout = ShaderArg<struct two_vals>({ { 8, 8 }, { 16, 16 }, { 64, 64 }, { 65536, 65536 } },
648                                           SHADER_ARG_INOUT);
649   const struct two_vals expected[] = {
650      { 8 + 0, 8 * 0 },
651      { 16 + 1, 16 * 1 },
652      { 64 + 2, 64 * 2 },
653      { 65536 + 3, 65536 * 3 }
654   };
655   run_shader(kernel_source, inout.size(), 1, 1, inout);
656   for (int i = 0; i < inout.size(); ++i) {
657      EXPECT_EQ(inout[i].add, expected[i].add);
658      EXPECT_EQ(inout[i].mul, expected[i].mul);
659   }
660}
661
662TEST_F(ComputeTest, complex_types_global_uint2)
663{
664   struct uint2 { uint32_t x; uint32_t y; };
665   const char *kernel_source =
666   "__kernel void main_test(__global uint2 *inout)\n\
667   {\n\
668      uint id = get_global_id(0);\n\
669      inout[id].x = inout[id].x + id;\n\
670      inout[id].y = inout[id].y * id;\n\
671   }\n";
672   auto inout = ShaderArg<struct uint2>({ { 8, 8 }, { 16, 16 }, { 64, 64 }, { 65536, 65536 } },
673                                        SHADER_ARG_INOUT);
674   const struct uint2 expected[] = {
675      { 8 + 0, 8 * 0 },
676      { 16 + 1, 16 * 1 },
677      { 64 + 2, 64 * 2 },
678      { 65536 + 3, 65536 * 3 }
679   };
680   run_shader(kernel_source, inout.size(), 1, 1, inout);
681   for (int i = 0; i < inout.size(); ++i) {
682      EXPECT_EQ(inout[i].x, expected[i].x);
683      EXPECT_EQ(inout[i].y, expected[i].y);
684   }
685}
686
687TEST_F(ComputeTest, complex_types_global_ushort2)
688{
689   struct ushort2 { uint16_t x; uint16_t y; };
690   const char *kernel_source =
691   "__kernel void main_test(__global ushort2 *inout)\n\
692   {\n\
693      uint id = get_global_id(0);\n\
694      inout[id].x = inout[id].x + id;\n\
695      inout[id].y = inout[id].y * id;\n\
696   }\n";
697   auto inout = ShaderArg<struct ushort2>({ { 8, 8 }, { 16, 16 }, { 64, 64 },
698                                            { (uint16_t)65536, (uint16_t)65536 } },
699                                          SHADER_ARG_INOUT);
700   const struct ushort2 expected[] = {
701      { 8 + 0, 8 * 0 },
702      { 16 + 1, 16 * 1 },
703      { 64 + 2, 64 * 2 },
704      { (uint16_t)(65536 + 3), (uint16_t)(65536 * 3) }
705   };
706   run_shader(kernel_source, inout.size(), 1, 1, inout);
707   for (int i = 0; i < inout.size(); ++i) {
708      EXPECT_EQ(inout[i].x, expected[i].x);
709      EXPECT_EQ(inout[i].y, expected[i].y);
710   }
711}
712
713TEST_F(ComputeTest, complex_types_global_uchar3)
714{
715   struct uchar3 { uint8_t x; uint8_t y; uint8_t z; uint8_t pad; };
716   const char *kernel_source =
717   "__kernel void main_test(__global uchar3 *inout)\n\
718   {\n\
719      uint id = get_global_id(0);\n\
720      inout[id].x = inout[id].x + id;\n\
721      inout[id].y = inout[id].y * id;\n\
722      inout[id].z = inout[id].y + inout[id].x;\n\
723   }\n";
724   auto inout = ShaderArg<struct uchar3>({ { 8, 8, 8 }, { 16, 16, 16 }, { 64, 64, 64 }, { 255, 255, 255 } },
725                                         SHADER_ARG_INOUT);
726   const struct uchar3 expected[] = {
727      { 8 + 0, 8 * 0, (8 + 0) + (8 * 0) },
728      { 16 + 1, 16 * 1, (16 + 1) + (16 * 1) },
729      { 64 + 2, 64 * 2, (64 + 2) + (64 * 2) },
730      { (uint8_t)(255 + 3), (uint8_t)(255 * 3), (uint8_t)((255 + 3) + (255 * 3)) }
731   };
732   run_shader(kernel_source, inout.size(), 1, 1, inout);
733   for (int i = 0; i < inout.size(); ++i) {
734      EXPECT_EQ(inout[i].x, expected[i].x);
735      EXPECT_EQ(inout[i].y, expected[i].y);
736      EXPECT_EQ(inout[i].z, expected[i].z);
737   }
738}
739
740TEST_F(ComputeTest, complex_types_constant_uchar3)
741{
742   struct uchar3 { uint8_t x; uint8_t y; uint8_t z; uint8_t pad; };
743   const char *kernel_source =
744   "__kernel void main_test(__global uchar3 *out, __constant uchar3 *in)\n\
745   {\n\
746      uint id = get_global_id(0);\n\
747      out[id].x = in[id].x + id;\n\
748      out[id].y = in[id].y * id;\n\
749      out[id].z = out[id].y + out[id].x;\n\
750   }\n";
751   auto in = ShaderArg<struct uchar3>({ { 8, 8, 8 }, { 16, 16, 16 }, { 64, 64, 64 }, { 255, 255, 255 } },
752                                      SHADER_ARG_INPUT);
753   auto out = ShaderArg<struct uchar3>(std::vector<struct uchar3>(4, { 0xff, 0xff, 0xff }),
754                                      SHADER_ARG_OUTPUT);
755   const struct uchar3 expected[] = {
756      { 8 + 0, 8 * 0, (8 + 0) + (8 * 0) },
757      { 16 + 1, 16 * 1, (16 + 1) + (16 * 1) },
758      { 64 + 2, 64 * 2, (64 + 2) + (64 * 2) },
759      { (uint8_t)(255 + 3), (uint8_t)(255 * 3), (uint8_t)((255 + 3) + (255 * 3)) }
760   };
761   run_shader(kernel_source, out.size(), 1, 1, out, in);
762   for (int i = 0; i < out.size(); ++i) {
763      EXPECT_EQ(out[i].x, expected[i].x);
764      EXPECT_EQ(out[i].y, expected[i].y);
765      EXPECT_EQ(out[i].z, expected[i].z);
766   }
767}
768
769TEST_F(ComputeTest, complex_types_global_uint8)
770{
771   struct uint8 {
772      uint32_t s0; uint32_t s1; uint32_t s2; uint32_t s3;
773      uint32_t s4; uint32_t s5; uint32_t s6; uint32_t s7;
774   };
775   const char *kernel_source =
776   "__kernel void main_test(__global uint8 *inout)\n\
777   {\n\
778      uint id = get_global_id(0);\n\
779      inout[id].s01234567 = inout[id].s01234567 * 2;\n\
780   }\n";
781   auto inout = ShaderArg<struct uint8>({ { 1, 2, 3, 4, 5, 6, 7, 8 } },
782                                        SHADER_ARG_INOUT);
783   const struct uint8 expected[] = {
784      { 2, 4, 6, 8, 10, 12, 14, 16 }
785   };
786   run_shader(kernel_source, inout.size(), 1, 1, inout);
787   for (int i = 0; i < inout.size(); ++i) {
788      EXPECT_EQ(inout[i].s0, expected[i].s0);
789      EXPECT_EQ(inout[i].s1, expected[i].s1);
790      EXPECT_EQ(inout[i].s2, expected[i].s2);
791      EXPECT_EQ(inout[i].s3, expected[i].s3);
792      EXPECT_EQ(inout[i].s4, expected[i].s4);
793      EXPECT_EQ(inout[i].s5, expected[i].s5);
794      EXPECT_EQ(inout[i].s6, expected[i].s6);
795      EXPECT_EQ(inout[i].s7, expected[i].s7);
796   }
797}
798
799TEST_F(ComputeTest, complex_types_local_ulong16)
800{
801   struct ulong16 {
802      uint64_t values[16];
803   };
804   const char *kernel_source =
805   R"(__kernel void main_test(__global ulong16 *inout)
806   {
807      __local ulong16 local_array[2];
808      uint id = get_global_id(0);
809      local_array[id] = inout[id];
810      barrier(CLK_LOCAL_MEM_FENCE);
811      inout[id] = local_array[0] * 2;
812   })";
813   auto inout = ShaderArg<struct ulong16>({ { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } },
814                                        SHADER_ARG_INOUT);
815   const struct ulong16 expected[] = {
816      { 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 }
817   };
818   run_shader(kernel_source, inout.size(), 1, 1, inout);
819   for (int i = 0; i < inout.size(); ++i) {
820      for (int j = 0; j < 16; ++j) {
821         EXPECT_EQ(inout[i].values[j], expected[i].values[j]);
822      }
823   }
824}
825
826TEST_F(ComputeTest, complex_types_constant_uint8)
827{
828   struct uint8 {
829      uint32_t s0; uint32_t s1; uint32_t s2; uint32_t s3;
830      uint32_t s4; uint32_t s5; uint32_t s6; uint32_t s7;
831   };
832   const char *kernel_source =
833   "__kernel void main_test(__global uint8 *out, __constant uint8 *in)\n\
834   {\n\
835      uint id = get_global_id(0);\n\
836      out[id].s01234567 = in[id].s01234567 * 2;\n\
837   }\n";
838   auto in = ShaderArg<struct uint8>({ { 1, 2, 3, 4, 5, 6, 7, 8 } },
839                                     SHADER_ARG_INPUT);
840   auto out = ShaderArg<struct uint8>({ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff } },
841                                      SHADER_ARG_INOUT);
842   const struct uint8 expected[] = {
843      { 2, 4, 6, 8, 10, 12, 14, 16 }
844   };
845   run_shader(kernel_source, out.size(), 1, 1, out, in);
846   for (int i = 0; i < out.size(); ++i) {
847      EXPECT_EQ(out[i].s0, expected[i].s0);
848      EXPECT_EQ(out[i].s1, expected[i].s1);
849      EXPECT_EQ(out[i].s2, expected[i].s2);
850      EXPECT_EQ(out[i].s3, expected[i].s3);
851      EXPECT_EQ(out[i].s4, expected[i].s4);
852      EXPECT_EQ(out[i].s5, expected[i].s5);
853      EXPECT_EQ(out[i].s6, expected[i].s6);
854      EXPECT_EQ(out[i].s7, expected[i].s7);
855   }
856}
857
858TEST_F(ComputeTest, complex_types_const_array)
859{
860   const char *kernel_source =
861   "__kernel void main_test(__global uint *output)\n\
862   {\n\
863       const uint foo[] = { 100, 101, 102, 103 };\n\
864       output[get_global_id(0)] = foo[get_global_id(0) % 4];\n\
865   }\n";
866   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
867                                     SHADER_ARG_OUTPUT);
868   const uint32_t expected[] = {
869      100, 101, 102, 103
870   };
871   run_shader(kernel_source, output.size(), 1, 1, output);
872   for (int i = 0; i < output.size(); ++i)
873      EXPECT_EQ(output[i], expected[i]);
874}
875
876TEST_F(ComputeTest, mem_access_load_store_ordering)
877{
878   const char *kernel_source =
879   "__kernel void main_test(__global uint *output)\n\
880   {\n\
881       uint foo[4];\n\
882       foo[0] = 0x11111111;\n\
883       foo[1] = 0x22222222;\n\
884       foo[2] = 0x44444444;\n\
885       foo[3] = 0x88888888;\n\
886       foo[get_global_id(1)] -= 0x11111111; // foo[0] = 0 \n\
887       foo[0] += get_global_id(0); // foo[0] = tid\n\
888       foo[foo[get_global_id(1)]] = get_global_id(0); // foo[tid] = tid\n\
889       output[get_global_id(0)] = foo[get_global_id(0)]; // output[tid] = tid\n\
890   }\n";
891   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
892                                     SHADER_ARG_OUTPUT);
893   const uint16_t expected[] = {
894      0, 1, 2, 3
895   };
896   run_shader(kernel_source, output.size(), 1, 1, output);
897   for (int i = 0; i < output.size(); ++i)
898      EXPECT_EQ(output[i], expected[i]);
899}
900
901TEST_F(ComputeTest, two_const_arrays)
902{
903   const char *kernel_source =
904   "__kernel void main_test(__global uint *output)\n\
905   {\n\
906      uint id = get_global_id(0);\n\
907      uint foo[4] = {100, 101, 102, 103};\n\
908      uint bar[4] = {1, 2, 3, 4};\n\
909      output[id] = foo[id] * bar[id];\n\
910   }\n";
911   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
912                                     SHADER_ARG_OUTPUT);
913   const uint32_t expected[] = {
914      100, 202, 306, 412
915   };
916   run_shader(kernel_source, output.size(), 1, 1, output);
917   for (int i = 0; i < output.size(); ++i)
918      EXPECT_EQ(output[i], expected[i]);
919}
920
921TEST_F(ComputeTest, imod_pos)
922{
923   const char *kernel_source =
924   "__kernel void main_test(__global int *inout)\n\
925   {\n\
926       inout[get_global_id(0)] = inout[get_global_id(0)] % 3;\n\
927   }\n";
928   auto inout = ShaderArg<int32_t>({ -4, -3, -2, -1, 0, 1, 2, 3, 4 },
929                                   SHADER_ARG_INOUT);
930   const int32_t expected[] = {
931      -1, 0, -2, -1,  0, 1, 2, 0, 1
932   };
933   run_shader(kernel_source, inout.size(), 1, 1, inout);
934   for (int i = 0; i < inout.size(); ++i)
935      EXPECT_EQ(inout[i], expected[i]);
936}
937
938TEST_F(ComputeTest, imod_neg)
939{
940   const char *kernel_source =
941   "__kernel void main_test(__global int *inout)\n\
942   {\n\
943       inout[get_global_id(0)] = inout[get_global_id(0)] % -3;\n\
944   }\n";
945   auto inout = ShaderArg<int32_t>({ -4, -3, -2, -1, 0, 1, 2, 3, 4 },
946                                   SHADER_ARG_INOUT);
947   const int32_t expected[] = {
948      -1, 0, -2, -1,  0, 1, 2, 0, 1
949   };
950   run_shader(kernel_source, inout.size(), 1, 1, inout);
951   for (int i = 0; i < inout.size(); ++i)
952      EXPECT_EQ(inout[i], expected[i]);
953}
954
955TEST_F(ComputeTest, umod)
956{
957   const char *kernel_source =
958   "__kernel void main_test(__global uint *inout)\n\
959   {\n\
960       inout[get_global_id(0)] = inout[get_global_id(0)] % 0xfffffffc;\n\
961   }\n";
962   auto inout = ShaderArg<uint32_t>({ 0xfffffffa, 0xfffffffb, 0xfffffffc, 0xfffffffd, 0xfffffffe },
963                                    SHADER_ARG_INOUT);
964   const uint32_t expected[] = {
965      0xfffffffa, 0xfffffffb, 0, 1, 2
966   };
967   run_shader(kernel_source, inout.size(), 1, 1, inout);
968   for (int i = 0; i < inout.size(); ++i)
969      EXPECT_EQ(inout[i], expected[i]);
970}
971
972TEST_F(ComputeTest, rotate)
973{
974   const char *kernel_source =
975   "__kernel void main_test(__global uint *inout)\n\
976   {\n\
977       inout[get_global_id(0)] = rotate(inout[get_global_id(0)], (uint)get_global_id(0) * 4);\n\
978   }\n";
979   auto inout = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
980                                    SHADER_ARG_INOUT);
981   const uint32_t expected[] = {
982      0xdeadbeef, 0xeadbeefd, 0xadbeefde, 0xdbeefdea
983   };
984   run_shader(kernel_source, inout.size(), 1, 1, inout);
985   for (int i = 0; i < inout.size(); ++i)
986      EXPECT_EQ(inout[i], expected[i]);
987}
988
989TEST_F(ComputeTest, popcount)
990{
991   const char *kernel_source =
992   "__kernel void main_test(__global uint *inout)\n\
993   {\n\
994       inout[get_global_id(0)] = popcount(inout[get_global_id(0)]);\n\
995   }\n";
996   auto inout = ShaderArg<uint32_t>({ 0, 0x1, 0x3, 0x101, 0x110011, ~0u },
997                                    SHADER_ARG_INOUT);
998   const uint32_t expected[] = {
999      0, 1, 2, 2, 4, 32
1000   };
1001   run_shader(kernel_source, inout.size(), 1, 1, inout);
1002   for (int i = 0; i < inout.size(); ++i)
1003      EXPECT_EQ(inout[i], expected[i]);
1004}
1005
1006TEST_F(ComputeTest, hadd)
1007{
1008   const char *kernel_source =
1009   "__kernel void main_test(__global uint *inout)\n\
1010   {\n\
1011       inout[get_global_id(0)] = hadd(inout[get_global_id(0)], 1u << 31);\n\
1012   }\n";
1013   auto inout = ShaderArg<uint32_t>({ 0, 1, 2, 3, 0xfffffffc, 0xfffffffd, 0xfffffffe, 0xffffffff },
1014                                    SHADER_ARG_INOUT);
1015   const uint32_t expected[] = {
1016      (1u << 31) >> 1,
1017      ((1u << 31) + 1) >> 1,
1018      ((1u << 31) + 2) >> 1,
1019      ((1u << 31) + 3) >> 1,
1020      ((1ull << 31) + 0xfffffffc) >> 1,
1021      ((1ull << 31) + 0xfffffffd) >> 1,
1022      ((1ull << 31) + 0xfffffffe) >> 1,
1023      ((1ull << 31) + 0xffffffff) >> 1,
1024   };
1025   run_shader(kernel_source, inout.size(), 1, 1, inout);
1026   for (int i = 0; i < inout.size(); ++i)
1027      EXPECT_EQ(inout[i], expected[i]);
1028}
1029
1030TEST_F(ComputeTest, rhadd)
1031{
1032   const char *kernel_source =
1033   "__kernel void main_test(__global uint *inout)\n\
1034   {\n\
1035       inout[get_global_id(0)] = rhadd(inout[get_global_id(0)], 1u << 31);\n\
1036   }\n";
1037   auto inout = ShaderArg<uint32_t>({ 0, 1, 2, 3, 0xfffffffc, 0xfffffffd, 0xfffffffe, 0xffffffff },
1038                                    SHADER_ARG_INOUT);
1039   const uint32_t expected[] = {
1040      ((1u << 31) + 1) >> 1,
1041      ((1u << 31) + 2) >> 1,
1042      ((1u << 31) + 3) >> 1,
1043      ((1u << 31) + 4) >> 1,
1044      ((1ull << 31) + 0xfffffffd) >> 1,
1045      ((1ull << 31) + 0xfffffffe) >> 1,
1046      ((1ull << 31) + 0xffffffff) >> 1,
1047      ((1ull << 31) + (1ull << 32)) >> 1,
1048   };
1049   run_shader(kernel_source, inout.size(), 1, 1, inout);
1050   for (int i = 0; i < inout.size(); ++i)
1051      EXPECT_EQ(inout[i], expected[i]);
1052}
1053
1054TEST_F(ComputeTest, add_sat)
1055{
1056   const char *kernel_source =
1057   "__kernel void main_test(__global uint *inout)\n\
1058   {\n\
1059       inout[get_global_id(0)] = add_sat(inout[get_global_id(0)], 2u);\n\
1060   }\n";
1061   auto inout = ShaderArg<uint32_t>({ 0xffffffff - 3, 0xffffffff - 2, 0xffffffff - 1, 0xffffffff },
1062                                    SHADER_ARG_INOUT);
1063   const uint32_t expected[] = {
1064      0xffffffff - 1, 0xffffffff, 0xffffffff, 0xffffffff
1065   };
1066   run_shader(kernel_source, inout.size(), 1, 1, inout);
1067   for (int i = 0; i < inout.size(); ++i)
1068      EXPECT_EQ(inout[i], expected[i]);
1069}
1070
1071TEST_F(ComputeTest, sub_sat)
1072{
1073   const char *kernel_source =
1074   "__kernel void main_test(__global uint *inout)\n\
1075   {\n\
1076       inout[get_global_id(0)] = sub_sat(inout[get_global_id(0)], 2u);\n\
1077   }\n";
1078   auto inout = ShaderArg<uint32_t>({ 0, 1, 2, 3 }, SHADER_ARG_INOUT);
1079   const uint32_t expected[] = {
1080      0, 0, 0, 1
1081   };
1082   run_shader(kernel_source, inout.size(), 1, 1, inout);
1083   for (int i = 0; i < inout.size(); ++i)
1084      EXPECT_EQ(inout[i], expected[i]);
1085}
1086
1087TEST_F(ComputeTest, mul_hi)
1088{
1089   const char *kernel_source =
1090   "__kernel void main_test(__global uint *inout)\n\
1091   {\n\
1092       inout[get_global_id(0)] = mul_hi(inout[get_global_id(0)], 1u << 31);\n\
1093   }\n";
1094   auto inout = ShaderArg<uint32_t>({ 0, 1, 2, 3, (1u << 31) }, SHADER_ARG_INOUT);
1095   const uint32_t expected[] = {
1096      0, 0, 1, 1, (1u << 30)
1097   };
1098   run_shader(kernel_source, inout.size(), 1, 1, inout);
1099   for (int i = 0; i < inout.size(); ++i)
1100      EXPECT_EQ(inout[i], expected[i]);
1101}
1102
1103TEST_F(ComputeTest, ldexp_x)
1104{
1105   const char *kernel_source =
1106   "__kernel void main_test(__global float *inout)\n\
1107   {\n\
1108       inout[get_global_id(0)] = ldexp(inout[get_global_id(0)], 5);\n\
1109   }\n";
1110   auto inout = ShaderArg<float>({ 0.0f, 0.5f, 1.0f, 2.0f }, SHADER_ARG_INOUT);
1111   const float expected[] = {
1112      ldexp(0.0f, 5), ldexp(0.5f, 5), ldexp(1.0f, 5), ldexp(2.0f, 5)
1113   };
1114   run_shader(kernel_source, inout.size(), 1, 1, inout);
1115   for (int i = 0; i < inout.size(); ++i)
1116      EXPECT_FLOAT_EQ(inout[i], expected[i]);
1117}
1118
1119TEST_F(ComputeTest, ldexp_y)
1120{
1121   const char *kernel_source =
1122   "__kernel void main_test(__global float *inout)\n\
1123   {\n\
1124       inout[get_global_id(0)] = ldexp(inout[get_global_id(0)], get_global_id(0));\n\
1125   }\n";
1126   auto inout = ShaderArg<float>({ 0.25f, 0.5f, 0.75f, 1.0f }, SHADER_ARG_INOUT);
1127   const float expected[] = {
1128      ldexp(0.25f, 0), ldexp(0.5f, 1), ldexp(0.75f, 2), ldexp(1.0f, 3)
1129   };
1130   run_shader(kernel_source, inout.size(), 1, 1, inout);
1131   for (int i = 0; i < inout.size(); ++i)
1132      EXPECT_FLOAT_EQ(inout[i], expected[i]);
1133}
1134
1135TEST_F(ComputeTest, frexp_ret)
1136{
1137   const char *kernel_source =
1138   "__kernel void main_test(__global float *inout)\n\
1139   {\n\
1140       int exp;\n\
1141       inout[get_global_id(0)] = frexp(inout[get_global_id(0)], &exp);\n\
1142   }\n";
1143   auto inout = ShaderArg<float>({ 0.0f, 0.5f, 1.0f, 3.0f }, SHADER_ARG_INOUT);
1144   const float expected[] = {
1145      0.0f, 0.5f, 0.5f, 0.75f
1146   };
1147   run_shader(kernel_source, inout.size(), 1, 1, inout);
1148   for (int i = 0; i < inout.size(); ++i)
1149      EXPECT_FLOAT_EQ(inout[i], expected[i]);
1150}
1151
1152TEST_F(ComputeTest, frexp_exp)
1153{
1154   const char *kernel_source =
1155   "__kernel void main_test(__global float *inout)\n\
1156   {\n\
1157       int exp;\n\
1158       frexp(inout[get_global_id(0)], &exp);\n\
1159       inout[get_global_id(0)] = (float)exp;\n\
1160   }\n";
1161   auto inout = ShaderArg<float>({ 0.0f, 0.5f, 1.0f, 3.0f }, SHADER_ARG_INOUT);
1162   const float expected[] = {
1163      0.0f, 0.0f, 1.0f, 2.0f
1164   };
1165   run_shader(kernel_source, inout.size(), 1, 1, inout);
1166   for (int i = 0; i < inout.size(); ++i)
1167      EXPECT_FLOAT_EQ(inout[i], expected[i]);
1168}
1169
1170TEST_F(ComputeTest, clz)
1171{
1172   const char *kernel_source =
1173   "__kernel void main_test(__global uint *inout)\n\
1174   {\n\
1175       inout[get_global_id(0)] = clz(inout[get_global_id(0)]);\n\
1176   }\n";
1177   auto inout = ShaderArg<uint32_t>({ 0, 1, 0xffff,  (1u << 30), (1u << 31) }, SHADER_ARG_INOUT);
1178   const uint32_t expected[] = {
1179      32, 31, 16, 1, 0
1180   };
1181   run_shader(kernel_source, inout.size(), 1, 1, inout);
1182   for (int i = 0; i < inout.size(); ++i)
1183      EXPECT_FLOAT_EQ(inout[i], expected[i]);
1184}
1185
1186TEST_F(ComputeTest, sin)
1187{
1188   struct sin_vals { float in; float clc; float native; };
1189   const char *kernel_source =
1190   "struct sin_vals { float in; float clc; float native; };\n\
1191   __kernel void main_test(__global struct sin_vals *inout)\n\
1192   {\n\
1193       inout[get_global_id(0)].clc = sin(inout[get_global_id(0)].in);\n\
1194       inout[get_global_id(0)].native = native_sin(inout[get_global_id(0)].in);\n\
1195   }\n";
1196   const vector<sin_vals> input = {
1197      { 0.0f, 0.0f, 0.0f },
1198      { 1.0f, 0.0f, 0.0f },
1199      { 2.0f, 0.0f, 0.0f },
1200      { 3.0f, 0.0f, 0.0f },
1201   };
1202   auto inout = ShaderArg<sin_vals>(input, SHADER_ARG_INOUT);
1203   const struct sin_vals expected[] = {
1204      { 0.0f, 0.0f,       0.0f       },
1205      { 1.0f, sin(1.0f), sin(1.0f) },
1206      { 2.0f, sin(2.0f), sin(2.0f) },
1207      { 3.0f, sin(3.0f), sin(3.0f) },
1208   };
1209   run_shader(kernel_source, inout.size(), 1, 1, inout);
1210   for (int i = 0; i < inout.size(); ++i) {
1211      EXPECT_FLOAT_EQ(inout[i].in, inout[i].in);
1212      EXPECT_FLOAT_EQ(inout[i].clc, inout[i].clc);
1213      EXPECT_NEAR(inout[i].clc, inout[i].native, 0.008f); // range from DXIL spec
1214   }
1215}
1216
1217TEST_F(ComputeTest, cosh)
1218{
1219   const char *kernel_source =
1220   "__kernel void main_test(__global float *inout)\n\
1221   {\n\
1222       inout[get_global_id(0)] = cosh(inout[get_global_id(0)]);\n\
1223   }\n";
1224   auto inout = ShaderArg<float>({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT);
1225   const float expected[] = {
1226      cosh(0.0f), cosh(1.0f), cosh(2.0f), cosh(3.0f)
1227   };
1228   run_shader(kernel_source, inout.size(), 1, 1, inout);
1229   for (int i = 0; i < inout.size(); ++i)
1230      EXPECT_FLOAT_EQ(inout[i], expected[i]);
1231}
1232
1233TEST_F(ComputeTest, exp)
1234{
1235   const char *kernel_source =
1236   "__kernel void main_test(__global float *inout)\n\
1237   {\n\
1238       inout[get_global_id(0)] = native_exp(inout[get_global_id(0)]);\n\
1239   }\n";
1240   auto inout = ShaderArg<float>({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT);
1241   const float expected[] = {
1242      exp(0.0f), exp(1.0f), exp(2.0f), exp(3.0f)
1243   };
1244   run_shader(kernel_source, inout.size(), 1, 1, inout);
1245   for (int i = 0; i < inout.size(); ++i)
1246      EXPECT_FLOAT_EQ(inout[i], expected[i]);
1247}
1248
1249TEST_F(ComputeTest, exp10)
1250{
1251   const char *kernel_source =
1252   "__kernel void main_test(__global float *inout)\n\
1253   {\n\
1254       inout[get_global_id(0)] = native_exp10(inout[get_global_id(0)]);\n\
1255   }\n";
1256   auto inout = ShaderArg<float>({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT);
1257   const float expected[] = {
1258      pow(10.0f, 0.0f), pow(10.0f, 1.0f), pow(10.0f, 2.0f), pow(10.0f, 3.0f)
1259   };
1260   run_shader(kernel_source, inout.size(), 1, 1, inout);
1261   for (int i = 0; i < inout.size(); ++i)
1262      EXPECT_FLOAT_EQ(inout[i], expected[i]);
1263}
1264
1265TEST_F(ComputeTest, exp2)
1266{
1267   const char *kernel_source =
1268   "__kernel void main_test(__global float *inout)\n\
1269   {\n\
1270       inout[get_global_id(0)] = native_exp2(inout[get_global_id(0)]);\n\
1271   }\n";
1272   auto inout = ShaderArg<float>({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT);
1273   const float expected[] = {
1274      pow(2.0f, 0.0f), pow(2.0f, 1.0f), pow(2.0f, 2.0f), pow(2.0f, 3.0f)
1275   };
1276   run_shader(kernel_source, inout.size(), 1, 1, inout);
1277   for (int i = 0; i < inout.size(); ++i)
1278      EXPECT_FLOAT_EQ(inout[i], expected[i]);
1279}
1280
1281TEST_F(ComputeTest, log)
1282{
1283   const char *kernel_source =
1284   "__kernel void main_test(__global float *inout)\n\
1285   {\n\
1286       inout[get_global_id(0)] = native_log(inout[get_global_id(0)]);\n\
1287   }\n";
1288   auto inout = ShaderArg<float>({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT);
1289   const float expected[] = {
1290      log(0.0f), log(1.0f), log(2.0f), log(3.0f)
1291   };
1292   run_shader(kernel_source, inout.size(), 1, 1, inout);
1293   for (int i = 0; i < inout.size(); ++i)
1294      EXPECT_FLOAT_EQ(inout[i], expected[i]);
1295}
1296
1297TEST_F(ComputeTest, log10)
1298{
1299   const char *kernel_source =
1300   "__kernel void main_test(__global float *inout)\n\
1301   {\n\
1302       inout[get_global_id(0)] = native_log10(inout[get_global_id(0)]);\n\
1303   }\n";
1304   auto inout = ShaderArg<float>({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT);
1305   const float expected[] = {
1306      log10(0.0f), log10(1.0f), log10(2.0f), log10(3.0f)
1307   };
1308   run_shader(kernel_source, inout.size(), 1, 1, inout);
1309   for (int i = 0; i < inout.size(); ++i)
1310      EXPECT_FLOAT_EQ(inout[i], expected[i]);
1311}
1312
1313TEST_F(ComputeTest, log2)
1314{
1315   const char *kernel_source =
1316   "__kernel void main_test(__global float *inout)\n\
1317   {\n\
1318       inout[get_global_id(0)] = native_log2(inout[get_global_id(0)]);\n\
1319   }\n";
1320   auto inout = ShaderArg<float>({ 0.0f, 1.0f, 2.0f, 3.0f }, SHADER_ARG_INOUT);
1321   const float expected[] = {
1322      log(0.0f) / log(2.0f), log(1.0f) / log(2.0f), log(2.0f) / log(2.0f), log(3.0f) / log(2.0f)
1323   };
1324   run_shader(kernel_source, inout.size(), 1, 1, inout);
1325   for (int i = 0; i < inout.size(); ++i)
1326      EXPECT_FLOAT_EQ(inout[i], expected[i]);
1327}
1328
1329TEST_F(ComputeTest, rint)
1330{
1331   const char *kernel_source =
1332   "__kernel void main_test(__global float *inout)\n\
1333   {\n\
1334      inout[get_global_id(0)] = rint(inout[get_global_id(0)]);\n\
1335   }\n";
1336
1337   auto inout = ShaderArg<float>({ 0.5f, 1.5f, -0.5f, -1.5f, 1.4f }, SHADER_ARG_INOUT);
1338   const float expected[] = {
1339      0.0f, 2.0f, 0.0f, -2.0f, 1.0f,
1340   };
1341   run_shader(kernel_source, inout.size(), 1, 1, inout);
1342   for (int i = 0; i < inout.size(); ++i)
1343      EXPECT_FLOAT_EQ(inout[i], expected[i]);
1344}
1345
1346TEST_F(ComputeTest, round)
1347{
1348   const char *kernel_source =
1349   "__kernel void main_test(__global float *inout)\n\
1350   {\n\
1351       inout[get_global_id(0)] = round(inout[get_global_id(0)]);\n\
1352   }\n";
1353   auto inout = ShaderArg<float>({ 0, 0.3f, -0.3f, 0.5f, -0.5f, 1.1f, -1.1f },
1354                                 SHADER_ARG_INOUT);
1355   const float expected[] = {
1356      0.0f, 0.0f, -0.0f, 1.0f, -1.0f, 1.0f, -1.0f
1357   };
1358   run_shader(kernel_source, inout.size(), 1, 1, inout);
1359   for (int i = 0; i < inout.size(); ++i)
1360      EXPECT_FLOAT_EQ(inout[i], expected[i]);
1361}
1362
1363TEST_F(ComputeTest, arg_by_val)
1364{
1365   const char *kernel_source =
1366   "__kernel void main_test(__global float *inout, float mul)\n\
1367   {\n\
1368       inout[get_global_id(0)] = inout[get_global_id(0)] * mul;\n\
1369   }\n";
1370   auto inout = ShaderArg<float>({ 0, 0.3f, -0.3f, 0.5f, -0.5f, 1.1f, -1.1f },
1371                                 SHADER_ARG_INOUT);
1372   auto mul = ShaderArg<float>(10.0f, SHADER_ARG_INPUT);
1373   const float expected[] = {
1374      0.0f, 3.0f, -3.0f, 5.0f, -5.0f, 11.0f, -11.0f
1375   };
1376   run_shader(kernel_source, inout.size(), 1, 1, inout, mul);
1377   for (int i = 0; i < inout.size(); ++i)
1378      EXPECT_FLOAT_EQ(inout[i], expected[i]);
1379}
1380
1381TEST_F(ComputeTest, uint8_by_val)
1382{
1383   struct uint8 {
1384      uint32_t s0; uint32_t s1; uint32_t s2; uint32_t s3;
1385      uint32_t s4; uint32_t s5; uint32_t s6; uint32_t s7;
1386   };
1387   const char *kernel_source =
1388   "__kernel void main_test(__global uint *out, uint8 val)\n\
1389   {\n\
1390       out[get_global_id(0)] = val.s0 + val.s1 + val.s2 + val.s3 +\n\
1391                               val.s4 + val.s5 + val.s6 + val.s7;\n\
1392   }\n";
1393   auto out = ShaderArg<uint32_t>({ 0 }, SHADER_ARG_OUTPUT);
1394   auto val = ShaderArg<struct uint8>({ {0, 1, 2, 3, 4, 5, 6, 7 }}, SHADER_ARG_INPUT);
1395   const uint32_t expected[] = { 0 + 1 + 2 + 3 + 4 + 5 + 6 + 7 };
1396   run_shader(kernel_source, out.size(), 1, 1, out, val);
1397   for (int i = 0; i < out.size(); ++i)
1398      EXPECT_EQ(out[i], expected[i]);
1399}
1400
1401TEST_F(ComputeTest, link)
1402{
1403   const char *foo_src =
1404   "float foo(float in)\n\
1405   {\n\
1406       return in * in;\n\
1407   }\n";
1408   const char *kernel_source =
1409   "float foo(float in);\n\
1410   __kernel void main_test(__global float *inout)\n\
1411   {\n\
1412       inout[get_global_id(0)] = foo(inout[get_global_id(0)]);\n\
1413   }\n";
1414   std::vector<const char *> srcs = { foo_src, kernel_source };
1415   auto inout = ShaderArg<float>({ 2.0f }, SHADER_ARG_INOUT);
1416   const float expected[] = {
1417      4.0f,
1418   };
1419   run_shader(srcs, inout.size(), 1, 1, inout);
1420   for (int i = 0; i < inout.size(); ++i)
1421      EXPECT_EQ(inout[i], expected[i]);
1422}
1423
1424TEST_F(ComputeTest, link_library)
1425{
1426   const char *bar_src =
1427   "float bar(float in)\n\
1428   {\n\
1429      return in * 5;\n\
1430   }\n";
1431   const char *foo_src =
1432   "float bar(float in);\n\
1433   float foo(float in)\n\
1434   {\n\
1435       return in * bar(in);\n\
1436   }\n";
1437   const char *kernel_source =
1438   "float foo(float in);\n\
1439   __kernel void main_test(__global float *inout)\n\
1440   {\n\
1441       inout[get_global_id(0)] = foo(inout[get_global_id(0)]);\n\
1442   }\n";
1443   std::vector<Shader> libraries = {
1444      compile({ bar_src, kernel_source }, {}, true),
1445      compile({ foo_src }, {}, true)
1446   };
1447   Shader exe = link(libraries);
1448   auto inout = ShaderArg<float>({ 2.0f }, SHADER_ARG_INOUT);
1449   const float expected[] = {
1450      20.0f,
1451   };
1452   run_shader(exe, { (unsigned)inout.size(), 1, 1 }, inout);
1453   for (int i = 0; i < inout.size(); ++i)
1454      EXPECT_EQ(inout[i], expected[i]);
1455}
1456
1457TEST_F(ComputeTest, localvar)
1458{
1459   const char *kernel_source =
1460   "__kernel __attribute__((reqd_work_group_size(2, 1, 1)))\n\
1461   void main_test(__global float *inout)\n\
1462   {\n\
1463      __local float2 tmp[2];\n\
1464      tmp[get_local_id(0)].x = inout[get_global_id(0)] + 1;\n\
1465      tmp[get_local_id(0)].y = inout[get_global_id(0)] - 1;\n\
1466      barrier(CLK_LOCAL_MEM_FENCE);\n\
1467      inout[get_global_id(0)] = tmp[get_local_id(0) % 2].x * tmp[(get_local_id(0) + 1) % 2].y;\n\
1468   }\n";
1469
1470   auto inout = ShaderArg<float>({ 2.0f, 4.0f }, SHADER_ARG_INOUT);
1471   const float expected[] = {
1472      9.0f, 5.0f
1473   };
1474   run_shader(kernel_source, inout.size(), 1, 1, inout);
1475   for (int i = 0; i < inout.size(); ++i)
1476      EXPECT_EQ(inout[i], expected[i]);
1477}
1478
1479TEST_F(ComputeTest, localvar_uchar2)
1480{
1481   const char *kernel_source =
1482   "__attribute__((reqd_work_group_size(2, 1, 1)))\n\
1483   __kernel void main_test(__global uchar *inout)\n\
1484   {\n\
1485      __local uchar2 tmp[2];\n\
1486      tmp[get_local_id(0)].x = inout[get_global_id(0)] + 1;\n\
1487      tmp[get_local_id(0)].y = inout[get_global_id(0)] - 1;\n\
1488      barrier(CLK_LOCAL_MEM_FENCE);\n\
1489      inout[get_global_id(0)] = tmp[get_local_id(0) % 2].x * tmp[(get_local_id(0) + 1) % 2].y;\n\
1490   }\n";
1491
1492   auto inout = ShaderArg<uint8_t>({ 2, 4 }, SHADER_ARG_INOUT);
1493   const uint8_t expected[] = { 9, 5 };
1494   run_shader(kernel_source, inout.size(), 1, 1, inout);
1495   for (int i = 0; i < inout.size(); ++i)
1496      EXPECT_EQ(inout[i], expected[i]);
1497}
1498
1499TEST_F(ComputeTest, work_group_size_hint)
1500{
1501   const char *kernel_source =
1502   "__attribute__((work_group_size_hint(2, 1, 1)))\n\
1503   __kernel void main_test(__global uint *output)\n\
1504   {\n\
1505       output[get_global_id(0)] = get_local_id(0);\n\
1506   }\n";
1507   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
1508                                     SHADER_ARG_OUTPUT);
1509   const uint32_t expected[] = {
1510      0, 1, 2, 3
1511   };
1512   run_shader(kernel_source, output.size(), 1, 1, output);
1513   for (int i = 0; i < output.size(); ++i)
1514      EXPECT_EQ(output[i], expected[i]);
1515}
1516
1517TEST_F(ComputeTest, reqd_work_group_size)
1518{
1519   const char *kernel_source =
1520   "__attribute__((reqd_work_group_size(2, 1, 1)))\n\
1521   __kernel void main_test(__global uint *output)\n\
1522   {\n\
1523       output[get_global_id(0)] = get_local_id(0);\n\
1524   }\n";
1525   auto output = ShaderArg<uint32_t>(std::vector<uint32_t>(4, 0xdeadbeef),
1526                                     SHADER_ARG_OUTPUT);
1527   const uint32_t expected[] = {
1528      0, 1, 0, 1
1529   };
1530   run_shader(kernel_source, output.size(), 1, 1, output);
1531   for (int i = 0; i < output.size(); ++i)
1532      EXPECT_EQ(output[i], expected[i]);
1533}
1534
1535TEST_F(ComputeTest, image)
1536{
1537   const char* kernel_source =
1538   "__kernel void main_test(read_only image2d_t input, write_only image2d_t output)\n\
1539   {\n\
1540      int2 coords = (int2)(get_global_id(0), get_global_id(1));\n\
1541      write_imagef(output, coords, read_imagef(input, coords));\n\
1542   }\n";
1543   Shader shader = compile(std::vector<const char*>({ kernel_source }));
1544   validate(shader);
1545}
1546
1547TEST_F(ComputeTest, image_two_reads)
1548{
1549   const char* kernel_source =
1550   "__kernel void main_test(image2d_t image, int is_float, __global float* output)\n\
1551   {\n\
1552      if (is_float)\n\
1553         output[get_global_id(0)] = read_imagef(image, (int2)(0, 0)).x;\n\
1554      else \n\
1555         output[get_global_id(0)] = (float)read_imagei(image, (int2)(0, 0)).x;\n\
1556   }\n";
1557   Shader shader = compile(std::vector<const char*>({ kernel_source }));
1558   validate(shader);
1559}
1560
1561TEST_F(ComputeTest, image_unused)
1562{
1563   const char* kernel_source =
1564   "__kernel void main_test(read_only image2d_t input, write_only image2d_t output)\n\
1565   {\n\
1566   }\n";
1567   Shader shader = compile(std::vector<const char*>({ kernel_source }));
1568   validate(shader);
1569}
1570
1571TEST_F(ComputeTest, image_read_write)
1572{
1573   const char *kernel_source =
1574   R"(__kernel void main_test(read_write image2d_t image)
1575   {
1576      int2 coords = (int2)(get_global_id(0), get_global_id(1));
1577      write_imagef(image, coords, read_imagef(image, coords) + (float4)(1.0f, 1.0f, 1.0f, 1.0f));
1578   })";
1579   Shader shader = compile(std::vector<const char*>({ kernel_source }), { "-cl-std=cl3.0" });
1580   validate(shader);
1581}
1582
1583TEST_F(ComputeTest, sampler)
1584{
1585   const char* kernel_source =
1586   "__kernel void main_test(image2d_t image, sampler_t sampler, __global float* output)\n\
1587   {\n\
1588      output[get_global_id(0)] = read_imagef(image, sampler, (int2)(0, 0)).x;\n\
1589   }\n";
1590   Shader shader = compile(std::vector<const char*>({ kernel_source }));
1591   validate(shader);
1592}
1593
1594TEST_F(ComputeTest, image_dims)
1595{
1596   const char* kernel_source =
1597   "__kernel void main_test(image2d_t roimage, write_only image2d_t woimage, __global uint* output)\n\
1598   {\n\
1599      output[get_global_id(0)] = get_image_width(roimage);\n\
1600      output[get_global_id(0) + 1] = get_image_width(woimage);\n\
1601   }\n";
1602   Shader shader = compile(std::vector<const char*>({ kernel_source }));
1603   validate(shader);
1604}
1605
1606TEST_F(ComputeTest, image_format)
1607{
1608   const char* kernel_source =
1609   "__kernel void main_test(image2d_t roimage, write_only image2d_t woimage, __global uint* output)\n\
1610   {\n\
1611      output[get_global_id(0)] = get_image_channel_data_type(roimage);\n\
1612      output[get_global_id(0) + 1] = get_image_channel_order(woimage);\n\
1613   }\n";
1614   Shader shader = compile(std::vector<const char*>({ kernel_source }));
1615   validate(shader);
1616}
1617
1618TEST_F(ComputeTest, image1d_buffer_t)
1619{
1620   const char* kernel_source =
1621   "__kernel void main_test(read_only image1d_buffer_t input, write_only image1d_buffer_t output)\n\
1622   {\n\
1623      write_imageui(output, get_global_id(0), read_imageui(input, get_global_id(0)));\n\
1624   }\n";
1625   Shader shader = compile(std::vector<const char*>({ kernel_source }));
1626   validate(shader);
1627}
1628
1629TEST_F(ComputeTest, local_ptr)
1630{
1631   struct uint2 { uint32_t x, y; };
1632   const char *kernel_source =
1633   "__kernel void main_test(__global uint *inout, __local uint2 *tmp)\n\
1634   {\n\
1635      tmp[get_local_id(0)].x = inout[get_global_id(0)] + 1;\n\
1636      tmp[get_local_id(0)].y = inout[get_global_id(0)] - 1;\n\
1637      barrier(CLK_LOCAL_MEM_FENCE);\n\
1638      inout[get_global_id(0)] = tmp[get_local_id(0) % 2].x * tmp[(get_local_id(0) + 1) % 2].y;\n\
1639   }\n";
1640   auto inout = ShaderArg<uint32_t>({ 2, 4 }, SHADER_ARG_INOUT);
1641   auto tmp = ShaderArg<struct uint2>(std::vector<struct uint2>(4096), SHADER_ARG_INPUT);
1642   const uint8_t expected[] = { 9, 5 };
1643   run_shader(kernel_source, inout.size(), 1, 1, inout, tmp);
1644   for (int i = 0; i < inout.size(); ++i)
1645      EXPECT_EQ(inout[i], expected[i]);
1646}
1647
1648TEST_F(ComputeTest, two_local_ptrs)
1649{
1650   struct uint2 { uint32_t x, y; };
1651   const char *kernel_source =
1652   "__kernel void main_test(__global uint *inout, __local uint2 *tmp, __local uint *tmp2)\n\
1653   {\n\
1654      tmp[get_local_id(0)].x = inout[get_global_id(0)] + 1;\n\
1655      tmp[get_local_id(0)].y = inout[get_global_id(0)] - 1;\n\
1656      tmp2[get_local_id(0)] = get_global_id(0);\n\
1657      barrier(CLK_LOCAL_MEM_FENCE);\n\
1658      inout[get_global_id(0)] = tmp[get_local_id(0) % 2].x * tmp[(get_local_id(0) + 1) % 2].y + tmp2[get_local_id(0) % 2];\n\
1659   }\n";
1660   auto inout = ShaderArg<uint32_t>({ 2, 4 }, SHADER_ARG_INOUT);
1661   auto tmp = ShaderArg<struct uint2>(std::vector<struct uint2>(1024), SHADER_ARG_INPUT);
1662   auto tmp2 = ShaderArg<uint32_t>(std::vector<uint32_t>(1024), SHADER_ARG_INPUT);
1663   const uint8_t expected[] = { 9, 6 };
1664   run_shader(kernel_source, inout.size(), 1, 1, inout, tmp, tmp2);
1665   for (int i = 0; i < inout.size(); ++i)
1666      EXPECT_EQ(inout[i], expected[i]);
1667}
1668
1669TEST_F(ComputeTest, int8_to_float)
1670{
1671   const char *kernel_source =
1672   "__kernel void main_test(__global char* in, __global float* out)\n\
1673   {\n\
1674      uint pos = get_global_id(0);\n\
1675      out[pos] = in[pos] / 100.0f;\n\
1676   }";
1677   auto in = ShaderArg<char>({ 10, 20, 30, 40 }, SHADER_ARG_INPUT);
1678   auto out = ShaderArg<float>(std::vector<float>(4, std::numeric_limits<float>::infinity()), SHADER_ARG_OUTPUT);
1679   const float expected[] = { 0.1f, 0.2f, 0.3f, 0.4f };
1680   run_shader(kernel_source, in.size(), 1, 1, in, out);
1681   for (int i = 0; i < in.size(); ++i)
1682      EXPECT_FLOAT_EQ(out[i], expected[i]);
1683}
1684
1685TEST_F(ComputeTest, vec_hint_float4)
1686{
1687   const char *kernel_source =
1688   "__kernel __attribute__((vec_type_hint(float4))) void main_test(__global float *inout)\n\
1689   {\n\
1690      inout[get_global_id(0)] *= inout[get_global_id(1)];\n\
1691   }";
1692   Shader shader = compile({ kernel_source });
1693   EXPECT_EQ(shader.metadata->kernels[0].vec_hint_size, 4);
1694   EXPECT_EQ(shader.metadata->kernels[0].vec_hint_type, CLC_VEC_HINT_TYPE_FLOAT);
1695}
1696
1697TEST_F(ComputeTest, vec_hint_uchar2)
1698{
1699   const char *kernel_source =
1700   "__kernel __attribute__((vec_type_hint(uchar2))) void main_test(__global float *inout)\n\
1701   {\n\
1702      inout[get_global_id(0)] *= inout[get_global_id(1)];\n\
1703   }";
1704   Shader shader = compile({ kernel_source });
1705   EXPECT_EQ(shader.metadata->kernels[0].vec_hint_size, 2);
1706   EXPECT_EQ(shader.metadata->kernels[0].vec_hint_type, CLC_VEC_HINT_TYPE_CHAR);
1707}
1708
1709TEST_F(ComputeTest, vec_hint_none)
1710{
1711   const char *kernel_source =
1712   "__kernel void main_test(__global float *inout)\n\
1713   {\n\
1714      inout[get_global_id(0)] *= inout[get_global_id(1)];\n\
1715   }";
1716   Shader shader = compile({ kernel_source });
1717   EXPECT_EQ(shader.metadata->kernels[0].vec_hint_size, 0);
1718}
1719
1720TEST_F(ComputeTest, DISABLED_debug_layer_failure)
1721{
1722   /* This is a negative test case, it intentionally triggers a failure to validate the mechanism
1723    * is in place, so other tests will fail if they produce debug messages
1724    */
1725   const char *kernel_source =
1726   "__kernel void main_test(__global float *inout, float mul)\n\
1727   {\n\
1728       inout[get_global_id(0)] = inout[get_global_id(0)] * mul;\n\
1729   }\n";
1730   auto inout = ShaderArg<float>({ 0, 0.3f, -0.3f, 0.5f, -0.5f, 1.1f, -1.1f },
1731                                 SHADER_ARG_INOUT);
1732   auto mul = ShaderArg<float>(10.0f, SHADER_ARG_INPUT);
1733   const float expected[] = {
1734      0.0f, 3.0f, -3.0f, 5.0f, -5.0f, 11.0f, -11.0f
1735   };
1736   ComPtr<ID3D12InfoQueue> info_queue;
1737   dev->QueryInterface(info_queue.ReleaseAndGetAddressOf());
1738   if (!info_queue) {
1739      GTEST_SKIP() << "No info queue";
1740      return;
1741   }
1742
1743   info_queue->AddApplicationMessage(D3D12_MESSAGE_SEVERITY_ERROR, "This should cause the test to fail");
1744   run_shader(kernel_source, inout.size(), 1, 1, inout, mul);
1745   for (int i = 0; i < inout.size(); ++i)
1746      EXPECT_FLOAT_EQ(inout[i], expected[i]);
1747}
1748
1749TEST_F(ComputeTest, compiler_defines)
1750{
1751   const char *kernel_source =
1752      "__kernel void main_test(__global int* out)\n\
1753   {\n\
1754      out[0] = OUT_VAL0;\n\
1755      out[1] = __OPENCL_C_VERSION__;\n\
1756   }";
1757   auto out = ShaderArg<int>(std::vector<int>(2, 0), SHADER_ARG_OUTPUT);
1758   CompileArgs compile_args = { 1, 1, 1 };
1759   compile_args.compiler_command_line = { "-DOUT_VAL0=5", "-cl-std=cl" };
1760   std::vector<RawShaderArg *> raw_args = { &out };
1761   run_shader({ kernel_source }, compile_args, out);
1762   EXPECT_EQ(out[0], 5);
1763   EXPECT_EQ(out[1], 100);
1764}
1765
1766TEST_F(ComputeTest, global_atomic_add)
1767{
1768   const char *kernel_source =
1769   "__kernel void main_test(__global int *inout, __global int *old)\n\
1770   {\n\
1771      old[get_global_id(0)] = atomic_add(inout + get_global_id(0), 3);\n\
1772   }\n";
1773   auto inout = ShaderArg<int32_t>({ 2, 4 }, SHADER_ARG_INOUT);
1774   auto old = ShaderArg<int32_t>(std::vector<int32_t>(2, 0xdeadbeef), SHADER_ARG_OUTPUT);
1775   const int32_t expected_inout[] = { 5, 7 };
1776   const int32_t expected_old[] = { 2, 4 };
1777   run_shader(kernel_source, inout.size(), 1, 1, inout, old);
1778   for (int i = 0; i < inout.size(); ++i) {
1779      EXPECT_EQ(inout[i], expected_inout[i]);
1780      EXPECT_EQ(old[i], expected_old[i]);
1781   }
1782}
1783
1784TEST_F(ComputeTest, global_atomic_imin)
1785{
1786   const char *kernel_source =
1787   "__kernel void main_test(__global int *inout, __global int *old)\n\
1788   {\n\
1789      old[get_global_id(0)] = atomic_min(inout + get_global_id(0), 1);\n\
1790   }\n";
1791   auto inout = ShaderArg<int32_t>({ 0, 2, -1 }, SHADER_ARG_INOUT);
1792   auto old = ShaderArg<int32_t>(std::vector<int32_t>(3, 0xdeadbeef), SHADER_ARG_OUTPUT);
1793   const int32_t expected_inout[] = { 0, 1, -1 };
1794   const int32_t expected_old[] = { 0, 2, -1 };
1795   run_shader(kernel_source, inout.size(), 1, 1, inout, old);
1796   for (int i = 0; i < inout.size(); ++i) {
1797      EXPECT_EQ(inout[i], expected_inout[i]);
1798      EXPECT_EQ(old[i], expected_old[i]);
1799   }
1800}
1801
1802TEST_F(ComputeTest, global_atomic_and_or)
1803{
1804   const char *kernel_source =
1805   "__attribute__((reqd_work_group_size(3, 1, 1)))\n\
1806   __kernel void main_test(__global int *inout)\n\
1807   {\n\
1808      atomic_and(inout, ~(1 << get_global_id(0)));\n\
1809      atomic_or(inout, (1 << (get_global_id(0) + 4)));\n\
1810   }\n";
1811   auto inout = ShaderArg<int32_t>(0xf, SHADER_ARG_INOUT);
1812   const int32_t expected[] = { 0x78 };
1813   run_shader(kernel_source, 3, 1, 1, inout);
1814   for (int i = 0; i < inout.size(); ++i)
1815      EXPECT_EQ(inout[i], expected[i]);
1816}
1817
1818TEST_F(ComputeTest, global_atomic_cmpxchg)
1819{
1820   const char *kernel_source =
1821   "__attribute__((reqd_work_group_size(2, 1, 1)))\n\
1822   __kernel void main_test(__global int *inout)\n\
1823   {\n\
1824      while (atomic_cmpxchg(inout, get_global_id(0), get_global_id(0) + 1) != get_global_id(0))\n\
1825         ;\n\
1826   }\n";
1827   auto inout = ShaderArg<int32_t>(0, SHADER_ARG_INOUT);
1828   const int32_t expected_inout[] = { 2 };
1829   run_shader(kernel_source, 2, 1, 1, inout);
1830   for (int i = 0; i < inout.size(); ++i)
1831      EXPECT_EQ(inout[i], expected_inout[i]);
1832}
1833
1834TEST_F(ComputeTest, local_atomic_and_or)
1835{
1836   const char *kernel_source =
1837   "__attribute__((reqd_work_group_size(2, 1, 1)))\n\
1838   __kernel void main_test(__global ushort *inout)\n\
1839   {\n\
1840      __local ushort tmp;\n\
1841      atomic_and(&tmp, ~(0xff << (get_global_id(0) * 8)));\n\
1842      atomic_or(&tmp, inout[get_global_id(0)] << (get_global_id(0) * 8));\n\
1843      barrier(CLK_LOCAL_MEM_FENCE);\n\
1844      inout[get_global_id(0)] = tmp;\n\
1845   }\n";
1846   auto inout = ShaderArg<uint16_t>({ 2, 4 }, SHADER_ARG_INOUT);
1847   const uint16_t expected[] = { 0x402, 0x402 };
1848   run_shader(kernel_source, inout.size(), 1, 1, inout);
1849   for (int i = 0; i < inout.size(); ++i)
1850      EXPECT_EQ(inout[i], expected[i]);
1851}
1852
1853TEST_F(ComputeTest, local_atomic_cmpxchg)
1854{
1855   const char *kernel_source =
1856   "__attribute__((reqd_work_group_size(2, 1, 1)))\n\
1857   __kernel void main_test(__global int *out)\n\
1858   {\n\
1859      __local uint tmp;\n\
1860      tmp = 0;\n\
1861      barrier(CLK_LOCAL_MEM_FENCE);\n\
1862      while (atomic_cmpxchg(&tmp, get_global_id(0), get_global_id(0) + 1) != get_global_id(0))\n\
1863         ;\n\
1864      barrier(CLK_LOCAL_MEM_FENCE);\n\
1865      out[0] = tmp;\n\
1866   }\n";
1867
1868   auto out = ShaderArg<uint32_t>(0xdeadbeef, SHADER_ARG_OUTPUT);
1869   const uint16_t expected[] = { 2 };
1870   run_shader(kernel_source, 2, 1, 1, out);
1871   for (int i = 0; i < out.size(); ++i)
1872      EXPECT_EQ(out[i], expected[i]);
1873}
1874
1875TEST_F(ComputeTest, constant_sampler)
1876{
1877   const char* kernel_source =
1878   "__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_LINEAR;\n\
1879   __kernel void main_test(read_only image2d_t input, write_only image2d_t output)\n\
1880   {\n\
1881      int2 coordsi = (int2)(get_global_id(0), get_global_id(1));\n\
1882      float2 coordsf = (float2)((float)coordsi.x / get_image_width(input), (float)coordsi.y / get_image_height(input));\n\
1883      write_imagef(output, coordsi, \n\
1884         read_imagef(input, sampler, coordsf) + \n\
1885         read_imagef(input, sampler, coordsf + (float2)(0.1, 0.1)));\n\
1886   }\n";
1887   Shader shader = compile(std::vector<const char*>({ kernel_source }));
1888   validate(shader);
1889   EXPECT_EQ(shader.dxil->metadata.num_const_samplers, 1);
1890}
1891
1892TEST_F(ComputeTest, hi)
1893{
1894   const char *kernel_source = R"(
1895   __kernel void main_test(__global char3 *srcA, __global char2 *dst)
1896   {
1897       int  tid = get_global_id(0);
1898
1899       char2 tmp = srcA[tid].hi;
1900       dst[tid] = tmp;
1901   })";
1902   Shader shader = compile(std::vector<const char*>({ kernel_source }));
1903   validate(shader);
1904}
1905
1906TEST_F(ComputeTest, system_values)
1907{
1908   const char *kernel_source =
1909   "__kernel void main_test(__global uint* outputs)\n\
1910   {\n\
1911      outputs[0] = get_work_dim();\n\
1912      outputs[1] = get_global_size(0);\n\
1913      outputs[2] = get_local_size(0);\n\
1914      outputs[3] = get_num_groups(0);\n\
1915      outputs[4] = get_group_id(0);\n\
1916      outputs[5] = get_global_offset(0);\n\
1917      outputs[6] = get_global_id(0);\n\
1918   }\n";
1919   auto out = ShaderArg<uint32_t>(std::vector<uint32_t>(6, 0xdeadbeef), SHADER_ARG_OUTPUT);
1920   const uint16_t expected[] = { 3, 1, 1, 1, 0, 0, 0, };
1921   CompileArgs args = { 1, 1, 1 };
1922   Shader shader = compile({ kernel_source });
1923   run_shader(shader, args, out);
1924   for (int i = 0; i < out.size(); ++i)
1925      EXPECT_EQ(out[i], expected[i]);
1926
1927   args.work_props.work_dim = 2;
1928   args.work_props.global_offset_x = 100;
1929   args.work_props.group_id_offset_x = 2;
1930   args.work_props.group_count_total_x = 5;
1931   const uint32_t expected_withoffsets[] = { 2, 5, 1, 5, 2, 100, 102 };
1932   run_shader(shader, args, out);
1933   for (int i = 0; i < out.size(); ++i)
1934      EXPECT_EQ(out[i], expected_withoffsets[i]);
1935}
1936
1937TEST_F(ComputeTest, convert_round_sat)
1938{
1939   const char *kernel_source =
1940   "__kernel void main_test(__global float *f, __global uchar *u)\n\
1941   {\n\
1942       uint idx = get_global_id(0);\n\
1943       u[idx] = convert_uchar_sat_rtp(f[idx]);\n\
1944   }\n";
1945   auto f = ShaderArg<float>({ -1.0f, 1.1f, 20.0f, 255.5f }, SHADER_ARG_INPUT);
1946   auto u = ShaderArg<uint8_t>({ 255, 0, 0, 0 }, SHADER_ARG_OUTPUT);
1947   const uint8_t expected[] = {
1948      0, 2, 20, 255
1949   };
1950
1951   run_shader(kernel_source, f.size(), 1, 1, f, u);
1952   for (int i = 0; i < u.size(); ++i)
1953      EXPECT_EQ(u[i], expected[i]);
1954}
1955
1956TEST_F(ComputeTest, convert_round_sat_vec)
1957{
1958   const char *kernel_source =
1959   "__kernel void main_test(__global float16 *f, __global uchar16 *u)\n\
1960   {\n\
1961       uint idx = get_global_id(0);\n\
1962       u[idx] = convert_uchar16_sat_rtp(f[idx]);\n\
1963   }\n";
1964   auto f = ShaderArg<float>({
1965      -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f,
1966      -0.5f, 1.9f, 20.0f, 254.5f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f,
1967       0.0f, 1.3f, 20.0f, 255.1f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f,
1968      -0.0f, 1.5555f, 20.0f, 254.9f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f, -1.0f, 1.1f, 20.0f, 255.5f,
1969   }, SHADER_ARG_INPUT);
1970   auto u = ShaderArg<uint8_t>({
1971      255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0,
1972      255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0,
1973      255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0,
1974      255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0,
1975   }, SHADER_ARG_OUTPUT);
1976   const uint8_t expected[] = {
1977      0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255,
1978      0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255,
1979      0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255,
1980      0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255, 0, 2, 20, 255,
1981   };
1982
1983   run_shader(kernel_source, 4, 1, 1, f, u);
1984   for (int i = 0; i < u.size(); ++i)
1985      EXPECT_EQ(u[i], expected[i]);
1986}
1987
1988TEST_F(ComputeTest, convert_char2_uchar2)
1989{
1990   const char *kernel_source =
1991   "__kernel void main_test( __global char2 *src, __global uchar2 *dest )\n\
1992   {\n\
1993      size_t i = get_global_id(0);\n\
1994      dest[i] = convert_uchar2_sat( src[i] );\n\
1995   }\n";
1996
1997   auto c = ShaderArg<int8_t>({ -127, -4, 0, 4, 126, 127, 16, 32 }, SHADER_ARG_INPUT);
1998   auto u = ShaderArg<uint8_t>({ 99, 99, 99, 99, 99, 99, 99, 99 }, SHADER_ARG_OUTPUT);
1999   const uint8_t expected[] = { 0, 0, 0, 4, 126, 127, 16, 32 };
2000   run_shader(kernel_source, 4, 1, 1, c, u);
2001   for (int i = 0; i < u.size(); i++)
2002      EXPECT_EQ(u[i], expected[i]);
2003}
2004
2005TEST_F(ComputeTest, async_copy)
2006{
2007   const char *kernel_source = R"(
2008   __kernel void main_test( const __global char *src, __global char *dst, __local char *localBuffer, int copiesPerWorkgroup, int copiesPerWorkItem )
2009   {
2010    int i;
2011    for(i=0; i<copiesPerWorkItem; i++)
2012        localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ] = (char)(char)0;
2013       barrier( CLK_LOCAL_MEM_FENCE );
2014       event_t event;
2015       event = async_work_group_copy( (__local char*)localBuffer, (__global const char*)(src+copiesPerWorkgroup*get_group_id(0)), (size_t)copiesPerWorkgroup, 0 );
2016       wait_group_events( 1, &event );
2017    for(i=0; i<copiesPerWorkItem; i++)
2018     dst[ get_global_id( 0 )*copiesPerWorkItem+i ] = localBuffer[ get_local_id( 0 )*copiesPerWorkItem+i ];
2019   })";
2020   Shader shader = compile({ kernel_source });
2021   validate(shader);
2022}
2023
2024TEST_F(ComputeTest, packed_struct_global)
2025{
2026#pragma pack(push, 1)
2027   struct s { uint8_t uc; uint64_t ul; uint16_t us; };
2028#pragma pack(pop)
2029
2030   const char *kernel_source =
2031   "struct __attribute__((packed)) s {uchar uc; ulong ul; ushort us; };\n\
2032   __kernel void main_test(__global struct s *inout, global uint *size)\n\
2033   {\n\
2034       uint idx = get_global_id(0);\n\
2035       inout[idx].uc = idx + 1;\n\
2036       inout[idx].ul = ((ulong)(idx + 1 + 0xfbfcfdfe) << 32) | 0x12345678;\n\
2037       inout[idx].us = ((ulong)(idx + 1 + 0xa0) << 8) | 0x12;\n\
2038       *size = sizeof(struct s);\n\
2039   }\n";
2040   auto inout = ShaderArg<struct s>({0, 0, 0}, SHADER_ARG_OUTPUT);
2041   auto size = ShaderArg<uint32_t>(0, SHADER_ARG_OUTPUT);
2042   const struct s expected[] = {
2043      { 1, 0xfbfcfdff12345678, 0xa112 }
2044   };
2045
2046   run_shader(kernel_source, inout.size(), 1, 1, inout, size);
2047   for (int i = 0; i < inout.size(); ++i) {
2048      EXPECT_EQ(inout[i].uc, expected[i].uc);
2049      EXPECT_EQ(inout[i].ul, expected[i].ul);
2050      EXPECT_EQ(inout[i].us, expected[i].us);
2051   }
2052   EXPECT_EQ(size, sizeof(struct s));
2053}
2054
2055TEST_F(ComputeTest, packed_struct_arg)
2056{
2057#pragma pack(push, 1)
2058   struct s { uint8_t uc; uint64_t ul; uint16_t us; };
2059#pragma pack(pop)
2060
2061   const char *kernel_source =
2062   "struct __attribute__((packed)) s {uchar uc; ulong ul; ushort us; };\n\
2063   __kernel void main_test(__global struct s *out, struct s in)\n\
2064   {\n\
2065       uint idx = get_global_id(0);\n\
2066       out[idx].uc = in.uc + 0x12;\n\
2067       out[idx].ul = in.ul + 0x123456789abcdef;\n\
2068       out[idx].us = in.us + 0x1234;\n\
2069   }\n";
2070   auto out = ShaderArg<struct s>({0, 0, 0}, SHADER_ARG_OUTPUT);
2071   auto in = ShaderArg<struct s>({1, 2, 3}, SHADER_ARG_INPUT);
2072   const struct s expected[] = {
2073      { 0x12 + 1, 0x123456789abcdef + 2, 0x1234 + 3 }
2074   };
2075
2076   run_shader(kernel_source, out.size(), 1, 1, out, in);
2077   for (int i = 0; i < out.size(); ++i) {
2078      EXPECT_EQ(out[i].uc, expected[i].uc);
2079      EXPECT_EQ(out[i].ul, expected[i].ul);
2080      EXPECT_EQ(out[i].us, expected[i].us);
2081   }
2082}
2083
2084TEST_F(ComputeTest, packed_struct_local)
2085{
2086#pragma pack(push, 1)
2087   struct s { uint8_t uc; uint64_t ul; uint16_t us; };
2088#pragma pack(pop)
2089
2090   const char *kernel_source =
2091   "struct __attribute__((packed)) s {uchar uc; ulong ul; ushort us; };\n\
2092   __kernel void main_test(__global struct s *out, __constant struct s *in)\n\
2093   {\n\
2094       uint idx = get_global_id(0);\n\
2095       __local struct s tmp[2];\n\
2096       tmp[get_local_id(0)] = in[idx];\n\
2097       barrier(CLK_LOCAL_MEM_FENCE);\n\
2098       out[idx] = tmp[(get_local_id(0) + 1) % 2];\n\
2099   }\n";
2100   auto out = ShaderArg<struct s>({{0, 0, 0}, {0, 0, 0}}, SHADER_ARG_OUTPUT);
2101   auto in = ShaderArg<struct s>({{1, 2, 3}, {0x12, 0x123456789abcdef, 0x1234} }, SHADER_ARG_INPUT);
2102   const struct s expected[] = {
2103      { 0x12, 0x123456789abcdef, 0x1234 },
2104      { 1, 2, 3 },
2105   };
2106
2107   run_shader(kernel_source, out.size(), 1, 1, out, in);
2108   for (int i = 0; i < out.size(); ++i) {
2109      EXPECT_EQ(out[i].uc, expected[i].uc);
2110      EXPECT_EQ(out[i].ul, expected[i].ul);
2111      EXPECT_EQ(out[i].us, expected[i].us);
2112   }
2113}
2114
2115TEST_F(ComputeTest, DISABLED_packed_struct_const)
2116{
2117#pragma pack(push, 1)
2118   struct s { uint8_t uc; uint64_t ul; uint16_t us; };
2119#pragma pack(pop)
2120
2121   const char *kernel_source =
2122   "struct __attribute__((packed)) s {uchar uc; ulong ul; ushort us; };\n\
2123   __kernel void main_test(__global struct s *out, struct s in)\n\
2124   {\n\
2125       __constant struct s base[] = {\n\
2126          {0x12, 0x123456789abcdef, 0x1234},\n\
2127          {0x11, 0x123456789abcdee, 0x1233},\n\
2128       };\n\
2129       uint idx = get_global_id(0);\n\
2130       out[idx].uc = base[idx % 2].uc + in.uc;\n\
2131       out[idx].ul = base[idx % 2].ul + in.ul;\n\
2132       out[idx].us = base[idx % 2].us + in.us;\n\
2133   }\n";
2134   auto out = ShaderArg<struct s>(std::vector<struct s>(2, {0, 0, 0}), SHADER_ARG_OUTPUT);
2135   auto in = ShaderArg<struct s>({1, 2, 3}, SHADER_ARG_INPUT);
2136   const struct s expected[] = {
2137      { 0x12 + 1, 0x123456789abcdef + 2, 0x1234 + 3 },
2138      { 0x11 + 1, 0x123456789abcdee + 2, 0x1233 + 3 },
2139   };
2140
2141   run_shader(kernel_source, out.size(), 1, 1, out, in);
2142   for (int i = 0; i < out.size(); ++i) {
2143      EXPECT_EQ(out[i].uc, expected[i].uc);
2144      EXPECT_EQ(out[i].ul, expected[i].ul);
2145      EXPECT_EQ(out[i].us, expected[i].us);
2146   }
2147}
2148
2149TEST_F(ComputeTest, printf)
2150{
2151   const char *kernel_source = R"(
2152   __kernel void main_test(__global float *src, __global uint *dest)
2153   {
2154      *dest = printf("%s: %f", "Test", src[0]);
2155   })";
2156
2157   auto src = ShaderArg<float>({ 1.0f }, SHADER_ARG_INPUT);
2158   auto dest = ShaderArg<uint32_t>({ 0xdeadbeef }, SHADER_ARG_OUTPUT);
2159   run_shader(kernel_source, 1, 1, 1, src, dest);
2160   EXPECT_EQ(dest[0], 0);
2161}
2162
2163TEST_F(ComputeTest, vload_half)
2164{
2165   const char *kernel_source = R"(
2166   __kernel void main_test(__global half *src, __global float4 *dest)
2167   {
2168      int offset = get_global_id(0);
2169      dest[offset] = vload_half4(offset, src);
2170   })";
2171   auto src = ShaderArg<uint16_t>({ 0x3c00, 0x4000, 0x4200, 0x4400,
2172                                    0x4500, 0x4600, 0x4700, 0x4800 }, SHADER_ARG_INPUT);
2173   auto dest = ShaderArg<float>({ FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX,
2174                                  FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX }, SHADER_ARG_OUTPUT);
2175   run_shader(kernel_source, 2, 1, 1, src, dest);
2176   for (unsigned i = 0; i < 8; ++i)
2177      EXPECT_FLOAT_EQ(dest[i], (float)(i + 1));
2178}
2179
2180TEST_F(ComputeTest, vstore_half)
2181{
2182   const char *kernel_source = R"(
2183   __kernel void main_test(__global half *dst, __global float4 *src)
2184   {
2185      int offset = get_global_id(0);
2186      vstore_half4(src[offset], offset, dst);
2187   })";
2188   auto dest = ShaderArg<uint16_t>({0xdead, 0xdead, 0xdead, 0xdead,
2189                                   0xdead, 0xdead, 0xdead, 0xdead}, SHADER_ARG_OUTPUT);
2190   auto src = ShaderArg<float>({ 1.0, 2.0, 3.0, 4.0,
2191                                  5.0, 6.0, 7.0, 8.0 }, SHADER_ARG_INPUT);
2192   run_shader(kernel_source, 2, 1, 1, dest, src);
2193   const uint16_t expected[] = { 0x3c00, 0x4000, 0x4200, 0x4400,
2194                                 0x4500, 0x4600, 0x4700, 0x4800 };
2195   for (unsigned i = 0; i < 8; ++i)
2196      EXPECT_EQ(dest[i], expected[i]);
2197}
2198
2199TEST_F(ComputeTest, inline_function)
2200{
2201   const char *kernel_source = R"(
2202   inline float helper(float foo)
2203   {
2204      return foo * 2;
2205   }
2206
2207   __kernel void main_test(__global float *dst, __global float *src)
2208   {
2209      *dst = helper(*src);
2210   })";
2211   auto dest = ShaderArg<float>({ NAN }, SHADER_ARG_OUTPUT);
2212   auto src = ShaderArg<float>({ 1.0f }, SHADER_ARG_INPUT);
2213   run_shader(kernel_source, 1, 1, 1, dest, src);
2214   EXPECT_EQ(dest[0], 2.0f);
2215}
2216
2217TEST_F(ComputeTest, unused_arg)
2218{
2219   const char *kernel_source = R"(
2220   __kernel void main_test(__global int *dst, __global int *unused, __global int *src)
2221   {
2222      int i = get_global_id(0);
2223      dst[i] = src[i];
2224   })";
2225   auto dest = ShaderArg<int>({ -1, -1, -1, -1 }, SHADER_ARG_OUTPUT);
2226   auto src = ShaderArg<int>({ 1, 2, 3, 4 }, SHADER_ARG_INPUT);
2227   auto unused = ShaderArg<int>({ -1, -1, -1, -1 }, SHADER_ARG_INPUT);
2228   run_shader(kernel_source, 4, 1, 1, dest, unused, src);
2229   for (int i = 0; i < 4; ++i)
2230      EXPECT_EQ(dest[i], i + 1);
2231}
2232
2233TEST_F(ComputeTest, spec_constant)
2234{
2235   const char *spirv_asm = R"(
2236               OpCapability Addresses
2237               OpCapability Kernel
2238               OpCapability Int64
2239          %1 = OpExtInstImport "OpenCL.std"
2240               OpMemoryModel Physical64 OpenCL
2241               OpEntryPoint Kernel %2 "main_test" %__spirv_BuiltInGlobalInvocationId
2242          %4 = OpString "kernel_arg_type.main_test.uint*,"
2243               OpSource OpenCL_C 102000
2244               OpName %__spirv_BuiltInGlobalInvocationId "__spirv_BuiltInGlobalInvocationId"
2245               OpName %output "output"
2246               OpName %entry "entry"
2247               OpName %output_addr "output.addr"
2248               OpName %id "id"
2249               OpName %call "call"
2250               OpName %conv "conv"
2251               OpName %idxprom "idxprom"
2252               OpName %arrayidx "arrayidx"
2253               OpName %add "add"
2254               OpName %mul "mul"
2255               OpName %idxprom1 "idxprom1"
2256               OpName %arrayidx2 "arrayidx2"
2257               OpDecorate %__spirv_BuiltInGlobalInvocationId BuiltIn GlobalInvocationId
2258               OpDecorate %__spirv_BuiltInGlobalInvocationId Constant
2259               OpDecorate %id Alignment 4
2260               OpDecorate %output_addr Alignment 8
2261               OpDecorate %uint_1 SpecId 1
2262      %ulong = OpTypeInt 64 0
2263       %uint = OpTypeInt 32 0
2264     %uint_1 = OpSpecConstant %uint 1
2265    %v3ulong = OpTypeVector %ulong 3
2266%_ptr_Input_v3ulong = OpTypePointer Input %v3ulong
2267       %void = OpTypeVoid
2268%_ptr_CrossWorkgroup_uint = OpTypePointer CrossWorkgroup %uint
2269         %24 = OpTypeFunction %void %_ptr_CrossWorkgroup_uint
2270%_ptr_Function__ptr_CrossWorkgroup_uint = OpTypePointer Function %_ptr_CrossWorkgroup_uint
2271%_ptr_Function_uint = OpTypePointer Function %uint
2272%__spirv_BuiltInGlobalInvocationId = OpVariable %_ptr_Input_v3ulong Input
2273          %2 = OpFunction %void DontInline %24
2274     %output = OpFunctionParameter %_ptr_CrossWorkgroup_uint
2275      %entry = OpLabel
2276%output_addr = OpVariable %_ptr_Function__ptr_CrossWorkgroup_uint Function
2277         %id = OpVariable %_ptr_Function_uint Function
2278               OpStore %output_addr %output Aligned 8
2279         %27 = OpLoad %v3ulong %__spirv_BuiltInGlobalInvocationId Aligned 32
2280       %call = OpCompositeExtract %ulong %27 0
2281       %conv = OpUConvert %uint %call
2282               OpStore %id %conv Aligned 4
2283         %28 = OpLoad %_ptr_CrossWorkgroup_uint %output_addr Aligned 8
2284         %29 = OpLoad %uint %id Aligned 4
2285    %idxprom = OpUConvert %ulong %29
2286   %arrayidx = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %28 %idxprom
2287         %30 = OpLoad %uint %arrayidx Aligned 4
2288         %31 = OpLoad %uint %id Aligned 4
2289        %add = OpIAdd %uint %31 %uint_1
2290        %mul = OpIMul %uint %30 %add
2291         %32 = OpLoad %_ptr_CrossWorkgroup_uint %output_addr Aligned 8
2292         %33 = OpLoad %uint %id Aligned 4
2293   %idxprom1 = OpUConvert %ulong %33
2294  %arrayidx2 = OpInBoundsPtrAccessChain %_ptr_CrossWorkgroup_uint %32 %idxprom1
2295               OpStore %arrayidx2 %mul Aligned 4
2296               OpReturn
2297               OpFunctionEnd)";
2298   Shader shader = assemble(spirv_asm);
2299   Shader spec_shader = specialize(shader, 1, 5);
2300
2301   auto inout = ShaderArg<uint32_t>({ 0x00000001, 0x10000001, 0x00020002, 0x04010203 },
2302      SHADER_ARG_INOUT);
2303   const uint32_t expected[] = {
2304      0x00000005, 0x60000006, 0x000e000e, 0x20081018
2305   };
2306   CompileArgs args = { (unsigned)inout.size(), 1, 1 };
2307   run_shader(spec_shader, args, inout);
2308   for (int i = 0; i < inout.size(); ++i)
2309      EXPECT_EQ(inout[i], expected[i]);
2310}
2311
2312TEST_F(ComputeTest, arg_metadata)
2313{
2314   const char *kernel_source = R"(
2315   __kernel void main_test(
2316      __global int *undec_ptr,
2317      __global volatile int *vol_ptr,
2318      __global const int *const_ptr,
2319      __global int *restrict restr_ptr,
2320      __global const int *restrict const_restr_ptr,
2321      __constant int *const_ptr2)
2322   {
2323   })";
2324   Shader shader = compile({ kernel_source });
2325   EXPECT_EQ(shader.metadata->kernels[0].args[0].address_qualifier, CLC_KERNEL_ARG_ADDRESS_GLOBAL);
2326   EXPECT_EQ(shader.metadata->kernels[0].args[0].type_qualifier, 0);
2327   EXPECT_EQ(shader.metadata->kernels[0].args[1].address_qualifier, CLC_KERNEL_ARG_ADDRESS_GLOBAL);
2328   EXPECT_EQ(shader.metadata->kernels[0].args[1].type_qualifier, CLC_KERNEL_ARG_TYPE_VOLATILE);
2329   EXPECT_EQ(shader.metadata->kernels[0].args[2].address_qualifier, CLC_KERNEL_ARG_ADDRESS_GLOBAL);
2330   EXPECT_EQ(shader.metadata->kernels[0].args[2].type_qualifier, CLC_KERNEL_ARG_TYPE_CONST);
2331   EXPECT_EQ(shader.metadata->kernels[0].args[3].address_qualifier, CLC_KERNEL_ARG_ADDRESS_GLOBAL);
2332   EXPECT_EQ(shader.metadata->kernels[0].args[3].type_qualifier, CLC_KERNEL_ARG_TYPE_RESTRICT);
2333   EXPECT_EQ(shader.metadata->kernels[0].args[4].address_qualifier, CLC_KERNEL_ARG_ADDRESS_GLOBAL);
2334   EXPECT_EQ(shader.metadata->kernels[0].args[4].type_qualifier, CLC_KERNEL_ARG_TYPE_RESTRICT | CLC_KERNEL_ARG_TYPE_CONST);
2335   EXPECT_EQ(shader.metadata->kernels[0].args[5].address_qualifier, CLC_KERNEL_ARG_ADDRESS_CONSTANT);
2336   EXPECT_EQ(shader.metadata->kernels[0].args[5].type_qualifier, CLC_KERNEL_ARG_TYPE_CONST);
2337}
2338