/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * Helper functions for logical operations.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */

#include <llvm/Config/llvm-config.h>

#include "util/u_cpu_detect.h"
#include "util/u_memory.h"
#include "util/u_debug.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_debug.h"
#include "lp_bld_logic.h"


/*
 * XXX
 *
 * Selection with a vector condition, like
 *
 *    select <4 x i1> %C, %A, %B
 *
 * is valid IR (e.g. llvm/test/Assembler/vector-select.ll), but it is only
 * supported on some backends (x86) starting with LLVM 3.1.
 *
 * Expanding the boolean vector to full SIMD register width, as in
 *
 *    sext <4 x i1> %C to <4 x i32>
 *
 * is valid and supported (e.g., llvm/test/CodeGen/X86/vec_compare.ll), but
 * it causes assertion failures in LLVM 2.6. It appears to work correctly on
 * LLVM 2.7.
 */


/**
 * Build code to compare two values 'a' and 'b' of 'type' using the given func.
 * \param func  one of PIPE_FUNC_x
 * If the ordered argument is true the function will use LLVM's ordered
 * comparisons, otherwise unordered comparisons will be used.
 * The result values will be 0 for false or ~0 for true.
 */
static LLVMValueRef
lp_build_compare_ext(struct gallivm_state *gallivm,
                     const struct lp_type type,
                     enum pipe_compare_func func,
                     LLVMValueRef a,
                     LLVMValueRef b,
                     boolean ordered)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, type);
   LLVMValueRef zeros = LLVMConstNull(int_vec_type);
   LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (func == PIPE_FUNC_NEVER)
      return zeros;
   if (func == PIPE_FUNC_ALWAYS)
      return ones;

   assert(func > PIPE_FUNC_NEVER);
   assert(func < PIPE_FUNC_ALWAYS);

   if (type.floating) {
      LLVMRealPredicate op;
      switch(func) {
      case PIPE_FUNC_EQUAL:
         op = ordered ? LLVMRealOEQ : LLVMRealUEQ;
         break;
      case PIPE_FUNC_NOTEQUAL:
         op = ordered ? LLVMRealONE : LLVMRealUNE;
         break;
      case PIPE_FUNC_LESS:
         op = ordered ? LLVMRealOLT : LLVMRealULT;
         break;
      case PIPE_FUNC_LEQUAL:
         op = ordered ? LLVMRealOLE : LLVMRealULE;
         break;
      case PIPE_FUNC_GREATER:
         op = ordered ? LLVMRealOGT : LLVMRealUGT;
         break;
      case PIPE_FUNC_GEQUAL:
         op = ordered ? LLVMRealOGE : LLVMRealUGE;
         break;
      default:
         assert(0);
         return lp_build_undef(gallivm, type);
      }

      cond = LLVMBuildFCmp(builder, op, a, b, "");
      res = LLVMBuildSExt(builder, cond, int_vec_type, "");
   }
   else {
      LLVMIntPredicate op;
      switch(func) {
      case PIPE_FUNC_EQUAL:
         op = LLVMIntEQ;
         break;
      case PIPE_FUNC_NOTEQUAL:
         op = LLVMIntNE;
         break;
      case PIPE_FUNC_LESS:
         op = type.sign ? LLVMIntSLT : LLVMIntULT;
         break;
      case PIPE_FUNC_LEQUAL:
         op = type.sign ? LLVMIntSLE : LLVMIntULE;
         break;
      case PIPE_FUNC_GREATER:
         op = type.sign ? LLVMIntSGT : LLVMIntUGT;
         break;
      case PIPE_FUNC_GEQUAL:
         op = type.sign ? LLVMIntSGE : LLVMIntUGE;
         break;
      default:
         assert(0);
         return lp_build_undef(gallivm, type);
      }

      cond = LLVMBuildICmp(builder, op, a, b, "");
      res = LLVMBuildSExt(builder, cond, int_vec_type, "");
   }

   return res;
}

/**
 * Build code to compare two values 'a' and 'b' of 'type' using the given func.
 * \param func  one of PIPE_FUNC_x
 * The result values will be 0 for false or ~0 for true.
 */
LLVMValueRef
lp_build_compare(struct gallivm_state *gallivm,
                 const struct lp_type type,
                 enum pipe_compare_func func,
                 LLVMValueRef a,
                 LLVMValueRef b)
{
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, type);
   LLVMValueRef zeros = LLVMConstNull(int_vec_type);
   LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (func == PIPE_FUNC_NEVER)
      return zeros;
   if (func == PIPE_FUNC_ALWAYS)
      return ones;

   assert(func > PIPE_FUNC_NEVER);
   assert(func < PIPE_FUNC_ALWAYS);

#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   /*
    * There are no unsigned integer comparison instructions in SSE.
    */

   if (!type.floating && !type.sign &&
       type.width * type.length == 128 &&
       util_get_cpu_caps()->has_sse2 &&
       (func == PIPE_FUNC_LESS ||
        func == PIPE_FUNC_LEQUAL ||
        func == PIPE_FUNC_GREATER ||
        func == PIPE_FUNC_GEQUAL) &&
       (gallivm_debug & GALLIVM_DEBUG_PERF)) {
      debug_printf("%s: inefficient <%u x i%u> unsigned comparison\n",
                   __FUNCTION__, type.length, type.width);
   }
#endif

   return lp_build_compare_ext(gallivm, type, func, a, b, FALSE);
}
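
/*
 * Example usage (illustrative sketch only; `gallivm', `a' and `b' are
 * assumed to be provided by the caller as 4 x float vectors):
 *
 *    struct lp_type f32_type = lp_type_float_vec(32, 128);
 *    LLVMValueRef lt_mask = lp_build_compare(gallivm, f32_type,
 *                                            PIPE_FUNC_LESS, a, b);
 *
 * Each element of `lt_mask' is ~0 where a < b and 0 elsewhere, so the
 * result can be fed directly to lp_build_select() below.
 */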

/**
 * Build code to compare two values 'a' and 'b' using the given func.
 * \param func  one of PIPE_FUNC_x
 * If the operands are floating point numbers, the function will use an
 * ordered comparison, which means that it will return true only if neither
 * operand is a NaN and the specified condition evaluates to true.
 * The result values will be 0 for false or ~0 for true.
 */
LLVMValueRef
lp_build_cmp_ordered(struct lp_build_context *bld,
                     enum pipe_compare_func func,
                     LLVMValueRef a,
                     LLVMValueRef b)
{
   return lp_build_compare_ext(bld->gallivm, bld->type, func, a, b, TRUE);
}
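
/*
 * Example (sketch; `bld', `z' and `zbuf' are assumed to come from the
 * caller): with an ordered comparison, a NaN in either operand makes the
 * test fail, matching the usual depth-test expectation:
 *
 *    LLVMValueRef pass = lp_build_cmp_ordered(bld, PIPE_FUNC_LEQUAL,
 *                                             z, zbuf);
 */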

/**
 * Build code to compare two values 'a' and 'b' using the given func.
 * \param func  one of PIPE_FUNC_x
 * If the operands are floating point numbers, the function will use an
 * unordered comparison, which means that it will return true if either
 * operand is a NaN or the specified condition evaluates to true.
 * The result values will be 0 for false or ~0 for true.
 */
LLVMValueRef
lp_build_cmp(struct lp_build_context *bld,
             enum pipe_compare_func func,
             LLVMValueRef a,
             LLVMValueRef b)
{
   return lp_build_compare(bld->gallivm, bld->type, func, a, b);
}
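
/*
 * Example (sketch; `bld' and `x' are assumed to come from the caller):
 * with the unordered variant, NOTEQUAL is true for NaN operands, so
 * NaN != NaN yields ~0 here, while non-NaN elements yield 0:
 *
 *    LLVMValueRef ne_mask = lp_build_cmp(bld, PIPE_FUNC_NOTEQUAL, x, x);
 */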


/**
 * Return (mask & a) | (~mask & b);
 */
LLVMValueRef
lp_build_select_bitwise(struct lp_build_context *bld,
                        LLVMValueRef mask,
                        LLVMValueRef a,
                        LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_type type = bld->type;
   LLVMValueRef res;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == b) {
      return a;
   }

   if (type.floating) {
      a = LLVMBuildBitCast(builder, a, int_vec_type, "");
      b = LLVMBuildBitCast(builder, b, int_vec_type, "");
   }

   if (type.width > 32)
      mask = LLVMBuildSExt(builder, mask, int_vec_type, "");
   a = LLVMBuildAnd(builder, a, mask, "");

   /* This often gets translated to PANDN, but sometimes the NOT is
    * pre-computed and stored in another constant. The best strategy depends
    * on available registers, so it is not a big deal -- hopefully LLVM makes
    * the right decision based on the rest of the program.
    */
   b = LLVMBuildAnd(builder, b, LLVMBuildNot(builder, mask, ""), "");

   res = LLVMBuildOr(builder, a, b, "");

   if (type.floating) {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }

   return res;
}
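
/*
 * Example (sketch; `bld', `x' and `y' are assumed to come from the
 * caller): computing a per-element maximum with a comparison mask:
 *
 *    LLVMValueRef gt = lp_build_cmp(bld, PIPE_FUNC_GREATER, x, y);
 *    LLVMValueRef max = lp_build_select_bitwise(bld, gt, x, y);
 */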


/**
 * Return mask ? a : b;
 *
 * mask is a bitwise mask, composed of 0 or ~0 for each element. Any other value
 * will yield unpredictable results.
 */
LLVMValueRef
lp_build_select(struct lp_build_context *bld,
                LLVMValueRef mask,
                LLVMValueRef a,
                LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMContextRef lc = bld->gallivm->context;
   struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == b)
      return a;

   if (type.length == 1) {
      mask = LLVMBuildTrunc(builder, mask, LLVMInt1TypeInContext(lc), "");
      res = LLVMBuildSelect(builder, mask, a, b, "");
   }
   else if (LLVMIsConstant(mask) ||
            LLVMGetInstructionOpcode(mask) == LLVMSExt) {
      /* Generate a vector select.
       *
       * Using vector selects should avoid emitting intrinsics and hence avoid
       * hindering optimization passes, but vector selects were not properly
       * supported for a long time, and LLVM will generate poor code when the
       * mask is not the result of a comparison.
       * XXX: Even if the instruction was an SExt, this may still produce
       * terrible code. Try piglit stencil-twoside.
       */

      /* Convert the mask to a vector of booleans.
       *
       * XXX: On x86 the mask is controlled by the MSB, so if we shifted the
       * mask right by `type.width - 1`, LLVM should realize the mask is
       * ready.  Alas, what really happens is that LLVM emits two shifts back
       * to back.
       */
      if (0) {
         LLVMValueRef shift =
            LLVMConstInt(bld->int_elem_type, bld->type.width - 1, 0);
         shift = lp_build_broadcast(bld->gallivm, bld->int_vec_type, shift);
         mask = LLVMBuildLShr(builder, mask, shift, "");
      }
      LLVMTypeRef bool_vec_type =
         LLVMVectorType(LLVMInt1TypeInContext(lc), type.length);
      mask = LLVMBuildTrunc(builder, mask, bool_vec_type, "");

      res = LLVMBuildSelect(builder, mask, a, b, "");
   }
   else if (((util_get_cpu_caps()->has_sse4_1 &&
              type.width * type.length == 128) ||
             (util_get_cpu_caps()->has_avx &&
              type.width * type.length == 256 && type.width >= 32) ||
             (util_get_cpu_caps()->has_avx2 &&
              type.width * type.length == 256)) &&
            !LLVMIsConstant(a) &&
            !LLVMIsConstant(b) &&
            !LLVMIsConstant(mask)) {
      const char *intrinsic;
      LLVMTypeRef arg_type;
      LLVMValueRef args[3];

      LLVMTypeRef mask_type = LLVMGetElementType(LLVMTypeOf(mask));
      if (LLVMGetIntTypeWidth(mask_type) != type.width) {
         LLVMTypeRef int_vec_type =
            LLVMVectorType(LLVMIntTypeInContext(lc, type.width), type.length);
         mask = LLVMBuildSExt(builder, mask, int_vec_type, "");
      }
      /*
       * AVX only has float blends, but we can simply bitcast i32/i64
       * vectors to float.
       */
      if (type.width * type.length == 256) {
         if (type.width == 64) {
            intrinsic = "llvm.x86.avx.blendv.pd.256";
            arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 4);
         }
         else if (type.width == 32) {
            intrinsic = "llvm.x86.avx.blendv.ps.256";
            arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 8);
         } else {
            assert(util_get_cpu_caps()->has_avx2);
            intrinsic = "llvm.x86.avx2.pblendvb";
            arg_type = LLVMVectorType(LLVMInt8TypeInContext(lc), 32);
         }
      }
      else if (type.floating &&
               type.width == 64) {
         intrinsic = "llvm.x86.sse41.blendvpd";
         arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 2);
      } else if (type.floating &&
                 type.width == 32) {
         intrinsic = "llvm.x86.sse41.blendvps";
         arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 4);
      } else {
         intrinsic = "llvm.x86.sse41.pblendvb";
         arg_type = LLVMVectorType(LLVMInt8TypeInContext(lc), 16);
      }

      if (arg_type != bld->int_vec_type) {
         mask = LLVMBuildBitCast(builder, mask, arg_type, "");
      }

      if (arg_type != bld->vec_type) {
         a = LLVMBuildBitCast(builder, a, arg_type, "");
         b = LLVMBuildBitCast(builder, b, arg_type, "");
      }

      args[0] = b;
      args[1] = a;
      args[2] = mask;

      res = lp_build_intrinsic(builder, intrinsic,
                               arg_type, args, ARRAY_SIZE(args), 0);

      if (arg_type != bld->vec_type) {
         res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
      }
   }
   else {
      res = lp_build_select_bitwise(bld, mask, a, b);
   }

   return res;
}
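
/*
 * Example (sketch; `bld' and `x' are assumed to come from the caller):
 * clamping negative elements of `x' to zero:
 *
 *    LLVMValueRef is_neg = lp_build_cmp(bld, PIPE_FUNC_LESS, x, bld->zero);
 *    LLVMValueRef res = lp_build_select(bld, is_neg, bld->zero, x);
 */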


/**
 * Return mask ? a : b;
 *
 * mask is a TGSI_WRITEMASK_xxx.
 */
LLVMValueRef
lp_build_select_aos(struct lp_build_context *bld,
                    unsigned mask,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    unsigned num_channels)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   const unsigned n = type.length;

   assert((mask & ~0xf) == 0);
   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == b)
      return a;
   if ((mask & 0xf) == 0xf)
      return a;
   if ((mask & 0xf) == 0x0)
      return b;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   /*
    * There are two major ways of accomplishing this:
    * - with a shuffle
    * - with a select
    *
    * The flip between these is empirical and might need to be adjusted.
    */
   if (n <= 4) {
      /*
       * Shuffle.
       */
      LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
      LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];

      for (unsigned j = 0; j < n; j += num_channels)
         for (unsigned i = 0; i < num_channels; ++i)
            shuffles[j + i] = LLVMConstInt(elem_type,
                                           (mask & (1 << i) ? 0 : n) + j + i,
                                           0);

      return LLVMBuildShuffleVector(builder, a, b,
                                    LLVMConstVector(shuffles, n), "");
   }
   else {
      LLVMValueRef mask_vec = lp_build_const_mask_aos(bld->gallivm,
                                                      type, mask, num_channels);
      return lp_build_select(bld, mask_vec, a, b);
   }
}
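
/*
 * Example (sketch; `bld', `a' and `b' are assumed to come from the
 * caller): for 4-channel AoS vectors, keep the X and Y channels of `a'
 * and take Z and W from `b':
 *
 *    LLVMValueRef r = lp_build_select_aos(bld, TGSI_WRITEMASK_XY, a, b, 4);
 */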


/**
 * Return (scalar-cast)val ? true : false;
 */
LLVMValueRef
lp_build_any_true_range(struct lp_build_context *bld,
                        unsigned real_length,
                        LLVMValueRef val)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef scalar_type;
   LLVMTypeRef true_type;

   assert(real_length <= bld->type.length);

   true_type = LLVMIntTypeInContext(bld->gallivm->context,
                                    bld->type.width * real_length);
   scalar_type = LLVMIntTypeInContext(bld->gallivm->context,
                                      bld->type.width * bld->type.length);
   val = LLVMBuildBitCast(builder, val, scalar_type, "");
   /*
    * We're always using native types, so we can use intrinsics.
    * However, if we don't do per-element calculations, we must ensure
    * the excess elements aren't used, since they may contain garbage.
    */
   if (real_length < bld->type.length) {
      val = LLVMBuildTrunc(builder, val, true_type, "");
   }
   return LLVMBuildICmp(builder, LLVMIntNE,
                        val, LLVMConstNull(true_type), "");
}
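
/*
 * Example (sketch; `bld' and `exec_mask' are assumed to come from the
 * caller): testing whether any lane of an execution mask is live:
 *
 *    LLVMValueRef any = lp_build_any_true_range(bld, bld->type.length,
 *                                               exec_mask);
 *
 * `any' is a scalar i1, suitable for a conditional branch.
 */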