/**************************************************************************
 *
 * Copyright 2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 **************************************************************************/


#include "util/u_debug.h"
#include "util/u_cpu_detect.h"
#include "util/u_math.h"
#include "lp_bld_debug.h"
#include "lp_bld_const.h"
#include "lp_bld_format.h"
#include "lp_bld_gather.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_type.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_pack.h"


/**
 * Get the pointer to one element from scatter positions in memory.
 *
 * @sa lp_build_gather()
 */
LLVMValueRef
lp_build_gather_elem_ptr(struct gallivm_state *gallivm,
                         unsigned length,
                         LLVMValueRef base_ptr,
                         LLVMValueRef offsets,
                         unsigned i)
{
   LLVMValueRef offset;
   LLVMValueRef ptr;

   ASSERTED LLVMTypeRef element_type = LLVMInt8TypeInContext(gallivm->context);
   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(element_type, 0));

   if (length == 1) {
      assert(i == 0);
      offset = offsets;
   } else {
      LLVMValueRef index = lp_build_const_int32(gallivm, i);
      offset = LLVMBuildExtractElement(gallivm->builder, offsets, index, "");
   }

   ptr = LLVMBuildGEP2(gallivm->builder, element_type, base_ptr, &offset, 1, "");

   return ptr;
}
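
/*
 * Example (illustrative sketch; the variable names below are hypothetical):
 * given a <4 x i32> byte-offset vector, the per-lane element pointers can be
 * obtained one at a time:
 *
 *    for (i = 0; i < 4; i++)
 *       ptrs[i] = lp_build_gather_elem_ptr(gallivm, 4, base_ptr, offsets, i);
 *
 * Each ptrs[i] is an i8* pointing at base_ptr + offsets[i].
 */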


/**
 * Gather one element from scatter positions in memory.
 *
 * @sa lp_build_gather()
 */
LLVMValueRef
lp_build_gather_elem(struct gallivm_state *gallivm,
                     unsigned length,
                     unsigned src_width,
                     unsigned dst_width,
                     boolean aligned,
                     LLVMValueRef base_ptr,
                     LLVMValueRef offsets,
                     unsigned i,
                     boolean vector_justify)
{
   LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, src_width);
   LLVMTypeRef dst_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width);
   LLVMValueRef ptr;
   LLVMValueRef res;

   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));

   ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i);
   ptr = LLVMBuildBitCast(gallivm->builder, ptr, LLVMPointerType(src_type, 0), "");
   res = LLVMBuildLoad2(gallivm->builder, src_type, ptr, "");

   /* XXX
    * On some archs we probably really want to avoid having to deal
    * with alignments lower than 4 bytes (if fetch size is a power of
    * two >= 32). On x86 it doesn't matter, however.
    * We should be able to guarantee full alignment for any kind of texture
    * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch
    * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends
    * but I don't think that's quite what we wanted).
    * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT
    * looks like a good fit, but it seems this cap bit (and OpenGL) aren't
    * enforcing what we want (which is what d3d10 does, the offset needs to
    * be aligned to element size, but GL has bytes regardless of element
    * size which would only leave us with minimum alignment restriction of 16
    * which doesn't make much sense if the type isn't 4x32bit). Due to
    * translation of offsets to first_elem in sampler_views it actually seems
    * gallium could not do anything else except 16 no matter what...
    */
   if (!aligned) {
      LLVMSetAlignment(res, 1);
   } else if (!util_is_power_of_two_or_zero(src_width)) {
      /*
       * Full alignment is impossible, assume the caller really meant
       * the individual elements were aligned (e.g. 3x32bit format).
       * And yes the generated code may otherwise crash, llvm will
       * really assume 128bit alignment with a 96bit fetch (I suppose
       * that makes sense as it can just assume the upper 32bit to be
       * whatever).
       * Maybe the caller should be able to explicitly set this, but
       * this should cover all the 3-channel formats.
       */
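      /* E.g. a 96-bit (3x32) fetch ends up with 96 / 24 = 4 byte alignment. */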
      if (((src_width / 24) * 24 == src_width) &&
          util_is_power_of_two_or_zero(src_width / 24)) {
         LLVMSetAlignment(res, src_width / 24);
      } else {
         LLVMSetAlignment(res, 1);
      }
   }

   assert(src_width <= dst_width);
   if (src_width < dst_width) {
      res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");
      if (vector_justify) {
#if UTIL_ARCH_BIG_ENDIAN
         res = LLVMBuildShl(gallivm->builder, res,
                            LLVMConstInt(dst_elem_type, dst_width - src_width, 0), "");
#endif
      }
   }

   return res;
}
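
/*
 * Example (sketch; the surrounding names are illustrative, not part of the
 * API): gathering four 8-bit texels into 32-bit integer elements, one at a
 * time:
 *
 *    for (i = 0; i < 4; i++)
 *       elems[i] = lp_build_gather_elem(gallivm, 4, 8, 32, TRUE,
 *                                       base_ptr, offsets, i, FALSE);
 *
 * Each elems[i] is an i8 load zero-extended to i32 (and shifted up on
 * big-endian targets when vector_justify is requested).
 */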


/**
 * Gather one element from scatter positions in memory.
 * Nearly the same as above, however the individual elements
 * may be vectors themselves, and fetches may be float type.
 * Can also pad the vector instead of using ZExt.
 *
 * @sa lp_build_gather()
 */
static LLVMValueRef
lp_build_gather_elem_vec(struct gallivm_state *gallivm,
                         unsigned length,
                         unsigned src_width,
                         LLVMTypeRef src_type,
                         struct lp_type dst_type,
                         boolean aligned,
                         LLVMValueRef base_ptr,
                         LLVMValueRef offsets,
                         unsigned i,
                         boolean vector_justify)
{
   LLVMValueRef ptr, res;
   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));

   ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i);
   ptr = LLVMBuildBitCast(gallivm->builder, ptr, LLVMPointerType(src_type, 0), "");
   res = LLVMBuildLoad2(gallivm->builder, src_type, ptr, "");

   /* XXX
    * On some archs we probably really want to avoid having to deal
    * with alignments lower than 4 bytes (if fetch size is a power of
    * two >= 32). On x86 it doesn't matter, however.
    * We should be able to guarantee full alignment for any kind of texture
    * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch
    * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends
    * but I don't think that's quite what we wanted).
    * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT
    * looks like a good fit, but it seems this cap bit (and OpenGL) aren't
    * enforcing what we want (which is what d3d10 does, the offset needs to
    * be aligned to element size, but GL has bytes regardless of element
    * size which would only leave us with minimum alignment restriction of 16
    * which doesn't make much sense if the type isn't 4x32bit). Due to
    * translation of offsets to first_elem in sampler_views it actually seems
    * gallium could not do anything else except 16 no matter what...
    */
   if (!aligned) {
      LLVMSetAlignment(res, 1);
   } else if (!util_is_power_of_two_or_zero(src_width)) {
      /*
       * Full alignment is impossible, assume the caller really meant
       * the individual elements were aligned (e.g. 3x32bit format).
       * And yes the generated code may otherwise crash, llvm will
       * really assume 128bit alignment with a 96bit fetch (I suppose
       * that makes sense as it can just assume the upper 32bit to be
       * whatever).
       * Maybe the caller should be able to explicitly set this, but
       * this should cover all the 3-channel formats.
       */
      if (((src_width / 24) * 24 == src_width) &&
          util_is_power_of_two_or_zero(src_width / 24)) {
         LLVMSetAlignment(res, src_width / 24);
      } else {
         LLVMSetAlignment(res, 1);
      }
   }

   assert(src_width <= dst_type.width * dst_type.length);
   if (src_width < dst_type.width * dst_type.length) {
      if (dst_type.length > 1) {
         res = lp_build_pad_vector(gallivm, res, dst_type.length);
         /*
          * vector_justify is hopefully a non-issue since we only deal
          * with src_width >= 32 here?
          */
      } else {
         LLVMTypeRef dst_elem_type = lp_build_vec_type(gallivm, dst_type);

         /*
          * Only valid if src_ptr_type is int type...
          */
         res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, "");

#if UTIL_ARCH_BIG_ENDIAN
         if (vector_justify) {
            res = LLVMBuildShl(gallivm->builder, res,
                               LLVMConstInt(dst_elem_type,
                                            dst_type.width - src_width, 0), "");
         }
         if (src_width == 48) {
            /* Load 3x16 bit vector.
             * The sequence of loads on big-endian hardware proceeds as follows.
             * 16-bit fields are denoted by X, Y, Z, and 0.  In memory, the sequence
             * of three fields appears in the order X, Y, Z.
             *
             * Load 32-bit word: 0.0.X.Y
             * Load 16-bit halfword: 0.0.0.Z
             * Rotate left: 0.X.Y.0
             * Bitwise OR: 0.X.Y.Z
             *
             * The order in which we need the fields in the result is 0.Z.Y.X,
             * the same as on little-endian; permute 16-bit fields accordingly
             * within 64-bit register:
             */
            LLVMValueRef shuffles[4] = {
               lp_build_const_int32(gallivm, 2),
               lp_build_const_int32(gallivm, 1),
               lp_build_const_int32(gallivm, 0),
               lp_build_const_int32(gallivm, 3),
            };
            res = LLVMBuildBitCast(gallivm->builder, res,
                                   lp_build_vec_type(gallivm, lp_type_uint_vec(16, 4*16)), "");
            res = LLVMBuildShuffleVector(gallivm->builder, res, res, LLVMConstVector(shuffles, 4), "");
            res = LLVMBuildBitCast(gallivm->builder, res, dst_elem_type, "");
         }
#endif
      }
   }
   return res;
}


static LLVMValueRef
lp_build_gather_avx2(struct gallivm_state *gallivm,
                     unsigned length,
                     unsigned src_width,
                     struct lp_type dst_type,
                     LLVMValueRef base_ptr,
                     LLVMValueRef offsets)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef src_type, src_vec_type;
   LLVMValueRef res;
   struct lp_type res_type = dst_type;
   res_type.length *= length;

   if (dst_type.floating) {
      src_type = src_width == 64 ? LLVMDoubleTypeInContext(gallivm->context) :
                                   LLVMFloatTypeInContext(gallivm->context);
   } else {
      src_type = LLVMIntTypeInContext(gallivm->context, src_width);
   }
   src_vec_type = LLVMVectorType(src_type, length);

   /* XXX should allow hw scaling (can handle i8, i16, i32, i64 for x86) */
   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0));

   if (0) {
      /*
       * XXX: This will cause LLVM pre 3.7 to hang; it works on LLVM 3.8 but
       * will not use the AVX2 gather intrinsics (even with llvm 4.0), at
       * least with Haswell. See
       * http://lists.llvm.org/pipermail/llvm-dev/2016-January/094448.html
       * And the generated code doing the emulation is quite a bit worse
       * than what we get by doing it ourselves too.
       */
      LLVMTypeRef i32_type = LLVMIntTypeInContext(gallivm->context, 32);
      LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length);
      LLVMTypeRef i1_type = LLVMIntTypeInContext(gallivm->context, 1);
      LLVMTypeRef i1_vec_type = LLVMVectorType(i1_type, length);
      LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
      LLVMValueRef src_ptr;

      base_ptr = LLVMBuildBitCast(builder, base_ptr, src_ptr_type, "");

      /* Rescale offsets from bytes to elements */
      LLVMValueRef scale = LLVMConstInt(i32_type, src_width/8, 0);
      scale = lp_build_broadcast(gallivm, i32_vec_type, scale);
      assert(LLVMTypeOf(offsets) == i32_vec_type);
      offsets = LLVMBuildSDiv(builder, offsets, scale, "");

      src_ptr = LLVMBuildGEP2(builder, src_type, base_ptr, &offsets, 1, "vector-gep");

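      /*
       * Note: for e.g. length == 8 with 32-bit float elements the snprintf
       * below produces the overloaded name "llvm.masked.gather.v8f32".
       */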
      char intrinsic[64];
      snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%u%s%u",
               length, dst_type.floating ? "f" : "i", src_width);
      LLVMValueRef alignment = LLVMConstInt(i32_type, src_width/8, 0);
      LLVMValueRef mask = LLVMConstAllOnes(i1_vec_type);
      LLVMValueRef passthru = LLVMGetUndef(src_vec_type);

      LLVMValueRef args[] = { src_ptr, alignment, mask, passthru };

      res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 4, 0);
   } else {
      LLVMTypeRef i8_type = LLVMIntTypeInContext(gallivm->context, 8);
      const char *intrinsic = NULL;
      unsigned l_idx = 0;

      assert(src_width == 32 || src_width == 64);
      if (src_width == 32) {
         assert(length == 4 || length == 8);
      } else {
         assert(length == 2 || length == 4);
      }

      static const char *intrinsics[2][2][2] = {
         {{"llvm.x86.avx2.gather.d.d",
           "llvm.x86.avx2.gather.d.d.256"},
          {"llvm.x86.avx2.gather.d.q",
           "llvm.x86.avx2.gather.d.q.256"}},

         {{"llvm.x86.avx2.gather.d.ps",
           "llvm.x86.avx2.gather.d.ps.256"},
          {"llvm.x86.avx2.gather.d.pd",
           "llvm.x86.avx2.gather.d.pd.256"}},
      };

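      /*
       * Indexed as [dst_type.floating][src_width == 64][l_idx], where l_idx
       * selects the 256-bit variant; e.g. gathering 8 x 32-bit floats ends
       * up with "llvm.x86.avx2.gather.d.ps.256".
       */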
      if ((src_width == 32 && length == 8) ||
          (src_width == 64 && length == 4)) {
         l_idx = 1;
      }
      intrinsic = intrinsics[dst_type.floating][src_width == 64][l_idx];

      LLVMValueRef passthru = LLVMGetUndef(src_vec_type);
      LLVMValueRef mask = LLVMConstAllOnes(src_vec_type);
      mask = LLVMConstBitCast(mask, src_vec_type);
      LLVMValueRef scale = LLVMConstInt(i8_type, 1, 0);

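      /*
       * Intrinsic argument order: passthru value (unused here since the mask
       * is all-ones), i8* base pointer, i32 byte offsets, per-lane mask, and
       * a scale of 1 because the offsets are already in bytes.
       */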
      LLVMValueRef args[] = { passthru, base_ptr, offsets, mask, scale };

      res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 5, 0);
   }
   res = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, res_type), "");

   return res;
}


/**
 * Gather elements from scatter positions in memory into a single vector.
 * Use for fetching texels from a texture.
 * For SSE, typical values are length=4, src_width=32, dst_width=32.
 *
 * When src_width < dst_width, the return value can be justified in
 * one of two ways:
 * "integer justification" is used when the caller treats the destination
 * as a packed integer bitmask, as described by the channels' "shift" and
 * "width" fields;
 * "vector justification" is used when the caller casts the destination
 * to a vector and needs channel X to be in vector element 0.
 *
 * @param length number of offsets to gather
 * @param src_width src element width in bits
 * @param dst_type result element type (src will be expanded to fit,
 *        but truncation is not allowed)
 *        (this may be a vector, which must be power-of-two sized)
 * @param aligned whether the data is guaranteed to be aligned (to src_width)
 * @param base_ptr base pointer, needs to be an i8 pointer type.
 * @param offsets vector with offsets
 * @param vector_justify select vector rather than integer justification
 */
LLVMValueRef
lp_build_gather(struct gallivm_state *gallivm,
                unsigned length,
                unsigned src_width,
                struct lp_type dst_type,
                boolean aligned,
                LLVMValueRef base_ptr,
                LLVMValueRef offsets,
                boolean vector_justify)
{
   LLVMValueRef res;
   boolean need_expansion = src_width < dst_type.width * dst_type.length;
   boolean vec_fetch;
   struct lp_type fetch_type, fetch_dst_type;
   LLVMTypeRef src_type;

   assert(src_width <= dst_type.width * dst_type.length);

   /*
    * This is quite a mess...
    * Figure out if the fetch should be done as:
    * a) scalar or vector
    * b) float or int
    *
    * As an example, for a 96bit fetch expanded into 4x32bit, it is better
    * to use a (3x32bit) vector type (then pad the vector). Otherwise, the
    * zext will cause extra instructions.
    * However, the same isn't true for 3x16bit (the codegen for that is
    * completely worthless on x86 simd, and for 3x8bit it is way worse
    * still), so don't try that... (To get really good code out of llvm for
    * these cases, the only way is to decompose the fetches manually
    * into 1x32bit/1x16bit, or 1x16/1x8bit respectively, although the latter
    * case requires sse41, otherwise simple scalar zext is way better.
    * But probably not important enough, so don't bother.)
    * Also, we try to honor the floating bit of the destination (but this
    * isn't possible if the caller asks, for instance, for a 2x32bit dst_type
    * with a 48bit fetch - the idea would be to use a 3x16bit fetch, pad and
    * cast to a 2x32f type, so the fetch is always int and on top of that
    * we avoid the vec pad and use scalar zext due to the above mentioned
    * issue).
    * Note this is optimized for the x86 sse2-and-up backend. Could be
    * tweaked for other archs if necessary...
    */
   if (((src_width % 32) == 0) && ((src_width % dst_type.width) == 0) &&
       (dst_type.length > 1)) {
      /* use vector fetch (if dst_type is vector) */
      vec_fetch = TRUE;
      if (dst_type.floating) {
         fetch_type = lp_type_float_vec(dst_type.width, src_width);
      } else {
         fetch_type = lp_type_int_vec(dst_type.width, src_width);
      }
      /* intentionally not using lp_build_vec_type here */
      src_type = LLVMVectorType(lp_build_elem_type(gallivm, fetch_type),
                                fetch_type.length);
      fetch_dst_type = fetch_type;
      fetch_dst_type.length = dst_type.length;
   } else {
      /* use scalar fetch */
      vec_fetch = FALSE;
      if (dst_type.floating && ((src_width == 32) || (src_width == 64))) {
         fetch_type = lp_type_float(src_width);
      } else {
         fetch_type = lp_type_int(src_width);
      }
      src_type = lp_build_vec_type(gallivm, fetch_type);
      fetch_dst_type = fetch_type;
      fetch_dst_type.width = dst_type.width * dst_type.length;
   }
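
   /*
    * So, e.g., a 96-bit fetch into 4x32 lanes goes down the vector path
    * (a <3 x i32> / <3 x float> load, padded to 4 elements later), whereas
    * a 16-bit fetch into a 32-bit lane uses a scalar i16 load that is
    * zero-extended afterwards.
    */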

   if (length == 1) {
      /* Scalar */
      res = lp_build_gather_elem_vec(gallivm, length,
                                     src_width, src_type, fetch_dst_type,
                                     aligned, base_ptr, offsets, 0,
                                     vector_justify);
      return LLVMBuildBitCast(gallivm->builder, res,
                              lp_build_vec_type(gallivm, dst_type), "");
      /*
       * Excluding expansion from these paths because if you need it for
       * 32bit/64bit fetches you're doing it wrong (this is gather, not
       * conversion) and it would be awkward for floats.
       */
   } else if (util_get_cpu_caps()->has_avx2 && !need_expansion &&
              src_width == 32 && (length == 4 || length == 8)) {
      return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
                                  base_ptr, offsets);
   /*
    * This looks bad on paper wrt throughput/latency on Haswell.
    * Even on Broadwell it doesn't look stellar.
    * Albeit no measurements were done (but tested to work).
    * Should definitely enable on Skylake.
    * (In general, should be more of a win if the fetch is 256bit wide -
    * this is true for the 32bit case above too.)
    */
   } else if (0 && util_get_cpu_caps()->has_avx2 && !need_expansion &&
              src_width == 64 && (length == 2 || length == 4)) {
      return lp_build_gather_avx2(gallivm, length, src_width, dst_type,
                                  base_ptr, offsets);
   } else {
      /* Vector */

      LLVMValueRef elems[LP_MAX_VECTOR_WIDTH / 8];
      unsigned i;
      boolean vec_zext = FALSE;
      struct lp_type res_type, gather_res_type;
      LLVMTypeRef res_t, gather_res_t;

      res_type = fetch_dst_type;
      res_type.length *= length;
      gather_res_type = res_type;

      if (src_width == 16 && dst_type.width == 32 && dst_type.length == 1) {
         /*
          * Note that llvm is never able to optimize zext/insert combos
          * directly (i.e. zero the simd reg, then place the elements into
          * the appropriate place directly). (I think this has to do with
          * scalar/vector transition.) And scalar 16->32bit zext simd loads
          * aren't possible (instead loading to scalar reg first).
          * No idea about other archs...
          * We could do this manually, but instead we just use a vector
          * zext, which is simple enough (and, in fact, llvm might optimize
          * this away).
          * (We're not trying that with other bit widths as that might not be
          * easier, in particular with 8 bit values at least with only sse2.)
          */
         assert(vec_fetch == FALSE);
         gather_res_type.width /= 2;
         fetch_dst_type = fetch_type;
         src_type = lp_build_vec_type(gallivm, fetch_type);
         vec_zext = TRUE;
      }
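
      /*
       * In that case the elements are first gathered into half-width lanes
       * (e.g. <8 x i16> for length 8) and widened to the full <8 x i32>
       * with a single vector zext after the insert loop below.
       */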
      res_t = lp_build_vec_type(gallivm, res_type);
      gather_res_t = lp_build_vec_type(gallivm, gather_res_type);
      res = LLVMGetUndef(gather_res_t);
      for (i = 0; i < length; ++i) {
         LLVMValueRef index = lp_build_const_int32(gallivm, i);
         elems[i] = lp_build_gather_elem_vec(gallivm, length,
                                             src_width, src_type, fetch_dst_type,
                                             aligned, base_ptr, offsets, i,
                                             vector_justify);
         if (!vec_fetch) {
            res = LLVMBuildInsertElement(gallivm->builder, res, elems[i], index, "");
         }
      }
      if (vec_zext) {
         res = LLVMBuildZExt(gallivm->builder, res, res_t, "");
         if (vector_justify) {
#if UTIL_ARCH_BIG_ENDIAN
            unsigned sv = dst_type.width - src_width;
            res = LLVMBuildShl(gallivm->builder, res,
                               lp_build_const_int_vec(gallivm, res_type, sv), "");
#endif
         }
      }
      if (vec_fetch) {
         /*
          * Do bitcast now otherwise llvm might get some funny ideas wrt
          * float/int types...
          */
         for (i = 0; i < length; i++) {
            elems[i] = LLVMBuildBitCast(gallivm->builder, elems[i],
                                        lp_build_vec_type(gallivm, dst_type), "");
         }
         res = lp_build_concat(gallivm, elems, dst_type, length);
      } else {
         struct lp_type really_final_type = dst_type;
         assert(res_type.length * res_type.width ==
                dst_type.length * dst_type.width * length);
         really_final_type.length *= length;
         res = LLVMBuildBitCast(gallivm->builder, res,
                                lp_build_vec_type(gallivm, really_final_type), "");
      }
   }

   return res;
}
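
/*
 * Example usage (sketch; the variable names here are illustrative only):
 * gather four 32-bit texels into a single <4 x i32>, given an i8* base
 * pointer and a <4 x i32> vector of byte offsets:
 *
 *    struct lp_type dst_type = lp_type_int(32);
 *    LLVMValueRef packed = lp_build_gather(gallivm, 4, 32, dst_type,
 *                                          TRUE, base_ptr, offsets, FALSE);
 *
 * With AVX2 available this goes through lp_build_gather_avx2(); otherwise
 * each element is loaded separately and inserted into the result vector.
 */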

LLVMValueRef
lp_build_gather_values(struct gallivm_state * gallivm,
                       LLVMValueRef * values,
                       unsigned value_count)
{
   LLVMTypeRef vec_type = LLVMVectorType(LLVMTypeOf(values[0]), value_count);
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef vec = LLVMGetUndef(vec_type);
   unsigned i;

   for (i = 0; i < value_count; i++) {
      LLVMValueRef index = lp_build_const_int32(gallivm, i);
      vec = LLVMBuildInsertElement(builder, vec, values[i], index, "");
   }
   return vec;
}
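
/*
 * Example (sketch): turn a handful of scalar SSA values of the same type
 * into one vector, e.g. four i32 values into a <4 x i32>:
 *
 *    LLVMValueRef vals[4] = { a, b, c, d };
 *    LLVMValueRef vec = lp_build_gather_values(gallivm, vals, 4);
 *
 * (a, b, c, d are placeholders for whatever scalar values the caller has.)
 */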