1bf215546Sopenharmony_ci/**************************************************************************
2bf215546Sopenharmony_ci *
3bf215546Sopenharmony_ci * Copyright 2009 VMware, Inc.
4bf215546Sopenharmony_ci * All Rights Reserved.
5bf215546Sopenharmony_ci *
6bf215546Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
7bf215546Sopenharmony_ci * copy of this software and associated documentation files (the
8bf215546Sopenharmony_ci * "Software"), to deal in the Software without restriction, including
9bf215546Sopenharmony_ci * without limitation the rights to use, copy, modify, merge, publish,
10bf215546Sopenharmony_ci * distribute, sub license, and/or sell copies of the Software, and to
11bf215546Sopenharmony_ci * permit persons to whom the Software is furnished to do so, subject to
12bf215546Sopenharmony_ci * the following conditions:
13bf215546Sopenharmony_ci *
14bf215546Sopenharmony_ci * The above copyright notice and this permission notice (including the
15bf215546Sopenharmony_ci * next paragraph) shall be included in all copies or substantial portions
16bf215546Sopenharmony_ci * of the Software.
17bf215546Sopenharmony_ci *
18bf215546Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19bf215546Sopenharmony_ci * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20bf215546Sopenharmony_ci * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21bf215546Sopenharmony_ci * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22bf215546Sopenharmony_ci * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23bf215546Sopenharmony_ci * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24bf215546Sopenharmony_ci * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25bf215546Sopenharmony_ci *
26bf215546Sopenharmony_ci **************************************************************************/
27bf215546Sopenharmony_ci
28bf215546Sopenharmony_ci
29bf215546Sopenharmony_ci/**
30bf215546Sopenharmony_ci * @file
31bf215546Sopenharmony_ci * Helper functions for packing/unpacking.
32bf215546Sopenharmony_ci *
33bf215546Sopenharmony_ci * Pack/unpacking is necessary for conversion between types of different
34bf215546Sopenharmony_ci * bit width.
35bf215546Sopenharmony_ci *
 * They are also commonly used when a computation needs higher
 * precision for the intermediate values. For example, if one needs the
 * function:
39bf215546Sopenharmony_ci *
40bf215546Sopenharmony_ci *   c = compute(a, b);
41bf215546Sopenharmony_ci *
42bf215546Sopenharmony_ci * to use more precision for intermediate results then one should implement it
43bf215546Sopenharmony_ci * as:
44bf215546Sopenharmony_ci *
 *   LLVMValueRef
 *   compute(struct gallivm_state *gallivm, struct lp_type type,
 *           LLVMValueRef a, LLVMValueRef b)
 *   {
 *      struct lp_type wide_type = lp_wider_type(type);
 *      LLVMValueRef al, ah, bl, bh, cl, ch, c;
 *
 *      lp_build_unpack2(gallivm, type, wide_type, a, &al, &ah);
 *      lp_build_unpack2(gallivm, type, wide_type, b, &bl, &bh);
 *
 *      cl = compute_half(al, bl);
 *      ch = compute_half(ah, bh);
 *
 *      c = lp_build_pack2(gallivm, wide_type, type, cl, ch);
 *
 *      return c;
 *   }
61bf215546Sopenharmony_ci *
62bf215546Sopenharmony_ci * where compute_half() would do the computation for half the elements with
63bf215546Sopenharmony_ci * twice the precision.
64bf215546Sopenharmony_ci *
65bf215546Sopenharmony_ci * @author Jose Fonseca <jfonseca@vmware.com>
66bf215546Sopenharmony_ci */
67bf215546Sopenharmony_ci
68bf215546Sopenharmony_ci
69bf215546Sopenharmony_ci#include "util/u_debug.h"
70bf215546Sopenharmony_ci#include "util/u_math.h"
71bf215546Sopenharmony_ci#include "util/u_cpu_detect.h"
72bf215546Sopenharmony_ci#include "util/u_memory.h"
73bf215546Sopenharmony_ci
74bf215546Sopenharmony_ci#include "lp_bld_type.h"
75bf215546Sopenharmony_ci#include "lp_bld_const.h"
76bf215546Sopenharmony_ci#include "lp_bld_init.h"
77bf215546Sopenharmony_ci#include "lp_bld_intr.h"
78bf215546Sopenharmony_ci#include "lp_bld_arit.h"
79bf215546Sopenharmony_ci#include "lp_bld_pack.h"
80bf215546Sopenharmony_ci#include "lp_bld_swizzle.h"
81bf215546Sopenharmony_ci
82bf215546Sopenharmony_ci
83bf215546Sopenharmony_ci/**
84bf215546Sopenharmony_ci * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
85bf215546Sopenharmony_ci */
86bf215546Sopenharmony_cistatic LLVMValueRef
87bf215546Sopenharmony_cilp_build_const_unpack_shuffle(struct gallivm_state *gallivm,
88bf215546Sopenharmony_ci                              unsigned n, unsigned lo_hi)
89bf215546Sopenharmony_ci{
90bf215546Sopenharmony_ci   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
91bf215546Sopenharmony_ci   unsigned i, j;
92bf215546Sopenharmony_ci
93bf215546Sopenharmony_ci   assert(n <= LP_MAX_VECTOR_LENGTH);
94bf215546Sopenharmony_ci   assert(lo_hi < 2);
95bf215546Sopenharmony_ci
96bf215546Sopenharmony_ci   /* TODO: cache results in a static table */
97bf215546Sopenharmony_ci
98bf215546Sopenharmony_ci   for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
99bf215546Sopenharmony_ci      elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
100bf215546Sopenharmony_ci      elems[i + 1] = lp_build_const_int32(gallivm, n + j);
101bf215546Sopenharmony_ci   }
102bf215546Sopenharmony_ci
103bf215546Sopenharmony_ci   return LLVMConstVector(elems, n);
104bf215546Sopenharmony_ci}
105bf215546Sopenharmony_ci
106bf215546Sopenharmony_ci/**
107bf215546Sopenharmony_ci * Similar to lp_build_const_unpack_shuffle but for special AVX 256bit unpack.
108bf215546Sopenharmony_ci * See comment above lp_build_interleave2_half for more details.
109bf215546Sopenharmony_ci */
110bf215546Sopenharmony_cistatic LLVMValueRef
111bf215546Sopenharmony_cilp_build_const_unpack_shuffle_half(struct gallivm_state *gallivm,
112bf215546Sopenharmony_ci                                   unsigned n, unsigned lo_hi)
113bf215546Sopenharmony_ci{
114bf215546Sopenharmony_ci   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
115bf215546Sopenharmony_ci   unsigned i, j;
116bf215546Sopenharmony_ci
117bf215546Sopenharmony_ci   assert(n <= LP_MAX_VECTOR_LENGTH);
118bf215546Sopenharmony_ci   assert(lo_hi < 2);
119bf215546Sopenharmony_ci
120bf215546Sopenharmony_ci   for (i = 0, j = lo_hi*(n/4); i < n; i += 2, ++j) {
121bf215546Sopenharmony_ci      if (i == (n / 2))
122bf215546Sopenharmony_ci         j += n / 4;
123bf215546Sopenharmony_ci
124bf215546Sopenharmony_ci      elems[i + 0] = lp_build_const_int32(gallivm, 0 + j);
125bf215546Sopenharmony_ci      elems[i + 1] = lp_build_const_int32(gallivm, n + j);
126bf215546Sopenharmony_ci   }
127bf215546Sopenharmony_ci
128bf215546Sopenharmony_ci   return LLVMConstVector(elems, n);
129bf215546Sopenharmony_ci}
130bf215546Sopenharmony_ci
131bf215546Sopenharmony_ci/**
132bf215546Sopenharmony_ci * Similar to lp_build_const_unpack_shuffle_half, but for AVX512
133bf215546Sopenharmony_ci * See comment above lp_build_interleave2_half for more details.
134bf215546Sopenharmony_ci */
135bf215546Sopenharmony_cistatic LLVMValueRef
136bf215546Sopenharmony_cilp_build_const_unpack_shuffle_16wide(struct gallivm_state *gallivm,
137bf215546Sopenharmony_ci                                     unsigned lo_hi)
138bf215546Sopenharmony_ci{
139bf215546Sopenharmony_ci   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
140bf215546Sopenharmony_ci   unsigned i, j;
141bf215546Sopenharmony_ci
142bf215546Sopenharmony_ci   assert(lo_hi < 2);
143bf215546Sopenharmony_ci
144bf215546Sopenharmony_ci   // for the following lo_hi setting, convert 0 -> f to:
145bf215546Sopenharmony_ci   // 0: 0 16 4 20  8 24 12 28 1 17 5 21  9 25 13 29
146bf215546Sopenharmony_ci   // 1: 2 18 6 22 10 26 14 30 3 19 7 23 11 27 15 31
147bf215546Sopenharmony_ci   for (i = 0; i < 16; i++) {
148bf215546Sopenharmony_ci      j = ((i&0x06)<<1) + ((i&1)<<4) + (i>>3) + (lo_hi<<1);
149bf215546Sopenharmony_ci
150bf215546Sopenharmony_ci      elems[i] = lp_build_const_int32(gallivm, j);
151bf215546Sopenharmony_ci   }
152bf215546Sopenharmony_ci
153bf215546Sopenharmony_ci   return LLVMConstVector(elems, 16);
154bf215546Sopenharmony_ci}
155bf215546Sopenharmony_ci
156bf215546Sopenharmony_ci/**
157bf215546Sopenharmony_ci * Build shuffle vectors that match PACKxx (SSE) instructions or
158bf215546Sopenharmony_ci * VPERM (Altivec).
159bf215546Sopenharmony_ci */
160bf215546Sopenharmony_cistatic LLVMValueRef
161bf215546Sopenharmony_cilp_build_const_pack_shuffle(struct gallivm_state *gallivm, unsigned n)
162bf215546Sopenharmony_ci{
163bf215546Sopenharmony_ci   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
164bf215546Sopenharmony_ci   unsigned i;
165bf215546Sopenharmony_ci
166bf215546Sopenharmony_ci   assert(n <= LP_MAX_VECTOR_LENGTH);
167bf215546Sopenharmony_ci
168bf215546Sopenharmony_ci   for(i = 0; i < n; ++i)
169bf215546Sopenharmony_ci#if UTIL_ARCH_LITTLE_ENDIAN
170bf215546Sopenharmony_ci      elems[i] = lp_build_const_int32(gallivm, 2*i);
171bf215546Sopenharmony_ci#else
172bf215546Sopenharmony_ci      elems[i] = lp_build_const_int32(gallivm, 2*i+1);
173bf215546Sopenharmony_ci#endif
174bf215546Sopenharmony_ci
175bf215546Sopenharmony_ci   return LLVMConstVector(elems, n);
176bf215546Sopenharmony_ci}
177bf215546Sopenharmony_ci
178bf215546Sopenharmony_ci/**
179bf215546Sopenharmony_ci * Return a vector with elements src[start:start+size]
180bf215546Sopenharmony_ci * Most useful for getting half the values out of a 256bit sized vector,
181bf215546Sopenharmony_ci * otherwise may cause data rearrangement to happen.
182bf215546Sopenharmony_ci */
183bf215546Sopenharmony_ciLLVMValueRef
184bf215546Sopenharmony_cilp_build_extract_range(struct gallivm_state *gallivm,
185bf215546Sopenharmony_ci                       LLVMValueRef src,
186bf215546Sopenharmony_ci                       unsigned start,
187bf215546Sopenharmony_ci                       unsigned size)
188bf215546Sopenharmony_ci{
189bf215546Sopenharmony_ci   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
190bf215546Sopenharmony_ci   unsigned i;
191bf215546Sopenharmony_ci
192bf215546Sopenharmony_ci   assert(size <= ARRAY_SIZE(elems));
193bf215546Sopenharmony_ci
194bf215546Sopenharmony_ci   for (i = 0; i < size; ++i)
195bf215546Sopenharmony_ci      elems[i] = lp_build_const_int32(gallivm, i + start);
196bf215546Sopenharmony_ci
197bf215546Sopenharmony_ci   if (size == 1) {
198bf215546Sopenharmony_ci      return LLVMBuildExtractElement(gallivm->builder, src, elems[0], "");
199bf215546Sopenharmony_ci   }
200bf215546Sopenharmony_ci   else {
201bf215546Sopenharmony_ci      return LLVMBuildShuffleVector(gallivm->builder, src, src,
202bf215546Sopenharmony_ci                                    LLVMConstVector(elems, size), "");
203bf215546Sopenharmony_ci   }
204bf215546Sopenharmony_ci}
205bf215546Sopenharmony_ci
206bf215546Sopenharmony_ci/**
207bf215546Sopenharmony_ci * Concatenates several (must be a power of 2) vectors (of same type)
208bf215546Sopenharmony_ci * into a larger one.
209bf215546Sopenharmony_ci * Most useful for building up a 256bit sized vector out of two 128bit ones.
210bf215546Sopenharmony_ci */
211bf215546Sopenharmony_ciLLVMValueRef
212bf215546Sopenharmony_cilp_build_concat(struct gallivm_state *gallivm,
213bf215546Sopenharmony_ci                LLVMValueRef src[],
214bf215546Sopenharmony_ci                struct lp_type src_type,
215bf215546Sopenharmony_ci                unsigned num_vectors)
216bf215546Sopenharmony_ci{
217bf215546Sopenharmony_ci   unsigned new_length, i;
218bf215546Sopenharmony_ci   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH/2];
219bf215546Sopenharmony_ci   LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
220bf215546Sopenharmony_ci
221bf215546Sopenharmony_ci   assert(src_type.length * num_vectors <= ARRAY_SIZE(shuffles));
222bf215546Sopenharmony_ci   assert(util_is_power_of_two_or_zero(num_vectors));
223bf215546Sopenharmony_ci
224bf215546Sopenharmony_ci   new_length = src_type.length;
225bf215546Sopenharmony_ci
226bf215546Sopenharmony_ci   for (i = 0; i < num_vectors; i++)
227bf215546Sopenharmony_ci      tmp[i] = src[i];
228bf215546Sopenharmony_ci
229bf215546Sopenharmony_ci   while (num_vectors > 1) {
230bf215546Sopenharmony_ci      num_vectors >>= 1;
231bf215546Sopenharmony_ci      new_length <<= 1;
232bf215546Sopenharmony_ci      for (i = 0; i < new_length; i++) {
233bf215546Sopenharmony_ci         shuffles[i] = lp_build_const_int32(gallivm, i);
234bf215546Sopenharmony_ci      }
235bf215546Sopenharmony_ci      for (i = 0; i < num_vectors; i++) {
236bf215546Sopenharmony_ci         tmp[i] = LLVMBuildShuffleVector(gallivm->builder, tmp[i*2], tmp[i*2 + 1],
237bf215546Sopenharmony_ci                                         LLVMConstVector(shuffles, new_length), "");
238bf215546Sopenharmony_ci      }
239bf215546Sopenharmony_ci   }
240bf215546Sopenharmony_ci
241bf215546Sopenharmony_ci   return tmp[0];
242bf215546Sopenharmony_ci}
243bf215546Sopenharmony_ci
244bf215546Sopenharmony_ci
245bf215546Sopenharmony_ci/**
246bf215546Sopenharmony_ci * Combines vectors to reduce from num_srcs to num_dsts.
247bf215546Sopenharmony_ci * Returns the number of src vectors concatenated in a single dst.
248bf215546Sopenharmony_ci *
249bf215546Sopenharmony_ci * num_srcs must be exactly divisible by num_dsts.
250bf215546Sopenharmony_ci *
251bf215546Sopenharmony_ci * e.g. For num_srcs = 4 and src = [x, y, z, w]
252bf215546Sopenharmony_ci *          num_dsts = 1  dst = [xyzw]    return = 4
253bf215546Sopenharmony_ci *          num_dsts = 2  dst = [xy, zw]  return = 2
254bf215546Sopenharmony_ci */
255bf215546Sopenharmony_ciint
256bf215546Sopenharmony_cilp_build_concat_n(struct gallivm_state *gallivm,
257bf215546Sopenharmony_ci                  struct lp_type src_type,
258bf215546Sopenharmony_ci                  LLVMValueRef *src,
259bf215546Sopenharmony_ci                  unsigned num_srcs,
260bf215546Sopenharmony_ci                  LLVMValueRef *dst,
261bf215546Sopenharmony_ci                  unsigned num_dsts)
262bf215546Sopenharmony_ci{
263bf215546Sopenharmony_ci   int size = num_srcs / num_dsts;
264bf215546Sopenharmony_ci   unsigned i;
265bf215546Sopenharmony_ci
266bf215546Sopenharmony_ci   assert(num_srcs >= num_dsts);
267bf215546Sopenharmony_ci   assert((num_srcs % size) == 0);
268bf215546Sopenharmony_ci
269bf215546Sopenharmony_ci   if (num_srcs == num_dsts) {
270bf215546Sopenharmony_ci      for (i = 0; i < num_dsts; ++i) {
271bf215546Sopenharmony_ci         dst[i] = src[i];
272bf215546Sopenharmony_ci      }
273bf215546Sopenharmony_ci      return 1;
274bf215546Sopenharmony_ci   }
275bf215546Sopenharmony_ci
276bf215546Sopenharmony_ci   for (i = 0; i < num_dsts; ++i) {
277bf215546Sopenharmony_ci      dst[i] = lp_build_concat(gallivm, &src[i * size], src_type, size);
278bf215546Sopenharmony_ci   }
279bf215546Sopenharmony_ci
280bf215546Sopenharmony_ci   return size;
281bf215546Sopenharmony_ci}
282bf215546Sopenharmony_ci
283bf215546Sopenharmony_ci
284bf215546Sopenharmony_ci/**
285bf215546Sopenharmony_ci * Un-interleave vector.
286bf215546Sopenharmony_ci * This will return a vector consisting of every second element
287bf215546Sopenharmony_ci * (depending on lo_hi, beginning at 0 or 1).
288bf215546Sopenharmony_ci * The returned vector size (elems and width) will only be half
289bf215546Sopenharmony_ci * that of the source vector.
290bf215546Sopenharmony_ci */
291bf215546Sopenharmony_ciLLVMValueRef
292bf215546Sopenharmony_cilp_build_uninterleave1(struct gallivm_state *gallivm,
293bf215546Sopenharmony_ci                       unsigned num_elems,
294bf215546Sopenharmony_ci                       LLVMValueRef a,
295bf215546Sopenharmony_ci                       unsigned lo_hi)
296bf215546Sopenharmony_ci{
297bf215546Sopenharmony_ci   LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
298bf215546Sopenharmony_ci   unsigned i;
299bf215546Sopenharmony_ci   assert(num_elems <= LP_MAX_VECTOR_LENGTH);
300bf215546Sopenharmony_ci
301bf215546Sopenharmony_ci   for (i = 0; i < num_elems / 2; ++i)
302bf215546Sopenharmony_ci      elems[i] = lp_build_const_int32(gallivm, 2*i + lo_hi);
303bf215546Sopenharmony_ci
304bf215546Sopenharmony_ci   shuffle = LLVMConstVector(elems, num_elems / 2);
305bf215546Sopenharmony_ci
306bf215546Sopenharmony_ci   return LLVMBuildShuffleVector(gallivm->builder, a, a, shuffle, "");
307bf215546Sopenharmony_ci}
308bf215546Sopenharmony_ci
309bf215546Sopenharmony_ci
310bf215546Sopenharmony_ci/**
311bf215546Sopenharmony_ci * Interleave vector elements.
312bf215546Sopenharmony_ci *
313bf215546Sopenharmony_ci * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions
314bf215546Sopenharmony_ci * (but not for 256bit AVX vectors).
315bf215546Sopenharmony_ci */
316bf215546Sopenharmony_ciLLVMValueRef
317bf215546Sopenharmony_cilp_build_interleave2(struct gallivm_state *gallivm,
318bf215546Sopenharmony_ci                     struct lp_type type,
319bf215546Sopenharmony_ci                     LLVMValueRef a,
320bf215546Sopenharmony_ci                     LLVMValueRef b,
321bf215546Sopenharmony_ci                     unsigned lo_hi)
322bf215546Sopenharmony_ci{
323bf215546Sopenharmony_ci   LLVMValueRef shuffle;
324bf215546Sopenharmony_ci
325bf215546Sopenharmony_ci   if (type.length == 2 && type.width == 128 && util_get_cpu_caps()->has_avx) {
326bf215546Sopenharmony_ci      /*
327bf215546Sopenharmony_ci       * XXX: This is a workaround for llvm code generation deficiency. Strangely
328bf215546Sopenharmony_ci       * enough, while this needs vinsertf128/vextractf128 instructions (hence
329bf215546Sopenharmony_ci       * a natural match when using 2x128bit vectors) the "normal" unpack shuffle
330bf215546Sopenharmony_ci       * generates code ranging from atrocious (llvm 3.1) to terrible (llvm 3.2, 3.3).
331bf215546Sopenharmony_ci       * So use some different shuffles instead (the exact shuffles don't seem to
332bf215546Sopenharmony_ci       * matter, as long as not using 128bit wide vectors, works with 8x32 or 4x64).
333bf215546Sopenharmony_ci       */
334bf215546Sopenharmony_ci      struct lp_type tmp_type = type;
335bf215546Sopenharmony_ci      LLVMValueRef srchalf[2], tmpdst;
336bf215546Sopenharmony_ci      tmp_type.length = 4;
337bf215546Sopenharmony_ci      tmp_type.width = 64;
338bf215546Sopenharmony_ci      a = LLVMBuildBitCast(gallivm->builder, a, lp_build_vec_type(gallivm, tmp_type), "");
339bf215546Sopenharmony_ci      b = LLVMBuildBitCast(gallivm->builder, b, lp_build_vec_type(gallivm, tmp_type), "");
340bf215546Sopenharmony_ci      srchalf[0] = lp_build_extract_range(gallivm, a, lo_hi * 2, 2);
341bf215546Sopenharmony_ci      srchalf[1] = lp_build_extract_range(gallivm, b, lo_hi * 2, 2);
342bf215546Sopenharmony_ci      tmp_type.length = 2;
343bf215546Sopenharmony_ci      tmpdst = lp_build_concat(gallivm, srchalf, tmp_type, 2);
344bf215546Sopenharmony_ci      return LLVMBuildBitCast(gallivm->builder, tmpdst, lp_build_vec_type(gallivm, type), "");
345bf215546Sopenharmony_ci   }
346bf215546Sopenharmony_ci
347bf215546Sopenharmony_ci   shuffle = lp_build_const_unpack_shuffle(gallivm, type.length, lo_hi);
348bf215546Sopenharmony_ci
349bf215546Sopenharmony_ci   return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
350bf215546Sopenharmony_ci}
351bf215546Sopenharmony_ci
352bf215546Sopenharmony_ci/**
353bf215546Sopenharmony_ci * Interleave vector elements but with 256 (or 512) bit,
354bf215546Sopenharmony_ci * treats it as interleave with 2 concatenated 128 (or 256) bit vectors.
355bf215546Sopenharmony_ci *
356bf215546Sopenharmony_ci * This differs to lp_build_interleave2 as that function would do the following (for lo):
357bf215546Sopenharmony_ci * a0 b0 a1 b1 a2 b2 a3 b3, and this does not compile into an AVX unpack instruction.
358bf215546Sopenharmony_ci *
359bf215546Sopenharmony_ci *
360bf215546Sopenharmony_ci * An example interleave 8x float with 8x float on AVX 256bit unpack:
361bf215546Sopenharmony_ci *   a0 a1 a2 a3 a4 a5 a6 a7 <-> b0 b1 b2 b3 b4 b5 b6 b7
362bf215546Sopenharmony_ci *
363bf215546Sopenharmony_ci * Equivalent to interleaving 2x 128 bit vectors
364bf215546Sopenharmony_ci *   a0 a1 a2 a3 <-> b0 b1 b2 b3 concatenated with a4 a5 a6 a7 <-> b4 b5 b6 b7
365bf215546Sopenharmony_ci *
366bf215546Sopenharmony_ci * So interleave-lo would result in:
367bf215546Sopenharmony_ci *   a0 b0 a1 b1 a4 b4 a5 b5
368bf215546Sopenharmony_ci *
369bf215546Sopenharmony_ci * And interleave-hi would result in:
370bf215546Sopenharmony_ci *   a2 b2 a3 b3 a6 b6 a7 b7
371bf215546Sopenharmony_ci *
372bf215546Sopenharmony_ci * For 512 bits, the following are true:
373bf215546Sopenharmony_ci *
374bf215546Sopenharmony_ci * Interleave-lo would result in (capital letters denote hex indices):
375bf215546Sopenharmony_ci *   a0 b0 a1 b1 a4 b4 a5 b5 a8 b8 a9 b9 aC bC aD bD
376bf215546Sopenharmony_ci *
377bf215546Sopenharmony_ci * Interleave-hi would result in:
378bf215546Sopenharmony_ci *   a2 b2 a3 b3 a6 b6 a7 b7 aA bA aB bB aE bE aF bF
379bf215546Sopenharmony_ci */
380bf215546Sopenharmony_ciLLVMValueRef
381bf215546Sopenharmony_cilp_build_interleave2_half(struct gallivm_state *gallivm,
382bf215546Sopenharmony_ci                          struct lp_type type,
383bf215546Sopenharmony_ci                          LLVMValueRef a,
384bf215546Sopenharmony_ci                          LLVMValueRef b,
385bf215546Sopenharmony_ci                          unsigned lo_hi)
386bf215546Sopenharmony_ci{
387bf215546Sopenharmony_ci   if (type.length * type.width == 256) {
388bf215546Sopenharmony_ci      LLVMValueRef shuffle = lp_build_const_unpack_shuffle_half(gallivm, type.length, lo_hi);
389bf215546Sopenharmony_ci      return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
390bf215546Sopenharmony_ci   } else if ((type.length == 16) && (type.width == 32)) {
391bf215546Sopenharmony_ci      LLVMValueRef shuffle = lp_build_const_unpack_shuffle_16wide(gallivm, lo_hi);
392bf215546Sopenharmony_ci      return LLVMBuildShuffleVector(gallivm->builder, a, b, shuffle, "");
393bf215546Sopenharmony_ci   } else {
394bf215546Sopenharmony_ci      return lp_build_interleave2(gallivm, type, a, b, lo_hi);
395bf215546Sopenharmony_ci   }
396bf215546Sopenharmony_ci}
397bf215546Sopenharmony_ci
398bf215546Sopenharmony_ci
399bf215546Sopenharmony_ci/**
400bf215546Sopenharmony_ci * Double the bit width.
401bf215546Sopenharmony_ci *
402bf215546Sopenharmony_ci * This will only change the number of bits the values are represented, not the
403bf215546Sopenharmony_ci * values themselves.
404bf215546Sopenharmony_ci *
405bf215546Sopenharmony_ci */
406bf215546Sopenharmony_civoid
407bf215546Sopenharmony_cilp_build_unpack2(struct gallivm_state *gallivm,
408bf215546Sopenharmony_ci                 struct lp_type src_type,
409bf215546Sopenharmony_ci                 struct lp_type dst_type,
410bf215546Sopenharmony_ci                 LLVMValueRef src,
411bf215546Sopenharmony_ci                 LLVMValueRef *dst_lo,
412bf215546Sopenharmony_ci                 LLVMValueRef *dst_hi)
413bf215546Sopenharmony_ci{
414bf215546Sopenharmony_ci   LLVMBuilderRef builder = gallivm->builder;
415bf215546Sopenharmony_ci   LLVMValueRef msb;
416bf215546Sopenharmony_ci   LLVMTypeRef dst_vec_type;
417bf215546Sopenharmony_ci
418bf215546Sopenharmony_ci   assert(!src_type.floating);
419bf215546Sopenharmony_ci   assert(!dst_type.floating);
420bf215546Sopenharmony_ci   assert(dst_type.width == src_type.width * 2);
421bf215546Sopenharmony_ci   assert(dst_type.length * 2 == src_type.length);
422bf215546Sopenharmony_ci
423bf215546Sopenharmony_ci   if(dst_type.sign && src_type.sign) {
424bf215546Sopenharmony_ci      /* Replicate the sign bit in the most significant bits */
425bf215546Sopenharmony_ci      msb = LLVMBuildAShr(builder, src, lp_build_const_int_vec(gallivm, src_type, src_type.width - 1), "");
426bf215546Sopenharmony_ci   }
427bf215546Sopenharmony_ci   else
428bf215546Sopenharmony_ci      /* Most significant bits always zero */
429bf215546Sopenharmony_ci      msb = lp_build_zero(gallivm, src_type);
430bf215546Sopenharmony_ci
431bf215546Sopenharmony_ci   /* Interleave bits */
432bf215546Sopenharmony_ci#if UTIL_ARCH_LITTLE_ENDIAN
433bf215546Sopenharmony_ci   *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0);
434bf215546Sopenharmony_ci   *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1);
435bf215546Sopenharmony_ci
436bf215546Sopenharmony_ci#else
437bf215546Sopenharmony_ci   *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0);
438bf215546Sopenharmony_ci   *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1);
439bf215546Sopenharmony_ci#endif
440bf215546Sopenharmony_ci
441bf215546Sopenharmony_ci   /* Cast the result into the new type (twice as wide) */
442bf215546Sopenharmony_ci
443bf215546Sopenharmony_ci   dst_vec_type = lp_build_vec_type(gallivm, dst_type);
444bf215546Sopenharmony_ci
445bf215546Sopenharmony_ci   *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, "");
446bf215546Sopenharmony_ci   *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, "");
447bf215546Sopenharmony_ci}
448bf215546Sopenharmony_ci
449bf215546Sopenharmony_ci
450bf215546Sopenharmony_ci/**
451bf215546Sopenharmony_ci * Double the bit width, with an order which fits the cpu nicely.
452bf215546Sopenharmony_ci *
453bf215546Sopenharmony_ci * This will only change the number of bits the values are represented, not the
454bf215546Sopenharmony_ci * values themselves.
455bf215546Sopenharmony_ci *
456bf215546Sopenharmony_ci * The order of the results is not guaranteed, other than it will match
457bf215546Sopenharmony_ci * the corresponding lp_build_pack2_native call.
458bf215546Sopenharmony_ci */
459bf215546Sopenharmony_civoid
460bf215546Sopenharmony_cilp_build_unpack2_native(struct gallivm_state *gallivm,
461bf215546Sopenharmony_ci                        struct lp_type src_type,
462bf215546Sopenharmony_ci                        struct lp_type dst_type,
463bf215546Sopenharmony_ci                        LLVMValueRef src,
464bf215546Sopenharmony_ci                        LLVMValueRef *dst_lo,
465bf215546Sopenharmony_ci                        LLVMValueRef *dst_hi)
466bf215546Sopenharmony_ci{
467bf215546Sopenharmony_ci   LLVMBuilderRef builder = gallivm->builder;
468bf215546Sopenharmony_ci   LLVMValueRef msb;
469bf215546Sopenharmony_ci   LLVMTypeRef dst_vec_type;
470bf215546Sopenharmony_ci
471bf215546Sopenharmony_ci   assert(!src_type.floating);
472bf215546Sopenharmony_ci   assert(!dst_type.floating);
473bf215546Sopenharmony_ci   assert(dst_type.width == src_type.width * 2);
474bf215546Sopenharmony_ci   assert(dst_type.length * 2 == src_type.length);
475bf215546Sopenharmony_ci
476bf215546Sopenharmony_ci   if(dst_type.sign && src_type.sign) {
477bf215546Sopenharmony_ci      /* Replicate the sign bit in the most significant bits */
478bf215546Sopenharmony_ci      msb = LLVMBuildAShr(builder, src,
479bf215546Sopenharmony_ci               lp_build_const_int_vec(gallivm, src_type, src_type.width - 1), "");
480bf215546Sopenharmony_ci   }
481bf215546Sopenharmony_ci   else
482bf215546Sopenharmony_ci      /* Most significant bits always zero */
483bf215546Sopenharmony_ci      msb = lp_build_zero(gallivm, src_type);
484bf215546Sopenharmony_ci
485bf215546Sopenharmony_ci   /* Interleave bits */
486bf215546Sopenharmony_ci#if UTIL_ARCH_LITTLE_ENDIAN
487bf215546Sopenharmony_ci   if (src_type.length * src_type.width == 256 && util_get_cpu_caps()->has_avx2) {
488bf215546Sopenharmony_ci      *dst_lo = lp_build_interleave2_half(gallivm, src_type, src, msb, 0);
489bf215546Sopenharmony_ci      *dst_hi = lp_build_interleave2_half(gallivm, src_type, src, msb, 1);
490bf215546Sopenharmony_ci   } else {
491bf215546Sopenharmony_ci      *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0);
492bf215546Sopenharmony_ci      *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1);
493bf215546Sopenharmony_ci   }
494bf215546Sopenharmony_ci#else
495bf215546Sopenharmony_ci   *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0);
496bf215546Sopenharmony_ci   *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1);
497bf215546Sopenharmony_ci#endif
498bf215546Sopenharmony_ci
499bf215546Sopenharmony_ci   /* Cast the result into the new type (twice as wide) */
500bf215546Sopenharmony_ci
501bf215546Sopenharmony_ci   dst_vec_type = lp_build_vec_type(gallivm, dst_type);
502bf215546Sopenharmony_ci
503bf215546Sopenharmony_ci   *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, "");
504bf215546Sopenharmony_ci   *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, "");
505bf215546Sopenharmony_ci}
506bf215546Sopenharmony_ci
507bf215546Sopenharmony_ci
508bf215546Sopenharmony_ci/**
509bf215546Sopenharmony_ci * Expand the bit width.
510bf215546Sopenharmony_ci *
511bf215546Sopenharmony_ci * This will only change the number of bits the values are represented, not the
512bf215546Sopenharmony_ci * values themselves.
513bf215546Sopenharmony_ci */
514bf215546Sopenharmony_civoid
515bf215546Sopenharmony_cilp_build_unpack(struct gallivm_state *gallivm,
516bf215546Sopenharmony_ci                struct lp_type src_type,
517bf215546Sopenharmony_ci                struct lp_type dst_type,
518bf215546Sopenharmony_ci                LLVMValueRef src,
519bf215546Sopenharmony_ci                LLVMValueRef *dst, unsigned num_dsts)
520bf215546Sopenharmony_ci{
521bf215546Sopenharmony_ci   unsigned num_tmps;
522bf215546Sopenharmony_ci   unsigned i;
523bf215546Sopenharmony_ci
524bf215546Sopenharmony_ci   /* Register width must remain constant */
525bf215546Sopenharmony_ci   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
526bf215546Sopenharmony_ci
527bf215546Sopenharmony_ci   /* We must not loose or gain channels. Only precision */
528bf215546Sopenharmony_ci   assert(src_type.length == dst_type.length * num_dsts);
529bf215546Sopenharmony_ci
530bf215546Sopenharmony_ci   num_tmps = 1;
531bf215546Sopenharmony_ci   dst[0] = src;
532bf215546Sopenharmony_ci
533bf215546Sopenharmony_ci   while(src_type.width < dst_type.width) {
534bf215546Sopenharmony_ci      struct lp_type tmp_type = src_type;
535bf215546Sopenharmony_ci
536bf215546Sopenharmony_ci      tmp_type.width *= 2;
537bf215546Sopenharmony_ci      tmp_type.length /= 2;
538bf215546Sopenharmony_ci
539bf215546Sopenharmony_ci      for(i = num_tmps; i--; ) {
540bf215546Sopenharmony_ci         lp_build_unpack2(gallivm, src_type, tmp_type, dst[i], &dst[2*i + 0],
541bf215546Sopenharmony_ci                          &dst[2*i + 1]);
542bf215546Sopenharmony_ci      }
543bf215546Sopenharmony_ci
544bf215546Sopenharmony_ci      src_type = tmp_type;
545bf215546Sopenharmony_ci
546bf215546Sopenharmony_ci      num_tmps *= 2;
547bf215546Sopenharmony_ci   }
548bf215546Sopenharmony_ci
549bf215546Sopenharmony_ci   assert(num_tmps == num_dsts);
550bf215546Sopenharmony_ci}
551bf215546Sopenharmony_ci
552bf215546Sopenharmony_ci
553bf215546Sopenharmony_ci/**
554bf215546Sopenharmony_ci * Non-interleaved pack.
555bf215546Sopenharmony_ci *
556bf215546Sopenharmony_ci * This will move values as
557bf215546Sopenharmony_ci *         (LSB)                     (MSB)
558bf215546Sopenharmony_ci *   lo =   l0 __ l1 __ l2 __..  __ ln __
559bf215546Sopenharmony_ci *   hi =   h0 __ h1 __ h2 __..  __ hn __
560bf215546Sopenharmony_ci *   res =  l0 l1 l2 .. ln h0 h1 h2 .. hn
561bf215546Sopenharmony_ci *
562bf215546Sopenharmony_ci * This will only change the number of bits the values are represented, not the
563bf215546Sopenharmony_ci * values themselves.
564bf215546Sopenharmony_ci *
565bf215546Sopenharmony_ci * It is assumed the values are already clamped into the destination type range.
566bf215546Sopenharmony_ci * Values outside that range will produce undefined results. Use
567bf215546Sopenharmony_ci * lp_build_packs2 instead.
568bf215546Sopenharmony_ci */
569bf215546Sopenharmony_ciLLVMValueRef
570bf215546Sopenharmony_cilp_build_pack2(struct gallivm_state *gallivm,
571bf215546Sopenharmony_ci               struct lp_type src_type,
572bf215546Sopenharmony_ci               struct lp_type dst_type,
573bf215546Sopenharmony_ci               LLVMValueRef lo,
574bf215546Sopenharmony_ci               LLVMValueRef hi)
575bf215546Sopenharmony_ci{
576bf215546Sopenharmony_ci   LLVMBuilderRef builder = gallivm->builder;
577bf215546Sopenharmony_ci   LLVMTypeRef dst_vec_type = lp_build_vec_type(gallivm, dst_type);
578bf215546Sopenharmony_ci   LLVMValueRef shuffle;
579bf215546Sopenharmony_ci   LLVMValueRef res = NULL;
580bf215546Sopenharmony_ci   struct lp_type intr_type = dst_type;
581bf215546Sopenharmony_ci
582bf215546Sopenharmony_ci   assert(!src_type.floating);
583bf215546Sopenharmony_ci   assert(!dst_type.floating);
584bf215546Sopenharmony_ci   assert(src_type.width == dst_type.width * 2);
585bf215546Sopenharmony_ci   assert(src_type.length * 2 == dst_type.length);
586bf215546Sopenharmony_ci
587bf215546Sopenharmony_ci   /* Check for special cases first */
588bf215546Sopenharmony_ci   if ((util_get_cpu_caps()->has_sse2 || util_get_cpu_caps()->has_altivec) &&
589bf215546Sopenharmony_ci        src_type.width * src_type.length >= 128) {
590bf215546Sopenharmony_ci      const char *intrinsic = NULL;
591bf215546Sopenharmony_ci      boolean swap_intrinsic_operands = FALSE;
592bf215546Sopenharmony_ci
593bf215546Sopenharmony_ci      switch(src_type.width) {
594bf215546Sopenharmony_ci      case 32:
595bf215546Sopenharmony_ci         if (util_get_cpu_caps()->has_sse2) {
596bf215546Sopenharmony_ci           if (dst_type.sign) {
597bf215546Sopenharmony_ci              intrinsic = "llvm.x86.sse2.packssdw.128";
598bf215546Sopenharmony_ci           } else {
599bf215546Sopenharmony_ci              if (util_get_cpu_caps()->has_sse4_1) {
600bf215546Sopenharmony_ci                 intrinsic = "llvm.x86.sse41.packusdw";
601bf215546Sopenharmony_ci              }
602bf215546Sopenharmony_ci           }
603bf215546Sopenharmony_ci         } else if (util_get_cpu_caps()->has_altivec) {
604bf215546Sopenharmony_ci            if (dst_type.sign) {
605bf215546Sopenharmony_ci               intrinsic = "llvm.ppc.altivec.vpkswss";
606bf215546Sopenharmony_ci            } else {
607bf215546Sopenharmony_ci               intrinsic = "llvm.ppc.altivec.vpkuwus";
608bf215546Sopenharmony_ci            }
609bf215546Sopenharmony_ci#if UTIL_ARCH_LITTLE_ENDIAN
610bf215546Sopenharmony_ci            swap_intrinsic_operands = TRUE;
611bf215546Sopenharmony_ci#endif
612bf215546Sopenharmony_ci         }
613bf215546Sopenharmony_ci         break;
614bf215546Sopenharmony_ci      case 16:
615bf215546Sopenharmony_ci         if (dst_type.sign) {
616bf215546Sopenharmony_ci            if (util_get_cpu_caps()->has_sse2) {
617bf215546Sopenharmony_ci               intrinsic = "llvm.x86.sse2.packsswb.128";
618bf215546Sopenharmony_ci            } else if (util_get_cpu_caps()->has_altivec) {
619bf215546Sopenharmony_ci               intrinsic = "llvm.ppc.altivec.vpkshss";
620bf215546Sopenharmony_ci#if UTIL_ARCH_LITTLE_ENDIAN
621bf215546Sopenharmony_ci               swap_intrinsic_operands = TRUE;
622bf215546Sopenharmony_ci#endif
623bf215546Sopenharmony_ci            }
624bf215546Sopenharmony_ci         } else {
625bf215546Sopenharmony_ci            if (util_get_cpu_caps()->has_sse2) {
626bf215546Sopenharmony_ci               intrinsic = "llvm.x86.sse2.packuswb.128";
627bf215546Sopenharmony_ci            } else if (util_get_cpu_caps()->has_altivec) {
628bf215546Sopenharmony_ci               intrinsic = "llvm.ppc.altivec.vpkshus";
629bf215546Sopenharmony_ci#if UTIL_ARCH_LITTLE_ENDIAN
630bf215546Sopenharmony_ci               swap_intrinsic_operands = TRUE;
631bf215546Sopenharmony_ci#endif
632bf215546Sopenharmony_ci            }
633bf215546Sopenharmony_ci         }
634bf215546Sopenharmony_ci         break;
635bf215546Sopenharmony_ci      /* default uses generic shuffle below */
636bf215546Sopenharmony_ci      }
637bf215546Sopenharmony_ci      if (intrinsic) {
638bf215546Sopenharmony_ci         if (src_type.width * src_type.length == 128) {
639bf215546Sopenharmony_ci            LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
640bf215546Sopenharmony_ci            if (swap_intrinsic_operands) {
641bf215546Sopenharmony_ci               res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, hi, lo);
642bf215546Sopenharmony_ci            } else {
643bf215546Sopenharmony_ci               res = lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type, lo, hi);
644bf215546Sopenharmony_ci            }
645bf215546Sopenharmony_ci            if (dst_vec_type != intr_vec_type) {
646bf215546Sopenharmony_ci               res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
647bf215546Sopenharmony_ci            }
648bf215546Sopenharmony_ci         }
649bf215546Sopenharmony_ci         else {
650bf215546Sopenharmony_ci            int num_split = src_type.width * src_type.length / 128;
651bf215546Sopenharmony_ci            int i;
652bf215546Sopenharmony_ci            int nlen = 128 / src_type.width;
653bf215546Sopenharmony_ci            int lo_off = swap_intrinsic_operands ? nlen : 0;
654bf215546Sopenharmony_ci            int hi_off = swap_intrinsic_operands ? 0 : nlen;
655bf215546Sopenharmony_ci            struct lp_type ndst_type = lp_type_unorm(dst_type.width, 128);
656bf215546Sopenharmony_ci            struct lp_type nintr_type = lp_type_unorm(intr_type.width, 128);
657bf215546Sopenharmony_ci            LLVMValueRef tmpres[LP_MAX_VECTOR_WIDTH / 128];
658bf215546Sopenharmony_ci            LLVMValueRef tmplo, tmphi;
659bf215546Sopenharmony_ci            LLVMTypeRef ndst_vec_type = lp_build_vec_type(gallivm, ndst_type);
660bf215546Sopenharmony_ci            LLVMTypeRef nintr_vec_type = lp_build_vec_type(gallivm, nintr_type);
661bf215546Sopenharmony_ci
662bf215546Sopenharmony_ci            assert(num_split <= LP_MAX_VECTOR_WIDTH / 128);
663bf215546Sopenharmony_ci
664bf215546Sopenharmony_ci            for (i = 0; i < num_split / 2; i++) {
665bf215546Sopenharmony_ci               tmplo = lp_build_extract_range(gallivm,
666bf215546Sopenharmony_ci                                              lo, i*nlen*2 + lo_off, nlen);
667bf215546Sopenharmony_ci               tmphi = lp_build_extract_range(gallivm,
668bf215546Sopenharmony_ci                                              lo, i*nlen*2 + hi_off, nlen);
669bf215546Sopenharmony_ci               tmpres[i] = lp_build_intrinsic_binary(builder, intrinsic,
670bf215546Sopenharmony_ci                                                     nintr_vec_type, tmplo, tmphi);
671bf215546Sopenharmony_ci               if (ndst_vec_type != nintr_vec_type) {
672bf215546Sopenharmony_ci                  tmpres[i] = LLVMBuildBitCast(builder, tmpres[i], ndst_vec_type, "");
673bf215546Sopenharmony_ci               }
674bf215546Sopenharmony_ci            }
675bf215546Sopenharmony_ci            for (i = 0; i < num_split / 2; i++) {
676bf215546Sopenharmony_ci               tmplo = lp_build_extract_range(gallivm,
677bf215546Sopenharmony_ci                                              hi, i*nlen*2 + lo_off, nlen);
678bf215546Sopenharmony_ci               tmphi = lp_build_extract_range(gallivm,
679bf215546Sopenharmony_ci                                              hi, i*nlen*2 + hi_off, nlen);
680bf215546Sopenharmony_ci               tmpres[i+num_split/2] = lp_build_intrinsic_binary(builder, intrinsic,
681bf215546Sopenharmony_ci                                                                 nintr_vec_type,
682bf215546Sopenharmony_ci                                                                 tmplo, tmphi);
683bf215546Sopenharmony_ci               if (ndst_vec_type != nintr_vec_type) {
684bf215546Sopenharmony_ci                  tmpres[i+num_split/2] = LLVMBuildBitCast(builder, tmpres[i+num_split/2],
685bf215546Sopenharmony_ci                                                           ndst_vec_type, "");
686bf215546Sopenharmony_ci               }
687bf215546Sopenharmony_ci            }
688bf215546Sopenharmony_ci            res = lp_build_concat(gallivm, tmpres, ndst_type, num_split);
689bf215546Sopenharmony_ci         }
690bf215546Sopenharmony_ci         return res;
691bf215546Sopenharmony_ci      }
692bf215546Sopenharmony_ci   }
693bf215546Sopenharmony_ci
694bf215546Sopenharmony_ci   /* generic shuffle */
695bf215546Sopenharmony_ci   lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
696bf215546Sopenharmony_ci   hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");
697bf215546Sopenharmony_ci
698bf215546Sopenharmony_ci   shuffle = lp_build_const_pack_shuffle(gallivm, dst_type.length);
699bf215546Sopenharmony_ci
700bf215546Sopenharmony_ci   res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");
701bf215546Sopenharmony_ci
702bf215546Sopenharmony_ci   return res;
703bf215546Sopenharmony_ci}
704bf215546Sopenharmony_ci
705bf215546Sopenharmony_ci
706bf215546Sopenharmony_ci/**
707bf215546Sopenharmony_ci * Non-interleaved native pack.
708bf215546Sopenharmony_ci *
709bf215546Sopenharmony_ci * Similar to lp_build_pack2, but the ordering of values is not
710bf215546Sopenharmony_ci * guaranteed, other than it will match lp_build_unpack2_native.
711bf215546Sopenharmony_ci *
712bf215546Sopenharmony_ci * In particular, with avx2, the lower and upper 128bits of the vectors will
713bf215546Sopenharmony_ci * be packed independently, so that (with 32bit->16bit values)
714bf215546Sopenharmony_ci *         (LSB)                                       (MSB)
715bf215546Sopenharmony_ci *   lo =   l0 __ l1 __ l2 __ l3 __ l4 __ l5 __ l6 __ l7 __
716bf215546Sopenharmony_ci *   hi =   h0 __ h1 __ h2 __ h3 __ h4 __ h5 __ h6 __ h7 __
717bf215546Sopenharmony_ci *   res =  l0 l1 l2 l3 h0 h1 h2 h3 l4 l5 l6 l7 h4 h5 h6 h7
718bf215546Sopenharmony_ci *
719bf215546Sopenharmony_ci * This will only change the number of bits the values are represented, not the
720bf215546Sopenharmony_ci * values themselves.
721bf215546Sopenharmony_ci *
722bf215546Sopenharmony_ci * It is assumed the values are already clamped into the destination type range.
723bf215546Sopenharmony_ci * Values outside that range will produce undefined results.
724bf215546Sopenharmony_ci */
725bf215546Sopenharmony_ciLLVMValueRef
726bf215546Sopenharmony_cilp_build_pack2_native(struct gallivm_state *gallivm,
727bf215546Sopenharmony_ci                      struct lp_type src_type,
728bf215546Sopenharmony_ci                      struct lp_type dst_type,
729bf215546Sopenharmony_ci                      LLVMValueRef lo,
730bf215546Sopenharmony_ci                      LLVMValueRef hi)
731bf215546Sopenharmony_ci{
732bf215546Sopenharmony_ci   LLVMBuilderRef builder = gallivm->builder;
733bf215546Sopenharmony_ci   struct lp_type intr_type = dst_type;
734bf215546Sopenharmony_ci   const char *intrinsic = NULL;
735bf215546Sopenharmony_ci
736bf215546Sopenharmony_ci   assert(!src_type.floating);
737bf215546Sopenharmony_ci   assert(!dst_type.floating);
738bf215546Sopenharmony_ci   assert(src_type.width == dst_type.width * 2);
739bf215546Sopenharmony_ci   assert(src_type.length * 2 == dst_type.length);
740bf215546Sopenharmony_ci
741bf215546Sopenharmony_ci   /* At this point only have special case for avx2 */
742bf215546Sopenharmony_ci   if (src_type.length * src_type.width == 256 &&
743bf215546Sopenharmony_ci       util_get_cpu_caps()->has_avx2) {
744bf215546Sopenharmony_ci      switch(src_type.width) {
745bf215546Sopenharmony_ci      case 32:
746bf215546Sopenharmony_ci         if (dst_type.sign) {
747bf215546Sopenharmony_ci            intrinsic = "llvm.x86.avx2.packssdw";
748bf215546Sopenharmony_ci         } else {
749bf215546Sopenharmony_ci            intrinsic = "llvm.x86.avx2.packusdw";
750bf215546Sopenharmony_ci         }
751bf215546Sopenharmony_ci         break;
752bf215546Sopenharmony_ci      case 16:
753bf215546Sopenharmony_ci         if (dst_type.sign) {
754bf215546Sopenharmony_ci            intrinsic = "llvm.x86.avx2.packsswb";
755bf215546Sopenharmony_ci         } else {
756bf215546Sopenharmony_ci            intrinsic = "llvm.x86.avx2.packuswb";
757bf215546Sopenharmony_ci         }
758bf215546Sopenharmony_ci         break;
759bf215546Sopenharmony_ci      }
760bf215546Sopenharmony_ci   }
761bf215546Sopenharmony_ci   if (intrinsic) {
762bf215546Sopenharmony_ci      LLVMTypeRef intr_vec_type = lp_build_vec_type(gallivm, intr_type);
763bf215546Sopenharmony_ci      return lp_build_intrinsic_binary(builder, intrinsic, intr_vec_type,
764bf215546Sopenharmony_ci                                       lo, hi);
765bf215546Sopenharmony_ci   }
766bf215546Sopenharmony_ci   else {
767bf215546Sopenharmony_ci      return lp_build_pack2(gallivm, src_type, dst_type, lo, hi);
768bf215546Sopenharmony_ci   }
769bf215546Sopenharmony_ci}
770bf215546Sopenharmony_ci
771bf215546Sopenharmony_ci/**
772bf215546Sopenharmony_ci * Non-interleaved pack and saturate.
773bf215546Sopenharmony_ci *
774bf215546Sopenharmony_ci * Same as lp_build_pack2 but will saturate values so that they fit into the
775bf215546Sopenharmony_ci * destination type.
776bf215546Sopenharmony_ci */
777bf215546Sopenharmony_ciLLVMValueRef
778bf215546Sopenharmony_cilp_build_packs2(struct gallivm_state *gallivm,
779bf215546Sopenharmony_ci                struct lp_type src_type,
780bf215546Sopenharmony_ci                struct lp_type dst_type,
781bf215546Sopenharmony_ci                LLVMValueRef lo,
782bf215546Sopenharmony_ci                LLVMValueRef hi)
783bf215546Sopenharmony_ci{
784bf215546Sopenharmony_ci   boolean clamp;
785bf215546Sopenharmony_ci
786bf215546Sopenharmony_ci   assert(!src_type.floating);
787bf215546Sopenharmony_ci   assert(!dst_type.floating);
788bf215546Sopenharmony_ci   assert(src_type.sign == dst_type.sign);
789bf215546Sopenharmony_ci   assert(src_type.width == dst_type.width * 2);
790bf215546Sopenharmony_ci   assert(src_type.length * 2 == dst_type.length);
791bf215546Sopenharmony_ci
792bf215546Sopenharmony_ci   clamp = TRUE;
793bf215546Sopenharmony_ci
794bf215546Sopenharmony_ci   /* All X86 SSE non-interleaved pack instructions take signed inputs and
795bf215546Sopenharmony_ci    * saturate them, so no need to clamp for those cases. */
796bf215546Sopenharmony_ci   if(util_get_cpu_caps()->has_sse2 &&
797bf215546Sopenharmony_ci      src_type.width * src_type.length >= 128 &&
798bf215546Sopenharmony_ci      src_type.sign &&
799bf215546Sopenharmony_ci      (src_type.width == 32 || src_type.width == 16))
800bf215546Sopenharmony_ci      clamp = FALSE;
801bf215546Sopenharmony_ci
802bf215546Sopenharmony_ci   if(clamp) {
803bf215546Sopenharmony_ci      struct lp_build_context bld;
804bf215546Sopenharmony_ci      unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
805bf215546Sopenharmony_ci      LLVMValueRef dst_max = lp_build_const_int_vec(gallivm, src_type,
806bf215546Sopenharmony_ci                                ((unsigned long long)1 << dst_bits) - 1);
807bf215546Sopenharmony_ci      lp_build_context_init(&bld, gallivm, src_type);
808bf215546Sopenharmony_ci      lo = lp_build_min(&bld, lo, dst_max);
809bf215546Sopenharmony_ci      hi = lp_build_min(&bld, hi, dst_max);
810bf215546Sopenharmony_ci      /* FIXME: What about lower bound? */
811bf215546Sopenharmony_ci   }
812bf215546Sopenharmony_ci
813bf215546Sopenharmony_ci   return lp_build_pack2(gallivm, src_type, dst_type, lo, hi);
814bf215546Sopenharmony_ci}
815bf215546Sopenharmony_ci
816bf215546Sopenharmony_ci
817bf215546Sopenharmony_ci/**
818bf215546Sopenharmony_ci * Truncate the bit width.
819bf215546Sopenharmony_ci *
820bf215546Sopenharmony_ci * TODO: Handle saturation consistently.
821bf215546Sopenharmony_ci */
822bf215546Sopenharmony_ciLLVMValueRef
823bf215546Sopenharmony_cilp_build_pack(struct gallivm_state *gallivm,
824bf215546Sopenharmony_ci              struct lp_type src_type,
825bf215546Sopenharmony_ci              struct lp_type dst_type,
826bf215546Sopenharmony_ci              boolean clamped,
827bf215546Sopenharmony_ci              const LLVMValueRef *src, unsigned num_srcs)
828bf215546Sopenharmony_ci{
829bf215546Sopenharmony_ci   LLVMValueRef (*pack2)(struct gallivm_state *gallivm,
830bf215546Sopenharmony_ci                         struct lp_type src_type,
831bf215546Sopenharmony_ci                         struct lp_type dst_type,
832bf215546Sopenharmony_ci                         LLVMValueRef lo,
833bf215546Sopenharmony_ci                         LLVMValueRef hi);
834bf215546Sopenharmony_ci   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
835bf215546Sopenharmony_ci   unsigned i;
836bf215546Sopenharmony_ci
837bf215546Sopenharmony_ci   /* Register width must remain constant */
838bf215546Sopenharmony_ci   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
839bf215546Sopenharmony_ci
840bf215546Sopenharmony_ci   /* We must not loose or gain channels. Only precision */
841bf215546Sopenharmony_ci   assert(src_type.length * num_srcs == dst_type.length);
842bf215546Sopenharmony_ci
843bf215546Sopenharmony_ci   if(clamped)
844bf215546Sopenharmony_ci      pack2 = &lp_build_pack2;
845bf215546Sopenharmony_ci   else
846bf215546Sopenharmony_ci      pack2 = &lp_build_packs2;
847bf215546Sopenharmony_ci
848bf215546Sopenharmony_ci   for(i = 0; i < num_srcs; ++i)
849bf215546Sopenharmony_ci      tmp[i] = src[i];
850bf215546Sopenharmony_ci
851bf215546Sopenharmony_ci   while(src_type.width > dst_type.width) {
852bf215546Sopenharmony_ci      struct lp_type tmp_type = src_type;
853bf215546Sopenharmony_ci
854bf215546Sopenharmony_ci      tmp_type.width /= 2;
855bf215546Sopenharmony_ci      tmp_type.length *= 2;
856bf215546Sopenharmony_ci
857bf215546Sopenharmony_ci      /* Take in consideration the sign changes only in the last step */
858bf215546Sopenharmony_ci      if(tmp_type.width == dst_type.width)
859bf215546Sopenharmony_ci         tmp_type.sign = dst_type.sign;
860bf215546Sopenharmony_ci
861bf215546Sopenharmony_ci      num_srcs /= 2;
862bf215546Sopenharmony_ci
863bf215546Sopenharmony_ci      for(i = 0; i < num_srcs; ++i)
864bf215546Sopenharmony_ci         tmp[i] = pack2(gallivm, src_type, tmp_type,
865bf215546Sopenharmony_ci                        tmp[2*i + 0], tmp[2*i + 1]);
866bf215546Sopenharmony_ci
867bf215546Sopenharmony_ci      src_type = tmp_type;
868bf215546Sopenharmony_ci   }
869bf215546Sopenharmony_ci
870bf215546Sopenharmony_ci   assert(num_srcs == 1);
871bf215546Sopenharmony_ci
872bf215546Sopenharmony_ci   return tmp[0];
873bf215546Sopenharmony_ci}
874bf215546Sopenharmony_ci
875bf215546Sopenharmony_ci
876bf215546Sopenharmony_ci/**
877bf215546Sopenharmony_ci * Truncate or expand the bitwidth.
878bf215546Sopenharmony_ci *
879bf215546Sopenharmony_ci * NOTE: Getting the right sign flags is crucial here, as we employ some
880bf215546Sopenharmony_ci * intrinsics that do saturation.
881bf215546Sopenharmony_ci */
882bf215546Sopenharmony_civoid
883bf215546Sopenharmony_cilp_build_resize(struct gallivm_state *gallivm,
884bf215546Sopenharmony_ci                struct lp_type src_type,
885bf215546Sopenharmony_ci                struct lp_type dst_type,
886bf215546Sopenharmony_ci                const LLVMValueRef *src, unsigned num_srcs,
887bf215546Sopenharmony_ci                LLVMValueRef *dst, unsigned num_dsts)
888bf215546Sopenharmony_ci{
889bf215546Sopenharmony_ci   LLVMBuilderRef builder = gallivm->builder;
890bf215546Sopenharmony_ci   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
891bf215546Sopenharmony_ci   unsigned i;
892bf215546Sopenharmony_ci
893bf215546Sopenharmony_ci   /*
894bf215546Sopenharmony_ci    * We don't support float <-> int conversion here. That must be done
895bf215546Sopenharmony_ci    * before/after calling this function.
896bf215546Sopenharmony_ci    */
897bf215546Sopenharmony_ci   assert(src_type.floating == dst_type.floating);
898bf215546Sopenharmony_ci
899bf215546Sopenharmony_ci   /*
900bf215546Sopenharmony_ci    * We don't support double <-> float conversion yet, although it could be
901bf215546Sopenharmony_ci    * added with little effort.
902bf215546Sopenharmony_ci    */
903bf215546Sopenharmony_ci   assert((!src_type.floating && !dst_type.floating) ||
904bf215546Sopenharmony_ci          src_type.width == dst_type.width);
905bf215546Sopenharmony_ci
906bf215546Sopenharmony_ci   /* We must not loose or gain channels. Only precision */
907bf215546Sopenharmony_ci   assert(src_type.length * num_srcs == dst_type.length * num_dsts);
908bf215546Sopenharmony_ci
909bf215546Sopenharmony_ci   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
910bf215546Sopenharmony_ci   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
911bf215546Sopenharmony_ci   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
912bf215546Sopenharmony_ci   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);
913bf215546Sopenharmony_ci
914bf215546Sopenharmony_ci   if (src_type.width > dst_type.width) {
915bf215546Sopenharmony_ci      /*
916bf215546Sopenharmony_ci       * Truncate bit width.
917bf215546Sopenharmony_ci       */
918bf215546Sopenharmony_ci
919bf215546Sopenharmony_ci      /* Conversion must be M:1 */
920bf215546Sopenharmony_ci      assert(num_dsts == 1);
921bf215546Sopenharmony_ci
922bf215546Sopenharmony_ci      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
923bf215546Sopenharmony_ci        /*
924bf215546Sopenharmony_ci         * Register width remains constant -- use vector packing intrinsics
925bf215546Sopenharmony_ci         */
926bf215546Sopenharmony_ci         tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, src, num_srcs);
927bf215546Sopenharmony_ci      }
928bf215546Sopenharmony_ci      else {
929bf215546Sopenharmony_ci         if (src_type.width / dst_type.width > num_srcs) {
930bf215546Sopenharmony_ci            /*
931bf215546Sopenharmony_ci            * First change src vectors size (with shuffle) so they have the
932bf215546Sopenharmony_ci            * same size as the destination vector, then pack normally.
933bf215546Sopenharmony_ci            * Note: cannot use cast/extract because llvm generates atrocious code.
934bf215546Sopenharmony_ci            */
935bf215546Sopenharmony_ci            unsigned size_ratio = (src_type.width * src_type.length) /
936bf215546Sopenharmony_ci                                  (dst_type.length * dst_type.width);
937bf215546Sopenharmony_ci            unsigned new_length = src_type.length / size_ratio;
938bf215546Sopenharmony_ci
939bf215546Sopenharmony_ci            for (i = 0; i < size_ratio * num_srcs; i++) {
940bf215546Sopenharmony_ci               unsigned start_index = (i % size_ratio) * new_length;
941bf215546Sopenharmony_ci               tmp[i] = lp_build_extract_range(gallivm, src[i / size_ratio],
942bf215546Sopenharmony_ci                                               start_index, new_length);
943bf215546Sopenharmony_ci            }
944bf215546Sopenharmony_ci            num_srcs *= size_ratio;
945bf215546Sopenharmony_ci            src_type.length = new_length;
946bf215546Sopenharmony_ci            tmp[0] = lp_build_pack(gallivm, src_type, dst_type, TRUE, tmp, num_srcs);
947bf215546Sopenharmony_ci         }
948bf215546Sopenharmony_ci         else {
949bf215546Sopenharmony_ci            /*
950bf215546Sopenharmony_ci             * Truncate bit width but expand vector size - first pack
951bf215546Sopenharmony_ci             * then expand simply because this should be more AVX-friendly
952bf215546Sopenharmony_ci             * for the cases we probably hit.
953bf215546Sopenharmony_ci             */
954bf215546Sopenharmony_ci            unsigned size_ratio = (dst_type.width * dst_type.length) /
955bf215546Sopenharmony_ci                                  (src_type.length * src_type.width);
956bf215546Sopenharmony_ci            unsigned num_pack_srcs = num_srcs / size_ratio;
957bf215546Sopenharmony_ci            dst_type.length = dst_type.length / size_ratio;
958bf215546Sopenharmony_ci
959bf215546Sopenharmony_ci            for (i = 0; i < size_ratio; i++) {
960bf215546Sopenharmony_ci               tmp[i] = lp_build_pack(gallivm, src_type, dst_type, TRUE,
961bf215546Sopenharmony_ci                                      &src[i*num_pack_srcs], num_pack_srcs);
962bf215546Sopenharmony_ci            }
963bf215546Sopenharmony_ci            tmp[0] = lp_build_concat(gallivm, tmp, dst_type, size_ratio);
964bf215546Sopenharmony_ci         }
965bf215546Sopenharmony_ci      }
966bf215546Sopenharmony_ci   }
967bf215546Sopenharmony_ci   else if (src_type.width < dst_type.width) {
968bf215546Sopenharmony_ci      /*
969bf215546Sopenharmony_ci       * Expand bit width.
970bf215546Sopenharmony_ci       */
971bf215546Sopenharmony_ci
972bf215546Sopenharmony_ci      /* Conversion must be 1:N */
973bf215546Sopenharmony_ci      assert(num_srcs == 1);
974bf215546Sopenharmony_ci
975bf215546Sopenharmony_ci      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
976bf215546Sopenharmony_ci         /*
977bf215546Sopenharmony_ci          * Register width remains constant -- use vector unpack intrinsics
978bf215546Sopenharmony_ci          */
979bf215546Sopenharmony_ci         lp_build_unpack(gallivm, src_type, dst_type, src[0], tmp, num_dsts);
980bf215546Sopenharmony_ci      }
981bf215546Sopenharmony_ci      else {
982bf215546Sopenharmony_ci         /*
983bf215546Sopenharmony_ci          * Do it element-wise.
984bf215546Sopenharmony_ci          */
985bf215546Sopenharmony_ci         assert(src_type.length * num_srcs == dst_type.length * num_dsts);
986bf215546Sopenharmony_ci
987bf215546Sopenharmony_ci         for (i = 0; i < num_dsts; i++) {
988bf215546Sopenharmony_ci            tmp[i] = lp_build_undef(gallivm, dst_type);
989bf215546Sopenharmony_ci         }
990bf215546Sopenharmony_ci
991bf215546Sopenharmony_ci         for (i = 0; i < src_type.length; ++i) {
992bf215546Sopenharmony_ci            unsigned j = i / dst_type.length;
993bf215546Sopenharmony_ci            LLVMValueRef srcindex = lp_build_const_int32(gallivm, i);
994bf215546Sopenharmony_ci            LLVMValueRef dstindex = lp_build_const_int32(gallivm, i % dst_type.length);
995bf215546Sopenharmony_ci            LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], srcindex, "");
996bf215546Sopenharmony_ci
997bf215546Sopenharmony_ci            if (src_type.sign && dst_type.sign) {
998bf215546Sopenharmony_ci               val = LLVMBuildSExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
999bf215546Sopenharmony_ci            } else {
1000bf215546Sopenharmony_ci               val = LLVMBuildZExt(builder, val, lp_build_elem_type(gallivm, dst_type), "");
1001bf215546Sopenharmony_ci            }
1002bf215546Sopenharmony_ci            tmp[j] = LLVMBuildInsertElement(builder, tmp[j], val, dstindex, "");
1003bf215546Sopenharmony_ci         }
1004bf215546Sopenharmony_ci      }
1005bf215546Sopenharmony_ci   }
1006bf215546Sopenharmony_ci   else {
1007bf215546Sopenharmony_ci      /*
1008bf215546Sopenharmony_ci       * No-op
1009bf215546Sopenharmony_ci       */
1010bf215546Sopenharmony_ci
1011bf215546Sopenharmony_ci      /* "Conversion" must be N:N */
1012bf215546Sopenharmony_ci      assert(num_srcs == num_dsts);
1013bf215546Sopenharmony_ci
1014bf215546Sopenharmony_ci      for(i = 0; i < num_dsts; ++i)
1015bf215546Sopenharmony_ci         tmp[i] = src[i];
1016bf215546Sopenharmony_ci   }
1017bf215546Sopenharmony_ci
1018bf215546Sopenharmony_ci   for(i = 0; i < num_dsts; ++i)
1019bf215546Sopenharmony_ci      dst[i] = tmp[i];
1020bf215546Sopenharmony_ci}
1021bf215546Sopenharmony_ci
1022bf215546Sopenharmony_ci
1023bf215546Sopenharmony_ci/**
1024bf215546Sopenharmony_ci * Expands src vector from src.length to dst_length
1025bf215546Sopenharmony_ci */
1026bf215546Sopenharmony_ciLLVMValueRef
1027bf215546Sopenharmony_cilp_build_pad_vector(struct gallivm_state *gallivm,
1028bf215546Sopenharmony_ci                    LLVMValueRef src,
1029bf215546Sopenharmony_ci                    unsigned dst_length)
1030bf215546Sopenharmony_ci{
1031bf215546Sopenharmony_ci   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
1032bf215546Sopenharmony_ci   LLVMValueRef undef;
1033bf215546Sopenharmony_ci   LLVMTypeRef type;
1034bf215546Sopenharmony_ci   unsigned i, src_length;
1035bf215546Sopenharmony_ci
1036bf215546Sopenharmony_ci   type = LLVMTypeOf(src);
1037bf215546Sopenharmony_ci
1038bf215546Sopenharmony_ci   if (LLVMGetTypeKind(type) != LLVMVectorTypeKind) {
1039bf215546Sopenharmony_ci      /* Can't use ShuffleVector on non-vector type */
1040bf215546Sopenharmony_ci      undef = LLVMGetUndef(LLVMVectorType(type, dst_length));
1041bf215546Sopenharmony_ci      return LLVMBuildInsertElement(gallivm->builder, undef, src, lp_build_const_int32(gallivm, 0), "");
1042bf215546Sopenharmony_ci   }
1043bf215546Sopenharmony_ci
1044bf215546Sopenharmony_ci   undef      = LLVMGetUndef(type);
1045bf215546Sopenharmony_ci   src_length = LLVMGetVectorSize(type);
1046bf215546Sopenharmony_ci
1047bf215546Sopenharmony_ci   assert(dst_length <= ARRAY_SIZE(elems));
1048bf215546Sopenharmony_ci   assert(dst_length >= src_length);
1049bf215546Sopenharmony_ci
1050bf215546Sopenharmony_ci   if (src_length == dst_length)
1051bf215546Sopenharmony_ci      return src;
1052bf215546Sopenharmony_ci
1053bf215546Sopenharmony_ci   /* All elements from src vector */
1054bf215546Sopenharmony_ci   for (i = 0; i < src_length; ++i)
1055bf215546Sopenharmony_ci      elems[i] = lp_build_const_int32(gallivm, i);
1056bf215546Sopenharmony_ci
1057bf215546Sopenharmony_ci   /* Undef fill remaining space */
1058bf215546Sopenharmony_ci   for (i = src_length; i < dst_length; ++i)
1059bf215546Sopenharmony_ci      elems[i] = lp_build_const_int32(gallivm, src_length);
1060bf215546Sopenharmony_ci
1061bf215546Sopenharmony_ci   /* Combine the two vectors */
1062bf215546Sopenharmony_ci   return LLVMBuildShuffleVector(gallivm->builder, src, undef, LLVMConstVector(elems, dst_length), "");
1063bf215546Sopenharmony_ci}
1064