1/*
2 * Copyright © 2012 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24#include "ir.h"
25#include "ir_builder.h"
26#include "ir_optimization.h"
27#include "ir_rvalue_visitor.h"
28
29namespace {
30
31using namespace ir_builder;
32
33/**
 * A visitor that lowers built-in floating-point pack/unpack expressions
 * such as packSnorm2x16.
36 */
37class lower_packing_builtins_visitor : public ir_rvalue_visitor {
38public:
39   /**
40    * \param op_mask is a bitmask of `enum lower_packing_builtins_op`
41    */
42   explicit lower_packing_builtins_visitor(int op_mask)
43      : op_mask(op_mask),
44        progress(false)
45   {
46      factory.instructions = &factory_instructions;
47   }
48
49   virtual ~lower_packing_builtins_visitor()
50   {
51      assert(factory_instructions.is_empty());
52   }
53
54   bool get_progress() { return progress; }
55
56   void handle_rvalue(ir_rvalue **rvalue)
57   {
58      if (!*rvalue)
59	 return;
60
61      ir_expression *expr = (*rvalue)->as_expression();
62      if (!expr)
63	 return;
64
65      enum lower_packing_builtins_op lowering_op =
66         choose_lowering_op(expr->operation);
67
68      if (lowering_op == LOWER_PACK_UNPACK_NONE)
69         return;
70
71      setup_factory(ralloc_parent(expr));
72
73      ir_rvalue *op0 = expr->operands[0];
74      ralloc_steal(factory.mem_ctx, op0);
75
76      switch (lowering_op) {
77      case LOWER_PACK_SNORM_2x16:
78         *rvalue = lower_pack_snorm_2x16(op0);
79         break;
80      case LOWER_PACK_SNORM_4x8:
81         *rvalue = lower_pack_snorm_4x8(op0);
82         break;
83      case LOWER_PACK_UNORM_2x16:
84         *rvalue = lower_pack_unorm_2x16(op0);
85         break;
86      case LOWER_PACK_UNORM_4x8:
87         *rvalue = lower_pack_unorm_4x8(op0);
88         break;
89      case LOWER_PACK_HALF_2x16:
90         *rvalue = lower_pack_half_2x16(op0);
91         break;
92      case LOWER_UNPACK_SNORM_2x16:
93         *rvalue = lower_unpack_snorm_2x16(op0);
94         break;
95      case LOWER_UNPACK_SNORM_4x8:
96         *rvalue = lower_unpack_snorm_4x8(op0);
97         break;
98      case LOWER_UNPACK_UNORM_2x16:
99         *rvalue = lower_unpack_unorm_2x16(op0);
100         break;
101      case LOWER_UNPACK_UNORM_4x8:
102         *rvalue = lower_unpack_unorm_4x8(op0);
103         break;
104      case LOWER_UNPACK_HALF_2x16:
105         *rvalue = lower_unpack_half_2x16(op0);
106         break;
107      case LOWER_PACK_UNPACK_NONE:
108      case LOWER_PACK_USE_BFI:
109      case LOWER_PACK_USE_BFE:
110         assert(!"not reached");
111         break;
112      }
113
114      teardown_factory();
115      progress = true;
116   }
117
118private:
119   const int op_mask;
120   bool progress;
121   ir_factory factory;
122   exec_list factory_instructions;
123
124   /**
125    * Determine the needed lowering operation by filtering \a expr_op
126    * through \ref op_mask.
127    */
128   enum lower_packing_builtins_op
129   choose_lowering_op(ir_expression_operation expr_op)
130   {
131      /* C++ regards int and enum as fundamentally different types.
132       * So, we can't simply return from each case; we must cast the return
133       * value.
134       */
135      int result;
136
137      switch (expr_op) {
138      case ir_unop_pack_snorm_2x16:
139         result = op_mask & LOWER_PACK_SNORM_2x16;
140         break;
141      case ir_unop_pack_snorm_4x8:
142         result = op_mask & LOWER_PACK_SNORM_4x8;
143         break;
144      case ir_unop_pack_unorm_2x16:
145         result = op_mask & LOWER_PACK_UNORM_2x16;
146         break;
147      case ir_unop_pack_unorm_4x8:
148         result = op_mask & LOWER_PACK_UNORM_4x8;
149         break;
150      case ir_unop_pack_half_2x16:
151         result = op_mask & LOWER_PACK_HALF_2x16;
152         break;
153      case ir_unop_unpack_snorm_2x16:
154         result = op_mask & LOWER_UNPACK_SNORM_2x16;
155         break;
156      case ir_unop_unpack_snorm_4x8:
157         result = op_mask & LOWER_UNPACK_SNORM_4x8;
158         break;
159      case ir_unop_unpack_unorm_2x16:
160         result = op_mask & LOWER_UNPACK_UNORM_2x16;
161         break;
162      case ir_unop_unpack_unorm_4x8:
163         result = op_mask & LOWER_UNPACK_UNORM_4x8;
164         break;
165      case ir_unop_unpack_half_2x16:
166         result = op_mask & LOWER_UNPACK_HALF_2x16;
167         break;
168      default:
169         result = LOWER_PACK_UNPACK_NONE;
170         break;
171      }
172
173      return static_cast<enum lower_packing_builtins_op>(result);
174   }
175
176   void
177   setup_factory(void *mem_ctx)
178   {
179      assert(factory.mem_ctx == NULL);
180      assert(factory.instructions->is_empty());
181
182      factory.mem_ctx = mem_ctx;
183   }
184
185   void
186   teardown_factory()
187   {
188      base_ir->insert_before(factory.instructions);
189      assert(factory.instructions->is_empty());
190      factory.mem_ctx = NULL;
191   }
192
193   template <typename T>
194   ir_constant*
195   constant(T x)
196   {
197      return factory.constant(x);
198   }
199
200   /**
201    * \brief Pack two uint16's into a single uint32.
202    *
203    * Interpret the given uvec2 as a uint16 pair. Pack the pair into a uint32
204    * where the least significant bits specify the first element of the pair.
205    * Return the uint32.
206    */
207   ir_rvalue*
208   pack_uvec2_to_uint(ir_rvalue *uvec2_rval)
209   {
210      assert(uvec2_rval->type == glsl_type::uvec2_type);
211
212      /* uvec2 u = UVEC2_RVAL; */
213      ir_variable *u = factory.make_temp(glsl_type::uvec2_type,
214                                         "tmp_pack_uvec2_to_uint");
215      factory.emit(assign(u, uvec2_rval));
216
217      if (op_mask & LOWER_PACK_USE_BFI) {
218         return bitfield_insert(bit_and(swizzle_x(u), constant(0xffffu)),
219                                swizzle_y(u),
220                                constant(16u),
221                                constant(16u));
222      }
223
224      /* return (u.y << 16) | (u.x & 0xffff); */
225      return bit_or(lshift(swizzle_y(u), constant(16u)),
226                    bit_and(swizzle_x(u), constant(0xffffu)));
227   }
228
229   /**
230    * \brief Pack four uint8's into a single uint32.
231    *
    * Interpret the given uvec4 as a uint8 4-tuple. Pack the 4-tuple into a
    * uint32 where the least significant bits specify the first element of the
    * 4-tuple. Return the uint32.
235    */
236   ir_rvalue*
237   pack_uvec4_to_uint(ir_rvalue *uvec4_rval)
238   {
239      assert(uvec4_rval->type == glsl_type::uvec4_type);
240
241      ir_variable *u = factory.make_temp(glsl_type::uvec4_type,
242                                         "tmp_pack_uvec4_to_uint");
243
244      if (op_mask & LOWER_PACK_USE_BFI) {
245         /* uvec4 u = UVEC4_RVAL; */
246         factory.emit(assign(u, uvec4_rval));
247
248         return bitfield_insert(bitfield_insert(
249                                   bitfield_insert(
250                                      bit_and(swizzle_x(u), constant(0xffu)),
251                                      swizzle_y(u), constant(8u), constant(8u)),
252                                   swizzle_z(u), constant(16u), constant(8u)),
253                                swizzle_w(u), constant(24u), constant(8u));
254      }
255
256      /* uvec4 u = UVEC4_RVAL & 0xff */
257      factory.emit(assign(u, bit_and(uvec4_rval, constant(0xffu))));
258
259      /* return (u.w << 24) | (u.z << 16) | (u.y << 8) | u.x; */
260      return bit_or(bit_or(lshift(swizzle_w(u), constant(24u)),
261                           lshift(swizzle_z(u), constant(16u))),
262                    bit_or(lshift(swizzle_y(u), constant(8u)),
263                           swizzle_x(u)));
264   }
265
266   /**
267    * \brief Unpack a uint32 into two uint16's.
268    *
269    * Interpret the given uint32 as a uint16 pair where the uint32's least
270    * significant bits specify the pair's first element. Return the uint16
271    * pair as a uvec2.
272    */
273   ir_rvalue*
274   unpack_uint_to_uvec2(ir_rvalue *uint_rval)
275   {
276      assert(uint_rval->type == glsl_type::uint_type);
277
278      /* uint u = UINT_RVAL; */
279      ir_variable *u = factory.make_temp(glsl_type::uint_type,
280                                          "tmp_unpack_uint_to_uvec2_u");
281      factory.emit(assign(u, uint_rval));
282
283      /* uvec2 u2; */
284      ir_variable *u2 = factory.make_temp(glsl_type::uvec2_type,
285                                           "tmp_unpack_uint_to_uvec2_u2");
286
287      /* u2.x = u & 0xffffu; */
288      factory.emit(assign(u2, bit_and(u, constant(0xffffu)), WRITEMASK_X));
289
290      /* u2.y = u >> 16u; */
291      factory.emit(assign(u2, rshift(u, constant(16u)), WRITEMASK_Y));
292
293      return deref(u2).val;
294   }
295
296   /**
297    * \brief Unpack a uint32 into two int16's.
298    *
299    * Specifically each 16-bit value is sign-extended to the full width of an
300    * int32 on return.
301    */
302   ir_rvalue *
303   unpack_uint_to_ivec2(ir_rvalue *uint_rval)
304   {
305      assert(uint_rval->type == glsl_type::uint_type);
306
307      if (!(op_mask & LOWER_PACK_USE_BFE)) {
308         return rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)),
309                              constant(16u)),
310                       constant(16u));
311      }
312
313      ir_variable *i = factory.make_temp(glsl_type::int_type,
314                                         "tmp_unpack_uint_to_ivec2_i");
315      factory.emit(assign(i, u2i(uint_rval)));
316
317      /* ivec2 i2; */
318      ir_variable *i2 = factory.make_temp(glsl_type::ivec2_type,
319                                          "tmp_unpack_uint_to_ivec2_i2");
320
321      factory.emit(assign(i2, bitfield_extract(i, constant(0), constant(16)),
322                          WRITEMASK_X));
323      factory.emit(assign(i2, bitfield_extract(i, constant(16), constant(16)),
324                          WRITEMASK_Y));
325
326      return deref(i2).val;
327   }
328
329   /**
330    * \brief Unpack a uint32 into four uint8's.
331    *
332    * Interpret the given uint32 as a uint8 4-tuple where the uint32's least
333    * significant bits specify the 4-tuple's first element. Return the uint8
334    * 4-tuple as a uvec4.
335    */
336   ir_rvalue*
337   unpack_uint_to_uvec4(ir_rvalue *uint_rval)
338   {
339      assert(uint_rval->type == glsl_type::uint_type);
340
341      /* uint u = UINT_RVAL; */
342      ir_variable *u = factory.make_temp(glsl_type::uint_type,
343                                          "tmp_unpack_uint_to_uvec4_u");
344      factory.emit(assign(u, uint_rval));
345
346      /* uvec4 u4; */
347      ir_variable *u4 = factory.make_temp(glsl_type::uvec4_type,
348                                           "tmp_unpack_uint_to_uvec4_u4");
349
350      /* u4.x = u & 0xffu; */
351      factory.emit(assign(u4, bit_and(u, constant(0xffu)), WRITEMASK_X));
352
353      if (op_mask & LOWER_PACK_USE_BFE) {
354         /* u4.y = bitfield_extract(u, 8, 8); */
355         factory.emit(assign(u4, bitfield_extract(u, constant(8u), constant(8u)),
356                             WRITEMASK_Y));
357
358         /* u4.z = bitfield_extract(u, 16, 8); */
359         factory.emit(assign(u4, bitfield_extract(u, constant(16u), constant(8u)),
360                             WRITEMASK_Z));
361      } else {
362         /* u4.y = (u >> 8u) & 0xffu; */
363         factory.emit(assign(u4, bit_and(rshift(u, constant(8u)),
364                                         constant(0xffu)), WRITEMASK_Y));
365
366         /* u4.z = (u >> 16u) & 0xffu; */
367         factory.emit(assign(u4, bit_and(rshift(u, constant(16u)),
368                                         constant(0xffu)), WRITEMASK_Z));
369      }
370
371      /* u4.w = (u >> 24u) */
372      factory.emit(assign(u4, rshift(u, constant(24u)), WRITEMASK_W));
373
374      return deref(u4).val;
375   }
376
377   /**
378    * \brief Unpack a uint32 into four int8's.
379    *
380    * Specifically each 8-bit value is sign-extended to the full width of an
381    * int32 on return.
382    */
383   ir_rvalue *
384   unpack_uint_to_ivec4(ir_rvalue *uint_rval)
385   {
386      assert(uint_rval->type == glsl_type::uint_type);
387
388      if (!(op_mask & LOWER_PACK_USE_BFE)) {
389         return rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)),
390                              constant(24u)),
391                       constant(24u));
392      }
393
394      ir_variable *i = factory.make_temp(glsl_type::int_type,
395                                         "tmp_unpack_uint_to_ivec4_i");
396      factory.emit(assign(i, u2i(uint_rval)));
397
398      /* ivec4 i4; */
399      ir_variable *i4 = factory.make_temp(glsl_type::ivec4_type,
400                                          "tmp_unpack_uint_to_ivec4_i4");
401
402      factory.emit(assign(i4, bitfield_extract(i, constant(0), constant(8)),
403                          WRITEMASK_X));
404      factory.emit(assign(i4, bitfield_extract(i, constant(8), constant(8)),
405                          WRITEMASK_Y));
406      factory.emit(assign(i4, bitfield_extract(i, constant(16), constant(8)),
407                          WRITEMASK_Z));
408      factory.emit(assign(i4, bitfield_extract(i, constant(24), constant(8)),
409                          WRITEMASK_W));
410
411      return deref(i4).val;
412   }
413
414   /**
415    * \brief Lower a packSnorm2x16 expression.
416    *
417    * \param vec2_rval is packSnorm2x16's input
418    * \return packSnorm2x16's output as a uint rvalue
419    */
420   ir_rvalue*
421   lower_pack_snorm_2x16(ir_rvalue *vec2_rval)
422   {
423      /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
424       *
425       *    highp uint packSnorm2x16(vec2 v)
426       *    --------------------------------
427       *    First, converts each component of the normalized floating-point value
428       *    v into 16-bit integer values. Then, the results are packed into the
429       *    returned 32-bit unsigned integer.
430       *
431       *    The conversion for component c of v to fixed point is done as
432       *    follows:
433       *
434       *       packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
435       *
436       *    The first component of the vector will be written to the least
437       *    significant bits of the output; the last component will be written to
438       *    the most significant bits.
439       *
440       * This function generates IR that approximates the following pseudo-GLSL:
441       *
442       *     return pack_uvec2_to_uint(
443       *         uvec2(ivec2(
444       *           round(clamp(VEC2_RVALUE, -1.0f, 1.0f) * 32767.0f))));
445       *
446       * It is necessary to first convert the vec2 to ivec2 rather than directly
447       * converting vec2 to uvec2 because the latter conversion is undefined.
448       * From page 56 (62 of pdf) of the GLSL ES 3.00 spec: "It is undefined to
449       * convert a negative floating point value to an uint".
450       */
451      assert(vec2_rval->type == glsl_type::vec2_type);
452
453      ir_rvalue *result = pack_uvec2_to_uint(
454            i2u(f2i(round_even(mul(clamp(vec2_rval,
455                                         constant(-1.0f),
456                                         constant(1.0f)),
457                                   constant(32767.0f))))));
458
459      assert(result->type == glsl_type::uint_type);
460      return result;
461   }
462
463   /**
464    * \brief Lower a packSnorm4x8 expression.
465    *
466    * \param vec4_rval is packSnorm4x8's input
467    * \return packSnorm4x8's output as a uint rvalue
468    */
469   ir_rvalue*
470   lower_pack_snorm_4x8(ir_rvalue *vec4_rval)
471   {
472      /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
473       *
474       *    highp uint packSnorm4x8(vec4 v)
475       *    -------------------------------
476       *    First, converts each component of the normalized floating-point value
477       *    v into 8-bit integer values. Then, the results are packed into the
478       *    returned 32-bit unsigned integer.
479       *
480       *    The conversion for component c of v to fixed point is done as
481       *    follows:
482       *
483       *       packSnorm4x8: round(clamp(c, -1, +1) * 127.0)
484       *
485       *    The first component of the vector will be written to the least
486       *    significant bits of the output; the last component will be written to
487       *    the most significant bits.
488       *
489       * This function generates IR that approximates the following pseudo-GLSL:
490       *
491       *     return pack_uvec4_to_uint(
492       *         uvec4(ivec4(
493       *           round(clamp(VEC4_RVALUE, -1.0f, 1.0f) * 127.0f))));
494       *
495       * It is necessary to first convert the vec4 to ivec4 rather than directly
496       * converting vec4 to uvec4 because the latter conversion is undefined.
497       * From page 87 (93 of pdf) of the GLSL 4.30 spec: "It is undefined to
498       * convert a negative floating point value to an uint".
499       */
500      assert(vec4_rval->type == glsl_type::vec4_type);
501
502      ir_rvalue *result = pack_uvec4_to_uint(
503            i2u(f2i(round_even(mul(clamp(vec4_rval,
504                                         constant(-1.0f),
505                                         constant(1.0f)),
506                                   constant(127.0f))))));
507
508      assert(result->type == glsl_type::uint_type);
509      return result;
510   }
511
512   /**
513    * \brief Lower an unpackSnorm2x16 expression.
514    *
515    * \param uint_rval is unpackSnorm2x16's input
516    * \return unpackSnorm2x16's output as a vec2 rvalue
517    */
518   ir_rvalue*
519   lower_unpack_snorm_2x16(ir_rvalue *uint_rval)
520   {
521      /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
522       *
523       *    highp vec2 unpackSnorm2x16 (highp uint p)
524       *    -----------------------------------------
525       *    First, unpacks a single 32-bit unsigned integer p into a pair of
526       *    16-bit unsigned integers. Then, each component is converted to
527       *    a normalized floating-point value to generate the returned
528       *    two-component vector.
529       *
530       *    The conversion for unpacked fixed-point value f to floating point is
531       *    done as follows:
532       *
533       *       unpackSnorm2x16: clamp(f / 32767.0, -1,+1)
534       *
535       *    The first component of the returned vector will be extracted from the
536       *    least significant bits of the input; the last component will be
537       *    extracted from the most significant bits.
538       *
539       * This function generates IR that approximates the following pseudo-GLSL:
540       *
541       *    return clamp(
542       *       ((ivec2(unpack_uint_to_uvec2(UINT_RVALUE)) << 16) >> 16) / 32767.0f,
543       *       -1.0f, 1.0f);
544       *
545       * The above IR may appear unnecessarily complex, but the intermediate
546       * conversion to ivec2 and the bit shifts are necessary to correctly unpack
547       * negative floats.
548       *
549       * To see why, consider packing and then unpacking vec2(-1.0, 0.0).
550       * packSnorm2x16 encodes -1.0 as the int16 0xffff. During unpacking, we
551       * place that int16 into an int32, which results in the *positive* integer
552       * 0x0000ffff.  The int16's sign bit becomes, in the int32, the rather
553       * unimportant bit 16. We must now extend the int16's sign bit into bits
554       * 17-32, which is accomplished by left-shifting then right-shifting.
555       */
556
557      assert(uint_rval->type == glsl_type::uint_type);
558
559      ir_rvalue *result =
560        clamp(div(i2f(unpack_uint_to_ivec2(uint_rval)),
561                  constant(32767.0f)),
562              constant(-1.0f),
563              constant(1.0f));
564
565      assert(result->type == glsl_type::vec2_type);
566      return result;
567   }
568
569   /**
570    * \brief Lower an unpackSnorm4x8 expression.
571    *
572    * \param uint_rval is unpackSnorm4x8's input
573    * \return unpackSnorm4x8's output as a vec4 rvalue
574    */
575   ir_rvalue*
576   lower_unpack_snorm_4x8(ir_rvalue *uint_rval)
577   {
578      /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
579       *
580       *    highp vec4 unpackSnorm4x8 (highp uint p)
581       *    ----------------------------------------
582       *    First, unpacks a single 32-bit unsigned integer p into four
583       *    8-bit unsigned integers. Then, each component is converted to
584       *    a normalized floating-point value to generate the returned
585       *    four-component vector.
586       *
587       *    The conversion for unpacked fixed-point value f to floating point is
588       *    done as follows:
589       *
590       *       unpackSnorm4x8: clamp(f / 127.0, -1, +1)
591       *
592       *    The first component of the returned vector will be extracted from the
593       *    least significant bits of the input; the last component will be
594       *    extracted from the most significant bits.
595       *
596       * This function generates IR that approximates the following pseudo-GLSL:
597       *
598       *    return clamp(
599       *       ((ivec4(unpack_uint_to_uvec4(UINT_RVALUE)) << 24) >> 24) / 127.0f,
600       *       -1.0f, 1.0f);
601       *
602       * The above IR may appear unnecessarily complex, but the intermediate
603       * conversion to ivec4 and the bit shifts are necessary to correctly unpack
604       * negative floats.
605       *
606       * To see why, consider packing and then unpacking vec4(-1.0, 0.0, 0.0,
607       * 0.0). packSnorm4x8 encodes -1.0 as the int8 0xff. During unpacking, we
608       * place that int8 into an int32, which results in the *positive* integer
609       * 0x000000ff.  The int8's sign bit becomes, in the int32, the rather
610       * unimportant bit 8. We must now extend the int8's sign bit into bits
611       * 9-32, which is accomplished by left-shifting then right-shifting.
612       */
613
614      assert(uint_rval->type == glsl_type::uint_type);
615
616      ir_rvalue *result =
617        clamp(div(i2f(unpack_uint_to_ivec4(uint_rval)),
618                  constant(127.0f)),
619              constant(-1.0f),
620              constant(1.0f));
621
622      assert(result->type == glsl_type::vec4_type);
623      return result;
624   }
625
626   /**
627    * \brief Lower a packUnorm2x16 expression.
628    *
629    * \param vec2_rval is packUnorm2x16's input
630    * \return packUnorm2x16's output as a uint rvalue
631    */
632   ir_rvalue*
633   lower_pack_unorm_2x16(ir_rvalue *vec2_rval)
634   {
635      /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
636       *
637       *    highp uint packUnorm2x16 (vec2 v)
638       *    ---------------------------------
639       *    First, converts each component of the normalized floating-point value
640       *    v into 16-bit integer values. Then, the results are packed into the
641       *    returned 32-bit unsigned integer.
642       *
643       *    The conversion for component c of v to fixed point is done as
644       *    follows:
645       *
646       *       packUnorm2x16: round(clamp(c, 0, +1) * 65535.0)
647       *
648       *    The first component of the vector will be written to the least
649       *    significant bits of the output; the last component will be written to
650       *    the most significant bits.
651       *
652       * This function generates IR that approximates the following pseudo-GLSL:
653       *
654       *     return pack_uvec2_to_uint(uvec2(
655       *                round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 65535.0f)));
656       *
657       * Here it is safe to directly convert the vec2 to uvec2 because the vec2
658       * has been clamped to a non-negative range.
659       */
660
661      assert(vec2_rval->type == glsl_type::vec2_type);
662
663      ir_rvalue *result = pack_uvec2_to_uint(
664         f2u(round_even(mul(saturate(vec2_rval), constant(65535.0f)))));
665
666      assert(result->type == glsl_type::uint_type);
667      return result;
668   }
669
670   /**
671    * \brief Lower a packUnorm4x8 expression.
672    *
673    * \param vec4_rval is packUnorm4x8's input
674    * \return packUnorm4x8's output as a uint rvalue
675    */
676   ir_rvalue*
677   lower_pack_unorm_4x8(ir_rvalue *vec4_rval)
678   {
679      /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
680       *
681       *    highp uint packUnorm4x8 (vec4 v)
682       *    --------------------------------
683       *    First, converts each component of the normalized floating-point value
684       *    v into 8-bit integer values. Then, the results are packed into the
685       *    returned 32-bit unsigned integer.
686       *
687       *    The conversion for component c of v to fixed point is done as
688       *    follows:
689       *
690       *       packUnorm4x8: round(clamp(c, 0, +1) * 255.0)
691       *
692       *    The first component of the vector will be written to the least
693       *    significant bits of the output; the last component will be written to
694       *    the most significant bits.
695       *
696       * This function generates IR that approximates the following pseudo-GLSL:
697       *
698       *     return pack_uvec4_to_uint(uvec4(
699       *                round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 255.0f)));
700       *
701       * Here it is safe to directly convert the vec4 to uvec4 because the vec4
702       * has been clamped to a non-negative range.
703       */
704
705      assert(vec4_rval->type == glsl_type::vec4_type);
706
707      ir_rvalue *result = pack_uvec4_to_uint(
708         f2u(round_even(mul(saturate(vec4_rval), constant(255.0f)))));
709
710      assert(result->type == glsl_type::uint_type);
711      return result;
712   }
713
714   /**
715    * \brief Lower an unpackUnorm2x16 expression.
716    *
717    * \param uint_rval is unpackUnorm2x16's input
718    * \return unpackUnorm2x16's output as a vec2 rvalue
719    */
720   ir_rvalue*
721   lower_unpack_unorm_2x16(ir_rvalue *uint_rval)
722   {
723      /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
724       *
725       *    highp vec2 unpackUnorm2x16 (highp uint p)
726       *    -----------------------------------------
727       *    First, unpacks a single 32-bit unsigned integer p into a pair of
728       *    16-bit unsigned integers. Then, each component is converted to
729       *    a normalized floating-point value to generate the returned
730       *    two-component vector.
731       *
732       *    The conversion for unpacked fixed-point value f to floating point is
733       *    done as follows:
734       *
735       *       unpackUnorm2x16: f / 65535.0
736       *
737       *    The first component of the returned vector will be extracted from the
738       *    least significant bits of the input; the last component will be
739       *    extracted from the most significant bits.
740       *
741       * This function generates IR that approximates the following pseudo-GLSL:
742       *
743       *     return vec2(unpack_uint_to_uvec2(UINT_RVALUE)) / 65535.0;
744       */
745
746      assert(uint_rval->type == glsl_type::uint_type);
747
748      ir_rvalue *result = div(u2f(unpack_uint_to_uvec2(uint_rval)),
749                              constant(65535.0f));
750
751      assert(result->type == glsl_type::vec2_type);
752      return result;
753   }
754
755   /**
756    * \brief Lower an unpackUnorm4x8 expression.
757    *
758    * \param uint_rval is unpackUnorm4x8's input
759    * \return unpackUnorm4x8's output as a vec4 rvalue
760    */
761   ir_rvalue*
762   lower_unpack_unorm_4x8(ir_rvalue *uint_rval)
763   {
764      /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
765       *
766       *    highp vec4 unpackUnorm4x8 (highp uint p)
767       *    ----------------------------------------
768       *    First, unpacks a single 32-bit unsigned integer p into four
769       *    8-bit unsigned integers. Then, each component is converted to
770       *    a normalized floating-point value to generate the returned
771       *    two-component vector.
772       *
773       *    The conversion for unpacked fixed-point value f to floating point is
774       *    done as follows:
775       *
776       *       unpackUnorm4x8: f / 255.0
777       *
778       *    The first component of the returned vector will be extracted from the
779       *    least significant bits of the input; the last component will be
780       *    extracted from the most significant bits.
781       *
782       * This function generates IR that approximates the following pseudo-GLSL:
783       *
784       *     return vec4(unpack_uint_to_uvec4(UINT_RVALUE)) / 255.0;
785       */
786
787      assert(uint_rval->type == glsl_type::uint_type);
788
789      ir_rvalue *result = div(u2f(unpack_uint_to_uvec4(uint_rval)),
790                              constant(255.0f));
791
792      assert(result->type == glsl_type::vec4_type);
793      return result;
794   }
795
796   /**
797    * \brief Lower the component-wise calculation of packHalf2x16.
798    *
    * \param f_rval is one component of packHalf2x16's input
800    * \param e_rval is the unshifted exponent bits of f_rval
801    * \param m_rval is the unshifted mantissa bits of f_rval
802    *
803    * \return a uint rvalue that encodes a float16 in its lower 16 bits
804    */
805   ir_rvalue*
806   pack_half_1x16_nosign(ir_rvalue *f_rval,
807                         ir_rvalue *e_rval,
808                         ir_rvalue *m_rval)
809   {
810      assert(e_rval->type == glsl_type::uint_type);
811      assert(m_rval->type == glsl_type::uint_type);
812
813      /* uint u16; */
814      ir_variable *u16 = factory.make_temp(glsl_type::uint_type,
815                                           "tmp_pack_half_1x16_u16");
816
817      /* float f = FLOAT_RVAL; */
818      ir_variable *f = factory.make_temp(glsl_type::float_type,
819                                          "tmp_pack_half_1x16_f");
820      factory.emit(assign(f, f_rval));
821
822      /* uint e = E_RVAL; */
823      ir_variable *e = factory.make_temp(glsl_type::uint_type,
824                                          "tmp_pack_half_1x16_e");
825      factory.emit(assign(e, e_rval));
826
827      /* uint m = M_RVAL; */
828      ir_variable *m = factory.make_temp(glsl_type::uint_type,
829                                          "tmp_pack_half_1x16_m");
830      factory.emit(assign(m, m_rval));
831
832      /* Preliminaries
833       * -------------
834       *
835       * For a float16, the bit layout is:
836       *
837       *   sign:     15
838       *   exponent: 10:14
839       *   mantissa: 0:9
840       *
841       * Let f16 be a float16 value. The sign, exponent, and mantissa
842       * determine its value thus:
843       *
844       *   if e16 = 0 and m16 = 0, then zero:       (-1)^s16 * 0                               (1)
845       *   if e16 = 0 and m16!= 0, then subnormal:  (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10)     (2)
846       *   if 0 < e16 < 31, then normal:            (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
847       *   if e16 = 31 and m16 = 0, then infinite:  (-1)^s16 * inf                             (4)
848       *   if e16 = 31 and m16 != 0, then           NaN                                        (5)
849       *
850       * where 0 <= m16 < 2^10.
851       *
852       * For a float32, the bit layout is:
853       *
854       *   sign:     31
855       *   exponent: 23:30
856       *   mantissa: 0:22
857       *
858       * Let f32 be a float32 value. The sign, exponent, and mantissa
859       * determine its value thus:
860       *
861       *   if e32 = 0 and m32 = 0, then zero:        (-1)^s * 0                                (10)
862       *   if e32 = 0 and m32 != 0, then subnormal:  (-1)^s * 2^(e32 - 126) * (m32 / 2^23)     (11)
863       *   if 0 < e32 < 255, then normal:            (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
864       *   if e32 = 255 and m32 = 0, then infinite:  (-1)^s * inf                              (13)
865       *   if e32 = 255 and m32 != 0, then           NaN                                       (14)
866       *
867       * where 0 <= m32 < 2^23.
868       *
869       * The minimum and maximum normal float16 values are
870       *
871       *   min_norm16 = 2^(1 - 15) * (1 + 0 / 2^10) = 2^(-14)   (20)
872       *   max_norm16 = 2^(30 - 15) * (1 + 1023 / 2^10)         (21)
873       *
874       * The step at max_norm16 is
875       *
876       *   max_step16 = 2^5                                     (22)
877       *
878       * Observe that the float16 boundary values in equations 20-21 lie in the
879       * range of normal float32 values.
880       *
881       *
882       * Rounding Behavior
883       * -----------------
884       * Not all float32 values can be exactly represented as a float16. We
885       * round all such intermediate float32 values to the nearest float16; if
886       * the float32 is exactly between to float16 values, we round to the one
887       * with an even mantissa. This rounding behavior has several benefits:
888       *
889       *   - It has no sign bias.
890       *
891       *   - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's
892       *     GPU ISA.
893       *
894       *   - By reproducing the behavior of the GPU (at least on Intel hardware),
895       *     compile-time evaluation of constant packHalf2x16 GLSL expressions will
896       *     result in the same value as if the expression were executed on the
897       *     GPU.
898       *
899       * Calculation
900       * -----------
901       * Our task is to compute s16, e16, m16 given f32.  Since this function
902       * ignores the sign bit, assume that s32 = s16 = 0.  There are several
903       * cases consider.
904       */
905
906      factory.emit(
907
908         /* Case 1) f32 is NaN
909          *
910          *   The resultant f16 will also be NaN.
911          */
912
913         /* if (e32 == 255 && m32 != 0) { */
914         if_tree(logic_and(equal(e, constant(0xffu << 23u)),
915                           logic_not(equal(m, constant(0u)))),
916
917            assign(u16, constant(0x7fffu)),
918
919         /* Case 2) f32 lies in the range [0, min_norm16).
920          *
921          *   The resultant float16 will be either zero, subnormal, or normal.
922          *
923          *   Solving
924          *
925          *     f32 = min_norm16       (30)
926          *
927          *   gives
928          *
929          *     e32 = 113 and m32 = 0  (31)
930          *
931          *   Therefore this case occurs if and only if
932          *
933          *     e32 < 113              (32)
934          */
935
936         /* } else if (e32 < 113) { */
937         if_tree(less(e, constant(113u << 23u)),
938
939            /* u16 = uint(round_to_even(abs(f32) * float(1u << 24u))); */
940            assign(u16, f2u(round_even(mul(expr(ir_unop_abs, f),
941                                           constant((float) (1 << 24)))))),
942
943         /* Case 3) f32 lies in the range
944          *         [min_norm16, max_norm16 + max_step16).
945          *
946          *   The resultant float16 will be either normal or infinite.
947          *
948          *   Solving
949          *
950          *     f32 = max_norm16 + max_step16           (40)
951          *         = 2^15 * (1 + 1023 / 2^10) + 2^5    (41)
952          *         = 2^16                              (42)
953          *   gives
954          *
955          *     e32 = 143 and m32 = 0                   (43)
956          *
957          *   We already solved the boundary condition f32 = min_norm16 above
958          *   in equation 31. Therefore this case occurs if and only if
959          *
960          *     113 <= e32 and e32 < 143
961          */
962
963         /* } else if (e32 < 143) { */
964         if_tree(less(e, constant(143u << 23u)),
965
966            /* The addition below handles the case where the mantissa rounds
967             * up to 1024 and bumps the exponent.
968             *
969             * u16 = ((e - (112u << 23u)) >> 13u)
970             *     + round_to_even((float(m) / (1u << 13u));
971             */
972            assign(u16, add(rshift(sub(e, constant(112u << 23u)),
973                                   constant(13u)),
974                            f2u(round_even(
975                                  div(u2f(m), constant((float) (1 << 13))))))),
976
977         /* Case 4) f32 lies in the range [max_norm16 + max_step16, inf].
978          *
979          *   The resultant float16 will be infinite.
980          *
981          *   The cases above caught all float32 values in the range
982          *   [0, max_norm16 + max_step16), so this is the fall-through case.
983          */
984
985         /* } else { */
986
987            assign(u16, constant(31u << 10u))))));
988
989         /* } */
990
991       return deref(u16).val;
992   }
993
994   /**
995    * \brief Lower a packHalf2x16 expression.
996    *
997    * \param vec2_rval is packHalf2x16's input
998    * \return packHalf2x16's output as a uint rvalue
999    */
1000   ir_rvalue*
1001   lower_pack_half_2x16(ir_rvalue *vec2_rval)
1002   {
1003      /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
1004       *
1005       *    highp uint packHalf2x16 (mediump vec2 v)
1006       *    ----------------------------------------
1007       *    Returns an unsigned integer obtained by converting the components of
1008       *    a two-component floating-point vector to the 16-bit floating-point
1009       *    representation found in the OpenGL ES Specification, and then packing
1010       *    these two 16-bit integers into a 32-bit unsigned integer.
1011       *
1012       *    The first vector component specifies the 16 least- significant bits
1013       *    of the result; the second component specifies the 16 most-significant
1014       *    bits.
1015       */
1016
1017      assert(vec2_rval->type == glsl_type::vec2_type);
1018
1019      /* vec2 f = VEC2_RVAL; */
1020      ir_variable *f = factory.make_temp(glsl_type::vec2_type,
1021                                         "tmp_pack_half_2x16_f");
1022      factory.emit(assign(f, vec2_rval));
1023
1024      /* uvec2 f32 = bitcast_f2u(f); */
1025      ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type,
1026                                            "tmp_pack_half_2x16_f32");
1027      factory.emit(assign(f32, expr(ir_unop_bitcast_f2u, f)));
1028
1029      /* uvec2 f16; */
1030      ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type,
1031                                        "tmp_pack_half_2x16_f16");
1032
1033      /* Get f32's unshifted exponent bits.
1034       *
1035       *   uvec2 e = f32 & 0x7f800000u;
1036       */
1037      ir_variable *e = factory.make_temp(glsl_type::uvec2_type,
1038                                          "tmp_pack_half_2x16_e");
1039      factory.emit(assign(e, bit_and(f32, constant(0x7f800000u))));
1040
1041      /* Get f32's unshifted mantissa bits.
1042       *
1043       *   uvec2 m = f32 & 0x007fffffu;
1044       */
1045      ir_variable *m = factory.make_temp(glsl_type::uvec2_type,
1046                                          "tmp_pack_half_2x16_m");
1047      factory.emit(assign(m, bit_and(f32, constant(0x007fffffu))));
1048
1049      /* Set f16's exponent and mantissa bits.
1050       *
1051       *   f16.x = pack_half_1x16_nosign(e.x, m.x);
1052       *   f16.y = pack_half_1y16_nosign(e.y, m.y);
1053       */
1054      factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_x(f),
1055                                                     swizzle_x(e),
1056                                                     swizzle_x(m)),
1057                           WRITEMASK_X));
1058      factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_y(f),
1059                                                     swizzle_y(e),
1060                                                     swizzle_y(m)),
1061                           WRITEMASK_Y));
1062
1063      /* Set f16's sign bits.
1064       *
1065       *   f16 |= (f32 & (1u << 31u) >> 16u;
1066       */
1067      factory.emit(
1068         assign(f16, bit_or(f16,
1069                            rshift(bit_and(f32, constant(1u << 31u)),
1070                                   constant(16u)))));
1071
1072
1073      /* return (f16.y << 16u) | f16.x; */
1074      ir_rvalue *result = bit_or(lshift(swizzle_y(f16),
1075                                        constant(16u)),
1076                                 swizzle_x(f16));
1077
1078      assert(result->type == glsl_type::uint_type);
1079      return result;
1080   }
1081
1082   /**
1083    * \brief Lower the component-wise calculation of unpackHalf2x16.
1084    *
1085    * Given a uint that encodes a float16 in its lower 16 bits, this function
1086    * returns a uint that encodes a float32 with the same value. The sign bit
1087    * of the float16 is ignored.
1088    *
1089    * \param e_rval is the unshifted exponent bits of a float16
1090    * \param m_rval is the unshifted mantissa bits of a float16
    * \return a uint rvalue that encodes a float32
1092    */
1093   ir_rvalue*
1094   unpack_half_1x16_nosign(ir_rvalue *e_rval, ir_rvalue *m_rval)
1095   {
1096      assert(e_rval->type == glsl_type::uint_type);
1097      assert(m_rval->type == glsl_type::uint_type);
1098
1099      /* uint u32; */
1100      ir_variable *u32 = factory.make_temp(glsl_type::uint_type,
1101                                           "tmp_unpack_half_1x16_u32");
1102
1103      /* uint e = E_RVAL; */
1104      ir_variable *e = factory.make_temp(glsl_type::uint_type,
1105                                          "tmp_unpack_half_1x16_e");
1106      factory.emit(assign(e, e_rval));
1107
1108      /* uint m = M_RVAL; */
1109      ir_variable *m = factory.make_temp(glsl_type::uint_type,
1110                                          "tmp_unpack_half_1x16_m");
1111      factory.emit(assign(m, m_rval));
1112
1113      /* Preliminaries
1114       * -------------
1115       *
1116       * For a float16, the bit layout is:
1117       *
1118       *   sign:     15
1119       *   exponent: 10:14
1120       *   mantissa: 0:9
1121       *
1122       * Let f16 be a float16 value. The sign, exponent, and mantissa
1123       * determine its value thus:
1124       *
1125       *   if e16 = 0 and m16 = 0, then zero:       (-1)^s16 * 0                               (1)
1126       *   if e16 = 0 and m16!= 0, then subnormal:  (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10)     (2)
1127       *   if 0 < e16 < 31, then normal:            (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
1128       *   if e16 = 31 and m16 = 0, then infinite:  (-1)^s16 * inf                             (4)
1129       *   if e16 = 31 and m16 != 0, then           NaN                                        (5)
1130       *
1131       * where 0 <= m16 < 2^10.
1132       *
1133       * For a float32, the bit layout is:
1134       *
1135       *   sign: 31
1136       *   exponent: 23:30
1137       *   mantissa: 0:22
1138       *
1139       * Let f32 be a float32 value. The sign, exponent, and mantissa
1140       * determine its value thus:
1141       *
1142       *   if e32 = 0 and m32 = 0, then zero:        (-1)^s * 0                                (10)
1143       *   if e32 = 0 and m32 != 0, then subnormal:  (-1)^s * 2^(e32 - 126) * (m32 / 2^23)     (11)
1144       *   if 0 < e32 < 255, then normal:            (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
1145       *   if e32 = 255 and m32 = 0, then infinite:  (-1)^s * inf                              (13)
1146       *   if e32 = 255 and m32 != 0, then           NaN                                       (14)
1147       *
1148       * where 0 <= m32 < 2^23.
1149       *
1150       * Calculation
1151       * -----------
1152       * Our task is to compute s32, e32, m32 given f16.  Since this function
1153       * ignores the sign bit, assume that s32 = s16 = 0.  There are several
1154       * cases consider.
1155       */
1156
1157      factory.emit(
1158
1159         /* Case 1) f16 is zero or subnormal.
1160          *
1161          *   The simplest method of calcuating f32 in this case is
1162          *
1163          *     f32 = f16                       (20)
1164          *         = 2^(-14) * (m16 / 2^10)    (21)
1165          *         = m16 / 2^(-24)             (22)
1166          */
1167
1168         /* if (e16 == 0) { */
1169         if_tree(equal(e, constant(0u)),
1170
1171            /* u32 = bitcast_f2u(float(m) / float(1 << 24)); */
1172            assign(u32, expr(ir_unop_bitcast_f2u,
1173                                div(u2f(m), constant((float)(1 << 24))))),
1174
1175         /* Case 2) f16 is normal.
1176          *
1177          *   The equation
1178          *
1179          *     f32 = f16                              (30)
1180          *     2^(e32 - 127) * (1 + m32 / 2^23) =     (31)
1181          *       2^(e16 - 15) * (1 + m16 / 2^10)
1182          *
1183          *   can be decomposed into two
1184          *
1185          *     2^(e32 - 127) = 2^(e16 - 15)           (32)
1186          *     1 + m32 / 2^23 = 1 + m16 / 2^10        (33)
1187          *
1188          *   which solve to
1189          *
1190          *     e32 = e16 + 112                        (34)
1191          *     m32 = m16 * 2^13                       (35)
1192          */
1193
1194         /* } else if (e16 < 31)) { */
1195         if_tree(less(e, constant(31u << 10u)),
1196
1197              /* u32 = ((e + (112 << 10)) | m) << 13;
1198               */
1199              assign(u32, lshift(bit_or(add(e, constant(112u << 10u)), m),
1200                                 constant(13u))),
1201
1202
1203         /* Case 3) f16 is infinite. */
1204         if_tree(equal(m, constant(0u)),
1205
1206                 assign(u32, constant(255u << 23u)),
1207
1208         /* Case 4) f16 is NaN. */
1209         /* } else { */
1210
1211            assign(u32, constant(0x7fffffffu))))));
1212
1213         /* } */
1214
1215      return deref(u32).val;
1216   }
1217
1218   /**
1219    * \brief Lower an unpackHalf2x16 expression.
1220    *
1221    * \param uint_rval is unpackHalf2x16's input
1222    * \return unpackHalf2x16's output as a vec2 rvalue
1223    */
1224   ir_rvalue*
1225   lower_unpack_half_2x16(ir_rvalue *uint_rval)
1226   {
1227      /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
1228       *
1229       *    mediump vec2 unpackHalf2x16 (highp uint v)
1230       *    ------------------------------------------
1231       *    Returns a two-component floating-point vector with components
1232       *    obtained by unpacking a 32-bit unsigned integer into a pair of 16-bit
1233       *    values, interpreting those values as 16-bit floating-point numbers
1234       *    according to the OpenGL ES Specification, and converting them to
1235       *    32-bit floating-point values.
1236       *
1237       *    The first component of the vector is obtained from the
1238       *    16 least-significant bits of v; the second component is obtained
1239       *    from the 16 most-significant bits of v.
1240       */
1241      assert(uint_rval->type == glsl_type::uint_type);
1242
1243      /* uint u = RVALUE;
1244       * uvec2 f16 = uvec2(u.x & 0xffff, u.y >> 16);
1245       */
1246      ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type,
1247                                            "tmp_unpack_half_2x16_f16");
1248      factory.emit(assign(f16, unpack_uint_to_uvec2(uint_rval)));
1249
1250      /* uvec2 f32; */
1251      ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type,
1252                                            "tmp_unpack_half_2x16_f32");
1253
1254      /* Get f16's unshifted exponent bits.
1255       *
1256       *    uvec2 e = f16 & 0x7c00u;
1257       */
1258      ir_variable *e = factory.make_temp(glsl_type::uvec2_type,
1259                                          "tmp_unpack_half_2x16_e");
1260      factory.emit(assign(e, bit_and(f16, constant(0x7c00u))));
1261
1262      /* Get f16's unshifted mantissa bits.
1263       *
1264       *    uvec2 m = f16 & 0x03ffu;
1265       */
1266      ir_variable *m = factory.make_temp(glsl_type::uvec2_type,
1267                                          "tmp_unpack_half_2x16_m");
1268      factory.emit(assign(m, bit_and(f16, constant(0x03ffu))));
1269
1270      /* Set f32's exponent and mantissa bits.
1271       *
1272       *   f32.x = unpack_half_1x16_nosign(e.x, m.x);
1273       *   f32.y = unpack_half_1x16_nosign(e.y, m.y);
1274       */
1275      factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_x(e),
1276                                                       swizzle_x(m)),
1277                           WRITEMASK_X));
1278      factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_y(e),
1279                                                       swizzle_y(m)),
1280                           WRITEMASK_Y));
1281
1282      /* Set f32's sign bit.
1283       *
1284       *    f32 |= (f16 & 0x8000u) << 16u;
1285       */
1286      factory.emit(assign(f32, bit_or(f32,
1287                                       lshift(bit_and(f16,
1288                                                      constant(0x8000u)),
1289                                              constant(16u)))));
1290
1291      /* return bitcast_u2f(f32); */
1292      ir_rvalue *result = expr(ir_unop_bitcast_u2f, f32);
1293      assert(result->type == glsl_type::vec2_type);
1294      return result;
1295   }
1296};
1297
1298} // namespace anonymous
1299
1300/**
1301 * \brief Lower the builtin packing functions.
1302 *
1303 * \param op_mask is a bitmask of `enum lower_packing_builtins_op`.
1304 */
1305bool
1306lower_packing_builtins(exec_list *instructions, int op_mask)
1307{
1308   lower_packing_builtins_visitor v(op_mask);
1309   visit_list_elements(&v, instructions, true);
1310   return v.get_progress();
1311}
1312