1bf215546Sopenharmony_ci# -*- coding: utf-8 -*-
2bf215546Sopenharmony_ci#
3bf215546Sopenharmony_ci# Copyright (C) 2014 Intel Corporation
4bf215546Sopenharmony_ci#
5bf215546Sopenharmony_ci# Permission is hereby granted, free of charge, to any person obtaining a
6bf215546Sopenharmony_ci# copy of this software and associated documentation files (the "Software"),
7bf215546Sopenharmony_ci# to deal in the Software without restriction, including without limitation
8bf215546Sopenharmony_ci# the rights to use, copy, modify, merge, publish, distribute, sublicense,
9bf215546Sopenharmony_ci# and/or sell copies of the Software, and to permit persons to whom the
10bf215546Sopenharmony_ci# Software is furnished to do so, subject to the following conditions:
11bf215546Sopenharmony_ci#
12bf215546Sopenharmony_ci# The above copyright notice and this permission notice (including the next
13bf215546Sopenharmony_ci# paragraph) shall be included in all copies or substantial portions of the
14bf215546Sopenharmony_ci# Software.
15bf215546Sopenharmony_ci#
16bf215546Sopenharmony_ci# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17bf215546Sopenharmony_ci# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18bf215546Sopenharmony_ci# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19bf215546Sopenharmony_ci# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20bf215546Sopenharmony_ci# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21bf215546Sopenharmony_ci# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22bf215546Sopenharmony_ci# IN THE SOFTWARE.
23bf215546Sopenharmony_ci#
24bf215546Sopenharmony_ci# Authors:
25bf215546Sopenharmony_ci#    Jason Ekstrand (jason@jlekstrand.net)
26bf215546Sopenharmony_ci
27bf215546Sopenharmony_cifrom collections import OrderedDict
28bf215546Sopenharmony_ciimport nir_algebraic
29bf215546Sopenharmony_cifrom nir_opcodes import type_sizes
30bf215546Sopenharmony_ciimport itertools
31bf215546Sopenharmony_ciimport struct
32bf215546Sopenharmony_cifrom math import pi
33bf215546Sopenharmony_ciimport math
34bf215546Sopenharmony_ci
# Convenience variables: each single-letter name is bound to the string of
# the same letter, so rule tuples can refer to pattern variables unquoted.
a, b, c, d, e = 'a', 'b', 'c', 'd', 'e'
41bf215546Sopenharmony_ci
# Condition strings asking whether signed zero / Inf / NaN must be preserved
# for 16-bit and 32-bit floats; both are built from the same template.
signed_zero_inf_nan_preserve_16, signed_zero_inf_nan_preserve_32 = (
    'nir_is_float_control_signed_zero_inf_nan_preserve('
    'info->float_controls_execution_mode, {})'.format(bit_size)
    for bit_size in (16, 32)
)
44bf215546Sopenharmony_ci
# Local alias for nir_algebraic.ignore_exact; the rules below call it as
# ignore_exact(<op>, <srcs>...) in place of a plain expression tuple.
# NOTE(review): presumably this makes the pattern match regardless of the
# "exact" bit -- confirm against nir_algebraic.
ignore_exact = nir_algebraic.ignore_exact
46bf215546Sopenharmony_ci
47bf215546Sopenharmony_ci# Written in the form (<search>, <replace>) where <search> is an expression
48bf215546Sopenharmony_ci# and <replace> is either an expression or a value.  An expression is
49bf215546Sopenharmony_ci# defined as a tuple of the form ([~]<op>, <src0>, <src1>, <src2>, <src3>)
50bf215546Sopenharmony_ci# where each source is either an expression or a value.  A value can be
51bf215546Sopenharmony_ci# either a numeric constant or a string representing a variable name.
52bf215546Sopenharmony_ci#
53bf215546Sopenharmony_ci# If the opcode in a search expression is prefixed by a '~' character, this
54bf215546Sopenharmony_ci# indicates that the operation is inexact.  Such operations will only get
55bf215546Sopenharmony_ci# applied to SSA values that do not have the exact bit set.  This should be
# used by any optimizations that are not bit-for-bit exact.  It should not,
57bf215546Sopenharmony_ci# however, be used for backend-requested lowering operations as those need to
58bf215546Sopenharmony_ci# happen regardless of precision.
59bf215546Sopenharmony_ci#
60bf215546Sopenharmony_ci# Variable names are specified as "[#]name[@type][(cond)][.swiz]" where:
61bf215546Sopenharmony_ci# "#" indicates that the given variable will only match constants,
62bf215546Sopenharmony_ci# type indicates that the given variable will only match values from ALU
63bf215546Sopenharmony_ci#    instructions with the given output type,
64bf215546Sopenharmony_ci# (cond) specifies an additional condition function (see nir_search_helpers.h),
65bf215546Sopenharmony_ci# swiz is a swizzle applied to the variable (only in the <replace> expression)
66bf215546Sopenharmony_ci#
67bf215546Sopenharmony_ci# For constants, you have to be careful to make sure that it is the right
68bf215546Sopenharmony_ci# type because python is unaware of the source and destination types of the
69bf215546Sopenharmony_ci# opcodes.
70bf215546Sopenharmony_ci#
71bf215546Sopenharmony_ci# All expression types can have a bit-size specified.  For opcodes, this
72bf215546Sopenharmony_ci# looks like "op@32", for variables it is "a@32" or "a@uint32" to specify a
73bf215546Sopenharmony_ci# type and size.  In the search half of the expression this indicates that it
74bf215546Sopenharmony_ci# should only match that particular bit-size.  In the replace half of the
75bf215546Sopenharmony_ci# expression this indicates that the constructed value should have that
76bf215546Sopenharmony_ci# bit-size.
77bf215546Sopenharmony_ci#
78bf215546Sopenharmony_ci# If the opcode in a replacement expression is prefixed by a '!' character,
# this indicates that the new expression will be marked exact.
80bf215546Sopenharmony_ci#
81bf215546Sopenharmony_ci# A special condition "many-comm-expr" can be used with expressions to note
82bf215546Sopenharmony_ci# that the expression and its subexpressions have more commutative expressions
83bf215546Sopenharmony_ci# than nir_replace_instr can handle.  If this special condition is needed with
84bf215546Sopenharmony_ci# another condition, the two can be separated by a comma (e.g.,
85bf215546Sopenharmony_ci# "(many-comm-expr,is_used_once)").
86bf215546Sopenharmony_ci
87bf215546Sopenharmony_ci# based on https://web.archive.org/web/20180105155939/http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648
def lowered_sincos(c):
    """Build the expression tree for a polynomial sine/cosine approximation.

    The input is always the pattern variable ``a``.  ``c`` is a value (or
    expression) added to ``a / (2*pi)`` before the fractional part is taken,
    i.e. a phase offset measured in whole turns.

    Returns a nested expression tuple in the replacement format used by the
    rule tables in this file.
    """
    # Wrap the phase into one period and rescale it to [-1, 1).
    turns = ('fadd', ('fmul', 0.5 / pi, a), c)
    wrapped = ('fsub', ('fmul', 2.0, ('ffract', turns)), 1.0)

    # First approximation: 4 * (x - x*|x|).
    parabola = ('fmul', ('fsub', wrapped, ('fmul', wrapped, ('fabs', wrapped))), 4.0)

    # Refinement: y + 0.225 * (y*|y| - y).
    correction = ('ffma', parabola, ('fabs', parabola), ('fneg', parabola))
    return ('ffma', correction, 0.225, parabola)
92bf215546Sopenharmony_ci
def intBitsToFloat(i):
    """Reinterpret the 32-bit unsigned integer ``i`` as an IEEE-754 float.

    The integer is packed as a big-endian unsigned 32-bit value and the same
    four bytes are unpacked as a big-endian single-precision float.
    ``struct.error`` is raised if ``i`` does not fit in 32 unsigned bits.
    """
    raw_bytes = struct.pack('!I', i)
    (value,) = struct.unpack('!f', raw_bytes)
    return value
95bf215546Sopenharmony_ci
# Main table of algebraic rewrite rules.  Each entry is a tuple of
# (<search>, <replace>), optionally followed by a condition string; the
# conditions used here test fields of `options` or the float-controls
# execution mode read from `info`.
optimizations = [

   # Multiplication by a power-of-two constant becomes a shift.  The
   # top-bit constant of each bit size is matched explicitly.  With
   # lower_bitops set the direction is reversed: shifts by a constant
   # become multiplies.
   (('imul', a, '#b(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
   (('imul', 'a@8', 0x80), ('ishl', a, 7), '!options->lower_bitops'),
   (('imul', 'a@16', 0x8000), ('ishl', a, 15), '!options->lower_bitops'),
   (('imul', 'a@32', 0x80000000), ('ishl', a, 31), '!options->lower_bitops'),
   (('imul', 'a@64', 0x8000000000000000), ('ishl', a, 63), '!options->lower_bitops'),
   (('imul', a, '#b(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
   (('ishl', a, '#b'), ('imul', a, ('ishl', 1, b)), 'options->lower_bitops'),

   # 64-bit multiply by a constant accepted by is_bitcount2 becomes the sum
   # of two shifts.  The condition only holds when 64-bit imul is lowered
   # but 64-bit shifts are not.
   (('imul@64', a, '#b(is_bitcount2)'), ('iadd', ('ishl', a, ('ufind_msb', b)), ('ishl', a, ('find_lsb', b))),
    '!options->lower_bitops && (options->lower_int64_options & (nir_lower_imul64 | nir_lower_shift64)) == nir_lower_imul64'),

   # 32x32->64 multiplies: split_x of a single-use product is a plain imul,
   # and the multiplies themselves are split into imul + [iu]mul_high when
   # lower_mul_2x32_64 is set.
   (('unpack_64_2x32_split_x', ('imul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
   (('unpack_64_2x32_split_x', ('umul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
   (('imul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('imul_high', a, b)), 'options->lower_mul_2x32_64'),
   (('umul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('umul_high', a, b)), 'options->lower_mul_2x32_64'),
   # Division and remainder by 1, -1 and power-of-two constants.
   (('udiv', a, 1), a),
   (('idiv', a, 1), a),
   (('umod', a, 1), 0),
   (('imod', a, 1), 0),
   (('imod', a, -1), 0),
   (('irem', a, 1), 0),
   (('irem', a, -1), 0),
   (('udiv', a, '#b(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b)), '!options->lower_bitops'),
   (('idiv', a, '#b(is_pos_power_of_two)'), ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', b))), '!options->lower_bitops'),
   (('idiv', a, '#b(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), '!options->lower_bitops'),
   (('umod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'),
   (('imod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'),
   (('imod', a, '#b(is_neg_power_of_two)'), ('bcsel', ('ieq', ('ior', a, b), b), 0, ('ior', a, b)), '!options->lower_bitops'),
   # 'irem(a, b)' -> 'a - ((a < 0 ? (a + b - 1) : a) & -b)'
   (('irem', a, '#b(is_pos_power_of_two)'),
    ('isub', a, ('iand', ('bcsel', ('ilt', a, 0), ('iadd', a, ('isub', b, 1)), a), ('ineg', b))),
    '!options->lower_bitops'),
   (('irem', a, '#b(is_neg_power_of_two)'), ('irem', a, ('iabs', b)), '!options->lower_bitops'),

   # Redundant chains of negate / absolute value / conversion.
   (('~fneg', ('fneg', a)), a),
   (('ineg', ('ineg', a)), a),
   (('fabs', ('fneg', a)), ('fabs', a)),
   (('fabs', ('u2f', a)), ('u2f', a)),
   (('iabs', ('iabs', a)), ('iabs', a)),
   (('iabs', ('ineg', a)), ('iabs', a)),
   (('f2b', ('fneg', a)), ('f2b', a)),
   (('i2b', ('ineg', a)), ('i2b', a)),
   (('~fadd', a, 0.0), a),
   # a+0.0 is 'a' unless 'a' is denormal or -0.0. If it's only used by a
   # floating point instruction, they should flush any input denormals and we
   # can replace -0.0 with 0.0 if the float execution mode allows it.
   (('fadd(is_only_used_as_float)', 'a@16', 0.0), a, '!'+signed_zero_inf_nan_preserve_16),
   (('fadd(is_only_used_as_float)', 'a@32', 0.0), a, '!'+signed_zero_inf_nan_preserve_32),
   (('iadd', a, 0), a),
   (('iadd_sat', a, 0), a),
   (('isub_sat', a, 0), a),
   (('uadd_sat', a, 0), a),
   (('usub_sat', a, 0), a),
   (('usadd_4x8_vc4', a, 0), a),
   (('usadd_4x8_vc4', a, ~0), ~0),
   # Factor a common operand out of a sum of products (and the analogous
   # and/or forms).
   (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
   (('~fadd', ('fmulz', a, b), ('fmulz', a, c)), ('fmulz', a, ('fadd', b, c))),
   (('~ffma', a, b, ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),
   (('~ffma', a, b, ('fmul(is_used_once)', a, c)), ('fmul', a, ('fadd', b, c))),
   (('~fadd', ('fmul(is_used_once)', a, b), ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),
   (('~ffma', a, ('fmul(is_used_once)', b, c), ('fmul(is_used_once)', b, d)), ('fmul', b, ('ffma', a, c, d))),
   (('~ffmaz', a, b, ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)),
   (('~ffmaz', a, b, ('fmulz(is_used_once)', a, c)), ('fmulz', a, ('fadd', b, c))),
   (('~fadd', ('fmulz(is_used_once)', a, b), ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)),
   (('~ffmaz', a, ('fmulz(is_used_once)', b, c), ('fmulz(is_used_once)', b, d)), ('fmulz', b, ('ffmaz', a, c, d))),
   (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
   (('iand', ('ior', a, b), ('ior', a, c)), ('ior', a, ('iand', b, c))),
   (('ior', ('iand', a, b), ('iand', a, c)), ('iand', a, ('ior', b, c))),
   # A value added to its own negation cancels.
   (('~fadd', ('fneg', a), a), 0.0),
   (('iadd', ('ineg', a), a), 0),
   (('iadd', ('ineg', a), ('iadd', a, b)), b),
   (('iadd', a, ('iadd', ('ineg', a), b)), b),
   (('~fadd', ('fneg', a), ('fadd', a, b)), b),
   (('~fadd', a, ('fadd', ('fneg', a), b)), b),
   (('fadd', ('fsat', a), ('fsat', ('fneg', a))), ('fsat', ('fabs', a))),
   (('~fmul', a, 0.0), 0.0),
   # The only effect a*0.0 should have is when 'a' is infinity, -0.0 or NaN
   (('fmul', 'a@16', 0.0), 0.0, '!'+signed_zero_inf_nan_preserve_16),
   (('fmul', 'a@32', 0.0), 0.0, '!'+signed_zero_inf_nan_preserve_32),
   (('fmulz', a, 0.0), 0.0),
   (('fmulz', a, 'b(is_finite_not_zero)'), ('fmul', a, b), '!'+signed_zero_inf_nan_preserve_32),
   (('fmulz', 'a(is_finite)', 'b(is_finite)'), ('fmul', a, b)),
   (('fmulz', a, a), ('fmul', a, a)),
   (('ffmaz', a, 'b(is_finite_not_zero)', c), ('ffma', a, b, c), '!'+signed_zero_inf_nan_preserve_32),
   (('ffmaz', 'a(is_finite)', 'b(is_finite)', c), ('ffma', a, b, c)),
   (('ffmaz', a, a, b), ('ffma', a, a, b)),
   (('imul', a, 0), 0),
   (('umul_unorm_4x8_vc4', a, 0), 0),
   (('umul_unorm_4x8_vc4', a, ~0), a),
   (('~fmul', a, 1.0), a),
   (('~fmulz', a, 1.0), a),
   # The only effect a*1.0 can have is flushing denormals. If it's only used by
   # a floating point instruction, they should flush any input denormals and
   # this multiplication isn't needed.
   (('fmul(is_only_used_as_float)', a, 1.0), a),
   (('imul', a, 1), a),
   (('fmul', a, -1.0), ('fneg', a)),
   (('imul', a, -1), ('ineg', a)),
   # If a < 0: fsign(a)*a*a => -1*a*a => -a*a => abs(a)*a
   # If a > 0: fsign(a)*a*a => 1*a*a => a*a => abs(a)*a
   # If a == 0: fsign(a)*a*a => 0*0*0 => abs(0)*0
   # If a != a: fsign(a)*a*a => 0*NaN*NaN => abs(NaN)*NaN
   (('fmul', ('fsign', a), ('fmul', a, a)), ('fmul', ('fabs', a), a)),
   (('fmul', ('fmul', ('fsign', a), a), a), ('fmul', ('fabs', a), a)),
   # ffma / ffmaz with a constant 0.0, 1.0 or -1.0 source, or with two
   # constant multiplicands.
   (('~ffma', 0.0, a, b), b),
   (('ffma@16(is_only_used_as_float)', 0.0, a, b), b, '!'+signed_zero_inf_nan_preserve_16),
   (('ffma@32(is_only_used_as_float)', 0.0, a, b), b, '!'+signed_zero_inf_nan_preserve_32),
   (('ffmaz', 0.0, a, b), ('fadd', 0.0, b)),
   (('~ffma', a, b, 0.0), ('fmul', a, b)),
   (('ffma@16', a, b, 0.0), ('fmul', a, b), '!'+signed_zero_inf_nan_preserve_16),
   (('ffma@32', a, b, 0.0), ('fmul', a, b), '!'+signed_zero_inf_nan_preserve_32),
   (('ffmaz', a, b, 0.0), ('fmulz', a, b), '!'+signed_zero_inf_nan_preserve_32),
   (('ffma', 1.0, a, b), ('fadd', a, b)),
   (('ffmaz', 1.0, a, b), ('fadd', a, b), '!'+signed_zero_inf_nan_preserve_32),
   (('ffma', -1.0, a, b), ('fadd', ('fneg', a), b)),
   (('ffmaz', -1.0, a, b), ('fadd', ('fneg', a), b), '!'+signed_zero_inf_nan_preserve_32),
   (('~ffma', '#a', '#b', c), ('fadd', ('fmul', a, b), c)),
   (('~ffmaz', '#a', '#b', c), ('fadd', ('fmulz', a, b), c)),
   # flrp with a constant interpolant, identical endpoints or a zero first
   # endpoint.
   (('~flrp', a, b, 0.0), a),
   (('~flrp', a, b, 1.0), b),
   (('~flrp', a, a, b), a),
   (('~flrp', 0.0, a, b), ('fmul', a, b)),

   # flrp(a, a + b, c) => a + flrp(0, b, c) => a + (b * c)
   (('~flrp', a, ('fadd(is_used_once)', a, b), c), ('fadd', ('fmul', b, c), a)),

   # A dot-product-accumulate whose second vector source is the constant 0
   # reduces to its accumulator.
   (('sdot_4x8_iadd', a, 0, b), b),
   (('udot_4x8_uadd', a, 0, b), b),
   (('sdot_4x8_iadd_sat', a, 0, b), b),
   (('udot_4x8_uadd_sat', a, 0, b), b),
   (('sdot_2x16_iadd', a, 0, b), b),
   (('udot_2x16_uadd', a, 0, b), b),
   (('sdot_2x16_iadd_sat', a, 0, b), b),
   (('udot_2x16_uadd_sat', a, 0, b), b),

   # sudot_4x8_iadd is not commutative at all, so the patterns must be
   # duplicated with zeros on each of the first positions.
   (('sudot_4x8_iadd', a, 0, b), b),
   (('sudot_4x8_iadd', 0, a, b), b),
   (('sudot_4x8_iadd_sat', a, 0, b), b),
   (('sudot_4x8_iadd_sat', 0, a, b), b),

   # Fold a constant added to a single-use dot product into its constant
   # accumulator.
   (('iadd', ('sdot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_4x8_iadd', a, b, ('iadd', c, d))),
   (('iadd', ('udot_4x8_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_4x8_uadd', a, b, ('iadd', c, d))),
   (('iadd', ('sudot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sudot_4x8_iadd', a, b, ('iadd', c, d))),
   (('iadd', ('sdot_2x16_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_2x16_iadd', a, b, ('iadd', c, d))),
   (('iadd', ('udot_2x16_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_2x16_uadd', a, b, ('iadd', c, d))),

   # Try to let constant folding eliminate the dot-product part.  These are
   # safe because the dot product cannot overflow 32 bits.
   (('iadd', ('sdot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sdot_4x8_iadd', a, b, c)),
   (('iadd', ('udot_4x8_uadd', 'a(is_not_const)', b, 0), c), ('udot_4x8_uadd', a, b, c)),
   (('iadd', ('sudot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sudot_4x8_iadd', a, b, c)),
   (('iadd', ('sudot_4x8_iadd', a, 'b(is_not_const)', 0), c), ('sudot_4x8_iadd', a, b, c)),
   (('iadd', ('sdot_2x16_iadd', 'a(is_not_const)', b, 0), c), ('sdot_2x16_iadd', a, b, c)),
   (('iadd', ('udot_2x16_uadd', 'a(is_not_const)', b, 0), c), ('udot_2x16_uadd', a, b, c)),
   (('sdot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_4x8_iadd', a, b, 0), c)),
   (('udot_4x8_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_4x8_uadd', a, b, 0), c)),
   (('sudot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sudot_4x8_iadd', a, b, 0), c)),
   (('sdot_2x16_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_2x16_iadd', a, b, 0), c)),
   (('udot_2x16_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_2x16_uadd', a, b, 0), c)),
   (('sdot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
   (('udot_4x8_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_4x8_uadd', a, b, 0), c), '!options->lower_uadd_sat'),
   (('sudot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sudot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
   (('sdot_2x16_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_2x16_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
   (('udot_2x16_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_2x16_uadd', a, b, 0), c), '!options->lower_uadd_sat'),

   # Optimize open-coded fmulz.
   # (b==0.0 ? 0.0 : a) * (a==0.0 ? 0.0 : b) -> fmulz(a, b)
   (('fmul@32', ('bcsel', ignore_exact('feq', b, 0.0), 0.0, a), ('bcsel', ignore_exact('feq', a, 0.0), 0.0, b)),
    ('fmulz', a, b), 'options->has_fmulz && !'+signed_zero_inf_nan_preserve_32),
   (('fmul@32', a, ('bcsel', ignore_exact('feq', a, 0.0), 0.0, '#b(is_not_const_zero)')),
    ('fmulz', a, b), 'options->has_fmulz && !'+signed_zero_inf_nan_preserve_32),

   # ffma(b==0.0 ? 0.0 : a, a==0.0 ? 0.0 : b, c) -> ffmaz(a, b, c)
   (('ffma@32', ('bcsel', ignore_exact('feq', b, 0.0), 0.0, a), ('bcsel', ignore_exact('feq', a, 0.0), 0.0, b), c),
    ('ffmaz', a, b, c), 'options->has_fmulz && !'+signed_zero_inf_nan_preserve_32),
   (('ffma@32', a, ('bcsel', ignore_exact('feq', a, 0.0), 0.0, '#b(is_not_const_zero)'), c),
    ('ffmaz', a, b, c), 'options->has_fmulz && !'+signed_zero_inf_nan_preserve_32),
]
278bf215546Sopenharmony_ci
# Shorthand for the expansion of just the dot product part of the [iu]dp4a
# instructions (and their 2x16 counterparts): per-lane extracts multiplied
# together and summed, with no accumulator.
def _dot_expansion(extract_a, extract_b, lanes):
    """Return the iadd tree of per-lane products of ``a`` and ``b``.

    ``extract_a`` / ``extract_b`` are the extract opcodes applied to ``a`` and
    ``b`` for each lane index in ``range(lanes)``.  Products are summed
    pairwise, so four lanes yield (p0 + p1) + (p2 + p3) and two lanes yield
    p0 + p1.  ``lanes`` is expected to be a power of two.
    """
    terms = [('imul', (extract_a, a, lane), (extract_b, b, lane))
             for lane in range(lanes)]
    # Repeatedly combine neighbouring terms until a single tree remains.
    while len(terms) > 1:
        terms = [('iadd', terms[i], terms[i + 1])
                 for i in range(0, len(terms), 2)]
    return terms[0]


sdot_4x8_a_b = _dot_expansion('extract_i8', 'extract_i8', 4)
udot_4x8_a_b = _dot_expansion('extract_u8', 'extract_u8', 4)
sudot_4x8_a_b = _dot_expansion('extract_i8', 'extract_u8', 4)
sdot_2x16_a_b = _dot_expansion('extract_i16', 'extract_i16', 2)
udot_2x16_a_b = _dot_expansion('extract_u16', 'extract_u16', 2)
297bf215546Sopenharmony_ci
# Lowerings of the packed dot-product-accumulate opcodes.  Each rule is
# gated on the matching options->has_* flag being unset and expands the
# opcode into the extract/imul/iadd shorthand built above plus an add
# (saturating for the *_sat variants) of the accumulator c.
optimizations.extend([
   (('sdot_4x8_iadd', a, b, c), ('iadd', sdot_4x8_a_b, c), '!options->has_sdot_4x8'),
   (('udot_4x8_uadd', a, b, c), ('iadd', udot_4x8_a_b, c), '!options->has_udot_4x8'),
   (('sudot_4x8_iadd', a, b, c), ('iadd', sudot_4x8_a_b, c), '!options->has_sudot_4x8'),
   (('sdot_2x16_iadd', a, b, c), ('iadd', sdot_2x16_a_b, c), '!options->has_dot_2x16'),
   (('udot_2x16_uadd', a, b, c), ('iadd', udot_2x16_a_b, c), '!options->has_dot_2x16'),

   # For the unsigned dot-product, the largest possible value 4*(255*255) =
   # 0x3f804, so we don't have to worry about that intermediate result
   # overflowing.  0x100000000 - 0x3f804 = 0xfffc07fc.  If c is a constant
   # that is less than 0xfffc07fc, then the result cannot overflow ever.
   (('udot_4x8_uadd_sat', a, b, '#c(is_ult_0xfffc07fc)'), ('udot_4x8_uadd', a, b, c)),
   (('udot_4x8_uadd_sat', a, b, c), ('uadd_sat', udot_4x8_a_b, c), '!options->has_udot_4x8'),

   # For the signed dot-product, the largest positive value is 4*(-128*-128) =
   # 0x10000, and the largest negative value is 4*(-128*127) = -0xfe00.  We
   # don't have to worry about that intermediate result overflowing or
   # underflowing.
   (('sdot_4x8_iadd_sat', a, b, c), ('iadd_sat', sdot_4x8_a_b, c), '!options->has_sdot_4x8'),

   # Mixed signed/unsigned 4x8 variant, gated on has_sudot_4x8.
   (('sudot_4x8_iadd_sat', a, b, c), ('iadd_sat', sudot_4x8_a_b, c), '!options->has_sudot_4x8'),

   # Both 2x16 saturating variants share the single has_dot_2x16 flag.
   (('udot_2x16_uadd_sat', a, b, c), ('uadd_sat', udot_2x16_a_b, c), '!options->has_dot_2x16'),
   (('sdot_2x16_iadd_sat', a, b, c), ('iadd_sat', sdot_2x16_a_b, c), '!options->has_dot_2x16'),
])
323bf215546Sopenharmony_ci
# Float sizes
def _flrp_rules(bit_size):
    """Return the flrp-related rewrite rules specialized to one float bit size.

    ``bit_size`` is substituted into the sized opcode names ("~flrp@N",
    "~fadd@N", "~ffma@N") and into the options->lower_flrpN condition.
    The order of the returned rules is significant and must be kept.
    """
    flrp = '~flrp@{}'.format(bit_size)
    fadd = '~fadd@{}'.format(bit_size)
    ffma = '~ffma@{}'.format(bit_size)
    lowered = 'options->lower_flrp{}'.format(bit_size)
    not_lowered = '!' + lowered
    sel = ('b2f', 'c@1')  # b2f of a 1-bit c

    return [
        # flrp whose interpolant is a converted boolean is a plain select.
        ((flrp, a, b, sel), ('bcsel', c, b, a), lowered),

        # Simplify flrp with related endpoints when flrp will be lowered.
        ((flrp, a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), lowered),
        ((flrp, ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d),
         ('fadd', ('flrp', b, c, d), a), lowered),
        ((flrp, a, ('fmul(is_used_once)', a, b), c),
         ('fmul', ('flrp', 1.0, b, c), a), lowered),

        # Recognize open-coded a*(1-c) + b*c as flrp when flrp is kept.
        ((fadd, ('fmul', a, ('fadd', 1.0, ('fneg', c))), ('fmul', b, c)),
         ('flrp', a, b, c), not_lowered),
        # Saturated-interpolant variant of the rule above; it relies on
        # 1-fsat(x) <=> fsat(1-x).
        ((fadd, ('fmul', a, ('fsat', ('fadd', 1.0, ('fneg', c)))), ('fmul', b, ('fsat', c))),
         ('flrp', a, b, ('fsat', c)), not_lowered),
        # a + c*(b - a)
        ((fadd, a, ('fmul', c, ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), not_lowered),

        # Open-coded lerps by a converted boolean become selects.
        ((fadd, ('fmul', a, ('fadd', 1.0, ('fneg', sel))), ('fmul', b, ('b2f', c))),
         ('bcsel', c, b, a), lowered),
        ((fadd, a, ('fmul', sel, ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), lowered),

        ((ffma, a, ('fadd', 1.0, ('fneg', sel)), ('fmul', b, sel)), ('bcsel', c, b, a)),
        ((ffma, b, sel, ('ffma', ('fneg', a), sel, a)), ('bcsel', c, b, a)),

        # These two aren't flrp lowerings, but do appear in some shaders.
        ((ffma, sel, ('fadd', b, ('fneg', a)), a), ('bcsel', c, b, a)),
        ((ffma, sel, ('ffma', ('fneg', a), b, d), ('fmul', a, b)),
         ('bcsel', c, d, ('fmul', a, b))),

        # 1 - ((1 - a) * (1 - b))
        # 1 - (1 - a - b + a*b)
        # 1 - 1 + a + b - a*b
        # a + b - a*b
        # a + b*(1 - a)
        # b*(1 - a) + 1*a
        # flrp(b, 1, a)
        ((fadd, 1.0, ('fneg', ('fmul', ('fadd', 1.0, ('fneg', a)), ('fadd', 1.0, ('fneg', b))))),
         ('flrp', b, 1.0, a), not_lowered),
    ]


for s in [16, 32, 64]:
    optimizations.extend(_flrp_rules(s))
358bf215546Sopenharmony_ci
# Float lowering rules (flrp, ftrunc, ffloor, ffract, fceil, ffma), dot
# product simplifications and constant-shift reassociation.  Each entry is a
# tuple of a search expression, a replacement expression and, optionally, a
# string condition written in terms of `options`.
optimizations.extend([
   # Both flrp inputs are products sharing the factor a: interpolate b and c
   # first, then multiply by a once.
   (('~flrp', ('fmul(is_used_once)', a, b), ('fmul(is_used_once)', a, c), d), ('fmul', ('flrp', b, c, d), a)),

   # flrp(a, 0, c) = a*(1 - c) + 0*c = -a*c + a
   (('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)),
   # Lower ftrunc as ffloor of the magnitude, negated again when a < 0.
   (('ftrunc', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'),

   # ffloor(a) <-> a - ffract(a), in whichever direction the lowering options
   # ask for.  The 64-bit variants additionally consult lower_doubles_options.
   (('ffloor@16', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
   (('ffloor@32', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
   (('ffloor@64', a), ('fsub', a, ('ffract', a)), '(options->lower_ffloor || (options->lower_doubles_options & nir_lower_dfloor)) && !(options->lower_doubles_options & nir_lower_dfract)'),
   (('fadd@16', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
   (('fadd@32', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
   (('fadd@64', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor && !(options->lower_doubles_options & nir_lower_dfloor)'),
   # ffract(a) -> a - ffloor(a)
   (('ffract@16', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
   (('ffract@32', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
   (('ffract@64', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract || (options->lower_doubles_options & nir_lower_dfract)'),
   # fceil(a) -> -ffloor(-a)
   (('fceil', a), ('fneg', ('ffloor', ('fneg', a))), 'options->lower_fceil'),
   # Split ffma into fmul + fadd when the per-size lowering option is set.
   # ffmaz is gated on the 32-bit option.
   (('ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma16'),
   (('ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma32'),
   (('ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma64'),
   (('ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->lower_ffma32'),
   # Always lower inexact ffma, because it will be fused back by late optimizations (nir_opt_algebraic_late).
   (('~ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma16'),
   (('~ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma32'),
   (('~ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma64'),
   (('~ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->fuse_ffma32'),

   # iand(ineg(b2i(a)), b*c) is used as "a ? b*c : 0"; with constant d and e
   # the whole expression becomes a bcsel on a between (b*c + d)*e and d*e.
   (('~fmul', ('fadd', ('iand', ('ineg', ('b2i', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
    ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),

   # fdph(a, b) -> fdot4((a.x, a.y, a.z, 1.0), b)
   (('fdph', a, b), ('fdot4', ('vec4', 'a.x', 'a.y', 'a.z', 1.0), b), 'options->lower_fdph'),

   # A vec4 whose last component is 1.0 is the fdph pattern; trailing 0.0
   # components let the dot product shrink to fewer components.
   (('fdot4', ('vec4', a, b,   c,   1.0), d), ('fdph',  ('vec3', a, b, c), d), '!options->lower_fdph'),
   (('fdot4', ('vec4', a, 0.0, 0.0, 0.0), b), ('fmul', a, b)),
   (('fdot4', ('vec4', a, b,   0.0, 0.0), c), ('fdot2', ('vec2', a, b), c)),
   (('fdot4', ('vec4', a, b,   c,   0.0), d), ('fdot3', ('vec3', a, b, c), d)),

   (('fdot3', ('vec3', a, 0.0, 0.0), b), ('fmul', a, b)),
   (('fdot3', ('vec3', a, b,   0.0), c), ('fdot2', ('vec2', a, b), c)),

   (('fdot2', ('vec2', a, 0.0), b), ('fmul', a, b)),
   # Dot product with 1.0 is just the sum of the two components.
   (('fdot2', a, 1.0), ('fadd', 'a.x', 'a.y')),

   # Lower fdot to fsum when it is available
   (('fdot2', a, b), ('fsum2', ('fmul', a, b)), 'options->lower_fdot'),
   (('fdot3', a, b), ('fsum3', ('fmul', a, b)), 'options->lower_fdot'),
   (('fdot4', a, b), ('fsum4', ('fmul', a, b)), 'options->lower_fdot'),
   (('fsum2', a), ('fadd', 'a.x', 'a.y'), 'options->lower_fdot'),

   # If x >= 0 and x <= 1: fsat(1 - x) == 1 - fsat(x) trivially
   # If x < 0: 1 - fsat(x) => 1 - 0 => 1 and fsat(1 - x) => fsat(> 1) => 1
   # If x > 1: 1 - fsat(x) => 1 - 1 => 0 and fsat(1 - x) => fsat(< 0) => 0
   (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),

   # (a * #b + #c) << #d
   # ((a * #b) << #d) + (#c << #d)
   # (a * (#b << #d)) + (#c << #d)
   (('ishl', ('iadd', ('imul', a, '#b'), '#c'), '#d'),
    ('iadd', ('imul', a, ('ishl', b, d)), ('ishl', c, d))),

   # (a * #b) << #c
   # a * (#b << #c)
   (('ishl', ('imul', a, '#b'), '#c'), ('imul', a, ('ishl', b, c))),
])
422bf215546Sopenharmony_ci
# Care must be taken here.  Shifts in NIR use only the lower log2(bitsize)
# bits of the second source.  These replacements must correctly handle the
# case where (b % bitsize) + (c % bitsize) >= bitsize.
for s in [8, 16, 32, 64]:
   # Largest shift count representable at this bit size; also the mask that
   # extracts the effective shift count.
   mask = s - 1

   ishl, ishr, ushr = ('{}@{}'.format(op, s) for op in ('ishl', 'ishr', 'ushr'))

   # Sum of the two effective (masked) shift counts.
   shift_sum = ('iadd', ('iand', b, mask), ('iand', c, mask))
   in_bounds = ('ult', shift_sum, s)

   # Logical shifts: once the combined count reaches the bit size every bit
   # has been shifted out, so the result is 0.
   optimizations.append(
       ((ishl, (ishl, a, '#b'), '#c'), ('bcsel', in_bounds, (ishl, a, ('iadd', b, c)), 0)))
   optimizations.append(
       ((ushr, (ushr, a, '#b'), '#c'), ('bcsel', in_bounds, (ushr, a, ('iadd', b, c)), 0)))

   # To get -1 for large shifts of negative values, ishr must instead
   # clamp the shift count to the maximum value.
   optimizations.append(
       ((ishr, (ishr, a, '#b'), '#c'), (ishr, a, ('imin', shift_sum, mask))))
444bf215546Sopenharmony_ci
# Optimize a pattern of address calculation created by DXVK where the offset is
# divided by 4 and then multiplied by 4. This can be turned into an iand and the
# additions before can be reassociated to CSE the iand instruction.

for size in (8, 16, 32, 64):
    # All-ones constant of this bit size (0xff, 0xffff, ...).
    mask = (1 << size) - 1
    a_sz = 'a@%d' % size

    # 'a >> #b << #b' -> 'a & ~((1 << #b) - 1)'
    # The same replacement is emitted for the ushr and the ishr form.
    optimizations.extend(
       (('ishl', (shr, a_sz, '#b'), b), ('iand', a, ('ishl', mask, b)))
       for shr in ('ushr', 'ishr'))

    # 'a << #b >> #b' with a logical right shift keeps only the low bits.
    # This does not trivially work with ishr.
    optimizations.append(
       (('ushr', ('ishl', a_sz, '#b'), b), ('iand', a, ('ushr', mask, b))))
460bf215546Sopenharmony_ci
# Masking with -4 clears the two low bits.  These rules drop the mask when
# the constant operand satisfies the named predicate (judging by the helper
# names: a shift count of at least 2, or a multiplier that is a multiple of
# 4 -- confirm against the predicate definitions).
optimizations.append(
    (('iand', ('ishl', 'a@32', '#b(is_first_5_bits_uge_2)'), -4), ('ishl', a, b)))
optimizations.append(
    (('iand', ('imul', a, '#b(is_unsigned_multiple_of_4)'), -4), ('imul', a, b)))
465bf215546Sopenharmony_ci
for log2 in range(1, 7): # powers of two from 2 to 64
   v = 2 ** log2
   # 32-bit mask with the low log2 bits cleared.
   mask = 0xffffffff ^ (v - 1)
   b_is_multiple = '#b(is_unsigned_multiple_of_%d)' % v

   # Reassociate for improved CSE: the constant b passes the multiple-of-v
   # predicate, so the mask is applied to a alone and b is added afterwards.
   optimizations.append(
       (('iand@32', ('iadd@32', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)))
475bf215546Sopenharmony_ci
# To save space in the state tables, reduce to the set that is known to help.
# Previously, this was range(1, 32).  In addition, a couple rules inside the
# loop are commented out.  Revisit someday, probably after mesa/#2635 has some
# resolution.
for i in [1, 2, 16, 24]:
    # lo_mask keeps the low (32 - i) bits, hi_mask keeps bits i..31.
    lo_mask = (1 << (32 - i)) - 1
    hi_mask = 0xffffffff ^ ((1 << i) - 1)

    optimizations.extend([
        # Bits removed by the mask are shifted out anyway, so the mask is
        # redundant.  This pattern seems to only help in the soft-fp64 code.
        (('ishl@32', ('iand', 'a@32', lo_mask), i), ('ishl', a, i)),
#        (('ushr@32', ('iand', 'a@32', hi_mask), i), ('ushr', a, i)),
#        (('ishr@32', ('iand', 'a@32', hi_mask), i), ('ishr', a, i)),

        # The shift already zeroed every bit the mask would clear.
        (('iand', ('ishl', 'a@32', i), hi_mask), ('ishl', a, i)),
        (('iand', ('ushr', 'a@32', i), lo_mask), ('ushr', a, i)),
#        (('iand', ('ishr', 'a@32', i), lo_mask), ('ushr', a, i)), # Yes, ushr is correct
    ])
494bf215546Sopenharmony_ci
# Reassociation of constant adds through shifts and multiplies, followed by
# comparison, bcsel and min/max simplifications.  Each entry is a tuple of a
# search expression and a replacement expression, optionally followed by a
# string condition written in terms of `options`.
optimizations.extend([
   # This is common for address calculations.  Reassociating may enable the
   # 'a<<c' to be CSE'd.  It also helps architectures that have an ISHLADD
   # instruction or a constant offset field in load / store instructions.
   (('ishl', ('iadd', a, '#b'), '#c'), ('iadd', ('ishl', a, c), ('ishl', b, c))),

   # (a + #b) * #c => (a * #c) + (#b * #c)
   (('imul', ('iadd(is_used_once)', a, '#b'), '#c'), ('iadd', ('imul', a, c), ('imul', b, c))),

   # ((a + #b) + c) * #d => ((a + c) * #d) + (#b * #d)
   (('imul', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'),
    ('iadd', ('imul', ('iadd', a, c), d), ('imul', b, d))),
   (('ishl', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'),
    ('iadd', ('ishl', ('iadd', a, c), d), ('ishl', b, d))),

   # Comparison simplifications
   (('inot', ('flt(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('fge', a, b)),
   (('inot', ('fge(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('flt', a, b)),
   (('inot', ('feq(is_used_once)', a, b)), ('fneu', a, b)),
   (('inot', ('fneu(is_used_once)', a, b)), ('feq', a, b)),
   (('inot', ('ilt(is_used_once)', a, b)), ('ige', a, b)),
   (('inot', ('ult(is_used_once)', a, b)), ('uge', a, b)),
   (('inot', ('ige(is_used_once)', a, b)), ('ilt', a, b)),
   (('inot', ('uge(is_used_once)', a, b)), ('ult', a, b)),
   (('inot', ('ieq(is_used_once)', a, b)), ('ine', a, b)),
   (('inot', ('ine(is_used_once)', a, b)), ('ieq', a, b)),

   # Mutually exclusive comparisons can never both be true.
   (('iand', ('feq', a, b), ('fneu', a, b)), False),
   (('iand', ('flt', a, b), ('flt', b, a)), False),
   (('iand', ('ieq', a, b), ('ine', a, b)), False),
   (('iand', ('ilt', a, b), ('ilt', b, a)), False),
   (('iand', ('ult', a, b), ('ult', b, a)), False),

   # This helps some shaders because, after some optimizations, they end up
   # with patterns like (-a < -b) || (b < a).  In an ideal world, this sort of
   # matching would be handled by CSE.
   (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
   (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
   (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
   (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)),
   (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)),
   (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)),
   (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)),
   (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)),
   (('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)),
   (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),

   # b < fsat(NaN) -> b < 0 -> false, and b < NaN -> false.
   (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),

   # fsat(NaN) >= b -> 0 >= b -> false, and NaN >= b -> false.
   (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),

   # b == fsat(NaN) -> b == 0 -> false, and b == NaN -> false.
   (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),

   # b != fsat(NaN) -> b != 0 -> true, and b != NaN -> true.
   (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)),

   # fsat(NaN) >= 1 -> 0 >= 1 -> false, and NaN >= 1 -> false.
   (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),

   # 0 < fsat(NaN) -> 0 < 0 -> false, and 0 < NaN -> false.
   (('flt', 0.0, ('fsat(is_used_once)', a)), ('flt', 0.0, a)),

   # 0.0 >= b2f(a)
   # b2f(a) <= 0.0
   # b2f(a) == 0.0 because b2f(a) can only be 0 or 1
   # inot(a)
   (('fge', 0.0, ('b2f', 'a@1')), ('inot', a)),

   (('fge', ('fneg', ('b2f', 'a@1')), 0.0), ('inot', a)),

   # Float arithmetic on b2f results compared against zero (or each other)
   # is just boolean logic on the original 1-bit values.
   (('fneu', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)),
   (('fneu', ('bcsel', a, 1.0, ('b2f', 'b@1'))   , 0.0), ('ior', a, b)),
   (('fneu', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))),      ('ior', a, b)),
   (('fneu', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)),
   (('fneu', ('bcsel', a, ('b2f', 'b@1'), 0.0)   , 0.0), ('iand', a, b)),
   (('fneu', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ixor', a, b)),
   (('fneu',          ('b2f', 'a@1') ,          ('b2f', 'b@1') ),      ('ixor', a, b)),
   (('fneu', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))),      ('ixor', a, b)),
   (('feq', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))),
   (('feq', ('bcsel', a, 1.0, ('b2f', 'b@1'))   , 0.0), ('inot', ('ior', a, b))),
   (('feq', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))),      ('inot', ('ior', a, b))),
   (('feq', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('iand', a, b))),
   (('feq', ('bcsel', a, ('b2f', 'b@1'), 0.0)   , 0.0), ('inot', ('iand', a, b))),
   (('feq', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ieq', a, b)),
   (('feq',          ('b2f', 'a@1') ,          ('b2f', 'b@1') ),      ('ieq', a, b)),
   (('feq', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))),      ('ieq', a, b)),

   # -(b2f(a) + b2f(b)) < 0
   # 0 < b2f(a) + b2f(b)
   # 0 != b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
   # a || b
   (('flt', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('ior', a, b)),
   (('flt', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('ior', a, b)),

   # -(b2f(a) + b2f(b)) >= 0
   # 0 >= b2f(a) + b2f(b)
   # 0 == b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
   # !(a || b)
   (('fge', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('inot', ('ior', a, b))),
   (('fge', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('inot', ('ior', a, b))),

   # Comparing a value against its own negation only depends on its sign.
   (('flt', a, ('fneg', a)), ('flt', a, 0.0)),
   (('fge', a, ('fneg', a)), ('fge', a, 0.0)),

   # Some optimizations (below) convert things like (a < b || c < b) into
   # (min(a, c) < b).  However, this interferes with the previous optimizations
   # that try to remove comparisons with negated sums of b2f.  This just
   # breaks that apart.
   (('flt', ('fmin', c, ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')))), 0.0),
    ('ior', ('flt', c, 0.0), ('ior', a, b))),

   # (a + b) cmp a  ->  b cmp 0, and (a + #b) cmp #c  ->  a cmp (#c - #b).
   # These are inexact (~) rewrites.
   (('~flt', ('fadd', a, b), a), ('flt', b, 0.0)),
   (('~fge', ('fadd', a, b), a), ('fge', b, 0.0)),
   (('~feq', ('fadd', a, b), a), ('feq', b, 0.0)),
   (('~fneu', ('fadd', a, b), a), ('fneu', b, 0.0)),
   (('~flt',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('flt', a, ('fadd', c, ('fneg', b)))),
   (('~flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('flt', ('fneg', ('fadd', c, b)), a)),
   (('~fge',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('fge', a, ('fadd', c, ('fneg', b)))),
   (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fge', ('fneg', ('fadd', c, b)), a)),
   (('~feq',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('feq', a, ('fadd', c, ('fneg', b)))),
   (('~feq', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('feq', ('fneg', ('fadd', c, b)), a)),
   (('~fneu',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('fneu', a, ('fadd', c, ('fneg', b)))),
   (('~fneu', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fneu', ('fneg', ('fadd', c, b)), a)),

   # Cannot remove the addition from ilt or ige due to overflow.
   (('ieq', ('iadd', a, b), a), ('ieq', b, 0)),
   (('ine', ('iadd', a, b), a), ('ine', b, 0)),

   # A converted boolean compared against zero is the boolean (or its
   # negation).
   (('feq', ('b2f', 'a@1'), 0.0), ('inot', a)),
   (('fneu', ('b2f', 'a@1'), 0.0), a),
   (('ieq', ('b2i', 'a@1'), 0),   ('inot', a)),
   (('ine', ('b2i', 'a@1'), 0),   a),

   # Comparisons of an int-to-float conversion against 0.0 can be done on
   # the integer directly.
   (('fneu', ('u2f', a), 0.0), ('ine', a, 0)),
   (('feq', ('u2f', a), 0.0), ('ieq', a, 0)),
   (('fge', ('u2f', a), 0.0), True),
   (('fge', 0.0, ('u2f', a)), ('uge', 0, a)),    # ieq instead?
   (('flt', ('u2f', a), 0.0), False),
   (('flt', 0.0, ('u2f', a)), ('ult', 0, a)),    # ine instead?
   (('fneu', ('i2f', a), 0.0), ('ine', a, 0)),
   (('feq', ('i2f', a), 0.0), ('ieq', a, 0)),
   (('fge', ('i2f', a), 0.0), ('ige', a, 0)),
   (('fge', 0.0, ('i2f', a)), ('ige', 0, a)),
   (('flt', ('i2f', a), 0.0), ('ilt', a, 0)),
   (('flt', 0.0, ('i2f', a)), ('ilt', 0, a)),

   # 0.0 < fabs(a)
   # fabs(a) > 0.0
   # fabs(a) != 0.0 because fabs(a) must be >= 0
   # a != 0.0
   (('~flt', 0.0, ('fabs', a)), ('fneu', a, 0.0)),

   # -fabs(a) < 0.0
   # fabs(a) > 0.0
   (('~flt', ('fneg', ('fabs', a)), 0.0), ('fneu', a, 0.0)),

   # 0.0 >= fabs(a)
   # 0.0 == fabs(a)   because fabs(a) must be >= 0
   # 0.0 == a
   (('fge', 0.0, ('fabs', a)), ('feq', a, 0.0)),

   # -fabs(a) >= 0.0
   # 0.0 >= fabs(a)
   (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),

   # (a >= 0.0) && (a <= 1.0) -> fsat(a) == a
   #
   # This should be NaN safe.
   #
   # NaN >= 0 && 1 >= NaN -> false && false -> false
   #
   # vs.
   #
   # NaN == fsat(NaN) -> NaN == 0 -> false
   (('iand', ('fge', a, 0.0), ('fge', 1.0, a)), ('feq', a, ('fsat', a)), '!options->lower_fsat'),

   # Note: fmin(-a, -b) == -fmax(a, b)
   (('fmax',                        ('b2f(is_used_once)', 'a@1'),           ('b2f', 'b@1')),           ('b2f', ('ior', a, b))),
   (('fmax', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('iand', a, b)))),
   (('fmin',                        ('b2f(is_used_once)', 'a@1'),           ('b2f', 'b@1')),           ('b2f', ('iand', a, b))),
   (('fmin', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('ior', a, b)))),

   # fmin(b2f(a), b)
   # bcsel(a, fmin(b2f(a), b), fmin(b2f(a), b))
   # bcsel(a, fmin(b2f(True), b), fmin(b2f(False), b))
   # bcsel(a, fmin(1.0, b), fmin(0.0, b))
   #
   # Since b is a constant, constant folding will eliminate both fmin
   # instructions.  If b is > 1.0, the bcsel will be replaced with a b2f.
   (('fmin', ('b2f', 'a@1'), '#b'), ('bcsel', a, ('fmin', b, 1.0), ('fmin', b, 0.0))),

   # (a - b) < 0  ->  a < b
   (('flt', ('fadd(is_used_once)', a, ('fneg', b)), 0.0), ('flt', a, b)),

   # A bcsel that picks between the two operands of its own comparison is a
   # min or a max.
   (('~bcsel', ('flt', b, a), b, a), ('fmin', a, b)),
   (('~bcsel', ('flt', a, b), b, a), ('fmax', a, b)),
   (('~bcsel', ('fge', a, b), b, a), ('fmin', a, b)),
   (('~bcsel', ('fge', b, a), b, a), ('fmax', a, b)),
   (('bcsel', ('i2b', a), b, c), ('bcsel', ('ine', a, 0), b, c)),
   (('bcsel', ('inot', a), b, c), ('bcsel', a, c, b)),
   # Nested bcsel on the same condition: the inner selection is already
   # decided by the outer one.
   (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)),
   (('bcsel', a, b, ('bcsel', a, c, d)), ('bcsel', a, b, d)),
   # Two inner bcsels on b that share one arm: select on b first and push
   # the selection on a into the arm that differs.
   (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
   (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
   (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
   (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
   # bcsel with a constant or repeated boolean operand is plain ior / iand.
   (('bcsel', a, True, b), ('ior', a, b)),
   (('bcsel', a, a, b), ('ior', a, b)),
   (('bcsel', a, b, False), ('iand', a, b)),
   (('bcsel', a, b, a), ('iand', a, b)),
   # min / max of a value with itself, and unsigned min / max against the
   # extreme values 0 and -1 (all ones).
   (('~fmin', a, a), a),
   (('~fmax', a, a), a),
   (('imin', a, a), a),
   (('imax', a, a), a),
   (('umin', a, a), a),
   (('umin', a, 0), 0),
   (('umin', a, -1), a),
   (('umax', a, a), a),
   (('umax', a, 0), a),
   (('umax', a, -1), -1),
   # Repeating an operand in a chain of the same min / max is redundant.
   (('fmax', ('fmax', a, b), b), ('fmax', a, b)),
   (('umax', ('umax', a, b), b), ('umax', a, b)),
   (('imax', ('imax', a, b), b), ('imax', a, b)),
   (('fmin', ('fmin', a, b), b), ('fmin', a, b)),
   (('umin', ('umin', a, b), b), ('umin', a, b)),
   (('imin', ('imin', a, b), b), ('imin', a, b)),
   (('fmax', ('fmax', ('fmax', a, b), c), a), ('fmax', ('fmax', a, b), c)),
   (('umax', ('umax', ('umax', a, b), c), a), ('umax', ('umax', a, b), c)),
   (('imax', ('imax', ('imax', a, b), c), a), ('imax', ('imax', a, b), c)),
   (('fmin', ('fmin', ('fmin', a, b), c), a), ('fmin', ('fmin', a, b), c)),
   (('umin', ('umin', ('umin', a, b), c), a), ('umin', ('umin', a, b), c)),
   (('imin', ('imin', ('imin', a, b), c), a), ('imin', ('imin', a, b), c)),
])
731bf215546Sopenharmony_ci
# Comparing two 1-bit booleans after the same b2iN conversion is the same as
# comparing the booleans directly.
for N in [8, 16, 32, 64]:
    b2iN = 'b2i%d' % N
    optimizations.extend(
        ((cmp_op, (b2iN, 'a@1'), (b2iN, 'b@1')), (cmp_op, a, b))
        for cmp_op in ('ieq', 'ine'))
738bf215546Sopenharmony_ci
# Float (in)equality of two 1-bit booleans after the same b2fN conversion
# becomes the matching integer comparison on the booleans themselves.
for N in [16, 32, 64]:
    b2fN = 'b2f%d' % N
    optimizations.extend(
        ((float_op, (b2fN, 'a@1'), (b2fN, 'b@1')), (int_op, a, b))
        for float_op, int_op in (('feq', 'ieq'), ('fneu', 'ine')))
745bf215546Sopenharmony_ci
# Integer sizes
for s in [8, 16, 32, 64]:
    # Sized pattern variable and the sign-bit constant for this bit size.
    a_sz = 'a@{}'.format(s)
    sign_bit = 1 << (s - 1)

    optimizations.extend([
       # ishr by (s - 1) replicates the sign bit, so the inverted result is
       # all ones for non-negative a and zero for negative a.
       (('iand@{}'.format(s), a, ('inot', ('ishr', a, s - 1))), ('imax', a, 0)),

       # Simplify logic to detect sign of an integer.
       # ... by masking out the sign bit:
       (('ieq', ('iand', a_sz, sign_bit), 0),        ('ige', a, 0)),
       (('ine', ('iand', a_sz, sign_bit), sign_bit), ('ige', a, 0)),
       (('ine', ('iand', a_sz, sign_bit), 0),        ('ilt', a, 0)),
       (('ieq', ('iand', a_sz, sign_bit), sign_bit), ('ilt', a, 0)),
       # ... by a logical shift that leaves only the sign bit (0 or 1):
       (('ine', ('ushr', a_sz, s - 1), 0), ('ilt', a, 0)),
       (('ieq', ('ushr', a_sz, s - 1), 0), ('ige', a, 0)),
       (('ieq', ('ushr', a_sz, s - 1), 1), ('ilt', a, 0)),
       (('ine', ('ushr', a_sz, s - 1), 1), ('ige', a, 0)),
       # ... by an arithmetic shift that smears the sign bit (0 or -1):
       (('ine', ('ishr', a_sz, s - 1), 0), ('ilt', a, 0)),
       (('ieq', ('ishr', a_sz, s - 1), 0), ('ige', a, 0)),
       (('ieq', ('ishr', a_sz, s - 1), -1), ('ilt', a, 0)),
       (('ine', ('ishr', a_sz, s - 1), -1), ('ige', a, 0)),
    ])
765bf215546Sopenharmony_ci
766bf215546Sopenharmony_cioptimizations.extend([
767bf215546Sopenharmony_ci   (('fmin', a, ('fneg', a)), ('fneg', ('fabs', a))),
768bf215546Sopenharmony_ci   (('imin', a, ('ineg', a)), ('ineg', ('iabs', a))),
769bf215546Sopenharmony_ci   (('fmin', a, ('fneg', ('fabs', a))), ('fneg', ('fabs', a))),
770bf215546Sopenharmony_ci   (('imin', a, ('ineg', ('iabs', a))), ('ineg', ('iabs', a))),
771bf215546Sopenharmony_ci   (('~fmin', a, ('fabs', a)), a),
772bf215546Sopenharmony_ci   (('imin', a, ('iabs', a)), a),
773bf215546Sopenharmony_ci   (('~fmax', a, ('fneg', ('fabs', a))), a),
774bf215546Sopenharmony_ci   (('imax', a, ('ineg', ('iabs', a))), a),
775bf215546Sopenharmony_ci   (('fmax', a, ('fabs', a)), ('fabs', a)),
776bf215546Sopenharmony_ci   (('imax', a, ('iabs', a)), ('iabs', a)),
777bf215546Sopenharmony_ci   (('fmax', a, ('fneg', a)), ('fabs', a)),
778bf215546Sopenharmony_ci   (('imax', a, ('ineg', a)), ('iabs', a), '!options->lower_iabs'),
779bf215546Sopenharmony_ci   (('~fmax', ('fabs', a), 0.0), ('fabs', a)),
780bf215546Sopenharmony_ci   (('fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
781bf215546Sopenharmony_ci   # fmax(fmin(a, 1.0), 0.0) is inexact because it returns 1.0 on NaN, while
782bf215546Sopenharmony_ci   # fsat(a) returns 0.0.
783bf215546Sopenharmony_ci   (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
784bf215546Sopenharmony_ci   # fmin(fmax(a, -1.0), 0.0) is inexact because it returns -1.0 on NaN, while
785bf215546Sopenharmony_ci   # fneg(fsat(fneg(a))) returns -0.0 on NaN.
786bf215546Sopenharmony_ci   (('~fmin', ('fmax', a, -1.0),  0.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
787bf215546Sopenharmony_ci   # fmax(fmin(a, 0.0), -1.0) is inexact because it returns 0.0 on NaN, while
788bf215546Sopenharmony_ci   # fneg(fsat(fneg(a))) returns -0.0 on NaN. This only matters if
789bf215546Sopenharmony_ci   # SignedZeroInfNanPreserve is set, but we don't currently have any way of
790bf215546Sopenharmony_ci   # representing this in the optimizations other than the usual ~.
791bf215546Sopenharmony_ci   (('~fmax', ('fmin', a,  0.0), -1.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
792bf215546Sopenharmony_ci   # fsat(fsign(NaN)) = fsat(0) = 0, and b2f(0 < NaN) = b2f(False) = 0. Mark
793bf215546Sopenharmony_ci   # the new comparison precise to prevent it being changed to 'a != 0'.
794bf215546Sopenharmony_ci   (('fsat', ('fsign', a)), ('b2f', ('!flt', 0.0, a))),
795bf215546Sopenharmony_ci   (('fsat', ('b2f', a)), ('b2f', a)),
796bf215546Sopenharmony_ci   (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
797bf215546Sopenharmony_ci   (('fsat', ('fsat', a)), ('fsat', a)),
798bf215546Sopenharmony_ci   (('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_fsat'),
799bf215546Sopenharmony_ci   (('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_fsat'),
800bf215546Sopenharmony_ci   (('fsat', ('fneg(is_used_once)', ('fmulz(is_used_once)', a, b))), ('fsat', ('fmulz', ('fneg', a), b)), '!options->lower_fsat && !'+signed_zero_inf_nan_preserve_32),
801bf215546Sopenharmony_ci   (('fsat', ('fabs(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fabs', a), ('fabs', b))), '!options->lower_fsat'),
802bf215546Sopenharmony_ci   (('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)),
803bf215546Sopenharmony_ci   (('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)),
804bf215546Sopenharmony_ci   (('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)),
805bf215546Sopenharmony_ci   # Both the left and right patterns are "b" when isnan(a), so this is exact.
806bf215546Sopenharmony_ci   (('fmax', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmax', a, b))),
807bf215546Sopenharmony_ci   # The left pattern is 0.0 when isnan(a) (because fmin(fsat(NaN), b) ->
808bf215546Sopenharmony_ci   # fmin(0.0, b)) while the right one is "b", so this optimization is inexact.
809bf215546Sopenharmony_ci   (('~fmin', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmin', a, b))),
810bf215546Sopenharmony_ci
811bf215546Sopenharmony_ci   # max(-min(b, a), b) -> max(abs(b), -a)
812bf215546Sopenharmony_ci   # min(-max(b, a), b) -> min(-abs(b), -a)
813bf215546Sopenharmony_ci   (('fmax', ('fneg', ('fmin', b, a)), b), ('fmax', ('fabs', b), ('fneg', a))),
814bf215546Sopenharmony_ci   (('fmin', ('fneg', ('fmax', b, a)), b), ('fmin', ('fneg', ('fabs', b)), ('fneg', a))),
815bf215546Sopenharmony_ci
816bf215546Sopenharmony_ci   # If a in [0,b] then b-a is also in [0,b].  Since b in [0,1], max(b-a, 0) =
817bf215546Sopenharmony_ci   # fsat(b-a).
818bf215546Sopenharmony_ci   #
819bf215546Sopenharmony_ci   # If a > b, then b-a < 0 and max(b-a, 0) = fsat(b-a) = 0
820bf215546Sopenharmony_ci   #
821bf215546Sopenharmony_ci   # This should be NaN safe since max(NaN, 0) = fsat(NaN) = 0.
822bf215546Sopenharmony_ci   (('fmax', ('fadd(is_used_once)', ('fneg', 'a(is_not_negative)'), '#b(is_zero_to_one)'), 0.0),
823bf215546Sopenharmony_ci    ('fsat', ('fadd', ('fneg',  a), b)), '!options->lower_fsat'),
824bf215546Sopenharmony_ci
825bf215546Sopenharmony_ci   (('extract_u8', ('imin', ('imax', a, 0), 0xff), 0), ('imin', ('imax', a, 0), 0xff)),
826bf215546Sopenharmony_ci
827bf215546Sopenharmony_ci   # The ior versions are exact because fmin and fmax will always pick a
828bf215546Sopenharmony_ci   # non-NaN value, if one exists.  Therefore (a < NaN) || (a < c) == a <
829bf215546Sopenharmony_ci   # fmax(NaN, c) == a < c.  Mark the fmin or fmax in the replacement as exact
   # to prevent other optimizations from ruining the "NaN cleansing" property
831bf215546Sopenharmony_ci   # of the fmin or fmax.
832bf215546Sopenharmony_ci   (('ior', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('!fmax', b, c))),
833bf215546Sopenharmony_ci   (('ior', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('!fmin', a, b), c)),
834bf215546Sopenharmony_ci   (('ior', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('!fmin', b, c))),
835bf215546Sopenharmony_ci   (('ior', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('!fmax', a, b), c)),
836bf215546Sopenharmony_ci   (('ior', ('flt', a, '#b'), ('flt', a, '#c')), ('flt', a, ('!fmax', b, c))),
837bf215546Sopenharmony_ci   (('ior', ('flt', '#a', c), ('flt', '#b', c)), ('flt', ('!fmin', a, b), c)),
838bf215546Sopenharmony_ci   (('ior', ('fge', a, '#b'), ('fge', a, '#c')), ('fge', a, ('!fmin', b, c))),
839bf215546Sopenharmony_ci   (('ior', ('fge', '#a', c), ('fge', '#b', c)), ('fge', ('!fmax', a, b), c)),
840bf215546Sopenharmony_ci   (('~iand', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('fmin', b, c))),
841bf215546Sopenharmony_ci   (('~iand', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('fmax', a, b), c)),
842bf215546Sopenharmony_ci   (('~iand', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('fmax', b, c))),
843bf215546Sopenharmony_ci   (('~iand', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('fmin', a, b), c)),
844bf215546Sopenharmony_ci   (('iand', ('flt', a, '#b(is_a_number)'), ('flt', a, '#c(is_a_number)')), ('flt', a, ('fmin', b, c))),
845bf215546Sopenharmony_ci   (('iand', ('flt', '#a(is_a_number)', c), ('flt', '#b(is_a_number)', c)), ('flt', ('fmax', a, b), c)),
846bf215546Sopenharmony_ci   (('iand', ('fge', a, '#b(is_a_number)'), ('fge', a, '#c(is_a_number)')), ('fge', a, ('fmax', b, c))),
847bf215546Sopenharmony_ci   (('iand', ('fge', '#a(is_a_number)', c), ('fge', '#b(is_a_number)', c)), ('fge', ('fmin', a, b), c)),
848bf215546Sopenharmony_ci
849bf215546Sopenharmony_ci   (('ior', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imax', b, c))),
850bf215546Sopenharmony_ci   (('ior', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imin', a, b), c)),
851bf215546Sopenharmony_ci   (('ior', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imin', b, c))),
852bf215546Sopenharmony_ci   (('ior', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imax', a, b), c)),
853bf215546Sopenharmony_ci   (('ior', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umax', b, c))),
854bf215546Sopenharmony_ci   (('ior', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umin', a, b), c)),
855bf215546Sopenharmony_ci   (('ior', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umin', b, c))),
856bf215546Sopenharmony_ci   (('ior', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umax', a, b), c)),
857bf215546Sopenharmony_ci   (('iand', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imin', b, c))),
858bf215546Sopenharmony_ci   (('iand', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imax', a, b), c)),
859bf215546Sopenharmony_ci   (('iand', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imax', b, c))),
860bf215546Sopenharmony_ci   (('iand', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imin', a, b), c)),
861bf215546Sopenharmony_ci   (('iand', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umin', b, c))),
862bf215546Sopenharmony_ci   (('iand', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umax', a, b), c)),
863bf215546Sopenharmony_ci   (('iand', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umax', b, c))),
864bf215546Sopenharmony_ci   (('iand', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umin', a, b), c)),
865bf215546Sopenharmony_ci
866bf215546Sopenharmony_ci   # A number of shaders contain a pattern like a.x < 0.0 || a.x > 1.0 || a.y
867bf215546Sopenharmony_ci   # < 0.0, || a.y > 1.0 || ...  These patterns rearrange and replace in a
868bf215546Sopenharmony_ci   # single step.  Doing just the replacement can lead to an infinite loop as
869bf215546Sopenharmony_ci   # the pattern is repeatedly applied to the result of the previous
870bf215546Sopenharmony_ci   # application of the pattern.
871bf215546Sopenharmony_ci   (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, c), d), ('flt', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)),
872bf215546Sopenharmony_ci   (('ior', ('ior(is_used_once)', ('flt', a, c), d), ('flt(is_used_once)', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)),
873bf215546Sopenharmony_ci   (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, b), d), ('flt', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)),
874bf215546Sopenharmony_ci   (('ior', ('ior(is_used_once)', ('flt', a, b), d), ('flt(is_used_once)', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)),
875bf215546Sopenharmony_ci
876bf215546Sopenharmony_ci   # This is how SpvOpFOrdNotEqual might be implemented.  If both values are
877bf215546Sopenharmony_ci   # numbers, then it can be replaced with fneu.
878bf215546Sopenharmony_ci   (('ior', ('flt', 'a(is_a_number)', 'b(is_a_number)'), ('flt', b, a)), ('fneu', a, b)),
879bf215546Sopenharmony_ci])
880bf215546Sopenharmony_ci
# Float sizes
#
# Registers the rules that have to be instantiated once per floating-point
# bit size (16, 32 and 64), followed by the conversion-chain rules that
# depend on a (smaller, bigger) pair of bit sizes.
for s in [16, 32, 64]:
    optimizations.extend([
       # These derive from the previous patterns with the application of b < 0 <=>
       # 0 < -b.  The transformation should be applied if either comparison is
       # used once as this ensures that the number of comparisons will not
       # increase.  The sources to the ior and iand are not symmetric, so the
       # rules have to be duplicated to get this behavior.
       (('ior', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
       (('ior', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
       (('ior', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
       (('ior', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
       (('~iand', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
       (('~iand', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
       (('~iand', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),
       (('~iand', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),

       # The (i2f32, ...) part is an open-coded fsign.  When that is combined
       # with the bcsel, it's basically copysign(1.0, a).  There are some
       # behavior differences between this pattern and copysign w.r.t. ±0 and
       # NaN.  copysign(x, y) blindly takes the sign bit from y and applies it
       # to x, regardless of whether either or both values are NaN.
       #
       # If a != a: bcsel(False, 1.0, i2f(b2i(False) - b2i(False))) = 0,
       #            int(NaN >= 0.0) - int(NaN < 0.0) = 0 - 0 = 0
       # If a == ±0: bcsel(True, 1.0, ...) = 1.0,
       #            int(±0.0 >= 0.0) - int(±0.0 < 0.0) = 1 - 0 = 1
       #
       # For all other values of 'a', the original and replacement behave as
       # copysign.
       #
       # Marking the replacement comparisons as precise prevents any future
       # optimizations from replacing either of the comparisons with the
       # logical-not of the other.
       #
       # Note: Use b2i32 in the replacement because some platforms that
       # support fp16 don't support int16.
       (('bcsel@{}'.format(s), ('feq', a, 0.0), 1.0, ('i2f{}'.format(s), ('iadd', ('b2i{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))))),
        ('i2f{}'.format(s), ('iadd', ('b2i32', ('!fge', a, 0.0)), ('ineg', ('b2i32', ('!flt', a, 0.0)))))),

       (('bcsel', a, ('b2f(is_used_once)', 'b@{}'.format(s)), ('b2f', 'c@{}'.format(s))), ('b2f', ('bcsel', a, b, c))),

       # The C spec says, "If the value of the integral part cannot be represented
       # by the integer type, the behavior is undefined."  "Undefined" can mean
       # "the conversion doesn't happen at all."
       (('~i2f{}'.format(s), ('f2i', 'a@{}'.format(s))), ('ftrunc', a)),

       # Ironically, mark these as imprecise because removing the conversions may
       # preserve more precision than doing the conversions (e.g.,
       # uint(float(0x81818181u)) == 0x81818200).
       (('~f2i{}'.format(s), ('i2f', 'a@{}'.format(s))), a),
       (('~f2i{}'.format(s), ('u2f', 'a@{}'.format(s))), a),
       (('~f2u{}'.format(s), ('i2f', 'a@{}'.format(s))), a),
       (('~f2u{}'.format(s), ('u2f', 'a@{}'.format(s))), a),

       (('fadd', ('b2f{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('fneg', ('b2f{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))), ('fsign', a), '!options->lower_fsign'),
       (('iadd', ('b2i{}'.format(s), ('flt', 0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0)))), ('f2i{}'.format(s), ('fsign', a)), '!options->lower_fsign'),

       # The conversion-chain rules below must stay inside the list handed to
       # extend(): written as stand-alone statements in the loop body they
       # would be evaluated and thrown away without ever being registered.

       # float? -> float? -> floatS ==> float? -> floatS
       (('~f2f{}'.format(s), ('f2f', a)), ('f2f{}'.format(s), a)),

       # int? -> float? -> floatS ==> int? -> floatS
       (('~f2f{}'.format(s), ('u2f', a)), ('u2f{}'.format(s), a)),
       (('~f2f{}'.format(s), ('i2f', a)), ('i2f{}'.format(s), a)),

       # float? -> float? -> intS ==> float? -> intS
       (('~f2u{}'.format(s), ('f2f', a)), ('f2u{}'.format(s), a)),
       (('~f2i{}'.format(s), ('f2f', a)), ('f2i{}'.format(s), a)),
    ])

    # Rules that involve two distinct bit sizes: s is the smaller one and B
    # the bigger one, so only (s, B) pairs with s < B are instantiated.
    for B in [32, 64]:
        if s < B:
            optimizations.extend([
               # S = smaller, B = bigger
               # typeS -> typeB -> typeS ==> identity
               (('f2f{}'.format(s), ('f2f{}'.format(B), 'a@{}'.format(s))), a),
               (('i2i{}'.format(s), ('i2i{}'.format(B), 'a@{}'.format(s))), a),
               (('u2u{}'.format(s), ('u2u{}'.format(B), 'a@{}'.format(s))), a),

               # bool1 -> typeB -> typeS ==> bool1 -> typeS
               (('f2f{}'.format(s), ('b2f{}'.format(B), 'a@1')), ('b2f{}'.format(s), a)),
               (('i2i{}'.format(s), ('b2i{}'.format(B), 'a@1')), ('b2i{}'.format(s), a)),
               (('u2u{}'.format(s), ('b2i{}'.format(B), 'a@1')), ('b2i{}'.format(s), a)),

               # floatS -> floatB -> intB ==> floatS -> intB
               (('f2u{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2u{}'.format(B), a)),
               (('f2i{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2i{}'.format(B), a)),

               # int? -> floatB -> floatS ==> int? -> floatS
               (('f2f{}'.format(s), ('u2f{}'.format(B), a)), ('u2f{}'.format(s), a)),
               (('f2f{}'.format(s), ('i2f{}'.format(B), a)), ('i2f{}'.format(s), a)),

               # intS -> intB -> floatB ==> intS -> floatB
               (('u2f{}'.format(B), ('u2u{}'.format(B), 'a@{}'.format(s))), ('u2f{}'.format(B), a)),
               (('i2f{}'.format(B), ('i2i{}'.format(B), 'a@{}'.format(s))), ('i2f{}'.format(B), a)),
            ])
977bf215546Sopenharmony_ci
# mediump variants of the above
#
# Each group below yields the unsigned-flavoured rule followed by the
# signed-flavoured one, so the registration order is unchanged.
optimizations.extend(
    # int32 -> float32 -> float16 ==> int32 -> float16
    [(('f2fmp', (conv + '32', 'a@32')), (conv + 'mp', a))
     for conv in ('u2f', 'i2f')] +

    # float32 -> float16 -> int16 ==> float32 -> int16
    [((conv + '16', ('f2fmp', 'a@32')), (conv + '16', a))
     for conv in ('f2u', 'f2i')] +

    # float32 -> int32 -> int16 ==> float32 -> int16
    [(('i2imp', (conv + '32', 'a@32')), (conv + 'mp', a))
     for conv in ('f2u', 'f2i')] +

    # int32 -> int16 -> float16 ==> int32 -> float16
    [((conv + '16', ('i2imp', 'a@32')), (conv + '16', a))
     for conv in ('u2f', 'i2f')]
)
996bf215546Sopenharmony_ci
# Clean up junk left from 8-bit integer to 16-bit integer lowering.
#
# u2u16(u2u8(X)) just masks off the upper 8 bits of X.  The same effect is
# obtained by masking the upper 8 bits of the immediate operand of the iand
# instead, so both search patterns share a single replacement.  Often both
# patterns end up being applied to the same original expression tree.
optimizations.extend(
    (search, ('iand', a, ('iand', b, 0xff)))
    for search in (
        ('iand', ('u2u16', ('u2u8', 'a@16')), '#b'),
        ('u2u16', ('u2u8(is_used_once)', ('iand', 'a@16', '#b'))),
    )
)
1006bf215546Sopenharmony_ci
# Rules instantiated once per bitwise logic opcode.
for op in ['iand', 'ior', 'ixor']:
    optimizations.extend(
        # Narrowing the operands to 8 bits and widening them again before the
        # logic op is redundant when the result is narrowed to 8 bits anyway.
        [(('u2u8', (op, ('u2u16', ('u2u8', 'a@{}'.format(bits))), ('u2u16', ('u2u8', 'b@{}'.format(bits))))),
          ('u2u8', (op, a, b)))
         for bits in (16, 32)] +

        # Undistribute extract from a logic op
        [((op, (extract, a, '#b'), (extract, c, b)), (extract, (op, a, c), b))
         for extract in ('extract_i8', 'extract_u8', 'extract_i16', 'extract_u16')] +

        # Undistribute shifts from a logic op
        [((op, (shift + '(is_used_once)', a, '#b'), (shift, c, b)), (shift, (op, a, c), b))
         for shift in ('ushr', 'ishr', 'ishl')]
    )
1023bf215546Sopenharmony_ci
# Integer sizes
#
# Rules that have to be instantiated once per integer bit size.
for s in [8, 16, 32, 64]:
    # log2(s) - 1: used below as the amount by which a one-bit value is
    # shifted left to form a shift count.
    last_shift_bit = int(math.log2(s)) - 1

    optimizations.extend(
        [
            # Combine a pair of comparisons against zero into one comparison
            # of ior/umin/umax, depending on what the backend lowers.
            (('iand', ('ieq', 'a@%d' % s, 0), ('ieq', 'b@%d' % s, 0)), ('ieq', ('ior', a, b), 0), 'options->lower_umax'),
            (('ior',  ('ine', 'a@%d' % s, 0), ('ine', 'b@%d' % s, 0)), ('ine', ('ior', a, b), 0), 'options->lower_umin'),
            (('iand', ('ieq', 'a@%d' % s, 0), ('ieq', 'b@%d' % s, 0)), ('ieq', ('umax', a, b), 0), '!options->lower_umax'),
            (('ior',  ('ieq', 'a@%d' % s, 0), ('ieq', 'b@%d' % s, 0)), ('ieq', ('umin', a, b), 0), '!options->lower_umin'),
            (('iand', ('ine', 'a@%d' % s, 0), ('ine', 'b@%d' % s, 0)), ('ine', ('umin', a, b), 0), '!options->lower_umin'),
            (('ior',  ('ine', 'a@%d' % s, 0), ('ine', 'b@%d' % s, 0)), ('ine', ('umax', a, b), 0), '!options->lower_umax'),

            # True/False are ~0 and 0 in NIR.  b2i of True is 1, and -1 is ~0 (True).
            (('ineg', ('b2i%d' % s, 'a@%d' % s)), a),
        ] +

        # SM5 32-bit shifts are defined to use the 5 least significant bits
        # (or 4 bits for 16 bits), so masking the shift count with s - 1 is
        # redundant.
        [((shift, 'a@%d' % s, ('iand', s - 1, b)), (shift, a, b))
         for shift in ('ishl', 'ishr', 'ushr')] +

        [
            # The iand with 1 is dropped before the value is shifted up by
            # last_shift_bit to form the shift count.
            (('ushr', 'a@%d' % s, ('ishl(is_used_once)', ('iand', b, 1), last_shift_bit)), ('ushr', a, ('ishl', b, last_shift_bit))),
        ]
    )
1045bf215546Sopenharmony_ci
1046bf215546Sopenharmony_cioptimizations.extend([
1047bf215546Sopenharmony_ci   # Common pattern like 'if (i == 0 || i == 1 || ...)'
1048bf215546Sopenharmony_ci   (('ior', ('ieq', a, 0), ('ieq', a, 1)), ('uge', 1, a)),
1049bf215546Sopenharmony_ci   (('ior', ('uge', 1, a), ('ieq', a, 2)), ('uge', 2, a)),
1050bf215546Sopenharmony_ci   (('ior', ('uge', 2, a), ('ieq', a, 3)), ('uge', 3, a)),
1051bf215546Sopenharmony_ci
1052bf215546Sopenharmony_ci   (('ior', a, ('ieq', a, False)), True),
1053bf215546Sopenharmony_ci   (('ior', a, ('inot', a)), -1),
1054bf215546Sopenharmony_ci
1055bf215546Sopenharmony_ci   (('ine', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), ('ine', a, b)),
1056bf215546Sopenharmony_ci   (('b2i', ('ine', 'a@1', 'b@1')), ('b2i', ('ixor', a, b))),
1057bf215546Sopenharmony_ci
   # This pattern occurs courtesy of __flt64_nonnan in the soft-fp64 code.
1059bf215546Sopenharmony_ci   # The first part of the iand comes from the !__feq64_nonnan.
1060bf215546Sopenharmony_ci   #
1061bf215546Sopenharmony_ci   # The second pattern is a reformulation of the first based on the relation
1062bf215546Sopenharmony_ci   # (a == 0 || y == 0) <=> umin(a, y) == 0, where b in the first equation
1063bf215546Sopenharmony_ci   # happens to be y == 0.
1064bf215546Sopenharmony_ci   (('iand', ('inot', ('iand', ('ior', ('ieq', a, 0),  b), c)), ('ilt', a, 0)),
1065bf215546Sopenharmony_ci    ('iand', ('inot', ('iand',                         b , c)), ('ilt', a, 0))),
1066bf215546Sopenharmony_ci   (('iand', ('inot', ('iand', ('ieq', ('umin', a, b), 0), c)), ('ilt', a, 0)),
1067bf215546Sopenharmony_ci    ('iand', ('inot', ('iand', ('ieq',             b , 0), c)), ('ilt', a, 0))),
1068bf215546Sopenharmony_ci
1069bf215546Sopenharmony_ci   # These patterns can result when (a < b || a < c) => (a < min(b, c))
1070bf215546Sopenharmony_ci   # transformations occur before constant propagation and loop-unrolling.
1071bf215546Sopenharmony_ci   #
1072bf215546Sopenharmony_ci   # The flt versions are exact.  If isnan(a), the original pattern is
1073bf215546Sopenharmony_ci   # trivially false, and the replacements are false too.  If isnan(b):
1074bf215546Sopenharmony_ci   #
1075bf215546Sopenharmony_ci   #    a < fmax(NaN, a) => a < a => false vs a < NaN => false
1076bf215546Sopenharmony_ci   (('flt', a, ('fmax', b, a)), ('flt', a, b)),
1077bf215546Sopenharmony_ci   (('flt', ('fmin', a, b), a), ('flt', b, a)),
1078bf215546Sopenharmony_ci   (('~fge', a, ('fmin', b, a)), True),
1079bf215546Sopenharmony_ci   (('~fge', ('fmax', a, b), a), True),
1080bf215546Sopenharmony_ci   (('flt', a, ('fmin', b, a)), False),
1081bf215546Sopenharmony_ci   (('flt', ('fmax', a, b), a), False),
1082bf215546Sopenharmony_ci   (('~fge', a, ('fmax', b, a)), ('fge', a, b)),
1083bf215546Sopenharmony_ci   (('~fge', ('fmin', a, b), a), ('fge', b, a)),
1084bf215546Sopenharmony_ci
1085bf215546Sopenharmony_ci   (('ilt', a, ('imax', b, a)), ('ilt', a, b)),
1086bf215546Sopenharmony_ci   (('ilt', ('imin', a, b), a), ('ilt', b, a)),
1087bf215546Sopenharmony_ci   (('ige', a, ('imin', b, a)), True),
1088bf215546Sopenharmony_ci   (('ige', ('imax', a, b), a), True),
1089bf215546Sopenharmony_ci   (('ult', a, ('umax', b, a)), ('ult', a, b)),
1090bf215546Sopenharmony_ci   (('ult', ('umin', a, b), a), ('ult', b, a)),
1091bf215546Sopenharmony_ci   (('uge', a, ('umin', b, a)), True),
1092bf215546Sopenharmony_ci   (('uge', ('umax', a, b), a), True),
1093bf215546Sopenharmony_ci   (('ilt', a, ('imin', b, a)), False),
1094bf215546Sopenharmony_ci   (('ilt', ('imax', a, b), a), False),
1095bf215546Sopenharmony_ci   (('ige', a, ('imax', b, a)), ('ige', a, b)),
1096bf215546Sopenharmony_ci   (('ige', ('imin', a, b), a), ('ige', b, a)),
1097bf215546Sopenharmony_ci   (('ult', a, ('umin', b, a)), False),
1098bf215546Sopenharmony_ci   (('ult', ('umax', a, b), a), False),
1099bf215546Sopenharmony_ci   (('uge', a, ('umax', b, a)), ('uge', a, b)),
1100bf215546Sopenharmony_ci   (('uge', ('umin', a, b), a), ('uge', b, a)),
1101bf215546Sopenharmony_ci   (('ult', a, ('iand', b, a)), False),
1102bf215546Sopenharmony_ci   (('ult', ('ior', a, b), a), False),
1103bf215546Sopenharmony_ci   (('uge', a, ('iand', b, a)), True),
1104bf215546Sopenharmony_ci   (('uge', ('ior', a, b), a), True),
1105bf215546Sopenharmony_ci
1106bf215546Sopenharmony_ci   (('ilt', '#a', ('imax', '#b', c)), ('ior', ('ilt', a, b), ('ilt', a, c))),
1107bf215546Sopenharmony_ci   (('ilt', ('imin', '#a', b), '#c'), ('ior', ('ilt', a, c), ('ilt', b, c))),
1108bf215546Sopenharmony_ci   (('ige', '#a', ('imin', '#b', c)), ('ior', ('ige', a, b), ('ige', a, c))),
1109bf215546Sopenharmony_ci   (('ige', ('imax', '#a', b), '#c'), ('ior', ('ige', a, c), ('ige', b, c))),
1110bf215546Sopenharmony_ci   (('ult', '#a', ('umax', '#b', c)), ('ior', ('ult', a, b), ('ult', a, c))),
1111bf215546Sopenharmony_ci   (('ult', ('umin', '#a', b), '#c'), ('ior', ('ult', a, c), ('ult', b, c))),
1112bf215546Sopenharmony_ci   (('uge', '#a', ('umin', '#b', c)), ('ior', ('uge', a, b), ('uge', a, c))),
1113bf215546Sopenharmony_ci   (('uge', ('umax', '#a', b), '#c'), ('ior', ('uge', a, c), ('uge', b, c))),
1114bf215546Sopenharmony_ci   (('ilt', '#a', ('imin', '#b', c)), ('iand', ('ilt', a, b), ('ilt', a, c))),
1115bf215546Sopenharmony_ci   (('ilt', ('imax', '#a', b), '#c'), ('iand', ('ilt', a, c), ('ilt', b, c))),
1116bf215546Sopenharmony_ci   (('ige', '#a', ('imax', '#b', c)), ('iand', ('ige', a, b), ('ige', a, c))),
1117bf215546Sopenharmony_ci   (('ige', ('imin', '#a', b), '#c'), ('iand', ('ige', a, c), ('ige', b, c))),
1118bf215546Sopenharmony_ci   (('ult', '#a', ('umin', '#b', c)), ('iand', ('ult', a, b), ('ult', a, c))),
1119bf215546Sopenharmony_ci   (('ult', ('umax', '#a', b), '#c'), ('iand', ('ult', a, c), ('ult', b, c))),
1120bf215546Sopenharmony_ci   (('uge', '#a', ('umax', '#b', c)), ('iand', ('uge', a, b), ('uge', a, c))),
1121bf215546Sopenharmony_ci   (('uge', ('umin', '#a', b), '#c'), ('iand', ('uge', a, c), ('uge', b, c))),
1122bf215546Sopenharmony_ci
1123bf215546Sopenharmony_ci   # Thanks to sign extension, the ishr(a, b) is negative if and only if a is
1124bf215546Sopenharmony_ci   # negative.
1125bf215546Sopenharmony_ci   (('bcsel', ('ilt', a, 0), ('ineg', ('ishr', a, b)), ('ishr', a, b)),
1126bf215546Sopenharmony_ci    ('iabs', ('ishr', a, b))),
1127bf215546Sopenharmony_ci   (('iabs', ('ishr', ('iabs', a), b)), ('ishr', ('iabs', a), b)),
1128bf215546Sopenharmony_ci
1129bf215546Sopenharmony_ci   (('fabs', ('slt', a, b)), ('slt', a, b)),
1130bf215546Sopenharmony_ci   (('fabs', ('sge', a, b)), ('sge', a, b)),
1131bf215546Sopenharmony_ci   (('fabs', ('seq', a, b)), ('seq', a, b)),
1132bf215546Sopenharmony_ci   (('fabs', ('sne', a, b)), ('sne', a, b)),
1133bf215546Sopenharmony_ci   (('slt', a, b), ('b2f', ('flt', a, b)), 'options->lower_scmp'),
1134bf215546Sopenharmony_ci   (('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'),
1135bf215546Sopenharmony_ci   (('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'),
1136bf215546Sopenharmony_ci   (('sne', a, b), ('b2f', ('fneu', a, b)), 'options->lower_scmp'),
1137bf215546Sopenharmony_ci   (('seq', ('seq', a, b), 1.0), ('seq', a, b)),
1138bf215546Sopenharmony_ci   (('seq', ('sne', a, b), 1.0), ('sne', a, b)),
1139bf215546Sopenharmony_ci   (('seq', ('slt', a, b), 1.0), ('slt', a, b)),
1140bf215546Sopenharmony_ci   (('seq', ('sge', a, b), 1.0), ('sge', a, b)),
1141bf215546Sopenharmony_ci   (('sne', ('seq', a, b), 0.0), ('seq', a, b)),
1142bf215546Sopenharmony_ci   (('sne', ('sne', a, b), 0.0), ('sne', a, b)),
1143bf215546Sopenharmony_ci   (('sne', ('slt', a, b), 0.0), ('slt', a, b)),
1144bf215546Sopenharmony_ci   (('sne', ('sge', a, b), 0.0), ('sge', a, b)),
1145bf215546Sopenharmony_ci   (('seq', ('seq', a, b), 0.0), ('sne', a, b)),
1146bf215546Sopenharmony_ci   (('seq', ('sne', a, b), 0.0), ('seq', a, b)),
1147bf215546Sopenharmony_ci   (('seq', ('slt', a, b), 0.0), ('sge', a, b)),
1148bf215546Sopenharmony_ci   (('seq', ('sge', a, b), 0.0), ('slt', a, b)),
1149bf215546Sopenharmony_ci   (('sne', ('seq', a, b), 1.0), ('sne', a, b)),
1150bf215546Sopenharmony_ci   (('sne', ('sne', a, b), 1.0), ('seq', a, b)),
1151bf215546Sopenharmony_ci   (('sne', ('slt', a, b), 1.0), ('sge', a, b)),
1152bf215546Sopenharmony_ci   (('sne', ('sge', a, b), 1.0), ('slt', a, b)),
1153bf215546Sopenharmony_ci   (('fall_equal2', a, b), ('fmin', ('seq', 'a.x', 'b.x'), ('seq', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
1154bf215546Sopenharmony_ci   (('fall_equal3', a, b), ('seq', ('fany_nequal3', a, b), 0.0), 'options->lower_vector_cmp'),
1155bf215546Sopenharmony_ci   (('fall_equal4', a, b), ('seq', ('fany_nequal4', a, b), 0.0), 'options->lower_vector_cmp'),
1156bf215546Sopenharmony_ci   (('fany_nequal2', a, b), ('fmax', ('sne', 'a.x', 'b.x'), ('sne', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
1157bf215546Sopenharmony_ci   (('fany_nequal3', a, b), ('fsat', ('fdot3', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
1158bf215546Sopenharmony_ci   (('fany_nequal4', a, b), ('fsat', ('fdot4', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
1159bf215546Sopenharmony_ci
1160bf215546Sopenharmony_ci   (('ball_iequal2', a, b), ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
1161bf215546Sopenharmony_ci   (('ball_iequal3', a, b), ('iand', ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), ('ieq', 'a.z', 'b.z')), 'options->lower_vector_cmp'),
1162bf215546Sopenharmony_ci   (('ball_iequal4', a, b), ('iand', ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), ('iand', ('ieq', 'a.z', 'b.z'), ('ieq', 'a.w', 'b.w'))), 'options->lower_vector_cmp'),
1163bf215546Sopenharmony_ci
1164bf215546Sopenharmony_ci   (('bany_inequal2', a, b), ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
1165bf215546Sopenharmony_ci   (('bany_inequal3', a, b), ('ior', ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), ('ine', 'a.z', 'b.z')), 'options->lower_vector_cmp'),
1166bf215546Sopenharmony_ci   (('bany_inequal4', a, b), ('ior', ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), ('ior', ('ine', 'a.z', 'b.z'), ('ine', 'a.w', 'b.w'))), 'options->lower_vector_cmp'),
1167bf215546Sopenharmony_ci
1168bf215546Sopenharmony_ci   (('ball_fequal2', a, b), ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
1169bf215546Sopenharmony_ci   (('ball_fequal3', a, b), ('iand', ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), ('feq', 'a.z', 'b.z')), 'options->lower_vector_cmp'),
1170bf215546Sopenharmony_ci   (('ball_fequal4', a, b), ('iand', ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), ('iand', ('feq', 'a.z', 'b.z'), ('feq', 'a.w', 'b.w'))), 'options->lower_vector_cmp'),
1171bf215546Sopenharmony_ci
1172bf215546Sopenharmony_ci   (('bany_fnequal2', a, b), ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
1173bf215546Sopenharmony_ci   (('bany_fnequal3', a, b), ('ior', ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), ('fneu', 'a.z', 'b.z')), 'options->lower_vector_cmp'),
1174bf215546Sopenharmony_ci   (('bany_fnequal4', a, b), ('ior', ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), ('ior', ('fneu', 'a.z', 'b.z'), ('fneu', 'a.w', 'b.w'))), 'options->lower_vector_cmp'),
1175bf215546Sopenharmony_ci
1176bf215546Sopenharmony_ci   (('feq', ('seq', a, b), 1.0), ('feq', a, b)),
1177bf215546Sopenharmony_ci   (('feq', ('sne', a, b), 1.0), ('fneu', a, b)),
1178bf215546Sopenharmony_ci   (('feq', ('slt', a, b), 1.0), ('flt', a, b)),
1179bf215546Sopenharmony_ci   (('feq', ('sge', a, b), 1.0), ('fge', a, b)),
1180bf215546Sopenharmony_ci   (('fneu', ('seq', a, b), 0.0), ('feq', a, b)),
1181bf215546Sopenharmony_ci   (('fneu', ('sne', a, b), 0.0), ('fneu', a, b)),
1182bf215546Sopenharmony_ci   (('fneu', ('slt', a, b), 0.0), ('flt', a, b)),
1183bf215546Sopenharmony_ci   (('fneu', ('sge', a, b), 0.0), ('fge', a, b)),
1184bf215546Sopenharmony_ci   (('feq', ('seq', a, b), 0.0), ('fneu', a, b)),
1185bf215546Sopenharmony_ci   (('feq', ('sne', a, b), 0.0), ('feq', a, b)),
1186bf215546Sopenharmony_ci   (('feq', ('slt', a, b), 0.0), ('fge', a, b)),
1187bf215546Sopenharmony_ci   (('feq', ('sge', a, b), 0.0), ('flt', a, b)),
1188bf215546Sopenharmony_ci   (('fneu', ('seq', a, b), 1.0), ('fneu', a, b)),
1189bf215546Sopenharmony_ci   (('fneu', ('sne', a, b), 1.0), ('feq', a, b)),
1190bf215546Sopenharmony_ci   (('fneu', ('slt', a, b), 1.0), ('fge', a, b)),
1191bf215546Sopenharmony_ci   (('fneu', ('sge', a, b), 1.0), ('flt', a, b)),
1192bf215546Sopenharmony_ci
1193bf215546Sopenharmony_ci   (('fneu', ('fneg', a), a), ('fneu', a, 0.0)),
1194bf215546Sopenharmony_ci   (('feq', ('fneg', a), a), ('feq', a, 0.0)),
1195bf215546Sopenharmony_ci   # Emulating booleans
1196bf215546Sopenharmony_ci   (('imul', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))),
1197bf215546Sopenharmony_ci   (('iand', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))),
1198bf215546Sopenharmony_ci   (('ior', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('ior', a, b))),
1199bf215546Sopenharmony_ci   (('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))),
1200bf215546Sopenharmony_ci   (('fsat', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('b2f', ('ior', a, b))),
1201bf215546Sopenharmony_ci   (('iand', 'a@bool16', 1.0), ('b2f', a)),
1202bf215546Sopenharmony_ci   (('iand', 'a@bool32', 1.0), ('b2f', a)),
1203bf215546Sopenharmony_ci   (('flt', ('fneg', ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF.
1204bf215546Sopenharmony_ci   # Comparison with the same args.  Note that these are only done for the
1205bf215546Sopenharmony_ci   # float versions when the source must be a number.  Generally, NaN cmp NaN
1206bf215546Sopenharmony_ci   # produces the opposite result of X cmp X.  flt is the outlier.  NaN < NaN
1207bf215546Sopenharmony_ci   # is false, and, for any number X, X < X is also false.
1208bf215546Sopenharmony_ci   (('ilt', a, a), False),
1209bf215546Sopenharmony_ci   (('ige', a, a), True),
1210bf215546Sopenharmony_ci   (('ieq', a, a), True),
1211bf215546Sopenharmony_ci   (('ine', a, a), False),
1212bf215546Sopenharmony_ci   (('ult', a, a), False),
1213bf215546Sopenharmony_ci   (('uge', a, a), True),
1214bf215546Sopenharmony_ci   (('flt', a, a), False),
1215bf215546Sopenharmony_ci   (('fge', 'a(is_a_number)', a), True),
1216bf215546Sopenharmony_ci   (('feq', 'a(is_a_number)', a), True),
1217bf215546Sopenharmony_ci   (('fneu', 'a(is_a_number)', a), False),
1218bf215546Sopenharmony_ci   # Logical and bit operations
1219bf215546Sopenharmony_ci   (('iand', a, a), a),
1220bf215546Sopenharmony_ci   (('iand', a, ~0), a),
1221bf215546Sopenharmony_ci   (('iand', a, 0), 0),
1222bf215546Sopenharmony_ci   (('ior', a, a), a),
1223bf215546Sopenharmony_ci   (('ior', a, 0), a),
1224bf215546Sopenharmony_ci   (('ior', a, True), True),
1225bf215546Sopenharmony_ci   (('ixor', a, a), 0),
1226bf215546Sopenharmony_ci   (('ixor', a, 0), a),
1227bf215546Sopenharmony_ci   (('ixor', a, ('ixor', a, b)), b),
1228bf215546Sopenharmony_ci   (('ixor', a, -1), ('inot', a)),
1229bf215546Sopenharmony_ci   (('inot', ('inot', a)), a),
1230bf215546Sopenharmony_ci   (('ior', ('iand', a, b), b), b),
1231bf215546Sopenharmony_ci   (('ior', ('ior', a, b), b), ('ior', a, b)),
1232bf215546Sopenharmony_ci   (('iand', ('ior', a, b), b), b),
1233bf215546Sopenharmony_ci   (('iand', ('iand', a, b), b), ('iand', a, b)),
1234bf215546Sopenharmony_ci   # DeMorgan's Laws
1235bf215546Sopenharmony_ci   (('iand', ('inot', a), ('inot', b)), ('inot', ('ior',  a, b))),
1236bf215546Sopenharmony_ci   (('ior',  ('inot', a), ('inot', b)), ('inot', ('iand', a, b))),
1237bf215546Sopenharmony_ci   # Shift optimizations
1238bf215546Sopenharmony_ci   (('ishl', 0, a), 0),
1239bf215546Sopenharmony_ci   (('ishl', a, 0), a),
1240bf215546Sopenharmony_ci   (('ishr', 0, a), 0),
1241bf215546Sopenharmony_ci   (('ishr', -1, a), -1),
1242bf215546Sopenharmony_ci   (('ishr', a, 0), a),
1243bf215546Sopenharmony_ci   (('ushr', 0, a), 0),
1244bf215546Sopenharmony_ci   (('ushr', a, 0), a),
1245bf215546Sopenharmony_ci   (('ior', ('ishl@16', a, b), ('ushr@16', a, ('iadd', 16, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'),
1246bf215546Sopenharmony_ci   (('ior', ('ishl@16', a, b), ('ushr@16', a, ('isub', 16, b))), ('urol', a, b), '!options->lower_rotate'),
1247bf215546Sopenharmony_ci   (('ior', ('ishl@32', a, b), ('ushr@32', a, ('iadd', 32, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'),
1248bf215546Sopenharmony_ci   (('ior', ('ishl@32', a, b), ('ushr@32', a, ('isub', 32, b))), ('urol', a, b), '!options->lower_rotate'),
1249bf215546Sopenharmony_ci   (('ior', ('ushr@16', a, b), ('ishl@16', a, ('iadd', 16, ('ineg', b)))), ('uror', a, b), '!options->lower_rotate'),
1250bf215546Sopenharmony_ci   (('ior', ('ushr@16', a, b), ('ishl@16', a, ('isub', 16, b))), ('uror', a, b), '!options->lower_rotate'),
1251bf215546Sopenharmony_ci   (('ior', ('ushr@32', a, b), ('ishl@32', a, ('iadd', 32, ('ineg', b)))), ('uror', a, b), '!options->lower_rotate'),
1252bf215546Sopenharmony_ci   (('ior', ('ushr@32', a, b), ('ishl@32', a, ('isub', 32, b))), ('uror', a, b), '!options->lower_rotate'),
1253bf215546Sopenharmony_ci   (('urol@16', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 16, b))), 'options->lower_rotate'),
1254bf215546Sopenharmony_ci   (('urol@32', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 32, b))), 'options->lower_rotate'),
1255bf215546Sopenharmony_ci   (('uror@16', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 16, b))), 'options->lower_rotate'),
1256bf215546Sopenharmony_ci   (('uror@32', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 32, b))), 'options->lower_rotate'),
1257bf215546Sopenharmony_ci   # Exponential/logarithmic identities
1258bf215546Sopenharmony_ci   (('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a
1259bf215546Sopenharmony_ci   (('~flog2', ('fexp2', a)), a), # lg2(2^a) = a
1260bf215546Sopenharmony_ci   (('fpow', a, b), ('fexp2', ('fmul', ('flog2', a), b)), 'options->lower_fpow'), # a^b = 2^(lg2(a)*b)
1261bf215546Sopenharmony_ci   (('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b
1262bf215546Sopenharmony_ci   (('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))),
1263bf215546Sopenharmony_ci    ('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) * d) = a^b * c^d
1264bf215546Sopenharmony_ci   (('~fexp2', ('fmul', ('flog2', a), 0.5)), ('fsqrt', a)),
1265bf215546Sopenharmony_ci   (('~fexp2', ('fmul', ('flog2', a), 2.0)), ('fmul', a, a)),
1266bf215546Sopenharmony_ci   (('~fexp2', ('fmul', ('flog2', a), 4.0)), ('fmul', ('fmul', a, a), ('fmul', a, a))),
1267bf215546Sopenharmony_ci   (('~fpow', a, 1.0), a),
1268bf215546Sopenharmony_ci   (('~fpow', a, 2.0), ('fmul', a, a)),
1269bf215546Sopenharmony_ci   (('~fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))),
1270bf215546Sopenharmony_ci   (('~fpow', 2.0, a), ('fexp2', a)),
1271bf215546Sopenharmony_ci   (('~fpow', ('fpow', a, 2.2), 0.454545), a),
1272bf215546Sopenharmony_ci   (('~fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)),
1273bf215546Sopenharmony_ci   (('~fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))),
1274bf215546Sopenharmony_ci   (('~frcp', ('fexp2', a)), ('fexp2', ('fneg', a))),
1275bf215546Sopenharmony_ci   (('~frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))),
1276bf215546Sopenharmony_ci   (('~flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))),
1277bf215546Sopenharmony_ci   (('~flog2', ('frcp', a)), ('fneg', ('flog2', a))),
1278bf215546Sopenharmony_ci   (('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))),
1279bf215546Sopenharmony_ci   (('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))),
1280bf215546Sopenharmony_ci   (('~fmul', ('fexp2(is_used_once)', a), ('fexp2(is_used_once)', b)), ('fexp2', ('fadd', a, b))),
1281bf215546Sopenharmony_ci   (('bcsel', ('flt', a, 0.0), 0.0, ('fsqrt', a)), ('fsqrt', ('fmax', a, 0.0))),
1282bf215546Sopenharmony_ci   (('~fmul', ('fsqrt', a), ('fsqrt', a)), ('fabs',a)),
1283bf215546Sopenharmony_ci   (('~fmulz', ('fsqrt', a), ('fsqrt', a)), ('fabs', a)),
1284bf215546Sopenharmony_ci   # Division and reciprocal
1285bf215546Sopenharmony_ci   (('~fdiv', 1.0, a), ('frcp', a)),
1286bf215546Sopenharmony_ci   (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
1287bf215546Sopenharmony_ci   (('~frcp', ('frcp', a)), a),
1288bf215546Sopenharmony_ci   (('~frcp', ('fsqrt', a)), ('frsq', a)),
1289bf215546Sopenharmony_ci   (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'),
1290bf215546Sopenharmony_ci   (('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'),
1291bf215546Sopenharmony_ci   # Trig
1292bf215546Sopenharmony_ci   (('fsin', a), lowered_sincos(0.5), 'options->lower_sincos'),
1293bf215546Sopenharmony_ci   (('fcos', a), lowered_sincos(0.75), 'options->lower_sincos'),
1294bf215546Sopenharmony_ci   # Boolean simplifications
1295bf215546Sopenharmony_ci   (('i2b16(is_used_by_if)', a), ('ine16', a, 0)),
1296bf215546Sopenharmony_ci   (('i2b32(is_used_by_if)', a), ('ine32', a, 0)),
1297bf215546Sopenharmony_ci   (('i2b1(is_used_by_if)', a), ('ine', a, 0)),
1298bf215546Sopenharmony_ci   (('ieq', a, True), a),
1299bf215546Sopenharmony_ci   (('ine(is_not_used_by_if)', a, True), ('inot', a)),
1300bf215546Sopenharmony_ci   (('ine', a, False), a),
1301bf215546Sopenharmony_ci   (('ieq(is_not_used_by_if)', a, False), ('inot', 'a')),
1302bf215546Sopenharmony_ci   (('bcsel', a, True, False), a),
1303bf215546Sopenharmony_ci   (('bcsel', a, False, True), ('inot', a)),
1304bf215546Sopenharmony_ci   (('bcsel', True, b, c), b),
1305bf215546Sopenharmony_ci   (('bcsel', False, b, c), c),
1306bf215546Sopenharmony_ci
1307bf215546Sopenharmony_ci   (('bcsel@16', a, 1.0, 0.0), ('b2f', a)),
1308bf215546Sopenharmony_ci   (('bcsel@16', a, 0.0, 1.0), ('b2f', ('inot', a))),
1309bf215546Sopenharmony_ci   (('bcsel@16', a, -1.0, -0.0), ('fneg', ('b2f', a))),
1310bf215546Sopenharmony_ci   (('bcsel@16', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))),
1311bf215546Sopenharmony_ci   (('bcsel@32', a, 1.0, 0.0), ('b2f', a)),
1312bf215546Sopenharmony_ci   (('bcsel@32', a, 0.0, 1.0), ('b2f', ('inot', a))),
1313bf215546Sopenharmony_ci   (('bcsel@32', a, -1.0, -0.0), ('fneg', ('b2f', a))),
1314bf215546Sopenharmony_ci   (('bcsel@32', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))),
1315bf215546Sopenharmony_ci   (('bcsel@64', a, 1.0, 0.0), ('b2f', a), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
1316bf215546Sopenharmony_ci   (('bcsel@64', a, 0.0, 1.0), ('b2f', ('inot', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
1317bf215546Sopenharmony_ci   (('bcsel@64', a, -1.0, -0.0), ('fneg', ('b2f', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
1318bf215546Sopenharmony_ci   (('bcsel@64', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a))), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
1319bf215546Sopenharmony_ci
1320bf215546Sopenharmony_ci   (('bcsel', a, b, b), b),
1321bf215546Sopenharmony_ci   (('~fcsel', a, b, b), b),
1322bf215546Sopenharmony_ci
1323bf215546Sopenharmony_ci   # D3D Boolean emulation
1324bf215546Sopenharmony_ci   (('bcsel', a, -1, 0), ('ineg', ('b2i', 'a@1'))),
1325bf215546Sopenharmony_ci   (('bcsel', a, 0, -1), ('ineg', ('b2i', ('inot', a)))),
1326bf215546Sopenharmony_ci   (('bcsel', a, 1, 0), ('b2i', 'a@1')),
1327bf215546Sopenharmony_ci   (('bcsel', a, 0, 1), ('b2i', ('inot', a))),
1328bf215546Sopenharmony_ci   (('iand', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1329bf215546Sopenharmony_ci    ('ineg', ('b2i', ('iand', a, b)))),
1330bf215546Sopenharmony_ci   (('ior', ('ineg', ('b2i','a@1')), ('ineg', ('b2i', 'b@1'))),
1331bf215546Sopenharmony_ci    ('ineg', ('b2i', ('ior', a, b)))),
1332bf215546Sopenharmony_ci   (('ieq', ('ineg', ('b2i', 'a@1')), 0), ('inot', a)),
1333bf215546Sopenharmony_ci   (('ieq', ('ineg', ('b2i', 'a@1')), -1), a),
1334bf215546Sopenharmony_ci   (('ine', ('ineg', ('b2i', 'a@1')), 0), a),
1335bf215546Sopenharmony_ci   (('ine', ('ineg', ('b2i', 'a@1')), -1), ('inot', a)),
1336bf215546Sopenharmony_ci   (('ige', ('ineg', ('b2i', 'a@1')), 0), ('inot', a)),
1337bf215546Sopenharmony_ci   (('ilt', ('ineg', ('b2i', 'a@1')), 0), a),
1338bf215546Sopenharmony_ci   (('ult', 0, ('ineg', ('b2i', 'a@1'))), a),
1339bf215546Sopenharmony_ci   (('iand', ('ineg', ('b2i', a)), 1.0), ('b2f', a)),
1340bf215546Sopenharmony_ci   (('iand', ('ineg', ('b2i', a)), 1),   ('b2i', a)),
1341bf215546Sopenharmony_ci
1342bf215546Sopenharmony_ci   # With D3D booleans (0 / -1), imax and umin are AND; imin and umax are OR
1343bf215546Sopenharmony_ci   (('imax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1344bf215546Sopenharmony_ci    ('ineg', ('b2i', ('iand', a, b)))),
1345bf215546Sopenharmony_ci   (('imin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1346bf215546Sopenharmony_ci    ('ineg', ('b2i', ('ior', a, b)))),
1347bf215546Sopenharmony_ci   (('umax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1348bf215546Sopenharmony_ci    ('ineg', ('b2i', ('ior', a, b)))),
1349bf215546Sopenharmony_ci   (('umin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1350bf215546Sopenharmony_ci    ('ineg', ('b2i', ('iand', a, b)))),
1351bf215546Sopenharmony_ci
1352bf215546Sopenharmony_ci   # Conversions
1353bf215546Sopenharmony_ci   (('i2b16', ('b2i', 'a@16')), a),
1354bf215546Sopenharmony_ci   (('i2b32', ('b2i', 'a@32')), a),
1355bf215546Sopenharmony_ci   (('f2i', ('ftrunc', a)), ('f2i', a)),
1356bf215546Sopenharmony_ci   (('f2u', ('ftrunc', a)), ('f2u', a)),
1357bf215546Sopenharmony_ci   (('i2b', ('ineg', a)), ('i2b', a)),
1358bf215546Sopenharmony_ci   (('i2b', ('iabs', a)), ('i2b', a)),
1359bf215546Sopenharmony_ci   (('inot', ('f2b1', a)), ('feq', a, 0.0)),
1360bf215546Sopenharmony_ci
1361bf215546Sopenharmony_ci   # Conversions from 16 bits to 32 bits and back can always be removed
1362bf215546Sopenharmony_ci   (('f2fmp', ('f2f32', 'a@16')), a),
1363bf215546Sopenharmony_ci   (('i2imp', ('i2i32', 'a@16')), a),
1364bf215546Sopenharmony_ci   (('i2imp', ('u2u32', 'a@16')), a),
1365bf215546Sopenharmony_ci
1366bf215546Sopenharmony_ci   (('f2imp', ('f2f32', 'a@16')), ('f2i16', a)),
1367bf215546Sopenharmony_ci   (('f2ump', ('f2f32', 'a@16')), ('f2u16', a)),
1368bf215546Sopenharmony_ci   (('i2fmp', ('i2i32', 'a@16')), ('i2f16', a)),
1369bf215546Sopenharmony_ci   (('u2fmp', ('u2u32', 'a@16')), ('u2f16', a)),
1370bf215546Sopenharmony_ci
1371bf215546Sopenharmony_ci   (('f2fmp', ('b2f32', 'a@1')), ('b2f16', a)),
1372bf215546Sopenharmony_ci   (('i2imp', ('b2i32', 'a@1')), ('b2i16', a)),
1373bf215546Sopenharmony_ci   (('i2imp', ('b2i32', 'a@1')), ('b2i16', a)),
1374bf215546Sopenharmony_ci
1375bf215546Sopenharmony_ci   (('f2imp', ('b2f32', 'a@1')), ('b2i16', a)),
1376bf215546Sopenharmony_ci   (('f2ump', ('b2f32', 'a@1')), ('b2i16', a)),
1377bf215546Sopenharmony_ci   (('i2fmp', ('b2i32', 'a@1')), ('b2f16', a)),
1378bf215546Sopenharmony_ci   (('u2fmp', ('b2i32', 'a@1')), ('b2f16', a)),
1379bf215546Sopenharmony_ci
1380bf215546Sopenharmony_ci   # Conversions to 16 bits would be lossy so they should only be removed if
1381bf215546Sopenharmony_ci   # the instruction was generated by the precision lowering pass.
1382bf215546Sopenharmony_ci   (('f2f32', ('f2fmp', 'a@32')), a),
1383bf215546Sopenharmony_ci   (('i2i32', ('i2imp', 'a@32')), a),
1384bf215546Sopenharmony_ci   (('u2u32', ('i2imp', 'a@32')), a),
1385bf215546Sopenharmony_ci
1386bf215546Sopenharmony_ci   (('i2i32', ('f2imp', 'a@32')), ('f2i32', a)),
1387bf215546Sopenharmony_ci   (('u2u32', ('f2ump', 'a@32')), ('f2u32', a)),
1388bf215546Sopenharmony_ci   (('f2f32', ('i2fmp', 'a@32')), ('i2f32', a)),
1389bf215546Sopenharmony_ci   (('f2f32', ('u2fmp', 'a@32')), ('u2f32', a)),
1390bf215546Sopenharmony_ci
1391bf215546Sopenharmony_ci   # Conversions from float32 to float64 and back can be removed as long as
1392bf215546Sopenharmony_ci   # it doesn't need to be precise, since the conversion may e.g. flush denorms
1393bf215546Sopenharmony_ci   (('~f2f32', ('f2f64', 'a@32')), a),
1394bf215546Sopenharmony_ci
1395bf215546Sopenharmony_ci   (('ffloor', 'a(is_integral)'), a),
1396bf215546Sopenharmony_ci   (('fceil', 'a(is_integral)'), a),
1397bf215546Sopenharmony_ci   (('ftrunc', 'a(is_integral)'), a),
1398bf215546Sopenharmony_ci   (('fround_even', 'a(is_integral)'), a),
1399bf215546Sopenharmony_ci
1400bf215546Sopenharmony_ci   # fract(x) = x - floor(x), so fract(NaN) = NaN
1401bf215546Sopenharmony_ci   (('~ffract', 'a(is_integral)'), 0.0),
1402bf215546Sopenharmony_ci   (('fabs', 'a(is_not_negative)'), a),
1403bf215546Sopenharmony_ci   (('iabs', 'a(is_not_negative)'), a),
1404bf215546Sopenharmony_ci   (('fsat', 'a(is_not_positive)'), 0.0),
1405bf215546Sopenharmony_ci
1406bf215546Sopenharmony_ci   (('~fmin', 'a(is_not_negative)', 1.0), ('fsat', a), '!options->lower_fsat'),
1407bf215546Sopenharmony_ci
1408bf215546Sopenharmony_ci   # The result of the multiply must be in [-1, 0], so the result of adding
1409bf215546Sopenharmony_ci   # 1.0 to it must be in [0, 1].
1410bf215546Sopenharmony_ci   (('flt', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), False),
1411bf215546Sopenharmony_ci   (('flt', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), False),
1412bf215546Sopenharmony_ci   (('fmax', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0)),
1413bf215546Sopenharmony_ci   (('fmax', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0)),
1414bf215546Sopenharmony_ci
1415bf215546Sopenharmony_ci   (('fneu', 'a(is_not_zero)', 0.0), True),
1416bf215546Sopenharmony_ci   (('feq', 'a(is_not_zero)', 0.0), False),
1417bf215546Sopenharmony_ci
1418bf215546Sopenharmony_ci   # In this chart, + means value > 0 and - means value < 0.
1419bf215546Sopenharmony_ci   #
1420bf215546Sopenharmony_ci   # + >= + -> unknown  0 >= + -> false    - >= + -> false
1421bf215546Sopenharmony_ci   # + >= 0 -> true     0 >= 0 -> true     - >= 0 -> false
1422bf215546Sopenharmony_ci   # + >= - -> true     0 >= - -> true     - >= - -> unknown
1423bf215546Sopenharmony_ci   #
1424bf215546Sopenharmony_ci   # Using grouping conceptually similar to a Karnaugh map...
1425bf215546Sopenharmony_ci   #
1426bf215546Sopenharmony_ci   # (+ >= 0, + >= -, 0 >= 0, 0 >= -) == (is_not_negative >= is_not_positive) -> true
1427bf215546Sopenharmony_ci   # (0 >= +, - >= +) == (is_not_positive >= gt_zero) -> false
1428bf215546Sopenharmony_ci   # (- >= +, - >= 0) == (lt_zero >= is_not_negative) -> false
1429bf215546Sopenharmony_ci   #
1430bf215546Sopenharmony_ci   # The flt / ilt cases just invert the expected result.
1431bf215546Sopenharmony_ci   #
1432bf215546Sopenharmony_ci   # Float results expecting true require is_a_number_* sources.  The results
1433bf215546Sopenharmony_ci   # expecting false are fine because NaN compared >= or < anything is false.
1434bf215546Sopenharmony_ci
1435bf215546Sopenharmony_ci   (('fge', 'a(is_a_number_not_negative)', 'b(is_a_number_not_positive)'), True),
1436bf215546Sopenharmony_ci   (('fge', 'a(is_not_positive)',          'b(is_gt_zero)'),               False),
1437bf215546Sopenharmony_ci   (('fge', 'a(is_lt_zero)',               'b(is_not_negative)'),          False),
1438bf215546Sopenharmony_ci
1439bf215546Sopenharmony_ci   (('flt', 'a(is_not_negative)',          'b(is_not_positive)'),          False),
1440bf215546Sopenharmony_ci   (('flt', 'a(is_a_number_not_positive)', 'b(is_a_number_gt_zero)'),      True),
1441bf215546Sopenharmony_ci   (('flt', 'a(is_a_number_lt_zero)',      'b(is_a_number_not_negative)'), True),
1442bf215546Sopenharmony_ci
1443bf215546Sopenharmony_ci   (('ine', 'a(is_not_zero)', 0), True),
1444bf215546Sopenharmony_ci   (('ieq', 'a(is_not_zero)', 0), False),
1445bf215546Sopenharmony_ci
1446bf215546Sopenharmony_ci   (('ige', 'a(is_not_negative)', 'b(is_not_positive)'), True),
1447bf215546Sopenharmony_ci   (('ige', 'a(is_not_positive)', 'b(is_gt_zero)'),      False),
1448bf215546Sopenharmony_ci   (('ige', 'a(is_lt_zero)',      'b(is_not_negative)'), False),
1449bf215546Sopenharmony_ci
1450bf215546Sopenharmony_ci   (('ilt', 'a(is_not_negative)', 'b(is_not_positive)'), False),
1451bf215546Sopenharmony_ci   (('ilt', 'a(is_not_positive)', 'b(is_gt_zero)'),      True),
1452bf215546Sopenharmony_ci   (('ilt', 'a(is_lt_zero)',      'b(is_not_negative)'), True),
1453bf215546Sopenharmony_ci
1454bf215546Sopenharmony_ci   (('ult', 0, 'a(is_gt_zero)'), True),
1455bf215546Sopenharmony_ci   (('ult', a, 0), False),
1456bf215546Sopenharmony_ci
1457bf215546Sopenharmony_ci   # Packing and then unpacking does nothing
1458bf215546Sopenharmony_ci   (('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a),
1459bf215546Sopenharmony_ci   (('unpack_64_2x32_split_y', ('pack_64_2x32_split', a, b)), b),
1460bf215546Sopenharmony_ci   (('unpack_64_2x32_split_x', ('pack_64_2x32', a)), 'a.x'),
1461bf215546Sopenharmony_ci   (('unpack_64_2x32_split_y', ('pack_64_2x32', a)), 'a.y'),
1462bf215546Sopenharmony_ci   (('unpack_64_2x32', ('pack_64_2x32_split', a, b)), ('vec2', a, b)),
1463bf215546Sopenharmony_ci   (('unpack_64_2x32', ('pack_64_2x32', a)), a),
1464bf215546Sopenharmony_ci   (('unpack_double_2x32_dxil', ('pack_double_2x32_dxil', a)), a),
1465bf215546Sopenharmony_ci   (('pack_64_2x32_split', ('unpack_64_2x32_split_x', a),
1466bf215546Sopenharmony_ci                           ('unpack_64_2x32_split_y', a)), a),
1467bf215546Sopenharmony_ci   (('pack_64_2x32', ('vec2', ('unpack_64_2x32_split_x', a),
1468bf215546Sopenharmony_ci                              ('unpack_64_2x32_split_y', a))), a),
1469bf215546Sopenharmony_ci   (('pack_64_2x32', ('unpack_64_2x32', a)), a),
1470bf215546Sopenharmony_ci   (('pack_double_2x32_dxil', ('unpack_double_2x32_dxil', a)), a),
1471bf215546Sopenharmony_ci
1472bf215546Sopenharmony_ci   # Comparing two halves of an unpack separately.  While this optimization
1473bf215546Sopenharmony_ci   # should be correct for non-constant values, it's less obvious that it's
1474bf215546Sopenharmony_ci   # useful in that case.  For constant values, the pack will fold and we're
1475bf215546Sopenharmony_ci   # guaranteed to reduce the whole tree to one instruction.
1476bf215546Sopenharmony_ci   (('iand', ('ieq', ('unpack_32_2x16_split_x', a), '#b'),
1477bf215546Sopenharmony_ci             ('ieq', ('unpack_32_2x16_split_y', a), '#c')),
1478bf215546Sopenharmony_ci    ('ieq', a, ('pack_32_2x16_split', b, c))),
1479bf215546Sopenharmony_ci
1480bf215546Sopenharmony_ci   # Byte extraction
1481bf215546Sopenharmony_ci   (('ushr', 'a@16',  8), ('extract_u8', a, 1), '!options->lower_extract_byte'),
1482bf215546Sopenharmony_ci   (('ushr', 'a@32', 24), ('extract_u8', a, 3), '!options->lower_extract_byte'),
1483bf215546Sopenharmony_ci   (('ushr', 'a@64', 56), ('extract_u8', a, 7), '!options->lower_extract_byte'),
1484bf215546Sopenharmony_ci   (('ishr', 'a@16',  8), ('extract_i8', a, 1), '!options->lower_extract_byte'),
1485bf215546Sopenharmony_ci   (('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'),
1486bf215546Sopenharmony_ci   (('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'),
1487bf215546Sopenharmony_ci   (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'),
1488bf215546Sopenharmony_ci
1489bf215546Sopenharmony_ci   # Common pattern in many Vulkan CTS tests that read 8-bit integers from a
1490bf215546Sopenharmony_ci   # storage buffer.
1491bf215546Sopenharmony_ci   (('u2u8', ('extract_u16', a, 1)), ('u2u8', ('extract_u8', a, 2)), '!options->lower_extract_byte'),
1492bf215546Sopenharmony_ci   (('u2u8', ('ushr', a, 8)), ('u2u8', ('extract_u8', a, 1)), '!options->lower_extract_byte'),
1493bf215546Sopenharmony_ci
1494bf215546Sopenharmony_ci   # Common pattern after lowering 8-bit integers to 16-bit.
1495bf215546Sopenharmony_ci   (('i2i16', ('u2u8', ('extract_u8', a, b))), ('i2i16', ('extract_i8', a, b))),
1496bf215546Sopenharmony_ci   (('u2u16', ('u2u8', ('extract_u8', a, b))), ('u2u16', ('extract_u8', a, b))),
1497bf215546Sopenharmony_ci
1498bf215546Sopenharmony_ci   (('ubfe', a,  0, 8), ('extract_u8', a, 0), '!options->lower_extract_byte'),
1499bf215546Sopenharmony_ci   (('ubfe', a,  8, 8), ('extract_u8', a, 1), '!options->lower_extract_byte'),
1500bf215546Sopenharmony_ci   (('ubfe', a, 16, 8), ('extract_u8', a, 2), '!options->lower_extract_byte'),
1501bf215546Sopenharmony_ci   (('ubfe', a, 24, 8), ('extract_u8', a, 3), '!options->lower_extract_byte'),
1502bf215546Sopenharmony_ci   (('ibfe', a,  0, 8), ('extract_i8', a, 0), '!options->lower_extract_byte'),
1503bf215546Sopenharmony_ci   (('ibfe', a,  8, 8), ('extract_i8', a, 1), '!options->lower_extract_byte'),
1504bf215546Sopenharmony_ci   (('ibfe', a, 16, 8), ('extract_i8', a, 2), '!options->lower_extract_byte'),
1505bf215546Sopenharmony_ci   (('ibfe', a, 24, 8), ('extract_i8', a, 3), '!options->lower_extract_byte'),
1506bf215546Sopenharmony_ci
1507bf215546Sopenharmony_ci   (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)),
1508bf215546Sopenharmony_ci   (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)),
1509bf215546Sopenharmony_ci
1510bf215546Sopenharmony_ci    # Word extraction
1511bf215546Sopenharmony_ci   (('ushr', ('ishl', 'a@32', 16), 16), ('extract_u16', a, 0), '!options->lower_extract_word'),
1512bf215546Sopenharmony_ci   (('ushr', 'a@32', 16), ('extract_u16', a, 1), '!options->lower_extract_word'),
1513bf215546Sopenharmony_ci   (('ishr', ('ishl', 'a@32', 16), 16), ('extract_i16', a, 0), '!options->lower_extract_word'),
1514bf215546Sopenharmony_ci   (('ishr', 'a@32', 16), ('extract_i16', a, 1), '!options->lower_extract_word'),
1515bf215546Sopenharmony_ci   (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'),
1516bf215546Sopenharmony_ci
1517bf215546Sopenharmony_ci   (('ubfe', a,  0, 16), ('extract_u16', a, 0), '!options->lower_extract_word'),
1518bf215546Sopenharmony_ci   (('ubfe', a, 16, 16), ('extract_u16', a, 1), '!options->lower_extract_word'),
1519bf215546Sopenharmony_ci   (('ibfe', a,  0, 16), ('extract_i16', a, 0), '!options->lower_extract_word'),
1520bf215546Sopenharmony_ci   (('ibfe', a, 16, 16), ('extract_i16', a, 1), '!options->lower_extract_word'),
1521bf215546Sopenharmony_ci
1522bf215546Sopenharmony_ci   # Packing a u8vec4 to write to an SSBO.
1523bf215546Sopenharmony_ci   (('ior', ('ishl', ('u2u32', 'a@8'), 24), ('ior', ('ishl', ('u2u32', 'b@8'), 16), ('ior', ('ishl', ('u2u32', 'c@8'), 8), ('u2u32', 'd@8')))),
1524bf215546Sopenharmony_ci    ('pack_32_4x8', ('vec4', d, c, b, a)), 'options->has_pack_32_4x8'),
1525bf215546Sopenharmony_ci
1526bf215546Sopenharmony_ci   (('extract_u16', ('extract_i16', a, b), 0), ('extract_u16', a, b)),
1527bf215546Sopenharmony_ci   (('extract_u16', ('extract_u16', a, b), 0), ('extract_u16', a, b)),
1528bf215546Sopenharmony_ci
1529bf215546Sopenharmony_ci   # Lower pack/unpack
1530bf215546Sopenharmony_ci   (('pack_64_2x32_split', a, b), ('ior', ('u2u64', a), ('ishl', ('u2u64', b), 32)), 'options->lower_pack_64_2x32_split'),
1531bf215546Sopenharmony_ci   (('pack_32_2x16_split', a, b), ('ior', ('u2u32', a), ('ishl', ('u2u32', b), 16)), 'options->lower_pack_32_2x16_split'),
1532bf215546Sopenharmony_ci   (('unpack_64_2x32_split_x', a), ('u2u32', a), 'options->lower_unpack_64_2x32_split'),
1533bf215546Sopenharmony_ci   (('unpack_64_2x32_split_y', a), ('u2u32', ('ushr', a, 32)), 'options->lower_unpack_64_2x32_split'),
1534bf215546Sopenharmony_ci   (('unpack_32_2x16_split_x', a), ('u2u16', a), 'options->lower_unpack_32_2x16_split'),
1535bf215546Sopenharmony_ci   (('unpack_32_2x16_split_y', a), ('u2u16', ('ushr', a, 16)), 'options->lower_unpack_32_2x16_split'),
1536bf215546Sopenharmony_ci
1537bf215546Sopenharmony_ci   # Useless masking before unpacking
1538bf215546Sopenharmony_ci   (('unpack_half_2x16_split_x', ('iand', a, 0xffff)), ('unpack_half_2x16_split_x', a)),
1539bf215546Sopenharmony_ci   (('unpack_32_2x16_split_x', ('iand', a, 0xffff)), ('unpack_32_2x16_split_x', a)),
1540bf215546Sopenharmony_ci   (('unpack_64_2x32_split_x', ('iand', a, 0xffffffff)), ('unpack_64_2x32_split_x', a)),
1541bf215546Sopenharmony_ci   (('unpack_half_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_half_2x16_split_y', a)),
1542bf215546Sopenharmony_ci   (('unpack_32_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_32_2x16_split_y', a)),
1543bf215546Sopenharmony_ci   (('unpack_64_2x32_split_y', ('iand', a, 0xffffffff00000000)), ('unpack_64_2x32_split_y', a)),
1544bf215546Sopenharmony_ci
1545bf215546Sopenharmony_ci   (('unpack_half_2x16_split_x', ('extract_u16', a, 0)), ('unpack_half_2x16_split_x', a)),
1546bf215546Sopenharmony_ci   (('unpack_half_2x16_split_x', ('extract_u16', a, 1)), ('unpack_half_2x16_split_y', a)),
1547bf215546Sopenharmony_ci   (('unpack_half_2x16_split_x', ('ushr', a, 16)), ('unpack_half_2x16_split_y', a)),
1548bf215546Sopenharmony_ci   (('unpack_32_2x16_split_x', ('extract_u16', a, 0)), ('unpack_32_2x16_split_x', a)),
1549bf215546Sopenharmony_ci   (('unpack_32_2x16_split_x', ('extract_u16', a, 1)), ('unpack_32_2x16_split_y', a)),
1550bf215546Sopenharmony_ci
1551bf215546Sopenharmony_ci   # Optimize half packing
1552bf215546Sopenharmony_ci   (('ishl', ('pack_half_2x16', ('vec2', a, 0)), 16), ('pack_half_2x16', ('vec2', 0, a))),
1553bf215546Sopenharmony_ci   (('ushr', ('pack_half_2x16', ('vec2', 0, a)), 16), ('pack_half_2x16', ('vec2', a, 0))),
1554bf215546Sopenharmony_ci
1555bf215546Sopenharmony_ci   (('iadd', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),
1556bf215546Sopenharmony_ci    ('pack_half_2x16', ('vec2', a, b))),
1557bf215546Sopenharmony_ci   (('ior', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),
1558bf215546Sopenharmony_ci    ('pack_half_2x16', ('vec2', a, b))),
1559bf215546Sopenharmony_ci
1560bf215546Sopenharmony_ci   (('ishl', ('pack_half_2x16_split', a, 0), 16), ('pack_half_2x16_split', 0, a)),
1561bf215546Sopenharmony_ci   (('ushr', ('pack_half_2x16_split', 0, a), 16), ('pack_half_2x16_split', a, 0)),
1562bf215546Sopenharmony_ci   (('extract_u16', ('pack_half_2x16_split', 0, a), 1), ('pack_half_2x16_split', a, 0)),
1563bf215546Sopenharmony_ci
1564bf215546Sopenharmony_ci   (('iadd', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)),
1565bf215546Sopenharmony_ci   (('ior',  ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)),
1566bf215546Sopenharmony_ci
1567bf215546Sopenharmony_ci   (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 0), ('i2i', a)),
1568bf215546Sopenharmony_ci   (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 1), ('i2i', b)),
1569bf215546Sopenharmony_ci   (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 2), ('i2i', c)),
1570bf215546Sopenharmony_ci   (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 3), ('i2i', d)),
1571bf215546Sopenharmony_ci   (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 0), ('u2u', a)),
1572bf215546Sopenharmony_ci   (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 1), ('u2u', b)),
1573bf215546Sopenharmony_ci   (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 2), ('u2u', c)),
1574bf215546Sopenharmony_ci   (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 3), ('u2u', d)),
1575bf215546Sopenharmony_ci])
1576bf215546Sopenharmony_ci
# Once the ('extract_u8', a, 0) pattern, above, has triggered, shapes like
# the ones generated here show up.  Shifting right by a whole number of bytes
# and then taking byte 0 is the same as taking that byte directly, for either
# kind of right shift.  For an N-bit source the shifted byte index runs from
# 1 to N/8 - 1.
for op in ('ushr', 'ishr'):
   optimizations.extend(
      (('extract_u8', (op, 'a@%d' % bits, 8 * byte), 0), ('extract_u8', a, byte))
      for bits in (16, 32, 64)
      for byte in range(1, bits // 8))

# Byte 0 of the upper 16-bit word is byte 2 of the original value.
optimizations.append((('extract_u8', ('extract_u16', a, 1), 0), ('extract_u8', a, 2)))

# Once the ('extract_[iu]8', a, 3) patterns, above, have triggered, shapes
# like the ones generated here show up.  Shifting left so that a lower byte
# lands in the topmost byte and then extracting the topmost byte is the same
# as extracting the lower byte directly.  The source byte index counts down
# from N/8 - 2 to 0 so the generated order matches the shift amounts growing.
for op in ('extract_u8', 'extract_i8'):
   optimizations.extend(
      ((op, ('ishl', 'a@%d' % bits, bits - 8 * (byte + 1)), bits // 8 - 1), (op, a, byte))
      for bits in (16, 32, 64)
      for byte in range(bits // 8 - 2, -1, -1))
1592bf215546Sopenharmony_ci
1593bf215546Sopenharmony_cioptimizations.extend([
1594bf215546Sopenharmony_ci   # Subtracts
1595bf215546Sopenharmony_ci   (('ussub_4x8_vc4', a, 0), a),
1596bf215546Sopenharmony_ci   (('ussub_4x8_vc4', a, ~0), 0),
1597bf215546Sopenharmony_ci   # Lower all Subtractions first - they can get recombined later
1598bf215546Sopenharmony_ci   (('fsub', a, b), ('fadd', a, ('fneg', b))),
1599bf215546Sopenharmony_ci   (('isub', a, b), ('iadd', a, ('ineg', b))),
1600bf215546Sopenharmony_ci   (('uabs_usub', a, b), ('bcsel', ('ult', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))),
1601bf215546Sopenharmony_ci   # This is correct.  We don't need isub_sat because the result type is unsigned, so it cannot overflow.
1602bf215546Sopenharmony_ci   (('uabs_isub', a, b), ('bcsel', ('ilt', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))),
1603bf215546Sopenharmony_ci
1604bf215546Sopenharmony_ci   # Propagate negation up multiplication chains
1605bf215546Sopenharmony_ci   (('fmul(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmul', a, b))),
1606bf215546Sopenharmony_ci   (('fmulz(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmulz', a, b)), '!'+signed_zero_inf_nan_preserve_32),
1607bf215546Sopenharmony_ci   (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)),
1608bf215546Sopenharmony_ci   (('ffmaz', ('fneg', a), ('fneg', b), c), ('ffmaz', a, b, c)),
1609bf215546Sopenharmony_ci   (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))),
1610bf215546Sopenharmony_ci
1611bf215546Sopenharmony_ci   # Propagate constants up multiplication chains
1612bf215546Sopenharmony_ci   (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmul', ('fmul', a, c), b)),
1613bf215546Sopenharmony_ci   (('~fmulz(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmulz', ('fmulz', a, c), b)),
1614bf215546Sopenharmony_ci   (('~fmul(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)'), ('fmulz', ('fmul', a, c), b)),
1615bf215546Sopenharmony_ci   (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('imul', ('imul', a, c), b)),
1616bf215546Sopenharmony_ci   (('~ffma', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffma', ('fmul', a, c), b, d)),
1617bf215546Sopenharmony_ci   (('~ffmaz', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffmaz', ('fmulz', a, c), b, d)),
1618bf215546Sopenharmony_ci   (('~ffma', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)', d), ('ffmaz', ('fmul', a, c), b, d)),
1619bf215546Sopenharmony_ci   # Prefer moving out a multiplication for more MAD/FMA-friendly code
1620bf215546Sopenharmony_ci   (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_fmul)'), '#c'), ('fadd', ('fadd', a, c), b)),
1621bf215546Sopenharmony_ci   (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fadd', ('fadd', a, c), b)),
1622bf215546Sopenharmony_ci   (('~fadd(is_used_once)', ('ffma(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffma', a, b, d), c)),
1623bf215546Sopenharmony_ci   (('~fadd(is_used_once)', ('ffmaz(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffmaz', a, b, d), c)),
1624bf215546Sopenharmony_ci   (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('iadd', ('iadd', a, c), b)),
1625bf215546Sopenharmony_ci
1626bf215546Sopenharmony_ci   # Reassociate constants in add/mul chains so they can be folded together.
1627bf215546Sopenharmony_ci   # For now, we mostly only handle cases where the constants are separated by
1628bf215546Sopenharmony_ci   # a single non-constant.  We could do better eventually.
1629bf215546Sopenharmony_ci   (('~fmul', '#a', ('fmul', 'b(is_not_const)', '#c')), ('fmul', ('fmul', a, c), b)),
1630bf215546Sopenharmony_ci   (('~fmulz', '#a', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmulz', a, c), b)),
1631bf215546Sopenharmony_ci   (('~fmul', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmul', a, c), b)),
1632bf215546Sopenharmony_ci   (('~ffma', '#a', ('fmul', 'b(is_not_const)', '#c'), d), ('ffma', ('fmul', a, c), b, d)),
1633bf215546Sopenharmony_ci   (('~ffmaz', '#a', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmulz', a, c), b, d)),
1634bf215546Sopenharmony_ci   (('~ffmaz', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmul', a, c), b, d)),
1635bf215546Sopenharmony_ci   (('imul', '#a', ('imul', 'b(is_not_const)', '#c')), ('imul', ('imul', a, c), b)),
1636bf215546Sopenharmony_ci   (('~fadd', '#a',          ('fadd', 'b(is_not_const)', '#c')),  ('fadd', ('fadd', a,          c),           b)),
1637bf215546Sopenharmony_ci   (('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))),
1638bf215546Sopenharmony_ci   (('~fadd', '#a',          ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d')),  ('ffma',          b,  c, ('fadd', a,          d))),
1639bf215546Sopenharmony_ci   (('~fadd', '#a', ('fneg', ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffma', ('fneg', b), c, ('fadd', a, ('fneg', d)))),
1640bf215546Sopenharmony_ci   (('~fadd', '#a',          ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d')),  ('ffmaz',          b,  c, ('fadd', a,          d))),
1641bf215546Sopenharmony_ci   (('~fadd', '#a', ('fneg', ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffmaz', ('fneg', b), c, ('fadd', a, ('fneg', d)))),
1642bf215546Sopenharmony_ci   (('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)),
1643bf215546Sopenharmony_ci   (('iand', '#a', ('iand', 'b(is_not_const)', '#c')), ('iand', ('iand', a, c), b)),
1644bf215546Sopenharmony_ci   (('ior',  '#a', ('ior',  'b(is_not_const)', '#c')), ('ior',  ('ior',  a, c), b)),
1645bf215546Sopenharmony_ci   (('ixor', '#a', ('ixor', 'b(is_not_const)', '#c')), ('ixor', ('ixor', a, c), b)),
1646bf215546Sopenharmony_ci
1647bf215546Sopenharmony_ci   # Reassociate add chains for more MAD/FMA-friendly code
1648bf215546Sopenharmony_ci   (('~fadd', ('fadd(is_used_once)', 'a(is_fmul)', 'b(is_fmul)'), 'c(is_not_fmul)'), ('fadd', ('fadd', a, c), b)),
1649bf215546Sopenharmony_ci
1650bf215546Sopenharmony_ci   # Drop mul-div by the same value when there's no wrapping.
1651bf215546Sopenharmony_ci   (('idiv', ('imul(no_signed_wrap)', a, b), b), a),
1652bf215546Sopenharmony_ci
1653bf215546Sopenharmony_ci   # By definition...
1654bf215546Sopenharmony_ci   (('bcsel', ('ige', ('find_lsb', a), 0), ('find_lsb', a), -1), ('find_lsb', a)),
1655bf215546Sopenharmony_ci   (('bcsel', ('ige', ('ifind_msb', a), 0), ('ifind_msb', a), -1), ('ifind_msb', a)),
1656bf215546Sopenharmony_ci   (('bcsel', ('ige', ('ufind_msb', a), 0), ('ufind_msb', a), -1), ('ufind_msb', a)),
1657bf215546Sopenharmony_ci
1658bf215546Sopenharmony_ci   (('bcsel', ('ine', a, 0), ('find_lsb', a), -1), ('find_lsb', a)),
1659bf215546Sopenharmony_ci   (('bcsel', ('ine', a, 0), ('ifind_msb', a), -1), ('ifind_msb', a)),
1660bf215546Sopenharmony_ci   (('bcsel', ('ine', a, 0), ('ufind_msb', a), -1), ('ufind_msb', a)),
1661bf215546Sopenharmony_ci
1662bf215546Sopenharmony_ci   (('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)),
1663bf215546Sopenharmony_ci
1664bf215546Sopenharmony_ci   (('~fmul', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)),
1665bf215546Sopenharmony_ci   (('~fmul', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))),
1666bf215546Sopenharmony_ci   (('~fmulz', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)),
1667bf215546Sopenharmony_ci   (('~fmulz', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))),
1668bf215546Sopenharmony_ci   (('~bcsel', ('flt', a, 0.0), ('fneg', a), a), ('fabs', a)),
1669bf215546Sopenharmony_ci
1670bf215546Sopenharmony_ci   (('bcsel', a, ('bcsel', b, c, d), d), ('bcsel', ('iand', a, b), c, d)),
1671bf215546Sopenharmony_ci   (('bcsel', a, b, ('bcsel', c, b, d)), ('bcsel', ('ior', a, c), b, d)),
1672bf215546Sopenharmony_ci
1673bf215546Sopenharmony_ci   # Misc. lowering
1674bf215546Sopenharmony_ci   (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
1675bf215546Sopenharmony_ci   (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'),
1676bf215546Sopenharmony_ci   (('uadd_carry', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'),
1677bf215546Sopenharmony_ci   (('usub_borrow', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'),
1678bf215546Sopenharmony_ci
1679bf215546Sopenharmony_ci   (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
1680bf215546Sopenharmony_ci    ('bcsel', ('ult', 31, 'bits'), 'insert',
1681bf215546Sopenharmony_ci              ('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')),
1682bf215546Sopenharmony_ci    'options->lower_bitfield_insert'),
1683bf215546Sopenharmony_ci   (('ihadd', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),
1684bf215546Sopenharmony_ci   (('uhadd', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),
1685bf215546Sopenharmony_ci   (('irhadd', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),
1686bf215546Sopenharmony_ci   (('urhadd', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),
1687bf215546Sopenharmony_ci   (('ihadd@64', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
1688bf215546Sopenharmony_ci   (('uhadd@64', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
1689bf215546Sopenharmony_ci   (('irhadd@64', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
1690bf215546Sopenharmony_ci   (('urhadd@64', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
1691bf215546Sopenharmony_ci
1692bf215546Sopenharmony_ci   (('imul_32x16', a, b), ('imul', a, ('extract_i16', b, 0)), 'options->lower_mul_32x16'),
1693bf215546Sopenharmony_ci   (('umul_32x16', a, b), ('imul', a, ('extract_u16', b, 0)), 'options->lower_mul_32x16'),
1694bf215546Sopenharmony_ci
1695bf215546Sopenharmony_ci   (('uadd_sat@64', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_uadd_sat || (options->lower_int64_options & nir_lower_iadd64) != 0'),
1696bf215546Sopenharmony_ci   (('uadd_sat', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_uadd_sat'),
1697bf215546Sopenharmony_ci   (('usub_sat', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_usub_sat'),
1698bf215546Sopenharmony_ci   (('usub_sat@64', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), '(options->lower_int64_options & nir_lower_usub_sat64) != 0'),
1699bf215546Sopenharmony_ci
1700bf215546Sopenharmony_ci   # int64_t sum = a + b;
1701bf215546Sopenharmony_ci   #
   # if (a < 0 && b < 0 && a < sum) {
   #    sum = INT64_MIN;
   # } else if (a >= 0 && b >= 0 && sum < a) {
1705bf215546Sopenharmony_ci   #    sum = INT64_MAX;
1706bf215546Sopenharmony_ci   # }
1707bf215546Sopenharmony_ci   #
1708bf215546Sopenharmony_ci   # A couple optimizations are applied.
1709bf215546Sopenharmony_ci   #
1710bf215546Sopenharmony_ci   # 1. a < sum => sum >= 0.  This replacement works because it is known that
1711bf215546Sopenharmony_ci   #    a < 0 and b < 0, so sum should also be < 0 unless there was
1712bf215546Sopenharmony_ci   #    underflow.
1713bf215546Sopenharmony_ci   #
1714bf215546Sopenharmony_ci   # 2. sum < a => sum < 0.  This replacement works because it is known that
1715bf215546Sopenharmony_ci   #    a >= 0 and b >= 0, so sum should also be >= 0 unless there was
1716bf215546Sopenharmony_ci   #    overflow.
1717bf215546Sopenharmony_ci   #
1718bf215546Sopenharmony_ci   # 3. Invert the second if-condition and swap the order of parameters for
1719bf215546Sopenharmony_ci   #    the bcsel. !(a >= 0 && b >= 0 && sum < 0) becomes !(a >= 0) || !(b >=
1720bf215546Sopenharmony_ci   #    0) || !(sum < 0), and that becomes (a < 0) || (b < 0) || (sum >= 0)
1721bf215546Sopenharmony_ci   #
1722bf215546Sopenharmony_ci   # On Intel Gen11, this saves ~11 instructions.
1723bf215546Sopenharmony_ci   (('iadd_sat@64', a, b), ('bcsel',
1724bf215546Sopenharmony_ci                            ('iand', ('iand', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)),
1725bf215546Sopenharmony_ci                            0x8000000000000000,
1726bf215546Sopenharmony_ci                            ('bcsel',
1727bf215546Sopenharmony_ci                             ('ior', ('ior', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)),
1728bf215546Sopenharmony_ci                             ('iadd', a, b),
1729bf215546Sopenharmony_ci                             0x7fffffffffffffff)),
1730bf215546Sopenharmony_ci    '(options->lower_int64_options & nir_lower_iadd_sat64) != 0'),
1731bf215546Sopenharmony_ci
1732bf215546Sopenharmony_ci   # int64_t sum = a - b;
1733bf215546Sopenharmony_ci   #
   # if (a < 0 && b >= 0 && a < sum) {
   #    sum = INT64_MIN;
   # } else if (a >= 0 && b < 0 && a >= sum) {
1737bf215546Sopenharmony_ci   #    sum = INT64_MAX;
1738bf215546Sopenharmony_ci   # }
1739bf215546Sopenharmony_ci   #
1740bf215546Sopenharmony_ci   # Optimizations similar to the iadd_sat case are applied here.
1741bf215546Sopenharmony_ci   (('isub_sat@64', a, b), ('bcsel',
1742bf215546Sopenharmony_ci                            ('iand', ('iand', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)),
1743bf215546Sopenharmony_ci                            0x8000000000000000,
1744bf215546Sopenharmony_ci                            ('bcsel',
1745bf215546Sopenharmony_ci                             ('ior', ('ior', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)),
1746bf215546Sopenharmony_ci                             ('isub', a, b),
1747bf215546Sopenharmony_ci                             0x7fffffffffffffff)),
1748bf215546Sopenharmony_ci    '(options->lower_int64_options & nir_lower_iadd_sat64) != 0'),
1749bf215546Sopenharmony_ci
1750bf215546Sopenharmony_ci   # These are done here instead of in the backend because the int64 lowering
1751bf215546Sopenharmony_ci   # pass will make a mess of the patterns.  The first patterns are
1752bf215546Sopenharmony_ci   # conditioned on nir_lower_minmax64 because it was not clear that it was
1753bf215546Sopenharmony_ci   # always an improvement on platforms that have real int64 support.  No
1754bf215546Sopenharmony_ci   # shaders in shader-db hit this, so it was hard to say one way or the
1755bf215546Sopenharmony_ci   # other.
1756bf215546Sopenharmony_ci   (('ilt', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
1757bf215546Sopenharmony_ci   (('ilt', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
1758bf215546Sopenharmony_ci   (('ige', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
1759bf215546Sopenharmony_ci   (('ige', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
1760bf215546Sopenharmony_ci   (('ilt', 'a@64', 0), ('ilt', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
1761bf215546Sopenharmony_ci   (('ige', 'a@64', 0), ('ige', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
1762bf215546Sopenharmony_ci
1763bf215546Sopenharmony_ci   (('ine', 'a@64', 0), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
1764bf215546Sopenharmony_ci   (('ieq', 'a@64', 0), ('ieq', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
1765bf215546Sopenharmony_ci   # 0u < uint(a) <=> uint(a) != 0u
1766bf215546Sopenharmony_ci   (('ult', 0, 'a@64'), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
1767bf215546Sopenharmony_ci
1768bf215546Sopenharmony_ci   # Alternative lowering that doesn't rely on bfi.
1769bf215546Sopenharmony_ci   (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
1770bf215546Sopenharmony_ci    ('bcsel', ('ult', 31, 'bits'),
1771bf215546Sopenharmony_ci     'insert',
1772bf215546Sopenharmony_ci    (('ior',
1773bf215546Sopenharmony_ci     ('iand', 'base', ('inot', ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))),
1774bf215546Sopenharmony_ci     ('iand', ('ishl', 'insert', 'offset'), ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))))),
1775bf215546Sopenharmony_ci    'options->lower_bitfield_insert_to_shifts'),
1776bf215546Sopenharmony_ci
1777bf215546Sopenharmony_ci   # Alternative lowering that uses bitfield_select.
1778bf215546Sopenharmony_ci   (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
1779bf215546Sopenharmony_ci    ('bcsel', ('ult', 31, 'bits'), 'insert',
1780bf215546Sopenharmony_ci              ('bitfield_select', ('bfm', 'bits', 'offset'), ('ishl', 'insert', 'offset'), 'base')),
1781bf215546Sopenharmony_ci    'options->lower_bitfield_insert_to_bitfield_select'),
1782bf215546Sopenharmony_ci
1783bf215546Sopenharmony_ci   (('ibitfield_extract', 'value', 'offset', 'bits'),
1784bf215546Sopenharmony_ci    ('bcsel', ('ult', 31, 'bits'), 'value',
1785bf215546Sopenharmony_ci              ('ibfe', 'value', 'offset', 'bits')),
1786bf215546Sopenharmony_ci    'options->lower_bitfield_extract'),
1787bf215546Sopenharmony_ci
1788bf215546Sopenharmony_ci   (('ubitfield_extract', 'value', 'offset', 'bits'),
1789bf215546Sopenharmony_ci    ('bcsel', ('ult', 31, 'bits'), 'value',
1790bf215546Sopenharmony_ci              ('ubfe', 'value', 'offset', 'bits')),
1791bf215546Sopenharmony_ci    'options->lower_bitfield_extract'),
1792bf215546Sopenharmony_ci
1793bf215546Sopenharmony_ci   # (src0 & src1) | (~src0 & src2). Constant fold if src2 is 0.
1794bf215546Sopenharmony_ci   (('bitfield_select', a, b, 0), ('iand', a, b)),
1795bf215546Sopenharmony_ci   (('bitfield_select', a, ('iand', a, b), c), ('bitfield_select', a, b, c)),
1796bf215546Sopenharmony_ci
1797bf215546Sopenharmony_ci   # Note that these opcodes are defined to only use the five least significant bits of 'offset' and 'bits'
1798bf215546Sopenharmony_ci   (('ubfe', 'value', 'offset', ('iand', 31, 'bits')), ('ubfe', 'value', 'offset', 'bits')),
1799bf215546Sopenharmony_ci   (('ubfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ubfe', 'value', 'offset', 'bits')),
1800bf215546Sopenharmony_ci   (('ibfe', 'value', 'offset', ('iand', 31, 'bits')), ('ibfe', 'value', 'offset', 'bits')),
1801bf215546Sopenharmony_ci   (('ibfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ibfe', 'value', 'offset', 'bits')),
1802bf215546Sopenharmony_ci   (('bfm', 'bits', ('iand', 31, 'offset')), ('bfm', 'bits', 'offset')),
1803bf215546Sopenharmony_ci   (('bfm', ('iand', 31, 'bits'), 'offset'), ('bfm', 'bits', 'offset')),
1804bf215546Sopenharmony_ci
1805bf215546Sopenharmony_ci   # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says:
1806bf215546Sopenharmony_ci   #
1807bf215546Sopenharmony_ci   #    If bits is zero, the result will be zero.
1808bf215546Sopenharmony_ci   #
   # These patterns prevent other patterns from generating invalid results
   # when bits is zero.
1811bf215546Sopenharmony_ci   (('ubfe', a, b, 0), 0),
1812bf215546Sopenharmony_ci   (('ibfe', a, b, 0), 0),
1813bf215546Sopenharmony_ci
1814bf215546Sopenharmony_ci   (('ubfe', a, 0, '#b'), ('iand', a, ('ushr', 0xffffffff, ('ineg', b)))),
1815bf215546Sopenharmony_ci
1816bf215546Sopenharmony_ci   (('b2i32', ('i2b', ('ubfe', a, b, 1))), ('ubfe', a, b, 1)),
1817bf215546Sopenharmony_ci   (('b2i32', ('i2b', ('ibfe', a, b, 1))), ('ubfe', a, b, 1)), # ubfe in the replacement is correct
1818bf215546Sopenharmony_ci   (('ine', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
1819bf215546Sopenharmony_ci   (('ieq', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
1820bf215546Sopenharmony_ci   (('ine', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
1821bf215546Sopenharmony_ci   (('ieq', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
1822bf215546Sopenharmony_ci
1823bf215546Sopenharmony_ci   (('ibitfield_extract', 'value', 'offset', 'bits'),
1824bf215546Sopenharmony_ci    ('bcsel', ('ieq', 0, 'bits'),
1825bf215546Sopenharmony_ci     0,
1826bf215546Sopenharmony_ci     ('ishr',
1827bf215546Sopenharmony_ci       ('ishl', 'value', ('isub', ('isub', 32, 'bits'), 'offset')),
1828bf215546Sopenharmony_ci       ('isub', 32, 'bits'))),
1829bf215546Sopenharmony_ci    'options->lower_bitfield_extract_to_shifts'),
1830bf215546Sopenharmony_ci
1831bf215546Sopenharmony_ci   (('ubitfield_extract', 'value', 'offset', 'bits'),
1832bf215546Sopenharmony_ci    ('iand',
1833bf215546Sopenharmony_ci     ('ushr', 'value', 'offset'),
1834bf215546Sopenharmony_ci     ('bcsel', ('ieq', 'bits', 32),
1835bf215546Sopenharmony_ci      0xffffffff,
1836bf215546Sopenharmony_ci      ('isub', ('ishl', 1, 'bits'), 1))),
1837bf215546Sopenharmony_ci    'options->lower_bitfield_extract_to_shifts'),
1838bf215546Sopenharmony_ci
1839bf215546Sopenharmony_ci   (('ifind_msb', 'value'),
1840bf215546Sopenharmony_ci    ('ufind_msb', ('bcsel', ('ilt', 'value', 0), ('inot', 'value'), 'value')),
1841bf215546Sopenharmony_ci    'options->lower_ifind_msb'),
1842bf215546Sopenharmony_ci
1843bf215546Sopenharmony_ci   (('ifind_msb', 'value'),
1844bf215546Sopenharmony_ci    ('bcsel', ('ige', ('ifind_msb_rev', 'value'), 0),
1845bf215546Sopenharmony_ci     ('isub', 31, ('ifind_msb_rev', 'value')),
1846bf215546Sopenharmony_ci     ('ifind_msb_rev', 'value')),
1847bf215546Sopenharmony_ci    'options->lower_find_msb_to_reverse'),
1848bf215546Sopenharmony_ci
1849bf215546Sopenharmony_ci    (('ufind_msb', 'value'),
1850bf215546Sopenharmony_ci     ('bcsel', ('ige', ('ufind_msb_rev', 'value'), 0),
1851bf215546Sopenharmony_ci      ('isub', 31, ('ufind_msb_rev', 'value')),
1852bf215546Sopenharmony_ci      ('ufind_msb_rev', 'value')),
1853bf215546Sopenharmony_ci     'options->lower_find_msb_to_reverse'),
1854bf215546Sopenharmony_ci
1855bf215546Sopenharmony_ci   (('find_lsb', 'value'),
1856bf215546Sopenharmony_ci    ('ufind_msb', ('iand', 'value', ('ineg', 'value'))),
1857bf215546Sopenharmony_ci    'options->lower_find_lsb'),
1858bf215546Sopenharmony_ci
1859bf215546Sopenharmony_ci   (('extract_i8', a, 'b@32'),
1860bf215546Sopenharmony_ci    ('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 24),
1861bf215546Sopenharmony_ci    'options->lower_extract_byte'),
1862bf215546Sopenharmony_ci
1863bf215546Sopenharmony_ci   (('extract_u8', a, 'b@32'),
1864bf215546Sopenharmony_ci    ('iand', ('ushr', a, ('imul', b, 8)), 0xff),
1865bf215546Sopenharmony_ci    'options->lower_extract_byte'),
1866bf215546Sopenharmony_ci
1867bf215546Sopenharmony_ci   (('extract_i16', a, 'b@32'),
1868bf215546Sopenharmony_ci    ('ishr', ('ishl', a, ('imul', ('isub', 1, b), 16)), 16),
1869bf215546Sopenharmony_ci    'options->lower_extract_word'),
1870bf215546Sopenharmony_ci
1871bf215546Sopenharmony_ci   (('extract_u16', a, 'b@32'),
1872bf215546Sopenharmony_ci    ('iand', ('ushr', a, ('imul', b, 16)), 0xffff),
1873bf215546Sopenharmony_ci    'options->lower_extract_word'),
1874bf215546Sopenharmony_ci
1875bf215546Sopenharmony_ci    (('pack_unorm_2x16', 'v'),
1876bf215546Sopenharmony_ci     ('pack_uvec2_to_uint',
1877bf215546Sopenharmony_ci        ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 65535.0)))),
1878bf215546Sopenharmony_ci     'options->lower_pack_unorm_2x16'),
1879bf215546Sopenharmony_ci
1880bf215546Sopenharmony_ci    (('pack_unorm_4x8', 'v'),
1881bf215546Sopenharmony_ci     ('pack_uvec4_to_uint',
1882bf215546Sopenharmony_ci        ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))),
1883bf215546Sopenharmony_ci     'options->lower_pack_unorm_4x8'),
1884bf215546Sopenharmony_ci
1885bf215546Sopenharmony_ci    (('pack_snorm_2x16', 'v'),
1886bf215546Sopenharmony_ci     ('pack_uvec2_to_uint',
1887bf215546Sopenharmony_ci        ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 32767.0)))),
1888bf215546Sopenharmony_ci     'options->lower_pack_snorm_2x16'),
1889bf215546Sopenharmony_ci
1890bf215546Sopenharmony_ci    (('pack_snorm_4x8', 'v'),
1891bf215546Sopenharmony_ci     ('pack_uvec4_to_uint',
1892bf215546Sopenharmony_ci        ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))),
1893bf215546Sopenharmony_ci     'options->lower_pack_snorm_4x8'),
1894bf215546Sopenharmony_ci
1895bf215546Sopenharmony_ci    (('unpack_unorm_2x16', 'v'),
1896bf215546Sopenharmony_ci     ('fdiv', ('u2f32', ('vec2', ('extract_u16', 'v', 0),
1897bf215546Sopenharmony_ci                                  ('extract_u16', 'v', 1))),
1898bf215546Sopenharmony_ci              65535.0),
1899bf215546Sopenharmony_ci     'options->lower_unpack_unorm_2x16'),
1900bf215546Sopenharmony_ci
1901bf215546Sopenharmony_ci    (('unpack_unorm_4x8', 'v'),
1902bf215546Sopenharmony_ci     ('fdiv', ('u2f32', ('vec4', ('extract_u8', 'v', 0),
1903bf215546Sopenharmony_ci                                  ('extract_u8', 'v', 1),
1904bf215546Sopenharmony_ci                                  ('extract_u8', 'v', 2),
1905bf215546Sopenharmony_ci                                  ('extract_u8', 'v', 3))),
1906bf215546Sopenharmony_ci              255.0),
1907bf215546Sopenharmony_ci     'options->lower_unpack_unorm_4x8'),
1908bf215546Sopenharmony_ci
1909bf215546Sopenharmony_ci    (('unpack_snorm_2x16', 'v'),
1910bf215546Sopenharmony_ci     ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec2', ('extract_i16', 'v', 0),
1911bf215546Sopenharmony_ci                                                            ('extract_i16', 'v', 1))),
1912bf215546Sopenharmony_ci                                           32767.0))),
1913bf215546Sopenharmony_ci     'options->lower_unpack_snorm_2x16'),
1914bf215546Sopenharmony_ci
1915bf215546Sopenharmony_ci    (('unpack_snorm_4x8', 'v'),
1916bf215546Sopenharmony_ci     ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_i8', 'v', 0),
1917bf215546Sopenharmony_ci                                                            ('extract_i8', 'v', 1),
1918bf215546Sopenharmony_ci                                                            ('extract_i8', 'v', 2),
1919bf215546Sopenharmony_ci                                                            ('extract_i8', 'v', 3))),
1920bf215546Sopenharmony_ci                                           127.0))),
1921bf215546Sopenharmony_ci     'options->lower_unpack_snorm_4x8'),
1922bf215546Sopenharmony_ci
1923bf215546Sopenharmony_ci   (('pack_half_2x16_split', 'a@32', 'b@32'),
1924bf215546Sopenharmony_ci    ('ior', ('ishl', ('u2u32', ('f2f16', b)), 16), ('u2u32', ('f2f16', a))),
1925bf215546Sopenharmony_ci    'options->lower_pack_split'),
1926bf215546Sopenharmony_ci
1927bf215546Sopenharmony_ci   (('unpack_half_2x16_split_x', 'a@32'),
1928bf215546Sopenharmony_ci    ('f2f32', ('u2u16', a)),
1929bf215546Sopenharmony_ci    'options->lower_pack_split'),
1930bf215546Sopenharmony_ci
1931bf215546Sopenharmony_ci   (('unpack_half_2x16_split_y', 'a@32'),
1932bf215546Sopenharmony_ci    ('f2f32', ('u2u16', ('ushr', a, 16))),
1933bf215546Sopenharmony_ci    'options->lower_pack_split'),
1934bf215546Sopenharmony_ci
1935bf215546Sopenharmony_ci   (('pack_32_2x16_split', 'a@16', 'b@16'),
1936bf215546Sopenharmony_ci    ('ior', ('ishl', ('u2u32', b), 16), ('u2u32', a)),
1937bf215546Sopenharmony_ci    'options->lower_pack_split'),
1938bf215546Sopenharmony_ci
1939bf215546Sopenharmony_ci   (('unpack_32_2x16_split_x', 'a@32'),
1940bf215546Sopenharmony_ci    ('u2u16', a),
1941bf215546Sopenharmony_ci    'options->lower_pack_split'),
1942bf215546Sopenharmony_ci
1943bf215546Sopenharmony_ci   (('unpack_32_2x16_split_y', 'a@32'),
1944bf215546Sopenharmony_ci    ('u2u16', ('ushr', 'a', 16)),
1945bf215546Sopenharmony_ci    'options->lower_pack_split'),
1946bf215546Sopenharmony_ci
1947bf215546Sopenharmony_ci   (('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'),
1948bf215546Sopenharmony_ci   (('imin', ('imax', a, -1), 1), ('isign', a), '!options->lower_isign'),
1949bf215546Sopenharmony_ci   (('imax', ('imin', a, 1), -1), ('isign', a), '!options->lower_isign'),
1950bf215546Sopenharmony_ci   # float(0 < NaN) - float(NaN < 0) = float(False) - float(False) = 0 - 0 = 0
1951bf215546Sopenharmony_ci   # Mark the new comparisons precise to prevent them being changed to 'a !=
1952bf215546Sopenharmony_ci   # 0' or 'a == 0'.
1953bf215546Sopenharmony_ci   (('fsign', a), ('fsub', ('b2f', ('!flt', 0.0, a)), ('b2f', ('!flt', a, 0.0))), 'options->lower_fsign'),
1954bf215546Sopenharmony_ci
1955bf215546Sopenharmony_ci   # Address/offset calculations:
1956bf215546Sopenharmony_ci   # Drivers supporting imul24 should use the nir_lower_amul() pass, this
1957bf215546Sopenharmony_ci   # rule converts everyone else to imul:
1958bf215546Sopenharmony_ci   (('amul', a, b), ('imul', a, b), '!options->has_imul24'),
1959bf215546Sopenharmony_ci
1960bf215546Sopenharmony_ci   (('umul24', a, b),
1961bf215546Sopenharmony_ci    ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)),
1962bf215546Sopenharmony_ci    '!options->has_umul24'),
1963bf215546Sopenharmony_ci   (('umad24', a, b, c),
1964bf215546Sopenharmony_ci    ('iadd', ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)), c),
1965bf215546Sopenharmony_ci    '!options->has_umad24'),
1966bf215546Sopenharmony_ci
1967bf215546Sopenharmony_ci   # Relaxed 24bit ops
1968bf215546Sopenharmony_ci   (('imul24_relaxed', a, b), ('imul24', a, b), 'options->has_imul24'),
1969bf215546Sopenharmony_ci   (('imul24_relaxed', a, b), ('imul', a, b), '!options->has_imul24'),
1970bf215546Sopenharmony_ci   (('umad24_relaxed', a, b, c), ('umad24', a, b, c), 'options->has_umad24'),
1971bf215546Sopenharmony_ci   (('umad24_relaxed', a, b, c), ('iadd', ('umul24_relaxed', a, b), c), '!options->has_umad24'),
1972bf215546Sopenharmony_ci   (('umul24_relaxed', a, b), ('umul24', a, b), 'options->has_umul24'),
1973bf215546Sopenharmony_ci   (('umul24_relaxed', a, b), ('imul', a, b), '!options->has_umul24'),
1974bf215546Sopenharmony_ci
1975bf215546Sopenharmony_ci   (('imad24_ir3', a, b, 0), ('imul24', a, b)),
1976bf215546Sopenharmony_ci   (('imad24_ir3', a, 0, c), (c)),
1977bf215546Sopenharmony_ci   (('imad24_ir3', a, 1, c), ('iadd', a, c)),
1978bf215546Sopenharmony_ci
1979bf215546Sopenharmony_ci   # if first two srcs are const, crack apart the imad so constant folding
1980bf215546Sopenharmony_ci   # can clean up the imul:
1981bf215546Sopenharmony_ci   # TODO ffma should probably get a similar rule:
1982bf215546Sopenharmony_ci   (('imad24_ir3', '#a', '#b', c), ('iadd', ('imul', a, b), c)),
1983bf215546Sopenharmony_ci
   # These will turn 24b address/offset calc back into 32b shifts, but
   # it should be safe to get back some of the bits of precision that we
   # already decided were not necessary:
1987bf215546Sopenharmony_ci   (('imul24', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
1988bf215546Sopenharmony_ci   (('imul24', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
1989bf215546Sopenharmony_ci   (('imul24', a, 0), (0)),
1990bf215546Sopenharmony_ci
1991bf215546Sopenharmony_ci   (('fcsel', ('slt', 0, a), b, c), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"),
1992bf215546Sopenharmony_ci   (('fcsel', ('slt', a, 0), b, c), ('fcsel_gt', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
1993bf215546Sopenharmony_ci   (('fcsel', ('sge', a, 0), b, c), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"),
1994bf215546Sopenharmony_ci   (('fcsel', ('sge', 0, a), b, c), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
1995bf215546Sopenharmony_ci
1996bf215546Sopenharmony_ci   (('bcsel', ('ilt', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, b, c), "options->has_fused_comp_and_csel"),
1997bf215546Sopenharmony_ci   (('bcsel', ('ilt', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, c, b), "options->has_fused_comp_and_csel"),
1998bf215546Sopenharmony_ci   (('bcsel', ('ige', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, b, c), "options->has_fused_comp_and_csel"),
1999bf215546Sopenharmony_ci   (('bcsel', ('ige', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, c, b), "options->has_fused_comp_and_csel"),
2000bf215546Sopenharmony_ci
2001bf215546Sopenharmony_ci   (('bcsel', ('flt', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"),
2002bf215546Sopenharmony_ci   (('bcsel', ('flt', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_gt', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
2003bf215546Sopenharmony_ci   (('bcsel', ('fge', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"),
2004bf215546Sopenharmony_ci   (('bcsel', ('fge', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
2005bf215546Sopenharmony_ci
2006bf215546Sopenharmony_ci])
2007bf215546Sopenharmony_ci
# Lowerings whose constants depend on the bit size of the operation.
for bit_size in (8, 16, 32, 64):
   # Bit patterns of the smallest and largest signed values of this size.
   intmin = 1 << (bit_size - 1)
   intmax = intmin - 1

   # The wrapping results that the saturating opcodes fall back to.
   wrapped_sum = ('iadd', a, b)
   wrapped_diff = ('isub', a, b)

   # Each lowering computes the wrapping result and selects a saturation
   # limit instead when the result moved the wrong way relative to a.  Both
   # rules are gated on options->lower_iadd_sat.
   optimizations.extend([
      (('iadd_sat@{0}'.format(bit_size), a, b),
       ('bcsel', ('ige', b, 1),
                 ('bcsel', ('ilt', wrapped_sum, a), intmax, wrapped_sum),
                 ('bcsel', ('ilt', a, wrapped_sum), intmin, wrapped_sum)),
       'options->lower_iadd_sat'),
      (('isub_sat@{0}'.format(bit_size), a, b),
       ('bcsel', ('ilt', b, 0),
                 ('bcsel', ('ilt', wrapped_diff, a), intmax, wrapped_diff),
                 ('bcsel', ('ilt', a, wrapped_diff), intmin, wrapped_diff)),
       'options->lower_iadd_sat'),
   ])
2022bf215546Sopenharmony_ci
# Maps each float (in)equality opcode to its logical negation.
invert = OrderedDict((('feq', 'fneu'), ('fneu', 'feq')))

# De Morgan's laws for every unordered pair of the comparisons above:
#
#    !(x || y) -> !x && !y
#    !(x && y) -> !x || !y
#
# where negating a comparison just swaps in the opposite opcode.
for left, right in itertools.combinations_with_replacement(invert, 2):
   not_left = (invert[left], a, b)
   not_right = (invert[right], c, d)
   optimizations += [
      (('inot', ('ior(is_used_once)', (left, a, b), (right, c, d))),
       ('iand', not_left, not_right)),
      (('inot', ('iand(is_used_once)', (left, a, b), (right, c, d))),
       ('ior', not_left, not_right)),
   ]
2030bf215546Sopenharmony_ci
# Optimize x2bN(b2x(x)) -> x
#
# Converting a boolean to a float/int and straight back to a boolean of the
# same size yields the original boolean.
for size in type_sizes('bool'):
    aN = 'a@{0}'.format(size)
    f2bN = 'f2b{0}'.format(size)
    i2bN = 'i2b{0}'.format(size)
    optimizations += [
        ((f2bN, ('b2f', aN)), a),
        ((i2bN, ('b2i', aN)), a),
    ]
2038bf215546Sopenharmony_ci
# Optimize x2yN(b2x(x)) -> b2y
#
# A conversion applied to a freshly converted boolean can be replaced by the
# boolean conversion that produces the final type directly.  The generated
# conversion opcode name carries no bit-size suffix.
for x, y in itertools.product(['f', 'u', 'i'], repeat=2):
   # Mixed-signedness integer pairs (u2i / i2u) are not generated; every
   # other combination involves a float or is a same-type resize.
   if x == 'f' or y == 'f' or x == y:
      b2x = 'b2i' if x != 'f' else 'b2f'
      b2y = 'b2i' if y != 'f' else 'b2f'
      x2yN = x + '2' + y
      optimizations.append(((x2yN, (b2x, a)), (b2y, a)))
2048bf215546Sopenharmony_ci
# Optimize away x2xN(a@N)
#
# Converting an N-bit value to the same type at the same N bits is a no-op.
for t in ['int', 'uint', 'float', 'bool']:
   for N in type_sizes(t):
      x2xN = t[0] + '2' + t[0] + str(N)
      aN = 'a@' + str(N)
      optimizations.append(((x2xN, aN), a))
2055bf215546Sopenharmony_ci
# Optimize x2xN(y2yM(a@P)) -> y2yN(a) for integers
# In particular, we can optimize away everything except an up-cast of a
# down-cast and up-casts whose type differs from that of the other cast.
for N, M in itertools.product(type_sizes('uint'), repeat=2):
   if N == M:
      # Equal sizes are handled by other optimizations.
      continue

   if N < M:
      # The outer cast is a down-cast.  The size of the inner cast's argument
      # is irrelevant because this can never be the up-cast-of-down-cast
      # case, so the source is matched without a size.  Regardless of the
      # types involved, the result is always y2yN.
      for x, y in itertools.product(['i', 'u'], repeat=2):
         x2xN = x + '2' + x + str(N)
         y2yM = y + '2' + y + str(M)
         y2yN = y + '2' + y + str(N)
         optimizations.append(((x2xN, (y2yM, a)), (y2yN, a)))
      continue

   # N > M: the outer cast is an up-cast, so both the size of the inner
   # cast's argument and the types need care.
   for P in type_sizes('uint'):
      # An inner cast from a wider P down to M would make this an up-cast of
      # a down-cast, which cannot be optimized away.
      if P > M:
         continue

      # Here the inner cast does not narrow and the outer cast widens, so
      # the two casts must have the same type; the result keeps that type.
      for x in ['i', 'u']:
         x2xN = x + '2' + x + str(N)
         x2xM = x + '2' + x + str(M)
         aP = 'a@' + str(P)
         optimizations.append(((x2xN, (x2xM, aP)), (x2xN, a)))
2090bf215546Sopenharmony_ci
# Downcast operations should be able to see through pack
#
# A down-cast of pack_64_2x32_split(a, b) to 32 bits or fewer is rewritten as
# the same down-cast of the first pack source alone.
for t in ['i', 'u']:
    for N in [8, 16, 32]:
        x2xN = '{0}2{0}{1}'.format(t, N)
        # This rule used to be listed twice, byte for byte; the second copy
        # could never match anything the first had not already rewritten, so
        # only one is emitted.
        optimizations.append(
            ((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)))
2099bf215546Sopenharmony_ci
# Optimize comparisons with up-casts
for t in ['int', 'uint', 'float']:
    for N, M in itertools.product(type_sizes(t), repeat=2):
        # Only up-casts from N bits to a strictly wider M bits are handled,
        # and 1-bit sources are skipped.
        if N == 1 or N >= M:
            continue

        # Narrowing a comparison to 8 or 16 bits is gated on the matching
        # support_*bit_alu option; other sizes are unconditional.
        cond = {
            8: 'options->support_8bit_alu',
            16: 'options->support_16bit_alu',
        }.get(N, 'true')

        x2xM = t[0] + '2' + t[0] + str(M)
        x2xN = t[0] + '2' + t[0] + str(N)
        aN = 'a@{0}'.format(N)
        bN = 'b@{0}'.format(N)
        xeq = 'ieq' if t != 'float' else 'feq'
        xne = 'ine' if t != 'float' else 'fneu'
        xge = t[0] + 'ge'
        xlt = t[0] + 'lt'

        # Pieces shared by the rules below.
        wide_a = (x2xM, aN)
        narrow_a = (x2xN, a)
        narrow_b = (x2xN, b)

        # Up-casts are lossless, so a correctly signed comparison of two
        # up-casted values can be done at the larger of the two original
        # sizes, dropping one or both casts.  (Other optimizations remove
        # the no-op casts this may generate.)
        for P in type_sizes(t):
            if P == 1 or P > N:
                continue

            bP = 'b@{0}'.format(P)
            wide_b = (x2xM, bP)
            optimizations.extend([
                ((xeq, wide_a, wide_b), (xeq, a, narrow_b), cond),
                ((xne, wide_a, wide_b), (xne, a, narrow_b), cond),
                ((xge, wide_a, wide_b), (xge, a, narrow_b), cond),
                ((xlt, wide_a, wide_b), (xlt, a, narrow_b), cond),
                ((xge, wide_b, wide_a), (xge, narrow_b, a), cond),
                ((xlt, wide_b, wide_a), (xlt, narrow_b, a), cond),
            ])

        # The remaining rules are integer-only: the range checks would get
        # way too complicated for floats.
        if t == 'float':
            continue

        # Smallest and largest values representable in N bits for this type.
        if t == 'int':
            xN_max = (1 << (N - 1)) - 1
            xN_min = -xN_max - 1
        else:
            xN_min = 0
            xN_max = (1 << N) - 1

        # When an up-casted value is compared to a constant, unfold the
        # comparison into a comparison against the shrunk-down constant plus
        # a check that the constant fits in the smaller bit size.
        optimizations.extend([
            ((xeq, wide_a, '#b'),
             ('iand', (xeq, a, narrow_b), (xeq, (x2xM, narrow_b), b)), cond),
            ((xne, wide_a, '#b'),
             ('ior', (xne, a, narrow_b), (xne, (x2xM, narrow_b), b)), cond),
            ((xlt, wide_a, '#b'),
             ('iand', (xlt, xN_min, b),
                      ('ior', (xlt, xN_max, b), (xlt, a, narrow_b))), cond),
            ((xlt, '#a', (x2xM, bN)),
             ('iand', (xlt, a, xN_max),
                      ('ior', (xlt, a, xN_min), (xlt, narrow_a, b))), cond),
            ((xge, wide_a, '#b'),
             ('iand', (xge, xN_max, b),
                      ('ior', (xge, xN_min, b), (xge, a, narrow_b))), cond),
            ((xge, '#a', (x2xM, bN)),
             ('iand', (xge, a, xN_min),
                      ('ior', (xge, a, xN_max), (xge, narrow_a, b))), cond),
        ])
2171bf215546Sopenharmony_ci
# Convert masking followed by signed downcast to just unsigned downcast
#
# One rule per (destination size, wider source size) pair; the mask matched
# is the all-ones value of the destination size.
optimizations += [
    (('i2i{0}'.format(dst), ('iand', 'a@{0}'.format(src), (1 << dst) - 1)),
     ('u2u{0}'.format(dst), a))
    for dst in (32, 16, 8)
    for src in (16, 32, 64)
    if src > dst
]
2181bf215546Sopenharmony_ci
# Some operations such as iadd have the property that the bottom N bits of the
# output only depend on the bottom N bits of each of the inputs, so we can
# remove casts that only affect the bits above them.
for N in [16, 32]:
    for M in [8, 16]:
        # Only M-bit round-trips of a strictly wider N-bit value matter.
        if M >= N:
            continue

        aN = 'a@{0}'.format(N)
        u2uM = 'u2u' + str(M)
        i2iM = 'i2i' + str(M)

        # Condition suffix attached to the consuming opcode.
        low_bits_only = '(only_lower_{0}_bits_used)'.format(M)

        for x in ['u', 'i']:
            x2xN = x + '2' + x + str(N)
            extract_xM = 'extract_' + x + str(M)

            x2xN_M_bits = x2xN + low_bits_only
            extract_xM_M_bits = extract_xM + low_bits_only

            # The re-extension itself is flagged with the condition: drop it.
            optimizations.extend([
                ((x2xN_M_bits, (u2uM, aN)), a),
                ((extract_xM_M_bits, aN, 0), a),
            ])

            # The three ways of writing "keep the low M bits of a and extend
            # back to N bits" that are looked through below.
            round_trips = [
                (x2xN, (u2uM, aN)),
                (x2xN, (i2iM, aN)),
                (extract_xM, aN, 0),
            ]

            bcsel_M_bits = 'bcsel' + low_bits_only
            optimizations.extend(
                [((bcsel_M_bits, c, src, b), ('bcsel', c, a, b))
                 for src in round_trips])

            for op in ['iadd', 'imul', 'iand', 'ior', 'ixor']:
                op_M_bits = op + low_bits_only
                optimizations.extend(
                    [((op_M_bits, src, b), (op, a, b))
                     for src in round_trips])
2220bf215546Sopenharmony_ci
def fexp2i(exp, bits):
   # Generate an integer expression which constructs the bit pattern of the
   # floating-point value 2.0^exp or 0.0.
   #
   # exp:  expression for the (unbiased) exponent.
   # bits: bit size of the float to construct: 16, 32 or 64.
   #
   # We assume that exp is already in a valid range:
   #
   #   * [-15, 15] for 16-bit float
   #   * [-127, 127] for 32-bit float
   #   * [-1023, 1023] for 64-bit float
   #
   # If exp is the lowest value in the valid range, a value of 0.0 is
   # constructed.  Otherwise, the value 2.0^exp is constructed.
   #
   # Raises ValueError for any other bit size.  (This used to be a bare
   # "assert False", which is stripped under "python -O" and would then make
   # the function silently return None.)
   if bits == 16:
      return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
   if bits == 32:
      return ('ishl', ('iadd', exp, 127), 23)
   if bits == 64:
      # The shifted exponent goes into the second source of the 64-bit pack;
      # the first source is zero.
      return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20))

   raise ValueError('fexp2i: unsupported bit size {0!r}'.format(bits))
2240bf215546Sopenharmony_ci
def ldexp(f, exp, bits):
   # Build the lowered expression for ldexp(f, exp) at the given float size.
   #
   # f:    expression for the floating-point value.
   # exp:  expression for the integer exponent.
   # bits: bit size of f: 16, 32 or 64.
   #
   # Returns f multiplied in turn by two powers of two built with fexp2i().
   # Raises ValueError for any other bit size.  (This used to be a bare
   # "assert False", which is stripped under "python -O" and would then let
   # an unsupported size continue with an unclamped exponent.)
   #
   # The maximum possible range for a normal exponent is [-126, 127] and,
   # throwing in denormals, you get a maximum range of [-149, 127].  This
   # means that we can potentially have a swing of +-276.  If you start with
   # FLT_MAX, you actually have to do ldexp(FLT_MAX, -278) to get it to flush
   # all the way to zero.  The GLSL spec only requires that we handle a subset
   # of this range.  From version 4.60 of the spec:
   #
   #    "If exp is greater than +128 (single-precision) or +1024
   #    (double-precision), the value returned is undefined. If exp is less
   #    than -126 (single-precision) or -1022 (double-precision), the value
   #    returned may be flushed to zero. Additionally, splitting the value
   #    into a significand and exponent using frexp() and then reconstructing
   #    a floating-point value using ldexp() should yield the original input
   #    for zero and all finite non-denormalized values."
   #
   # The SPIR-V spec has similar language.
   #
   # In order to handle the maximum value +128 using the fexp2i() helper
   # above, we have to split the exponent in half and do two multiply
   # operations.
   #
   # First, we clamp exp to a reasonable range.  Specifically, we clamp to
   # twice the full range that is valid for the fexp2i() function above.  If
   # exp/2 is the bottom value of that range, the fexp2i() expression will
   # yield 0.0f which, when multiplied by f, will flush it to zero which is
   # allowed by the GLSL and SPIR-V specs for low exponent values.  If the
   # value is clamped from above, then it must have been above the supported
   # range of the GLSL built-in and therefore any return value is acceptable.
   max_exp = {16: 30, 32: 254, 64: 2046}.get(bits)
   if max_exp is None:
      raise ValueError('ldexp: unsupported bit size {0!r}'.format(bits))
   exp = ('imin', ('imax', exp, -max_exp), max_exp)

   # Now we compute two powers of 2, one for exp/2 and one for exp-exp/2.
   # (We use ishr which isn't the same for -1, but the -1 case still works
   # since we use exp-exp/2 as the second exponent.)  While the spec
   # technically defines ldexp as f * 2.0^exp, simply multiplying once doesn't
   # work with denormals and doesn't allow for the full swing in exponents
   # that you can get with normalized values.  Instead, we create two powers
   # of two and multiply by them each in turn.  That way the effective range
   # of our exponent is doubled.
   half_exp = ('ishr', exp, 1)
   pow2_1 = fexp2i(half_exp, bits)
   pow2_2 = fexp2i(('isub', exp, half_exp), bits)
   return ('fmul', ('fmul', f, pow2_1), pow2_2)
2290bf215546Sopenharmony_ci
# Lower ldexp at every supported float size when options->lower_ldexp is set.
optimizations += [
   (('ldexp@{0}'.format(bits), 'x', 'exp'),
    ldexp('x', 'exp', bits),
    'options->lower_ldexp')
   for bits in (16, 32, 64)
]
2296bf215546Sopenharmony_ci
2297bf215546Sopenharmony_ci# Unreal Engine 4 demo applications open-codes bitfieldReverse()
def bitfield_reverse_ue4(u):
    # Build the search pattern for the open-coded 32-bit bit reversal of u:
    # swap the two 16-bit halves, then swap ever smaller neighbouring groups
    # (8, 4, 2 and 1 bits) using mask-then-shift.

    # Swap the 16-bit halves.
    swapped = ('ior', ('ishl', u, 16), ('ushr', u, 16))

    # (group width, mask selecting the low group of every pair)
    stages = [(8, 0x00ff00ff), (4, 0x0f0f0f0f), (2, 0x33333333), (1, 0x55555555)]
    for width, low_mask in stages:
        high_mask = low_mask << width
        # Only the outermost 'ior' carries the many-comm-expr annotation.
        combine = 'ior(many-comm-expr)' if width == 1 else 'ior'
        swapped = (combine,
                   ('ishl', ('iand', swapped, low_mask), width),
                   ('ushr', ('iand', swapped, high_mask), width))

    return swapped
2306bf215546Sopenharmony_ci
2307bf215546Sopenharmony_ci# Cyberpunk 2077 open-codes bitfieldReverse()
def bitfield_reverse_cp2077(u):
    # Build the search pattern for the open-coded 32-bit bit reversal of u:
    # swap the two 16-bit halves, then swap ever larger neighbouring groups
    # (1, 2, 4 and 8 bits) using shift-then-mask.

    # Swap the 16-bit halves.
    swapped = ('ior', ('ishl', u, 16), ('ushr', u, 16))

    # (group width, mask selecting the low group of every pair)
    stages = [(1, 0x55555555), (2, 0x33333333), (4, 0x0f0f0f0f), (8, 0x00ff00ff)]
    for width, low_mask in stages:
        high_mask = low_mask << width
        # Only the outermost 'ior' carries the many-comm-expr annotation.
        combine = 'ior(many-comm-expr)' if width == 8 else 'ior'
        swapped = (combine,
                   ('iand', ('ishl', swapped, width), high_mask),
                   ('iand', ('ushr', swapped, width), low_mask))

    return swapped
2316bf215546Sopenharmony_ci
# Replace either open-coded sequence with a real bitfield_reverse unless
# options->lower_bitfield_reverse is set.
optimizations += [
    (open_coded('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')
    for open_coded in (bitfield_reverse_ue4, bitfield_reverse_cp2077)
]
2319bf215546Sopenharmony_ci
# "all_equal(eq(a, b), vec(~0))" is the same as "all_equal(a, b)"
# "any_nequal(neq(a, b), vec(0))" is the same as "any_nequal(a, b)"
for ncomp in (2, 3, 4, 8, 16):
   ball_iequal = 'ball_iequal{0}'.format(ncomp)
   ball_fequal = 'ball_fequal{0}'.format(ncomp)
   bany_inequal = 'bany_inequal{0}'.format(ncomp)
   bany_fnequal = 'bany_fnequal{0}'.format(ncomp)

   optimizations.extend([
      ((ball_iequal, ('ieq', a, b), ~0), (ball_iequal, a, b)),
      ((ball_iequal, ('feq', a, b), ~0), (ball_fequal, a, b)),
      ((bany_inequal, ('ine', a, b), 0), (bany_inequal, a, b)),
      ((bany_inequal, ('fneu', a, b), 0), (bany_fnequal, a, b)),
   ])
2329bf215546Sopenharmony_ci
# For any float comparison operation, "cmp", if you have "a == a && a cmp b"
# then the "a == a" is redundant because it's equivalent to "a is not NaN"
# and, if a is a NaN, the second comparison will fail anyway.  The surviving
# comparison is emitted with the '!' prefix.
for op in ('flt', 'fge', 'feq'):
   not_nan = ('feq', a, a)
   marked_op = '!' + op
   optimizations.extend([
      (('iand', not_nan, (op, a, b)), (marked_op, a, b)),
      (('iand', not_nan, (op, b, a)), (marked_op, b, a)),
   ])
2338bf215546Sopenharmony_ci
# Add optimizations to handle the case where the result of a ternary is
# compared to a constant.  This way we can take things like
#
# (a ? 0 : 1) > 0
#
# and turn it into
#
# a ? (0 > 0) : (1 > 0)
#
# which constant folding will eat for lunch.  The resulting ternary will
# further get cleaned up by the boolean reductions above and we will be
# left with just the original variable "a".

# Equality-style comparisons: only the ternary-first operand order is listed.
for op in ('feq', 'fneu', 'ieq', 'ine'):
   optimizations.append(
      ((op, ('bcsel', 'a', '#b', '#c'), '#d'),
       ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))))

# Ordering comparisons: both operand orders are listed, and the operand order
# is kept in the replacement.
for op in ('flt', 'fge', 'ilt', 'ige', 'ult', 'uge'):
   optimizations.extend([
      ((op, ('bcsel', 'a', '#b', '#c'), '#d'),
       ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))),
      ((op, '#d', ('bcsel', a, '#b', '#c')),
       ('bcsel', 'a', (op, 'd', 'b'), (op, 'd', 'c'))),
   ])
2364bf215546Sopenharmony_ci
2365bf215546Sopenharmony_ci
# Push an arithmetic operation with a constant operand into both arms of a
# single-use bcsel whose first arm is constant.  For example, this converts
# things like
#
#    1 + mix(0, a - 1, condition)
#
# into
#
#    mix(1, (a-1)+1, condition)
#
# Other optimizations will rearrange the constants.
for op in ('fadd', 'fmul', 'fmulz', 'iadd', 'imul'):
   optimizations.append(
      ((op, ('bcsel(is_used_once)', a, '#b', c), '#d'),
       ('bcsel', a, (op, b, d), (op, c, d))))
2379bf215546Sopenharmony_ci
# For derivatives in compute shaders, GLSL_NV_compute_shader_derivatives
# states:
#
#     If neither layout qualifier is specified, derivatives in compute shaders
#     return zero, which is consistent with the handling of built-in texture
#     functions like texture() in GLSL 4.50 compute shaders.
#
# So every derivative opcode is replaced by 0.0 when the shader is a compute
# shader with no derivative group.
for op in ('fddx', 'fddx_fine', 'fddx_coarse',
           'fddy', 'fddy_fine', 'fddy_coarse'):
   optimizations.append(
      ((op, 'a'), 0.0,
       'info->stage == MESA_SHADER_COMPUTE && info->cs.derivative_group == DERIVATIVE_GROUP_NONE')
   )
2391bf215546Sopenharmony_ci
# Some optimizations for ir3-specific instructions.
optimizations.extend([
   # 'al * bl': a constant 'a' whose lower half is zero makes the product
   # zero, so return zero.
   (('umul_low', '#a(is_lower_half_zero)', 'b'), 0),
   # imadsh_mix16 collapses to its addend 'c' when constant 'a' has a zero
   # lower half, or when constant 'b' has a zero upper half.
   # NOTE(review): this opcode has been described here as
   # '(ah * bl) << 16 + c', which does not match the halves these predicates
   # test -- confirm against the imadsh_mix16 opcode definition.
   (('imadsh_mix16', '#a@32(is_lower_half_zero)', 'b@32', 'c@32'), 'c'),
   (('imadsh_mix16', 'a@32', '#b@32(is_upper_half_zero)', 'c@32'), 'c'),
])
2400bf215546Sopenharmony_ci
# These kinds of sequences can occur after nir_opt_peephole_select.
#
# When both arms of a bcsel are the same opcode and differ in exactly one
# source, select between the differing sources and apply the opcode once.
# Every rule requires one of the two arms to be used only once; the rules
# come in pairs so that either arm may be that one.
#
# NOTE: fadd is not handled here because that gets in the way of ffma
# generation in the i965 driver.  Instead, fadd and ffma are handled in
# late_optimizations.

for op in ('flrp',):
    optimizations.extend([
        # The arms differ in the third source.
        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)),
         (op, b, c, ('bcsel', a, d, e))),
        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)),
         (op, b, c, ('bcsel', a, d, e))),
        # The arms differ in the second source.
        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)),
         (op, b, ('bcsel', a, c, e), d)),
        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)),
         (op, b, ('bcsel', a, c, e), d)),
        # The arms differ in the first source.
        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, e, c, d)),
         (op, ('bcsel', a, b, e), c, d)),
        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', e, c, d)),
         (op, ('bcsel', a, b, e), c, d)),
    ])
2416bf215546Sopenharmony_ci
# bcsel(a, op(b, c), op(b, d)) => op(b, bcsel(a, c, d)) for two-source
# opcodes whose arms share the first source 'b'.  The four patterns cover
# every combination of which arm is single-use and which differing source is
# required to be non-constant; all of them share one replacement.
for op in ('fmulz', 'fmul', 'iadd', 'imul', 'iand', 'ior', 'ixor',
           'fmin', 'fmax', 'imin', 'imax', 'umin', 'umax'):
    optimizations.extend(
        (pattern, (op, b, ('bcsel', a, c, d)))
        for pattern in (
            ('bcsel', a, (op + '(is_used_once)', b, c), (op, b, 'd(is_not_const)')),
            ('bcsel', a, (op + '(is_used_once)', b, 'c(is_not_const)'), (op, b, d)),
            ('bcsel', a, (op, b, 'c(is_not_const)'), (op + '(is_used_once)', b, d)),
            ('bcsel', a, (op, b, c), (op + '(is_used_once)', b, 'd(is_not_const)')),
        )
    )
2424bf215546Sopenharmony_ci
# Same merge for fpow.  Both source positions are listed explicitly: the
# arms may share the first source and differ in the second, or share the
# second and differ in the first.  One arm must be single-use.
for op in ('fpow',):
    optimizations.extend([
        # Shared first source: select between the second sources.
        (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)),
         (op, b, ('bcsel', a, c, d))),
        (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)),
         (op, b, ('bcsel', a, c, d))),
        # Shared second source: select between the first sources.
        (('bcsel', a, (op + '(is_used_once)', b, c), (op, d, c)),
         (op, ('bcsel', a, b, d), c)),
        (('bcsel', a, (op, b, c), (op + '(is_used_once)', d, c)),
         (op, ('bcsel', a, b, d), c)),
    ])
2432bf215546Sopenharmony_ci
# Hoist a one-source float opcode that appears on both arms of a bcsel out of
# the select: bcsel(c, op(a), op(b)) => op(bcsel(c, a, b)).  Both inner
# instructions must be used only once.
#
# Each opcode is listed exactly once; a repeated entry would only append an
# identical rule a second time.
for op in ['frcp', 'frsq', 'fsqrt', 'fexp2', 'flog2', 'fsign', 'fsin', 'fcos', 'fsin_amd', 'fcos_amd', 'fneg', 'fabs']:
    optimizations += [
        (('bcsel', c, (op + '(is_used_once)', a), (op + '(is_used_once)', b)), (op, ('bcsel', c, a, b))),
    ]
2437bf215546Sopenharmony_ci
# Push a one-source integer opcode into a bcsel whose two arms are both
# constants: op(bcsel(c, #a, #b)) => bcsel(c, op(a), op(b)).
for op in ('ineg', 'iabs', 'inot', 'isign'):
    optimizations.append(
        ((op, ('bcsel', c, '#a', '#b')), ('bcsel', c, (op, a), (op, b)))
    )
2442bf215546Sopenharmony_ci
# Integer-only lowerings of fisnormal, one per bit size, applied when
# options->lower_fisnormal is set.  Each rule shifts the source left by one,
# adds a per-size constant and does an unsigned compare of the sum against a
# per-size threshold.
optimizations += [
    (('fisnormal', 'a@16'),
     ('ult', 0xfff, ('iadd', ('ishl', a, 1), 0x800)),
     'options->lower_fisnormal'),
    (('fisnormal', 'a@32'),
     ('ult', 0x1ffffff, ('iadd', ('ishl', a, 1), 0x1000000)),
     'options->lower_fisnormal'),
    (('fisnormal', 'a@64'),
     ('ult', 0x3fffffffffffff, ('iadd', ('ishl', a, 1), 0x20000000000000)),
     'options->lower_fisnormal'),
]
2448bf215546Sopenharmony_ci
# This section contains optimizations to propagate downsizing conversions of
# constructed vectors into vectors of downsized components. Whether this is
# useful depends on the SIMD semantics of the backend. On a true SIMD machine,
# this reduces the register pressure of the vector itself and often enables the
# conversions to be eliminated via other algebraic rules or constant folding.
# In the worst case on a SIMD architecture, the propagated conversions may be
# revectorized via nir_opt_vectorize so instruction count is minimally
# impacted.
#
# On a machine with SIMD-within-a-register only, this actually
# counterintuitively hurts instruction count. These machines are the same that
# require vectorize_vec2_16bit, so we predicate the optimizations on that flag
# not being set.
#
# Finally for scalar architectures, there should be no difference in generated
# code since it all ends up scalarized at the end, but it might minimally help
# compile-times.

# Component variable names, in vector order.
indices = ['a', 'b', 'c', 'd']

for i in range(2, 5):
   # vecN opcode and its N 32-bit sources; both depend only on the width.
   vec_inst = ('vec' + str(i),)
   suffix_in = tuple(name + '@32' for name in indices[:i])

   for T in ('f', 'u', 'i'):
      to_16 = '{0}2{0}16'.format(T)
      to_mp = '{0}2{0}mp'.format(T)

      # The same conversion applied to each component individually.
      out_16 = tuple((to_16, name) for name in indices[:i])
      out_mp = tuple((to_mp, name) for name in indices[:i])

      optimizations.append(
         ((to_16, vec_inst + suffix_in), vec_inst + out_16, '!options->vectorize_vec2_16bit')
      )
      # u2ump doesn't exist, because it's equal to i2imp
      if T != 'u':
         optimizations.append(
            ((to_mp, vec_inst + suffix_in), vec_inst + out_mp, '!options->vectorize_vec2_16bit')
         )
2488bf215546Sopenharmony_ci
# This section contains "late" optimizations that should be run before
# creating ffmas and calling regular optimizations for the final time.
# Optimizations should go here if they help code generation and conflict
# with the regular optimizations.
#
# Each entry is a (search pattern, replacement) pair.
before_ffma_optimizations = [
   # Propagate constants down multiplication and addition chains:
   # (a op #b) op c => (a op c) op #b, so the constant becomes the outermost
   # operand.  Both instructions must be single-use and neither a nor c may
   # be constant.
   (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fmul', ('fmul', a, c), b)),
   (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('imul', ('imul', a, c), b)),
   (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fadd', ('fadd', a, c), b)),
   (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('iadd', ('iadd', a, c), b)),

   # Factor out a common multiplicand: a*b + a*c => a*(b + c).
   (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
   (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
   # Cancel a value against its own negation, directly or one level down
   # an addition.
   (('~fadd', ('fneg', a), a), 0.0),
   (('iadd', ('ineg', a), a), 0),
   (('iadd', ('ineg', a), ('iadd', a, b)), b),
   (('iadd', a, ('iadd', ('ineg', a), b)), b),
   (('~fadd', ('fneg', a), ('fadd', a, b)), b),
   (('~fadd', a, ('fadd', ('fneg', a), b)), b),

   # flrp between two single-use sums that share the term 'a': pull 'a' out
   # of the interpolation.  The last rule handles arbitrary constants #b and
   # #c by expanding to d * (c - b) + (a + b).
   (('~flrp', ('fadd(is_used_once)', a, -1.0), ('fadd(is_used_once)', a,  1.0), d), ('fadd', ('flrp', -1.0,  1.0, d), a)),
   (('~flrp', ('fadd(is_used_once)', a,  1.0), ('fadd(is_used_once)', a, -1.0), d), ('fadd', ('flrp',  1.0, -1.0, d), a)),
   (('~flrp', ('fadd(is_used_once)', a, '#b'), ('fadd(is_used_once)', a, '#c'), d), ('fadd', ('fmul', d, ('fadd', c, ('fneg', b))), ('fadd', a, b))),
]
2513bf215546Sopenharmony_ci
# This section contains "late" optimizations that should be run after the
# regular optimizations have finished.  Optimizations should go here if
# they help code generation but do not necessarily produce code that is
# more easily optimizable.
late_optimizations = [
   # The rearrangements are fine w.r.t. NaN.  However, they produce incorrect
   # results if one operand is +Inf and the other is -Inf.
   #
   # 1. Inf + -Inf = NaN
   # 2. ∀x: x + NaN = NaN and x - NaN = NaN
   # 3. ∀x: x != NaN = true
   # 4. ∀x, ∀ cmp ∈ {<, >, ≤, ≥, =}: x cmp NaN = false
   #
   #               a=Inf, b=-Inf   a=-Inf, b=Inf    a=NaN    b=NaN
   #  (a+b) < 0        false            false       false    false
   #      a < -b       false            false       false    false
   # -(a+b) < 0        false            false       false    false
   #     -a < b        false            false       false    false
   #  (a+b) >= 0       false            false       false    false
   #      a >= -b      true             true        false    false
   # -(a+b) >= 0       false            false       false    false
   #     -a >= b       true             true        false    false
   #  (a+b) == 0       false            false       false    false
   #      a == -b      true             true        false    false
   #  (a+b) != 0       true             true        true     true
   #      a != -b      false            false       true     true
   #
   # The '<' rows agree in every column, so the flt rearrangements below are
   # written as exact.  The '>=', '==' and '!=' rows differ in the Inf
   # columns, so those rearrangements carry the '~' marker.
   (('flt',                        ('fadd(is_used_once)', a, b),  0.0), ('flt',          a, ('fneg', b))),
   (('flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('flt', ('fneg', a),         b)),
   (('flt', 0.0,                        ('fadd(is_used_once)', a, b) ), ('flt', ('fneg', a),         b)),
   (('flt', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('flt',          a, ('fneg', b))),
   (('~fge',                        ('fadd(is_used_once)', a, b),  0.0), ('fge',          a, ('fneg', b))),
   (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('fge', ('fneg', a),         b)),
   (('~fge', 0.0,                        ('fadd(is_used_once)', a, b) ), ('fge', ('fneg', a),         b)),
   (('~fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fge',          a, ('fneg', b))),
   (('~feq', ('fadd(is_used_once)', a, b), 0.0), ('feq', a, ('fneg', b))),
   (('~fneu', ('fadd(is_used_once)', a, b), 0.0), ('fneu', a, ('fneg', b))),

   # If either source must be finite, then the original (a+b) cannot produce
   # NaN due to Inf-Inf.  The patterns and the replacements produce the same
   # result if b is NaN. Therefore, the replacements are exact.
   (('fge',                        ('fadd(is_used_once)', 'a(is_finite)', b),  0.0), ('fge',          a, ('fneg', b))),
   (('fge', ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b)), 0.0), ('fge', ('fneg', a),         b)),
   (('fge', 0.0,                        ('fadd(is_used_once)', 'a(is_finite)', b) ), ('fge', ('fneg', a),         b)),
   (('fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b))), ('fge',          a, ('fneg', b))),
   (('feq',  ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('feq',  a, ('fneg', b))),
   (('fneu', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('fneu', a, ('fneg', b))),

   # This is how SpvOpFOrdNotEqual might be implemented.  Replace it with
   # SpvOpLessOrGreater.
   (('iand', ('fneu', a, b),   ('iand', ('feq', a, a), ('feq', b, b))), ('ior', ('!flt', a, b), ('!flt', b, a))),
   (('iand', ('fneu', a, 0.0),          ('feq', a, a)                ), ('!flt', 0.0, ('fabs', a))),

   # This is how SpvOpFUnordEqual might be implemented.  Replace it with
   # !SpvOpLessOrGreater.
   (('ior', ('feq', a, b),   ('ior', ('fneu', a, a), ('fneu', b, b))), ('inot', ('ior', ('!flt', a, b), ('!flt', b, a)))),
   (('ior', ('feq', a, 0.0),         ('fneu', a, a),                ), ('inot', ('!flt', 0.0, ('fabs', a)))),

   # nir_lower_to_source_mods will collapse this, but its existence during the
   # optimization loop can prevent other optimizations.
   (('fneg', ('fneg', a)), a),

   # re-combine inexact mul+add to ffma. Do this before fsub so that a * b - c
   # gets combined to fma(a, b, -c).
   (('~fadd@16', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma16'),
   (('~fadd@32', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma32'),
   (('~fadd@64', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma64'),
   (('~fadd@32', ('fmulz', a, b), c), ('ffmaz', a, b, c), 'options->fuse_ffma32'),

   # Subtractions get lowered during optimization, so we need to recombine them
   (('fadd@8', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'),
   (('fadd@16', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'),
   (('fadd@32', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'),
   (('fadd@64', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub && !(options->lower_doubles_options & nir_lower_dsub)'),

   # Negation / absolute-value lowerings requested through compiler options.
   (('fneg', a), ('fmul', a, -1.0), 'options->lower_fneg'),
   (('iadd', a, ('ineg', 'b')), ('isub', 'a', 'b'), 'options->has_isub || options->lower_ineg'),
   (('ineg', a), ('isub', 0, a), 'options->lower_ineg'),
   (('iabs', a), ('imax', a, ('ineg', a)), 'options->lower_iabs'),

   # Fold two chained integer adds/subtracts of non-constant values into a
   # single iadd3 when the backend has it.
   (('iadd', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, b, c), 'options->has_iadd3'),
   (('iadd', ('isub(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, ('ineg', b), c), 'options->has_iadd3'),
   (('isub', ('isub(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, ('ineg', b), ('ineg', c)), 'options->has_iadd3'),

   # fneg_lo / fneg_hi
   (('vec2(is_only_used_as_float)', ('fneg@16', a), b), ('fmul', ('vec2', a, b), ('vec2', -1.0, 1.0)), 'options->vectorize_vec2_16bit'),
   (('vec2(is_only_used_as_float)', a, ('fneg@16', b)), ('fmul', ('vec2', a, b), ('vec2', 1.0, -1.0)), 'options->vectorize_vec2_16bit'),

   # These are duplicated from the main optimizations table.  The late
   # patterns that rearrange expressions like x - .5 < 0 to x < .5 can create
   # new patterns like these.  The patterns that compare with zero are removed
   # because they are unlikely to be created by anything in
   # late_optimizations.
   (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),
   (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),
   (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),
   (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)),

   (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),

   (('~fge', ('fmin(is_used_once)', ('fadd(is_used_once)', a, b), ('fadd', c, d)), 0.0), ('iand', ('fge', a, ('fneg', b)), ('fge', c, ('fneg', d)))),

   # Drop matching negations from both sides of a comparison, swapping the
   # operands.
   (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
   (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
   (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
   (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)),
   (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)),
   (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)),
   (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)),
   (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)),
   (('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)),
   (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),

   (('ior', a, a), a),
   (('iand', a, a), a),

   (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),

   (('fdot2', a, b), ('fdot2_replicated', a, b), 'options->fdot_replicates'),
   (('fdot3', a, b), ('fdot3_replicated', a, b), 'options->fdot_replicates'),
   (('fdot4', a, b), ('fdot4_replicated', a, b), 'options->fdot_replicates'),
   (('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'),

   (('~flrp', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)),

   # Approximate handling of fround_even for DX9 addressing from gallium nine on
   # DX9-class hardware with no proper fround support.  This is in
   # late_optimizations so that the is_integral() opts in the main pass get a
   # chance to eliminate the fround_even first.
   (('fround_even', a), ('bcsel',
                         ('feq', ('ffract', a), 0.5),
                         ('fadd', ('ffloor', ('fadd', a, 0.5)), 1.0),
                         ('ffloor', ('fadd', a, 0.5))), 'options->lower_fround_even'),

   # A similar operation could apply to any ffma(#a, b, #(-a/2)), but this
   # particular operation is common for expanding values stored in a texture
   # from [0,1] to [-1,1].
   (('~ffma@32', a,  2.0, -1.0), ('flrp', -1.0,  1.0,          a ), '!options->lower_flrp32'),
   (('~ffma@32', a, -2.0, -1.0), ('flrp', -1.0,  1.0, ('fneg', a)), '!options->lower_flrp32'),
   (('~ffma@32', a, -2.0,  1.0), ('flrp',  1.0, -1.0,          a ), '!options->lower_flrp32'),
   (('~ffma@32', a,  2.0,  1.0), ('flrp',  1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'),
   (('~fadd@32', ('fmul(is_used_once)',  2.0, a), -1.0), ('flrp', -1.0,  1.0,          a ), '!options->lower_flrp32'),
   (('~fadd@32', ('fmul(is_used_once)', -2.0, a), -1.0), ('flrp', -1.0,  1.0, ('fneg', a)), '!options->lower_flrp32'),
   (('~fadd@32', ('fmul(is_used_once)', -2.0, a),  1.0), ('flrp',  1.0, -1.0,          a ), '!options->lower_flrp32'),
   (('~fadd@32', ('fmul(is_used_once)',  2.0, a),  1.0), ('flrp',  1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'),

   # flrp(a, b, a)
   # a*(1-a) + b*a
   # a + -a*a + a*b    (1)
   # a + a*(b - a)
   # Option 1: ffma(a, (b-a), a)
   #
   # Alternately, after (1):
   # a*(1+b) + -a*a
   # a*((1+b) + -a)
   #
   # Let b=1
   #
   # Option 2: ffma(a, 2, -(a*a))
   # Option 3: ffma(a, 2, (-a)*a)
   # Option 4: ffma(a, -a, 2*a)
   # Option 5: a * (2 - a)
   #
   # There are a lot of other possible combinations.
   (('~ffma@32', ('fadd', b, ('fneg', a)), a, a), ('flrp', a, b, a), '!options->lower_flrp32'),
   (('~ffma@32', a, 2.0, ('fneg', ('fmul', a, a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
   (('~ffma@32', a, 2.0, ('fmul', ('fneg', a), a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
   (('~ffma@32', a, ('fneg', a), ('fmul', 2.0, a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
   (('~fmul@32', a, ('fadd', 2.0, ('fneg', a))),    ('flrp', a, 1.0, a), '!options->lower_flrp32'),

   # we do these late so that we don't get in the way of creating ffmas
   (('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))),
   (('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))),

   # Putting this in 'optimizations' interferes with the bcsel(a, op(b, c),
   # op(b, d)) => op(b, bcsel(a, c, d)) transformations.  I do not know why.
   (('bcsel', ('feq', ('fsqrt', 'a(is_not_negative)'), 0.0), intBitsToFloat(0x7f7fffff), ('frsq', a)),
    ('fmin', ('frsq', a), intBitsToFloat(0x7f7fffff))),

   # Things that look like DPH in the source shader may get expanded to
   # something that looks like dot(v1.xyz, v2.xyz) + v1.w by the time it gets
   # to NIR.  After FFMA is generated, this can look like:
   #
   #    fadd(ffma(v1.z, v2.z, ffma(v1.y, v2.y, fmul(v1.x, v2.x))), v1.w)
   #
   # Reassociate the last addition into the first multiplication.
   #
   # Some shaders do not use 'invariant' in vertex and (possibly) geometry
   # shader stages on some outputs that are intended to be invariant.  For
   # various reasons, this optimization may not be fully applied in all
   # shaders used for different rendering passes of the same geometry.  This
   # can result in Z-fighting artifacts (at best).  For now, disable this
   # optimization in these stages.  See bugzilla #111490.  In tessellation
   # stages applications seem to use 'precise' when necessary, so allow the
   # optimization in those stages.
   (('~fadd', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'),
    ('ffma', a, b, ('ffma', c, d, ('ffma', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
   (('~fadd', ('ffma(is_used_once)', a, b, ('fmul(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'),
    ('ffma', a, b, ('ffma', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
   (('~fadd', ('fneg', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'),
    ('ffma', ('fneg', a), b, ('ffma', ('fneg', c), d, ('ffma', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),

   # The same three reassociations for the fmulz/ffmaz variants.
   (('~fadd', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'),
    ('ffmaz', a, b, ('ffmaz', c, d, ('ffmaz', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
   (('~fadd', ('ffmaz(is_used_once)', a, b, ('fmulz(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'),
    ('ffmaz', a, b, ('ffmaz', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
   (('~fadd', ('fneg', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'),
    ('ffmaz', ('fneg', a), b, ('ffmaz', ('fneg', c), d, ('ffmaz', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),

   # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says:
   #
   #    If bits is zero, the result will be zero.
   #
   # These prevent the next two lowerings generating incorrect results when
   # count is zero.
   (('ubfe', a, b, 0), 0),
   (('ibfe', a, b, 0), 0),

   # On Intel GPUs, BFE is a 3-source instruction.  Like all 3-source
   # instructions on Intel GPUs, it cannot have immediate values as
   # sources.  There are also limitations on source register strides.  As a
   # result, it is very easy for 3-source instruction combined with either
   # loads of immediate values or copies from weird register strides to be
   # more expensive than the primitive instructions it represents.
   (('ubfe', a, '#b', '#c'), ('iand', ('ushr', 0xffffffff, ('ineg', c)), ('ushr', a, b)), 'options->avoid_ternary_with_two_constants'),

   # b is the lowest order bit to be extracted and c is the number of bits to
   # extract.  The inner shift removes the bits above b + c by shifting left
   # 32 - (b + c).  ishl only sees the low 5 bits of the shift count, which is
   # -(b + c).  The outer shift moves the bit that was at b to bit zero.
   # After the first shift, that bit is now at b + (32 - (b + c)) or 32 - c.
   # This means that it must be shifted right by 32 - c or -c bits.
   (('ibfe', a, '#b', '#c'), ('ishr', ('ishl', a, ('ineg', ('iadd', b, c))), ('ineg', c)), 'options->avoid_ternary_with_two_constants'),

   # Clean up no-op shifts that may result from the bfe lowerings.
   (('ishl', a, 0), a),
   (('ishl', a, -32), a),
   (('ishr', a, 0), a),
   (('ishr', a, -32), a),
   (('ushr', a, 0), a),

   # An extract of byte 0 applied to the result of another byte extract
   # collapses to a single extract of the original byte; the outer extract
   # decides whether the result is signed or unsigned.
   (('extract_i8', ('extract_i8', a, b), 0), ('extract_i8', a, b)),
   (('extract_i8', ('extract_u8', a, b), 0), ('extract_i8', a, b)),
   (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)),
   (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)),
]
2759bf215546Sopenharmony_ci
# A few more extract cases we'd rather leave late.
#
# For each N-bit source 'a', a conversion down to 8 (or 16) bits that is
# immediately converted back up to N bits with u2uN / i2iN is rewritten to a
# single extract_u8 / extract_i8 (or extract_u16 / extract_i16) of element 0.
# Each rule is gated on the matching lower_extract_* option being unset.
for N in [16, 32]:
    # Search variable 'a' constrained to exactly N bits.
    aN = 'a@{0}'.format(N)

    for x in ['u', 'i']:
        # The signedness of the outer conversion selects the extract flavour.
        x2xN = '{0}2{0}{1}'.format(x, N)
        extract_x8 = 'extract_{0}8'.format(x)
        extract_x16 = 'extract_{0}16'.format(x)

        late_optimizations.extend([
            ((x2xN, ('u2u8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'),
            ((x2xN, ('i2i8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'),
        ])

        # The 16-bit variants are only emitted for sources wider than 16 bits.
        if N > 16:
            late_optimizations.extend([
                ((x2xN, ('u2u16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'),
                ((x2xN, ('i2i16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'),
            ])
2781bf215546Sopenharmony_ci
# Byte insertion
#
# Three families of rules, appended in this order, all gated on
# lower_insert_byte being unset:
#   1. extract_u8(a, 0) << (8 * index)        -> insert_u8(a, index)
#   2. (a << (8 * index)) & (0xff << 8*index) -> insert_u8(a, index)
#   3. a << 24                                -> insert_u8(a, 3)
for index in (1, 2, 3):
    late_optimizations.append(
        (('ishl', ('extract_u8', 'a@32', 0), 8 * index),
         ('insert_u8', a, index),
         '!options->lower_insert_byte'))

for index in (1, 2, 3):
    late_optimizations.append(
        (('iand', ('ishl', 'a@32', 8 * index), 0xff << (8 * index)),
         ('insert_u8', a, index),
         '!options->lower_insert_byte'))

late_optimizations += [
    (('ishl', 'a@32', 24), ('insert_u8', a, 3), '!options->lower_insert_byte'),
]
2786bf215546Sopenharmony_ci
# Word insertion: a 32-bit shift left by 16 becomes insert_u16 with index 1,
# unless lower_insert_word is set.
late_optimizations.append(
    (('ishl', 'a@32', 16), ('insert_u16', a, 1), '!options->lower_insert_word'))

# Extract and then insert: an extract of element 0 feeding the insert of the
# same width is dropped, and the insert reads 'a' directly.
for width in (8, 16):
    insert_op = 'insert_u{}'.format(width)
    extract_op = 'extract_u{}'.format(width)
    late_optimizations.append(
        ((insert_op, (extract_op, 'a', 0), b), (insert_op, a, b)))
2795bf215546Sopenharmony_ci
# Integer sizes
#
# Fold a pair of comparisons against zero into one comparison of umin(a, b),
# once per integer bit size.
for bits in (8, 16, 32, 64):
    sized_a = 'a@{}'.format(bits)
    sized_b = 'b@{}'.format(bits)

    # (a != 0) & (b != 0)  ->  umin(a, b) != 0
    late_optimizations.append(
        (('iand', ('ine(is_used_once)', sized_a, 0), ('ine', sized_b, 0)),
         ('ine', ('umin', a, b), 0)))

    # (a == 0) | (b == 0)  ->  umin(a, b) == 0
    late_optimizations.append(
        (('ior', ('ieq(is_used_once)', sized_a, 0), ('ieq', sized_b, 0)),
         ('ieq', ('umin', a, b), 0)))
2802bf215546Sopenharmony_ci
# Float sizes
for bits in (16, 32, 64):
    fadd_sized = '~fadd@{}'.format(bits)
    b2f_sized = 'b2f{}'.format(bits)
    flrp_is_lowered = 'options->lower_flrp{}'.format(bits)

    late_optimizations += [
       # 1.0 + c * (b + -1.0)  ->  (1.0 + -c) + b * c, only when flrp is
       # lowered for this bit size.
       ((fadd_sized, 1.0, ('fmul(is_used_once)', c, ('fadd', b, -1.0))),
        ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)),
        flrp_is_lowered),

       # a ? 0 : b2f(!b)  ->  b2f(!(a | b))
       (('bcsel', a, 0, (b2f_sized, ('inot', 'b@bool'))),
        (b2f_sized, ('inot', ('ior', a, b)))),
    ]
2809bf215546Sopenharmony_ci
# Pull a shared first source out of a bcsel between two adds:
#   a ? op(b, c) : op(b, d)  ->  op(b, a ? c : d)
# One variant per side carrying the is_used_once restriction.
for op in ('fadd',):
    single_use = op + '(is_used_once)'

    late_optimizations.append(
        (('bcsel', a, (single_use, b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))))
    late_optimizations.append(
        (('bcsel', a, (op, b, c), (single_use, b, d)), (op, b, ('bcsel', a, c, d))))
2815bf215546Sopenharmony_ci
# Same idea for the fused multiply-add opcodes: when both bcsel arms are the
# same op and differ in a single source, select that source instead.
for op in ('ffma', 'ffmaz'):
    single_use = op + '(is_used_once)'

    # Arms differ only in the third source.
    late_optimizations.append(
        (('bcsel', a, (single_use, b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))))
    late_optimizations.append(
        (('bcsel', a, (op, b, c, d), (single_use, b, c, e)), (op, b, c, ('bcsel', a, d, e))))

    # Arms differ only in the second source.
    late_optimizations.append(
        (('bcsel', a, (single_use, b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)))
    late_optimizations.append(
        (('bcsel', a, (op, b, c, d), (single_use, b, e, d)), (op, b, ('bcsel', a, c, e), d)))
2824bf215546Sopenharmony_ci
# mediump: If an opcode is surrounded by conversions, remove the conversions.
# The rationale is that type conversions + the low precision opcode are more
# expensive than the same arithmetic opcode at higher precision.
2828bf215546Sopenharmony_ci#
2829bf215546Sopenharmony_ci# This must be done in late optimizations, because we need normal optimizations to
2830bf215546Sopenharmony_ci# first eliminate temporary up-conversions such as in op1(f2fmp(f2f32(op2()))).
2831bf215546Sopenharmony_ci#
# Unary opcodes: f2f32(op(f2fmp(a))) -> op(a)
for unary_op in (
        'fabs', 'fceil', 'fcos',
        'fddx', 'fddx_coarse', 'fddx_fine',
        'fddy', 'fddy_coarse', 'fddy_fine',
        'fexp2', 'ffloor', 'ffract', 'flog2', 'fneg',
        'frcp', 'fround_even', 'frsq', 'fsat', 'fsign', 'fsin', 'fsqrt'):
    late_optimizations.append(
        (('~f2f32', (unary_op, ('f2fmp', a))), (unary_op, a)))
2837bf215546Sopenharmony_ci
# Binary opcodes: f2f32(op(f2fmp(a), f2fmp(b))) -> op(a, b)
for binary_op in ('fadd', 'fdiv', 'fmax', 'fmin', 'fmod', 'fmul', 'fpow', 'frem'):
    late_optimizations.append(
        (('~f2f32', (binary_op, ('f2fmp', a), ('f2fmp', b))), (binary_op, a, b)))
2841bf215546Sopenharmony_ci
# Ternary opcodes: f2f32(op(f2fmp(a), f2fmp(b), f2fmp(c))) -> op(a, b, c)
for ternary_op in ('ffma', 'flrp'):
    late_optimizations.append(
        (('~f2f32', (ternary_op, ('f2fmp', a), ('f2fmp', b), ('f2fmp', c))),
         (ternary_op, a, b, c)))
2845bf215546Sopenharmony_ci
# Comparison opcodes: cmp(f2fmp(a), f2fmp(b)) -> cmp(a, b).  The search side
# carries the '~' prefix; the replacement uses the plain opcode name.
for compare_op in ('feq', 'fge', 'flt', 'fneu'):
    inexact_compare = '~' + compare_op
    late_optimizations.append(
        ((inexact_compare, ('f2fmp', a), ('f2fmp', b)), (compare_op, a, b)))
2849bf215546Sopenharmony_ci
# Do this last, so that the f2fmp patterns above have effect.
late_optimizations += [
  # Convert *2*mp instructions to concrete *2*16 instructions. At this point
  # any conversions that could have been removed will have been removed in
  # nir_opt_algebraic so any remaining ones are required.
  #
  # Each *2*mp opcode gets exactly one rule here; no search expression is
  # repeated, so the mapping from mp opcode to 16-bit opcode is unambiguous.
  (('f2fmp', a), ('f2f16', a)),
  (('f2imp', a), ('f2i16', a)),
  (('f2ump', a), ('f2u16', a)),
  (('i2imp', a), ('i2i16', a)),
  (('i2fmp', a), ('i2f16', a)),
  (('u2fmp', a), ('u2f16', a)),

  # fisfinite(a) is rewritten as |a| < +inf.
  (('fisfinite', a), ('flt', ('fabs', a), float("inf"))),
]
2864bf215546Sopenharmony_ci
# Rules that move fneg / fabs source modifiers through arithmetic.  The list
# is assembled in pieces so the fdotN_replicated families can be generated,
# but the resulting rule order is the same as writing every rule out by hand.
_fdot_replicated_ops = ['fdot{}_replicated'.format(n) for n in (2, 3, 4)]

# Try to remove some spurious negations rather than pushing them down.
distribute_src_mods = [
   (('fmul', ('fneg', a), ('fneg', b)), ('fmul', a, b)),
   (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)),
]
distribute_src_mods += [
   ((dot, ('fneg', a), ('fneg', b)), (dot, a, b))
   for dot in _fdot_replicated_ops
]
distribute_src_mods += [
   (('fneg', ('fneg', a)), a),

   (('fneg', ('fmul(is_used_once)', a, b)), ('fmul', ('fneg', a), b)),
   (('fabs', ('fmul(is_used_once)', a, b)), ('fmul', ('fabs', a), ('fabs', b))),

   (('fneg', ('ffma(is_used_once)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))),
   (('fneg', ('flrp(is_used_once)', a, b, c)), ('flrp', ('fneg', a), ('fneg', b), c)),
   (('fneg', ('~fadd(is_used_once)', a, b)), ('fadd', ('fneg', a), ('fneg', b))),

   # Note that fmin <-> fmax.  I don't think there is a way to distribute
   # fabs() into fmin or fmax.
   (('fneg', ('fmin(is_used_once)', a, b)), ('fmax', ('fneg', a), ('fneg', b))),
   (('fneg', ('fmax(is_used_once)', a, b)), ('fmin', ('fneg', a), ('fneg', b))),
]
# For the dot products, the negation is pushed into the first source only.
distribute_src_mods += [
   (('fneg', (dot + '(is_used_once)', a, b)), (dot, ('fneg', a), b))
   for dot in _fdot_replicated_ops
]
distribute_src_mods += [
   # fdph works mostly like fdot, but to get the correct result, the negation
   # must be applied to the second source.
   (('fneg', ('fdph_replicated(is_used_once)', a, b)), ('fdph_replicated', a, ('fneg', b))),

   (('fneg', ('fsign(is_used_once)', a)), ('fsign', ('fneg', a))),
   (('fabs', ('fsign(is_used_once)', a)), ('fsign', ('fabs', a))),
]
2897bf215546Sopenharmony_ci
# Build each algebraic pass from its rule table, render it, and write the
# result to stdout.  Passes are emitted one at a time, in this order.
for pass_name, rules in [
        ("nir_opt_algebraic", optimizations),
        ("nir_opt_algebraic_before_ffma", before_ffma_optimizations),
        ("nir_opt_algebraic_late", late_optimizations),
        ("nir_opt_algebraic_distribute_src_mods", distribute_src_mods),
]:
    print(nir_algebraic.AlgebraicPass(pass_name, rules).render())
2905