1# -*- coding: utf-8 -*-
2#
3# Copyright (C) 2014 Intel Corporation
4#
5# Permission is hereby granted, free of charge, to any person obtaining a
6# copy of this software and associated documentation files (the "Software"),
7# to deal in the Software without restriction, including without limitation
8# the rights to use, copy, modify, merge, publish, distribute, sublicense,
9# and/or sell copies of the Software, and to permit persons to whom the
10# Software is furnished to do so, subject to the following conditions:
11#
12# The above copyright notice and this permission notice (including the next
13# paragraph) shall be included in all copies or substantial portions of the
14# Software.
15#
16# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22# IN THE SOFTWARE.
23#
24# Authors:
25#    Jason Ekstrand (jason@jlekstrand.net)
26
27from collections import OrderedDict
28import nir_algebraic
29from nir_opcodes import type_sizes
30import itertools
31import struct
32from math import pi
33import math
34
35# Convenience variables
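# Single-letter strings used as wildcard variable names in the search and
# replace patterns below.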
36a = 'a'
37b = 'b'
38c = 'c'
39d = 'd'
40e = 'e'
41
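# C condition strings (spliced into the generated pass) that check whether the
# shader's float-controls execution mode requires preserving signed zeros,
# infinities, and NaNs at the given bit size.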
42signed_zero_inf_nan_preserve_16 = 'nir_is_float_control_signed_zero_inf_nan_preserve(info->float_controls_execution_mode, 16)'
43signed_zero_inf_nan_preserve_32 = 'nir_is_float_control_signed_zero_inf_nan_preserve(info->float_controls_execution_mode, 32)'
44
45ignore_exact = nir_algebraic.ignore_exact
46
47# Written in the form (<search>, <replace>) where <search> is an expression
48# and <replace> is either an expression or a value.  An expression is
49# defined as a tuple of the form ([~]<op>, <src0>, <src1>, <src2>, <src3>)
50# where each source is either an expression or a value.  A value can be
51# either a numeric constant or a string representing a variable name.
52#
53# If the opcode in a search expression is prefixed by a '~' character, this
54# indicates that the operation is inexact.  Such operations will only get
55# applied to SSA values that do not have the exact bit set.  This should be
# used by any optimizations that are not bit-for-bit exact.  It should not,
57# however, be used for backend-requested lowering operations as those need to
58# happen regardless of precision.
59#
60# Variable names are specified as "[#]name[@type][(cond)][.swiz]" where:
61# "#" indicates that the given variable will only match constants,
62# type indicates that the given variable will only match values from ALU
63#    instructions with the given output type,
64# (cond) specifies an additional condition function (see nir_search_helpers.h),
65# swiz is a swizzle applied to the variable (only in the <replace> expression)
66#
67# For constants, you have to be careful to make sure that it is the right
68# type because python is unaware of the source and destination types of the
69# opcodes.
70#
71# All expression types can have a bit-size specified.  For opcodes, this
72# looks like "op@32", for variables it is "a@32" or "a@uint32" to specify a
73# type and size.  In the search half of the expression this indicates that it
74# should only match that particular bit-size.  In the replace half of the
75# expression this indicates that the constructed value should have that
76# bit-size.
77#
78# If the opcode in a replacement expression is prefixed by a '!' character,
# this indicates that the new expression will be marked exact.
80#
81# A special condition "many-comm-expr" can be used with expressions to note
82# that the expression and its subexpressions have more commutative expressions
83# than nir_replace_instr can handle.  If this special condition is needed with
84# another condition, the two can be separated by a comma (e.g.,
85# "(many-comm-expr,is_used_once)").
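#
# For example, the first rule in the list below,
#
#    (('imul', a, '#b(is_pos_power_of_two)'),
#     ('ishl', a, ('find_lsb', b)),
#     '!options->lower_bitops'),
#
# matches an integer multiply whose second source is a constant positive power
# of two and replaces it with a left shift; the trailing string is an extra C
# condition that must also hold for the rule to apply.  Similarly,
#
#    (('~fmul', a, 1.0), a),
#
# further below removes an inexact multiply by 1.0 entirely.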
86
87# based on https://web.archive.org/web/20180105155939/http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648
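# Builds a NIR expression tree for the parabola-based sine approximation from
# the link above: the input is range-reduced to a single period, approximated
# with 4*(x - x*|x|), and then refined by one weighted correction step.  The
# constant c is a phase offset, which is how the same helper can produce either
# a sine or a cosine.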
88def lowered_sincos(c):
89    x = ('fsub', ('fmul', 2.0, ('ffract', ('fadd', ('fmul', 0.5 / pi, a), c))), 1.0)
90    x = ('fmul', ('fsub', x, ('fmul', x, ('fabs', x))), 4.0)
91    return ('ffma', ('ffma', x, ('fabs', x), ('fneg', x)), 0.225, x)
92
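# Reinterpret a 32-bit integer bit pattern as a float, e.g.
# intBitsToFloat(0x3f800000) == 1.0.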
93def intBitsToFloat(i):
94    return struct.unpack('!f', struct.pack('!I', i))[0]
95
96optimizations = [
97
98   (('imul', a, '#b(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
99   (('imul', 'a@8', 0x80), ('ishl', a, 7), '!options->lower_bitops'),
100   (('imul', 'a@16', 0x8000), ('ishl', a, 15), '!options->lower_bitops'),
101   (('imul', 'a@32', 0x80000000), ('ishl', a, 31), '!options->lower_bitops'),
102   (('imul', 'a@64', 0x8000000000000000), ('ishl', a, 63), '!options->lower_bitops'),
103   (('imul', a, '#b(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
104   (('ishl', a, '#b'), ('imul', a, ('ishl', 1, b)), 'options->lower_bitops'),
105
106   (('imul@64', a, '#b(is_bitcount2)'), ('iadd', ('ishl', a, ('ufind_msb', b)), ('ishl', a, ('find_lsb', b))),
107    '!options->lower_bitops && (options->lower_int64_options & (nir_lower_imul64 | nir_lower_shift64)) == nir_lower_imul64'),
108
109   (('unpack_64_2x32_split_x', ('imul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
110   (('unpack_64_2x32_split_x', ('umul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
111   (('imul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('imul_high', a, b)), 'options->lower_mul_2x32_64'),
112   (('umul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('umul_high', a, b)), 'options->lower_mul_2x32_64'),
113   (('udiv', a, 1), a),
114   (('idiv', a, 1), a),
115   (('umod', a, 1), 0),
116   (('imod', a, 1), 0),
117   (('imod', a, -1), 0),
118   (('irem', a, 1), 0),
119   (('irem', a, -1), 0),
120   (('udiv', a, '#b(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b)), '!options->lower_bitops'),
121   (('idiv', a, '#b(is_pos_power_of_two)'), ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', b))), '!options->lower_bitops'),
122   (('idiv', a, '#b(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), '!options->lower_bitops'),
123   (('umod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'),
124   (('imod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'),
125   (('imod', a, '#b(is_neg_power_of_two)'), ('bcsel', ('ieq', ('ior', a, b), b), 0, ('ior', a, b)), '!options->lower_bitops'),
126   # 'irem(a, b)' -> 'a - ((a < 0 ? (a + b - 1) : a) & -b)'
127   (('irem', a, '#b(is_pos_power_of_two)'),
128    ('isub', a, ('iand', ('bcsel', ('ilt', a, 0), ('iadd', a, ('isub', b, 1)), a), ('ineg', b))),
129    '!options->lower_bitops'),
130   (('irem', a, '#b(is_neg_power_of_two)'), ('irem', a, ('iabs', b)), '!options->lower_bitops'),
131
132   (('~fneg', ('fneg', a)), a),
133   (('ineg', ('ineg', a)), a),
134   (('fabs', ('fneg', a)), ('fabs', a)),
135   (('fabs', ('u2f', a)), ('u2f', a)),
136   (('iabs', ('iabs', a)), ('iabs', a)),
137   (('iabs', ('ineg', a)), ('iabs', a)),
138   (('f2b', ('fneg', a)), ('f2b', a)),
139   (('i2b', ('ineg', a)), ('i2b', a)),
140   (('~fadd', a, 0.0), a),
   # a+0.0 is 'a' unless 'a' is denormal or -0.0. If it's only used by a
   # floating point instruction, that instruction should flush any input
   # denormals, and we can replace -0.0 with 0.0 if the float execution mode
   # allows it.
144   (('fadd(is_only_used_as_float)', 'a@16', 0.0), a, '!'+signed_zero_inf_nan_preserve_16),
145   (('fadd(is_only_used_as_float)', 'a@32', 0.0), a, '!'+signed_zero_inf_nan_preserve_32),
146   (('iadd', a, 0), a),
147   (('iadd_sat', a, 0), a),
148   (('isub_sat', a, 0), a),
149   (('uadd_sat', a, 0), a),
150   (('usub_sat', a, 0), a),
151   (('usadd_4x8_vc4', a, 0), a),
152   (('usadd_4x8_vc4', a, ~0), ~0),
153   (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
154   (('~fadd', ('fmulz', a, b), ('fmulz', a, c)), ('fmulz', a, ('fadd', b, c))),
155   (('~ffma', a, b, ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),
156   (('~ffma', a, b, ('fmul(is_used_once)', a, c)), ('fmul', a, ('fadd', b, c))),
157   (('~fadd', ('fmul(is_used_once)', a, b), ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),
158   (('~ffma', a, ('fmul(is_used_once)', b, c), ('fmul(is_used_once)', b, d)), ('fmul', b, ('ffma', a, c, d))),
159   (('~ffmaz', a, b, ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)),
160   (('~ffmaz', a, b, ('fmulz(is_used_once)', a, c)), ('fmulz', a, ('fadd', b, c))),
161   (('~fadd', ('fmulz(is_used_once)', a, b), ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)),
162   (('~ffmaz', a, ('fmulz(is_used_once)', b, c), ('fmulz(is_used_once)', b, d)), ('fmulz', b, ('ffmaz', a, c, d))),
163   (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
164   (('iand', ('ior', a, b), ('ior', a, c)), ('ior', a, ('iand', b, c))),
165   (('ior', ('iand', a, b), ('iand', a, c)), ('iand', a, ('ior', b, c))),
166   (('~fadd', ('fneg', a), a), 0.0),
167   (('iadd', ('ineg', a), a), 0),
168   (('iadd', ('ineg', a), ('iadd', a, b)), b),
169   (('iadd', a, ('iadd', ('ineg', a), b)), b),
170   (('~fadd', ('fneg', a), ('fadd', a, b)), b),
171   (('~fadd', a, ('fadd', ('fneg', a), b)), b),
172   (('fadd', ('fsat', a), ('fsat', ('fneg', a))), ('fsat', ('fabs', a))),
173   (('~fmul', a, 0.0), 0.0),
174   # The only effect a*0.0 should have is when 'a' is infinity, -0.0 or NaN
175   (('fmul', 'a@16', 0.0), 0.0, '!'+signed_zero_inf_nan_preserve_16),
176   (('fmul', 'a@32', 0.0), 0.0, '!'+signed_zero_inf_nan_preserve_32),
177   (('fmulz', a, 0.0), 0.0),
178   (('fmulz', a, 'b(is_finite_not_zero)'), ('fmul', a, b), '!'+signed_zero_inf_nan_preserve_32),
179   (('fmulz', 'a(is_finite)', 'b(is_finite)'), ('fmul', a, b)),
180   (('fmulz', a, a), ('fmul', a, a)),
181   (('ffmaz', a, 'b(is_finite_not_zero)', c), ('ffma', a, b, c), '!'+signed_zero_inf_nan_preserve_32),
182   (('ffmaz', 'a(is_finite)', 'b(is_finite)', c), ('ffma', a, b, c)),
183   (('ffmaz', a, a, b), ('ffma', a, a, b)),
184   (('imul', a, 0), 0),
185   (('umul_unorm_4x8_vc4', a, 0), 0),
186   (('umul_unorm_4x8_vc4', a, ~0), a),
187   (('~fmul', a, 1.0), a),
188   (('~fmulz', a, 1.0), a),
   # The only effect a*1.0 can have is flushing denormals. If it's only used by
   # a floating point instruction, that instruction should flush any input
   # denormals, so this multiplication isn't needed.
192   (('fmul(is_only_used_as_float)', a, 1.0), a),
193   (('imul', a, 1), a),
194   (('fmul', a, -1.0), ('fneg', a)),
195   (('imul', a, -1), ('ineg', a)),
196   # If a < 0: fsign(a)*a*a => -1*a*a => -a*a => abs(a)*a
197   # If a > 0: fsign(a)*a*a => 1*a*a => a*a => abs(a)*a
198   # If a == 0: fsign(a)*a*a => 0*0*0 => abs(0)*0
199   # If a != a: fsign(a)*a*a => 0*NaN*NaN => abs(NaN)*NaN
200   (('fmul', ('fsign', a), ('fmul', a, a)), ('fmul', ('fabs', a), a)),
201   (('fmul', ('fmul', ('fsign', a), a), a), ('fmul', ('fabs', a), a)),
202   (('~ffma', 0.0, a, b), b),
203   (('ffma@16(is_only_used_as_float)', 0.0, a, b), b, '!'+signed_zero_inf_nan_preserve_16),
204   (('ffma@32(is_only_used_as_float)', 0.0, a, b), b, '!'+signed_zero_inf_nan_preserve_32),
205   (('ffmaz', 0.0, a, b), ('fadd', 0.0, b)),
206   (('~ffma', a, b, 0.0), ('fmul', a, b)),
207   (('ffma@16', a, b, 0.0), ('fmul', a, b), '!'+signed_zero_inf_nan_preserve_16),
208   (('ffma@32', a, b, 0.0), ('fmul', a, b), '!'+signed_zero_inf_nan_preserve_32),
209   (('ffmaz', a, b, 0.0), ('fmulz', a, b), '!'+signed_zero_inf_nan_preserve_32),
210   (('ffma', 1.0, a, b), ('fadd', a, b)),
211   (('ffmaz', 1.0, a, b), ('fadd', a, b), '!'+signed_zero_inf_nan_preserve_32),
212   (('ffma', -1.0, a, b), ('fadd', ('fneg', a), b)),
213   (('ffmaz', -1.0, a, b), ('fadd', ('fneg', a), b), '!'+signed_zero_inf_nan_preserve_32),
214   (('~ffma', '#a', '#b', c), ('fadd', ('fmul', a, b), c)),
215   (('~ffmaz', '#a', '#b', c), ('fadd', ('fmulz', a, b), c)),
216   (('~flrp', a, b, 0.0), a),
217   (('~flrp', a, b, 1.0), b),
218   (('~flrp', a, a, b), a),
219   (('~flrp', 0.0, a, b), ('fmul', a, b)),
220
221   # flrp(a, a + b, c) => a + flrp(0, b, c) => a + (b * c)
222   (('~flrp', a, ('fadd(is_used_once)', a, b), c), ('fadd', ('fmul', b, c), a)),
223
224   (('sdot_4x8_iadd', a, 0, b), b),
225   (('udot_4x8_uadd', a, 0, b), b),
226   (('sdot_4x8_iadd_sat', a, 0, b), b),
227   (('udot_4x8_uadd_sat', a, 0, b), b),
228   (('sdot_2x16_iadd', a, 0, b), b),
229   (('udot_2x16_uadd', a, 0, b), b),
230   (('sdot_2x16_iadd_sat', a, 0, b), b),
231   (('udot_2x16_uadd_sat', a, 0, b), b),
232
   # sudot_4x8_iadd is not commutative at all, so the patterns must be
   # duplicated, with a zero in each of the first two source positions.
235   (('sudot_4x8_iadd', a, 0, b), b),
236   (('sudot_4x8_iadd', 0, a, b), b),
237   (('sudot_4x8_iadd_sat', a, 0, b), b),
238   (('sudot_4x8_iadd_sat', 0, a, b), b),
239
240   (('iadd', ('sdot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_4x8_iadd', a, b, ('iadd', c, d))),
241   (('iadd', ('udot_4x8_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_4x8_uadd', a, b, ('iadd', c, d))),
242   (('iadd', ('sudot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sudot_4x8_iadd', a, b, ('iadd', c, d))),
243   (('iadd', ('sdot_2x16_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_2x16_iadd', a, b, ('iadd', c, d))),
244   (('iadd', ('udot_2x16_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_2x16_uadd', a, b, ('iadd', c, d))),
245
246   # Try to let constant folding eliminate the dot-product part.  These are
247   # safe because the dot product cannot overflow 32 bits.
248   (('iadd', ('sdot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sdot_4x8_iadd', a, b, c)),
249   (('iadd', ('udot_4x8_uadd', 'a(is_not_const)', b, 0), c), ('udot_4x8_uadd', a, b, c)),
250   (('iadd', ('sudot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sudot_4x8_iadd', a, b, c)),
251   (('iadd', ('sudot_4x8_iadd', a, 'b(is_not_const)', 0), c), ('sudot_4x8_iadd', a, b, c)),
252   (('iadd', ('sdot_2x16_iadd', 'a(is_not_const)', b, 0), c), ('sdot_2x16_iadd', a, b, c)),
253   (('iadd', ('udot_2x16_uadd', 'a(is_not_const)', b, 0), c), ('udot_2x16_uadd', a, b, c)),
254   (('sdot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_4x8_iadd', a, b, 0), c)),
255   (('udot_4x8_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_4x8_uadd', a, b, 0), c)),
256   (('sudot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sudot_4x8_iadd', a, b, 0), c)),
257   (('sdot_2x16_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_2x16_iadd', a, b, 0), c)),
258   (('udot_2x16_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_2x16_uadd', a, b, 0), c)),
259   (('sdot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
260   (('udot_4x8_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_4x8_uadd', a, b, 0), c), '!options->lower_uadd_sat'),
261   (('sudot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sudot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
262   (('sdot_2x16_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_2x16_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
263   (('udot_2x16_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_2x16_uadd', a, b, 0), c), '!options->lower_uadd_sat'),
264
265   # Optimize open-coded fmulz.
266   # (b==0.0 ? 0.0 : a) * (a==0.0 ? 0.0 : b) -> fmulz(a, b)
267   (('fmul@32', ('bcsel', ignore_exact('feq', b, 0.0), 0.0, a), ('bcsel', ignore_exact('feq', a, 0.0), 0.0, b)),
268    ('fmulz', a, b), 'options->has_fmulz && !'+signed_zero_inf_nan_preserve_32),
269   (('fmul@32', a, ('bcsel', ignore_exact('feq', a, 0.0), 0.0, '#b(is_not_const_zero)')),
270    ('fmulz', a, b), 'options->has_fmulz && !'+signed_zero_inf_nan_preserve_32),
271
272   # ffma(b==0.0 ? 0.0 : a, a==0.0 ? 0.0 : b, c) -> ffmaz(a, b, c)
273   (('ffma@32', ('bcsel', ignore_exact('feq', b, 0.0), 0.0, a), ('bcsel', ignore_exact('feq', a, 0.0), 0.0, b), c),
274    ('ffmaz', a, b, c), 'options->has_fmulz && !'+signed_zero_inf_nan_preserve_32),
275   (('ffma@32', a, ('bcsel', ignore_exact('feq', a, 0.0), 0.0, '#b(is_not_const_zero)'), c),
276    ('ffmaz', a, b, c), 'options->has_fmulz && !'+signed_zero_inf_nan_preserve_32),
277]
278
279# Shorthand for the expansion of just the dot product part of the [iu]dp4a
280# instructions.
281sdot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_i8', b, 0)),
282                                 ('imul', ('extract_i8', a, 1), ('extract_i8', b, 1))),
283                        ('iadd', ('imul', ('extract_i8', a, 2), ('extract_i8', b, 2)),
284                                 ('imul', ('extract_i8', a, 3), ('extract_i8', b, 3))))
285udot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_u8', a, 0), ('extract_u8', b, 0)),
286                                 ('imul', ('extract_u8', a, 1), ('extract_u8', b, 1))),
287                        ('iadd', ('imul', ('extract_u8', a, 2), ('extract_u8', b, 2)),
288                                 ('imul', ('extract_u8', a, 3), ('extract_u8', b, 3))))
289sudot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_u8', b, 0)),
290                                  ('imul', ('extract_i8', a, 1), ('extract_u8', b, 1))),
291                         ('iadd', ('imul', ('extract_i8', a, 2), ('extract_u8', b, 2)),
292                                  ('imul', ('extract_i8', a, 3), ('extract_u8', b, 3))))
293sdot_2x16_a_b = ('iadd', ('imul', ('extract_i16', a, 0), ('extract_i16', b, 0)),
294                         ('imul', ('extract_i16', a, 1), ('extract_i16', b, 1)))
295udot_2x16_a_b = ('iadd', ('imul', ('extract_u16', a, 0), ('extract_u16', b, 0)),
296                         ('imul', ('extract_u16', a, 1), ('extract_u16', b, 1)))
297
298optimizations.extend([
299   (('sdot_4x8_iadd', a, b, c), ('iadd', sdot_4x8_a_b, c), '!options->has_sdot_4x8'),
300   (('udot_4x8_uadd', a, b, c), ('iadd', udot_4x8_a_b, c), '!options->has_udot_4x8'),
301   (('sudot_4x8_iadd', a, b, c), ('iadd', sudot_4x8_a_b, c), '!options->has_sudot_4x8'),
302   (('sdot_2x16_iadd', a, b, c), ('iadd', sdot_2x16_a_b, c), '!options->has_dot_2x16'),
303   (('udot_2x16_uadd', a, b, c), ('iadd', udot_2x16_a_b, c), '!options->has_dot_2x16'),
304
   # For the unsigned dot-product, the largest possible value is 4*(255*255) =
   # 0x3f804, so we don't have to worry about that intermediate result
   # overflowing.  0x100000000 - 0x3f804 = 0xfffc07fc.  If c is a constant
   # less than 0xfffc07fc, then the result can never overflow.
309   (('udot_4x8_uadd_sat', a, b, '#c(is_ult_0xfffc07fc)'), ('udot_4x8_uadd', a, b, c)),
310   (('udot_4x8_uadd_sat', a, b, c), ('uadd_sat', udot_4x8_a_b, c), '!options->has_udot_4x8'),
311
312   # For the signed dot-product, the largest positive value is 4*(-128*-128) =
313   # 0x10000, and the largest negative value is 4*(-128*127) = -0xfe00.  We
314   # don't have to worry about that intermediate result overflowing or
315   # underflowing.
316   (('sdot_4x8_iadd_sat', a, b, c), ('iadd_sat', sdot_4x8_a_b, c), '!options->has_sdot_4x8'),
317
318   (('sudot_4x8_iadd_sat', a, b, c), ('iadd_sat', sudot_4x8_a_b, c), '!options->has_sudot_4x8'),
319
320   (('udot_2x16_uadd_sat', a, b, c), ('uadd_sat', udot_2x16_a_b, c), '!options->has_dot_2x16'),
321   (('sdot_2x16_iadd_sat', a, b, c), ('iadd_sat', sdot_2x16_a_b, c), '!options->has_dot_2x16'),
322])
323
324# Float sizes
325for s in [16, 32, 64]:
326    optimizations.extend([
327       (('~flrp@{}'.format(s), a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),
328
329       (('~flrp@{}'.format(s), a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp{}'.format(s)),
330       (('~flrp@{}'.format(s), ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp{}'.format(s)),
331       (('~flrp@{}'.format(s), a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp{}'.format(s)),
332
333       (('~fadd@{}'.format(s), ('fmul', a, ('fadd', 1.0, ('fneg', c))), ('fmul', b, c)), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)),
       # These are the same as the previous three rules, but they depend on
335       # 1-fsat(x) <=> fsat(1-x).  See below.
336       (('~fadd@{}'.format(s), ('fmul', a, ('fsat', ('fadd', 1.0, ('fneg', c)))), ('fmul', b, ('fsat', c))), ('flrp', a, b, ('fsat', c)), '!options->lower_flrp{}'.format(s)),
337       (('~fadd@{}'.format(s), a, ('fmul', c, ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)),
338
339       (('~fadd@{}'.format(s),    ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1')))), ('fmul', b, ('b2f',  c))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),
340       (('~fadd@{}'.format(s), a, ('fmul', ('b2f', 'c@1'), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),
341
342       (('~ffma@{}'.format(s), a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1'))), ('fmul', b, ('b2f', 'c@1'))), ('bcsel', c, b, a)),
343       (('~ffma@{}'.format(s), b, ('b2f', 'c@1'), ('ffma', ('fneg', a), ('b2f', 'c@1'), a)), ('bcsel', c, b, a)),
344
345       # These two aren't flrp lowerings, but do appear in some shaders.
346       (('~ffma@{}'.format(s), ('b2f', 'c@1'), ('fadd', b, ('fneg', a)), a), ('bcsel', c, b, a)),
347       (('~ffma@{}'.format(s), ('b2f', 'c@1'), ('ffma', ('fneg', a), b, d), ('fmul', a, b)), ('bcsel', c, d, ('fmul', a, b))),
348
349       # 1 - ((1 - a) * (1 - b))
350       # 1 - (1 - a - b + a*b)
351       # 1 - 1 + a + b - a*b
352       # a + b - a*b
353       # a + b*(1 - a)
354       # b*(1 - a) + 1*a
355       # flrp(b, 1, a)
356       (('~fadd@{}'.format(s), 1.0, ('fneg', ('fmul', ('fadd', 1.0, ('fneg', a)), ('fadd', 1.0, ('fneg', b))))), ('flrp', b, 1.0, a), '!options->lower_flrp{}'.format(s)),
357    ])
358
359optimizations.extend([
360   (('~flrp', ('fmul(is_used_once)', a, b), ('fmul(is_used_once)', a, c), d), ('fmul', ('flrp', b, c, d), a)),
361
362   (('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)),
363   (('ftrunc', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'),
364
365   (('ffloor@16', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
366   (('ffloor@32', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
367   (('ffloor@64', a), ('fsub', a, ('ffract', a)), '(options->lower_ffloor || (options->lower_doubles_options & nir_lower_dfloor)) && !(options->lower_doubles_options & nir_lower_dfract)'),
368   (('fadd@16', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
369   (('fadd@32', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
370   (('fadd@64', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor && !(options->lower_doubles_options & nir_lower_dfloor)'),
371   (('ffract@16', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
372   (('ffract@32', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
373   (('ffract@64', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract || (options->lower_doubles_options & nir_lower_dfract)'),
374   (('fceil', a), ('fneg', ('ffloor', ('fneg', a))), 'options->lower_fceil'),
375   (('ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma16'),
376   (('ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma32'),
377   (('ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma64'),
378   (('ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->lower_ffma32'),
379   # Always lower inexact ffma, because it will be fused back by late optimizations (nir_opt_algebraic_late).
380   (('~ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma16'),
381   (('~ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma32'),
382   (('~ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma64'),
383   (('~ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->fuse_ffma32'),
384
385   (('~fmul', ('fadd', ('iand', ('ineg', ('b2i', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
386    ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),
387
388   (('fdph', a, b), ('fdot4', ('vec4', 'a.x', 'a.y', 'a.z', 1.0), b), 'options->lower_fdph'),
389
390   (('fdot4', ('vec4', a, b,   c,   1.0), d), ('fdph',  ('vec3', a, b, c), d), '!options->lower_fdph'),
391   (('fdot4', ('vec4', a, 0.0, 0.0, 0.0), b), ('fmul', a, b)),
392   (('fdot4', ('vec4', a, b,   0.0, 0.0), c), ('fdot2', ('vec2', a, b), c)),
393   (('fdot4', ('vec4', a, b,   c,   0.0), d), ('fdot3', ('vec3', a, b, c), d)),
394
395   (('fdot3', ('vec3', a, 0.0, 0.0), b), ('fmul', a, b)),
396   (('fdot3', ('vec3', a, b,   0.0), c), ('fdot2', ('vec2', a, b), c)),
397
398   (('fdot2', ('vec2', a, 0.0), b), ('fmul', a, b)),
399   (('fdot2', a, 1.0), ('fadd', 'a.x', 'a.y')),
400
401   # Lower fdot to fsum when it is available
402   (('fdot2', a, b), ('fsum2', ('fmul', a, b)), 'options->lower_fdot'),
403   (('fdot3', a, b), ('fsum3', ('fmul', a, b)), 'options->lower_fdot'),
404   (('fdot4', a, b), ('fsum4', ('fmul', a, b)), 'options->lower_fdot'),
405   (('fsum2', a), ('fadd', 'a.x', 'a.y'), 'options->lower_fdot'),
406
407   # If x >= 0 and x <= 1: fsat(1 - x) == 1 - fsat(x) trivially
408   # If x < 0: 1 - fsat(x) => 1 - 0 => 1 and fsat(1 - x) => fsat(> 1) => 1
409   # If x > 1: 1 - fsat(x) => 1 - 1 => 0 and fsat(1 - x) => fsat(< 0) => 0
410   (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),
411
412   # (a * #b + #c) << #d
413   # ((a * #b) << #d) + (#c << #d)
414   # (a * (#b << #d)) + (#c << #d)
415   (('ishl', ('iadd', ('imul', a, '#b'), '#c'), '#d'),
416    ('iadd', ('imul', a, ('ishl', b, d)), ('ishl', c, d))),
417
418   # (a * #b) << #c
419   # a * (#b << #c)
420   (('ishl', ('imul', a, '#b'), '#c'), ('imul', a, ('ishl', b, c))),
421])
422
# Care must be taken here.  Shifts in NIR use only the lower log2(bitsize)
424# bits of the second source.  These replacements must correctly handle the
425# case where (b % bitsize) + (c % bitsize) >= bitsize.
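# For example, with bitsize == 32, (a << 20) << 20 must fold to 0: the combined
# count of 40 is out of range, so the replacement selects 0 whenever the masked
# shift counts sum to bitsize or more.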
426for s in [8, 16, 32, 64]:
427   mask = s - 1
428
429   ishl = "ishl@{}".format(s)
430   ishr = "ishr@{}".format(s)
431   ushr = "ushr@{}".format(s)
432
433   in_bounds = ('ult', ('iadd', ('iand', b, mask), ('iand', c, mask)), s)
434
435   optimizations.extend([
436       ((ishl, (ishl, a, '#b'), '#c'), ('bcsel', in_bounds, (ishl, a, ('iadd', b, c)), 0)),
437       ((ushr, (ushr, a, '#b'), '#c'), ('bcsel', in_bounds, (ushr, a, ('iadd', b, c)), 0)),
438
       # To get -1 for large shifts of negative values, ishr must instead
440       # clamp the shift count to the maximum value.
441       ((ishr, (ishr, a, '#b'), '#c'),
442        (ishr, a, ('imin', ('iadd', ('iand', b, mask), ('iand', c, mask)), s - 1))),
443   ])
444
# Optimize a pattern of address calculation created by DXVK where the offset is
# divided by 4 and then multiplied by 4. This can be turned into an iand, and
# the preceding additions can be reassociated so that the iand can be CSE'd.
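#
# For example, for a 32-bit offset, (a >> 2) << 2 becomes a & 0xfffffffc.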
448
449for size, mask in ((8, 0xff), (16, 0xffff), (32, 0xffffffff), (64, 0xffffffffffffffff)):
450    a_sz = 'a@{}'.format(size)
451
452    optimizations.extend([
453       # 'a >> #b << #b' -> 'a & ~((1 << #b) - 1)'
454       (('ishl', ('ushr', a_sz, '#b'), b), ('iand', a, ('ishl', mask, b))),
455       (('ishl', ('ishr', a_sz, '#b'), b), ('iand', a, ('ishl', mask, b))),
456
457       # This does not trivially work with ishr.
458       (('ushr', ('ishl', a_sz, '#b'), b), ('iand', a, ('ushr', mask, b))),
459    ])
460
461optimizations.extend([
462    (('iand', ('ishl', 'a@32', '#b(is_first_5_bits_uge_2)'), -4), ('ishl', a, b)),
463    (('iand', ('imul', a, '#b(is_unsigned_multiple_of_4)'), -4), ('imul', a, b)),
464])
465
466for log2 in range(1, 7): # powers of two from 2 to 64
467   v = 1 << log2
468   mask = 0xffffffff & ~(v - 1)
469   b_is_multiple = '#b(is_unsigned_multiple_of_{})'.format(v)
470
471   optimizations.extend([
472       # Reassociate for improved CSE
473       (('iand@32', ('iadd@32', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)),
474   ])
475
476# To save space in the state tables, reduce to the set that is known to help.
# Previously, this was range(1, 32).  In addition, a couple of rules inside the
478# loop are commented out.  Revisit someday, probably after mesa/#2635 has some
479# resolution.
480for i in [1, 2, 16, 24]:
481    lo_mask = 0xffffffff >> i
482    hi_mask = (0xffffffff << i) & 0xffffffff
483
484    optimizations.extend([
485        # This pattern seems to only help in the soft-fp64 code.
486        (('ishl@32', ('iand', 'a@32', lo_mask), i), ('ishl', a, i)),
487#        (('ushr@32', ('iand', 'a@32', hi_mask), i), ('ushr', a, i)),
488#        (('ishr@32', ('iand', 'a@32', hi_mask), i), ('ishr', a, i)),
489
490        (('iand', ('ishl', 'a@32', i), hi_mask), ('ishl', a, i)),
491        (('iand', ('ushr', 'a@32', i), lo_mask), ('ushr', a, i)),
492#        (('iand', ('ishr', 'a@32', i), lo_mask), ('ushr', a, i)), # Yes, ushr is correct
493    ])
494
495optimizations.extend([
496   # This is common for address calculations.  Reassociating may enable the
497   # 'a<<c' to be CSE'd.  It also helps architectures that have an ISHLADD
   # instruction or a constant offset field in load / store instructions.
499   (('ishl', ('iadd', a, '#b'), '#c'), ('iadd', ('ishl', a, c), ('ishl', b, c))),
500
501   # (a + #b) * #c => (a * #c) + (#b * #c)
502   (('imul', ('iadd(is_used_once)', a, '#b'), '#c'), ('iadd', ('imul', a, c), ('imul', b, c))),
503
504   # ((a + #b) + c) * #d => ((a + c) * #d) + (#b * #d)
505   (('imul', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'),
506    ('iadd', ('imul', ('iadd', a, c), d), ('imul', b, d))),
507   (('ishl', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'),
508    ('iadd', ('ishl', ('iadd', a, c), d), ('ishl', b, d))),
509
510   # Comparison simplifications
511   (('inot', ('flt(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('fge', a, b)),
512   (('inot', ('fge(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('flt', a, b)),
513   (('inot', ('feq(is_used_once)', a, b)), ('fneu', a, b)),
514   (('inot', ('fneu(is_used_once)', a, b)), ('feq', a, b)),
515   (('inot', ('ilt(is_used_once)', a, b)), ('ige', a, b)),
516   (('inot', ('ult(is_used_once)', a, b)), ('uge', a, b)),
517   (('inot', ('ige(is_used_once)', a, b)), ('ilt', a, b)),
518   (('inot', ('uge(is_used_once)', a, b)), ('ult', a, b)),
519   (('inot', ('ieq(is_used_once)', a, b)), ('ine', a, b)),
520   (('inot', ('ine(is_used_once)', a, b)), ('ieq', a, b)),
521
522   (('iand', ('feq', a, b), ('fneu', a, b)), False),
523   (('iand', ('flt', a, b), ('flt', b, a)), False),
524   (('iand', ('ieq', a, b), ('ine', a, b)), False),
525   (('iand', ('ilt', a, b), ('ilt', b, a)), False),
526   (('iand', ('ult', a, b), ('ult', b, a)), False),
527
528   # This helps some shaders because, after some optimizations, they end up
529   # with patterns like (-a < -b) || (b < a).  In an ideal world, this sort of
530   # matching would be handled by CSE.
531   (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
532   (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
533   (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
534   (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)),
535   (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)),
536   (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)),
537   (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)),
538   (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)),
539   (('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)),
540   (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),
541
542   # b < fsat(NaN) -> b < 0 -> false, and b < Nan -> false.
543   (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),
544
545   # fsat(NaN) >= b -> 0 >= b -> false, and NaN >= b -> false.
546   (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),
547
548   # b == fsat(NaN) -> b == 0 -> false, and b == NaN -> false.
549   (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),
550
551   # b != fsat(NaN) -> b != 0 -> true, and b != NaN -> true.
552   (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)),
553
554   # fsat(NaN) >= 1 -> 0 >= 1 -> false, and NaN >= 1 -> false.
555   (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),
556
557   # 0 < fsat(NaN) -> 0 < 0 -> false, and 0 < NaN -> false.
558   (('flt', 0.0, ('fsat(is_used_once)', a)), ('flt', 0.0, a)),
559
560   # 0.0 >= b2f(a)
561   # b2f(a) <= 0.0
562   # b2f(a) == 0.0 because b2f(a) can only be 0 or 1
563   # inot(a)
564   (('fge', 0.0, ('b2f', 'a@1')), ('inot', a)),
565
566   (('fge', ('fneg', ('b2f', 'a@1')), 0.0), ('inot', a)),
567
568   (('fneu', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)),
569   (('fneu', ('bcsel', a, 1.0, ('b2f', 'b@1'))   , 0.0), ('ior', a, b)),
570   (('fneu', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))),      ('ior', a, b)),
571   (('fneu', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)),
572   (('fneu', ('bcsel', a, ('b2f', 'b@1'), 0.0)   , 0.0), ('iand', a, b)),
573   (('fneu', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ixor', a, b)),
574   (('fneu',          ('b2f', 'a@1') ,          ('b2f', 'b@1') ),      ('ixor', a, b)),
575   (('fneu', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))),      ('ixor', a, b)),
576   (('feq', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))),
577   (('feq', ('bcsel', a, 1.0, ('b2f', 'b@1'))   , 0.0), ('inot', ('ior', a, b))),
578   (('feq', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))),      ('inot', ('ior', a, b))),
579   (('feq', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('iand', a, b))),
580   (('feq', ('bcsel', a, ('b2f', 'b@1'), 0.0)   , 0.0), ('inot', ('iand', a, b))),
581   (('feq', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ieq', a, b)),
582   (('feq',          ('b2f', 'a@1') ,          ('b2f', 'b@1') ),      ('ieq', a, b)),
583   (('feq', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))),      ('ieq', a, b)),
584
585   # -(b2f(a) + b2f(b)) < 0
586   # 0 < b2f(a) + b2f(b)
587   # 0 != b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
588   # a || b
589   (('flt', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('ior', a, b)),
590   (('flt', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('ior', a, b)),
591
592   # -(b2f(a) + b2f(b)) >= 0
593   # 0 >= b2f(a) + b2f(b)
594   # 0 == b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
595   # !(a || b)
596   (('fge', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('inot', ('ior', a, b))),
597   (('fge', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('inot', ('ior', a, b))),
598
599   (('flt', a, ('fneg', a)), ('flt', a, 0.0)),
600   (('fge', a, ('fneg', a)), ('fge', a, 0.0)),
601
602   # Some optimizations (below) convert things like (a < b || c < b) into
   # (min(a, c) < b).  However, this interferes with the previous optimizations
604   # that try to remove comparisons with negated sums of b2f.  This just
605   # breaks that apart.
606   (('flt', ('fmin', c, ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')))), 0.0),
607    ('ior', ('flt', c, 0.0), ('ior', a, b))),
608
609   (('~flt', ('fadd', a, b), a), ('flt', b, 0.0)),
610   (('~fge', ('fadd', a, b), a), ('fge', b, 0.0)),
611   (('~feq', ('fadd', a, b), a), ('feq', b, 0.0)),
612   (('~fneu', ('fadd', a, b), a), ('fneu', b, 0.0)),
613   (('~flt',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('flt', a, ('fadd', c, ('fneg', b)))),
614   (('~flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('flt', ('fneg', ('fadd', c, b)), a)),
615   (('~fge',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('fge', a, ('fadd', c, ('fneg', b)))),
616   (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fge', ('fneg', ('fadd', c, b)), a)),
617   (('~feq',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('feq', a, ('fadd', c, ('fneg', b)))),
618   (('~feq', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('feq', ('fneg', ('fadd', c, b)), a)),
619   (('~fneu',                        ('fadd(is_used_once)', a, '#b'),  '#c'), ('fneu', a, ('fadd', c, ('fneg', b)))),
620   (('~fneu', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fneu', ('fneg', ('fadd', c, b)), a)),
621
622   # Cannot remove the addition from ilt or ige due to overflow.
623   (('ieq', ('iadd', a, b), a), ('ieq', b, 0)),
624   (('ine', ('iadd', a, b), a), ('ine', b, 0)),
625
626   (('feq', ('b2f', 'a@1'), 0.0), ('inot', a)),
627   (('fneu', ('b2f', 'a@1'), 0.0), a),
628   (('ieq', ('b2i', 'a@1'), 0),   ('inot', a)),
629   (('ine', ('b2i', 'a@1'), 0),   a),
630
631   (('fneu', ('u2f', a), 0.0), ('ine', a, 0)),
632   (('feq', ('u2f', a), 0.0), ('ieq', a, 0)),
633   (('fge', ('u2f', a), 0.0), True),
634   (('fge', 0.0, ('u2f', a)), ('uge', 0, a)),    # ieq instead?
635   (('flt', ('u2f', a), 0.0), False),
636   (('flt', 0.0, ('u2f', a)), ('ult', 0, a)),    # ine instead?
637   (('fneu', ('i2f', a), 0.0), ('ine', a, 0)),
638   (('feq', ('i2f', a), 0.0), ('ieq', a, 0)),
639   (('fge', ('i2f', a), 0.0), ('ige', a, 0)),
640   (('fge', 0.0, ('i2f', a)), ('ige', 0, a)),
641   (('flt', ('i2f', a), 0.0), ('ilt', a, 0)),
642   (('flt', 0.0, ('i2f', a)), ('ilt', 0, a)),
643
644   # 0.0 < fabs(a)
645   # fabs(a) > 0.0
646   # fabs(a) != 0.0 because fabs(a) must be >= 0
647   # a != 0.0
648   (('~flt', 0.0, ('fabs', a)), ('fneu', a, 0.0)),
649
650   # -fabs(a) < 0.0
651   # fabs(a) > 0.0
652   (('~flt', ('fneg', ('fabs', a)), 0.0), ('fneu', a, 0.0)),
653
654   # 0.0 >= fabs(a)
655   # 0.0 == fabs(a)   because fabs(a) must be >= 0
656   # 0.0 == a
657   (('fge', 0.0, ('fabs', a)), ('feq', a, 0.0)),
658
659   # -fabs(a) >= 0.0
660   # 0.0 >= fabs(a)
661   (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),
662
663   # (a >= 0.0) && (a <= 1.0) -> fsat(a) == a
664   #
665   # This should be NaN safe.
666   #
667   # NaN >= 0 && 1 >= NaN -> false && false -> false
668   #
669   # vs.
670   #
671   # NaN == fsat(NaN) -> NaN == 0 -> false
672   (('iand', ('fge', a, 0.0), ('fge', 1.0, a)), ('feq', a, ('fsat', a)), '!options->lower_fsat'),
673
674   # Note: fmin(-a, -b) == -fmax(a, b)
675   (('fmax',                        ('b2f(is_used_once)', 'a@1'),           ('b2f', 'b@1')),           ('b2f', ('ior', a, b))),
676   (('fmax', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('iand', a, b)))),
677   (('fmin',                        ('b2f(is_used_once)', 'a@1'),           ('b2f', 'b@1')),           ('b2f', ('iand', a, b))),
678   (('fmin', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('ior', a, b)))),
679
680   # fmin(b2f(a), b)
681   # bcsel(a, fmin(b2f(a), b), fmin(b2f(a), b))
682   # bcsel(a, fmin(b2f(True), b), fmin(b2f(False), b))
683   # bcsel(a, fmin(1.0, b), fmin(0.0, b))
684   #
   # Since b is a constant, constant folding will eliminate both fmin
   # operations.  If b is > 1.0, the bcsel will be replaced with a b2f.
687   (('fmin', ('b2f', 'a@1'), '#b'), ('bcsel', a, ('fmin', b, 1.0), ('fmin', b, 0.0))),
688
689   (('flt', ('fadd(is_used_once)', a, ('fneg', b)), 0.0), ('flt', a, b)),
690
691   (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),
692   (('~bcsel', ('flt', b, a), b, a), ('fmin', a, b)),
693   (('~bcsel', ('flt', a, b), b, a), ('fmax', a, b)),
694   (('~bcsel', ('fge', a, b), b, a), ('fmin', a, b)),
695   (('~bcsel', ('fge', b, a), b, a), ('fmax', a, b)),
696   (('bcsel', ('i2b', a), b, c), ('bcsel', ('ine', a, 0), b, c)),
697   (('bcsel', ('inot', a), b, c), ('bcsel', a, c, b)),
698   (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)),
699   (('bcsel', a, b, ('bcsel', a, c, d)), ('bcsel', a, b, d)),
700   (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
701   (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))),
702   (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
703   (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)),
704   (('bcsel', a, True, b), ('ior', a, b)),
705   (('bcsel', a, a, b), ('ior', a, b)),
706   (('bcsel', a, b, False), ('iand', a, b)),
707   (('bcsel', a, b, a), ('iand', a, b)),
708   (('~fmin', a, a), a),
709   (('~fmax', a, a), a),
710   (('imin', a, a), a),
711   (('imax', a, a), a),
712   (('umin', a, a), a),
713   (('umin', a, 0), 0),
714   (('umin', a, -1), a),
715   (('umax', a, a), a),
716   (('umax', a, 0), a),
717   (('umax', a, -1), -1),
718   (('fmax', ('fmax', a, b), b), ('fmax', a, b)),
719   (('umax', ('umax', a, b), b), ('umax', a, b)),
720   (('imax', ('imax', a, b), b), ('imax', a, b)),
721   (('fmin', ('fmin', a, b), b), ('fmin', a, b)),
722   (('umin', ('umin', a, b), b), ('umin', a, b)),
723   (('imin', ('imin', a, b), b), ('imin', a, b)),
724   (('fmax', ('fmax', ('fmax', a, b), c), a), ('fmax', ('fmax', a, b), c)),
725   (('umax', ('umax', ('umax', a, b), c), a), ('umax', ('umax', a, b), c)),
726   (('imax', ('imax', ('imax', a, b), c), a), ('imax', ('imax', a, b), c)),
727   (('fmin', ('fmin', ('fmin', a, b), c), a), ('fmin', ('fmin', a, b), c)),
728   (('umin', ('umin', ('umin', a, b), c), a), ('umin', ('umin', a, b), c)),
729   (('imin', ('imin', ('imin', a, b), c), a), ('imin', ('imin', a, b), c)),
730])
731
732for N in [8, 16, 32, 64]:
733    b2iN = 'b2i{0}'.format(N)
734    optimizations.extend([
735        (('ieq', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ieq', a, b)),
736        (('ine', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ine', a, b)),
737    ])
738
739for N in [16, 32, 64]:
740    b2fN = 'b2f{0}'.format(N)
741    optimizations.extend([
742        (('feq', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ieq', a, b)),
743        (('fneu', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ine', a, b)),
744    ])
745
746# Integer sizes
747for s in [8, 16, 32, 64]:
748    optimizations.extend([
749       (('iand@{}'.format(s), a, ('inot', ('ishr', a, s - 1))), ('imax', a, 0)),
750
751       # Simplify logic to detect sign of an integer.
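       # For example, for s == 32, (a & 0x80000000) == 0 is equivalent to
       # a >= 0, and (a >> 31) != 0 (unsigned shift) is equivalent to a < 0.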
752       (('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0),            ('ige', a, 0)),
753       (('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ige', a, 0)),
754       (('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0),            ('ilt', a, 0)),
755       (('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ilt', a, 0)),
756       (('ine', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)),
757       (('ieq', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)),
758       (('ieq', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ilt', a, 0)),
759       (('ine', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ige', a, 0)),
760       (('ine', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)),
761       (('ieq', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)),
762       (('ieq', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ilt', a, 0)),
763       (('ine', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ige', a, 0)),
764    ])
765
766optimizations.extend([
767   (('fmin', a, ('fneg', a)), ('fneg', ('fabs', a))),
768   (('imin', a, ('ineg', a)), ('ineg', ('iabs', a))),
769   (('fmin', a, ('fneg', ('fabs', a))), ('fneg', ('fabs', a))),
770   (('imin', a, ('ineg', ('iabs', a))), ('ineg', ('iabs', a))),
771   (('~fmin', a, ('fabs', a)), a),
772   (('imin', a, ('iabs', a)), a),
773   (('~fmax', a, ('fneg', ('fabs', a))), a),
774   (('imax', a, ('ineg', ('iabs', a))), a),
775   (('fmax', a, ('fabs', a)), ('fabs', a)),
776   (('imax', a, ('iabs', a)), ('iabs', a)),
777   (('fmax', a, ('fneg', a)), ('fabs', a)),
778   (('imax', a, ('ineg', a)), ('iabs', a), '!options->lower_iabs'),
779   (('~fmax', ('fabs', a), 0.0), ('fabs', a)),
780   (('fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
781   # fmax(fmin(a, 1.0), 0.0) is inexact because it returns 1.0 on NaN, while
782   # fsat(a) returns 0.0.
783   (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
784   # fmin(fmax(a, -1.0), 0.0) is inexact because it returns -1.0 on NaN, while
785   # fneg(fsat(fneg(a))) returns -0.0 on NaN.
786   (('~fmin', ('fmax', a, -1.0),  0.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
787   # fmax(fmin(a, 0.0), -1.0) is inexact because it returns 0.0 on NaN, while
788   # fneg(fsat(fneg(a))) returns -0.0 on NaN. This only matters if
789   # SignedZeroInfNanPreserve is set, but we don't currently have any way of
790   # representing this in the optimizations other than the usual ~.
791   (('~fmax', ('fmin', a,  0.0), -1.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
792   # fsat(fsign(NaN)) = fsat(0) = 0, and b2f(0 < NaN) = b2f(False) = 0. Mark
793   # the new comparison precise to prevent it being changed to 'a != 0'.
794   (('fsat', ('fsign', a)), ('b2f', ('!flt', 0.0, a))),
795   (('fsat', ('b2f', a)), ('b2f', a)),
796   (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
797   (('fsat', ('fsat', a)), ('fsat', a)),
798   (('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_fsat'),
799   (('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_fsat'),
800   (('fsat', ('fneg(is_used_once)', ('fmulz(is_used_once)', a, b))), ('fsat', ('fmulz', ('fneg', a), b)), '!options->lower_fsat && !'+signed_zero_inf_nan_preserve_32),
801   (('fsat', ('fabs(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fabs', a), ('fabs', b))), '!options->lower_fsat'),
802   (('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)),
803   (('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)),
804   (('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)),
805   # Both the left and right patterns are "b" when isnan(a), so this is exact.
806   (('fmax', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmax', a, b))),
807   # The left pattern is 0.0 when isnan(a) (because fmin(fsat(NaN), b) ->
808   # fmin(0.0, b)) while the right one is "b", so this optimization is inexact.
809   (('~fmin', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmin', a, b))),
810
811   # max(-min(b, a), b) -> max(abs(b), -a)
812   # min(-max(b, a), b) -> min(-abs(b), -a)
813   (('fmax', ('fneg', ('fmin', b, a)), b), ('fmax', ('fabs', b), ('fneg', a))),
814   (('fmin', ('fneg', ('fmax', b, a)), b), ('fmin', ('fneg', ('fabs', b)), ('fneg', a))),
815
816   # If a in [0,b] then b-a is also in [0,b].  Since b in [0,1], max(b-a, 0) =
817   # fsat(b-a).
818   #
819   # If a > b, then b-a < 0 and max(b-a, 0) = fsat(b-a) = 0
820   #
821   # This should be NaN safe since max(NaN, 0) = fsat(NaN) = 0.
822   (('fmax', ('fadd(is_used_once)', ('fneg', 'a(is_not_negative)'), '#b(is_zero_to_one)'), 0.0),
823    ('fsat', ('fadd', ('fneg',  a), b)), '!options->lower_fsat'),
824
825   (('extract_u8', ('imin', ('imax', a, 0), 0xff), 0), ('imin', ('imax', a, 0), 0xff)),
826
827   # The ior versions are exact because fmin and fmax will always pick a
828   # non-NaN value, if one exists.  Therefore (a < NaN) || (a < c) == a <
829   # fmax(NaN, c) == a < c.  Mark the fmin or fmax in the replacement as exact
   # to prevent other optimizations from ruining the "NaN cleansing" property
831   # of the fmin or fmax.
832   (('ior', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('!fmax', b, c))),
833   (('ior', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('!fmin', a, b), c)),
834   (('ior', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('!fmin', b, c))),
835   (('ior', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('!fmax', a, b), c)),
836   (('ior', ('flt', a, '#b'), ('flt', a, '#c')), ('flt', a, ('!fmax', b, c))),
837   (('ior', ('flt', '#a', c), ('flt', '#b', c)), ('flt', ('!fmin', a, b), c)),
838   (('ior', ('fge', a, '#b'), ('fge', a, '#c')), ('fge', a, ('!fmin', b, c))),
839   (('ior', ('fge', '#a', c), ('fge', '#b', c)), ('fge', ('!fmax', a, b), c)),
840   (('~iand', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('fmin', b, c))),
841   (('~iand', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('fmax', a, b), c)),
842   (('~iand', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('fmax', b, c))),
843   (('~iand', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('fmin', a, b), c)),
844   (('iand', ('flt', a, '#b(is_a_number)'), ('flt', a, '#c(is_a_number)')), ('flt', a, ('fmin', b, c))),
845   (('iand', ('flt', '#a(is_a_number)', c), ('flt', '#b(is_a_number)', c)), ('flt', ('fmax', a, b), c)),
846   (('iand', ('fge', a, '#b(is_a_number)'), ('fge', a, '#c(is_a_number)')), ('fge', a, ('fmax', b, c))),
847   (('iand', ('fge', '#a(is_a_number)', c), ('fge', '#b(is_a_number)', c)), ('fge', ('fmin', a, b), c)),
848
849   (('ior', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imax', b, c))),
850   (('ior', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imin', a, b), c)),
851   (('ior', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imin', b, c))),
852   (('ior', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imax', a, b), c)),
853   (('ior', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umax', b, c))),
854   (('ior', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umin', a, b), c)),
855   (('ior', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umin', b, c))),
856   (('ior', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umax', a, b), c)),
857   (('iand', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imin', b, c))),
858   (('iand', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imax', a, b), c)),
859   (('iand', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imax', b, c))),
860   (('iand', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imin', a, b), c)),
861   (('iand', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umin', b, c))),
862   (('iand', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umax', a, b), c)),
863   (('iand', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umax', b, c))),
864   (('iand', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umin', a, b), c)),
865
866   # A number of shaders contain a pattern like a.x < 0.0 || a.x > 1.0 || a.y
   # < 0.0 || a.y > 1.0 || ...  These patterns rearrange and replace in a
868   # single step.  Doing just the replacement can lead to an infinite loop as
869   # the pattern is repeatedly applied to the result of the previous
870   # application of the pattern.
871   (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, c), d), ('flt', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)),
872   (('ior', ('ior(is_used_once)', ('flt', a, c), d), ('flt(is_used_once)', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)),
873   (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, b), d), ('flt', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)),
874   (('ior', ('ior(is_used_once)', ('flt', a, b), d), ('flt(is_used_once)', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)),
875
876   # This is how SpvOpFOrdNotEqual might be implemented.  If both values are
877   # numbers, then it can be replaced with fneu.
878   (('ior', ('flt', 'a(is_a_number)', 'b(is_a_number)'), ('flt', b, a)), ('fneu', a, b)),
879])
880
881# Float sizes
882for s in [16, 32, 64]:
883    optimizations.extend([
884       # These derive from the previous patterns with the application of b < 0 <=>
885       # 0 < -b.  The transformation should be applied if either comparison is
886       # used once as this ensures that the number of comparisons will not
887       # increase.  The sources to the ior and iand are not symmetric, so the
888       # rules have to be duplicated to get this behavior.
889       (('ior', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
890       (('ior', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
891       (('ior', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
892       (('ior', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
893       (('~iand', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
894       (('~iand', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
895       (('~iand', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),
896       (('~iand', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),
897
898       # The (i2f32, ...) part is an open-coded fsign.  When that is combined
899       # with the bcsel, it's basically copysign(1.0, a).  There are some
900       # behavior differences between this pattern and copysign w.r.t. ±0 and
901       # NaN.  copysign(x, y) blindly takes the sign bit from y and applies it
902       # to x, regardless of whether either or both values are NaN.
903       #
904       # If a != a: bcsel(False, 1.0, i2f(b2i(False) - b2i(False))) = 0,
905       #            int(NaN >= 0.0) - int(NaN < 0.0) = 0 - 0 = 0
906       # If a == ±0: bcsel(True, 1.0, ...) = 1.0,
907       #            int(±0.0 >= 0.0) - int(±0.0 < 0.0) = 1 - 0 = 1
908       #
909       # For all other values of 'a', the original and replacement behave as
910       # copysign.
911       #
912       # Marking the replacement comparisons as precise prevents any future
913       # optimizations from replacing either of the comparisons with the
914       # logical-not of the other.
915       #
916       # Note: Use b2i32 in the replacement because some platforms that
917       # support fp16 don't support int16.
918       (('bcsel@{}'.format(s), ('feq', a, 0.0), 1.0, ('i2f{}'.format(s), ('iadd', ('b2i{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))))),
919        ('i2f{}'.format(s), ('iadd', ('b2i32', ('!fge', a, 0.0)), ('ineg', ('b2i32', ('!flt', a, 0.0)))))),
920
921       (('bcsel', a, ('b2f(is_used_once)', 'b@{}'.format(s)), ('b2f', 'c@{}'.format(s))), ('b2f', ('bcsel', a, b, c))),
922
923       # The C spec says, "If the value of the integral part cannot be represented
924       # by the integer type, the behavior is undefined."  "Undefined" can mean
925       # "the conversion doesn't happen at all."
926       (('~i2f{}'.format(s), ('f2i', 'a@{}'.format(s))), ('ftrunc', a)),
927
928       # Ironically, mark these as imprecise because removing the conversions may
929       # preserve more precision than doing the conversions (e.g.,
930       # uint(float(0x81818181u)) == 0x81818200).
931       (('~f2i{}'.format(s), ('i2f', 'a@{}'.format(s))), a),
932       (('~f2i{}'.format(s), ('u2f', 'a@{}'.format(s))), a),
933       (('~f2u{}'.format(s), ('i2f', 'a@{}'.format(s))), a),
934       (('~f2u{}'.format(s), ('u2f', 'a@{}'.format(s))), a),
935
936       (('fadd', ('b2f{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('fneg', ('b2f{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))), ('fsign', a), '!options->lower_fsign'),
937       (('iadd', ('b2i{}'.format(s), ('flt', 0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0)))), ('f2i{}'.format(s), ('fsign', a)), '!options->lower_fsign'),
938    ])
939
    optimizations.extend([
       # float? -> float? -> floatS ==> float? -> floatS
       (('~f2f{}'.format(s), ('f2f', a)), ('f2f{}'.format(s), a)),

       # int? -> float? -> floatS ==> int? -> floatS
       (('~f2f{}'.format(s), ('u2f', a)), ('u2f{}'.format(s), a)),
       (('~f2f{}'.format(s), ('i2f', a)), ('i2f{}'.format(s), a)),

       # float? -> float? -> intS ==> float? -> intS
       (('~f2u{}'.format(s), ('f2f', a)), ('f2u{}'.format(s), a)),
       (('~f2i{}'.format(s), ('f2f', a)), ('f2i{}'.format(s), a)),
    ])
950
951    for B in [32, 64]:
952        if s < B:
953            optimizations.extend([
954               # S = smaller, B = bigger
955               # typeS -> typeB -> typeS ==> identity
956               (('f2f{}'.format(s), ('f2f{}'.format(B), 'a@{}'.format(s))), a),
957               (('i2i{}'.format(s), ('i2i{}'.format(B), 'a@{}'.format(s))), a),
958               (('u2u{}'.format(s), ('u2u{}'.format(B), 'a@{}'.format(s))), a),
959
960               # bool1 -> typeB -> typeS ==> bool1 -> typeS
961               (('f2f{}'.format(s), ('b2f{}'.format(B), 'a@1')), ('b2f{}'.format(s), a)),
962               (('i2i{}'.format(s), ('b2i{}'.format(B), 'a@1')), ('b2i{}'.format(s), a)),
963               (('u2u{}'.format(s), ('b2i{}'.format(B), 'a@1')), ('b2i{}'.format(s), a)),
964
965               # floatS -> floatB -> intB ==> floatS -> intB
966               (('f2u{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2u{}'.format(B), a)),
967               (('f2i{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2i{}'.format(B), a)),
968
969               # int? -> floatB -> floatS ==> int? -> floatS
970               (('f2f{}'.format(s), ('u2f{}'.format(B), a)), ('u2f{}'.format(s), a)),
971               (('f2f{}'.format(s), ('i2f{}'.format(B), a)), ('i2f{}'.format(s), a)),
972
973               # intS -> intB -> floatB ==> intS -> floatB
974               (('u2f{}'.format(B), ('u2u{}'.format(B), 'a@{}'.format(s))), ('u2f{}'.format(B), a)),
975               (('i2f{}'.format(B), ('i2i{}'.format(B), 'a@{}'.format(s))), ('i2f{}'.format(B), a)),
976            ])
977
978# mediump variants of the above
979optimizations.extend([
980    # int32 -> float32 -> float16 ==> int32 -> float16
981    (('f2fmp', ('u2f32', 'a@32')), ('u2fmp', a)),
982    (('f2fmp', ('i2f32', 'a@32')), ('i2fmp', a)),
983
984    # float32 -> float16 -> int16 ==> float32 -> int16
985    (('f2u16', ('f2fmp', 'a@32')), ('f2u16', a)),
986    (('f2i16', ('f2fmp', 'a@32')), ('f2i16', a)),
987
988    # float32 -> int32 -> int16 ==> float32 -> int16
989    (('i2imp', ('f2u32', 'a@32')), ('f2ump', a)),
990    (('i2imp', ('f2i32', 'a@32')), ('f2imp', a)),
991
992    # int32 -> int16 -> float16 ==> int32 -> float16
993    (('u2f16', ('i2imp', 'a@32')), ('u2f16', a)),
994    (('i2f16', ('i2imp', 'a@32')), ('i2f16', a)),
995])
996
997# Clean up junk left from 8-bit integer to 16-bit integer lowering.
998optimizations.extend([
    # The u2u16(u2u8(X)) just masks off the upper 8 bits of X.  This can be
    # accomplished by masking the upper 8 bits of the immediate operand of the
    # iand instruction.  Oftentimes, both patterns will end up being applied
    # to the same original expression tree.
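    # For illustration: with X = 0xab34, u2u8(X) is 0x34 and u2u16(u2u8(X)) is
    # 0x0034, i.e. X & 0x00ff.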
1003    (('iand', ('u2u16', ('u2u8', 'a@16')), '#b'),               ('iand', a, ('iand', b, 0xff))),
1004    (('u2u16', ('u2u8(is_used_once)', ('iand', 'a@16', '#b'))), ('iand', a, ('iand', b, 0xff))),
1005])
1006
1007for op in ['iand', 'ior', 'ixor']:
1008    optimizations.extend([
1009        (('u2u8', (op, ('u2u16', ('u2u8', 'a@16')), ('u2u16', ('u2u8', 'b@16')))), ('u2u8', (op, a, b))),
1010        (('u2u8', (op, ('u2u16', ('u2u8', 'a@32')), ('u2u16', ('u2u8', 'b@32')))), ('u2u8', (op, a, b))),
1011
1012        # Undistribute extract from a logic op
1013        ((op, ('extract_i8', a, '#b'), ('extract_i8', c, b)), ('extract_i8', (op, a, c), b)),
1014        ((op, ('extract_u8', a, '#b'), ('extract_u8', c, b)), ('extract_u8', (op, a, c), b)),
1015        ((op, ('extract_i16', a, '#b'), ('extract_i16', c, b)), ('extract_i16', (op, a, c), b)),
1016        ((op, ('extract_u16', a, '#b'), ('extract_u16', c, b)), ('extract_u16', (op, a, c), b)),
1017
1018        # Undistribute shifts from a logic op
1019        ((op, ('ushr(is_used_once)', a, '#b'), ('ushr', c, b)), ('ushr', (op, a, c), b)),
1020        ((op, ('ishr(is_used_once)', a, '#b'), ('ishr', c, b)), ('ishr', (op, a, c), b)),
1021        ((op, ('ishl(is_used_once)', a, '#b'), ('ishl', c, b)), ('ishl', (op, a, c), b)),
1022    ])
1023
1024# Integer sizes
1025for s in [8, 16, 32, 64]:
1026    last_shift_bit = int(math.log2(s)) - 1
1027
1028    optimizations.extend([
1029       (('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('ior', a, b), 0), 'options->lower_umax'),
1030       (('ior',  ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('ior', a, b), 0), 'options->lower_umin'),
1031       (('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umax', a, b), 0), '!options->lower_umax'),
1032       (('ior',  ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umin', a, b), 0), '!options->lower_umin'),
1033       (('iand', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umin', a, b), 0), '!options->lower_umin'),
1034       (('ior',  ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umax', a, b), 0), '!options->lower_umax'),
1035
       # True/False are ~0 and 0 in NIR.  b2i of True is 1, and ineg(1) is -1,
       # which is ~0 (True) again.
1037       (('ineg', ('b2i{}'.format(s), 'a@{}'.format(s))), a),
1038
       # SM5 shifts are defined to use only the log2(bit-size) least
       # significant bits of the shift count (e.g., 5 bits for 32-bit shifts,
       # 4 bits for 16-bit shifts), so the explicit 'iand' mask is redundant.
1040       (('ishl', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishl', a, b)),
1041       (('ishr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishr', a, b)),
1042       (('ushr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ushr', a, b)),
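       # Since only the low log2(bit-size) bits of the shift count are used,
       # the 'iand' with 1 below is also redundant: ((b & 1) << last_shift_bit)
       # and (b << last_shift_bit) select the same shift amount.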
1043       (('ushr', 'a@{}'.format(s), ('ishl(is_used_once)', ('iand', b, 1), last_shift_bit)), ('ushr', a, ('ishl', b, last_shift_bit))),
1044    ])
1045
1046optimizations.extend([
1047   # Common pattern like 'if (i == 0 || i == 1 || ...)'
1048   (('ior', ('ieq', a, 0), ('ieq', a, 1)), ('uge', 1, a)),
1049   (('ior', ('uge', 1, a), ('ieq', a, 2)), ('uge', 2, a)),
1050   (('ior', ('uge', 2, a), ('ieq', a, 3)), ('uge', 3, a)),
1051
1052   (('ior', a, ('ieq', a, False)), True),
1053   (('ior', a, ('inot', a)), -1),
1054
1055   (('ine', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), ('ine', a, b)),
1056   (('b2i', ('ine', 'a@1', 'b@1')), ('b2i', ('ixor', a, b))),
1057
   # This pattern occurs courtesy of __flt64_nonnan in the soft-fp64 code.
1059   # The first part of the iand comes from the !__feq64_nonnan.
1060   #
1061   # The second pattern is a reformulation of the first based on the relation
1062   # (a == 0 || y == 0) <=> umin(a, y) == 0, where b in the first equation
1063   # happens to be y == 0.
1064   (('iand', ('inot', ('iand', ('ior', ('ieq', a, 0),  b), c)), ('ilt', a, 0)),
1065    ('iand', ('inot', ('iand',                         b , c)), ('ilt', a, 0))),
1066   (('iand', ('inot', ('iand', ('ieq', ('umin', a, b), 0), c)), ('ilt', a, 0)),
1067    ('iand', ('inot', ('iand', ('ieq',             b , 0), c)), ('ilt', a, 0))),
1068
   # These patterns can result when (a < b || a < c) => (a < max(b, c))
   # transformations occur before constant propagation and loop-unrolling.
1071   #
1072   # The flt versions are exact.  If isnan(a), the original pattern is
1073   # trivially false, and the replacements are false too.  If isnan(b):
1074   #
1075   #    a < fmax(NaN, a) => a < a => false vs a < NaN => false
1076   (('flt', a, ('fmax', b, a)), ('flt', a, b)),
1077   (('flt', ('fmin', a, b), a), ('flt', b, a)),
1078   (('~fge', a, ('fmin', b, a)), True),
1079   (('~fge', ('fmax', a, b), a), True),
1080   (('flt', a, ('fmin', b, a)), False),
1081   (('flt', ('fmax', a, b), a), False),
1082   (('~fge', a, ('fmax', b, a)), ('fge', a, b)),
1083   (('~fge', ('fmin', a, b), a), ('fge', b, a)),
1084
1085   (('ilt', a, ('imax', b, a)), ('ilt', a, b)),
1086   (('ilt', ('imin', a, b), a), ('ilt', b, a)),
1087   (('ige', a, ('imin', b, a)), True),
1088   (('ige', ('imax', a, b), a), True),
1089   (('ult', a, ('umax', b, a)), ('ult', a, b)),
1090   (('ult', ('umin', a, b), a), ('ult', b, a)),
1091   (('uge', a, ('umin', b, a)), True),
1092   (('uge', ('umax', a, b), a), True),
1093   (('ilt', a, ('imin', b, a)), False),
1094   (('ilt', ('imax', a, b), a), False),
1095   (('ige', a, ('imax', b, a)), ('ige', a, b)),
1096   (('ige', ('imin', a, b), a), ('ige', b, a)),
1097   (('ult', a, ('umin', b, a)), False),
1098   (('ult', ('umax', a, b), a), False),
1099   (('uge', a, ('umax', b, a)), ('uge', a, b)),
1100   (('uge', ('umin', a, b), a), ('uge', b, a)),
1101   (('ult', a, ('iand', b, a)), False),
1102   (('ult', ('ior', a, b), a), False),
1103   (('uge', a, ('iand', b, a)), True),
1104   (('uge', ('ior', a, b), a), True),
1105
1106   (('ilt', '#a', ('imax', '#b', c)), ('ior', ('ilt', a, b), ('ilt', a, c))),
1107   (('ilt', ('imin', '#a', b), '#c'), ('ior', ('ilt', a, c), ('ilt', b, c))),
1108   (('ige', '#a', ('imin', '#b', c)), ('ior', ('ige', a, b), ('ige', a, c))),
1109   (('ige', ('imax', '#a', b), '#c'), ('ior', ('ige', a, c), ('ige', b, c))),
1110   (('ult', '#a', ('umax', '#b', c)), ('ior', ('ult', a, b), ('ult', a, c))),
1111   (('ult', ('umin', '#a', b), '#c'), ('ior', ('ult', a, c), ('ult', b, c))),
1112   (('uge', '#a', ('umin', '#b', c)), ('ior', ('uge', a, b), ('uge', a, c))),
1113   (('uge', ('umax', '#a', b), '#c'), ('ior', ('uge', a, c), ('uge', b, c))),
1114   (('ilt', '#a', ('imin', '#b', c)), ('iand', ('ilt', a, b), ('ilt', a, c))),
1115   (('ilt', ('imax', '#a', b), '#c'), ('iand', ('ilt', a, c), ('ilt', b, c))),
1116   (('ige', '#a', ('imax', '#b', c)), ('iand', ('ige', a, b), ('ige', a, c))),
1117   (('ige', ('imin', '#a', b), '#c'), ('iand', ('ige', a, c), ('ige', b, c))),
1118   (('ult', '#a', ('umin', '#b', c)), ('iand', ('ult', a, b), ('ult', a, c))),
1119   (('ult', ('umax', '#a', b), '#c'), ('iand', ('ult', a, c), ('ult', b, c))),
1120   (('uge', '#a', ('umax', '#b', c)), ('iand', ('uge', a, b), ('uge', a, c))),
1121   (('uge', ('umin', '#a', b), '#c'), ('iand', ('uge', a, c), ('uge', b, c))),
1122
1123   # Thanks to sign extension, the ishr(a, b) is negative if and only if a is
1124   # negative.
1125   (('bcsel', ('ilt', a, 0), ('ineg', ('ishr', a, b)), ('ishr', a, b)),
1126    ('iabs', ('ishr', a, b))),
1127   (('iabs', ('ishr', ('iabs', a), b)), ('ishr', ('iabs', a), b)),
1128
1129   (('fabs', ('slt', a, b)), ('slt', a, b)),
1130   (('fabs', ('sge', a, b)), ('sge', a, b)),
1131   (('fabs', ('seq', a, b)), ('seq', a, b)),
1132   (('fabs', ('sne', a, b)), ('sne', a, b)),
1133   (('slt', a, b), ('b2f', ('flt', a, b)), 'options->lower_scmp'),
1134   (('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'),
1135   (('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'),
1136   (('sne', a, b), ('b2f', ('fneu', a, b)), 'options->lower_scmp'),
1137   (('seq', ('seq', a, b), 1.0), ('seq', a, b)),
1138   (('seq', ('sne', a, b), 1.0), ('sne', a, b)),
1139   (('seq', ('slt', a, b), 1.0), ('slt', a, b)),
1140   (('seq', ('sge', a, b), 1.0), ('sge', a, b)),
1141   (('sne', ('seq', a, b), 0.0), ('seq', a, b)),
1142   (('sne', ('sne', a, b), 0.0), ('sne', a, b)),
1143   (('sne', ('slt', a, b), 0.0), ('slt', a, b)),
1144   (('sne', ('sge', a, b), 0.0), ('sge', a, b)),
1145   (('seq', ('seq', a, b), 0.0), ('sne', a, b)),
1146   (('seq', ('sne', a, b), 0.0), ('seq', a, b)),
1147   (('seq', ('slt', a, b), 0.0), ('sge', a, b)),
1148   (('seq', ('sge', a, b), 0.0), ('slt', a, b)),
1149   (('sne', ('seq', a, b), 1.0), ('sne', a, b)),
1150   (('sne', ('sne', a, b), 1.0), ('seq', a, b)),
1151   (('sne', ('slt', a, b), 1.0), ('sge', a, b)),
1152   (('sne', ('sge', a, b), 1.0), ('slt', a, b)),
1153   (('fall_equal2', a, b), ('fmin', ('seq', 'a.x', 'b.x'), ('seq', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
1154   (('fall_equal3', a, b), ('seq', ('fany_nequal3', a, b), 0.0), 'options->lower_vector_cmp'),
1155   (('fall_equal4', a, b), ('seq', ('fany_nequal4', a, b), 0.0), 'options->lower_vector_cmp'),
1156   (('fany_nequal2', a, b), ('fmax', ('sne', 'a.x', 'b.x'), ('sne', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
1157   (('fany_nequal3', a, b), ('fsat', ('fdot3', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
1158   (('fany_nequal4', a, b), ('fsat', ('fdot4', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'),
1159
1160   (('ball_iequal2', a, b), ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
1161   (('ball_iequal3', a, b), ('iand', ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), ('ieq', 'a.z', 'b.z')), 'options->lower_vector_cmp'),
1162   (('ball_iequal4', a, b), ('iand', ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), ('iand', ('ieq', 'a.z', 'b.z'), ('ieq', 'a.w', 'b.w'))), 'options->lower_vector_cmp'),
1163
1164   (('bany_inequal2', a, b), ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
1165   (('bany_inequal3', a, b), ('ior', ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), ('ine', 'a.z', 'b.z')), 'options->lower_vector_cmp'),
1166   (('bany_inequal4', a, b), ('ior', ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), ('ior', ('ine', 'a.z', 'b.z'), ('ine', 'a.w', 'b.w'))), 'options->lower_vector_cmp'),
1167
1168   (('ball_fequal2', a, b), ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
1169   (('ball_fequal3', a, b), ('iand', ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), ('feq', 'a.z', 'b.z')), 'options->lower_vector_cmp'),
1170   (('ball_fequal4', a, b), ('iand', ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), ('iand', ('feq', 'a.z', 'b.z'), ('feq', 'a.w', 'b.w'))), 'options->lower_vector_cmp'),
1171
1172   (('bany_fnequal2', a, b), ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), 'options->lower_vector_cmp'),
1173   (('bany_fnequal3', a, b), ('ior', ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), ('fneu', 'a.z', 'b.z')), 'options->lower_vector_cmp'),
1174   (('bany_fnequal4', a, b), ('ior', ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), ('ior', ('fneu', 'a.z', 'b.z'), ('fneu', 'a.w', 'b.w'))), 'options->lower_vector_cmp'),
1175
1176   (('feq', ('seq', a, b), 1.0), ('feq', a, b)),
1177   (('feq', ('sne', a, b), 1.0), ('fneu', a, b)),
1178   (('feq', ('slt', a, b), 1.0), ('flt', a, b)),
1179   (('feq', ('sge', a, b), 1.0), ('fge', a, b)),
1180   (('fneu', ('seq', a, b), 0.0), ('feq', a, b)),
1181   (('fneu', ('sne', a, b), 0.0), ('fneu', a, b)),
1182   (('fneu', ('slt', a, b), 0.0), ('flt', a, b)),
1183   (('fneu', ('sge', a, b), 0.0), ('fge', a, b)),
1184   (('feq', ('seq', a, b), 0.0), ('fneu', a, b)),
1185   (('feq', ('sne', a, b), 0.0), ('feq', a, b)),
1186   (('feq', ('slt', a, b), 0.0), ('fge', a, b)),
1187   (('feq', ('sge', a, b), 0.0), ('flt', a, b)),
1188   (('fneu', ('seq', a, b), 1.0), ('fneu', a, b)),
1189   (('fneu', ('sne', a, b), 1.0), ('feq', a, b)),
1190   (('fneu', ('slt', a, b), 1.0), ('fge', a, b)),
1191   (('fneu', ('sge', a, b), 1.0), ('flt', a, b)),
1192
1193   (('fneu', ('fneg', a), a), ('fneu', a, 0.0)),
1194   (('feq', ('fneg', a), a), ('feq', a, 0.0)),
1195   # Emulating booleans
1196   (('imul', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))),
1197   (('iand', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))),
1198   (('ior', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('ior', a, b))),
1199   (('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))),
1200   (('fsat', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('b2f', ('ior', a, b))),
1201   (('iand', 'a@bool16', 1.0), ('b2f', a)),
1202   (('iand', 'a@bool32', 1.0), ('b2f', a)),
1203   (('flt', ('fneg', ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF.
   # Comparisons with the same argument.  Note that the float versions are
   # only done when the source is known to be a number.  Generally, NaN cmp
   # NaN produces the opposite result of X cmp X.  flt is the outlier:
   # NaN < NaN is false, and, for any number X, X < X is also false.
1208   (('ilt', a, a), False),
1209   (('ige', a, a), True),
1210   (('ieq', a, a), True),
1211   (('ine', a, a), False),
1212   (('ult', a, a), False),
1213   (('uge', a, a), True),
1214   (('flt', a, a), False),
1215   (('fge', 'a(is_a_number)', a), True),
1216   (('feq', 'a(is_a_number)', a), True),
1217   (('fneu', 'a(is_a_number)', a), False),
1218   # Logical and bit operations
1219   (('iand', a, a), a),
1220   (('iand', a, ~0), a),
1221   (('iand', a, 0), 0),
1222   (('ior', a, a), a),
1223   (('ior', a, 0), a),
1224   (('ior', a, True), True),
1225   (('ixor', a, a), 0),
1226   (('ixor', a, 0), a),
1227   (('ixor', a, ('ixor', a, b)), b),
1228   (('ixor', a, -1), ('inot', a)),
1229   (('inot', ('inot', a)), a),
1230   (('ior', ('iand', a, b), b), b),
1231   (('ior', ('ior', a, b), b), ('ior', a, b)),
1232   (('iand', ('ior', a, b), b), b),
1233   (('iand', ('iand', a, b), b), ('iand', a, b)),
1234   # DeMorgan's Laws
1235   (('iand', ('inot', a), ('inot', b)), ('inot', ('ior',  a, b))),
1236   (('ior',  ('inot', a), ('inot', b)), ('inot', ('iand', a, b))),
1237   # Shift optimizations
1238   (('ishl', 0, a), 0),
1239   (('ishl', a, 0), a),
1240   (('ishr', 0, a), 0),
1241   (('ishr', -1, a), -1),
1242   (('ishr', a, 0), a),
1243   (('ushr', 0, a), 0),
1244   (('ushr', a, 0), a),
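   # Recognize open-coded rotates: for an N-bit value, (a << b) | (a >> (N - b))
   # is a left-rotate by b, and (a >> b) | (a << (N - b)) is a right-rotate by
   # b.  Assuming the usual NIR behavior that shift counts only use their low
   # log2(N) bits (see the SM5 note above), b == 0 also works out.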
1245   (('ior', ('ishl@16', a, b), ('ushr@16', a, ('iadd', 16, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'),
1246   (('ior', ('ishl@16', a, b), ('ushr@16', a, ('isub', 16, b))), ('urol', a, b), '!options->lower_rotate'),
1247   (('ior', ('ishl@32', a, b), ('ushr@32', a, ('iadd', 32, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'),
1248   (('ior', ('ishl@32', a, b), ('ushr@32', a, ('isub', 32, b))), ('urol', a, b), '!options->lower_rotate'),
1249   (('ior', ('ushr@16', a, b), ('ishl@16', a, ('iadd', 16, ('ineg', b)))), ('uror', a, b), '!options->lower_rotate'),
1250   (('ior', ('ushr@16', a, b), ('ishl@16', a, ('isub', 16, b))), ('uror', a, b), '!options->lower_rotate'),
1251   (('ior', ('ushr@32', a, b), ('ishl@32', a, ('iadd', 32, ('ineg', b)))), ('uror', a, b), '!options->lower_rotate'),
1252   (('ior', ('ushr@32', a, b), ('ishl@32', a, ('isub', 32, b))), ('uror', a, b), '!options->lower_rotate'),
1253   (('urol@16', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 16, b))), 'options->lower_rotate'),
1254   (('urol@32', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 32, b))), 'options->lower_rotate'),
1255   (('uror@16', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 16, b))), 'options->lower_rotate'),
1256   (('uror@32', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 32, b))), 'options->lower_rotate'),
1257   # Exponential/logarithmic identities
1258   (('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a
1259   (('~flog2', ('fexp2', a)), a), # lg2(2^a) = a
1260   (('fpow', a, b), ('fexp2', ('fmul', ('flog2', a), b)), 'options->lower_fpow'), # a^b = 2^(lg2(a)*b)
1261   (('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b
1262   (('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))),
    ('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) * d) = a^b * c^d
1264   (('~fexp2', ('fmul', ('flog2', a), 0.5)), ('fsqrt', a)),
1265   (('~fexp2', ('fmul', ('flog2', a), 2.0)), ('fmul', a, a)),
1266   (('~fexp2', ('fmul', ('flog2', a), 4.0)), ('fmul', ('fmul', a, a), ('fmul', a, a))),
1267   (('~fpow', a, 1.0), a),
1268   (('~fpow', a, 2.0), ('fmul', a, a)),
1269   (('~fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))),
1270   (('~fpow', 2.0, a), ('fexp2', a)),
1271   (('~fpow', ('fpow', a, 2.2), 0.454545), a),
1272   (('~fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)),
1273   (('~fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))),
1274   (('~frcp', ('fexp2', a)), ('fexp2', ('fneg', a))),
1275   (('~frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))),
1276   (('~flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))),
1277   (('~flog2', ('frcp', a)), ('fneg', ('flog2', a))),
1278   (('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))),
1279   (('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))),
1280   (('~fmul', ('fexp2(is_used_once)', a), ('fexp2(is_used_once)', b)), ('fexp2', ('fadd', a, b))),
1281   (('bcsel', ('flt', a, 0.0), 0.0, ('fsqrt', a)), ('fsqrt', ('fmax', a, 0.0))),
1282   (('~fmul', ('fsqrt', a), ('fsqrt', a)), ('fabs',a)),
1283   (('~fmulz', ('fsqrt', a), ('fsqrt', a)), ('fabs', a)),
1284   # Division and reciprocal
1285   (('~fdiv', 1.0, a), ('frcp', a)),
1286   (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
1287   (('~frcp', ('frcp', a)), a),
1288   (('~frcp', ('fsqrt', a)), ('frsq', a)),
1289   (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'),
1290   (('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'),
1291   # Trig
1292   (('fsin', a), lowered_sincos(0.5), 'options->lower_sincos'),
1293   (('fcos', a), lowered_sincos(0.75), 'options->lower_sincos'),
1294   # Boolean simplifications
1295   (('i2b16(is_used_by_if)', a), ('ine16', a, 0)),
1296   (('i2b32(is_used_by_if)', a), ('ine32', a, 0)),
1297   (('i2b1(is_used_by_if)', a), ('ine', a, 0)),
1298   (('ieq', a, True), a),
1299   (('ine(is_not_used_by_if)', a, True), ('inot', a)),
1300   (('ine', a, False), a),
1301   (('ieq(is_not_used_by_if)', a, False), ('inot', 'a')),
1302   (('bcsel', a, True, False), a),
1303   (('bcsel', a, False, True), ('inot', a)),
1304   (('bcsel', True, b, c), b),
1305   (('bcsel', False, b, c), c),
1306
1307   (('bcsel@16', a, 1.0, 0.0), ('b2f', a)),
1308   (('bcsel@16', a, 0.0, 1.0), ('b2f', ('inot', a))),
1309   (('bcsel@16', a, -1.0, -0.0), ('fneg', ('b2f', a))),
1310   (('bcsel@16', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))),
1311   (('bcsel@32', a, 1.0, 0.0), ('b2f', a)),
1312   (('bcsel@32', a, 0.0, 1.0), ('b2f', ('inot', a))),
1313   (('bcsel@32', a, -1.0, -0.0), ('fneg', ('b2f', a))),
1314   (('bcsel@32', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))),
1315   (('bcsel@64', a, 1.0, 0.0), ('b2f', a), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
1316   (('bcsel@64', a, 0.0, 1.0), ('b2f', ('inot', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
1317   (('bcsel@64', a, -1.0, -0.0), ('fneg', ('b2f', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
1318   (('bcsel@64', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a))), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'),
1319
1320   (('bcsel', a, b, b), b),
1321   (('~fcsel', a, b, b), b),
1322
1323   # D3D Boolean emulation
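   # D3D/TGSI booleans are 0 or ~0 (0xffffffff).  For example, with a = True,
   # ineg(b2i(True)) = ineg(1) = -1 = ~0, matching bcsel(True, -1, 0).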
1324   (('bcsel', a, -1, 0), ('ineg', ('b2i', 'a@1'))),
1325   (('bcsel', a, 0, -1), ('ineg', ('b2i', ('inot', a)))),
1326   (('bcsel', a, 1, 0), ('b2i', 'a@1')),
1327   (('bcsel', a, 0, 1), ('b2i', ('inot', a))),
1328   (('iand', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1329    ('ineg', ('b2i', ('iand', a, b)))),
1330   (('ior', ('ineg', ('b2i','a@1')), ('ineg', ('b2i', 'b@1'))),
1331    ('ineg', ('b2i', ('ior', a, b)))),
1332   (('ieq', ('ineg', ('b2i', 'a@1')), 0), ('inot', a)),
1333   (('ieq', ('ineg', ('b2i', 'a@1')), -1), a),
1334   (('ine', ('ineg', ('b2i', 'a@1')), 0), a),
1335   (('ine', ('ineg', ('b2i', 'a@1')), -1), ('inot', a)),
1336   (('ige', ('ineg', ('b2i', 'a@1')), 0), ('inot', a)),
1337   (('ilt', ('ineg', ('b2i', 'a@1')), 0), a),
1338   (('ult', 0, ('ineg', ('b2i', 'a@1'))), a),
1339   (('iand', ('ineg', ('b2i', a)), 1.0), ('b2f', a)),
1340   (('iand', ('ineg', ('b2i', a)), 1),   ('b2i', a)),
1341
1342   # With D3D booleans, imax is AND and umax is OR
1343   (('imax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1344    ('ineg', ('b2i', ('iand', a, b)))),
1345   (('imin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1346    ('ineg', ('b2i', ('ior', a, b)))),
1347   (('umax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1348    ('ineg', ('b2i', ('ior', a, b)))),
1349   (('umin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))),
1350    ('ineg', ('b2i', ('iand', a, b)))),
1351
1352   # Conversions
1353   (('i2b16', ('b2i', 'a@16')), a),
1354   (('i2b32', ('b2i', 'a@32')), a),
1355   (('f2i', ('ftrunc', a)), ('f2i', a)),
1356   (('f2u', ('ftrunc', a)), ('f2u', a)),
1357   (('i2b', ('ineg', a)), ('i2b', a)),
1358   (('i2b', ('iabs', a)), ('i2b', a)),
1359   (('inot', ('f2b1', a)), ('feq', a, 0.0)),
1360
1361   # Conversions from 16 bits to 32 bits and back can always be removed
1362   (('f2fmp', ('f2f32', 'a@16')), a),
1363   (('i2imp', ('i2i32', 'a@16')), a),
1364   (('i2imp', ('u2u32', 'a@16')), a),
1365
1366   (('f2imp', ('f2f32', 'a@16')), ('f2i16', a)),
1367   (('f2ump', ('f2f32', 'a@16')), ('f2u16', a)),
1368   (('i2fmp', ('i2i32', 'a@16')), ('i2f16', a)),
1369   (('u2fmp', ('u2u32', 'a@16')), ('u2f16', a)),
1370
1371   (('f2fmp', ('b2f32', 'a@1')), ('b2f16', a)),
1372   (('i2imp', ('b2i32', 'a@1')), ('b2i16', a)),
1373   (('i2imp', ('b2i32', 'a@1')), ('b2i16', a)),
1374
1375   (('f2imp', ('b2f32', 'a@1')), ('b2i16', a)),
1376   (('f2ump', ('b2f32', 'a@1')), ('b2i16', a)),
1377   (('i2fmp', ('b2i32', 'a@1')), ('b2f16', a)),
1378   (('u2fmp', ('b2i32', 'a@1')), ('b2f16', a)),
1379
1380   # Conversions to 16 bits would be lossy so they should only be removed if
1381   # the instruction was generated by the precision lowering pass.
1382   (('f2f32', ('f2fmp', 'a@32')), a),
1383   (('i2i32', ('i2imp', 'a@32')), a),
1384   (('u2u32', ('i2imp', 'a@32')), a),
1385
1386   (('i2i32', ('f2imp', 'a@32')), ('f2i32', a)),
1387   (('u2u32', ('f2ump', 'a@32')), ('f2u32', a)),
1388   (('f2f32', ('i2fmp', 'a@32')), ('i2f32', a)),
1389   (('f2f32', ('u2fmp', 'a@32')), ('u2f32', a)),
1390
   # Conversions from float32 to float64 and back can be removed as long as
   # the result doesn't need to be precise, since the conversions may, e.g.,
   # flush denorms.
1393   (('~f2f32', ('f2f64', 'a@32')), a),
1394
1395   (('ffloor', 'a(is_integral)'), a),
1396   (('fceil', 'a(is_integral)'), a),
1397   (('ftrunc', 'a(is_integral)'), a),
1398   (('fround_even', 'a(is_integral)'), a),
1399
1400   # fract(x) = x - floor(x), so fract(NaN) = NaN
1401   (('~ffract', 'a(is_integral)'), 0.0),
1402   (('fabs', 'a(is_not_negative)'), a),
1403   (('iabs', 'a(is_not_negative)'), a),
1404   (('fsat', 'a(is_not_positive)'), 0.0),
1405
1406   (('~fmin', 'a(is_not_negative)', 1.0), ('fsat', a), '!options->lower_fsat'),
1407
1408   # The result of the multiply must be in [-1, 0], so the result of the ffma
1409   # must be in [0, 1].
1410   (('flt', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), False),
1411   (('flt', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), False),
1412   (('fmax', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0)),
1413   (('fmax', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0)),
1414
1415   (('fneu', 'a(is_not_zero)', 0.0), True),
1416   (('feq', 'a(is_not_zero)', 0.0), False),
1417
1418   # In this chart, + means value > 0 and - means value < 0.
1419   #
1420   # + >= + -> unknown  0 >= + -> false    - >= + -> false
1421   # + >= 0 -> true     0 >= 0 -> true     - >= 0 -> false
1422   # + >= - -> true     0 >= - -> true     - >= - -> unknown
1423   #
1424   # Using grouping conceptually similar to a Karnaugh map...
1425   #
1426   # (+ >= 0, + >= -, 0 >= 0, 0 >= -) == (is_not_negative >= is_not_positive) -> true
1427   # (0 >= +, - >= +) == (is_not_positive >= gt_zero) -> false
1428   # (- >= +, - >= 0) == (lt_zero >= is_not_negative) -> false
1429   #
1430   # The flt / ilt cases just invert the expected result.
1431   #
   # The results expecting true must be marked imprecise (or, as below,
   # restricted to operands that are known to be numbers).  The results
   # expecting false are fine because NaN compared >= or < anything is false.
1434
1435   (('fge', 'a(is_a_number_not_negative)', 'b(is_a_number_not_positive)'), True),
1436   (('fge', 'a(is_not_positive)',          'b(is_gt_zero)'),               False),
1437   (('fge', 'a(is_lt_zero)',               'b(is_not_negative)'),          False),
1438
1439   (('flt', 'a(is_not_negative)',          'b(is_not_positive)'),          False),
1440   (('flt', 'a(is_a_number_not_positive)', 'b(is_a_number_gt_zero)'),      True),
1441   (('flt', 'a(is_a_number_lt_zero)',      'b(is_a_number_not_negative)'), True),
1442
1443   (('ine', 'a(is_not_zero)', 0), True),
1444   (('ieq', 'a(is_not_zero)', 0), False),
1445
1446   (('ige', 'a(is_not_negative)', 'b(is_not_positive)'), True),
1447   (('ige', 'a(is_not_positive)', 'b(is_gt_zero)'),      False),
1448   (('ige', 'a(is_lt_zero)',      'b(is_not_negative)'), False),
1449
1450   (('ilt', 'a(is_not_negative)', 'b(is_not_positive)'), False),
1451   (('ilt', 'a(is_not_positive)', 'b(is_gt_zero)'),      True),
1452   (('ilt', 'a(is_lt_zero)',      'b(is_not_negative)'), True),
1453
1454   (('ult', 0, 'a(is_gt_zero)'), True),
1455   (('ult', a, 0), False),
1456
1457   # Packing and then unpacking does nothing
1458   (('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a),
1459   (('unpack_64_2x32_split_y', ('pack_64_2x32_split', a, b)), b),
1460   (('unpack_64_2x32_split_x', ('pack_64_2x32', a)), 'a.x'),
1461   (('unpack_64_2x32_split_y', ('pack_64_2x32', a)), 'a.y'),
1462   (('unpack_64_2x32', ('pack_64_2x32_split', a, b)), ('vec2', a, b)),
1463   (('unpack_64_2x32', ('pack_64_2x32', a)), a),
1464   (('unpack_double_2x32_dxil', ('pack_double_2x32_dxil', a)), a),
1465   (('pack_64_2x32_split', ('unpack_64_2x32_split_x', a),
1466                           ('unpack_64_2x32_split_y', a)), a),
1467   (('pack_64_2x32', ('vec2', ('unpack_64_2x32_split_x', a),
1468                              ('unpack_64_2x32_split_y', a))), a),
1469   (('pack_64_2x32', ('unpack_64_2x32', a)), a),
1470   (('pack_double_2x32_dxil', ('unpack_double_2x32_dxil', a)), a),
1471
1472   # Comparing two halves of an unpack separately.  While this optimization
1473   # should be correct for non-constant values, it's less obvious that it's
1474   # useful in that case.  For constant values, the pack will fold and we're
1475   # guaranteed to reduce the whole tree to one instruction.
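   # For example (constants purely illustrative): checking
   #    unpack_32_2x16_split_x(a) == 0x1234 && unpack_32_2x16_split_y(a) == 0x5678
   # collapses to a == 0x56781234.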
1476   (('iand', ('ieq', ('unpack_32_2x16_split_x', a), '#b'),
1477             ('ieq', ('unpack_32_2x16_split_y', a), '#c')),
1478    ('ieq', a, ('pack_32_2x16_split', b, c))),
1479
1480   # Byte extraction
1481   (('ushr', 'a@16',  8), ('extract_u8', a, 1), '!options->lower_extract_byte'),
1482   (('ushr', 'a@32', 24), ('extract_u8', a, 3), '!options->lower_extract_byte'),
1483   (('ushr', 'a@64', 56), ('extract_u8', a, 7), '!options->lower_extract_byte'),
1484   (('ishr', 'a@16',  8), ('extract_i8', a, 1), '!options->lower_extract_byte'),
1485   (('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'),
1486   (('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'),
1487   (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'),
1488
1489   # Common pattern in many Vulkan CTS tests that read 8-bit integers from a
1490   # storage buffer.
1491   (('u2u8', ('extract_u16', a, 1)), ('u2u8', ('extract_u8', a, 2)), '!options->lower_extract_byte'),
1492   (('u2u8', ('ushr', a, 8)), ('u2u8', ('extract_u8', a, 1)), '!options->lower_extract_byte'),
1493
1494   # Common pattern after lowering 8-bit integers to 16-bit.
1495   (('i2i16', ('u2u8', ('extract_u8', a, b))), ('i2i16', ('extract_i8', a, b))),
1496   (('u2u16', ('u2u8', ('extract_u8', a, b))), ('u2u16', ('extract_u8', a, b))),
1497
1498   (('ubfe', a,  0, 8), ('extract_u8', a, 0), '!options->lower_extract_byte'),
1499   (('ubfe', a,  8, 8), ('extract_u8', a, 1), '!options->lower_extract_byte'),
1500   (('ubfe', a, 16, 8), ('extract_u8', a, 2), '!options->lower_extract_byte'),
1501   (('ubfe', a, 24, 8), ('extract_u8', a, 3), '!options->lower_extract_byte'),
1502   (('ibfe', a,  0, 8), ('extract_i8', a, 0), '!options->lower_extract_byte'),
1503   (('ibfe', a,  8, 8), ('extract_i8', a, 1), '!options->lower_extract_byte'),
1504   (('ibfe', a, 16, 8), ('extract_i8', a, 2), '!options->lower_extract_byte'),
1505   (('ibfe', a, 24, 8), ('extract_i8', a, 3), '!options->lower_extract_byte'),
1506
1507   (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)),
1508   (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)),
1509
1510    # Word extraction
1511   (('ushr', ('ishl', 'a@32', 16), 16), ('extract_u16', a, 0), '!options->lower_extract_word'),
1512   (('ushr', 'a@32', 16), ('extract_u16', a, 1), '!options->lower_extract_word'),
1513   (('ishr', ('ishl', 'a@32', 16), 16), ('extract_i16', a, 0), '!options->lower_extract_word'),
1514   (('ishr', 'a@32', 16), ('extract_i16', a, 1), '!options->lower_extract_word'),
1515   (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'),
1516
1517   (('ubfe', a,  0, 16), ('extract_u16', a, 0), '!options->lower_extract_word'),
1518   (('ubfe', a, 16, 16), ('extract_u16', a, 1), '!options->lower_extract_word'),
1519   (('ibfe', a,  0, 16), ('extract_i16', a, 0), '!options->lower_extract_word'),
1520   (('ibfe', a, 16, 16), ('extract_i16', a, 1), '!options->lower_extract_word'),
1521
1522   # Packing a u8vec4 to write to an SSBO.
1523   (('ior', ('ishl', ('u2u32', 'a@8'), 24), ('ior', ('ishl', ('u2u32', 'b@8'), 16), ('ior', ('ishl', ('u2u32', 'c@8'), 8), ('u2u32', 'd@8')))),
1524    ('pack_32_4x8', ('vec4', d, c, b, a)), 'options->has_pack_32_4x8'),
1525
1526   (('extract_u16', ('extract_i16', a, b), 0), ('extract_u16', a, b)),
1527   (('extract_u16', ('extract_u16', a, b), 0), ('extract_u16', a, b)),
1528
1529   # Lower pack/unpack
1530   (('pack_64_2x32_split', a, b), ('ior', ('u2u64', a), ('ishl', ('u2u64', b), 32)), 'options->lower_pack_64_2x32_split'),
1531   (('pack_32_2x16_split', a, b), ('ior', ('u2u32', a), ('ishl', ('u2u32', b), 16)), 'options->lower_pack_32_2x16_split'),
1532   (('unpack_64_2x32_split_x', a), ('u2u32', a), 'options->lower_unpack_64_2x32_split'),
1533   (('unpack_64_2x32_split_y', a), ('u2u32', ('ushr', a, 32)), 'options->lower_unpack_64_2x32_split'),
1534   (('unpack_32_2x16_split_x', a), ('u2u16', a), 'options->lower_unpack_32_2x16_split'),
1535   (('unpack_32_2x16_split_y', a), ('u2u16', ('ushr', a, 16)), 'options->lower_unpack_32_2x16_split'),
1536
1537   # Useless masking before unpacking
1538   (('unpack_half_2x16_split_x', ('iand', a, 0xffff)), ('unpack_half_2x16_split_x', a)),
1539   (('unpack_32_2x16_split_x', ('iand', a, 0xffff)), ('unpack_32_2x16_split_x', a)),
1540   (('unpack_64_2x32_split_x', ('iand', a, 0xffffffff)), ('unpack_64_2x32_split_x', a)),
1541   (('unpack_half_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_half_2x16_split_y', a)),
1542   (('unpack_32_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_32_2x16_split_y', a)),
1543   (('unpack_64_2x32_split_y', ('iand', a, 0xffffffff00000000)), ('unpack_64_2x32_split_y', a)),
1544
1545   (('unpack_half_2x16_split_x', ('extract_u16', a, 0)), ('unpack_half_2x16_split_x', a)),
1546   (('unpack_half_2x16_split_x', ('extract_u16', a, 1)), ('unpack_half_2x16_split_y', a)),
1547   (('unpack_half_2x16_split_x', ('ushr', a, 16)), ('unpack_half_2x16_split_y', a)),
1548   (('unpack_32_2x16_split_x', ('extract_u16', a, 0)), ('unpack_32_2x16_split_x', a)),
1549   (('unpack_32_2x16_split_x', ('extract_u16', a, 1)), ('unpack_32_2x16_split_y', a)),
1550
1551   # Optimize half packing
1552   (('ishl', ('pack_half_2x16', ('vec2', a, 0)), 16), ('pack_half_2x16', ('vec2', 0, a))),
1553   (('ushr', ('pack_half_2x16', ('vec2', 0, a)), 16), ('pack_half_2x16', ('vec2', a, 0))),
1554
1555   (('iadd', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),
1556    ('pack_half_2x16', ('vec2', a, b))),
1557   (('ior', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))),
1558    ('pack_half_2x16', ('vec2', a, b))),
1559
1560   (('ishl', ('pack_half_2x16_split', a, 0), 16), ('pack_half_2x16_split', 0, a)),
1561   (('ushr', ('pack_half_2x16_split', 0, a), 16), ('pack_half_2x16_split', a, 0)),
1562   (('extract_u16', ('pack_half_2x16_split', 0, a), 1), ('pack_half_2x16_split', a, 0)),
1563
1564   (('iadd', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)),
1565   (('ior',  ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)),
1566
1567   (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 0), ('i2i', a)),
1568   (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 1), ('i2i', b)),
1569   (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 2), ('i2i', c)),
1570   (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 3), ('i2i', d)),
1571   (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 0), ('u2u', a)),
1572   (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 1), ('u2u', b)),
1573   (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 2), ('u2u', c)),
1574   (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 3), ('u2u', d)),
1575])
1576
1577# After the ('extract_u8', a, 0) pattern, above, triggers, there will be
1578# patterns like those below.
1579for op in ('ushr', 'ishr'):
1580   optimizations.extend([(('extract_u8', (op, 'a@16',  8),     0), ('extract_u8', a, 1))])
1581   optimizations.extend([(('extract_u8', (op, 'a@32',  8 * i), 0), ('extract_u8', a, i)) for i in range(1, 4)])
1582   optimizations.extend([(('extract_u8', (op, 'a@64',  8 * i), 0), ('extract_u8', a, i)) for i in range(1, 8)])
1583
1584optimizations.extend([(('extract_u8', ('extract_u16', a, 1), 0), ('extract_u8', a, 2))])
1585
1586# After the ('extract_[iu]8', a, 3) patterns, above, trigger, there will be
1587# patterns like those below.
1588for op in ('extract_u8', 'extract_i8'):
1589   optimizations.extend([((op, ('ishl', 'a@16',      8),     1), (op, a, 0))])
1590   optimizations.extend([((op, ('ishl', 'a@32', 24 - 8 * i), 3), (op, a, i)) for i in range(2, -1, -1)])
1591   optimizations.extend([((op, ('ishl', 'a@64', 56 - 8 * i), 7), (op, a, i)) for i in range(6, -1, -1)])
1592
1593optimizations.extend([
1594   # Subtracts
1595   (('ussub_4x8_vc4', a, 0), a),
1596   (('ussub_4x8_vc4', a, ~0), 0),
1597   # Lower all Subtractions first - they can get recombined later
1598   (('fsub', a, b), ('fadd', a, ('fneg', b))),
1599   (('isub', a, b), ('iadd', a, ('ineg', b))),
1600   (('uabs_usub', a, b), ('bcsel', ('ult', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))),
1601   # This is correct.  We don't need isub_sat because the result type is unsigned, so it cannot overflow.
1602   (('uabs_isub', a, b), ('bcsel', ('ilt', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))),
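   # For example, uabs_isub(INT32_MIN, 0): the inner negation wraps to
   # 0x80000000, which is exactly the correct unsigned result 2^31.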
1603
1604   # Propagate negation up multiplication chains
1605   (('fmul(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmul', a, b))),
1606   (('fmulz(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmulz', a, b)), '!'+signed_zero_inf_nan_preserve_32),
1607   (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)),
1608   (('ffmaz', ('fneg', a), ('fneg', b), c), ('ffmaz', a, b, c)),
1609   (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))),
1610
1611   # Propagate constants up multiplication chains
1612   (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmul', ('fmul', a, c), b)),
1613   (('~fmulz(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmulz', ('fmulz', a, c), b)),
1614   (('~fmul(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)'), ('fmulz', ('fmul', a, c), b)),
1615   (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('imul', ('imul', a, c), b)),
1616   (('~ffma', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffma', ('fmul', a, c), b, d)),
1617   (('~ffmaz', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffmaz', ('fmulz', a, c), b, d)),
1618   (('~ffma', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)', d), ('ffmaz', ('fmul', a, c), b, d)),
1619   # Prefer moving out a multiplication for more MAD/FMA-friendly code
1620   (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_fmul)'), '#c'), ('fadd', ('fadd', a, c), b)),
1621   (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fadd', ('fadd', a, c), b)),
1622   (('~fadd(is_used_once)', ('ffma(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffma', a, b, d), c)),
1623   (('~fadd(is_used_once)', ('ffmaz(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffmaz', a, b, d), c)),
1624   (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('iadd', ('iadd', a, c), b)),
1625
1626   # Reassociate constants in add/mul chains so they can be folded together.
1627   # For now, we mostly only handle cases where the constants are separated by
1628   # a single non-constant.  We could do better eventually.
1629   (('~fmul', '#a', ('fmul', 'b(is_not_const)', '#c')), ('fmul', ('fmul', a, c), b)),
1630   (('~fmulz', '#a', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmulz', a, c), b)),
1631   (('~fmul', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmul', a, c), b)),
1632   (('~ffma', '#a', ('fmul', 'b(is_not_const)', '#c'), d), ('ffma', ('fmul', a, c), b, d)),
1633   (('~ffmaz', '#a', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmulz', a, c), b, d)),
1634   (('~ffmaz', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmul', a, c), b, d)),
1635   (('imul', '#a', ('imul', 'b(is_not_const)', '#c')), ('imul', ('imul', a, c), b)),
1636   (('~fadd', '#a',          ('fadd', 'b(is_not_const)', '#c')),  ('fadd', ('fadd', a,          c),           b)),
1637   (('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))),
1638   (('~fadd', '#a',          ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d')),  ('ffma',          b,  c, ('fadd', a,          d))),
1639   (('~fadd', '#a', ('fneg', ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffma', ('fneg', b), c, ('fadd', a, ('fneg', d)))),
1640   (('~fadd', '#a',          ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d')),  ('ffmaz',          b,  c, ('fadd', a,          d))),
1641   (('~fadd', '#a', ('fneg', ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffmaz', ('fneg', b), c, ('fadd', a, ('fneg', d)))),
1642   (('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)),
1643   (('iand', '#a', ('iand', 'b(is_not_const)', '#c')), ('iand', ('iand', a, c), b)),
1644   (('ior',  '#a', ('ior',  'b(is_not_const)', '#c')), ('ior',  ('ior',  a, c), b)),
1645   (('ixor', '#a', ('ixor', 'b(is_not_const)', '#c')), ('ixor', ('ixor', a, c), b)),
1646
1647   # Reassociate add chains for more MAD/FMA-friendly code
1648   (('~fadd', ('fadd(is_used_once)', 'a(is_fmul)', 'b(is_fmul)'), 'c(is_not_fmul)'), ('fadd', ('fadd', a, c), b)),
1649
1650   # Drop mul-div by the same value when there's no wrapping.
1651   (('idiv', ('imul(no_signed_wrap)', a, b), b), a),
1652
1653   # By definition...
1654   (('bcsel', ('ige', ('find_lsb', a), 0), ('find_lsb', a), -1), ('find_lsb', a)),
1655   (('bcsel', ('ige', ('ifind_msb', a), 0), ('ifind_msb', a), -1), ('ifind_msb', a)),
1656   (('bcsel', ('ige', ('ufind_msb', a), 0), ('ufind_msb', a), -1), ('ufind_msb', a)),
1657
1658   (('bcsel', ('ine', a, 0), ('find_lsb', a), -1), ('find_lsb', a)),
1659   (('bcsel', ('ine', a, 0), ('ifind_msb', a), -1), ('ifind_msb', a)),
1660   (('bcsel', ('ine', a, 0), ('ufind_msb', a), -1), ('ufind_msb', a)),
1661
1662   (('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)),
1663
1664   (('~fmul', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)),
1665   (('~fmul', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))),
1666   (('~fmulz', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)),
1667   (('~fmulz', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))),
1668   (('~bcsel', ('flt', a, 0.0), ('fneg', a), a), ('fabs', a)),
1669
1670   (('bcsel', a, ('bcsel', b, c, d), d), ('bcsel', ('iand', a, b), c, d)),
1671   (('bcsel', a, b, ('bcsel', c, b, d)), ('bcsel', ('ior', a, c), b, d)),
1672
1673   # Misc. lowering
1674   (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
1675   (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'),
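   # An unsigned add wraps exactly when the result is smaller than either
   # operand, hence the (a + b) < a test below; similarly, an unsigned
   # subtract borrows exactly when a < b.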
1676   (('uadd_carry', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'),
1677   (('usub_borrow', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'),
1678
1679   (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
1680    ('bcsel', ('ult', 31, 'bits'), 'insert',
1681              ('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')),
1682    'options->lower_bitfield_insert'),
1683   (('ihadd', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),
1684   (('uhadd', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),
1685   (('irhadd', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'),
1686   (('urhadd', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'),
1687   (('ihadd@64', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
1688   (('uhadd@64', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
1689   (('irhadd@64', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
1690   (('urhadd@64', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'),
1691
1692   (('imul_32x16', a, b), ('imul', a, ('extract_i16', b, 0)), 'options->lower_mul_32x16'),
1693   (('umul_32x16', a, b), ('imul', a, ('extract_u16', b, 0)), 'options->lower_mul_32x16'),
1694
1695   (('uadd_sat@64', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_uadd_sat || (options->lower_int64_options & nir_lower_iadd64) != 0'),
1696   (('uadd_sat', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_uadd_sat'),
1697   (('usub_sat', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_usub_sat'),
1698   (('usub_sat@64', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), '(options->lower_int64_options & nir_lower_usub_sat64) != 0'),
1699
   # int64_t sum = a + b;
   #
   # if (a < 0 && b < 0 && a < sum) {
   #    sum = INT64_MIN;
   # } else if (a >= 0 && b >= 0 && sum < a) {
   #    sum = INT64_MAX;
   # }
1707   #
1708   # A couple optimizations are applied.
1709   #
1710   # 1. a < sum => sum >= 0.  This replacement works because it is known that
1711   #    a < 0 and b < 0, so sum should also be < 0 unless there was
1712   #    underflow.
1713   #
1714   # 2. sum < a => sum < 0.  This replacement works because it is known that
1715   #    a >= 0 and b >= 0, so sum should also be >= 0 unless there was
1716   #    overflow.
1717   #
1718   # 3. Invert the second if-condition and swap the order of parameters for
1719   #    the bcsel. !(a >= 0 && b >= 0 && sum < 0) becomes !(a >= 0) || !(b >=
1720   #    0) || !(sum < 0), and that becomes (a < 0) || (b < 0) || (sum >= 0)
1721   #
1722   # On Intel Gen11, this saves ~11 instructions.
1723   (('iadd_sat@64', a, b), ('bcsel',
1724                            ('iand', ('iand', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)),
1725                            0x8000000000000000,
1726                            ('bcsel',
1727                             ('ior', ('ior', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)),
1728                             ('iadd', a, b),
1729                             0x7fffffffffffffff)),
1730    '(options->lower_int64_options & nir_lower_iadd_sat64) != 0'),
1731
   # int64_t sum = a - b;
   #
   # if (a < 0 && b >= 0 && a < sum) {
   #    sum = INT64_MIN;
   # } else if (a >= 0 && b < 0 && a >= sum) {
   #    sum = INT64_MAX;
   # }
1739   #
1740   # Optimizations similar to the iadd_sat case are applied here.
1741   (('isub_sat@64', a, b), ('bcsel',
1742                            ('iand', ('iand', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)),
1743                            0x8000000000000000,
1744                            ('bcsel',
1745                             ('ior', ('ior', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)),
1746                             ('isub', a, b),
1747                             0x7fffffffffffffff)),
1748    '(options->lower_int64_options & nir_lower_iadd_sat64) != 0'),
1749
1750   # These are done here instead of in the backend because the int64 lowering
1751   # pass will make a mess of the patterns.  The first patterns are
1752   # conditioned on nir_lower_minmax64 because it was not clear that it was
1753   # always an improvement on platforms that have real int64 support.  No
1754   # shaders in shader-db hit this, so it was hard to say one way or the
1755   # other.
1756   (('ilt', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
1757   (('ilt', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
1758   (('ige', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
1759   (('ige', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'),
1760   (('ilt', 'a@64', 0), ('ilt', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
1761   (('ige', 'a@64', 0), ('ige', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
1762
1763   (('ine', 'a@64', 0), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
1764   (('ieq', 'a@64', 0), ('ieq', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
1765   # 0u < uint(a) <=> uint(a) != 0u
1766   (('ult', 0, 'a@64'), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'),
1767
1768   # Alternative lowering that doesn't rely on bfi.
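   # In C-like pseudocode (illustrative only, for the 'bits' < 32 case):
   #
   #    mask   = ((1 << bits) - 1) << offset;
   #    result = (base & ~mask) | ((insert << offset) & mask);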
1769   (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
1770    ('bcsel', ('ult', 31, 'bits'),
1771     'insert',
1772    (('ior',
1773     ('iand', 'base', ('inot', ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))),
1774     ('iand', ('ishl', 'insert', 'offset'), ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))))),
1775    'options->lower_bitfield_insert_to_shifts'),
1776
1777   # Alternative lowering that uses bitfield_select.
1778   (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
1779    ('bcsel', ('ult', 31, 'bits'), 'insert',
1780              ('bitfield_select', ('bfm', 'bits', 'offset'), ('ishl', 'insert', 'offset'), 'base')),
1781    'options->lower_bitfield_insert_to_bitfield_select'),
1782
1783   (('ibitfield_extract', 'value', 'offset', 'bits'),
1784    ('bcsel', ('ult', 31, 'bits'), 'value',
1785              ('ibfe', 'value', 'offset', 'bits')),
1786    'options->lower_bitfield_extract'),
1787
1788   (('ubitfield_extract', 'value', 'offset', 'bits'),
1789    ('bcsel', ('ult', 31, 'bits'), 'value',
1790              ('ubfe', 'value', 'offset', 'bits')),
1791    'options->lower_bitfield_extract'),
1792
1793   # (src0 & src1) | (~src0 & src2). Constant fold if src2 is 0.
1794   (('bitfield_select', a, b, 0), ('iand', a, b)),
1795   (('bitfield_select', a, ('iand', a, b), c), ('bitfield_select', a, b, c)),
1796
1797   # Note that these opcodes are defined to only use the five least significant bits of 'offset' and 'bits'
1798   (('ubfe', 'value', 'offset', ('iand', 31, 'bits')), ('ubfe', 'value', 'offset', 'bits')),
1799   (('ubfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ubfe', 'value', 'offset', 'bits')),
1800   (('ibfe', 'value', 'offset', ('iand', 31, 'bits')), ('ibfe', 'value', 'offset', 'bits')),
1801   (('ibfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ibfe', 'value', 'offset', 'bits')),
1802   (('bfm', 'bits', ('iand', 31, 'offset')), ('bfm', 'bits', 'offset')),
1803   (('bfm', ('iand', 31, 'bits'), 'offset'), ('bfm', 'bits', 'offset')),
1804
1805   # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says:
1806   #
1807   #    If bits is zero, the result will be zero.
1808   #
   # These patterns prevent other patterns from generating invalid results
   # when 'bits' is zero.
1811   (('ubfe', a, b, 0), 0),
1812   (('ibfe', a, b, 0), 0),
1813
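   # For a nonzero constant 'b', 0xffffffff >> -b is 0xffffffff >> (32 - b)
   # (only the low five bits of the shift count are used), i.e. the usual
   # low-'b'-bits mask (1 << b) - 1.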
1814   (('ubfe', a, 0, '#b'), ('iand', a, ('ushr', 0xffffffff, ('ineg', b)))),
1815
1816   (('b2i32', ('i2b', ('ubfe', a, b, 1))), ('ubfe', a, b, 1)),
1817   (('b2i32', ('i2b', ('ibfe', a, b, 1))), ('ubfe', a, b, 1)), # ubfe in the replacement is correct
1818   (('ine', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
1819   (('ieq', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
1820   (('ine', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
1821   (('ieq', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)),
1822
1823   (('ibitfield_extract', 'value', 'offset', 'bits'),
1824    ('bcsel', ('ieq', 0, 'bits'),
1825     0,
1826     ('ishr',
1827       ('ishl', 'value', ('isub', ('isub', 32, 'bits'), 'offset')),
1828       ('isub', 32, 'bits'))),
1829    'options->lower_bitfield_extract_to_shifts'),
1830
1831   (('ubitfield_extract', 'value', 'offset', 'bits'),
1832    ('iand',
1833     ('ushr', 'value', 'offset'),
1834     ('bcsel', ('ieq', 'bits', 32),
1835      0xffffffff,
1836      ('isub', ('ishl', 1, 'bits'), 1))),
1837    'options->lower_bitfield_extract_to_shifts'),
1838
1839   (('ifind_msb', 'value'),
1840    ('ufind_msb', ('bcsel', ('ilt', 'value', 0), ('inot', 'value'), 'value')),
1841    'options->lower_ifind_msb'),
1842
1843   (('ifind_msb', 'value'),
1844    ('bcsel', ('ige', ('ifind_msb_rev', 'value'), 0),
1845     ('isub', 31, ('ifind_msb_rev', 'value')),
1846     ('ifind_msb_rev', 'value')),
1847    'options->lower_find_msb_to_reverse'),
1848
1849    (('ufind_msb', 'value'),
1850     ('bcsel', ('ige', ('ufind_msb_rev', 'value'), 0),
1851      ('isub', 31, ('ufind_msb_rev', 'value')),
1852      ('ufind_msb_rev', 'value')),
1853     'options->lower_find_msb_to_reverse'),
1854
1855   (('find_lsb', 'value'),
1856    ('ufind_msb', ('iand', 'value', ('ineg', 'value'))),
1857    'options->lower_find_lsb'),
1858
1859   (('extract_i8', a, 'b@32'),
1860    ('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 24),
1861    'options->lower_extract_byte'),
1862
1863   (('extract_u8', a, 'b@32'),
1864    ('iand', ('ushr', a, ('imul', b, 8)), 0xff),
1865    'options->lower_extract_byte'),
1866
1867   (('extract_i16', a, 'b@32'),
1868    ('ishr', ('ishl', a, ('imul', ('isub', 1, b), 16)), 16),
1869    'options->lower_extract_word'),
1870
1871   (('extract_u16', a, 'b@32'),
1872    ('iand', ('ushr', a, ('imul', b, 16)), 0xffff),
1873    'options->lower_extract_word'),
1874
1875    (('pack_unorm_2x16', 'v'),
1876     ('pack_uvec2_to_uint',
1877        ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 65535.0)))),
1878     'options->lower_pack_unorm_2x16'),
1879
1880    (('pack_unorm_4x8', 'v'),
1881     ('pack_uvec4_to_uint',
1882        ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))),
1883     'options->lower_pack_unorm_4x8'),
1884
1885    (('pack_snorm_2x16', 'v'),
1886     ('pack_uvec2_to_uint',
1887        ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 32767.0)))),
1888     'options->lower_pack_snorm_2x16'),
1889
1890    (('pack_snorm_4x8', 'v'),
1891     ('pack_uvec4_to_uint',
1892        ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))),
1893     'options->lower_pack_snorm_4x8'),
1894
1895    (('unpack_unorm_2x16', 'v'),
1896     ('fdiv', ('u2f32', ('vec2', ('extract_u16', 'v', 0),
1897                                  ('extract_u16', 'v', 1))),
1898              65535.0),
1899     'options->lower_unpack_unorm_2x16'),
1900
1901    (('unpack_unorm_4x8', 'v'),
1902     ('fdiv', ('u2f32', ('vec4', ('extract_u8', 'v', 0),
1903                                  ('extract_u8', 'v', 1),
1904                                  ('extract_u8', 'v', 2),
1905                                  ('extract_u8', 'v', 3))),
1906              255.0),
1907     'options->lower_unpack_unorm_4x8'),
1908
1909    (('unpack_snorm_2x16', 'v'),
1910     ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec2', ('extract_i16', 'v', 0),
1911                                                            ('extract_i16', 'v', 1))),
1912                                           32767.0))),
1913     'options->lower_unpack_snorm_2x16'),
1914
1915    (('unpack_snorm_4x8', 'v'),
1916     ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_i8', 'v', 0),
1917                                                            ('extract_i8', 'v', 1),
1918                                                            ('extract_i8', 'v', 2),
1919                                                            ('extract_i8', 'v', 3))),
1920                                           127.0))),
1921     'options->lower_unpack_snorm_4x8'),
1922
1923   (('pack_half_2x16_split', 'a@32', 'b@32'),
1924    ('ior', ('ishl', ('u2u32', ('f2f16', b)), 16), ('u2u32', ('f2f16', a))),
1925    'options->lower_pack_split'),
1926
1927   (('unpack_half_2x16_split_x', 'a@32'),
1928    ('f2f32', ('u2u16', a)),
1929    'options->lower_pack_split'),
1930
1931   (('unpack_half_2x16_split_y', 'a@32'),
1932    ('f2f32', ('u2u16', ('ushr', a, 16))),
1933    'options->lower_pack_split'),
1934
1935   (('pack_32_2x16_split', 'a@16', 'b@16'),
1936    ('ior', ('ishl', ('u2u32', b), 16), ('u2u32', a)),
1937    'options->lower_pack_split'),
1938
1939   (('unpack_32_2x16_split_x', 'a@32'),
1940    ('u2u16', a),
1941    'options->lower_pack_split'),
1942
1943   (('unpack_32_2x16_split_y', 'a@32'),
1944    ('u2u16', ('ushr', 'a', 16)),
1945    'options->lower_pack_split'),
1946
1947   (('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'),
1948   (('imin', ('imax', a, -1), 1), ('isign', a), '!options->lower_isign'),
1949   (('imax', ('imin', a, 1), -1), ('isign', a), '!options->lower_isign'),
1950   # float(0 < NaN) - float(NaN < 0) = float(False) - float(False) = 0 - 0 = 0
1951   # Mark the new comparisons precise to prevent them being changed to 'a !=
1952   # 0' or 'a == 0'.
1953   (('fsign', a), ('fsub', ('b2f', ('!flt', 0.0, a)), ('b2f', ('!flt', a, 0.0))), 'options->lower_fsign'),
1954
1955   # Address/offset calculations:
   # Drivers supporting imul24 should use the nir_lower_amul() pass; this
1957   # rule converts everyone else to imul:
1958   (('amul', a, b), ('imul', a, b), '!options->has_imul24'),
1959
1960   (('umul24', a, b),
1961    ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)),
1962    '!options->has_umul24'),
1963   (('umad24', a, b, c),
1964    ('iadd', ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)), c),
1965    '!options->has_umad24'),
1966
1967   # Relaxed 24bit ops
1968   (('imul24_relaxed', a, b), ('imul24', a, b), 'options->has_imul24'),
1969   (('imul24_relaxed', a, b), ('imul', a, b), '!options->has_imul24'),
1970   (('umad24_relaxed', a, b, c), ('umad24', a, b, c), 'options->has_umad24'),
1971   (('umad24_relaxed', a, b, c), ('iadd', ('umul24_relaxed', a, b), c), '!options->has_umad24'),
1972   (('umul24_relaxed', a, b), ('umul24', a, b), 'options->has_umul24'),
1973   (('umul24_relaxed', a, b), ('imul', a, b), '!options->has_umul24'),
1974
1975   (('imad24_ir3', a, b, 0), ('imul24', a, b)),
1976   (('imad24_ir3', a, 0, c), (c)),
1977   (('imad24_ir3', a, 1, c), ('iadd', a, c)),
1978
1979   # if first two srcs are const, crack apart the imad so constant folding
1980   # can clean up the imul:
1981   # TODO ffma should probably get a similar rule:
1982   (('imad24_ir3', '#a', '#b', c), ('iadd', ('imul', a, b), c)),
1983
1984   # These will turn 24b address/offset calc back into 32b shifts, but
1985   # it should be safe to get back some of the bits of precision that we
   # already decided were not necessary:
1987   (('imul24', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
1988   (('imul24', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
1989   (('imul24', a, 0), (0)),
1990
1991   (('fcsel', ('slt', 0, a), b, c), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"),
1992   (('fcsel', ('slt', a, 0), b, c), ('fcsel_gt', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
1993   (('fcsel', ('sge', a, 0), b, c), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"),
1994   (('fcsel', ('sge', 0, a), b, c), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
1995
1996   (('bcsel', ('ilt', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, b, c), "options->has_fused_comp_and_csel"),
1997   (('bcsel', ('ilt', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, c, b), "options->has_fused_comp_and_csel"),
1998   (('bcsel', ('ige', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, b, c), "options->has_fused_comp_and_csel"),
1999   (('bcsel', ('ige', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, c, b), "options->has_fused_comp_and_csel"),
2000
2001   (('bcsel', ('flt', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"),
2002   (('bcsel', ('flt', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_gt', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
2003   (('bcsel', ('fge', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"),
2004   (('bcsel', ('fge', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
2005
2006])
2007
2008# bit_size dependent lowerings
2009for bit_size in [8, 16, 32, 64]:
2010   # convenience constants
2011   intmax = (1 << (bit_size - 1)) - 1
2012   intmin = 1 << (bit_size - 1)
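   # e.g. for bit_size == 8: intmax == 0x7f and intmin == 0x80, the latter
   # being the two's-complement bit pattern of INT8_MIN at that width.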
2013
2014   optimizations += [
2015      (('iadd_sat@' + str(bit_size), a, b),
2016       ('bcsel', ('ige', b, 1), ('bcsel', ('ilt', ('iadd', a, b), a), intmax, ('iadd', a, b)),
2017                                ('bcsel', ('ilt', a, ('iadd', a, b)), intmin, ('iadd', a, b))), 'options->lower_iadd_sat'),
2018      (('isub_sat@' + str(bit_size), a, b),
2019       ('bcsel', ('ilt', b, 0), ('bcsel', ('ilt', ('isub', a, b), a), intmax, ('isub', a, b)),
2020                                ('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_iadd_sat'),
2021   ]
2022
2023invert = OrderedDict([('feq', 'fneu'), ('fneu', 'feq')])
2024
2025for left, right in itertools.combinations_with_replacement(invert.keys(), 2):
2026   optimizations.append((('inot', ('ior(is_used_once)', (left, a, b), (right, c, d))),
2027                         ('iand', (invert[left], a, b), (invert[right], c, d))))
2028   optimizations.append((('inot', ('iand(is_used_once)', (left, a, b), (right, c, d))),
2029                         ('ior', (invert[left], a, b), (invert[right], c, d))))
2030
2031# Optimize x2bN(b2x(x)) -> x
2032for size in type_sizes('bool'):
2033    aN = 'a@' + str(size)
2034    f2bN = 'f2b' + str(size)
2035    i2bN = 'i2b' + str(size)
2036    optimizations.append(((f2bN, ('b2f', aN)), a))
2037    optimizations.append(((i2bN, ('b2i', aN)), a))
2038
2039# Optimize x2yN(b2x(x)) -> b2y
2040for x, y in itertools.product(['f', 'u', 'i'], ['f', 'u', 'i']):
2041   if x != 'f' and y != 'f' and x != y:
2042      continue
2043
2044   b2x = 'b2f' if x == 'f' else 'b2i'
2045   b2y = 'b2f' if y == 'f' else 'b2i'
2046   x2yN = '{}2{}'.format(x, y)
2047   optimizations.append(((x2yN, (b2x, a)), (b2y, a)))
2048
2049# Optimize away x2xN(a@N)
2050for t in ['int', 'uint', 'float', 'bool']:
2051   for N in type_sizes(t):
2052      x2xN = '{0}2{0}{1}'.format(t[0], N)
2053      aN = 'a@{0}'.format(N)
2054      optimizations.append(((x2xN, aN), a))
2055
2056# Optimize x2xN(y2yM(a@P)) -> y2yN(a) for integers
2057# In particular, we can optimize away everything except upcast of downcast and
2058# upcasts where the type differs from the other cast
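#
# For example, i2i16(u2u32(a)) becomes u2u16(a) (the outer cast is a
# down-cast), and u2u32(u2u16(a@8)) becomes u2u32(a) (up-cast of up-cast with
# matching types).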
2059for N, M in itertools.product(type_sizes('uint'), type_sizes('uint')):
2060   if N < M:
2061      # The outer cast is a down-cast.  It doesn't matter what the size of the
      # argument of the inner cast is because we'll never be in the upcast
2063      # of downcast case.  Regardless of types, we'll always end up with y2yN
2064      # in the end.
2065      for x, y in itertools.product(['i', 'u'], ['i', 'u']):
2066         x2xN = '{0}2{0}{1}'.format(x, N)
2067         y2yM = '{0}2{0}{1}'.format(y, M)
2068         y2yN = '{0}2{0}{1}'.format(y, N)
2069         optimizations.append(((x2xN, (y2yM, a)), (y2yN, a)))
2070   elif N > M:
2071      # If the outer cast is an up-cast, we have to be more careful about the
2072      # size of the argument of the inner cast and with types.  In this case,
      # the type is always the type of the up-cast which is given by the
2074      # outer cast.
2075      for P in type_sizes('uint'):
2076         # We can't optimize away up-cast of down-cast.
2077         if M < P:
2078            continue
2079
         # Because we're doing up-cast of up-cast, the types always have
2081         # to match between the two casts
2082         for x in ['i', 'u']:
2083            x2xN = '{0}2{0}{1}'.format(x, N)
2084            x2xM = '{0}2{0}{1}'.format(x, M)
2085            aP = 'a@{0}'.format(P)
2086            optimizations.append(((x2xN, (x2xM, aP)), (x2xN, a)))
2087   else:
2088      # The N == M case is handled by other optimizations
2089      pass
2090
2091# Downcast operations should be able to see through pack
2092for t in ['i', 'u']:
2093    for N in [8, 16, 32]:
2094        x2xN = '{0}2{0}{1}'.format(t, N)
2095        optimizations += [
            ((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)),
2098        ]
2099
2100# Optimize comparisons with up-casts
2101for t in ['int', 'uint', 'float']:
2102    for N, M in itertools.product(type_sizes(t), repeat=2):
2103        if N == 1 or N >= M:
2104            continue
2105
2106        cond = 'true'
2107        if N == 8:
2108            cond = 'options->support_8bit_alu'
2109        elif N == 16:
2110            cond = 'options->support_16bit_alu'
2111        x2xM = '{0}2{0}{1}'.format(t[0], M)
2112        x2xN = '{0}2{0}{1}'.format(t[0], N)
2113        aN = 'a@' + str(N)
2114        bN = 'b@' + str(N)
2115        xeq = 'feq' if t == 'float' else 'ieq'
2116        xne = 'fneu' if t == 'float' else 'ine'
2117        xge = '{0}ge'.format(t[0])
2118        xlt = '{0}lt'.format(t[0])
2119
2120        # Up-casts are lossless so for correctly signed comparisons of
        # up-casted values we can do the comparison at the larger of the two
2122        # original sizes and drop one or both of the casts.  (We have
2123        # optimizations to drop the no-op casts which this may generate.)
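        #
        # For example, ige(i2i32(a@16), i2i32(b@8)) becomes ige(a, i2i16(b)).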
2124        for P in type_sizes(t):
2125            if P == 1 or P > N:
2126                continue
2127
2128            bP = 'b@' + str(P)
2129            optimizations += [
2130                ((xeq, (x2xM, aN), (x2xM, bP)), (xeq, a, (x2xN, b)), cond),
2131                ((xne, (x2xM, aN), (x2xM, bP)), (xne, a, (x2xN, b)), cond),
2132                ((xge, (x2xM, aN), (x2xM, bP)), (xge, a, (x2xN, b)), cond),
2133                ((xlt, (x2xM, aN), (x2xM, bP)), (xlt, a, (x2xN, b)), cond),
2134                ((xge, (x2xM, bP), (x2xM, aN)), (xge, (x2xN, b), a), cond),
2135                ((xlt, (x2xM, bP), (x2xM, aN)), (xlt, (x2xN, b), a), cond),
2136            ]
2137
2138        # The next bit doesn't work on floats because the range checks would
2139        # get way too complicated.
2140        if t in ['int', 'uint']:
2141            if t == 'int':
2142                xN_min = -(1 << (N - 1))
2143                xN_max = (1 << (N - 1)) - 1
2144            elif t == 'uint':
2145                xN_min = 0
2146                xN_max = (1 << N) - 1
2147            else:
2148                assert False
2149
2150            # If we're up-casting and comparing to a constant, we can unfold
2151            # the comparison into a comparison with the shrunk down constant
2152            # and a check that the constant fits in the smaller bit size.
2153            optimizations += [
2154                ((xeq, (x2xM, aN), '#b'),
2155                 ('iand', (xeq, a, (x2xN, b)), (xeq, (x2xM, (x2xN, b)), b)), cond),
2156                ((xne, (x2xM, aN), '#b'),
2157                 ('ior', (xne, a, (x2xN, b)), (xne, (x2xM, (x2xN, b)), b)), cond),
2158                ((xlt, (x2xM, aN), '#b'),
2159                 ('iand', (xlt, xN_min, b),
2160                          ('ior', (xlt, xN_max, b), (xlt, a, (x2xN, b)))), cond),
2161                ((xlt, '#a', (x2xM, bN)),
2162                 ('iand', (xlt, a, xN_max),
2163                          ('ior', (xlt, a, xN_min), (xlt, (x2xN, a), b))), cond),
2164                ((xge, (x2xM, aN), '#b'),
2165                 ('iand', (xge, xN_max, b),
2166                          ('ior', (xge, xN_min, b), (xge, a, (x2xN, b)))), cond),
2167                ((xge, '#a', (x2xM, bN)),
2168                 ('iand', (xge, a, xN_min),
2169                          ('ior', (xge, a, xN_max), (xge, (x2xN, a), b))), cond),
2170            ]
2171
2172# Convert masking followed by signed downcast to just unsigned downcast
2173optimizations += [
2174    (('i2i32', ('iand', 'a@64', 0xffffffff)), ('u2u32', a)),
2175    (('i2i16', ('iand', 'a@32', 0xffff)), ('u2u16', a)),
2176    (('i2i16', ('iand', 'a@64', 0xffff)), ('u2u16', a)),
2177    (('i2i8', ('iand', 'a@16', 0xff)), ('u2u8', a)),
2178    (('i2i8', ('iand', 'a@32', 0xff)), ('u2u8', a)),
2179    (('i2i8', ('iand', 'a@64', 0xff)), ('u2u8', a)),
2180]
2181
2182# Some operations such as iadd have the property that the bottom N bits of the
# output only depend on the bottom N bits of each of the inputs, so we can
# remove casts.
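#
# For example, iadd(only_lower_8_bits_used)(i2i16(u2u8(a@16)), b) becomes
# iadd(a, b): the discarded upper bits of 'a' can never influence the bits
# that are actually used.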
2185for N in [16, 32]:
2186    for M in [8, 16]:
2187        if M >= N:
2188            continue
2189
2190        aN = 'a@' + str(N)
2191        u2uM = 'u2u{0}'.format(M)
2192        i2iM = 'i2i{0}'.format(M)
2193
2194        for x in ['u', 'i']:
2195            x2xN = '{0}2{0}{1}'.format(x, N)
2196            extract_xM = 'extract_{0}{1}'.format(x, M)
2197
2198            x2xN_M_bits = '{0}(only_lower_{1}_bits_used)'.format(x2xN, M)
2199            extract_xM_M_bits = \
2200                '{0}(only_lower_{1}_bits_used)'.format(extract_xM, M)
2201            optimizations += [
2202                ((x2xN_M_bits, (u2uM, aN)), a),
2203                ((extract_xM_M_bits, aN, 0), a),
2204            ]
2205
2206            bcsel_M_bits = 'bcsel(only_lower_{0}_bits_used)'.format(M)
2207            optimizations += [
2208                ((bcsel_M_bits, c, (x2xN, (u2uM, aN)), b), ('bcsel', c, a, b)),
2209                ((bcsel_M_bits, c, (x2xN, (i2iM, aN)), b), ('bcsel', c, a, b)),
2210                ((bcsel_M_bits, c, (extract_xM, aN, 0), b), ('bcsel', c, a, b)),
2211            ]
2212
2213            for op in ['iadd', 'imul', 'iand', 'ior', 'ixor']:
2214                op_M_bits = '{0}(only_lower_{1}_bits_used)'.format(op, M)
2215                optimizations += [
2216                    ((op_M_bits, (x2xN, (u2uM, aN)), b), (op, a, b)),
2217                    ((op_M_bits, (x2xN, (i2iM, aN)), b), (op, a, b)),
2218                    ((op_M_bits, (extract_xM, aN, 0), b), (op, a, b)),
2219                ]
2220
2221def fexp2i(exp, bits):
2222   # Generate an expression which constructs value 2.0^exp or 0.0.
2223   #
2224   # We assume that exp is already in a valid range:
2225   #
2226   #   * [-15, 15] for 16-bit float
2227   #   * [-127, 127] for 32-bit float
   #   * [-1023, 1023] for 64-bit float
2229   #
2230   # If exp is the lowest value in the valid range, a value of 0.0 is
2231   # constructed.  Otherwise, the value 2.0^exp is constructed.
2232   if bits == 16:
2233      return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
2234   elif bits == 32:
2235      return ('ishl', ('iadd', exp, 127), 23)
2236   elif bits == 64:
2237      return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20))
2238   else:
2239      assert False
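
# A small build-time sanity check of the bit layout assumed by fexp2i() above
# (assumes IEEE-754 binary32; it only runs when this script is executed and
# does not affect the generated pass): a biased exponent placed in bits 23..30
# with a zero mantissa is exactly 2.0**exp.
assert struct.unpack('<f', struct.pack('<I', (12 + 127) << 23))[0] == 2.0**12
assert struct.unpack('<f', struct.pack('<I', (-5 + 127) << 23))[0] == 2.0**-5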
2240
2241def ldexp(f, exp, bits):
2242   # The maximum possible range for a normal exponent is [-126, 127] and,
2243   # throwing in denormals, you get a maximum range of [-149, 127].  This
2244   # means that we can potentially have a swing of +-276.  If you start with
2245   # FLT_MAX, you actually have to do ldexp(FLT_MAX, -278) to get it to flush
2246   # all the way to zero.  The GLSL spec only requires that we handle a subset
2247   # of this range.  From version 4.60 of the spec:
2248   #
2249   #    "If exp is greater than +128 (single-precision) or +1024
2250   #    (double-precision), the value returned is undefined. If exp is less
2251   #    than -126 (single-precision) or -1022 (double-precision), the value
2252   #    returned may be flushed to zero. Additionally, splitting the value
2253   #    into a significand and exponent using frexp() and then reconstructing
2254   #    a floating-point value using ldexp() should yield the original input
2255   #    for zero and all finite non-denormalized values."
2256   #
2257   # The SPIR-V spec has similar language.
2258   #
2259   # In order to handle the maximum value +128 using the fexp2i() helper
2260   # above, we have to split the exponent in half and do two multiply
2261   # operations.
2262   #
2263   # First, we clamp exp to a reasonable range.  Specifically, we clamp to
2264   # twice the full range that is valid for the fexp2i() function above.  If
2265   # exp/2 is the bottom value of that range, the fexp2i() expression will
2266   # yield 0.0f which, when multiplied by f, will flush it to zero which is
2267   # allowed by the GLSL and SPIR-V specs for low exponent values.  If the
2268   # value is clamped from above, then it must have been above the supported
2269   # range of the GLSL built-in and therefore any return value is acceptable.
2270   if bits == 16:
2271      exp = ('imin', ('imax', exp, -30), 30)
2272   elif bits == 32:
2273      exp = ('imin', ('imax', exp, -254), 254)
2274   elif bits == 64:
2275      exp = ('imin', ('imax', exp, -2046), 2046)
2276   else:
2277      assert False
2278
2279   # Now we compute two powers of 2, one for exp/2 and one for exp-exp/2.
2280   # (We use ishr which isn't the same for -1, but the -1 case still works
2281   # since we use exp-exp/2 as the second exponent.)  While the spec
2282   # technically defines ldexp as f * 2.0^exp, simply multiplying once doesn't
2283   # work with denormals and doesn't allow for the full swing in exponents
2284   # that you can get with normalized values.  Instead, we create two powers
2285   # of two and multiply by them each in turn.  That way the effective range
2286   # of our exponent is doubled.
2287   pow2_1 = fexp2i(('ishr', exp, 1), bits)
2288   pow2_2 = fexp2i(('isub', exp, ('ishr', exp, 1)), bits)
2289   return ('fmul', ('fmul', f, pow2_1), pow2_2)
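
# Build-time sanity check for the exponent split above (assumption: Python's
# ">> 1" floors the same way as the ishr used for pow2_1).  After clamping to
# [-254, 254], both halves of the 32-bit split stay within the [-127, 127]
# range that fexp2i() requires.  This does not affect the generated pass.
for _exp in (-254, -253, -1, 0, 1, 253, 254):
   _half = _exp >> 1
   assert -127 <= _half <= 127 and -127 <= _exp - _half <= 127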
2290
2291optimizations += [
2292   (('ldexp@16', 'x', 'exp'), ldexp('x', 'exp', 16), 'options->lower_ldexp'),
2293   (('ldexp@32', 'x', 'exp'), ldexp('x', 'exp', 32), 'options->lower_ldexp'),
2294   (('ldexp@64', 'x', 'exp'), ldexp('x', 'exp', 64), 'options->lower_ldexp'),
2295]
2296
# Unreal Engine 4 demo applications open-code bitfieldReverse()
2298def bitfield_reverse_ue4(u):
2299    step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16))
2300    step2 = ('ior', ('ishl', ('iand', step1, 0x00ff00ff), 8), ('ushr', ('iand', step1, 0xff00ff00), 8))
2301    step3 = ('ior', ('ishl', ('iand', step2, 0x0f0f0f0f), 4), ('ushr', ('iand', step2, 0xf0f0f0f0), 4))
2302    step4 = ('ior', ('ishl', ('iand', step3, 0x33333333), 2), ('ushr', ('iand', step3, 0xcccccccc), 2))
2303    step5 = ('ior(many-comm-expr)', ('ishl', ('iand', step4, 0x55555555), 1), ('ushr', ('iand', step4, 0xaaaaaaaa), 1))
2304
2305    return step5
2306
2307# Cyberpunk 2077 open-codes bitfieldReverse()
2308def bitfield_reverse_cp2077(u):
2309    step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16))
2310    step2 = ('ior', ('iand', ('ishl', step1, 1), 0xaaaaaaaa), ('iand', ('ushr', step1, 1), 0x55555555))
2311    step3 = ('ior', ('iand', ('ishl', step2, 2), 0xcccccccc), ('iand', ('ushr', step2, 2), 0x33333333))
2312    step4 = ('ior', ('iand', ('ishl', step3, 4), 0xf0f0f0f0), ('iand', ('ushr', step3, 4), 0x0f0f0f0f))
2313    step5 = ('ior(many-comm-expr)', ('iand', ('ishl', step4, 8), 0xff00ff00), ('iand', ('ushr', step4, 8), 0x00ff00ff))
2314
2315    return step5
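
# Build-time sanity check (not part of the generated pass): both swap
# sequences above compute the classic 32-bit bit reversal.  The helper below
# is a plain-integer rendering of the UE4-style ordering, introduced here only
# for these assertions.
def _bitrev32(u):
    u = ((u << 16) | (u >> 16)) & 0xffffffff
    u = (((u & 0x00ff00ff) << 8) | ((u & 0xff00ff00) >> 8)) & 0xffffffff
    u = (((u & 0x0f0f0f0f) << 4) | ((u & 0xf0f0f0f0) >> 4)) & 0xffffffff
    u = (((u & 0x33333333) << 2) | ((u & 0xcccccccc) >> 2)) & 0xffffffff
    u = (((u & 0x55555555) << 1) | ((u & 0xaaaaaaaa) >> 1)) & 0xffffffff
    return u

assert _bitrev32(0x00000001) == 0x80000000
assert _bitrev32(0x0000ff00) == 0x00ff0000
assert _bitrev32(_bitrev32(0x12345678)) == 0x12345678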
2316
2317optimizations += [(bitfield_reverse_ue4('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')]
2318optimizations += [(bitfield_reverse_cp2077('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')]
2319
2320# "all_equal(eq(a, b), vec(~0))" is the same as "all_equal(a, b)"
2321# "any_nequal(neq(a, b), vec(0))" is the same as "any_nequal(a, b)"
2322for ncomp in [2, 3, 4, 8, 16]:
2323   optimizations += [
2324      (('ball_iequal' + str(ncomp), ('ieq', a, b), ~0), ('ball_iequal' + str(ncomp), a, b)),
2325      (('ball_iequal' + str(ncomp), ('feq', a, b), ~0), ('ball_fequal' + str(ncomp), a, b)),
2326      (('bany_inequal' + str(ncomp), ('ine', a, b), 0), ('bany_inequal' + str(ncomp), a, b)),
2327      (('bany_inequal' + str(ncomp), ('fneu', a, b), 0), ('bany_fnequal' + str(ncomp), a, b)),
2328   ]
2329
2330# For any float comparison operation, "cmp", if you have "a == a && a cmp b"
2331# then the "a == a" is redundant because it's equivalent to "a is not NaN"
2332# and, if a is a NaN then the second comparison will fail anyway.
2333for op in ['flt', 'fge', 'feq']:
2334   optimizations += [
2335      (('iand', ('feq', a, a), (op, a, b)), ('!' + op, a, b)),
2336      (('iand', ('feq', a, a), (op, b, a)), ('!' + op, b, a)),
2337   ]
2338
2339# Add optimizations to handle the case where the result of a ternary is
2340# compared to a constant.  This way we can take things like
2341#
2342# (a ? 0 : 1) > 0
2343#
2344# and turn it into
2345#
2346# a ? (0 > 0) : (1 > 0)
2347#
2348# which constant folding will eat for lunch.  The resulting ternary will
2349# further get cleaned up by the boolean reductions above and we will be
2350# left with just the original variable "a".
2351for op in ['feq', 'fneu', 'ieq', 'ine']:
2352   optimizations += [
2353      ((op, ('bcsel', 'a', '#b', '#c'), '#d'),
2354       ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))),
2355   ]
2356
2357for op in ['flt', 'fge', 'ilt', 'ige', 'ult', 'uge']:
2358   optimizations += [
2359      ((op, ('bcsel', 'a', '#b', '#c'), '#d'),
2360       ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))),
2361      ((op, '#d', ('bcsel', a, '#b', '#c')),
2362       ('bcsel', 'a', (op, 'd', 'b'), (op, 'd', 'c'))),
2363   ]
2364
2365
2366# For example, this converts things like
2367#
2368#    1 + mix(0, a - 1, condition)
2369#
2370# into
2371#
2372#    mix(1, (a-1)+1, condition)
2373#
2374# Other optimizations will rearrange the constants.
2375for op in ['fadd', 'fmul', 'fmulz', 'iadd', 'imul']:
2376   optimizations += [
2377      ((op, ('bcsel(is_used_once)', a, '#b', c), '#d'), ('bcsel', a, (op, b, d), (op, c, d)))
2378   ]
2379
2380# For derivatives in compute shaders, GLSL_NV_compute_shader_derivatives
2381# states:
2382#
2383#     If neither layout qualifier is specified, derivatives in compute shaders
2384#     return zero, which is consistent with the handling of built-in texture
2385#     functions like texture() in GLSL 4.50 compute shaders.
2386for op in ['fddx', 'fddx_fine', 'fddx_coarse',
2387           'fddy', 'fddy_fine', 'fddy_coarse']:
2388   optimizations += [
2389      ((op, 'a'), 0.0, 'info->stage == MESA_SHADER_COMPUTE && info->cs.derivative_group == DERIVATIVE_GROUP_NONE')
2390]
2391
2392# Some optimizations for ir3-specific instructions.
2393optimizations += [
2394   # 'al * bl': If either 'al' or 'bl' is zero, return zero.
2395   (('umul_low', '#a(is_lower_half_zero)', 'b'), (0)),
2396   # '(ah * bl) << 16 + c': If either 'ah' or 'bl' is zero, return 'c'.
2397   (('imadsh_mix16', '#a@32(is_lower_half_zero)', 'b@32', 'c@32'), ('c')),
2398   (('imadsh_mix16', 'a@32', '#b@32(is_upper_half_zero)', 'c@32'), ('c')),
2399]
2400
2401# These kinds of sequences can occur after nir_opt_peephole_select.
2402#
2403# NOTE: fadd is not handled here because that gets in the way of ffma
2404# generation in the i965 driver.  Instead, fadd and ffma are handled in
2405# late_optimizations.
2406
2407for op in ['flrp']:
2408    optimizations += [
2409        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),
2410        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),
2411        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)),
2412        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),
2413        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, e, c, d)), (op, ('bcsel', a, b, e), c, d)),
2414        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', e, c, d)), (op, ('bcsel', a, b, e), c, d)),
2415    ]
2416
2417for op in ['fmulz', 'fmul', 'iadd', 'imul', 'iand', 'ior', 'ixor', 'fmin', 'fmax', 'imin', 'imax', 'umin', 'umax']:
2418    optimizations += [
2419        (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))),
2420        (('bcsel', a, (op + '(is_used_once)', b, 'c(is_not_const)'), (op, b, d)), (op, b, ('bcsel', a, c, d))),
2421        (('bcsel', a, (op, b, 'c(is_not_const)'), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
2422        (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))),
2423    ]
2424
2425for op in ['fpow']:
2426    optimizations += [
2427        (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))),
2428        (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
2429        (('bcsel', a, (op + '(is_used_once)', b, c), (op, d, c)), (op, ('bcsel', a, b, d), c)),
2430        (('bcsel', a, (op, b, c), (op + '(is_used_once)', d, c)), (op, ('bcsel', a, b, d), c)),
2431    ]
2432
for op in ['frcp', 'frsq', 'fsqrt', 'fexp2', 'flog2', 'fsign', 'fsin', 'fcos', 'fsin_amd', 'fcos_amd', 'fneg', 'fabs']:
2434    optimizations += [
2435        (('bcsel', c, (op + '(is_used_once)', a), (op + '(is_used_once)', b)), (op, ('bcsel', c, a, b))),
2436    ]
2437
2438for op in ['ineg', 'iabs', 'inot', 'isign']:
2439    optimizations += [
2440        ((op, ('bcsel', c, '#a', '#b')), ('bcsel', c, (op, a), (op, b))),
2441    ]
2442
2443optimizations.extend([
2444    (('fisnormal', 'a@16'), ('ult', 0xfff, ('iadd', ('ishl', a, 1), 0x800)), 'options->lower_fisnormal'),
2445    (('fisnormal', 'a@32'), ('ult', 0x1ffffff, ('iadd', ('ishl', a, 1), 0x1000000)), 'options->lower_fisnormal'),
2446    (('fisnormal', 'a@64'), ('ult', 0x3fffffffffffff, ('iadd', ('ishl', a, 1), 0x20000000000000)), 'options->lower_fisnormal')
2447    ])
2448
2449# This section contains optimizations to propagate downsizing conversions of
2450# constructed vectors into vectors of downsized components. Whether this is
2451# useful depends on the SIMD semantics of the backend. On a true SIMD machine,
2452# this reduces the register pressure of the vector itself and often enables the
2453# conversions to be eliminated via other algebraic rules or constant folding.
2454# In the worst case on a SIMD architecture, the propagated conversions may be
2455# revectorized via nir_opt_vectorize so instruction count is minimally
2456# impacted.
2457#
2458# On a machine with SIMD-within-a-register only, this actually
# counterintuitively hurts instruction count. These machines are the same ones
# that require vectorize_vec2_16bit, so we predicate the optimizations on that
# flag not being set.
2462#
2463# Finally for scalar architectures, there should be no difference in generated
2464# code since it all ends up scalarized at the end, but it might minimally help
2465# compile-times.
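#
# For example, for i == 2 and T == 'f' the loop below generates:
#    (('f2f16', ('vec2', 'a@32', 'b@32')),
#     ('vec2', ('f2f16', 'a'), ('f2f16', 'b')), '!options->vectorize_vec2_16bit')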
2466
2467for i in range(2, 4 + 1):
2468   for T in ('f', 'u', 'i'):
2469      vec_inst = ('vec' + str(i),)
2470
2471      indices = ['a', 'b', 'c', 'd']
2472      suffix_in = tuple((indices[j] + '@32') for j in range(i))
2473
2474      to_16 = '{}2{}16'.format(T, T)
2475      to_mp = '{}2{}mp'.format(T, T)
2476
2477      out_16 = tuple((to_16, indices[j]) for j in range(i))
2478      out_mp = tuple((to_mp, indices[j]) for j in range(i))
2479
2480      optimizations  += [
2481         ((to_16, vec_inst + suffix_in), vec_inst + out_16, '!options->vectorize_vec2_16bit'),
2482      ]
2483      # u2ump doesn't exist, because it's equal to i2imp
2484      if T in ['f', 'i']:
2485          optimizations  += [
2486             ((to_mp, vec_inst + suffix_in), vec_inst + out_mp, '!options->vectorize_vec2_16bit')
2487          ]
2488
2489# This section contains "late" optimizations that should be run before
2490# creating ffmas and calling regular optimizations for the final time.
2491# Optimizations should go here if they help code generation and conflict
2492# with the regular optimizations.
2493before_ffma_optimizations = [
2494   # Propagate constants down multiplication chains
2495   (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fmul', ('fmul', a, c), b)),
2496   (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('imul', ('imul', a, c), b)),
2497   (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fadd', ('fadd', a, c), b)),
2498   (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('iadd', ('iadd', a, c), b)),
2499
2500   (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
2501   (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
2502   (('~fadd', ('fneg', a), a), 0.0),
2503   (('iadd', ('ineg', a), a), 0),
2504   (('iadd', ('ineg', a), ('iadd', a, b)), b),
2505   (('iadd', a, ('iadd', ('ineg', a), b)), b),
2506   (('~fadd', ('fneg', a), ('fadd', a, b)), b),
2507   (('~fadd', a, ('fadd', ('fneg', a), b)), b),
2508
2509   (('~flrp', ('fadd(is_used_once)', a, -1.0), ('fadd(is_used_once)', a,  1.0), d), ('fadd', ('flrp', -1.0,  1.0, d), a)),
2510   (('~flrp', ('fadd(is_used_once)', a,  1.0), ('fadd(is_used_once)', a, -1.0), d), ('fadd', ('flrp',  1.0, -1.0, d), a)),
2511   (('~flrp', ('fadd(is_used_once)', a, '#b'), ('fadd(is_used_once)', a, '#c'), d), ('fadd', ('fmul', d, ('fadd', c, ('fneg', b))), ('fadd', a, b))),
2512]
2513
2514# This section contains "late" optimizations that should be run after the
2515# regular optimizations have finished.  Optimizations should go here if
2516# they help code generation but do not necessarily produce code that is
2517# more easily optimizable.
2518late_optimizations = [
2519   # The rearrangements are fine w.r.t. NaN.  However, they produce incorrect
2520   # results if one operand is +Inf and the other is -Inf.
2521   #
2522   # 1. Inf + -Inf = NaN
2523   # 2. ∀x: x + NaN = NaN and x - NaN = NaN
2524   # 3. ∀x: x != NaN = true
2525   # 4. ∀x, ∀ cmp ∈ {<, >, ≤, ≥, =}: x cmp NaN = false
2526   #
2527   #               a=Inf, b=-Inf   a=-Inf, b=Inf    a=NaN    b=NaN
2528   #  (a+b) < 0        false            false       false    false
2529   #      a < -b       false            false       false    false
2530   # -(a+b) < 0        false            false       false    false
2531   #     -a < b        false            false       false    false
2532   #  (a+b) >= 0       false            false       false    false
2533   #      a >= -b      true             true        false    false
2534   # -(a+b) >= 0       false            false       false    false
2535   #     -a >= b       true             true        false    false
2536   #  (a+b) == 0       false            false       false    false
2537   #      a == -b      true             true        false    false
2538   #  (a+b) != 0       true             true        true     true
2539   #      a != -b      false            false       true     true
2540   (('flt',                        ('fadd(is_used_once)', a, b),  0.0), ('flt',          a, ('fneg', b))),
2541   (('flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('flt', ('fneg', a),         b)),
2542   (('flt', 0.0,                        ('fadd(is_used_once)', a, b) ), ('flt', ('fneg', a),         b)),
2543   (('flt', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('flt',          a, ('fneg', b))),
2544   (('~fge',                        ('fadd(is_used_once)', a, b),  0.0), ('fge',          a, ('fneg', b))),
2545   (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('fge', ('fneg', a),         b)),
2546   (('~fge', 0.0,                        ('fadd(is_used_once)', a, b) ), ('fge', ('fneg', a),         b)),
2547   (('~fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fge',          a, ('fneg', b))),
2548   (('~feq', ('fadd(is_used_once)', a, b), 0.0), ('feq', a, ('fneg', b))),
2549   (('~fneu', ('fadd(is_used_once)', a, b), 0.0), ('fneu', a, ('fneg', b))),
2550
2551   # If either source must be finite, then the original (a+b) cannot produce
2552   # NaN due to Inf-Inf.  The patterns and the replacements produce the same
2553   # result if b is NaN. Therefore, the replacements are exact.
2554   (('fge',                        ('fadd(is_used_once)', 'a(is_finite)', b),  0.0), ('fge',          a, ('fneg', b))),
2555   (('fge', ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b)), 0.0), ('fge', ('fneg', a),         b)),
2556   (('fge', 0.0,                        ('fadd(is_used_once)', 'a(is_finite)', b) ), ('fge', ('fneg', a),         b)),
2557   (('fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b))), ('fge',          a, ('fneg', b))),
2558   (('feq',  ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('feq',  a, ('fneg', b))),
2559   (('fneu', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('fneu', a, ('fneg', b))),
2560
2561   # This is how SpvOpFOrdNotEqual might be implemented.  Replace it with
2562   # SpvOpLessOrGreater.
2563   (('iand', ('fneu', a, b),   ('iand', ('feq', a, a), ('feq', b, b))), ('ior', ('!flt', a, b), ('!flt', b, a))),
2564   (('iand', ('fneu', a, 0.0),          ('feq', a, a)                ), ('!flt', 0.0, ('fabs', a))),
2565
2566   # This is how SpvOpFUnordEqual might be implemented.  Replace it with
2567   # !SpvOpLessOrGreater.
2568   (('ior', ('feq', a, b),   ('ior', ('fneu', a, a), ('fneu', b, b))), ('inot', ('ior', ('!flt', a, b), ('!flt', b, a)))),
2569   (('ior', ('feq', a, 0.0),         ('fneu', a, a),                ), ('inot', ('!flt', 0.0, ('fabs', a)))),
2570
2571   # nir_lower_to_source_mods will collapse this, but its existence during the
2572   # optimization loop can prevent other optimizations.
2573   (('fneg', ('fneg', a)), a),
2574
2575   # re-combine inexact mul+add to ffma. Do this before fsub so that a * b - c
2576   # gets combined to fma(a, b, -c).
2577   (('~fadd@16', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma16'),
2578   (('~fadd@32', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma32'),
2579   (('~fadd@64', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma64'),
2580   (('~fadd@32', ('fmulz', a, b), c), ('ffmaz', a, b, c), 'options->fuse_ffma32'),
2581
2582   # Subtractions get lowered during optimization, so we need to recombine them
2583   (('fadd@8', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'),
2584   (('fadd@16', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'),
2585   (('fadd@32', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'),
2586   (('fadd@64', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub && !(options->lower_doubles_options & nir_lower_dsub)'),
2587
2588   (('fneg', a), ('fmul', a, -1.0), 'options->lower_fneg'),
2589   (('iadd', a, ('ineg', 'b')), ('isub', 'a', 'b'), 'options->has_isub || options->lower_ineg'),
2590   (('ineg', a), ('isub', 0, a), 'options->lower_ineg'),
2591   (('iabs', a), ('imax', a, ('ineg', a)), 'options->lower_iabs'),
2592
2593   (('iadd', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, b, c), 'options->has_iadd3'),
2594   (('iadd', ('isub(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, ('ineg', b), c), 'options->has_iadd3'),
2595   (('isub', ('isub(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, ('ineg', b), ('ineg', c)), 'options->has_iadd3'),
2596
2597    # fneg_lo / fneg_hi
2598   (('vec2(is_only_used_as_float)', ('fneg@16', a), b), ('fmul', ('vec2', a, b), ('vec2', -1.0, 1.0)), 'options->vectorize_vec2_16bit'),
2599   (('vec2(is_only_used_as_float)', a, ('fneg@16', b)), ('fmul', ('vec2', a, b), ('vec2', 1.0, -1.0)), 'options->vectorize_vec2_16bit'),
2600
2601   # These are duplicated from the main optimizations table.  The late
2602   # patterns that rearrange expressions like x - .5 < 0 to x < .5 can create
2603   # new patterns like these.  The patterns that compare with zero are removed
2604   # because they are unlikely to be created in by anything in
2605   # late_optimizations.
2606   (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),
2607   (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),
2608   (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),
2609   (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)),
2610
2611   (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),
2612
2613   (('~fge', ('fmin(is_used_once)', ('fadd(is_used_once)', a, b), ('fadd', c, d)), 0.0), ('iand', ('fge', a, ('fneg', b)), ('fge', c, ('fneg', d)))),
2614
2615   (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
2616   (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
2617   (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
2618   (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)),
2619   (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)),
2620   (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)),
2621   (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)),
2622   (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)),
2623   (('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)),
2624   (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),
2625
2626   (('ior', a, a), a),
2627   (('iand', a, a), a),
2628
2629   (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),
2630
2631   (('fdot2', a, b), ('fdot2_replicated', a, b), 'options->fdot_replicates'),
2632   (('fdot3', a, b), ('fdot3_replicated', a, b), 'options->fdot_replicates'),
2633   (('fdot4', a, b), ('fdot4_replicated', a, b), 'options->fdot_replicates'),
2634   (('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'),
2635
2636   (('~flrp', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)),
2637
2638   # Approximate handling of fround_even for DX9 addressing from gallium nine on
2639   # DX9-class hardware with no proper fround support.  This is in
2640   # late_optimizations so that the is_integral() opts in the main pass get a
2641   # chance to eliminate the fround_even first.
2642   (('fround_even', a), ('bcsel',
2643                         ('feq', ('ffract', a), 0.5),
2644                         ('fadd', ('ffloor', ('fadd', a, 0.5)), 1.0),
2645                         ('ffloor', ('fadd', a, 0.5))), 'options->lower_fround_even'),
2646
2647   # A similar operation could apply to any ffma(#a, b, #(-a/2)), but this
2648   # particular operation is common for expanding values stored in a texture
2649   # from [0,1] to [-1,1].
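   # (flrp(-1, 1, a) == -1 + a*(1 - (-1)) == 2*a - 1, which is exactly the
   # ffma(a, 2.0, -1.0) pattern below.)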
2650   (('~ffma@32', a,  2.0, -1.0), ('flrp', -1.0,  1.0,          a ), '!options->lower_flrp32'),
2651   (('~ffma@32', a, -2.0, -1.0), ('flrp', -1.0,  1.0, ('fneg', a)), '!options->lower_flrp32'),
2652   (('~ffma@32', a, -2.0,  1.0), ('flrp',  1.0, -1.0,          a ), '!options->lower_flrp32'),
2653   (('~ffma@32', a,  2.0,  1.0), ('flrp',  1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'),
2654   (('~fadd@32', ('fmul(is_used_once)',  2.0, a), -1.0), ('flrp', -1.0,  1.0,          a ), '!options->lower_flrp32'),
2655   (('~fadd@32', ('fmul(is_used_once)', -2.0, a), -1.0), ('flrp', -1.0,  1.0, ('fneg', a)), '!options->lower_flrp32'),
2656   (('~fadd@32', ('fmul(is_used_once)', -2.0, a),  1.0), ('flrp',  1.0, -1.0,          a ), '!options->lower_flrp32'),
2657   (('~fadd@32', ('fmul(is_used_once)',  2.0, a),  1.0), ('flrp',  1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'),
2658
2659    # flrp(a, b, a)
2660    # a*(1-a) + b*a
2661    # a + -a*a + a*b    (1)
2662    # a + a*(b - a)
2663    # Option 1: ffma(a, (b-a), a)
2664    #
2665    # Alternately, after (1):
2666    # a*(1+b) + -a*a
2667    # a*((1+b) + -a)
2668    #
2669    # Let b=1
2670    #
2671    # Option 2: ffma(a, 2, -(a*a))
2672    # Option 3: ffma(a, 2, (-a)*a)
    # Option 4: ffma(a, -a, (2*a))
2674    # Option 5: a * (2 - a)
2675    #
2676    # There are a lot of other possible combinations.
2677   (('~ffma@32', ('fadd', b, ('fneg', a)), a, a), ('flrp', a, b, a), '!options->lower_flrp32'),
2678   (('~ffma@32', a, 2.0, ('fneg', ('fmul', a, a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
2679   (('~ffma@32', a, 2.0, ('fmul', ('fneg', a), a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
2680   (('~ffma@32', a, ('fneg', a), ('fmul', 2.0, a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
2681   (('~fmul@32', a, ('fadd', 2.0, ('fneg', a))),    ('flrp', a, 1.0, a), '!options->lower_flrp32'),
2682
2683   # we do these late so that we don't get in the way of creating ffmas
2684   (('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))),
2685   (('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))),
2686
2687   # Putting this in 'optimizations' interferes with the bcsel(a, op(b, c),
2688   # op(b, d)) => op(b, bcsel(a, c, d)) transformations.  I do not know why.
2689   (('bcsel', ('feq', ('fsqrt', 'a(is_not_negative)'), 0.0), intBitsToFloat(0x7f7fffff), ('frsq', a)),
2690    ('fmin', ('frsq', a), intBitsToFloat(0x7f7fffff))),
2691
2692   # Things that look like DPH in the source shader may get expanded to
2693   # something that looks like dot(v1.xyz, v2.xyz) + v1.w by the time it gets
2694   # to NIR.  After FFMA is generated, this can look like:
2695   #
2696   #    fadd(ffma(v1.z, v2.z, ffma(v1.y, v2.y, fmul(v1.x, v2.x))), v1.w)
2697   #
2698   # Reassociate the last addition into the first multiplication.
2699   #
2700   # Some shaders do not use 'invariant' in vertex and (possibly) geometry
2701   # shader stages on some outputs that are intended to be invariant.  For
2702   # various reasons, this optimization may not be fully applied in all
2703   # shaders used for different rendering passes of the same geometry.  This
2704   # can result in Z-fighting artifacts (at best).  For now, disable this
2705   # optimization in these stages.  See bugzilla #111490.  In tessellation
2706   # stages applications seem to use 'precise' when necessary, so allow the
2707   # optimization in those stages.
2708   (('~fadd', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'),
2709    ('ffma', a, b, ('ffma', c, d, ('ffma', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
2710   (('~fadd', ('ffma(is_used_once)', a, b, ('fmul(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'),
2711    ('ffma', a, b, ('ffma', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
2712   (('~fadd', ('fneg', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'),
2713    ('ffma', ('fneg', a), b, ('ffma', ('fneg', c), d, ('ffma', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
2714
2715   (('~fadd', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'),
2716    ('ffmaz', a, b, ('ffmaz', c, d, ('ffmaz', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
2717   (('~fadd', ('ffmaz(is_used_once)', a, b, ('fmulz(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'),
2718    ('ffmaz', a, b, ('ffmaz', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
2719   (('~fadd', ('fneg', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'),
2720    ('ffmaz', ('fneg', a), b, ('ffmaz', ('fneg', c), d, ('ffmaz', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
2721
2722   # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says:
2723   #
2724   #    If bits is zero, the result will be zero.
2725   #
2726   # These prevent the next two lowerings generating incorrect results when
2727   # count is zero.
2728   (('ubfe', a, b, 0), 0),
2729   (('ibfe', a, b, 0), 0),
2730
2731   # On Intel GPUs, BFE is a 3-source instruction.  Like all 3-source
   # instructions on Intel GPUs, it cannot have immediate values as
   # sources.  There are also limitations on source register strides.  As a
   # result, it is very easy for a 3-source instruction combined with either
2735   # loads of immediate values or copies from weird register strides to be
2736   # more expensive than the primitive instructions it represents.
2737   (('ubfe', a, '#b', '#c'), ('iand', ('ushr', 0xffffffff, ('ineg', c)), ('ushr', a, b)), 'options->avoid_ternary_with_two_constants'),
2738
2739   # b is the lowest order bit to be extracted and c is the number of bits to
2740   # extract.  The inner shift removes the bits above b + c by shifting left
2741   # 32 - (b + c).  ishl only sees the low 5 bits of the shift count, which is
2742   # -(b + c).  The outer shift moves the bit that was at b to bit zero.
2743   # After the first shift, that bit is now at b + (32 - (b + c)) or 32 - c.
2744   # This means that it must be shifted right by 32 - c or -c bits.
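   # For example, with b == 4 and c == 8 on a 32-bit value: -(b + c) == -12,
   # which ishl sees as 20 == 32 - 12, and -c == -8, which ishr sees as
   # 24 == 32 - 8, so bits 4..11 of the source land sign-extended in bits 0..7.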
2745   (('ibfe', a, '#b', '#c'), ('ishr', ('ishl', a, ('ineg', ('iadd', b, c))), ('ineg', c)), 'options->avoid_ternary_with_two_constants'),
2746
2747   # Clean up no-op shifts that may result from the bfe lowerings.
2748   (('ishl', a, 0), a),
2749   (('ishl', a, -32), a),
2750   (('ishr', a, 0), a),
2751   (('ishr', a, -32), a),
2752   (('ushr', a, 0), a),
2753
2754   (('extract_i8', ('extract_i8', a, b), 0), ('extract_i8', a, b)),
2755   (('extract_i8', ('extract_u8', a, b), 0), ('extract_i8', a, b)),
2756   (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)),
2757   (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)),
2758]
2759
2760# A few more extract cases we'd rather leave late
2761for N in [16, 32]:
2762    aN = 'a@{0}'.format(N)
2765
2766    for x in ['u', 'i']:
2767        x2xN = '{0}2{0}{1}'.format(x, N)
2768        extract_x8 = 'extract_{0}8'.format(x)
2769        extract_x16 = 'extract_{0}16'.format(x)
2770
2771        late_optimizations.extend([
2772            ((x2xN, ('u2u8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'),
2773            ((x2xN, ('i2i8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'),
2774        ])
2775
2776        if N > 16:
2777            late_optimizations.extend([
2778                ((x2xN, ('u2u16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'),
2779                ((x2xN, ('i2i16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'),
2780            ])
2781
2782# Byte insertion
2783late_optimizations.extend([(('ishl', ('extract_u8', 'a@32', 0), 8 * i), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)])
2784late_optimizations.extend([(('iand', ('ishl', 'a@32', 8 * i), 0xff << (8 * i)), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)])
2785late_optimizations.append((('ishl', 'a@32', 24), ('insert_u8', a, 3), '!options->lower_insert_byte'))
2786
2787late_optimizations += [
2788   # Word insertion
2789   (('ishl', 'a@32', 16), ('insert_u16', a, 1), '!options->lower_insert_word'),
2790
2791   # Extract and then insert
2792   (('insert_u8', ('extract_u8', 'a', 0), b), ('insert_u8', a, b)),
2793   (('insert_u16', ('extract_u16', 'a', 0), b), ('insert_u16', a, b)),
2794]
2795
2796# Integer sizes
2797for s in [8, 16, 32, 64]:
2798    late_optimizations.extend([
2799        (('iand', ('ine(is_used_once)', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umin', a, b), 0)),
2800        (('ior',  ('ieq(is_used_once)', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umin', a, b), 0)),
2801    ])
2802
2803# Float sizes
2804for s in [16, 32, 64]:
2805    late_optimizations.extend([
2806       (('~fadd@{}'.format(s), 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp{}'.format(s)),
2807       (('bcsel', a, 0, ('b2f{}'.format(s), ('inot', 'b@bool'))), ('b2f{}'.format(s), ('inot', ('ior', a, b)))),
2808    ])
2809
2810for op in ['fadd']:
2811    late_optimizations += [
2812        (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))),
2813        (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
2814    ]
2815
2816for op in ['ffma', 'ffmaz']:
2817    late_optimizations += [
2818        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),
2819        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),
2820
2821        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)),
2822        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),
2823    ]
2824
2825# mediump: If an opcode is surrounded by conversions, remove the conversions.
2826# The rationale is that type conversions + the low precision opcode are more
# expensive than the same arithmetic opcode at higher precision.
2828#
2829# This must be done in late optimizations, because we need normal optimizations to
2830# first eliminate temporary up-conversions such as in op1(f2fmp(f2f32(op2()))).
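#
# For example, f2f32(fadd(f2fmp(a), f2fmp(b))) becomes fadd(a, b).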
2831#
2832# Unary opcodes
2833for op in ['fabs', 'fceil', 'fcos', 'fddx', 'fddx_coarse', 'fddx_fine', 'fddy',
2834           'fddy_coarse', 'fddy_fine', 'fexp2', 'ffloor', 'ffract', 'flog2', 'fneg',
2835           'frcp', 'fround_even', 'frsq', 'fsat', 'fsign', 'fsin', 'fsqrt']:
2836    late_optimizations += [(('~f2f32', (op, ('f2fmp', a))), (op, a))]
2837
2838# Binary opcodes
2839for op in ['fadd', 'fdiv', 'fmax', 'fmin', 'fmod', 'fmul', 'fpow', 'frem']:
2840    late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b))), (op, a, b))]
2841
2842# Ternary opcodes
2843for op in ['ffma', 'flrp']:
2844    late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b), ('f2fmp', c))), (op, a, b, c))]
2845
2846# Comparison opcodes
2847for op in ['feq', 'fge', 'flt', 'fneu']:
2848    late_optimizations += [(('~' + op, ('f2fmp', a), ('f2fmp', b)), (op, a, b))]
2849
2850# Do this last, so that the f2fmp patterns above have effect.
2851late_optimizations += [
2852  # Convert *2*mp instructions to concrete *2*16 instructions. At this point
2853  # any conversions that could have been removed will have been removed in
2854  # nir_opt_algebraic so any remaining ones are required.
2855  (('f2fmp', a), ('f2f16', a)),
2856  (('f2imp', a), ('f2i16', a)),
2857  (('f2ump', a), ('f2u16', a)),
2858  (('i2imp', a), ('i2i16', a)),
2859  (('i2fmp', a), ('i2f16', a)),
2860  (('i2imp', a), ('u2u16', a)),
2861  (('u2fmp', a), ('u2f16', a)),
2862  (('fisfinite', a), ('flt', ('fabs', a), float("inf"))),
2863]
2864
2865distribute_src_mods = [
2866   # Try to remove some spurious negations rather than pushing them down.
2867   (('fmul', ('fneg', a), ('fneg', b)), ('fmul', a, b)),
2868   (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)),
2869   (('fdot2_replicated', ('fneg', a), ('fneg', b)), ('fdot2_replicated', a, b)),
2870   (('fdot3_replicated', ('fneg', a), ('fneg', b)), ('fdot3_replicated', a, b)),
2871   (('fdot4_replicated', ('fneg', a), ('fneg', b)), ('fdot4_replicated', a, b)),
2872   (('fneg', ('fneg', a)), a),
2873
2874   (('fneg', ('fmul(is_used_once)', a, b)), ('fmul', ('fneg', a), b)),
2875   (('fabs', ('fmul(is_used_once)', a, b)), ('fmul', ('fabs', a), ('fabs', b))),
2876
2877   (('fneg', ('ffma(is_used_once)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))),
2878   (('fneg', ('flrp(is_used_once)', a, b, c)), ('flrp', ('fneg', a), ('fneg', b), c)),
2879   (('fneg', ('~fadd(is_used_once)', a, b)), ('fadd', ('fneg', a), ('fneg', b))),
2880
2881   # Note that fmin <-> fmax.  I don't think there is a way to distribute
2882   # fabs() into fmin or fmax.
2883   (('fneg', ('fmin(is_used_once)', a, b)), ('fmax', ('fneg', a), ('fneg', b))),
2884   (('fneg', ('fmax(is_used_once)', a, b)), ('fmin', ('fneg', a), ('fneg', b))),
2885
2886   (('fneg', ('fdot2_replicated(is_used_once)', a, b)), ('fdot2_replicated', ('fneg', a), b)),
2887   (('fneg', ('fdot3_replicated(is_used_once)', a, b)), ('fdot3_replicated', ('fneg', a), b)),
2888   (('fneg', ('fdot4_replicated(is_used_once)', a, b)), ('fdot4_replicated', ('fneg', a), b)),
2889
2890   # fdph works mostly like fdot, but to get the correct result, the negation
2891   # must be applied to the second source.
2892   (('fneg', ('fdph_replicated(is_used_once)', a, b)), ('fdph_replicated', a, ('fneg', b))),
2893
2894   (('fneg', ('fsign(is_used_once)', a)), ('fsign', ('fneg', a))),
2895   (('fabs', ('fsign(is_used_once)', a)), ('fsign', ('fabs', a))),
2896]
2897
2898print(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render())
2899print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma",
2900                                  before_ffma_optimizations).render())
2901print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_late",
2902                                  late_optimizations).render())
2903print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_distribute_src_mods",
2904                                  distribute_src_mods).render())
2905