# -*- coding: utf-8 -*-
#
# Copyright (C) 2014 Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
# Authors:
#    Jason Ekstrand (jason@jlekstrand.net)

from collections import OrderedDict
import nir_algebraic
from nir_opcodes import type_sizes
import itertools
import struct
from math import pi
import math

# Convenience variables
a = 'a'
b = 'b'
c = 'c'
d = 'd'
e = 'e'

# C condition strings; rules below prefix them with '!' to apply only when
# the shader's float controls do not require preserving signed zero/Inf/NaN
# at the given bit size.
signed_zero_inf_nan_preserve_16 = 'nir_is_float_control_signed_zero_inf_nan_preserve(info->float_controls_execution_mode, 16)'
signed_zero_inf_nan_preserve_32 = 'nir_is_float_control_signed_zero_inf_nan_preserve(info->float_controls_execution_mode, 32)'

ignore_exact = nir_algebraic.ignore_exact

# Written in the form (<search>, <replace>) where <search> is an expression
# and <replace> is either an expression or a value.  An expression is
# defined as a tuple of the form ([~]<op>, <src0>, <src1>, <src2>, <src3>)
# where each source is either an expression or a value.  A value can be
# either a numeric constant or a string representing a variable name.
#
# If the opcode in a search expression is prefixed by a '~' character, this
# indicates that the operation is inexact.  Such operations will only get
# applied to SSA values that do not have the exact bit set.  This should be
# used by any optimizations that are not bit-for-bit exact.  It should not,
# however, be used for backend-requested lowering operations as those need to
# happen regardless of precision.
#
# Variable names are specified as "[#]name[@type][(cond)][.swiz]" where:
# "#" indicates that the given variable will only match constants,
# type indicates that the given variable will only match values from ALU
#    instructions with the given output type,
# (cond) specifies an additional condition function (see nir_search_helpers.h),
# swiz is a swizzle applied to the variable (only in the <replace> expression)
#
# For constants, you have to be careful to make sure that it is the right
# type because python is unaware of the source and destination types of the
# opcodes.
#
# All expression types can have a bit-size specified.  For opcodes, this
# looks like "op@32", for variables it is "a@32" or "a@uint32" to specify a
# type and size.  In the search half of the expression this indicates that it
# should only match that particular bit-size.  In the replace half of the
# expression this indicates that the constructed value should have that
# bit-size.
#
# If the opcode in a replacement expression is prefixed by a '!' character,
# this indicates that the new expression will be marked exact.
#
# A special condition "many-comm-expr" can be used with expressions to note
# that the expression and its subexpressions have more commutative expressions
# than nir_replace_instr can handle.  If this special condition is needed with
# another condition, the two can be separated by a comma (e.g.,
# "(many-comm-expr,is_used_once)").

# based on https://web.archive.org/web/20180105155939/http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648
def lowered_sincos(c):
    """Build a replacement expression approximating sine/cosine of variable 'a'.

    The input is scaled by 0.5/pi, offset by *c* and wrapped with ffract, so
    *c* acts as a phase offset expressed as a fraction of one period.  The
    wrapped value is remapped to x = 2*fract(...) - 1, then refined in two
    steps: x = 4*(x - x*|x|) followed by ffma(ffma(x, |x|, -x), 0.225, x).

    Returns the nested expression tuple; nothing is evaluated numerically.
    """
    x = ('fsub', ('fmul', 2.0, ('ffract', ('fadd', ('fmul', 0.5 / pi, a), c))), 1.0)
    x = ('fmul', ('fsub', x, ('fmul', x, ('fabs', x))), 4.0)
    return ('ffma', ('ffma', x, ('fabs', x), ('fneg', x)), 0.225, x)

def intBitsToFloat(i):
    """Reinterpret the 32-bit unsigned integer *i* as a 32-bit float.

    *i* is packed as a big-endian unsigned 32-bit integer and the same four
    bytes are unpacked as a big-endian 32-bit float.  struct.pack raises
    struct.error when *i* does not fit in an unsigned 32-bit integer.
    """
    return struct.unpack('!f', struct.pack('!I', i))[0]

optimizations = [

    (('imul', a, '#b(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
    (('imul', 'a@8', 0x80), ('ishl', a, 7), '!options->lower_bitops'),
    (('imul', 'a@16', 0x8000), ('ishl', a, 15), '!options->lower_bitops'),
    (('imul', 'a@32', 0x80000000), ('ishl', a, 31), '!options->lower_bitops'),
    (('imul', 'a@64', 0x8000000000000000), ('ishl', a, 63), '!options->lower_bitops'),
    (('imul', a, '#b(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
    (('ishl', a, '#b'), ('imul', a, ('ishl', 1, b)), 'options->lower_bitops'),

    (('imul@64', a, '#b(is_bitcount2)'), ('iadd', ('ishl', a, ('ufind_msb', b)), ('ishl', a, ('find_lsb', b))),
     '!options->lower_bitops && (options->lower_int64_options & (nir_lower_imul64 | nir_lower_shift64)) == nir_lower_imul64'),

    (('unpack_64_2x32_split_x', ('imul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
    (('unpack_64_2x32_split_x', ('umul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
    (('imul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('imul_high', a, b)), 'options->lower_mul_2x32_64'),
    (('umul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('umul_high', a, b)), 'options->lower_mul_2x32_64'),
    (('udiv', a, 1), a),
    (('idiv', a, 1), a),
    (('umod', a, 1), 0),
    (('imod', a, 1), 0),
    (('imod', a, -1), 0),
    (('irem', a, 1), 0),
    (('irem', a, -1), 0),
    (('udiv', a, '#b(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b)), '!options->lower_bitops'),
    (('idiv', a, '#b(is_pos_power_of_two)'), ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', b))), '!options->lower_bitops'),
    (('idiv', a, '#b(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), '!options->lower_bitops'),
    (('umod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'),
    (('imod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'),
    (('imod', a, '#b(is_neg_power_of_two)'), ('bcsel', ('ieq', ('ior', a, b), b), 0, ('ior', a, b)), '!options->lower_bitops'),
    # 'irem(a, b)' -> 'a - ((a < 0 ? (a + b - 1) : a) & -b)'
    (('irem', a, '#b(is_pos_power_of_two)'),
     ('isub', a, ('iand', ('bcsel', ('ilt', a, 0), ('iadd', a, ('isub', b, 1)), a), ('ineg', b))),
     '!options->lower_bitops'),
    (('irem', a, '#b(is_neg_power_of_two)'), ('irem', a, ('iabs', b)), '!options->lower_bitops'),

    (('~fneg', ('fneg', a)), a),
    (('ineg', ('ineg', a)), a),
    (('fabs', ('fneg', a)), ('fabs', a)),
    (('fabs', ('u2f', a)), ('u2f', a)),
    (('iabs', ('iabs', a)), ('iabs', a)),
    (('iabs', ('ineg', a)), ('iabs', a)),
    (('f2b', ('fneg', a)), ('f2b', a)),
    (('i2b', ('ineg', a)), ('i2b', a)),
    (('~fadd', a, 0.0), a),
    # a+0.0 is 'a' unless 'a' is denormal or -0.0. If it's only used by a
    # floating point instruction, they should flush any input denormals and we
    # can replace -0.0 with 0.0 if the float execution mode allows it.
    (('fadd(is_only_used_as_float)', 'a@16', 0.0), a, '!'+signed_zero_inf_nan_preserve_16),
    (('fadd(is_only_used_as_float)', 'a@32', 0.0), a, '!'+signed_zero_inf_nan_preserve_32),
    (('iadd', a, 0), a),
    (('iadd_sat', a, 0), a),
    (('isub_sat', a, 0), a),
    (('uadd_sat', a, 0), a),
    (('usub_sat', a, 0), a),
    (('usadd_4x8_vc4', a, 0), a),
    (('usadd_4x8_vc4', a, ~0), ~0),
    (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
    (('~fadd', ('fmulz', a, b), ('fmulz', a, c)), ('fmulz', a, ('fadd', b, c))),
    (('~ffma', a, b, ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),
    (('~ffma', a, b, ('fmul(is_used_once)', a, c)), ('fmul', a, ('fadd', b, c))),
    (('~fadd', ('fmul(is_used_once)', a, b), ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),
    (('~ffma', a, ('fmul(is_used_once)', b, c), ('fmul(is_used_once)', b, d)), ('fmul', b, ('ffma', a, c, d))),
    (('~ffmaz', a, b, ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)),
    (('~ffmaz', a, b, ('fmulz(is_used_once)', a, c)), ('fmulz', a, ('fadd', b, c))),
    (('~fadd', ('fmulz(is_used_once)', a, b), ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)),
    (('~ffmaz', a, ('fmulz(is_used_once)', b, c), ('fmulz(is_used_once)', b, d)), ('fmulz', b, ('ffmaz', a, c, d))),
    (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
    (('iand', ('ior', a, b), ('ior', a, c)), ('ior', a, ('iand', b, c))),
    (('ior', ('iand', a, b), ('iand', a, c)), ('iand', a, ('ior', b, c))),
    (('~fadd', ('fneg', a), a), 0.0),
    (('iadd', ('ineg', a), a), 0),
    (('iadd', ('ineg', a), ('iadd', a, b)), b),
    (('iadd', a, ('iadd', ('ineg', a), b)), b),
    (('~fadd', ('fneg', a), ('fadd', a, b)), b),
    (('~fadd', a, ('fadd', ('fneg', a), b)), b),
    (('fadd', ('fsat', a), ('fsat', ('fneg', a))), ('fsat', ('fabs', a))),
    (('~fmul', a, 0.0), 0.0),
    # The only effect a*0.0 should have is when 'a' is infinity, -0.0 or NaN
    (('fmul', 'a@16', 0.0), 0.0, '!'+signed_zero_inf_nan_preserve_16),
    (('fmul', 'a@32', 0.0), 0.0, '!'+signed_zero_inf_nan_preserve_32),
    (('fmulz', a, 0.0), 0.0),
    (('fmulz', a, 'b(is_finite_not_zero)'), ('fmul', a, b), '!'+signed_zero_inf_nan_preserve_32),
    (('fmulz', 'a(is_finite)', 'b(is_finite)'), ('fmul', a, b)),
    (('fmulz', a, a), ('fmul', a, a)),
    (('ffmaz', a, 'b(is_finite_not_zero)', c), ('ffma', a, b, c), '!'+signed_zero_inf_nan_preserve_32),
    (('ffmaz', 'a(is_finite)', 'b(is_finite)', c), ('ffma', a, b, c)),
    (('ffmaz', a, a, b), ('ffma', a, a, b)),
    (('imul', a, 0), 0),
    (('umul_unorm_4x8_vc4', a, 0), 0),
    (('umul_unorm_4x8_vc4', a, ~0), a),
    (('~fmul', a, 1.0), a),
    (('~fmulz', a, 1.0), a),
    # The only effect a*1.0 can have is flushing denormals. If it's only used by
    # a floating point instruction, they should flush any input denormals and
    # this multiplication isn't needed.
    (('fmul(is_only_used_as_float)', a, 1.0), a),
    (('imul', a, 1), a),
    (('fmul', a, -1.0), ('fneg', a)),
    (('imul', a, -1), ('ineg', a)),
    # If a < 0: fsign(a)*a*a => -1*a*a => -a*a => abs(a)*a
    # If a > 0: fsign(a)*a*a => 1*a*a => a*a => abs(a)*a
    # If a == 0: fsign(a)*a*a => 0*0*0 => abs(0)*0
    # If a != a: fsign(a)*a*a => 0*NaN*NaN => abs(NaN)*NaN
    (('fmul', ('fsign', a), ('fmul', a, a)), ('fmul', ('fabs', a), a)),
    (('fmul', ('fmul', ('fsign', a), a), a), ('fmul', ('fabs', a), a)),
    (('~ffma', 0.0, a, b), b),
    (('ffma@16(is_only_used_as_float)', 0.0, a, b), b, '!'+signed_zero_inf_nan_preserve_16),
    (('ffma@32(is_only_used_as_float)', 0.0, a, b), b, '!'+signed_zero_inf_nan_preserve_32),
    (('ffmaz', 0.0, a, b), ('fadd', 0.0, b)),
    (('~ffma', a, b, 0.0), ('fmul', a, b)),
    (('ffma@16', a, b, 0.0), ('fmul', a, b), '!'+signed_zero_inf_nan_preserve_16),
    (('ffma@32', a, b, 0.0), ('fmul', a, b), '!'+signed_zero_inf_nan_preserve_32),
    (('ffmaz', a, b, 0.0), ('fmulz', a, b), '!'+signed_zero_inf_nan_preserve_32),
    (('ffma', 1.0, a, b), ('fadd', a, b)),
    (('ffmaz', 1.0, a, b), ('fadd', a, b), '!'+signed_zero_inf_nan_preserve_32),
    (('ffma', -1.0, a, b), ('fadd', ('fneg', a), b)),
    (('ffmaz', -1.0, a, b), ('fadd', ('fneg', a), b), '!'+signed_zero_inf_nan_preserve_32),
    (('~ffma', '#a', '#b', c), ('fadd', ('fmul', a, b), c)),
    (('~ffmaz', '#a', '#b', c), ('fadd', ('fmulz', a, b), c)),
    (('~flrp', a, b, 0.0), a),
    (('~flrp', a, b, 1.0), b),
    (('~flrp', a, a, b), a),
    (('~flrp', 0.0, a, b), ('fmul', a, b)),

    # flrp(a, a + b, c) => a + flrp(0, b, c) => a + (b * c)
    (('~flrp', a, ('fadd(is_used_once)', a, b), c), ('fadd', ('fmul', b, c), a)),

    (('sdot_4x8_iadd', a, 0, b), b),
    (('udot_4x8_uadd', a, 0, b), b),
    (('sdot_4x8_iadd_sat', a, 0, b), b),
    (('udot_4x8_uadd_sat', a, 0, b), b),
    (('sdot_2x16_iadd', a, 0, b), b),
    (('udot_2x16_uadd', a, 0, b), b),
    (('sdot_2x16_iadd_sat', a, 0, b), b),
    (('udot_2x16_uadd_sat', a, 0, b), b),

    # sudot_4x8_iadd is not commutative at all, so the patterns must be
    # duplicated with zeros on each of the first positions.
    (('sudot_4x8_iadd', a, 0, b), b),
    (('sudot_4x8_iadd', 0, a, b), b),
    (('sudot_4x8_iadd_sat', a, 0, b), b),
    (('sudot_4x8_iadd_sat', 0, a, b), b),

    (('iadd', ('sdot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_4x8_iadd', a, b, ('iadd', c, d))),
    (('iadd', ('udot_4x8_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_4x8_uadd', a, b, ('iadd', c, d))),
    (('iadd', ('sudot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sudot_4x8_iadd', a, b, ('iadd', c, d))),
    (('iadd', ('sdot_2x16_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_2x16_iadd', a, b, ('iadd', c, d))),
    (('iadd', ('udot_2x16_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_2x16_uadd', a, b, ('iadd', c, d))),

    # Try to let constant folding eliminate the dot-product part.  These are
    # safe because the dot product cannot overflow 32 bits.
    (('iadd', ('sdot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sdot_4x8_iadd', a, b, c)),
    (('iadd', ('udot_4x8_uadd', 'a(is_not_const)', b, 0), c), ('udot_4x8_uadd', a, b, c)),
    (('iadd', ('sudot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sudot_4x8_iadd', a, b, c)),
    (('iadd', ('sudot_4x8_iadd', a, 'b(is_not_const)', 0), c), ('sudot_4x8_iadd', a, b, c)),
    (('iadd', ('sdot_2x16_iadd', 'a(is_not_const)', b, 0), c), ('sdot_2x16_iadd', a, b, c)),
    (('iadd', ('udot_2x16_uadd', 'a(is_not_const)', b, 0), c), ('udot_2x16_uadd', a, b, c)),
    (('sdot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_4x8_iadd', a, b, 0), c)),
    (('udot_4x8_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_4x8_uadd', a, b, 0), c)),
    (('sudot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sudot_4x8_iadd', a, b, 0), c)),
    (('sdot_2x16_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_2x16_iadd', a, b, 0), c)),
    (('udot_2x16_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_2x16_uadd', a, b, 0), c)),
    (('sdot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
    (('udot_4x8_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_4x8_uadd', a, b, 0), c), '!options->lower_uadd_sat'),
    (('sudot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sudot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
    (('sdot_2x16_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_2x16_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
    (('udot_2x16_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_2x16_uadd', a, b, 0), c), '!options->lower_uadd_sat'),

    # Optimize open-coded fmulz.
    # (b==0.0 ? 0.0 : a) * (a==0.0 ? 0.0 : b) -> fmulz(a, b)
    (('fmul@32', ('bcsel', ignore_exact('feq', b, 0.0), 0.0, a), ('bcsel', ignore_exact('feq', a, 0.0), 0.0, b)),
     ('fmulz', a, b), 'options->has_fmulz && !'+signed_zero_inf_nan_preserve_32),
    (('fmul@32', a, ('bcsel', ignore_exact('feq', a, 0.0), 0.0, '#b(is_not_const_zero)')),
     ('fmulz', a, b), 'options->has_fmulz && !'+signed_zero_inf_nan_preserve_32),

    # ffma(b==0.0 ? 0.0 : a, a==0.0 ? 0.0 : b, c) -> ffmaz(a, b, c)
    (('ffma@32', ('bcsel', ignore_exact('feq', b, 0.0), 0.0, a), ('bcsel', ignore_exact('feq', a, 0.0), 0.0, b), c),
     ('ffmaz', a, b, c), 'options->has_fmulz && !'+signed_zero_inf_nan_preserve_32),
    (('ffma@32', a, ('bcsel', ignore_exact('feq', a, 0.0), 0.0, '#b(is_not_const_zero)'), c),
     ('ffmaz', a, b, c), 'options->has_fmulz && !'+signed_zero_inf_nan_preserve_32),
]

# Shorthand for the expansion of just the dot product part of the [iu]dp4a
# instructions.
sdot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_i8', b, 0)),
                                 ('imul', ('extract_i8', a, 1), ('extract_i8', b, 1))),
                        ('iadd', ('imul', ('extract_i8', a, 2), ('extract_i8', b, 2)),
                                 ('imul', ('extract_i8', a, 3), ('extract_i8', b, 3))))
udot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_u8', a, 0), ('extract_u8', b, 0)),
                                 ('imul', ('extract_u8', a, 1), ('extract_u8', b, 1))),
                        ('iadd', ('imul', ('extract_u8', a, 2), ('extract_u8', b, 2)),
                                 ('imul', ('extract_u8', a, 3), ('extract_u8', b, 3))))
sudot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_u8', b, 0)),
                                  ('imul', ('extract_i8', a, 1), ('extract_u8', b, 1))),
                         ('iadd', ('imul', ('extract_i8', a, 2), ('extract_u8', b, 2)),
                                  ('imul', ('extract_i8', a, 3), ('extract_u8', b, 3))))
sdot_2x16_a_b = ('iadd', ('imul', ('extract_i16', a, 0), ('extract_i16', b, 0)),
                         ('imul', ('extract_i16', a, 1), ('extract_i16', b, 1)))
udot_2x16_a_b = ('iadd', ('imul', ('extract_u16', a, 0), ('extract_u16', b, 0)),
                         ('imul', ('extract_u16', a, 1), ('extract_u16', b, 1)))

# Expand the dot-product-accumulate opcodes into explicit extract/multiply/add
# chains when the matching options->has_* flag is not set.
optimizations.extend([
    (('sdot_4x8_iadd', a, b, c), ('iadd', sdot_4x8_a_b, c), '!options->has_sdot_4x8'),
    (('udot_4x8_uadd', a, b, c), ('iadd', udot_4x8_a_b, c), '!options->has_udot_4x8'),
    (('sudot_4x8_iadd', a, b, c), ('iadd', sudot_4x8_a_b, c), '!options->has_sudot_4x8'),
    (('sdot_2x16_iadd', a, b, c), ('iadd', sdot_2x16_a_b, c), '!options->has_dot_2x16'),
    (('udot_2x16_uadd', a, b, c), ('iadd', udot_2x16_a_b, c), '!options->has_dot_2x16'),

    # For the unsigned dot-product, the largest possible value 4*(255*255) =
    # 0x3f804, so we don't have to worry about that intermediate result
    # overflowing.  0x100000000 - 0x3f804 = 0xfffc07fc.  If c is a constant
    # that is less than 0xfffc07fc, then the result cannot overflow ever.
    (('udot_4x8_uadd_sat', a, b, '#c(is_ult_0xfffc07fc)'), ('udot_4x8_uadd', a, b, c)),
    (('udot_4x8_uadd_sat', a, b, c), ('uadd_sat', udot_4x8_a_b, c), '!options->has_udot_4x8'),

    # For the signed dot-product, the largest positive value is 4*(-128*-128) =
    # 0x10000, and the largest negative value is 4*(-128*127) = -0xfe00.  We
    # don't have to worry about that intermediate result overflowing or
    # underflowing.
    (('sdot_4x8_iadd_sat', a, b, c), ('iadd_sat', sdot_4x8_a_b, c), '!options->has_sdot_4x8'),

    (('sudot_4x8_iadd_sat', a, b, c), ('iadd_sat', sudot_4x8_a_b, c), '!options->has_sudot_4x8'),

    (('udot_2x16_uadd_sat', a, b, c), ('uadd_sat', udot_2x16_a_b, c), '!options->has_dot_2x16'),
    (('sdot_2x16_iadd_sat', a, b, c), ('iadd_sat', sdot_2x16_a_b, c), '!options->has_dot_2x16'),
])

# Float sizes
for s in [16, 32, 64]:
    # Each rule is instantiated once per bit size: the opcode gets an '@<s>'
    # suffix and the condition refers to the per-size options->lower_flrp<s>.
    optimizations.extend([
        (('~flrp@{}'.format(s), a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),

        (('~flrp@{}'.format(s), a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp{}'.format(s)),
        (('~flrp@{}'.format(s), ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp{}'.format(s)),
        (('~flrp@{}'.format(s), a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp{}'.format(s)),

        (('~fadd@{}'.format(s), ('fmul', a, ('fadd', 1.0, ('fneg', c))), ('fmul', b, c)), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)),
        # These are the same as the previous three rules, but it depends on
        # 1-fsat(x) <=> fsat(1-x).  See below.
        (('~fadd@{}'.format(s), ('fmul', a, ('fsat', ('fadd', 1.0, ('fneg', c)))), ('fmul', b, ('fsat', c))), ('flrp', a, b, ('fsat', c)), '!options->lower_flrp{}'.format(s)),
        (('~fadd@{}'.format(s), a, ('fmul', c, ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)),

        (('~fadd@{}'.format(s), ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1')))), ('fmul', b, ('b2f', c))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),
        (('~fadd@{}'.format(s), a, ('fmul', ('b2f', 'c@1'), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),

        (('~ffma@{}'.format(s), a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1'))), ('fmul', b, ('b2f', 'c@1'))), ('bcsel', c, b, a)),
        (('~ffma@{}'.format(s), b, ('b2f', 'c@1'), ('ffma', ('fneg', a), ('b2f', 'c@1'), a)), ('bcsel', c, b, a)),

        # These two aren't flrp lowerings, but do appear in some shaders.
        (('~ffma@{}'.format(s), ('b2f', 'c@1'), ('fadd', b, ('fneg', a)), a), ('bcsel', c, b, a)),
        (('~ffma@{}'.format(s), ('b2f', 'c@1'), ('ffma', ('fneg', a), b, d), ('fmul', a, b)), ('bcsel', c, d, ('fmul', a, b))),

        # 1 - ((1 - a) * (1 - b))
        # 1 - (1 - a - b + a*b)
        # 1 - 1 + a + b - a*b
        # a + b - a*b
        # a + b*(1 - a)
        # b*(1 - a) + 1*a
        # flrp(b, 1, a)
        (('~fadd@{}'.format(s), 1.0, ('fneg', ('fmul', ('fadd', 1.0, ('fneg', a)), ('fadd', 1.0, ('fneg', b))))), ('flrp', b, 1.0, a), '!options->lower_flrp{}'.format(s)),
    ])

optimizations.extend([
    (('~flrp', ('fmul(is_used_once)', a, b), ('fmul(is_used_once)', a, c), d), ('fmul', ('flrp', b, c, d), a)),

    (('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)),
    (('ftrunc', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'),

    (('ffloor@16', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
    (('ffloor@32', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
    (('ffloor@64', a), ('fsub', a, ('ffract', a)), '(options->lower_ffloor || (options->lower_doubles_options & nir_lower_dfloor)) && !(options->lower_doubles_options & nir_lower_dfract)'),
    (('fadd@16', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
    (('fadd@32', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
    (('fadd@64', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor && !(options->lower_doubles_options & nir_lower_dfloor)'),
    (('ffract@16', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
    (('ffract@32', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
    (('ffract@64', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract || (options->lower_doubles_options & nir_lower_dfract)'),
    (('fceil', a), ('fneg', ('ffloor', ('fneg', a))), 'options->lower_fceil'),
    (('ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma16'),
    (('ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma32'),
    (('ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma64'),
    (('ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->lower_ffma32'),
    # Always lower inexact ffma, because it will be fused back by late optimizations (nir_opt_algebraic_late).
    (('~ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma16'),
    (('~ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma32'),
    (('~ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma64'),
    (('~ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->fuse_ffma32'),

    (('~fmul', ('fadd', ('iand', ('ineg', ('b2i', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
     ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),

    (('fdph', a, b), ('fdot4', ('vec4', 'a.x', 'a.y', 'a.z', 1.0), b), 'options->lower_fdph'),

    (('fdot4', ('vec4', a, b, c, 1.0), d), ('fdph', ('vec3', a, b, c), d), '!options->lower_fdph'),
    (('fdot4', ('vec4', a, 0.0, 0.0, 0.0), b), ('fmul', a, b)),
    (('fdot4', ('vec4', a, b, 0.0, 0.0), c), ('fdot2', ('vec2', a, b), c)),
    (('fdot4', ('vec4', a, b, c, 0.0), d), ('fdot3', ('vec3', a, b, c), d)),

    (('fdot3', ('vec3', a, 0.0, 0.0), b), ('fmul', a, b)),
    (('fdot3', ('vec3', a, b, 0.0), c), ('fdot2', ('vec2', a, b), c)),

    (('fdot2', ('vec2', a, 0.0), b), ('fmul', a, b)),
    (('fdot2', a, 1.0), ('fadd', 'a.x', 'a.y')),

    # Lower fdot to fsum when it is available
    (('fdot2', a, b), ('fsum2', ('fmul', a, b)), 'options->lower_fdot'),
    (('fdot3', a, b), ('fsum3', ('fmul', a, b)), 'options->lower_fdot'),
    (('fdot4', a, b), ('fsum4', ('fmul', a, b)), 'options->lower_fdot'),
    (('fsum2', a), ('fadd', 'a.x', 'a.y'), 'options->lower_fdot'),

    # If x >= 0 and x <= 1: fsat(1 - x) == 1 - fsat(x) trivially
    # If x < 0: 1 - fsat(x) => 1 - 0 => 1 and fsat(1 - x) => fsat(> 1) => 1
    # If x > 1: 1 - fsat(x) => 1 - 1 => 0 and fsat(1 - x) => fsat(< 0) => 0
    (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),

    # (a * #b + #c) << #d
    # ((a * #b) << #d) + (#c << #d)
    # (a * (#b << #d)) + (#c << #d)
    (('ishl', ('iadd', ('imul', a, '#b'), '#c'), '#d'),
     ('iadd', ('imul', a, ('ishl', b, d)), ('ishl', c, d))),

    # (a * #b) << #c
    # a * (#b << #c)
    (('ishl', ('imul', a, '#b'), '#c'), ('imul', a, ('ishl', b, c))),
])

# Care must be taken here.  Shifts in NIR use only the lower log2(bitsize)
# bits of the second source.  These replacements must correctly handle the
# case where (b % bitsize) + (c % bitsize) >= bitsize.
for s in [8, 16, 32, 64]:
    # Only the low log2(s) bits of a shift count are significant.
    mask = s - 1

    # Opcode names pinned to the current bit size, e.g. 'ishl@32'.
    ishl, ishr, ushr = ['{}@{}'.format(op, s) for op in ('ishl', 'ishr', 'ushr')]

    # True while the two effective shift counts add up to less than the bit
    # size, i.e. a single combined shift is still in range.
    in_bounds = ('ult', ('iadd', ('iand', b, mask), ('iand', c, mask)), s)

    optimizations.extend(
        # Two constant logical shifts in the same direction become one shift
        # by the summed count, or 0 once the summed count is out of bounds.
        [((shift, (shift, a, '#b'), '#c'),
          ('bcsel', in_bounds, (shift, a, ('iadd', b, c)), 0))
         for shift in (ishl, ushr)]
        + [
            # To get -1 for large shifts of negative values, ishr must instead
            # clamp the shift count to the maximum value.
            ((ishr, (ishr, a, '#b'), '#c'),
             (ishr, a, ('imin', ('iadd', ('iand', b, mask), ('iand', c, mask)), s - 1))),
        ])

# Optimize a pattern of address calculation created by DXVK where the offset is
# divided by 4 and then multiplied by 4. This can be turned into an iand and the
# additions before can be reassociated to CSE the iand instruction.
for size in (8, 16, 32, 64):
    # All-ones constant of the current bit size (0xff, 0xffff, ...).
    mask = (1 << size) - 1
    a_sz = 'a@{}'.format(size)

    optimizations.extend(
        # 'a >> #b << #b' -> 'a & ~((1 << #b) - 1)'
        [(('ishl', (shr, a_sz, '#b'), b), ('iand', a, ('ishl', mask, b)))
         for shr in ('ushr', 'ishr')]
        + [
            # This does not trivially work with ishr.
            (('ushr', ('ishl', a_sz, '#b'), b), ('iand', a, ('ushr', mask, b))),
        ])

# Masking with -4 is redundant when the value already has its two low bits
# clear: a left shift by at least 2, or a multiply by a multiple of 4.
optimizations += [
    (('iand', ('ishl', 'a@32', '#b(is_first_5_bits_uge_2)'), -4), ('ishl', a, b)),
    (('iand', ('imul', a, '#b(is_unsigned_multiple_of_4)'), -4), ('imul', a, b)),
]

for log2 in range(1, 7): # powers of two from 2 to 64
    v = 2 ** log2
    # 32-bit mask with the low log2 bits cleared.
    mask = 0xffffffff & -v
    b_is_multiple = '#b(is_unsigned_multiple_of_{})'.format(v)

    # Reassociate for improved CSE
    optimizations.append(
        (('iand@32', ('iadd@32', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)))

# To save space in the state tables, reduce to the set that is known to help.
# Previously, this was range(1, 32). In addition, a couple rules inside the
# loop are commented out.
Revisit someday, probably after mesa/#2635 has some 479bf215546Sopenharmony_ci# resolution. 480bf215546Sopenharmony_cifor i in [1, 2, 16, 24]: 481bf215546Sopenharmony_ci lo_mask = 0xffffffff >> i 482bf215546Sopenharmony_ci hi_mask = (0xffffffff << i) & 0xffffffff 483bf215546Sopenharmony_ci 484bf215546Sopenharmony_ci optimizations.extend([ 485bf215546Sopenharmony_ci # This pattern seems to only help in the soft-fp64 code. 486bf215546Sopenharmony_ci (('ishl@32', ('iand', 'a@32', lo_mask), i), ('ishl', a, i)), 487bf215546Sopenharmony_ci# (('ushr@32', ('iand', 'a@32', hi_mask), i), ('ushr', a, i)), 488bf215546Sopenharmony_ci# (('ishr@32', ('iand', 'a@32', hi_mask), i), ('ishr', a, i)), 489bf215546Sopenharmony_ci 490bf215546Sopenharmony_ci (('iand', ('ishl', 'a@32', i), hi_mask), ('ishl', a, i)), 491bf215546Sopenharmony_ci (('iand', ('ushr', 'a@32', i), lo_mask), ('ushr', a, i)), 492bf215546Sopenharmony_ci# (('iand', ('ishr', 'a@32', i), lo_mask), ('ushr', a, i)), # Yes, ushr is correct 493bf215546Sopenharmony_ci ]) 494bf215546Sopenharmony_ci 495bf215546Sopenharmony_cioptimizations.extend([ 496bf215546Sopenharmony_ci # This is common for address calculations. Reassociating may enable the 497bf215546Sopenharmony_ci # 'a<<c' to be CSE'd. It also helps architectures that have an ISHLADD 498bf215546Sopenharmony_ci # instruction or a constant offset field in load / store instructions. 
499bf215546Sopenharmony_ci (('ishl', ('iadd', a, '#b'), '#c'), ('iadd', ('ishl', a, c), ('ishl', b, c))), 500bf215546Sopenharmony_ci 501bf215546Sopenharmony_ci # (a + #b) * #c => (a * #c) + (#b * #c) 502bf215546Sopenharmony_ci (('imul', ('iadd(is_used_once)', a, '#b'), '#c'), ('iadd', ('imul', a, c), ('imul', b, c))), 503bf215546Sopenharmony_ci 504bf215546Sopenharmony_ci # ((a + #b) + c) * #d => ((a + c) * #d) + (#b * #d) 505bf215546Sopenharmony_ci (('imul', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'), 506bf215546Sopenharmony_ci ('iadd', ('imul', ('iadd', a, c), d), ('imul', b, d))), 507bf215546Sopenharmony_ci (('ishl', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'), 508bf215546Sopenharmony_ci ('iadd', ('ishl', ('iadd', a, c), d), ('ishl', b, d))), 509bf215546Sopenharmony_ci 510bf215546Sopenharmony_ci # Comparison simplifications 511bf215546Sopenharmony_ci (('inot', ('flt(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('fge', a, b)), 512bf215546Sopenharmony_ci (('inot', ('fge(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('flt', a, b)), 513bf215546Sopenharmony_ci (('inot', ('feq(is_used_once)', a, b)), ('fneu', a, b)), 514bf215546Sopenharmony_ci (('inot', ('fneu(is_used_once)', a, b)), ('feq', a, b)), 515bf215546Sopenharmony_ci (('inot', ('ilt(is_used_once)', a, b)), ('ige', a, b)), 516bf215546Sopenharmony_ci (('inot', ('ult(is_used_once)', a, b)), ('uge', a, b)), 517bf215546Sopenharmony_ci (('inot', ('ige(is_used_once)', a, b)), ('ilt', a, b)), 518bf215546Sopenharmony_ci (('inot', ('uge(is_used_once)', a, b)), ('ult', a, b)), 519bf215546Sopenharmony_ci (('inot', ('ieq(is_used_once)', a, b)), ('ine', a, b)), 520bf215546Sopenharmony_ci (('inot', ('ine(is_used_once)', a, b)), ('ieq', a, b)), 521bf215546Sopenharmony_ci 522bf215546Sopenharmony_ci (('iand', ('feq', a, b), ('fneu', a, b)), False), 523bf215546Sopenharmony_ci (('iand', ('flt', a, b), ('flt', b, a)), False), 524bf215546Sopenharmony_ci (('iand', ('ieq', 
a, b), ('ine', a, b)), False), 525bf215546Sopenharmony_ci (('iand', ('ilt', a, b), ('ilt', b, a)), False), 526bf215546Sopenharmony_ci (('iand', ('ult', a, b), ('ult', b, a)), False), 527bf215546Sopenharmony_ci 528bf215546Sopenharmony_ci # This helps some shaders because, after some optimizations, they end up 529bf215546Sopenharmony_ci # with patterns like (-a < -b) || (b < a). In an ideal world, this sort of 530bf215546Sopenharmony_ci # matching would be handled by CSE. 531bf215546Sopenharmony_ci (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)), 532bf215546Sopenharmony_ci (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)), 533bf215546Sopenharmony_ci (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)), 534bf215546Sopenharmony_ci (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)), 535bf215546Sopenharmony_ci (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)), 536bf215546Sopenharmony_ci (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)), 537bf215546Sopenharmony_ci (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)), 538bf215546Sopenharmony_ci (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)), 539bf215546Sopenharmony_ci (('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)), 540bf215546Sopenharmony_ci (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)), 541bf215546Sopenharmony_ci 542bf215546Sopenharmony_ci # b < fsat(NaN) -> b < 0 -> false, and b < Nan -> false. 543bf215546Sopenharmony_ci (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)), 544bf215546Sopenharmony_ci 545bf215546Sopenharmony_ci # fsat(NaN) >= b -> 0 >= b -> false, and NaN >= b -> false. 546bf215546Sopenharmony_ci (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)), 547bf215546Sopenharmony_ci 548bf215546Sopenharmony_ci # b == fsat(NaN) -> b == 0 -> false, and b == NaN -> false. 
549bf215546Sopenharmony_ci (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)), 550bf215546Sopenharmony_ci 551bf215546Sopenharmony_ci # b != fsat(NaN) -> b != 0 -> true, and b != NaN -> true. 552bf215546Sopenharmony_ci (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)), 553bf215546Sopenharmony_ci 554bf215546Sopenharmony_ci # fsat(NaN) >= 1 -> 0 >= 1 -> false, and NaN >= 1 -> false. 555bf215546Sopenharmony_ci (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)), 556bf215546Sopenharmony_ci 557bf215546Sopenharmony_ci # 0 < fsat(NaN) -> 0 < 0 -> false, and 0 < NaN -> false. 558bf215546Sopenharmony_ci (('flt', 0.0, ('fsat(is_used_once)', a)), ('flt', 0.0, a)), 559bf215546Sopenharmony_ci 560bf215546Sopenharmony_ci # 0.0 >= b2f(a) 561bf215546Sopenharmony_ci # b2f(a) <= 0.0 562bf215546Sopenharmony_ci # b2f(a) == 0.0 because b2f(a) can only be 0 or 1 563bf215546Sopenharmony_ci # inot(a) 564bf215546Sopenharmony_ci (('fge', 0.0, ('b2f', 'a@1')), ('inot', a)), 565bf215546Sopenharmony_ci 566bf215546Sopenharmony_ci (('fge', ('fneg', ('b2f', 'a@1')), 0.0), ('inot', a)), 567bf215546Sopenharmony_ci 568bf215546Sopenharmony_ci (('fneu', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)), 569bf215546Sopenharmony_ci (('fneu', ('bcsel', a, 1.0, ('b2f', 'b@1')) , 0.0), ('ior', a, b)), 570bf215546Sopenharmony_ci (('fneu', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), ('ior', a, b)), 571bf215546Sopenharmony_ci (('fneu', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)), 572bf215546Sopenharmony_ci (('fneu', ('bcsel', a, ('b2f', 'b@1'), 0.0) , 0.0), ('iand', a, b)), 573bf215546Sopenharmony_ci (('fneu', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ixor', a, b)), 574bf215546Sopenharmony_ci (('fneu', ('b2f', 'a@1') , ('b2f', 'b@1') ), ('ixor', a, b)), 575bf215546Sopenharmony_ci (('fneu', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('ixor', a, b)), 576bf215546Sopenharmony_ci (('feq', ('fadd', 
('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))), 577bf215546Sopenharmony_ci (('feq', ('bcsel', a, 1.0, ('b2f', 'b@1')) , 0.0), ('inot', ('ior', a, b))), 578bf215546Sopenharmony_ci (('feq', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), ('inot', ('ior', a, b))), 579bf215546Sopenharmony_ci (('feq', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('iand', a, b))), 580bf215546Sopenharmony_ci (('feq', ('bcsel', a, ('b2f', 'b@1'), 0.0) , 0.0), ('inot', ('iand', a, b))), 581bf215546Sopenharmony_ci (('feq', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ieq', a, b)), 582bf215546Sopenharmony_ci (('feq', ('b2f', 'a@1') , ('b2f', 'b@1') ), ('ieq', a, b)), 583bf215546Sopenharmony_ci (('feq', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('ieq', a, b)), 584bf215546Sopenharmony_ci 585bf215546Sopenharmony_ci # -(b2f(a) + b2f(b)) < 0 586bf215546Sopenharmony_ci # 0 < b2f(a) + b2f(b) 587bf215546Sopenharmony_ci # 0 != b2f(a) + b2f(b) b2f must be 0 or 1, so the sum is non-negative 588bf215546Sopenharmony_ci # a || b 589bf215546Sopenharmony_ci (('flt', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('ior', a, b)), 590bf215546Sopenharmony_ci (('flt', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('ior', a, b)), 591bf215546Sopenharmony_ci 592bf215546Sopenharmony_ci # -(b2f(a) + b2f(b)) >= 0 593bf215546Sopenharmony_ci # 0 >= b2f(a) + b2f(b) 594bf215546Sopenharmony_ci # 0 == b2f(a) + b2f(b) b2f must be 0 or 1, so the sum is non-negative 595bf215546Sopenharmony_ci # !(a || b) 596bf215546Sopenharmony_ci (('fge', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('inot', ('ior', a, b))), 597bf215546Sopenharmony_ci (('fge', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('inot', ('ior', a, b))), 598bf215546Sopenharmony_ci 599bf215546Sopenharmony_ci (('flt', a, ('fneg', a)), ('flt', a, 0.0)), 600bf215546Sopenharmony_ci (('fge', a, ('fneg', a)), ('fge', a, 0.0)), 601bf215546Sopenharmony_ci 602bf215546Sopenharmony_ci # Some 
optimizations (below) convert things like (a < b || c < b) into 603bf215546Sopenharmony_ci # (min(a, c) < b). However, this interferes with the previous optimizations 604bf215546Sopenharmony_ci # that try to remove comparisons with negated sums of b2f. This just 605bf215546Sopenharmony_ci # breaks that apart. 606bf215546Sopenharmony_ci (('flt', ('fmin', c, ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')))), 0.0), 607bf215546Sopenharmony_ci ('ior', ('flt', c, 0.0), ('ior', a, b))), 608bf215546Sopenharmony_ci 609bf215546Sopenharmony_ci (('~flt', ('fadd', a, b), a), ('flt', b, 0.0)), 610bf215546Sopenharmony_ci (('~fge', ('fadd', a, b), a), ('fge', b, 0.0)), 611bf215546Sopenharmony_ci (('~feq', ('fadd', a, b), a), ('feq', b, 0.0)), 612bf215546Sopenharmony_ci (('~fneu', ('fadd', a, b), a), ('fneu', b, 0.0)), 613bf215546Sopenharmony_ci (('~flt', ('fadd(is_used_once)', a, '#b'), '#c'), ('flt', a, ('fadd', c, ('fneg', b)))), 614bf215546Sopenharmony_ci (('~flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('flt', ('fneg', ('fadd', c, b)), a)), 615bf215546Sopenharmony_ci (('~fge', ('fadd(is_used_once)', a, '#b'), '#c'), ('fge', a, ('fadd', c, ('fneg', b)))), 616bf215546Sopenharmony_ci (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fge', ('fneg', ('fadd', c, b)), a)), 617bf215546Sopenharmony_ci (('~feq', ('fadd(is_used_once)', a, '#b'), '#c'), ('feq', a, ('fadd', c, ('fneg', b)))), 618bf215546Sopenharmony_ci (('~feq', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('feq', ('fneg', ('fadd', c, b)), a)), 619bf215546Sopenharmony_ci (('~fneu', ('fadd(is_used_once)', a, '#b'), '#c'), ('fneu', a, ('fadd', c, ('fneg', b)))), 620bf215546Sopenharmony_ci (('~fneu', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fneu', ('fneg', ('fadd', c, b)), a)), 621bf215546Sopenharmony_ci 622bf215546Sopenharmony_ci # Cannot remove the addition from ilt or ige due to overflow. 
623bf215546Sopenharmony_ci (('ieq', ('iadd', a, b), a), ('ieq', b, 0)), 624bf215546Sopenharmony_ci (('ine', ('iadd', a, b), a), ('ine', b, 0)), 625bf215546Sopenharmony_ci 626bf215546Sopenharmony_ci (('feq', ('b2f', 'a@1'), 0.0), ('inot', a)), 627bf215546Sopenharmony_ci (('fneu', ('b2f', 'a@1'), 0.0), a), 628bf215546Sopenharmony_ci (('ieq', ('b2i', 'a@1'), 0), ('inot', a)), 629bf215546Sopenharmony_ci (('ine', ('b2i', 'a@1'), 0), a), 630bf215546Sopenharmony_ci 631bf215546Sopenharmony_ci (('fneu', ('u2f', a), 0.0), ('ine', a, 0)), 632bf215546Sopenharmony_ci (('feq', ('u2f', a), 0.0), ('ieq', a, 0)), 633bf215546Sopenharmony_ci (('fge', ('u2f', a), 0.0), True), 634bf215546Sopenharmony_ci (('fge', 0.0, ('u2f', a)), ('uge', 0, a)), # ieq instead? 635bf215546Sopenharmony_ci (('flt', ('u2f', a), 0.0), False), 636bf215546Sopenharmony_ci (('flt', 0.0, ('u2f', a)), ('ult', 0, a)), # ine instead? 637bf215546Sopenharmony_ci (('fneu', ('i2f', a), 0.0), ('ine', a, 0)), 638bf215546Sopenharmony_ci (('feq', ('i2f', a), 0.0), ('ieq', a, 0)), 639bf215546Sopenharmony_ci (('fge', ('i2f', a), 0.0), ('ige', a, 0)), 640bf215546Sopenharmony_ci (('fge', 0.0, ('i2f', a)), ('ige', 0, a)), 641bf215546Sopenharmony_ci (('flt', ('i2f', a), 0.0), ('ilt', a, 0)), 642bf215546Sopenharmony_ci (('flt', 0.0, ('i2f', a)), ('ilt', 0, a)), 643bf215546Sopenharmony_ci 644bf215546Sopenharmony_ci # 0.0 < fabs(a) 645bf215546Sopenharmony_ci # fabs(a) > 0.0 646bf215546Sopenharmony_ci # fabs(a) != 0.0 because fabs(a) must be >= 0 647bf215546Sopenharmony_ci # a != 0.0 648bf215546Sopenharmony_ci (('~flt', 0.0, ('fabs', a)), ('fneu', a, 0.0)), 649bf215546Sopenharmony_ci 650bf215546Sopenharmony_ci # -fabs(a) < 0.0 651bf215546Sopenharmony_ci # fabs(a) > 0.0 652bf215546Sopenharmony_ci (('~flt', ('fneg', ('fabs', a)), 0.0), ('fneu', a, 0.0)), 653bf215546Sopenharmony_ci 654bf215546Sopenharmony_ci # 0.0 >= fabs(a) 655bf215546Sopenharmony_ci # 0.0 == fabs(a) because fabs(a) must be >= 0 656bf215546Sopenharmony_ci # 0.0 == a 
657bf215546Sopenharmony_ci (('fge', 0.0, ('fabs', a)), ('feq', a, 0.0)), 658bf215546Sopenharmony_ci 659bf215546Sopenharmony_ci # -fabs(a) >= 0.0 660bf215546Sopenharmony_ci # 0.0 >= fabs(a) 661bf215546Sopenharmony_ci (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)), 662bf215546Sopenharmony_ci 663bf215546Sopenharmony_ci # (a >= 0.0) && (a <= 1.0) -> fsat(a) == a 664bf215546Sopenharmony_ci # 665bf215546Sopenharmony_ci # This should be NaN safe. 666bf215546Sopenharmony_ci # 667bf215546Sopenharmony_ci # NaN >= 0 && 1 >= NaN -> false && false -> false 668bf215546Sopenharmony_ci # 669bf215546Sopenharmony_ci # vs. 670bf215546Sopenharmony_ci # 671bf215546Sopenharmony_ci # NaN == fsat(NaN) -> NaN == 0 -> false 672bf215546Sopenharmony_ci (('iand', ('fge', a, 0.0), ('fge', 1.0, a)), ('feq', a, ('fsat', a)), '!options->lower_fsat'), 673bf215546Sopenharmony_ci 674bf215546Sopenharmony_ci # Note: fmin(-a, -b) == -fmax(a, b) 675bf215546Sopenharmony_ci (('fmax', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('ior', a, b))), 676bf215546Sopenharmony_ci (('fmax', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('iand', a, b)))), 677bf215546Sopenharmony_ci (('fmin', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))), 678bf215546Sopenharmony_ci (('fmin', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('ior', a, b)))), 679bf215546Sopenharmony_ci 680bf215546Sopenharmony_ci # fmin(b2f(a), b) 681bf215546Sopenharmony_ci # bcsel(a, fmin(b2f(a), b), fmin(b2f(a), b)) 682bf215546Sopenharmony_ci # bcsel(a, fmin(b2f(True), b), fmin(b2f(False), b)) 683bf215546Sopenharmony_ci # bcsel(a, fmin(1.0, b), fmin(0.0, b)) 684bf215546Sopenharmony_ci # 685bf215546Sopenharmony_ci # Since b is a constant, constant folding will eliminate the fmin and the 686bf215546Sopenharmony_ci # fmax. If b is > 1.0, the bcsel will be replaced with a b2f. 
687bf215546Sopenharmony_ci (('fmin', ('b2f', 'a@1'), '#b'), ('bcsel', a, ('fmin', b, 1.0), ('fmin', b, 0.0))), 688bf215546Sopenharmony_ci 689bf215546Sopenharmony_ci (('flt', ('fadd(is_used_once)', a, ('fneg', b)), 0.0), ('flt', a, b)), 690bf215546Sopenharmony_ci 691bf215546Sopenharmony_ci (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)), 692bf215546Sopenharmony_ci (('~bcsel', ('flt', b, a), b, a), ('fmin', a, b)), 693bf215546Sopenharmony_ci (('~bcsel', ('flt', a, b), b, a), ('fmax', a, b)), 694bf215546Sopenharmony_ci (('~bcsel', ('fge', a, b), b, a), ('fmin', a, b)), 695bf215546Sopenharmony_ci (('~bcsel', ('fge', b, a), b, a), ('fmax', a, b)), 696bf215546Sopenharmony_ci (('bcsel', ('i2b', a), b, c), ('bcsel', ('ine', a, 0), b, c)), 697bf215546Sopenharmony_ci (('bcsel', ('inot', a), b, c), ('bcsel', a, c, b)), 698bf215546Sopenharmony_ci (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)), 699bf215546Sopenharmony_ci (('bcsel', a, b, ('bcsel', a, c, d)), ('bcsel', a, b, d)), 700bf215546Sopenharmony_ci (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))), 701bf215546Sopenharmony_ci (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))), 702bf215546Sopenharmony_ci (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)), 703bf215546Sopenharmony_ci (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)), 704bf215546Sopenharmony_ci (('bcsel', a, True, b), ('ior', a, b)), 705bf215546Sopenharmony_ci (('bcsel', a, a, b), ('ior', a, b)), 706bf215546Sopenharmony_ci (('bcsel', a, b, False), ('iand', a, b)), 707bf215546Sopenharmony_ci (('bcsel', a, b, a), ('iand', a, b)), 708bf215546Sopenharmony_ci (('~fmin', a, a), a), 709bf215546Sopenharmony_ci (('~fmax', a, a), a), 710bf215546Sopenharmony_ci (('imin', a, a), a), 711bf215546Sopenharmony_ci (('imax', 
a, a), a), 712bf215546Sopenharmony_ci (('umin', a, a), a), 713bf215546Sopenharmony_ci (('umin', a, 0), 0), 714bf215546Sopenharmony_ci (('umin', a, -1), a), 715bf215546Sopenharmony_ci (('umax', a, a), a), 716bf215546Sopenharmony_ci (('umax', a, 0), a), 717bf215546Sopenharmony_ci (('umax', a, -1), -1), 718bf215546Sopenharmony_ci (('fmax', ('fmax', a, b), b), ('fmax', a, b)), 719bf215546Sopenharmony_ci (('umax', ('umax', a, b), b), ('umax', a, b)), 720bf215546Sopenharmony_ci (('imax', ('imax', a, b), b), ('imax', a, b)), 721bf215546Sopenharmony_ci (('fmin', ('fmin', a, b), b), ('fmin', a, b)), 722bf215546Sopenharmony_ci (('umin', ('umin', a, b), b), ('umin', a, b)), 723bf215546Sopenharmony_ci (('imin', ('imin', a, b), b), ('imin', a, b)), 724bf215546Sopenharmony_ci (('fmax', ('fmax', ('fmax', a, b), c), a), ('fmax', ('fmax', a, b), c)), 725bf215546Sopenharmony_ci (('umax', ('umax', ('umax', a, b), c), a), ('umax', ('umax', a, b), c)), 726bf215546Sopenharmony_ci (('imax', ('imax', ('imax', a, b), c), a), ('imax', ('imax', a, b), c)), 727bf215546Sopenharmony_ci (('fmin', ('fmin', ('fmin', a, b), c), a), ('fmin', ('fmin', a, b), c)), 728bf215546Sopenharmony_ci (('umin', ('umin', ('umin', a, b), c), a), ('umin', ('umin', a, b), c)), 729bf215546Sopenharmony_ci (('imin', ('imin', ('imin', a, b), c), a), ('imin', ('imin', a, b), c)), 730bf215546Sopenharmony_ci]) 731bf215546Sopenharmony_ci 732bf215546Sopenharmony_cifor N in [8, 16, 32, 64]: 733bf215546Sopenharmony_ci b2iN = 'b2i{0}'.format(N) 734bf215546Sopenharmony_ci optimizations.extend([ 735bf215546Sopenharmony_ci (('ieq', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ieq', a, b)), 736bf215546Sopenharmony_ci (('ine', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ine', a, b)), 737bf215546Sopenharmony_ci ]) 738bf215546Sopenharmony_ci 739bf215546Sopenharmony_cifor N in [16, 32, 64]: 740bf215546Sopenharmony_ci b2fN = 'b2f{0}'.format(N) 741bf215546Sopenharmony_ci optimizations.extend([ 742bf215546Sopenharmony_ci (('feq', (b2fN, 'a@1'), (b2fN, 'b@1')), 
('ieq', a, b)), 743bf215546Sopenharmony_ci (('fneu', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ine', a, b)), 744bf215546Sopenharmony_ci ]) 745bf215546Sopenharmony_ci 746bf215546Sopenharmony_ci# Integer sizes 747bf215546Sopenharmony_cifor s in [8, 16, 32, 64]: 748bf215546Sopenharmony_ci optimizations.extend([ 749bf215546Sopenharmony_ci (('iand@{}'.format(s), a, ('inot', ('ishr', a, s - 1))), ('imax', a, 0)), 750bf215546Sopenharmony_ci 751bf215546Sopenharmony_ci # Simplify logic to detect sign of an integer. 752bf215546Sopenharmony_ci (('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0), ('ige', a, 0)), 753bf215546Sopenharmony_ci (('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ige', a, 0)), 754bf215546Sopenharmony_ci (('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0), ('ilt', a, 0)), 755bf215546Sopenharmony_ci (('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ilt', a, 0)), 756bf215546Sopenharmony_ci (('ine', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)), 757bf215546Sopenharmony_ci (('ieq', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)), 758bf215546Sopenharmony_ci (('ieq', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ilt', a, 0)), 759bf215546Sopenharmony_ci (('ine', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ige', a, 0)), 760bf215546Sopenharmony_ci (('ine', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)), 761bf215546Sopenharmony_ci (('ieq', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)), 762bf215546Sopenharmony_ci (('ieq', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ilt', a, 0)), 763bf215546Sopenharmony_ci (('ine', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ige', a, 0)), 764bf215546Sopenharmony_ci ]) 765bf215546Sopenharmony_ci 766bf215546Sopenharmony_cioptimizations.extend([ 767bf215546Sopenharmony_ci (('fmin', a, ('fneg', a)), ('fneg', ('fabs', a))), 768bf215546Sopenharmony_ci (('imin', a, ('ineg', a)), ('ineg', ('iabs', a))), 769bf215546Sopenharmony_ci (('fmin', a, ('fneg', ('fabs', a))), ('fneg', ('fabs', a))), 
770bf215546Sopenharmony_ci (('imin', a, ('ineg', ('iabs', a))), ('ineg', ('iabs', a))), 771bf215546Sopenharmony_ci (('~fmin', a, ('fabs', a)), a), 772bf215546Sopenharmony_ci (('imin', a, ('iabs', a)), a), 773bf215546Sopenharmony_ci (('~fmax', a, ('fneg', ('fabs', a))), a), 774bf215546Sopenharmony_ci (('imax', a, ('ineg', ('iabs', a))), a), 775bf215546Sopenharmony_ci (('fmax', a, ('fabs', a)), ('fabs', a)), 776bf215546Sopenharmony_ci (('imax', a, ('iabs', a)), ('iabs', a)), 777bf215546Sopenharmony_ci (('fmax', a, ('fneg', a)), ('fabs', a)), 778bf215546Sopenharmony_ci (('imax', a, ('ineg', a)), ('iabs', a), '!options->lower_iabs'), 779bf215546Sopenharmony_ci (('~fmax', ('fabs', a), 0.0), ('fabs', a)), 780bf215546Sopenharmony_ci (('fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'), 781bf215546Sopenharmony_ci # fmax(fmin(a, 1.0), 0.0) is inexact because it returns 1.0 on NaN, while 782bf215546Sopenharmony_ci # fsat(a) returns 0.0. 783bf215546Sopenharmony_ci (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'), 784bf215546Sopenharmony_ci # fmin(fmax(a, -1.0), 0.0) is inexact because it returns -1.0 on NaN, while 785bf215546Sopenharmony_ci # fneg(fsat(fneg(a))) returns -0.0 on NaN. 786bf215546Sopenharmony_ci (('~fmin', ('fmax', a, -1.0), 0.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'), 787bf215546Sopenharmony_ci # fmax(fmin(a, 0.0), -1.0) is inexact because it returns 0.0 on NaN, while 788bf215546Sopenharmony_ci # fneg(fsat(fneg(a))) returns -0.0 on NaN. This only matters if 789bf215546Sopenharmony_ci # SignedZeroInfNanPreserve is set, but we don't currently have any way of 790bf215546Sopenharmony_ci # representing this in the optimizations other than the usual ~. 791bf215546Sopenharmony_ci (('~fmax', ('fmin', a, 0.0), -1.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'), 792bf215546Sopenharmony_ci # fsat(fsign(NaN)) = fsat(0) = 0, and b2f(0 < NaN) = b2f(False) = 0. 
Mark 793bf215546Sopenharmony_ci # the new comparison precise to prevent it being changed to 'a != 0'. 794bf215546Sopenharmony_ci (('fsat', ('fsign', a)), ('b2f', ('!flt', 0.0, a))), 795bf215546Sopenharmony_ci (('fsat', ('b2f', a)), ('b2f', a)), 796bf215546Sopenharmony_ci (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'), 797bf215546Sopenharmony_ci (('fsat', ('fsat', a)), ('fsat', a)), 798bf215546Sopenharmony_ci (('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_fsat'), 799bf215546Sopenharmony_ci (('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_fsat'), 800bf215546Sopenharmony_ci (('fsat', ('fneg(is_used_once)', ('fmulz(is_used_once)', a, b))), ('fsat', ('fmulz', ('fneg', a), b)), '!options->lower_fsat && !'+signed_zero_inf_nan_preserve_32), 801bf215546Sopenharmony_ci (('fsat', ('fabs(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fabs', a), ('fabs', b))), '!options->lower_fsat'), 802bf215546Sopenharmony_ci (('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)), 803bf215546Sopenharmony_ci (('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)), 804bf215546Sopenharmony_ci (('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)), 805bf215546Sopenharmony_ci # Both the left and right patterns are "b" when isnan(a), so this is exact. 806bf215546Sopenharmony_ci (('fmax', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmax', a, b))), 807bf215546Sopenharmony_ci # The left pattern is 0.0 when isnan(a) (because fmin(fsat(NaN), b) -> 808bf215546Sopenharmony_ci # fmin(0.0, b)) while the right one is "b", so this optimization is inexact. 
809bf215546Sopenharmony_ci (('~fmin', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmin', a, b))), 810bf215546Sopenharmony_ci 811bf215546Sopenharmony_ci # max(-min(b, a), b) -> max(abs(b), -a) 812bf215546Sopenharmony_ci # min(-max(b, a), b) -> min(-abs(b), -a) 813bf215546Sopenharmony_ci (('fmax', ('fneg', ('fmin', b, a)), b), ('fmax', ('fabs', b), ('fneg', a))), 814bf215546Sopenharmony_ci (('fmin', ('fneg', ('fmax', b, a)), b), ('fmin', ('fneg', ('fabs', b)), ('fneg', a))), 815bf215546Sopenharmony_ci 816bf215546Sopenharmony_ci # If a in [0,b] then b-a is also in [0,b]. Since b in [0,1], max(b-a, 0) = 817bf215546Sopenharmony_ci # fsat(b-a). 818bf215546Sopenharmony_ci # 819bf215546Sopenharmony_ci # If a > b, then b-a < 0 and max(b-a, 0) = fsat(b-a) = 0 820bf215546Sopenharmony_ci # 821bf215546Sopenharmony_ci # This should be NaN safe since max(NaN, 0) = fsat(NaN) = 0. 822bf215546Sopenharmony_ci (('fmax', ('fadd(is_used_once)', ('fneg', 'a(is_not_negative)'), '#b(is_zero_to_one)'), 0.0), 823bf215546Sopenharmony_ci ('fsat', ('fadd', ('fneg', a), b)), '!options->lower_fsat'), 824bf215546Sopenharmony_ci 825bf215546Sopenharmony_ci (('extract_u8', ('imin', ('imax', a, 0), 0xff), 0), ('imin', ('imax', a, 0), 0xff)), 826bf215546Sopenharmony_ci 827bf215546Sopenharmony_ci # The ior versions are exact because fmin and fmax will always pick a 828bf215546Sopenharmony_ci # non-NaN value, if one exists. Therefore (a < NaN) || (a < c) == a < 829bf215546Sopenharmony_ci # fmax(NaN, c) == a < c. Mark the fmin or fmax in the replacement as exact 830bf215546Sopenharmony_ci # to prevent other optimizations from ruining the "NaN cleansing" property 831bf215546Sopenharmony_ci # of the fmin or fmax. 
832bf215546Sopenharmony_ci (('ior', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('!fmax', b, c))), 833bf215546Sopenharmony_ci (('ior', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('!fmin', a, b), c)), 834bf215546Sopenharmony_ci (('ior', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('!fmin', b, c))), 835bf215546Sopenharmony_ci (('ior', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('!fmax', a, b), c)), 836bf215546Sopenharmony_ci (('ior', ('flt', a, '#b'), ('flt', a, '#c')), ('flt', a, ('!fmax', b, c))), 837bf215546Sopenharmony_ci (('ior', ('flt', '#a', c), ('flt', '#b', c)), ('flt', ('!fmin', a, b), c)), 838bf215546Sopenharmony_ci (('ior', ('fge', a, '#b'), ('fge', a, '#c')), ('fge', a, ('!fmin', b, c))), 839bf215546Sopenharmony_ci (('ior', ('fge', '#a', c), ('fge', '#b', c)), ('fge', ('!fmax', a, b), c)), 840bf215546Sopenharmony_ci (('~iand', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('fmin', b, c))), 841bf215546Sopenharmony_ci (('~iand', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('fmax', a, b), c)), 842bf215546Sopenharmony_ci (('~iand', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('fmax', b, c))), 843bf215546Sopenharmony_ci (('~iand', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('fmin', a, b), c)), 844bf215546Sopenharmony_ci (('iand', ('flt', a, '#b(is_a_number)'), ('flt', a, '#c(is_a_number)')), ('flt', a, ('fmin', b, c))), 845bf215546Sopenharmony_ci (('iand', ('flt', '#a(is_a_number)', c), ('flt', '#b(is_a_number)', c)), ('flt', ('fmax', a, b), c)), 846bf215546Sopenharmony_ci (('iand', ('fge', a, '#b(is_a_number)'), ('fge', a, '#c(is_a_number)')), ('fge', a, ('fmax', b, c))), 847bf215546Sopenharmony_ci (('iand', ('fge', '#a(is_a_number)', c), ('fge', '#b(is_a_number)', c)), ('fge', ('fmin', a, b), c)), 848bf215546Sopenharmony_ci 849bf215546Sopenharmony_ci (('ior', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imax', b, c))), 850bf215546Sopenharmony_ci (('ior', 
('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imin', a, b), c)), 851bf215546Sopenharmony_ci (('ior', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imin', b, c))), 852bf215546Sopenharmony_ci (('ior', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imax', a, b), c)), 853bf215546Sopenharmony_ci (('ior', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umax', b, c))), 854bf215546Sopenharmony_ci (('ior', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umin', a, b), c)), 855bf215546Sopenharmony_ci (('ior', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umin', b, c))), 856bf215546Sopenharmony_ci (('ior', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umax', a, b), c)), 857bf215546Sopenharmony_ci (('iand', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imin', b, c))), 858bf215546Sopenharmony_ci (('iand', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imax', a, b), c)), 859bf215546Sopenharmony_ci (('iand', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imax', b, c))), 860bf215546Sopenharmony_ci (('iand', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imin', a, b), c)), 861bf215546Sopenharmony_ci (('iand', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umin', b, c))), 862bf215546Sopenharmony_ci (('iand', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umax', a, b), c)), 863bf215546Sopenharmony_ci (('iand', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umax', b, c))), 864bf215546Sopenharmony_ci (('iand', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umin', a, b), c)), 865bf215546Sopenharmony_ci 866bf215546Sopenharmony_ci # A number of shaders contain a pattern like a.x < 0.0 || a.x > 1.0 || a.y 867bf215546Sopenharmony_ci # < 0.0, || a.y > 1.0 || ... These patterns rearrange and replace in a 868bf215546Sopenharmony_ci # single step. 
Doing just the replacement can lead to an infinite loop as 869bf215546Sopenharmony_ci # the pattern is repeatedly applied to the result of the previous 870bf215546Sopenharmony_ci # application of the pattern. 871bf215546Sopenharmony_ci (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, c), d), ('flt', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)), 872bf215546Sopenharmony_ci (('ior', ('ior(is_used_once)', ('flt', a, c), d), ('flt(is_used_once)', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)), 873bf215546Sopenharmony_ci (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, b), d), ('flt', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)), 874bf215546Sopenharmony_ci (('ior', ('ior(is_used_once)', ('flt', a, b), d), ('flt(is_used_once)', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)), 875bf215546Sopenharmony_ci 876bf215546Sopenharmony_ci # This is how SpvOpFOrdNotEqual might be implemented. If both values are 877bf215546Sopenharmony_ci # numbers, then it can be replaced with fneu. 878bf215546Sopenharmony_ci (('ior', ('flt', 'a(is_a_number)', 'b(is_a_number)'), ('flt', b, a)), ('fneu', a, b)), 879bf215546Sopenharmony_ci]) 880bf215546Sopenharmony_ci 881bf215546Sopenharmony_ci# Float sizes 882bf215546Sopenharmony_cifor s in [16, 32, 64]: 883bf215546Sopenharmony_ci optimizations.extend([ 884bf215546Sopenharmony_ci # These derive from the previous patterns with the application of b < 0 <=> 885bf215546Sopenharmony_ci # 0 < -b. The transformation should be applied if either comparison is 886bf215546Sopenharmony_ci # used once as this ensures that the number of comparisons will not 887bf215546Sopenharmony_ci # increase. The sources to the ior and iand are not symmetric, so the 888bf215546Sopenharmony_ci # rules have to be duplicated to get this behavior. 
889bf215546Sopenharmony_ci (('ior', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))), 890bf215546Sopenharmony_ci (('ior', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))), 891bf215546Sopenharmony_ci (('ior', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))), 892bf215546Sopenharmony_ci (('ior', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))), 893bf215546Sopenharmony_ci (('~iand', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))), 894bf215546Sopenharmony_ci (('~iand', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))), 895bf215546Sopenharmony_ci (('~iand', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))), 896bf215546Sopenharmony_ci (('~iand', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))), 897bf215546Sopenharmony_ci 898bf215546Sopenharmony_ci # The (i2f32, ...) part is an open-coded fsign. When that is combined 899bf215546Sopenharmony_ci # with the bcsel, it's basically copysign(1.0, a). There are some 900bf215546Sopenharmony_ci # behavior differences between this pattern and copysign w.r.t. ±0 and 901bf215546Sopenharmony_ci # NaN. copysign(x, y) blindly takes the sign bit from y and applies it 902bf215546Sopenharmony_ci # to x, regardless of whether either or both values are NaN. 
903bf215546Sopenharmony_ci # 904bf215546Sopenharmony_ci # If a != a: bcsel(False, 1.0, i2f(b2i(False) - b2i(False))) = 0, 905bf215546Sopenharmony_ci # int(NaN >= 0.0) - int(NaN < 0.0) = 0 - 0 = 0 906bf215546Sopenharmony_ci # If a == ±0: bcsel(True, 1.0, ...) = 1.0, 907bf215546Sopenharmony_ci # int(±0.0 >= 0.0) - int(±0.0 < 0.0) = 1 - 0 = 1 908bf215546Sopenharmony_ci # 909bf215546Sopenharmony_ci # For all other values of 'a', the original and replacement behave as 910bf215546Sopenharmony_ci # copysign. 911bf215546Sopenharmony_ci # 912bf215546Sopenharmony_ci # Marking the replacement comparisons as precise prevents any future 913bf215546Sopenharmony_ci # optimizations from replacing either of the comparisons with the 914bf215546Sopenharmony_ci # logical-not of the other. 915bf215546Sopenharmony_ci # 916bf215546Sopenharmony_ci # Note: Use b2i32 in the replacement because some platforms that 917bf215546Sopenharmony_ci # support fp16 don't support int16. 918bf215546Sopenharmony_ci (('bcsel@{}'.format(s), ('feq', a, 0.0), 1.0, ('i2f{}'.format(s), ('iadd', ('b2i{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))))), 919bf215546Sopenharmony_ci ('i2f{}'.format(s), ('iadd', ('b2i32', ('!fge', a, 0.0)), ('ineg', ('b2i32', ('!flt', a, 0.0)))))), 920bf215546Sopenharmony_ci 921bf215546Sopenharmony_ci (('bcsel', a, ('b2f(is_used_once)', 'b@{}'.format(s)), ('b2f', 'c@{}'.format(s))), ('b2f', ('bcsel', a, b, c))), 922bf215546Sopenharmony_ci 923bf215546Sopenharmony_ci # The C spec says, "If the value of the integral part cannot be represented 924bf215546Sopenharmony_ci # by the integer type, the behavior is undefined." "Undefined" can mean 925bf215546Sopenharmony_ci # "the conversion doesn't happen at all." 
926bf215546Sopenharmony_ci (('~i2f{}'.format(s), ('f2i', 'a@{}'.format(s))), ('ftrunc', a)), 927bf215546Sopenharmony_ci 928bf215546Sopenharmony_ci # Ironically, mark these as imprecise because removing the conversions may 929bf215546Sopenharmony_ci # preserve more precision than doing the conversions (e.g., 930bf215546Sopenharmony_ci # uint(float(0x81818181u)) == 0x81818200). 931bf215546Sopenharmony_ci (('~f2i{}'.format(s), ('i2f', 'a@{}'.format(s))), a), 932bf215546Sopenharmony_ci (('~f2i{}'.format(s), ('u2f', 'a@{}'.format(s))), a), 933bf215546Sopenharmony_ci (('~f2u{}'.format(s), ('i2f', 'a@{}'.format(s))), a), 934bf215546Sopenharmony_ci (('~f2u{}'.format(s), ('u2f', 'a@{}'.format(s))), a), 935bf215546Sopenharmony_ci 936bf215546Sopenharmony_ci (('fadd', ('b2f{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('fneg', ('b2f{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))), ('fsign', a), '!options->lower_fsign'), 937bf215546Sopenharmony_ci (('iadd', ('b2i{}'.format(s), ('flt', 0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0)))), ('f2i{}'.format(s), ('fsign', a)), '!options->lower_fsign'), 938bf215546Sopenharmony_ci ]) 939bf215546Sopenharmony_ci 940bf215546Sopenharmony_ci # float? -> float? -> floatS ==> float? -> floatS 941bf215546Sopenharmony_ci (('~f2f{}'.format(s), ('f2f', a)), ('f2f{}'.format(s), a)), 942bf215546Sopenharmony_ci 943bf215546Sopenharmony_ci # int? -> float? -> floatS ==> int? -> floatS 944bf215546Sopenharmony_ci (('~f2f{}'.format(s), ('u2f', a)), ('u2f{}'.format(s), a)), 945bf215546Sopenharmony_ci (('~f2f{}'.format(s), ('i2f', a)), ('i2f{}'.format(s), a)), 946bf215546Sopenharmony_ci 947bf215546Sopenharmony_ci # float? -> float? -> intS ==> float? 
-> intS 948bf215546Sopenharmony_ci (('~f2u{}'.format(s), ('f2f', a)), ('f2u{}'.format(s), a)), 949bf215546Sopenharmony_ci (('~f2i{}'.format(s), ('f2f', a)), ('f2i{}'.format(s), a)), 950bf215546Sopenharmony_ci 951bf215546Sopenharmony_ci for B in [32, 64]: 952bf215546Sopenharmony_ci if s < B: 953bf215546Sopenharmony_ci optimizations.extend([ 954bf215546Sopenharmony_ci # S = smaller, B = bigger 955bf215546Sopenharmony_ci # typeS -> typeB -> typeS ==> identity 956bf215546Sopenharmony_ci (('f2f{}'.format(s), ('f2f{}'.format(B), 'a@{}'.format(s))), a), 957bf215546Sopenharmony_ci (('i2i{}'.format(s), ('i2i{}'.format(B), 'a@{}'.format(s))), a), 958bf215546Sopenharmony_ci (('u2u{}'.format(s), ('u2u{}'.format(B), 'a@{}'.format(s))), a), 959bf215546Sopenharmony_ci 960bf215546Sopenharmony_ci # bool1 -> typeB -> typeS ==> bool1 -> typeS 961bf215546Sopenharmony_ci (('f2f{}'.format(s), ('b2f{}'.format(B), 'a@1')), ('b2f{}'.format(s), a)), 962bf215546Sopenharmony_ci (('i2i{}'.format(s), ('b2i{}'.format(B), 'a@1')), ('b2i{}'.format(s), a)), 963bf215546Sopenharmony_ci (('u2u{}'.format(s), ('b2i{}'.format(B), 'a@1')), ('b2i{}'.format(s), a)), 964bf215546Sopenharmony_ci 965bf215546Sopenharmony_ci # floatS -> floatB -> intB ==> floatS -> intB 966bf215546Sopenharmony_ci (('f2u{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2u{}'.format(B), a)), 967bf215546Sopenharmony_ci (('f2i{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2i{}'.format(B), a)), 968bf215546Sopenharmony_ci 969bf215546Sopenharmony_ci # int? -> floatB -> floatS ==> int? 
-> floatS 970bf215546Sopenharmony_ci (('f2f{}'.format(s), ('u2f{}'.format(B), a)), ('u2f{}'.format(s), a)), 971bf215546Sopenharmony_ci (('f2f{}'.format(s), ('i2f{}'.format(B), a)), ('i2f{}'.format(s), a)), 972bf215546Sopenharmony_ci 973bf215546Sopenharmony_ci # intS -> intB -> floatB ==> intS -> floatB 974bf215546Sopenharmony_ci (('u2f{}'.format(B), ('u2u{}'.format(B), 'a@{}'.format(s))), ('u2f{}'.format(B), a)), 975bf215546Sopenharmony_ci (('i2f{}'.format(B), ('i2i{}'.format(B), 'a@{}'.format(s))), ('i2f{}'.format(B), a)), 976bf215546Sopenharmony_ci ]) 977bf215546Sopenharmony_ci 978bf215546Sopenharmony_ci# mediump variants of the above 979bf215546Sopenharmony_cioptimizations.extend([ 980bf215546Sopenharmony_ci # int32 -> float32 -> float16 ==> int32 -> float16 981bf215546Sopenharmony_ci (('f2fmp', ('u2f32', 'a@32')), ('u2fmp', a)), 982bf215546Sopenharmony_ci (('f2fmp', ('i2f32', 'a@32')), ('i2fmp', a)), 983bf215546Sopenharmony_ci 984bf215546Sopenharmony_ci # float32 -> float16 -> int16 ==> float32 -> int16 985bf215546Sopenharmony_ci (('f2u16', ('f2fmp', 'a@32')), ('f2u16', a)), 986bf215546Sopenharmony_ci (('f2i16', ('f2fmp', 'a@32')), ('f2i16', a)), 987bf215546Sopenharmony_ci 988bf215546Sopenharmony_ci # float32 -> int32 -> int16 ==> float32 -> int16 989bf215546Sopenharmony_ci (('i2imp', ('f2u32', 'a@32')), ('f2ump', a)), 990bf215546Sopenharmony_ci (('i2imp', ('f2i32', 'a@32')), ('f2imp', a)), 991bf215546Sopenharmony_ci 992bf215546Sopenharmony_ci # int32 -> int16 -> float16 ==> int32 -> float16 993bf215546Sopenharmony_ci (('u2f16', ('i2imp', 'a@32')), ('u2f16', a)), 994bf215546Sopenharmony_ci (('i2f16', ('i2imp', 'a@32')), ('i2f16', a)), 995bf215546Sopenharmony_ci]) 996bf215546Sopenharmony_ci 997bf215546Sopenharmony_ci# Clean up junk left from 8-bit integer to 16-bit integer lowering. 998bf215546Sopenharmony_cioptimizations.extend([ 999bf215546Sopenharmony_ci # The u2u16(u2u8(X)) just masks off the upper 8-bits of X. 
This can be 1000bf215546Sopenharmony_ci # accomplished by mask the upper 8-bit of the immediate operand to the 1001bf215546Sopenharmony_ci # iand instruction. Often times, both patterns will end up being applied 1002bf215546Sopenharmony_ci # to the same original expression tree. 1003bf215546Sopenharmony_ci (('iand', ('u2u16', ('u2u8', 'a@16')), '#b'), ('iand', a, ('iand', b, 0xff))), 1004bf215546Sopenharmony_ci (('u2u16', ('u2u8(is_used_once)', ('iand', 'a@16', '#b'))), ('iand', a, ('iand', b, 0xff))), 1005bf215546Sopenharmony_ci]) 1006bf215546Sopenharmony_ci 1007bf215546Sopenharmony_cifor op in ['iand', 'ior', 'ixor']: 1008bf215546Sopenharmony_ci optimizations.extend([ 1009bf215546Sopenharmony_ci (('u2u8', (op, ('u2u16', ('u2u8', 'a@16')), ('u2u16', ('u2u8', 'b@16')))), ('u2u8', (op, a, b))), 1010bf215546Sopenharmony_ci (('u2u8', (op, ('u2u16', ('u2u8', 'a@32')), ('u2u16', ('u2u8', 'b@32')))), ('u2u8', (op, a, b))), 1011bf215546Sopenharmony_ci 1012bf215546Sopenharmony_ci # Undistribute extract from a logic op 1013bf215546Sopenharmony_ci ((op, ('extract_i8', a, '#b'), ('extract_i8', c, b)), ('extract_i8', (op, a, c), b)), 1014bf215546Sopenharmony_ci ((op, ('extract_u8', a, '#b'), ('extract_u8', c, b)), ('extract_u8', (op, a, c), b)), 1015bf215546Sopenharmony_ci ((op, ('extract_i16', a, '#b'), ('extract_i16', c, b)), ('extract_i16', (op, a, c), b)), 1016bf215546Sopenharmony_ci ((op, ('extract_u16', a, '#b'), ('extract_u16', c, b)), ('extract_u16', (op, a, c), b)), 1017bf215546Sopenharmony_ci 1018bf215546Sopenharmony_ci # Undistribute shifts from a logic op 1019bf215546Sopenharmony_ci ((op, ('ushr(is_used_once)', a, '#b'), ('ushr', c, b)), ('ushr', (op, a, c), b)), 1020bf215546Sopenharmony_ci ((op, ('ishr(is_used_once)', a, '#b'), ('ishr', c, b)), ('ishr', (op, a, c), b)), 1021bf215546Sopenharmony_ci ((op, ('ishl(is_used_once)', a, '#b'), ('ishl', c, b)), ('ishl', (op, a, c), b)), 1022bf215546Sopenharmony_ci ]) 1023bf215546Sopenharmony_ci 1024bf215546Sopenharmony_ci# 
Integer sizes 1025bf215546Sopenharmony_cifor s in [8, 16, 32, 64]: 1026bf215546Sopenharmony_ci last_shift_bit = int(math.log2(s)) - 1 1027bf215546Sopenharmony_ci 1028bf215546Sopenharmony_ci optimizations.extend([ 1029bf215546Sopenharmony_ci (('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('ior', a, b), 0), 'options->lower_umax'), 1030bf215546Sopenharmony_ci (('ior', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('ior', a, b), 0), 'options->lower_umin'), 1031bf215546Sopenharmony_ci (('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umax', a, b), 0), '!options->lower_umax'), 1032bf215546Sopenharmony_ci (('ior', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umin', a, b), 0), '!options->lower_umin'), 1033bf215546Sopenharmony_ci (('iand', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umin', a, b), 0), '!options->lower_umin'), 1034bf215546Sopenharmony_ci (('ior', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umax', a, b), 0), '!options->lower_umax'), 1035bf215546Sopenharmony_ci 1036bf215546Sopenharmony_ci # True/False are ~0 and 0 in NIR. b2i of True is 1, and -1 is ~0 (True). 
1037bf215546Sopenharmony_ci (('ineg', ('b2i{}'.format(s), 'a@{}'.format(s))), a), 1038bf215546Sopenharmony_ci 1039bf215546Sopenharmony_ci # SM5 32-bit shifts are defined to use the 5 least significant bits (or 4 bits for 16 bits) 1040bf215546Sopenharmony_ci (('ishl', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishl', a, b)), 1041bf215546Sopenharmony_ci (('ishr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishr', a, b)), 1042bf215546Sopenharmony_ci (('ushr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ushr', a, b)), 1043bf215546Sopenharmony_ci (('ushr', 'a@{}'.format(s), ('ishl(is_used_once)', ('iand', b, 1), last_shift_bit)), ('ushr', a, ('ishl', b, last_shift_bit))), 1044bf215546Sopenharmony_ci ]) 1045bf215546Sopenharmony_ci 1046bf215546Sopenharmony_cioptimizations.extend([ 1047bf215546Sopenharmony_ci # Common pattern like 'if (i == 0 || i == 1 || ...)' 1048bf215546Sopenharmony_ci (('ior', ('ieq', a, 0), ('ieq', a, 1)), ('uge', 1, a)), 1049bf215546Sopenharmony_ci (('ior', ('uge', 1, a), ('ieq', a, 2)), ('uge', 2, a)), 1050bf215546Sopenharmony_ci (('ior', ('uge', 2, a), ('ieq', a, 3)), ('uge', 3, a)), 1051bf215546Sopenharmony_ci 1052bf215546Sopenharmony_ci (('ior', a, ('ieq', a, False)), True), 1053bf215546Sopenharmony_ci (('ior', a, ('inot', a)), -1), 1054bf215546Sopenharmony_ci 1055bf215546Sopenharmony_ci (('ine', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), ('ine', a, b)), 1056bf215546Sopenharmony_ci (('b2i', ('ine', 'a@1', 'b@1')), ('b2i', ('ixor', a, b))), 1057bf215546Sopenharmony_ci 1058bf215546Sopenharmony_ci # This pattern occurs coutresy of __flt64_nonnan in the soft-fp64 code. 1059bf215546Sopenharmony_ci # The first part of the iand comes from the !__feq64_nonnan. 1060bf215546Sopenharmony_ci # 1061bf215546Sopenharmony_ci # The second pattern is a reformulation of the first based on the relation 1062bf215546Sopenharmony_ci # (a == 0 || y == 0) <=> umin(a, y) == 0, where b in the first equation 1063bf215546Sopenharmony_ci # happens to be y == 0. 
1064bf215546Sopenharmony_ci (('iand', ('inot', ('iand', ('ior', ('ieq', a, 0), b), c)), ('ilt', a, 0)), 1065bf215546Sopenharmony_ci ('iand', ('inot', ('iand', b , c)), ('ilt', a, 0))), 1066bf215546Sopenharmony_ci (('iand', ('inot', ('iand', ('ieq', ('umin', a, b), 0), c)), ('ilt', a, 0)), 1067bf215546Sopenharmony_ci ('iand', ('inot', ('iand', ('ieq', b , 0), c)), ('ilt', a, 0))), 1068bf215546Sopenharmony_ci 1069bf215546Sopenharmony_ci # These patterns can result when (a < b || a < c) => (a < min(b, c)) 1070bf215546Sopenharmony_ci # transformations occur before constant propagation and loop-unrolling. 1071bf215546Sopenharmony_ci # 1072bf215546Sopenharmony_ci # The flt versions are exact. If isnan(a), the original pattern is 1073bf215546Sopenharmony_ci # trivially false, and the replacements are false too. If isnan(b): 1074bf215546Sopenharmony_ci # 1075bf215546Sopenharmony_ci # a < fmax(NaN, a) => a < a => false vs a < NaN => false 1076bf215546Sopenharmony_ci (('flt', a, ('fmax', b, a)), ('flt', a, b)), 1077bf215546Sopenharmony_ci (('flt', ('fmin', a, b), a), ('flt', b, a)), 1078bf215546Sopenharmony_ci (('~fge', a, ('fmin', b, a)), True), 1079bf215546Sopenharmony_ci (('~fge', ('fmax', a, b), a), True), 1080bf215546Sopenharmony_ci (('flt', a, ('fmin', b, a)), False), 1081bf215546Sopenharmony_ci (('flt', ('fmax', a, b), a), False), 1082bf215546Sopenharmony_ci (('~fge', a, ('fmax', b, a)), ('fge', a, b)), 1083bf215546Sopenharmony_ci (('~fge', ('fmin', a, b), a), ('fge', b, a)), 1084bf215546Sopenharmony_ci 1085bf215546Sopenharmony_ci (('ilt', a, ('imax', b, a)), ('ilt', a, b)), 1086bf215546Sopenharmony_ci (('ilt', ('imin', a, b), a), ('ilt', b, a)), 1087bf215546Sopenharmony_ci (('ige', a, ('imin', b, a)), True), 1088bf215546Sopenharmony_ci (('ige', ('imax', a, b), a), True), 1089bf215546Sopenharmony_ci (('ult', a, ('umax', b, a)), ('ult', a, b)), 1090bf215546Sopenharmony_ci (('ult', ('umin', a, b), a), ('ult', b, a)), 1091bf215546Sopenharmony_ci (('uge', a, ('umin', b, 
a)), True), 1092bf215546Sopenharmony_ci (('uge', ('umax', a, b), a), True), 1093bf215546Sopenharmony_ci (('ilt', a, ('imin', b, a)), False), 1094bf215546Sopenharmony_ci (('ilt', ('imax', a, b), a), False), 1095bf215546Sopenharmony_ci (('ige', a, ('imax', b, a)), ('ige', a, b)), 1096bf215546Sopenharmony_ci (('ige', ('imin', a, b), a), ('ige', b, a)), 1097bf215546Sopenharmony_ci (('ult', a, ('umin', b, a)), False), 1098bf215546Sopenharmony_ci (('ult', ('umax', a, b), a), False), 1099bf215546Sopenharmony_ci (('uge', a, ('umax', b, a)), ('uge', a, b)), 1100bf215546Sopenharmony_ci (('uge', ('umin', a, b), a), ('uge', b, a)), 1101bf215546Sopenharmony_ci (('ult', a, ('iand', b, a)), False), 1102bf215546Sopenharmony_ci (('ult', ('ior', a, b), a), False), 1103bf215546Sopenharmony_ci (('uge', a, ('iand', b, a)), True), 1104bf215546Sopenharmony_ci (('uge', ('ior', a, b), a), True), 1105bf215546Sopenharmony_ci 1106bf215546Sopenharmony_ci (('ilt', '#a', ('imax', '#b', c)), ('ior', ('ilt', a, b), ('ilt', a, c))), 1107bf215546Sopenharmony_ci (('ilt', ('imin', '#a', b), '#c'), ('ior', ('ilt', a, c), ('ilt', b, c))), 1108bf215546Sopenharmony_ci (('ige', '#a', ('imin', '#b', c)), ('ior', ('ige', a, b), ('ige', a, c))), 1109bf215546Sopenharmony_ci (('ige', ('imax', '#a', b), '#c'), ('ior', ('ige', a, c), ('ige', b, c))), 1110bf215546Sopenharmony_ci (('ult', '#a', ('umax', '#b', c)), ('ior', ('ult', a, b), ('ult', a, c))), 1111bf215546Sopenharmony_ci (('ult', ('umin', '#a', b), '#c'), ('ior', ('ult', a, c), ('ult', b, c))), 1112bf215546Sopenharmony_ci (('uge', '#a', ('umin', '#b', c)), ('ior', ('uge', a, b), ('uge', a, c))), 1113bf215546Sopenharmony_ci (('uge', ('umax', '#a', b), '#c'), ('ior', ('uge', a, c), ('uge', b, c))), 1114bf215546Sopenharmony_ci (('ilt', '#a', ('imin', '#b', c)), ('iand', ('ilt', a, b), ('ilt', a, c))), 1115bf215546Sopenharmony_ci (('ilt', ('imax', '#a', b), '#c'), ('iand', ('ilt', a, c), ('ilt', b, c))), 1116bf215546Sopenharmony_ci (('ige', '#a', ('imax', 
'#b', c)), ('iand', ('ige', a, b), ('ige', a, c))), 1117bf215546Sopenharmony_ci (('ige', ('imin', '#a', b), '#c'), ('iand', ('ige', a, c), ('ige', b, c))), 1118bf215546Sopenharmony_ci (('ult', '#a', ('umin', '#b', c)), ('iand', ('ult', a, b), ('ult', a, c))), 1119bf215546Sopenharmony_ci (('ult', ('umax', '#a', b), '#c'), ('iand', ('ult', a, c), ('ult', b, c))), 1120bf215546Sopenharmony_ci (('uge', '#a', ('umax', '#b', c)), ('iand', ('uge', a, b), ('uge', a, c))), 1121bf215546Sopenharmony_ci (('uge', ('umin', '#a', b), '#c'), ('iand', ('uge', a, c), ('uge', b, c))), 1122bf215546Sopenharmony_ci 1123bf215546Sopenharmony_ci # Thanks to sign extension, the ishr(a, b) is negative if and only if a is 1124bf215546Sopenharmony_ci # negative. 1125bf215546Sopenharmony_ci (('bcsel', ('ilt', a, 0), ('ineg', ('ishr', a, b)), ('ishr', a, b)), 1126bf215546Sopenharmony_ci ('iabs', ('ishr', a, b))), 1127bf215546Sopenharmony_ci (('iabs', ('ishr', ('iabs', a), b)), ('ishr', ('iabs', a), b)), 1128bf215546Sopenharmony_ci 1129bf215546Sopenharmony_ci (('fabs', ('slt', a, b)), ('slt', a, b)), 1130bf215546Sopenharmony_ci (('fabs', ('sge', a, b)), ('sge', a, b)), 1131bf215546Sopenharmony_ci (('fabs', ('seq', a, b)), ('seq', a, b)), 1132bf215546Sopenharmony_ci (('fabs', ('sne', a, b)), ('sne', a, b)), 1133bf215546Sopenharmony_ci (('slt', a, b), ('b2f', ('flt', a, b)), 'options->lower_scmp'), 1134bf215546Sopenharmony_ci (('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'), 1135bf215546Sopenharmony_ci (('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'), 1136bf215546Sopenharmony_ci (('sne', a, b), ('b2f', ('fneu', a, b)), 'options->lower_scmp'), 1137bf215546Sopenharmony_ci (('seq', ('seq', a, b), 1.0), ('seq', a, b)), 1138bf215546Sopenharmony_ci (('seq', ('sne', a, b), 1.0), ('sne', a, b)), 1139bf215546Sopenharmony_ci (('seq', ('slt', a, b), 1.0), ('slt', a, b)), 1140bf215546Sopenharmony_ci (('seq', ('sge', a, b), 1.0), ('sge', a, b)), 1141bf215546Sopenharmony_ci (('sne', 
('seq', a, b), 0.0), ('seq', a, b)), 1142bf215546Sopenharmony_ci (('sne', ('sne', a, b), 0.0), ('sne', a, b)), 1143bf215546Sopenharmony_ci (('sne', ('slt', a, b), 0.0), ('slt', a, b)), 1144bf215546Sopenharmony_ci (('sne', ('sge', a, b), 0.0), ('sge', a, b)), 1145bf215546Sopenharmony_ci (('seq', ('seq', a, b), 0.0), ('sne', a, b)), 1146bf215546Sopenharmony_ci (('seq', ('sne', a, b), 0.0), ('seq', a, b)), 1147bf215546Sopenharmony_ci (('seq', ('slt', a, b), 0.0), ('sge', a, b)), 1148bf215546Sopenharmony_ci (('seq', ('sge', a, b), 0.0), ('slt', a, b)), 1149bf215546Sopenharmony_ci (('sne', ('seq', a, b), 1.0), ('sne', a, b)), 1150bf215546Sopenharmony_ci (('sne', ('sne', a, b), 1.0), ('seq', a, b)), 1151bf215546Sopenharmony_ci (('sne', ('slt', a, b), 1.0), ('sge', a, b)), 1152bf215546Sopenharmony_ci (('sne', ('sge', a, b), 1.0), ('slt', a, b)), 1153bf215546Sopenharmony_ci (('fall_equal2', a, b), ('fmin', ('seq', 'a.x', 'b.x'), ('seq', 'a.y', 'b.y')), 'options->lower_vector_cmp'), 1154bf215546Sopenharmony_ci (('fall_equal3', a, b), ('seq', ('fany_nequal3', a, b), 0.0), 'options->lower_vector_cmp'), 1155bf215546Sopenharmony_ci (('fall_equal4', a, b), ('seq', ('fany_nequal4', a, b), 0.0), 'options->lower_vector_cmp'), 1156bf215546Sopenharmony_ci (('fany_nequal2', a, b), ('fmax', ('sne', 'a.x', 'b.x'), ('sne', 'a.y', 'b.y')), 'options->lower_vector_cmp'), 1157bf215546Sopenharmony_ci (('fany_nequal3', a, b), ('fsat', ('fdot3', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'), 1158bf215546Sopenharmony_ci (('fany_nequal4', a, b), ('fsat', ('fdot4', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'), 1159bf215546Sopenharmony_ci 1160bf215546Sopenharmony_ci (('ball_iequal2', a, b), ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), 'options->lower_vector_cmp'), 1161bf215546Sopenharmony_ci (('ball_iequal3', a, b), ('iand', ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), ('ieq', 'a.z', 'b.z')), 'options->lower_vector_cmp'), 
1162bf215546Sopenharmony_ci (('ball_iequal4', a, b), ('iand', ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), ('iand', ('ieq', 'a.z', 'b.z'), ('ieq', 'a.w', 'b.w'))), 'options->lower_vector_cmp'), 1163bf215546Sopenharmony_ci 1164bf215546Sopenharmony_ci (('bany_inequal2', a, b), ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), 'options->lower_vector_cmp'), 1165bf215546Sopenharmony_ci (('bany_inequal3', a, b), ('ior', ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), ('ine', 'a.z', 'b.z')), 'options->lower_vector_cmp'), 1166bf215546Sopenharmony_ci (('bany_inequal4', a, b), ('ior', ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), ('ior', ('ine', 'a.z', 'b.z'), ('ine', 'a.w', 'b.w'))), 'options->lower_vector_cmp'), 1167bf215546Sopenharmony_ci 1168bf215546Sopenharmony_ci (('ball_fequal2', a, b), ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), 'options->lower_vector_cmp'), 1169bf215546Sopenharmony_ci (('ball_fequal3', a, b), ('iand', ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), ('feq', 'a.z', 'b.z')), 'options->lower_vector_cmp'), 1170bf215546Sopenharmony_ci (('ball_fequal4', a, b), ('iand', ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), ('iand', ('feq', 'a.z', 'b.z'), ('feq', 'a.w', 'b.w'))), 'options->lower_vector_cmp'), 1171bf215546Sopenharmony_ci 1172bf215546Sopenharmony_ci (('bany_fnequal2', a, b), ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), 'options->lower_vector_cmp'), 1173bf215546Sopenharmony_ci (('bany_fnequal3', a, b), ('ior', ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), ('fneu', 'a.z', 'b.z')), 'options->lower_vector_cmp'), 1174bf215546Sopenharmony_ci (('bany_fnequal4', a, b), ('ior', ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), ('ior', ('fneu', 'a.z', 'b.z'), ('fneu', 'a.w', 'b.w'))), 'options->lower_vector_cmp'), 1175bf215546Sopenharmony_ci 1176bf215546Sopenharmony_ci (('feq', ('seq', a, b), 1.0), ('feq', a, b)), 1177bf215546Sopenharmony_ci (('feq', ('sne', a, b), 1.0), 
('fneu', a, b)), 1178bf215546Sopenharmony_ci (('feq', ('slt', a, b), 1.0), ('flt', a, b)), 1179bf215546Sopenharmony_ci (('feq', ('sge', a, b), 1.0), ('fge', a, b)), 1180bf215546Sopenharmony_ci (('fneu', ('seq', a, b), 0.0), ('feq', a, b)), 1181bf215546Sopenharmony_ci (('fneu', ('sne', a, b), 0.0), ('fneu', a, b)), 1182bf215546Sopenharmony_ci (('fneu', ('slt', a, b), 0.0), ('flt', a, b)), 1183bf215546Sopenharmony_ci (('fneu', ('sge', a, b), 0.0), ('fge', a, b)), 1184bf215546Sopenharmony_ci (('feq', ('seq', a, b), 0.0), ('fneu', a, b)), 1185bf215546Sopenharmony_ci (('feq', ('sne', a, b), 0.0), ('feq', a, b)), 1186bf215546Sopenharmony_ci (('feq', ('slt', a, b), 0.0), ('fge', a, b)), 1187bf215546Sopenharmony_ci (('feq', ('sge', a, b), 0.0), ('flt', a, b)), 1188bf215546Sopenharmony_ci (('fneu', ('seq', a, b), 1.0), ('fneu', a, b)), 1189bf215546Sopenharmony_ci (('fneu', ('sne', a, b), 1.0), ('feq', a, b)), 1190bf215546Sopenharmony_ci (('fneu', ('slt', a, b), 1.0), ('fge', a, b)), 1191bf215546Sopenharmony_ci (('fneu', ('sge', a, b), 1.0), ('flt', a, b)), 1192bf215546Sopenharmony_ci 1193bf215546Sopenharmony_ci (('fneu', ('fneg', a), a), ('fneu', a, 0.0)), 1194bf215546Sopenharmony_ci (('feq', ('fneg', a), a), ('feq', a, 0.0)), 1195bf215546Sopenharmony_ci # Emulating booleans 1196bf215546Sopenharmony_ci (('imul', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))), 1197bf215546Sopenharmony_ci (('iand', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))), 1198bf215546Sopenharmony_ci (('ior', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('ior', a, b))), 1199bf215546Sopenharmony_ci (('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))), 1200bf215546Sopenharmony_ci (('fsat', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('b2f', ('ior', a, b))), 1201bf215546Sopenharmony_ci (('iand', 'a@bool16', 1.0), ('b2f', a)), 1202bf215546Sopenharmony_ci (('iand', 'a@bool32', 1.0), ('b2f', a)), 1203bf215546Sopenharmony_ci (('flt', ('fneg', ('b2f', 'a@1')), 0), a), # Generated 
by TGSI KILL_IF. 1204bf215546Sopenharmony_ci # Comparison with the same args. Note that these are only done for the 1205bf215546Sopenharmony_ci # float versions when the source must be a number. Generally, NaN cmp NaN 1206bf215546Sopenharmony_ci # produces the opposite result of X cmp X. flt is the outlier. NaN < NaN 1207bf215546Sopenharmony_ci # is false, and, for any number X, X < X is also false. 1208bf215546Sopenharmony_ci (('ilt', a, a), False), 1209bf215546Sopenharmony_ci (('ige', a, a), True), 1210bf215546Sopenharmony_ci (('ieq', a, a), True), 1211bf215546Sopenharmony_ci (('ine', a, a), False), 1212bf215546Sopenharmony_ci (('ult', a, a), False), 1213bf215546Sopenharmony_ci (('uge', a, a), True), 1214bf215546Sopenharmony_ci (('flt', a, a), False), 1215bf215546Sopenharmony_ci (('fge', 'a(is_a_number)', a), True), 1216bf215546Sopenharmony_ci (('feq', 'a(is_a_number)', a), True), 1217bf215546Sopenharmony_ci (('fneu', 'a(is_a_number)', a), False), 1218bf215546Sopenharmony_ci # Logical and bit operations 1219bf215546Sopenharmony_ci (('iand', a, a), a), 1220bf215546Sopenharmony_ci (('iand', a, ~0), a), 1221bf215546Sopenharmony_ci (('iand', a, 0), 0), 1222bf215546Sopenharmony_ci (('ior', a, a), a), 1223bf215546Sopenharmony_ci (('ior', a, 0), a), 1224bf215546Sopenharmony_ci (('ior', a, True), True), 1225bf215546Sopenharmony_ci (('ixor', a, a), 0), 1226bf215546Sopenharmony_ci (('ixor', a, 0), a), 1227bf215546Sopenharmony_ci (('ixor', a, ('ixor', a, b)), b), 1228bf215546Sopenharmony_ci (('ixor', a, -1), ('inot', a)), 1229bf215546Sopenharmony_ci (('inot', ('inot', a)), a), 1230bf215546Sopenharmony_ci (('ior', ('iand', a, b), b), b), 1231bf215546Sopenharmony_ci (('ior', ('ior', a, b), b), ('ior', a, b)), 1232bf215546Sopenharmony_ci (('iand', ('ior', a, b), b), b), 1233bf215546Sopenharmony_ci (('iand', ('iand', a, b), b), ('iand', a, b)), 1234bf215546Sopenharmony_ci # DeMorgan's Laws 1235bf215546Sopenharmony_ci (('iand', ('inot', a), ('inot', b)), ('inot', ('ior', a, 
b))), 1236bf215546Sopenharmony_ci (('ior', ('inot', a), ('inot', b)), ('inot', ('iand', a, b))), 1237bf215546Sopenharmony_ci # Shift optimizations 1238bf215546Sopenharmony_ci (('ishl', 0, a), 0), 1239bf215546Sopenharmony_ci (('ishl', a, 0), a), 1240bf215546Sopenharmony_ci (('ishr', 0, a), 0), 1241bf215546Sopenharmony_ci (('ishr', -1, a), -1), 1242bf215546Sopenharmony_ci (('ishr', a, 0), a), 1243bf215546Sopenharmony_ci (('ushr', 0, a), 0), 1244bf215546Sopenharmony_ci (('ushr', a, 0), a), 1245bf215546Sopenharmony_ci (('ior', ('ishl@16', a, b), ('ushr@16', a, ('iadd', 16, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'), 1246bf215546Sopenharmony_ci (('ior', ('ishl@16', a, b), ('ushr@16', a, ('isub', 16, b))), ('urol', a, b), '!options->lower_rotate'), 1247bf215546Sopenharmony_ci (('ior', ('ishl@32', a, b), ('ushr@32', a, ('iadd', 32, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'), 1248bf215546Sopenharmony_ci (('ior', ('ishl@32', a, b), ('ushr@32', a, ('isub', 32, b))), ('urol', a, b), '!options->lower_rotate'), 1249bf215546Sopenharmony_ci (('ior', ('ushr@16', a, b), ('ishl@16', a, ('iadd', 16, ('ineg', b)))), ('uror', a, b), '!options->lower_rotate'), 1250bf215546Sopenharmony_ci (('ior', ('ushr@16', a, b), ('ishl@16', a, ('isub', 16, b))), ('uror', a, b), '!options->lower_rotate'), 1251bf215546Sopenharmony_ci (('ior', ('ushr@32', a, b), ('ishl@32', a, ('iadd', 32, ('ineg', b)))), ('uror', a, b), '!options->lower_rotate'), 1252bf215546Sopenharmony_ci (('ior', ('ushr@32', a, b), ('ishl@32', a, ('isub', 32, b))), ('uror', a, b), '!options->lower_rotate'), 1253bf215546Sopenharmony_ci (('urol@16', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 16, b))), 'options->lower_rotate'), 1254bf215546Sopenharmony_ci (('urol@32', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 32, b))), 'options->lower_rotate'), 1255bf215546Sopenharmony_ci (('uror@16', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 16, b))), 'options->lower_rotate'), 
1256bf215546Sopenharmony_ci (('uror@32', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 32, b))), 'options->lower_rotate'), 1257bf215546Sopenharmony_ci # Exponential/logarithmic identities 1258bf215546Sopenharmony_ci (('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a 1259bf215546Sopenharmony_ci (('~flog2', ('fexp2', a)), a), # lg2(2^a) = a 1260bf215546Sopenharmony_ci (('fpow', a, b), ('fexp2', ('fmul', ('flog2', a), b)), 'options->lower_fpow'), # a^b = 2^(lg2(a)*b) 1261bf215546Sopenharmony_ci (('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b 1262bf215546Sopenharmony_ci (('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))), 1263bf215546Sopenharmony_ci ('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) * d) = a^b * c^d 1264bf215546Sopenharmony_ci (('~fexp2', ('fmul', ('flog2', a), 0.5)), ('fsqrt', a)), 1265bf215546Sopenharmony_ci (('~fexp2', ('fmul', ('flog2', a), 2.0)), ('fmul', a, a)), 1266bf215546Sopenharmony_ci (('~fexp2', ('fmul', ('flog2', a), 4.0)), ('fmul', ('fmul', a, a), ('fmul', a, a))), 1267bf215546Sopenharmony_ci (('~fpow', a, 1.0), a), 1268bf215546Sopenharmony_ci (('~fpow', a, 2.0), ('fmul', a, a)), 1269bf215546Sopenharmony_ci (('~fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))), 1270bf215546Sopenharmony_ci (('~fpow', 2.0, a), ('fexp2', a)), 1271bf215546Sopenharmony_ci (('~fpow', ('fpow', a, 2.2), 0.454545), a), 1272bf215546Sopenharmony_ci (('~fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)), 1273bf215546Sopenharmony_ci (('~fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))), 1274bf215546Sopenharmony_ci (('~frcp', ('fexp2', a)), ('fexp2', ('fneg', a))), 1275bf215546Sopenharmony_ci (('~frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))), 1276bf215546Sopenharmony_ci (('~flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))), 1277bf215546Sopenharmony_ci (('~flog2', ('frcp', a)), ('fneg', ('flog2', a))),
1278bf215546Sopenharmony_ci (('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))), 1279bf215546Sopenharmony_ci (('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))), 1280bf215546Sopenharmony_ci (('~fmul', ('fexp2(is_used_once)', a), ('fexp2(is_used_once)', b)), ('fexp2', ('fadd', a, b))), 1281bf215546Sopenharmony_ci (('bcsel', ('flt', a, 0.0), 0.0, ('fsqrt', a)), ('fsqrt', ('fmax', a, 0.0))), 1282bf215546Sopenharmony_ci (('~fmul', ('fsqrt', a), ('fsqrt', a)), ('fabs',a)), 1283bf215546Sopenharmony_ci (('~fmulz', ('fsqrt', a), ('fsqrt', a)), ('fabs', a)), 1284bf215546Sopenharmony_ci # Division and reciprocal 1285bf215546Sopenharmony_ci (('~fdiv', 1.0, a), ('frcp', a)), 1286bf215546Sopenharmony_ci (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'), 1287bf215546Sopenharmony_ci (('~frcp', ('frcp', a)), a), 1288bf215546Sopenharmony_ci (('~frcp', ('fsqrt', a)), ('frsq', a)), 1289bf215546Sopenharmony_ci (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'), 1290bf215546Sopenharmony_ci (('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'), 1291bf215546Sopenharmony_ci # Trig 1292bf215546Sopenharmony_ci (('fsin', a), lowered_sincos(0.5), 'options->lower_sincos'), 1293bf215546Sopenharmony_ci (('fcos', a), lowered_sincos(0.75), 'options->lower_sincos'), 1294bf215546Sopenharmony_ci # Boolean simplifications 1295bf215546Sopenharmony_ci (('i2b16(is_used_by_if)', a), ('ine16', a, 0)), 1296bf215546Sopenharmony_ci (('i2b32(is_used_by_if)', a), ('ine32', a, 0)), 1297bf215546Sopenharmony_ci (('i2b1(is_used_by_if)', a), ('ine', a, 0)), 1298bf215546Sopenharmony_ci (('ieq', a, True), a), 1299bf215546Sopenharmony_ci (('ine(is_not_used_by_if)', a, True), ('inot', a)), 1300bf215546Sopenharmony_ci (('ine', a, False), a), 1301bf215546Sopenharmony_ci (('ieq(is_not_used_by_if)', a, False), ('inot', 'a')), 1302bf215546Sopenharmony_ci (('bcsel', a, True, False), a), 1303bf215546Sopenharmony_ci (('bcsel', a, False, True), ('inot', a)), 
1304bf215546Sopenharmony_ci (('bcsel', True, b, c), b), 1305bf215546Sopenharmony_ci (('bcsel', False, b, c), c), 1306bf215546Sopenharmony_ci 1307bf215546Sopenharmony_ci (('bcsel@16', a, 1.0, 0.0), ('b2f', a)), 1308bf215546Sopenharmony_ci (('bcsel@16', a, 0.0, 1.0), ('b2f', ('inot', a))), 1309bf215546Sopenharmony_ci (('bcsel@16', a, -1.0, -0.0), ('fneg', ('b2f', a))), 1310bf215546Sopenharmony_ci (('bcsel@16', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))), 1311bf215546Sopenharmony_ci (('bcsel@32', a, 1.0, 0.0), ('b2f', a)), 1312bf215546Sopenharmony_ci (('bcsel@32', a, 0.0, 1.0), ('b2f', ('inot', a))), 1313bf215546Sopenharmony_ci (('bcsel@32', a, -1.0, -0.0), ('fneg', ('b2f', a))), 1314bf215546Sopenharmony_ci (('bcsel@32', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))), 1315bf215546Sopenharmony_ci (('bcsel@64', a, 1.0, 0.0), ('b2f', a), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1316bf215546Sopenharmony_ci (('bcsel@64', a, 0.0, 1.0), ('b2f', ('inot', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1317bf215546Sopenharmony_ci (('bcsel@64', a, -1.0, -0.0), ('fneg', ('b2f', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1318bf215546Sopenharmony_ci (('bcsel@64', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a))), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1319bf215546Sopenharmony_ci 1320bf215546Sopenharmony_ci (('bcsel', a, b, b), b), 1321bf215546Sopenharmony_ci (('~fcsel', a, b, b), b), 1322bf215546Sopenharmony_ci 1323bf215546Sopenharmony_ci # D3D Boolean emulation 1324bf215546Sopenharmony_ci (('bcsel', a, -1, 0), ('ineg', ('b2i', 'a@1'))), 1325bf215546Sopenharmony_ci (('bcsel', a, 0, -1), ('ineg', ('b2i', ('inot', a)))), 1326bf215546Sopenharmony_ci (('bcsel', a, 1, 0), ('b2i', 'a@1')), 1327bf215546Sopenharmony_ci (('bcsel', a, 0, 1), ('b2i', ('inot', a))), 1328bf215546Sopenharmony_ci (('iand', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 
1329bf215546Sopenharmony_ci ('ineg', ('b2i', ('iand', a, b)))), 1330bf215546Sopenharmony_ci (('ior', ('ineg', ('b2i','a@1')), ('ineg', ('b2i', 'b@1'))), 1331bf215546Sopenharmony_ci ('ineg', ('b2i', ('ior', a, b)))), 1332bf215546Sopenharmony_ci (('ieq', ('ineg', ('b2i', 'a@1')), 0), ('inot', a)), 1333bf215546Sopenharmony_ci (('ieq', ('ineg', ('b2i', 'a@1')), -1), a), 1334bf215546Sopenharmony_ci (('ine', ('ineg', ('b2i', 'a@1')), 0), a), 1335bf215546Sopenharmony_ci (('ine', ('ineg', ('b2i', 'a@1')), -1), ('inot', a)), 1336bf215546Sopenharmony_ci (('ige', ('ineg', ('b2i', 'a@1')), 0), ('inot', a)), 1337bf215546Sopenharmony_ci (('ilt', ('ineg', ('b2i', 'a@1')), 0), a), 1338bf215546Sopenharmony_ci (('ult', 0, ('ineg', ('b2i', 'a@1'))), a), 1339bf215546Sopenharmony_ci (('iand', ('ineg', ('b2i', a)), 1.0), ('b2f', a)), 1340bf215546Sopenharmony_ci (('iand', ('ineg', ('b2i', a)), 1), ('b2i', a)), 1341bf215546Sopenharmony_ci 1342bf215546Sopenharmony_ci # With D3D booleans, imax is AND and umax is OR 1343bf215546Sopenharmony_ci (('imax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1344bf215546Sopenharmony_ci ('ineg', ('b2i', ('iand', a, b)))), 1345bf215546Sopenharmony_ci (('imin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1346bf215546Sopenharmony_ci ('ineg', ('b2i', ('ior', a, b)))), 1347bf215546Sopenharmony_ci (('umax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1348bf215546Sopenharmony_ci ('ineg', ('b2i', ('ior', a, b)))), 1349bf215546Sopenharmony_ci (('umin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1350bf215546Sopenharmony_ci ('ineg', ('b2i', ('iand', a, b)))), 1351bf215546Sopenharmony_ci 1352bf215546Sopenharmony_ci # Conversions 1353bf215546Sopenharmony_ci (('i2b16', ('b2i', 'a@16')), a), 1354bf215546Sopenharmony_ci (('i2b32', ('b2i', 'a@32')), a), 1355bf215546Sopenharmony_ci (('f2i', ('ftrunc', a)), ('f2i', a)), 1356bf215546Sopenharmony_ci (('f2u', ('ftrunc', a)), ('f2u', a)), 1357bf215546Sopenharmony_ci (('i2b', ('ineg', a)), 
('i2b', a)), 1358bf215546Sopenharmony_ci (('i2b', ('iabs', a)), ('i2b', a)), 1359bf215546Sopenharmony_ci (('inot', ('f2b1', a)), ('feq', a, 0.0)), 1360bf215546Sopenharmony_ci 1361bf215546Sopenharmony_ci # Conversions from 16 bits to 32 bits and back can always be removed 1362bf215546Sopenharmony_ci (('f2fmp', ('f2f32', 'a@16')), a), 1363bf215546Sopenharmony_ci (('i2imp', ('i2i32', 'a@16')), a), 1364bf215546Sopenharmony_ci (('i2imp', ('u2u32', 'a@16')), a), 1365bf215546Sopenharmony_ci 1366bf215546Sopenharmony_ci (('f2imp', ('f2f32', 'a@16')), ('f2i16', a)), 1367bf215546Sopenharmony_ci (('f2ump', ('f2f32', 'a@16')), ('f2u16', a)), 1368bf215546Sopenharmony_ci (('i2fmp', ('i2i32', 'a@16')), ('i2f16', a)), 1369bf215546Sopenharmony_ci (('u2fmp', ('u2u32', 'a@16')), ('u2f16', a)), 1370bf215546Sopenharmony_ci 1371bf215546Sopenharmony_ci (('f2fmp', ('b2f32', 'a@1')), ('b2f16', a)), 1372bf215546Sopenharmony_ci (('i2imp', ('b2i32', 'a@1')), ('b2i16', a)), 1373bf215546Sopenharmony_ci (('i2imp', ('b2i32', 'a@1')), ('b2i16', a)), 1374bf215546Sopenharmony_ci 1375bf215546Sopenharmony_ci (('f2imp', ('b2f32', 'a@1')), ('b2i16', a)), 1376bf215546Sopenharmony_ci (('f2ump', ('b2f32', 'a@1')), ('b2i16', a)), 1377bf215546Sopenharmony_ci (('i2fmp', ('b2i32', 'a@1')), ('b2f16', a)), 1378bf215546Sopenharmony_ci (('u2fmp', ('b2i32', 'a@1')), ('b2f16', a)), 1379bf215546Sopenharmony_ci 1380bf215546Sopenharmony_ci # Conversions to 16 bits would be lossy so they should only be removed if 1381bf215546Sopenharmony_ci # the instruction was generated by the precision lowering pass. 
1382bf215546Sopenharmony_ci (('f2f32', ('f2fmp', 'a@32')), a), 1383bf215546Sopenharmony_ci (('i2i32', ('i2imp', 'a@32')), a), 1384bf215546Sopenharmony_ci (('u2u32', ('i2imp', 'a@32')), a), 1385bf215546Sopenharmony_ci 1386bf215546Sopenharmony_ci (('i2i32', ('f2imp', 'a@32')), ('f2i32', a)), 1387bf215546Sopenharmony_ci (('u2u32', ('f2ump', 'a@32')), ('f2u32', a)), 1388bf215546Sopenharmony_ci (('f2f32', ('i2fmp', 'a@32')), ('i2f32', a)), 1389bf215546Sopenharmony_ci (('f2f32', ('u2fmp', 'a@32')), ('u2f32', a)), 1390bf215546Sopenharmony_ci 1391bf215546Sopenharmony_ci # Conversions from float32 to float64 and back can be removed as long as 1392bf215546Sopenharmony_ci # it doesn't need to be precise, since the conversion may e.g. flush denorms 1393bf215546Sopenharmony_ci (('~f2f32', ('f2f64', 'a@32')), a), 1394bf215546Sopenharmony_ci 1395bf215546Sopenharmony_ci (('ffloor', 'a(is_integral)'), a), 1396bf215546Sopenharmony_ci (('fceil', 'a(is_integral)'), a), 1397bf215546Sopenharmony_ci (('ftrunc', 'a(is_integral)'), a), 1398bf215546Sopenharmony_ci (('fround_even', 'a(is_integral)'), a), 1399bf215546Sopenharmony_ci 1400bf215546Sopenharmony_ci # fract(x) = x - floor(x), so fract(NaN) = NaN 1401bf215546Sopenharmony_ci (('~ffract', 'a(is_integral)'), 0.0), 1402bf215546Sopenharmony_ci (('fabs', 'a(is_not_negative)'), a), 1403bf215546Sopenharmony_ci (('iabs', 'a(is_not_negative)'), a), 1404bf215546Sopenharmony_ci (('fsat', 'a(is_not_positive)'), 0.0), 1405bf215546Sopenharmony_ci 1406bf215546Sopenharmony_ci (('~fmin', 'a(is_not_negative)', 1.0), ('fsat', a), '!options->lower_fsat'), 1407bf215546Sopenharmony_ci 1408bf215546Sopenharmony_ci # The result of the multiply must be in [-1, 0], so the result of the ffma 1409bf215546Sopenharmony_ci # must be in [0, 1]. 
1410bf215546Sopenharmony_ci (('flt', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), False), 1411bf215546Sopenharmony_ci (('flt', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), False), 1412bf215546Sopenharmony_ci (('fmax', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0)), 1413bf215546Sopenharmony_ci (('fmax', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0)), 1414bf215546Sopenharmony_ci 1415bf215546Sopenharmony_ci (('fneu', 'a(is_not_zero)', 0.0), True), 1416bf215546Sopenharmony_ci (('feq', 'a(is_not_zero)', 0.0), False), 1417bf215546Sopenharmony_ci 1418bf215546Sopenharmony_ci # In this chart, + means value > 0 and - means value < 0. 1419bf215546Sopenharmony_ci # 1420bf215546Sopenharmony_ci # + >= + -> unknown 0 >= + -> false - >= + -> false 1421bf215546Sopenharmony_ci # + >= 0 -> true 0 >= 0 -> true - >= 0 -> false 1422bf215546Sopenharmony_ci # + >= - -> true 0 >= - -> true - >= - -> unknown 1423bf215546Sopenharmony_ci # 1424bf215546Sopenharmony_ci # Using grouping conceptually similar to a Karnaugh map... 1425bf215546Sopenharmony_ci # 1426bf215546Sopenharmony_ci # (+ >= 0, + >= -, 0 >= 0, 0 >= -) == (is_not_negative >= is_not_positive) -> true 1427bf215546Sopenharmony_ci # (0 >= +, - >= +) == (is_not_positive >= gt_zero) -> false 1428bf215546Sopenharmony_ci # (- >= +, - >= 0) == (lt_zero >= is_not_negative) -> false 1429bf215546Sopenharmony_ci # 1430bf215546Sopenharmony_ci # The flt / ilt cases just invert the expected result. 1431bf215546Sopenharmony_ci # 1432bf215546Sopenharmony_ci # The results expecting true, must be marked imprecise. The results 1433bf215546Sopenharmony_ci # expecting false are fine because NaN compared >= or < anything is false. 
1434bf215546Sopenharmony_ci 1435bf215546Sopenharmony_ci (('fge', 'a(is_a_number_not_negative)', 'b(is_a_number_not_positive)'), True), 1436bf215546Sopenharmony_ci (('fge', 'a(is_not_positive)', 'b(is_gt_zero)'), False), 1437bf215546Sopenharmony_ci (('fge', 'a(is_lt_zero)', 'b(is_not_negative)'), False), 1438bf215546Sopenharmony_ci 1439bf215546Sopenharmony_ci (('flt', 'a(is_not_negative)', 'b(is_not_positive)'), False), 1440bf215546Sopenharmony_ci (('flt', 'a(is_a_number_not_positive)', 'b(is_a_number_gt_zero)'), True), 1441bf215546Sopenharmony_ci (('flt', 'a(is_a_number_lt_zero)', 'b(is_a_number_not_negative)'), True), 1442bf215546Sopenharmony_ci 1443bf215546Sopenharmony_ci (('ine', 'a(is_not_zero)', 0), True), 1444bf215546Sopenharmony_ci (('ieq', 'a(is_not_zero)', 0), False), 1445bf215546Sopenharmony_ci 1446bf215546Sopenharmony_ci (('ige', 'a(is_not_negative)', 'b(is_not_positive)'), True), 1447bf215546Sopenharmony_ci (('ige', 'a(is_not_positive)', 'b(is_gt_zero)'), False), 1448bf215546Sopenharmony_ci (('ige', 'a(is_lt_zero)', 'b(is_not_negative)'), False), 1449bf215546Sopenharmony_ci 1450bf215546Sopenharmony_ci (('ilt', 'a(is_not_negative)', 'b(is_not_positive)'), False), 1451bf215546Sopenharmony_ci (('ilt', 'a(is_not_positive)', 'b(is_gt_zero)'), True), 1452bf215546Sopenharmony_ci (('ilt', 'a(is_lt_zero)', 'b(is_not_negative)'), True), 1453bf215546Sopenharmony_ci 1454bf215546Sopenharmony_ci (('ult', 0, 'a(is_gt_zero)'), True), 1455bf215546Sopenharmony_ci (('ult', a, 0), False), 1456bf215546Sopenharmony_ci 1457bf215546Sopenharmony_ci # Packing and then unpacking does nothing 1458bf215546Sopenharmony_ci (('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a), 1459bf215546Sopenharmony_ci (('unpack_64_2x32_split_y', ('pack_64_2x32_split', a, b)), b), 1460bf215546Sopenharmony_ci (('unpack_64_2x32_split_x', ('pack_64_2x32', a)), 'a.x'), 1461bf215546Sopenharmony_ci (('unpack_64_2x32_split_y', ('pack_64_2x32', a)), 'a.y'), 1462bf215546Sopenharmony_ci 
(('unpack_64_2x32', ('pack_64_2x32_split', a, b)), ('vec2', a, b)), 1463bf215546Sopenharmony_ci (('unpack_64_2x32', ('pack_64_2x32', a)), a), 1464bf215546Sopenharmony_ci (('unpack_double_2x32_dxil', ('pack_double_2x32_dxil', a)), a), 1465bf215546Sopenharmony_ci (('pack_64_2x32_split', ('unpack_64_2x32_split_x', a), 1466bf215546Sopenharmony_ci ('unpack_64_2x32_split_y', a)), a), 1467bf215546Sopenharmony_ci (('pack_64_2x32', ('vec2', ('unpack_64_2x32_split_x', a), 1468bf215546Sopenharmony_ci ('unpack_64_2x32_split_y', a))), a), 1469bf215546Sopenharmony_ci (('pack_64_2x32', ('unpack_64_2x32', a)), a), 1470bf215546Sopenharmony_ci (('pack_double_2x32_dxil', ('unpack_double_2x32_dxil', a)), a), 1471bf215546Sopenharmony_ci 1472bf215546Sopenharmony_ci # Comparing two halves of an unpack separately. While this optimization 1473bf215546Sopenharmony_ci # should be correct for non-constant values, it's less obvious that it's 1474bf215546Sopenharmony_ci # useful in that case. For constant values, the pack will fold and we're 1475bf215546Sopenharmony_ci # guaranteed to reduce the whole tree to one instruction. 
1476bf215546Sopenharmony_ci (('iand', ('ieq', ('unpack_32_2x16_split_x', a), '#b'), 1477bf215546Sopenharmony_ci ('ieq', ('unpack_32_2x16_split_y', a), '#c')), 1478bf215546Sopenharmony_ci ('ieq', a, ('pack_32_2x16_split', b, c))), 1479bf215546Sopenharmony_ci 1480bf215546Sopenharmony_ci # Byte extraction 1481bf215546Sopenharmony_ci (('ushr', 'a@16', 8), ('extract_u8', a, 1), '!options->lower_extract_byte'), 1482bf215546Sopenharmony_ci (('ushr', 'a@32', 24), ('extract_u8', a, 3), '!options->lower_extract_byte'), 1483bf215546Sopenharmony_ci (('ushr', 'a@64', 56), ('extract_u8', a, 7), '!options->lower_extract_byte'), 1484bf215546Sopenharmony_ci (('ishr', 'a@16', 8), ('extract_i8', a, 1), '!options->lower_extract_byte'), 1485bf215546Sopenharmony_ci (('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'), 1486bf215546Sopenharmony_ci (('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'), 1487bf215546Sopenharmony_ci (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'), 1488bf215546Sopenharmony_ci 1489bf215546Sopenharmony_ci # Common pattern in many Vulkan CTS tests that read 8-bit integers from a 1490bf215546Sopenharmony_ci # storage buffer. 1491bf215546Sopenharmony_ci (('u2u8', ('extract_u16', a, 1)), ('u2u8', ('extract_u8', a, 2)), '!options->lower_extract_byte'), 1492bf215546Sopenharmony_ci (('u2u8', ('ushr', a, 8)), ('u2u8', ('extract_u8', a, 1)), '!options->lower_extract_byte'), 1493bf215546Sopenharmony_ci 1494bf215546Sopenharmony_ci # Common pattern after lowering 8-bit integers to 16-bit. 
1495bf215546Sopenharmony_ci (('i2i16', ('u2u8', ('extract_u8', a, b))), ('i2i16', ('extract_i8', a, b))), 1496bf215546Sopenharmony_ci (('u2u16', ('u2u8', ('extract_u8', a, b))), ('u2u16', ('extract_u8', a, b))), 1497bf215546Sopenharmony_ci 1498bf215546Sopenharmony_ci (('ubfe', a, 0, 8), ('extract_u8', a, 0), '!options->lower_extract_byte'), 1499bf215546Sopenharmony_ci (('ubfe', a, 8, 8), ('extract_u8', a, 1), '!options->lower_extract_byte'), 1500bf215546Sopenharmony_ci (('ubfe', a, 16, 8), ('extract_u8', a, 2), '!options->lower_extract_byte'), 1501bf215546Sopenharmony_ci (('ubfe', a, 24, 8), ('extract_u8', a, 3), '!options->lower_extract_byte'), 1502bf215546Sopenharmony_ci (('ibfe', a, 0, 8), ('extract_i8', a, 0), '!options->lower_extract_byte'), 1503bf215546Sopenharmony_ci (('ibfe', a, 8, 8), ('extract_i8', a, 1), '!options->lower_extract_byte'), 1504bf215546Sopenharmony_ci (('ibfe', a, 16, 8), ('extract_i8', a, 2), '!options->lower_extract_byte'), 1505bf215546Sopenharmony_ci (('ibfe', a, 24, 8), ('extract_i8', a, 3), '!options->lower_extract_byte'), 1506bf215546Sopenharmony_ci 1507bf215546Sopenharmony_ci (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)), 1508bf215546Sopenharmony_ci (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)), 1509bf215546Sopenharmony_ci 1510bf215546Sopenharmony_ci # Word extraction 1511bf215546Sopenharmony_ci (('ushr', ('ishl', 'a@32', 16), 16), ('extract_u16', a, 0), '!options->lower_extract_word'), 1512bf215546Sopenharmony_ci (('ushr', 'a@32', 16), ('extract_u16', a, 1), '!options->lower_extract_word'), 1513bf215546Sopenharmony_ci (('ishr', ('ishl', 'a@32', 16), 16), ('extract_i16', a, 0), '!options->lower_extract_word'), 1514bf215546Sopenharmony_ci (('ishr', 'a@32', 16), ('extract_i16', a, 1), '!options->lower_extract_word'), 1515bf215546Sopenharmony_ci (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'), 1516bf215546Sopenharmony_ci 1517bf215546Sopenharmony_ci (('ubfe', a, 0, 16), 
('extract_u16', a, 0), '!options->lower_extract_word'), 1518bf215546Sopenharmony_ci (('ubfe', a, 16, 16), ('extract_u16', a, 1), '!options->lower_extract_word'), 1519bf215546Sopenharmony_ci (('ibfe', a, 0, 16), ('extract_i16', a, 0), '!options->lower_extract_word'), 1520bf215546Sopenharmony_ci (('ibfe', a, 16, 16), ('extract_i16', a, 1), '!options->lower_extract_word'), 1521bf215546Sopenharmony_ci 1522bf215546Sopenharmony_ci # Packing a u8vec4 to write to an SSBO. 1523bf215546Sopenharmony_ci (('ior', ('ishl', ('u2u32', 'a@8'), 24), ('ior', ('ishl', ('u2u32', 'b@8'), 16), ('ior', ('ishl', ('u2u32', 'c@8'), 8), ('u2u32', 'd@8')))), 1524bf215546Sopenharmony_ci ('pack_32_4x8', ('vec4', d, c, b, a)), 'options->has_pack_32_4x8'), 1525bf215546Sopenharmony_ci 1526bf215546Sopenharmony_ci (('extract_u16', ('extract_i16', a, b), 0), ('extract_u16', a, b)), 1527bf215546Sopenharmony_ci (('extract_u16', ('extract_u16', a, b), 0), ('extract_u16', a, b)), 1528bf215546Sopenharmony_ci 1529bf215546Sopenharmony_ci # Lower pack/unpack 1530bf215546Sopenharmony_ci (('pack_64_2x32_split', a, b), ('ior', ('u2u64', a), ('ishl', ('u2u64', b), 32)), 'options->lower_pack_64_2x32_split'), 1531bf215546Sopenharmony_ci (('pack_32_2x16_split', a, b), ('ior', ('u2u32', a), ('ishl', ('u2u32', b), 16)), 'options->lower_pack_32_2x16_split'), 1532bf215546Sopenharmony_ci (('unpack_64_2x32_split_x', a), ('u2u32', a), 'options->lower_unpack_64_2x32_split'), 1533bf215546Sopenharmony_ci (('unpack_64_2x32_split_y', a), ('u2u32', ('ushr', a, 32)), 'options->lower_unpack_64_2x32_split'), 1534bf215546Sopenharmony_ci (('unpack_32_2x16_split_x', a), ('u2u16', a), 'options->lower_unpack_32_2x16_split'), 1535bf215546Sopenharmony_ci (('unpack_32_2x16_split_y', a), ('u2u16', ('ushr', a, 16)), 'options->lower_unpack_32_2x16_split'), 1536bf215546Sopenharmony_ci 1537bf215546Sopenharmony_ci # Useless masking before unpacking 1538bf215546Sopenharmony_ci (('unpack_half_2x16_split_x', ('iand', a, 0xffff)), 
('unpack_half_2x16_split_x', a)), 1539bf215546Sopenharmony_ci (('unpack_32_2x16_split_x', ('iand', a, 0xffff)), ('unpack_32_2x16_split_x', a)), 1540bf215546Sopenharmony_ci (('unpack_64_2x32_split_x', ('iand', a, 0xffffffff)), ('unpack_64_2x32_split_x', a)), 1541bf215546Sopenharmony_ci (('unpack_half_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_half_2x16_split_y', a)), 1542bf215546Sopenharmony_ci (('unpack_32_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_32_2x16_split_y', a)), 1543bf215546Sopenharmony_ci (('unpack_64_2x32_split_y', ('iand', a, 0xffffffff00000000)), ('unpack_64_2x32_split_y', a)), 1544bf215546Sopenharmony_ci 1545bf215546Sopenharmony_ci (('unpack_half_2x16_split_x', ('extract_u16', a, 0)), ('unpack_half_2x16_split_x', a)), 1546bf215546Sopenharmony_ci (('unpack_half_2x16_split_x', ('extract_u16', a, 1)), ('unpack_half_2x16_split_y', a)), 1547bf215546Sopenharmony_ci (('unpack_half_2x16_split_x', ('ushr', a, 16)), ('unpack_half_2x16_split_y', a)), 1548bf215546Sopenharmony_ci (('unpack_32_2x16_split_x', ('extract_u16', a, 0)), ('unpack_32_2x16_split_x', a)), 1549bf215546Sopenharmony_ci (('unpack_32_2x16_split_x', ('extract_u16', a, 1)), ('unpack_32_2x16_split_y', a)), 1550bf215546Sopenharmony_ci 1551bf215546Sopenharmony_ci # Optimize half packing 1552bf215546Sopenharmony_ci (('ishl', ('pack_half_2x16', ('vec2', a, 0)), 16), ('pack_half_2x16', ('vec2', 0, a))), 1553bf215546Sopenharmony_ci (('ushr', ('pack_half_2x16', ('vec2', 0, a)), 16), ('pack_half_2x16', ('vec2', a, 0))), 1554bf215546Sopenharmony_ci 1555bf215546Sopenharmony_ci (('iadd', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))), 1556bf215546Sopenharmony_ci ('pack_half_2x16', ('vec2', a, b))), 1557bf215546Sopenharmony_ci (('ior', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))), 1558bf215546Sopenharmony_ci ('pack_half_2x16', ('vec2', a, b))), 1559bf215546Sopenharmony_ci 1560bf215546Sopenharmony_ci (('ishl', ('pack_half_2x16_split', a, 0), 16), 
('pack_half_2x16_split', 0, a)), 1561bf215546Sopenharmony_ci (('ushr', ('pack_half_2x16_split', 0, a), 16), ('pack_half_2x16_split', a, 0)), 1562bf215546Sopenharmony_ci (('extract_u16', ('pack_half_2x16_split', 0, a), 1), ('pack_half_2x16_split', a, 0)), 1563bf215546Sopenharmony_ci 1564bf215546Sopenharmony_ci (('iadd', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)), 1565bf215546Sopenharmony_ci (('ior', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)), 1566bf215546Sopenharmony_ci 1567bf215546Sopenharmony_ci (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 0), ('i2i', a)), 1568bf215546Sopenharmony_ci (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 1), ('i2i', b)), 1569bf215546Sopenharmony_ci (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 2), ('i2i', c)), 1570bf215546Sopenharmony_ci (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 3), ('i2i', d)), 1571bf215546Sopenharmony_ci (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 0), ('u2u', a)), 1572bf215546Sopenharmony_ci (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 1), ('u2u', b)), 1573bf215546Sopenharmony_ci (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 2), ('u2u', c)), 1574bf215546Sopenharmony_ci (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 3), ('u2u', d)), 1575bf215546Sopenharmony_ci]) 1576bf215546Sopenharmony_ci 1577bf215546Sopenharmony_ci# After the ('extract_u8', a, 0) pattern, above, triggers, there will be 1578bf215546Sopenharmony_ci# patterns like those below. 
1579bf215546Sopenharmony_cifor op in ('ushr', 'ishr'): 1580bf215546Sopenharmony_ci optimizations.extend([(('extract_u8', (op, 'a@16', 8), 0), ('extract_u8', a, 1))]) 1581bf215546Sopenharmony_ci optimizations.extend([(('extract_u8', (op, 'a@32', 8 * i), 0), ('extract_u8', a, i)) for i in range(1, 4)]) 1582bf215546Sopenharmony_ci optimizations.extend([(('extract_u8', (op, 'a@64', 8 * i), 0), ('extract_u8', a, i)) for i in range(1, 8)]) 1583bf215546Sopenharmony_ci 1584bf215546Sopenharmony_cioptimizations.extend([(('extract_u8', ('extract_u16', a, 1), 0), ('extract_u8', a, 2))]) 1585bf215546Sopenharmony_ci 1586bf215546Sopenharmony_ci# After the ('extract_[iu]8', a, 3) patterns, above, trigger, there will be 1587bf215546Sopenharmony_ci# patterns like those below. 1588bf215546Sopenharmony_cifor op in ('extract_u8', 'extract_i8'): 1589bf215546Sopenharmony_ci optimizations.extend([((op, ('ishl', 'a@16', 8), 1), (op, a, 0))]) 1590bf215546Sopenharmony_ci optimizations.extend([((op, ('ishl', 'a@32', 24 - 8 * i), 3), (op, a, i)) for i in range(2, -1, -1)]) 1591bf215546Sopenharmony_ci optimizations.extend([((op, ('ishl', 'a@64', 56 - 8 * i), 7), (op, a, i)) for i in range(6, -1, -1)]) 1592bf215546Sopenharmony_ci 1593bf215546Sopenharmony_cioptimizations.extend([ 1594bf215546Sopenharmony_ci # Subtracts 1595bf215546Sopenharmony_ci (('ussub_4x8_vc4', a, 0), a), 1596bf215546Sopenharmony_ci (('ussub_4x8_vc4', a, ~0), 0), 1597bf215546Sopenharmony_ci # Lower all Subtractions first - they can get recombined later 1598bf215546Sopenharmony_ci (('fsub', a, b), ('fadd', a, ('fneg', b))), 1599bf215546Sopenharmony_ci (('isub', a, b), ('iadd', a, ('ineg', b))), 1600bf215546Sopenharmony_ci (('uabs_usub', a, b), ('bcsel', ('ult', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))), 1601bf215546Sopenharmony_ci # This is correct. We don't need isub_sat because the result type is unsigned, so it cannot overflow. 
1602bf215546Sopenharmony_ci (('uabs_isub', a, b), ('bcsel', ('ilt', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))), 1603bf215546Sopenharmony_ci 1604bf215546Sopenharmony_ci # Propagate negation up multiplication chains 1605bf215546Sopenharmony_ci (('fmul(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmul', a, b))), 1606bf215546Sopenharmony_ci (('fmulz(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmulz', a, b)), '!'+signed_zero_inf_nan_preserve_32), 1607bf215546Sopenharmony_ci (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)), 1608bf215546Sopenharmony_ci (('ffmaz', ('fneg', a), ('fneg', b), c), ('ffmaz', a, b, c)), 1609bf215546Sopenharmony_ci (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))), 1610bf215546Sopenharmony_ci 1611bf215546Sopenharmony_ci # Propagate constants up multiplication chains 1612bf215546Sopenharmony_ci (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmul', ('fmul', a, c), b)), 1613bf215546Sopenharmony_ci (('~fmulz(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmulz', ('fmulz', a, c), b)), 1614bf215546Sopenharmony_ci (('~fmul(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)'), ('fmulz', ('fmul', a, c), b)), 1615bf215546Sopenharmony_ci (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('imul', ('imul', a, c), b)), 1616bf215546Sopenharmony_ci (('~ffma', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffma', ('fmul', a, c), b, d)), 1617bf215546Sopenharmony_ci (('~ffmaz', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffmaz', ('fmulz', a, c), b, d)), 1618bf215546Sopenharmony_ci (('~ffma', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)', d), ('ffmaz', ('fmul', a, c), b, d)), 1619bf215546Sopenharmony_ci # Prefer moving out a multiplication for more 
MAD/FMA-friendly code 1620bf215546Sopenharmony_ci (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_fmul)'), '#c'), ('fadd', ('fadd', a, c), b)), 1621bf215546Sopenharmony_ci (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fadd', ('fadd', a, c), b)), 1622bf215546Sopenharmony_ci (('~fadd(is_used_once)', ('ffma(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffma', a, b, d), c)), 1623bf215546Sopenharmony_ci (('~fadd(is_used_once)', ('ffmaz(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffmaz', a, b, d), c)), 1624bf215546Sopenharmony_ci (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('iadd', ('iadd', a, c), b)), 1625bf215546Sopenharmony_ci 1626bf215546Sopenharmony_ci # Reassociate constants in add/mul chains so they can be folded together. 1627bf215546Sopenharmony_ci # For now, we mostly only handle cases where the constants are separated by 1628bf215546Sopenharmony_ci # a single non-constant. We could do better eventually. 
1629bf215546Sopenharmony_ci (('~fmul', '#a', ('fmul', 'b(is_not_const)', '#c')), ('fmul', ('fmul', a, c), b)), 1630bf215546Sopenharmony_ci (('~fmulz', '#a', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmulz', a, c), b)), 1631bf215546Sopenharmony_ci (('~fmul', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmul', a, c), b)), 1632bf215546Sopenharmony_ci (('~ffma', '#a', ('fmul', 'b(is_not_const)', '#c'), d), ('ffma', ('fmul', a, c), b, d)), 1633bf215546Sopenharmony_ci (('~ffmaz', '#a', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmulz', a, c), b, d)), 1634bf215546Sopenharmony_ci (('~ffmaz', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmul', a, c), b, d)), 1635bf215546Sopenharmony_ci (('imul', '#a', ('imul', 'b(is_not_const)', '#c')), ('imul', ('imul', a, c), b)), 1636bf215546Sopenharmony_ci (('~fadd', '#a', ('fadd', 'b(is_not_const)', '#c')), ('fadd', ('fadd', a, c), b)), 1637bf215546Sopenharmony_ci (('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))), 1638bf215546Sopenharmony_ci (('~fadd', '#a', ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d')), ('ffma', b, c, ('fadd', a, d))), 1639bf215546Sopenharmony_ci (('~fadd', '#a', ('fneg', ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffma', ('fneg', b), c, ('fadd', a, ('fneg', d)))), 1640bf215546Sopenharmony_ci (('~fadd', '#a', ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d')), ('ffmaz', b, c, ('fadd', a, d))), 1641bf215546Sopenharmony_ci (('~fadd', '#a', ('fneg', ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffmaz', ('fneg', b), c, ('fadd', a, ('fneg', d)))), 1642bf215546Sopenharmony_ci (('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)), 1643bf215546Sopenharmony_ci (('iand', '#a', ('iand', 'b(is_not_const)', '#c')), ('iand', ('iand', a, c), b)), 1644bf215546Sopenharmony_ci (('ior', '#a', ('ior', 'b(is_not_const)', '#c')), 
('ior', ('ior', a, c), b)), 1645bf215546Sopenharmony_ci (('ixor', '#a', ('ixor', 'b(is_not_const)', '#c')), ('ixor', ('ixor', a, c), b)), 1646bf215546Sopenharmony_ci 1647bf215546Sopenharmony_ci # Reassociate add chains for more MAD/FMA-friendly code 1648bf215546Sopenharmony_ci (('~fadd', ('fadd(is_used_once)', 'a(is_fmul)', 'b(is_fmul)'), 'c(is_not_fmul)'), ('fadd', ('fadd', a, c), b)), 1649bf215546Sopenharmony_ci 1650bf215546Sopenharmony_ci # Drop mul-div by the same value when there's no wrapping. 1651bf215546Sopenharmony_ci (('idiv', ('imul(no_signed_wrap)', a, b), b), a), 1652bf215546Sopenharmony_ci 1653bf215546Sopenharmony_ci # By definition... 1654bf215546Sopenharmony_ci (('bcsel', ('ige', ('find_lsb', a), 0), ('find_lsb', a), -1), ('find_lsb', a)), 1655bf215546Sopenharmony_ci (('bcsel', ('ige', ('ifind_msb', a), 0), ('ifind_msb', a), -1), ('ifind_msb', a)), 1656bf215546Sopenharmony_ci (('bcsel', ('ige', ('ufind_msb', a), 0), ('ufind_msb', a), -1), ('ufind_msb', a)), 1657bf215546Sopenharmony_ci 1658bf215546Sopenharmony_ci (('bcsel', ('ine', a, 0), ('find_lsb', a), -1), ('find_lsb', a)), 1659bf215546Sopenharmony_ci (('bcsel', ('ine', a, 0), ('ifind_msb', a), -1), ('ifind_msb', a)), 1660bf215546Sopenharmony_ci (('bcsel', ('ine', a, 0), ('ufind_msb', a), -1), ('ufind_msb', a)), 1661bf215546Sopenharmony_ci 1662bf215546Sopenharmony_ci (('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)), 1663bf215546Sopenharmony_ci 1664bf215546Sopenharmony_ci (('~fmul', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)), 1665bf215546Sopenharmony_ci (('~fmul', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))), 1666bf215546Sopenharmony_ci (('~fmulz', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)), 1667bf215546Sopenharmony_ci (('~fmulz', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))), 1668bf215546Sopenharmony_ci (('~bcsel', ('flt', a, 0.0), ('fneg', a), a), ('fabs', a)), 
1669bf215546Sopenharmony_ci 1670bf215546Sopenharmony_ci (('bcsel', a, ('bcsel', b, c, d), d), ('bcsel', ('iand', a, b), c, d)), 1671bf215546Sopenharmony_ci (('bcsel', a, b, ('bcsel', c, b, d)), ('bcsel', ('ior', a, c), b, d)), 1672bf215546Sopenharmony_ci 1673bf215546Sopenharmony_ci # Misc. lowering 1674bf215546Sopenharmony_ci (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'), 1675bf215546Sopenharmony_ci (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'), 1676bf215546Sopenharmony_ci (('uadd_carry', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'), 1677bf215546Sopenharmony_ci (('usub_borrow', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'), 1678bf215546Sopenharmony_ci 1679bf215546Sopenharmony_ci (('bitfield_insert', 'base', 'insert', 'offset', 'bits'), 1680bf215546Sopenharmony_ci ('bcsel', ('ult', 31, 'bits'), 'insert', 1681bf215546Sopenharmony_ci ('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')), 1682bf215546Sopenharmony_ci 'options->lower_bitfield_insert'), 1683bf215546Sopenharmony_ci (('ihadd', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'), 1684bf215546Sopenharmony_ci (('uhadd', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'), 1685bf215546Sopenharmony_ci (('irhadd', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'), 1686bf215546Sopenharmony_ci (('urhadd', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'), 1687bf215546Sopenharmony_ci (('ihadd@64', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 1688bf215546Sopenharmony_ci (('uhadd@64', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 
1689bf215546Sopenharmony_ci (('irhadd@64', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 1690bf215546Sopenharmony_ci (('urhadd@64', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 1691bf215546Sopenharmony_ci 1692bf215546Sopenharmony_ci (('imul_32x16', a, b), ('imul', a, ('extract_i16', b, 0)), 'options->lower_mul_32x16'), 1693bf215546Sopenharmony_ci (('umul_32x16', a, b), ('imul', a, ('extract_u16', b, 0)), 'options->lower_mul_32x16'), 1694bf215546Sopenharmony_ci 1695bf215546Sopenharmony_ci (('uadd_sat@64', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_uadd_sat || (options->lower_int64_options & nir_lower_iadd64) != 0'), 1696bf215546Sopenharmony_ci (('uadd_sat', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_uadd_sat'), 1697bf215546Sopenharmony_ci (('usub_sat', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_usub_sat'), 1698bf215546Sopenharmony_ci (('usub_sat@64', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), '(options->lower_int64_options & nir_lower_usub_sat64) != 0'), 1699bf215546Sopenharmony_ci 1700bf215546Sopenharmony_ci # int64_t sum = a + b; 1701bf215546Sopenharmony_ci # 1702bf215546Sopenharmony_ci # if (a < 0 && b < 0 && a < sum) 1703bf215546Sopenharmony_ci # sum = INT64_MIN; 1704bf215546Sopenharmony_ci # } else if (a >= 0 && b >= 0 && sum < a) 1705bf215546Sopenharmony_ci # sum = INT64_MAX; 1706bf215546Sopenharmony_ci # } 1707bf215546Sopenharmony_ci # 1708bf215546Sopenharmony_ci # A couple optimizations are applied. 1709bf215546Sopenharmony_ci # 1710bf215546Sopenharmony_ci # 1. a < sum => sum >= 0. 
This replacement works because it is known that 1711bf215546Sopenharmony_ci # a < 0 and b < 0, so sum should also be < 0 unless there was 1712bf215546Sopenharmony_ci # underflow. 1713bf215546Sopenharmony_ci # 1714bf215546Sopenharmony_ci # 2. sum < a => sum < 0. This replacement works because it is known that 1715bf215546Sopenharmony_ci # a >= 0 and b >= 0, so sum should also be >= 0 unless there was 1716bf215546Sopenharmony_ci # overflow. 1717bf215546Sopenharmony_ci # 1718bf215546Sopenharmony_ci # 3. Invert the second if-condition and swap the order of parameters for 1719bf215546Sopenharmony_ci # the bcsel. !(a >= 0 && b >= 0 && sum < 0) becomes !(a >= 0) || !(b >= 1720bf215546Sopenharmony_ci # 0) || !(sum < 0), and that becomes (a < 0) || (b < 0) || (sum >= 0) 1721bf215546Sopenharmony_ci # 1722bf215546Sopenharmony_ci # On Intel Gen11, this saves ~11 instructions. 1723bf215546Sopenharmony_ci (('iadd_sat@64', a, b), ('bcsel', 1724bf215546Sopenharmony_ci ('iand', ('iand', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)), 1725bf215546Sopenharmony_ci 0x8000000000000000, 1726bf215546Sopenharmony_ci ('bcsel', 1727bf215546Sopenharmony_ci ('ior', ('ior', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)), 1728bf215546Sopenharmony_ci ('iadd', a, b), 1729bf215546Sopenharmony_ci 0x7fffffffffffffff)), 1730bf215546Sopenharmony_ci '(options->lower_int64_options & nir_lower_iadd_sat64) != 0'), 1731bf215546Sopenharmony_ci 1732bf215546Sopenharmony_ci # int64_t sum = a - b; 1733bf215546Sopenharmony_ci # 1734bf215546Sopenharmony_ci # if (a < 0 && b >= 0 && a < sum) 1735bf215546Sopenharmony_ci # sum = INT64_MIN; 1736bf215546Sopenharmony_ci # } else if (a >= 0 && b < 0 && a >= sum) 1737bf215546Sopenharmony_ci # sum = INT64_MAX; 1738bf215546Sopenharmony_ci # } 1739bf215546Sopenharmony_ci # 1740bf215546Sopenharmony_ci # Optimizations similar to the iadd_sat case are applied here. 
1741bf215546Sopenharmony_ci (('isub_sat@64', a, b), ('bcsel', 1742bf215546Sopenharmony_ci ('iand', ('iand', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)), 1743bf215546Sopenharmony_ci 0x8000000000000000, 1744bf215546Sopenharmony_ci ('bcsel', 1745bf215546Sopenharmony_ci ('ior', ('ior', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)), 1746bf215546Sopenharmony_ci ('isub', a, b), 1747bf215546Sopenharmony_ci 0x7fffffffffffffff)), 1748bf215546Sopenharmony_ci '(options->lower_int64_options & nir_lower_iadd_sat64) != 0'), 1749bf215546Sopenharmony_ci 1750bf215546Sopenharmony_ci # These are done here instead of in the backend because the int64 lowering 1751bf215546Sopenharmony_ci # pass will make a mess of the patterns. The first patterns are 1752bf215546Sopenharmony_ci # conditioned on nir_lower_minmax64 because it was not clear that it was 1753bf215546Sopenharmony_ci # always an improvement on platforms that have real int64 support. No 1754bf215546Sopenharmony_ci # shaders in shader-db hit this, so it was hard to say one way or the 1755bf215546Sopenharmony_ci # other. 
1756bf215546Sopenharmony_ci (('ilt', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 1757bf215546Sopenharmony_ci (('ilt', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 1758bf215546Sopenharmony_ci (('ige', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 1759bf215546Sopenharmony_ci (('ige', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 1760bf215546Sopenharmony_ci (('ilt', 'a@64', 0), ('ilt', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 1761bf215546Sopenharmony_ci (('ige', 'a@64', 0), ('ige', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 1762bf215546Sopenharmony_ci 1763bf215546Sopenharmony_ci (('ine', 'a@64', 0), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 1764bf215546Sopenharmony_ci (('ieq', 'a@64', 0), ('ieq', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 1765bf215546Sopenharmony_ci # 0u < uint(a) <=> uint(a) != 0u 1766bf215546Sopenharmony_ci (('ult', 0, 'a@64'), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 1767bf215546Sopenharmony_ci 1768bf215546Sopenharmony_ci # Alternative lowering that doesn't rely on bfi. 
1769bf215546Sopenharmony_ci (('bitfield_insert', 'base', 'insert', 'offset', 'bits'), 1770bf215546Sopenharmony_ci ('bcsel', ('ult', 31, 'bits'), 1771bf215546Sopenharmony_ci 'insert', 1772bf215546Sopenharmony_ci (('ior', 1773bf215546Sopenharmony_ci ('iand', 'base', ('inot', ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))), 1774bf215546Sopenharmony_ci ('iand', ('ishl', 'insert', 'offset'), ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))))), 1775bf215546Sopenharmony_ci 'options->lower_bitfield_insert_to_shifts'), 1776bf215546Sopenharmony_ci 1777bf215546Sopenharmony_ci # Alternative lowering that uses bitfield_select. 1778bf215546Sopenharmony_ci (('bitfield_insert', 'base', 'insert', 'offset', 'bits'), 1779bf215546Sopenharmony_ci ('bcsel', ('ult', 31, 'bits'), 'insert', 1780bf215546Sopenharmony_ci ('bitfield_select', ('bfm', 'bits', 'offset'), ('ishl', 'insert', 'offset'), 'base')), 1781bf215546Sopenharmony_ci 'options->lower_bitfield_insert_to_bitfield_select'), 1782bf215546Sopenharmony_ci 1783bf215546Sopenharmony_ci (('ibitfield_extract', 'value', 'offset', 'bits'), 1784bf215546Sopenharmony_ci ('bcsel', ('ult', 31, 'bits'), 'value', 1785bf215546Sopenharmony_ci ('ibfe', 'value', 'offset', 'bits')), 1786bf215546Sopenharmony_ci 'options->lower_bitfield_extract'), 1787bf215546Sopenharmony_ci 1788bf215546Sopenharmony_ci (('ubitfield_extract', 'value', 'offset', 'bits'), 1789bf215546Sopenharmony_ci ('bcsel', ('ult', 31, 'bits'), 'value', 1790bf215546Sopenharmony_ci ('ubfe', 'value', 'offset', 'bits')), 1791bf215546Sopenharmony_ci 'options->lower_bitfield_extract'), 1792bf215546Sopenharmony_ci 1793bf215546Sopenharmony_ci # (src0 & src1) | (~src0 & src2). Constant fold if src2 is 0. 
1794bf215546Sopenharmony_ci (('bitfield_select', a, b, 0), ('iand', a, b)), 1795bf215546Sopenharmony_ci (('bitfield_select', a, ('iand', a, b), c), ('bitfield_select', a, b, c)), 1796bf215546Sopenharmony_ci 1797bf215546Sopenharmony_ci # Note that these opcodes are defined to only use the five least significant bits of 'offset' and 'bits' 1798bf215546Sopenharmony_ci (('ubfe', 'value', 'offset', ('iand', 31, 'bits')), ('ubfe', 'value', 'offset', 'bits')), 1799bf215546Sopenharmony_ci (('ubfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ubfe', 'value', 'offset', 'bits')), 1800bf215546Sopenharmony_ci (('ibfe', 'value', 'offset', ('iand', 31, 'bits')), ('ibfe', 'value', 'offset', 'bits')), 1801bf215546Sopenharmony_ci (('ibfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ibfe', 'value', 'offset', 'bits')), 1802bf215546Sopenharmony_ci (('bfm', 'bits', ('iand', 31, 'offset')), ('bfm', 'bits', 'offset')), 1803bf215546Sopenharmony_ci (('bfm', ('iand', 31, 'bits'), 'offset'), ('bfm', 'bits', 'offset')), 1804bf215546Sopenharmony_ci 1805bf215546Sopenharmony_ci # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says: 1806bf215546Sopenharmony_ci # 1807bf215546Sopenharmony_ci # If bits is zero, the result will be zero. 1808bf215546Sopenharmony_ci # 1809bf215546Sopenharmony_ci # These patterns prevent other patterns from generating invalid results 1810bf215546Sopenharmony_ci # when count is zero. 
1811bf215546Sopenharmony_ci (('ubfe', a, b, 0), 0), 1812bf215546Sopenharmony_ci (('ibfe', a, b, 0), 0), 1813bf215546Sopenharmony_ci 1814bf215546Sopenharmony_ci (('ubfe', a, 0, '#b'), ('iand', a, ('ushr', 0xffffffff, ('ineg', b)))), 1815bf215546Sopenharmony_ci 1816bf215546Sopenharmony_ci (('b2i32', ('i2b', ('ubfe', a, b, 1))), ('ubfe', a, b, 1)), 1817bf215546Sopenharmony_ci (('b2i32', ('i2b', ('ibfe', a, b, 1))), ('ubfe', a, b, 1)), # ubfe in the replacement is correct 1818bf215546Sopenharmony_ci (('ine', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 1819bf215546Sopenharmony_ci (('ieq', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 1820bf215546Sopenharmony_ci (('ine', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 1821bf215546Sopenharmony_ci (('ieq', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 1822bf215546Sopenharmony_ci 1823bf215546Sopenharmony_ci (('ibitfield_extract', 'value', 'offset', 'bits'), 1824bf215546Sopenharmony_ci ('bcsel', ('ieq', 0, 'bits'), 1825bf215546Sopenharmony_ci 0, 1826bf215546Sopenharmony_ci ('ishr', 1827bf215546Sopenharmony_ci ('ishl', 'value', ('isub', ('isub', 32, 'bits'), 'offset')), 1828bf215546Sopenharmony_ci ('isub', 32, 'bits'))), 1829bf215546Sopenharmony_ci 'options->lower_bitfield_extract_to_shifts'), 1830bf215546Sopenharmony_ci 1831bf215546Sopenharmony_ci (('ubitfield_extract', 'value', 'offset', 'bits'), 1832bf215546Sopenharmony_ci ('iand', 1833bf215546Sopenharmony_ci ('ushr', 'value', 'offset'), 1834bf215546Sopenharmony_ci ('bcsel', ('ieq', 'bits', 32), 1835bf215546Sopenharmony_ci 0xffffffff, 1836bf215546Sopenharmony_ci ('isub', ('ishl', 1, 'bits'), 1))), 1837bf215546Sopenharmony_ci 'options->lower_bitfield_extract_to_shifts'), 
1838bf215546Sopenharmony_ci 1839bf215546Sopenharmony_ci (('ifind_msb', 'value'), 1840bf215546Sopenharmony_ci ('ufind_msb', ('bcsel', ('ilt', 'value', 0), ('inot', 'value'), 'value')), 1841bf215546Sopenharmony_ci 'options->lower_ifind_msb'), 1842bf215546Sopenharmony_ci 1843bf215546Sopenharmony_ci (('ifind_msb', 'value'), 1844bf215546Sopenharmony_ci ('bcsel', ('ige', ('ifind_msb_rev', 'value'), 0), 1845bf215546Sopenharmony_ci ('isub', 31, ('ifind_msb_rev', 'value')), 1846bf215546Sopenharmony_ci ('ifind_msb_rev', 'value')), 1847bf215546Sopenharmony_ci 'options->lower_find_msb_to_reverse'), 1848bf215546Sopenharmony_ci 1849bf215546Sopenharmony_ci (('ufind_msb', 'value'), 1850bf215546Sopenharmony_ci ('bcsel', ('ige', ('ufind_msb_rev', 'value'), 0), 1851bf215546Sopenharmony_ci ('isub', 31, ('ufind_msb_rev', 'value')), 1852bf215546Sopenharmony_ci ('ufind_msb_rev', 'value')), 1853bf215546Sopenharmony_ci 'options->lower_find_msb_to_reverse'), 1854bf215546Sopenharmony_ci 1855bf215546Sopenharmony_ci (('find_lsb', 'value'), 1856bf215546Sopenharmony_ci ('ufind_msb', ('iand', 'value', ('ineg', 'value'))), 1857bf215546Sopenharmony_ci 'options->lower_find_lsb'), 1858bf215546Sopenharmony_ci 1859bf215546Sopenharmony_ci (('extract_i8', a, 'b@32'), 1860bf215546Sopenharmony_ci ('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 24), 1861bf215546Sopenharmony_ci 'options->lower_extract_byte'), 1862bf215546Sopenharmony_ci 1863bf215546Sopenharmony_ci (('extract_u8', a, 'b@32'), 1864bf215546Sopenharmony_ci ('iand', ('ushr', a, ('imul', b, 8)), 0xff), 1865bf215546Sopenharmony_ci 'options->lower_extract_byte'), 1866bf215546Sopenharmony_ci 1867bf215546Sopenharmony_ci (('extract_i16', a, 'b@32'), 1868bf215546Sopenharmony_ci ('ishr', ('ishl', a, ('imul', ('isub', 1, b), 16)), 16), 1869bf215546Sopenharmony_ci 'options->lower_extract_word'), 1870bf215546Sopenharmony_ci 1871bf215546Sopenharmony_ci (('extract_u16', a, 'b@32'), 1872bf215546Sopenharmony_ci ('iand', ('ushr', a, ('imul', b, 16)), 0xffff), 
1873bf215546Sopenharmony_ci 'options->lower_extract_word'), 1874bf215546Sopenharmony_ci 1875bf215546Sopenharmony_ci (('pack_unorm_2x16', 'v'), 1876bf215546Sopenharmony_ci ('pack_uvec2_to_uint', 1877bf215546Sopenharmony_ci ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 65535.0)))), 1878bf215546Sopenharmony_ci 'options->lower_pack_unorm_2x16'), 1879bf215546Sopenharmony_ci 1880bf215546Sopenharmony_ci (('pack_unorm_4x8', 'v'), 1881bf215546Sopenharmony_ci ('pack_uvec4_to_uint', 1882bf215546Sopenharmony_ci ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))), 1883bf215546Sopenharmony_ci 'options->lower_pack_unorm_4x8'), 1884bf215546Sopenharmony_ci 1885bf215546Sopenharmony_ci (('pack_snorm_2x16', 'v'), 1886bf215546Sopenharmony_ci ('pack_uvec2_to_uint', 1887bf215546Sopenharmony_ci ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 32767.0)))), 1888bf215546Sopenharmony_ci 'options->lower_pack_snorm_2x16'), 1889bf215546Sopenharmony_ci 1890bf215546Sopenharmony_ci (('pack_snorm_4x8', 'v'), 1891bf215546Sopenharmony_ci ('pack_uvec4_to_uint', 1892bf215546Sopenharmony_ci ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))), 1893bf215546Sopenharmony_ci 'options->lower_pack_snorm_4x8'), 1894bf215546Sopenharmony_ci 1895bf215546Sopenharmony_ci (('unpack_unorm_2x16', 'v'), 1896bf215546Sopenharmony_ci ('fdiv', ('u2f32', ('vec2', ('extract_u16', 'v', 0), 1897bf215546Sopenharmony_ci ('extract_u16', 'v', 1))), 1898bf215546Sopenharmony_ci 65535.0), 1899bf215546Sopenharmony_ci 'options->lower_unpack_unorm_2x16'), 1900bf215546Sopenharmony_ci 1901bf215546Sopenharmony_ci (('unpack_unorm_4x8', 'v'), 1902bf215546Sopenharmony_ci ('fdiv', ('u2f32', ('vec4', ('extract_u8', 'v', 0), 1903bf215546Sopenharmony_ci ('extract_u8', 'v', 1), 1904bf215546Sopenharmony_ci ('extract_u8', 'v', 2), 1905bf215546Sopenharmony_ci ('extract_u8', 'v', 3))), 1906bf215546Sopenharmony_ci 255.0), 1907bf215546Sopenharmony_ci 'options->lower_unpack_unorm_4x8'), 
1908bf215546Sopenharmony_ci 1909bf215546Sopenharmony_ci (('unpack_snorm_2x16', 'v'), 1910bf215546Sopenharmony_ci ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec2', ('extract_i16', 'v', 0), 1911bf215546Sopenharmony_ci ('extract_i16', 'v', 1))), 1912bf215546Sopenharmony_ci 32767.0))), 1913bf215546Sopenharmony_ci 'options->lower_unpack_snorm_2x16'), 1914bf215546Sopenharmony_ci 1915bf215546Sopenharmony_ci (('unpack_snorm_4x8', 'v'), 1916bf215546Sopenharmony_ci ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_i8', 'v', 0), 1917bf215546Sopenharmony_ci ('extract_i8', 'v', 1), 1918bf215546Sopenharmony_ci ('extract_i8', 'v', 2), 1919bf215546Sopenharmony_ci ('extract_i8', 'v', 3))), 1920bf215546Sopenharmony_ci 127.0))), 1921bf215546Sopenharmony_ci 'options->lower_unpack_snorm_4x8'), 1922bf215546Sopenharmony_ci 1923bf215546Sopenharmony_ci (('pack_half_2x16_split', 'a@32', 'b@32'), 1924bf215546Sopenharmony_ci ('ior', ('ishl', ('u2u32', ('f2f16', b)), 16), ('u2u32', ('f2f16', a))), 1925bf215546Sopenharmony_ci 'options->lower_pack_split'), 1926bf215546Sopenharmony_ci 1927bf215546Sopenharmony_ci (('unpack_half_2x16_split_x', 'a@32'), 1928bf215546Sopenharmony_ci ('f2f32', ('u2u16', a)), 1929bf215546Sopenharmony_ci 'options->lower_pack_split'), 1930bf215546Sopenharmony_ci 1931bf215546Sopenharmony_ci (('unpack_half_2x16_split_y', 'a@32'), 1932bf215546Sopenharmony_ci ('f2f32', ('u2u16', ('ushr', a, 16))), 1933bf215546Sopenharmony_ci 'options->lower_pack_split'), 1934bf215546Sopenharmony_ci 1935bf215546Sopenharmony_ci (('pack_32_2x16_split', 'a@16', 'b@16'), 1936bf215546Sopenharmony_ci ('ior', ('ishl', ('u2u32', b), 16), ('u2u32', a)), 1937bf215546Sopenharmony_ci 'options->lower_pack_split'), 1938bf215546Sopenharmony_ci 1939bf215546Sopenharmony_ci (('unpack_32_2x16_split_x', 'a@32'), 1940bf215546Sopenharmony_ci ('u2u16', a), 1941bf215546Sopenharmony_ci 'options->lower_pack_split'), 1942bf215546Sopenharmony_ci 1943bf215546Sopenharmony_ci (('unpack_32_2x16_split_y', 
'a@32'), 1944bf215546Sopenharmony_ci ('u2u16', ('ushr', 'a', 16)), 1945bf215546Sopenharmony_ci 'options->lower_pack_split'), 1946bf215546Sopenharmony_ci 1947bf215546Sopenharmony_ci (('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'), 1948bf215546Sopenharmony_ci (('imin', ('imax', a, -1), 1), ('isign', a), '!options->lower_isign'), 1949bf215546Sopenharmony_ci (('imax', ('imin', a, 1), -1), ('isign', a), '!options->lower_isign'), 1950bf215546Sopenharmony_ci # float(0 < NaN) - float(NaN < 0) = float(False) - float(False) = 0 - 0 = 0 1951bf215546Sopenharmony_ci # Mark the new comparisons precise to prevent them being changed to 'a != 1952bf215546Sopenharmony_ci # 0' or 'a == 0'. 1953bf215546Sopenharmony_ci (('fsign', a), ('fsub', ('b2f', ('!flt', 0.0, a)), ('b2f', ('!flt', a, 0.0))), 'options->lower_fsign'), 1954bf215546Sopenharmony_ci 1955bf215546Sopenharmony_ci # Address/offset calculations: 1956bf215546Sopenharmony_ci # Drivers supporting imul24 should use the nir_lower_amul() pass, this 1957bf215546Sopenharmony_ci # rule converts everyone else to imul: 1958bf215546Sopenharmony_ci (('amul', a, b), ('imul', a, b), '!options->has_imul24'), 1959bf215546Sopenharmony_ci 1960bf215546Sopenharmony_ci (('umul24', a, b), 1961bf215546Sopenharmony_ci ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)), 1962bf215546Sopenharmony_ci '!options->has_umul24'), 1963bf215546Sopenharmony_ci (('umad24', a, b, c), 1964bf215546Sopenharmony_ci ('iadd', ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)), c), 1965bf215546Sopenharmony_ci '!options->has_umad24'), 1966bf215546Sopenharmony_ci 1967bf215546Sopenharmony_ci # Relaxed 24bit ops 1968bf215546Sopenharmony_ci (('imul24_relaxed', a, b), ('imul24', a, b), 'options->has_imul24'), 1969bf215546Sopenharmony_ci (('imul24_relaxed', a, b), ('imul', a, b), '!options->has_imul24'), 1970bf215546Sopenharmony_ci (('umad24_relaxed', a, b, c), ('umad24', a, b, c), 'options->has_umad24'), 1971bf215546Sopenharmony_ci 
   (('umad24_relaxed', a, b, c), ('iadd', ('umul24_relaxed', a, b), c), '!options->has_umad24'),
   (('umul24_relaxed', a, b), ('umul24', a, b), 'options->has_umul24'),
   (('umul24_relaxed', a, b), ('imul', a, b), '!options->has_umul24'),

   # Trivial imad24_ir3 cases: a zero addend leaves only the multiply, a zero
   # multiplier leaves only the addend, and a multiplier of one is an add.
   (('imad24_ir3', a, b, 0), ('imul24', a, b)),
   (('imad24_ir3', a, 0, c), (c)),
   (('imad24_ir3', a, 1, c), ('iadd', a, c)),

   # if first two srcs are const, crack apart the imad so constant folding
   # can clean up the imul:
   # TODO ffma should probably get a similar rule:
   (('imad24_ir3', '#a', '#b', c), ('iadd', ('imul', a, b), c)),

   # These will turn 24b address/offset calc back into 32b shifts, but
   # it should be safe to get back some of the bits of precision that we
   # already decided were not necessary:
   (('imul24', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
   (('imul24', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
   (('imul24', a, 0), (0)),

   # Fold a comparison against zero into the fused compare-and-select
   # opcodes.  All of the rules below are gated on
   # options->has_fused_comp_and_csel.  "a < 0" is expressed either by
   # negating the float operand or, for integers, by swapping the two
   # selected values.
   (('fcsel', ('slt', 0, a), b, c), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"),
   (('fcsel', ('slt', a, 0), b, c), ('fcsel_gt', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
   (('fcsel', ('sge', a, 0), b, c), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"),
   (('fcsel', ('sge', 0, a), b, c), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),

   (('bcsel', ('ilt', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, b, c), "options->has_fused_comp_and_csel"),
   (('bcsel', ('ilt', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, c, b), "options->has_fused_comp_and_csel"),
   (('bcsel', ('ige', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, b, c), "options->has_fused_comp_and_csel"),
   (('bcsel', ('ige', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, c, b), "options->has_fused_comp_and_csel"),

   (('bcsel', ('flt', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"),
   (('bcsel', ('flt', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_gt', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),
   (('bcsel', ('fge', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"),
   (('bcsel', ('fge', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"),

])

# bit_size dependent lowerings
#
# Saturating signed add/sub are lowered, per bit size, to the wrapping
# operation plus a wrap check.  The sign of 'b' tells which direction the
# result could have wrapped in, so only one comparison against 'a' is needed
# on each side of the outer bcsel.  Both rules are gated on
# options->lower_iadd_sat.
for bit_size in [8, 16, 32, 64]:
    # convenience constants
    # intmax: all bits below the sign bit set (largest signed value).
    # intmin: only the sign bit set (bit pattern of the most negative value).
    intmax = (1 << (bit_size - 1)) - 1
    intmin = 1 << (bit_size - 1)

    optimizations += [
        # b >= 1: the sum can only wrap upwards, detected by sum < a.
        # b <= 0: the sum can only wrap downwards, detected by a < sum.
        (('iadd_sat@' + str(bit_size), a, b),
         ('bcsel', ('ige', b, 1), ('bcsel', ('ilt', ('iadd', a, b), a), intmax, ('iadd', a, b)),
                                  ('bcsel', ('ilt', a, ('iadd', a, b)), intmin, ('iadd', a, b))), 'options->lower_iadd_sat'),
        # b < 0: the difference can only wrap upwards, detected by diff < a.
        # b >= 0: the difference can only wrap downwards, detected by a < diff.
        (('isub_sat@' + str(bit_size), a, b),
         ('bcsel', ('ilt', b, 0), ('bcsel', ('ilt', ('isub', a, b), a), intmax, ('isub', a, b)),
                                  ('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_iadd_sat'),
    ]

# Maps each float equality comparison to the comparison that computes its
# logical negation.
invert = OrderedDict([('feq', 'fneu'), ('fneu', 'feq')])

# De Morgan: push an inot through an ior/iand of two float (in)equality
# comparisons by inverting each comparison instead.
# combinations_with_replacement() yields each unordered pair of comparison
# opcodes once, so (feq, fneu) is generated but (fneu, feq) is not.
for left, right in itertools.combinations_with_replacement(invert.keys(), 2):
    optimizations.append((('inot', ('ior(is_used_once)', (left, a, b), (right, c, d))),
                          ('iand', (invert[left], a, b), (invert[right], c, d))))
    optimizations.append((('inot', ('iand(is_used_once)', (left, a, b), (right, c, d))),
                          ('ior', (invert[left], a, b), (invert[right], c, d))))

# Optimize x2bN(b2x(x)) -> x
# One pair of rules per size returned by type_sizes('bool'); the inner
# boolean is constrained to that size (a@N) so the round trip is a no-op.
for size in type_sizes('bool'):
    aN = 'a@' + str(size)
    f2bN = 'f2b' + str(size)
    i2bN = 'i2b' + str(size)
    optimizations.append(((f2bN, ('b2f', aN)), a))
    optimizations.append(((i2bN, ('b2i', aN)), a))

# Optimize x2yN(b2x(x)) -> b2y
for x, y in itertools.product(['f', 'u', 'i'], ['f', 'u', 'i']):
    # Skip the mixed signed/unsigned integer pairs ('u' -> 'i', 'i' -> 'u');
    # only pairs involving a float, or with x == y, get a rule.
    if x != 'f' and y != 'f' and x != y:
        continue

    # Booleans convert to either float (b2f) or integer (b2i); both integer
    # signednesses share b2i.
    b2x = 'b2f' if x == 'f' else 'b2i'
    b2y = 'b2f' if y == 'f' else 'b2i'
    # Despite the variable name, the opcode is built without a bit-size suffix.
    x2yN = '{}2{}'.format(x, y)
    optimizations.append(((x2yN, (b2x, a)), (b2y, a)))

# Optimize away x2xN(a@N)
# A conversion to the same base type at the bit size the source already has
# is a no-op, so it is replaced by its source.
for t in ['int', 'uint', 'float', 'bool']:
    for N in type_sizes(t):
        # t[0] is the one-letter type prefix: 'i', 'u', 'f' or 'b'.
        x2xN = '{0}2{0}{1}'.format(t[0], N)
        aN = 'a@{0}'.format(N)
        optimizations.append(((x2xN, aN), a))

# Optimize x2xN(y2yM(a@P)) -> y2yN(a) for integers
# In particular, we can optimize away everything except upcast of downcast and
# upcasts where the type differs from the other cast
for N, M in itertools.product(type_sizes('uint'), type_sizes('uint')):
    if N < M:
        # The outer cast is a down-cast.  It doesn't matter what the size of the
        # argument of the inner cast is because we'll never be in the upcast
        # of downcast case.  Regardless of types, we'll always end up with y2yN
        # in the end.
        for x, y in itertools.product(['i', 'u'], ['i', 'u']):
            x2xN = '{0}2{0}{1}'.format(x, N)
            y2yM = '{0}2{0}{1}'.format(y, M)
            y2yN = '{0}2{0}{1}'.format(y, N)
            optimizations.append(((x2xN, (y2yM, a)), (y2yN, a)))
    elif N > M:
        # If the outer cast is an up-cast, we have to be more careful about the
        # size of the argument of the inner cast and with types.  In this case,
        # the type is always the type of the up-cast, which is given by the
        # outer cast.
        for P in type_sizes('uint'):
            # We can't optimize away up-cast of down-cast.
            if M < P:
                continue

            # Here P <= M < N, so this is an up-cast of an up-cast (or of a
            # same-size cast).  The types always have to match between the two
            # casts: both must extend the value the same way for the
            # intermediate cast to be redundant.
            for x in ['i', 'u']:
                x2xN = '{0}2{0}{1}'.format(x, N)
                x2xM = '{0}2{0}{1}'.format(x, M)
                aP = 'a@{0}'.format(P)
                optimizations.append(((x2xN, (x2xM, aP)), (x2xN, a)))
    else:
        # The N == M case is handled by other optimizations
        pass

# Downcast operations should be able to see through pack
# A down-cast to 32 bits or fewer of pack_64_2x32_split(a, b) is rewritten as
# the same down-cast of the first source alone; the second source never
# contributes to the result.  One rule per (signedness, size) is enough.
for t in ['i', 'u']:
    for N in [8, 16, 32]:
        x2xN = '{0}2{0}{1}'.format(t, N)
        optimizations += [
            ((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)),
        ]

# Optimize comparisons with up-casts
for t in ['int', 'uint', 'float']:
    for N, M in itertools.product(type_sizes(t), repeat=2):
        if N == 1 or N >= M:
            continue

        cond = 'true'
        if N == 8:
            cond = 'options->support_8bit_alu'
        elif N == 16:
            cond = 'options->support_16bit_alu'
        x2xM = '{0}2{0}{1}'.format(t[0], M)
        x2xN = '{0}2{0}{1}'.format(t[0], N)
        aN = 'a@' + str(N)
2114bf215546Sopenharmony_ci bN = 'b@' + str(N) 2115bf215546Sopenharmony_ci xeq = 'feq' if t == 'float' else 'ieq' 2116bf215546Sopenharmony_ci xne = 'fneu' if t == 'float' else 'ine' 2117bf215546Sopenharmony_ci xge = '{0}ge'.format(t[0]) 2118bf215546Sopenharmony_ci xlt = '{0}lt'.format(t[0]) 2119bf215546Sopenharmony_ci 2120bf215546Sopenharmony_ci # Up-casts are lossless so for correctly signed comparisons of 2121bf215546Sopenharmony_ci # up-casted values we can do the comparison at the largest of the two 2122bf215546Sopenharmony_ci # original sizes and drop one or both of the casts. (We have 2123bf215546Sopenharmony_ci # optimizations to drop the no-op casts which this may generate.) 2124bf215546Sopenharmony_ci for P in type_sizes(t): 2125bf215546Sopenharmony_ci if P == 1 or P > N: 2126bf215546Sopenharmony_ci continue 2127bf215546Sopenharmony_ci 2128bf215546Sopenharmony_ci bP = 'b@' + str(P) 2129bf215546Sopenharmony_ci optimizations += [ 2130bf215546Sopenharmony_ci ((xeq, (x2xM, aN), (x2xM, bP)), (xeq, a, (x2xN, b)), cond), 2131bf215546Sopenharmony_ci ((xne, (x2xM, aN), (x2xM, bP)), (xne, a, (x2xN, b)), cond), 2132bf215546Sopenharmony_ci ((xge, (x2xM, aN), (x2xM, bP)), (xge, a, (x2xN, b)), cond), 2133bf215546Sopenharmony_ci ((xlt, (x2xM, aN), (x2xM, bP)), (xlt, a, (x2xN, b)), cond), 2134bf215546Sopenharmony_ci ((xge, (x2xM, bP), (x2xM, aN)), (xge, (x2xN, b), a), cond), 2135bf215546Sopenharmony_ci ((xlt, (x2xM, bP), (x2xM, aN)), (xlt, (x2xN, b), a), cond), 2136bf215546Sopenharmony_ci ] 2137bf215546Sopenharmony_ci 2138bf215546Sopenharmony_ci # The next bit doesn't work on floats because the range checks would 2139bf215546Sopenharmony_ci # get way too complicated. 
2140bf215546Sopenharmony_ci if t in ['int', 'uint']: 2141bf215546Sopenharmony_ci if t == 'int': 2142bf215546Sopenharmony_ci xN_min = -(1 << (N - 1)) 2143bf215546Sopenharmony_ci xN_max = (1 << (N - 1)) - 1 2144bf215546Sopenharmony_ci elif t == 'uint': 2145bf215546Sopenharmony_ci xN_min = 0 2146bf215546Sopenharmony_ci xN_max = (1 << N) - 1 2147bf215546Sopenharmony_ci else: 2148bf215546Sopenharmony_ci assert False 2149bf215546Sopenharmony_ci 2150bf215546Sopenharmony_ci # If we're up-casting and comparing to a constant, we can unfold 2151bf215546Sopenharmony_ci # the comparison into a comparison with the shrunk down constant 2152bf215546Sopenharmony_ci # and a check that the constant fits in the smaller bit size. 2153bf215546Sopenharmony_ci optimizations += [ 2154bf215546Sopenharmony_ci ((xeq, (x2xM, aN), '#b'), 2155bf215546Sopenharmony_ci ('iand', (xeq, a, (x2xN, b)), (xeq, (x2xM, (x2xN, b)), b)), cond), 2156bf215546Sopenharmony_ci ((xne, (x2xM, aN), '#b'), 2157bf215546Sopenharmony_ci ('ior', (xne, a, (x2xN, b)), (xne, (x2xM, (x2xN, b)), b)), cond), 2158bf215546Sopenharmony_ci ((xlt, (x2xM, aN), '#b'), 2159bf215546Sopenharmony_ci ('iand', (xlt, xN_min, b), 2160bf215546Sopenharmony_ci ('ior', (xlt, xN_max, b), (xlt, a, (x2xN, b)))), cond), 2161bf215546Sopenharmony_ci ((xlt, '#a', (x2xM, bN)), 2162bf215546Sopenharmony_ci ('iand', (xlt, a, xN_max), 2163bf215546Sopenharmony_ci ('ior', (xlt, a, xN_min), (xlt, (x2xN, a), b))), cond), 2164bf215546Sopenharmony_ci ((xge, (x2xM, aN), '#b'), 2165bf215546Sopenharmony_ci ('iand', (xge, xN_max, b), 2166bf215546Sopenharmony_ci ('ior', (xge, xN_min, b), (xge, a, (x2xN, b)))), cond), 2167bf215546Sopenharmony_ci ((xge, '#a', (x2xM, bN)), 2168bf215546Sopenharmony_ci ('iand', (xge, a, xN_min), 2169bf215546Sopenharmony_ci ('ior', (xge, a, xN_max), (xge, (x2xN, a), b))), cond), 2170bf215546Sopenharmony_ci ] 2171bf215546Sopenharmony_ci 2172bf215546Sopenharmony_ci# Convert masking followed by signed downcast to just unsigned downcast 
optimizations += [
    (('i2i32', ('iand', 'a@64', 0xffffffff)), ('u2u32', a)),
    (('i2i16', ('iand', 'a@32', 0xffff)), ('u2u16', a)),
    (('i2i16', ('iand', 'a@64', 0xffff)), ('u2u16', a)),
    (('i2i8', ('iand', 'a@16', 0xff)), ('u2u8', a)),
    (('i2i8', ('iand', 'a@32', 0xff)), ('u2u8', a)),
    (('i2i8', ('iand', 'a@64', 0xff)), ('u2u8', a)),
]

# Some operations such as iadd have the property that the bottom N bits of the
# output only depend on the bottom N bits of each of the inputs so we can
# remove casts
for N in [16, 32]:
    for M in [8, 16]:
        if M >= N:
            continue

        aN = 'a@' + str(N)
        u2uM = 'u2u{0}'.format(M)
        i2iM = 'i2i{0}'.format(M)

        for x in ['u', 'i']:
            x2xN = '{0}2{0}{1}'.format(x, N)
            extract_xM = 'extract_{0}{1}'.format(x, M)

            x2xN_M_bits = '{0}(only_lower_{1}_bits_used)'.format(x2xN, M)
            extract_xM_M_bits = \
                '{0}(only_lower_{1}_bits_used)'.format(extract_xM, M)
            optimizations += [
                ((x2xN_M_bits, (u2uM, aN)), a),
                ((extract_xM_M_bits, aN, 0), a),
            ]

            bcsel_M_bits = 'bcsel(only_lower_{0}_bits_used)'.format(M)
            optimizations += [
                ((bcsel_M_bits, c, (x2xN, (u2uM, aN)), b), ('bcsel', c, a, b)),
                ((bcsel_M_bits, c, (x2xN, (i2iM, aN)), b), ('bcsel', c, a, b)),
                ((bcsel_M_bits, c, (extract_xM, aN, 0), b), ('bcsel', c, a, b)),
            ]

            for op in ['iadd', 'imul', 'iand', 'ior', 'ixor']:
                op_M_bits = '{0}(only_lower_{1}_bits_used)'.format(op, M)
                optimizations += [
                    ((op_M_bits, (x2xN, (u2uM, aN)), b), (op, a, b)),
                    ((op_M_bits, (x2xN, (i2iM, aN)), b), (op, a, b)),
                    ((op_M_bits, (extract_xM, aN, 0), b), (op, a, b)),
                ]

def fexp2i(exp, bits):
    """Return an expression which constructs the value 2.0^exp or 0.0.

    exp is an integer expression and bits is the float bit size (16, 32 or
    64).  The result is an integer expression holding the float's bit
    pattern: the biased exponent is shifted into the exponent field and the
    mantissa is left as zero.
    """
    # We assume that exp is already in a valid range:
    #
    #   * [-15, 15] for 16-bit float
    #   * [-127, 127] for 32-bit float
    #   * [-1023, 1023] for 64-bit float
    #
    # If exp is the lowest value in the valid range, a value of 0.0 is
    # constructed.  Otherwise, the value 2.0^exp is constructed.
    if bits == 16:
        return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
    elif bits == 32:
        return ('ishl', ('iadd', exp, 127), 23)
    elif bits == 64:
        # The exponent field lives in the upper 32-bit half of a double, so
        # build that half and pack it with an all-zero lower half.
        return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20))
    else:
        assert False

def ldexp(f, exp, bits):
    """Return an expression computing ldexp(f, exp) using two multiplies.

    f is a float expression of the given bit size (16, 32 or 64) and exp is
    an integer expression.
    """
    # The maximum possible range for a normal exponent is [-126, 127] and,
    # throwing in denormals, you get a maximum range of [-149, 127].  This
    # means that we can potentially have a swing of +-276.  If you start with
    # FLT_MAX, you actually have to do ldexp(FLT_MAX, -278) to get it to flush
    # all the way to zero.  The GLSL spec only requires that we handle a subset
    # of this range.  From version 4.60 of the spec:
    #
    #    "If exp is greater than +128 (single-precision) or +1024
    #    (double-precision), the value returned is undefined. If exp is less
    #    than -126 (single-precision) or -1022 (double-precision), the value
    #    returned may be flushed to zero. Additionally, splitting the value
    #    into a significand and exponent using frexp() and then reconstructing
    #    a floating-point value using ldexp() should yield the original input
    #    for zero and all finite non-denormalized values."
    #
    # The SPIR-V spec has similar language.
    #
    # In order to handle the maximum value +128 using the fexp2i() helper
    # above, we have to split the exponent in half and do two multiply
    # operations.
    #
    # First, we clamp exp to a reasonable range.  Specifically, we clamp to
    # twice the full range that is valid for the fexp2i() function above.  If
    # exp/2 is the bottom value of that range, the fexp2i() expression will
    # yield 0.0f which, when multiplied by f, will flush it to zero which is
    # allowed by the GLSL and SPIR-V specs for low exponent values.  If the
    # value is clamped from above, then it must have been above the supported
    # range of the GLSL built-in and therefore any return value is acceptable.
    if bits == 16:
        exp = ('imin', ('imax', exp, -30), 30)
    elif bits == 32:
        exp = ('imin', ('imax', exp, -254), 254)
    elif bits == 64:
        exp = ('imin', ('imax', exp, -2046), 2046)
    else:
        assert False

    # Now we compute two powers of 2, one for exp/2 and one for exp-exp/2.
    # (We use ishr which isn't the same for -1, but the -1 case still works
    # since we use exp-exp/2 as the second exponent.)  While the spec
    # technically defines ldexp as f * 2.0^exp, simply multiplying once doesn't
    # work with denormals and doesn't allow for the full swing in exponents
    # that you can get with normalized values.  Instead, we create two powers
    # of two and multiply by them each in turn.  That way the effective range
    # of our exponent is doubled.
    pow2_1 = fexp2i(('ishr', exp, 1), bits)
    pow2_2 = fexp2i(('isub', exp, ('ishr', exp, 1)), bits)
    return ('fmul', ('fmul', f, pow2_1), pow2_2)

optimizations += [
    (('ldexp@16', 'x', 'exp'), ldexp('x', 'exp', 16), 'options->lower_ldexp'),
    (('ldexp@32', 'x', 'exp'), ldexp('x', 'exp', 32), 'options->lower_ldexp'),
    (('ldexp@64', 'x', 'exp'), ldexp('x', 'exp', 64), 'options->lower_ldexp'),
]

# Unreal Engine 4 demo applications open-code bitfieldReverse()
def bitfield_reverse_ue4(u):
    """Return the search expression for UE4's open-coded bit reversal of u.

    The pattern swaps the 16-bit halves first and then swaps progressively
    smaller groups (bytes, nibbles, bit pairs, single bits), masking before
    each shift.
    """
    step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16))
    step2 = ('ior', ('ishl', ('iand', step1, 0x00ff00ff), 8), ('ushr', ('iand', step1, 0xff00ff00), 8))
    step3 = ('ior', ('ishl', ('iand', step2, 0x0f0f0f0f), 4), ('ushr', ('iand', step2, 0xf0f0f0f0), 4))
    step4 = ('ior', ('ishl', ('iand', step3, 0x33333333), 2), ('ushr', ('iand', step3, 0xcccccccc), 2))
    step5 = ('ior(many-comm-expr)', ('ishl', ('iand', step4, 0x55555555), 1), ('ushr', ('iand', step4, 0xaaaaaaaa), 1))

    return step5

# Cyberpunk 2077 open-codes bitfieldReverse()
def bitfield_reverse_cp2077(u):
    """Return the search expression for CP2077's open-coded bit reversal of u.

    The pattern swaps the 16-bit halves first and then swaps progressively
    larger groups (single bits, bit pairs, nibbles, bytes), shifting before
    each mask.
    """
    step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16))
    step2 = ('ior', ('iand', ('ishl', step1, 1), 0xaaaaaaaa), ('iand', ('ushr', step1, 1), 0x55555555))
    step3 = ('ior', ('iand', ('ishl', step2, 2), 0xcccccccc), ('iand', ('ushr', step2, 2), 0x33333333))
    step4 = ('ior', ('iand', ('ishl', step3, 4), 0xf0f0f0f0), ('iand', ('ushr', step3, 4), 0x0f0f0f0f))
    step5 = ('ior(many-comm-expr)', ('iand', ('ishl', step4, 8), 0xff00ff00), ('iand', ('ushr', step4, 8), 0x00ff00ff))

    return step5

# Replace both open-coded forms with the dedicated opcode unless the backend
# asks for bitfield_reverse to be lowered.
optimizations += [(bitfield_reverse_ue4('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')]
optimizations += [(bitfield_reverse_cp2077('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')]

# "all_equal(eq(a, b), vec(~0))" is the same as "all_equal(a, b)"
# "any_nequal(neq(a, b), vec(0))" is the same as "any_nequal(a, b)"
for ncomp in [2, 3, 4, 8, 16]:
    optimizations += [
        (('ball_iequal' + str(ncomp), ('ieq', a, b), ~0), ('ball_iequal' + str(ncomp), a, b)),
        (('ball_iequal' + str(ncomp), ('feq', a, b), ~0), ('ball_fequal' + str(ncomp), a, b)),
        (('bany_inequal' + str(ncomp), ('ine', a, b), 0), ('bany_inequal' + str(ncomp), a, b)),
        (('bany_inequal' + str(ncomp), ('fneu', a, b), 0), ('bany_fnequal' + str(ncomp), a, b)),
    ]

# For any float comparison operation, "cmp", if you have "a == a && a cmp b"
# then the "a == a" is redundant because it's equivalent to "a is not NaN"
# and, if a is a NaN then the second comparison will fail anyway.
for op in ['flt', 'fge', 'feq']:
    optimizations += [
        (('iand', ('feq', a, a), (op, a, b)), ('!' + op, a, b)),
        (('iand', ('feq', a, a), (op, b, a)), ('!' + op, b, a)),
    ]

# Add optimizations to handle the case where the result of a ternary is
# compared to a constant.  This way we can take things like
#
# (a ? 0 : 1) > 0
#
# and turn it into
#
# a ? (0 > 0) : (1 > 0)
#
# which constant folding will eat for lunch.  The resulting ternary will
# further get cleaned up by the boolean reductions above and we will be
# left with just the original variable "a".
# Equality comparisons are commutative, so only one operand order is listed.
for op in ['feq', 'fneu', 'ieq', 'ine']:
    optimizations += [
        ((op, ('bcsel', 'a', '#b', '#c'), '#d'),
         ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))),
    ]

# Ordered comparisons are not commutative, so both operand orders are listed.
for op in ['flt', 'fge', 'ilt', 'ige', 'ult', 'uge']:
    optimizations += [
        ((op, ('bcsel', 'a', '#b', '#c'), '#d'),
         ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))),
        ((op, '#d', ('bcsel', a, '#b', '#c')),
         ('bcsel', 'a', (op, 'd', 'b'), (op, 'd', 'c'))),
    ]


# For example, this converts things like
#
#    1 + mix(0, a - 1, condition)
#
# into
#
#    mix(1, (a-1)+1, condition)
#
# Other optimizations will rearrange the constants.
for op in ['fadd', 'fmul', 'fmulz', 'iadd', 'imul']:
    optimizations += [
        ((op, ('bcsel(is_used_once)', a, '#b', c), '#d'), ('bcsel', a, (op, b, d), (op, c, d)))
    ]

# For derivatives in compute shaders, GLSL_NV_compute_shader_derivatives
# states:
#
#     If neither layout qualifier is specified, derivatives in compute shaders
#     return zero, which is consistent with the handling of built-in texture
#     functions like texture() in GLSL 4.50 compute shaders.
for op in ['fddx', 'fddx_fine', 'fddx_coarse',
           'fddy', 'fddy_fine', 'fddy_coarse']:
    optimizations += [
        ((op, 'a'), 0.0, 'info->stage == MESA_SHADER_COMPUTE && info->cs.derivative_group == DERIVATIVE_GROUP_NONE')
    ]

# Some optimizations for ir3-specific instructions.
optimizations += [
    # 'al * bl': If either 'al' or 'bl' is zero, return zero.
    (('umul_low', '#a(is_lower_half_zero)', 'b'), (0)),
    # '(ah * bl) << 16 + c': If either 'ah' or 'bl' is zero, return 'c'.
    # NOTE(review): the comment above talks about 'ah' and 'bl', but the
    # predicates below test the lower half of a and the upper half of b.
    # Confirm against the imadsh_mix16 opcode definition and the
    # is_*_half_zero search helpers.
    (('imadsh_mix16', '#a@32(is_lower_half_zero)', 'b@32', 'c@32'), ('c')),
    (('imadsh_mix16', 'a@32', '#b@32(is_upper_half_zero)', 'c@32'), ('c')),
]

# These kinds of sequences can occur after nir_opt_peephole_select.
#
# NOTE: fadd is not handled here because that gets in the way of ffma
# generation in the i965 driver.  Instead, fadd and ffma are handled in
# late_optimizations.

# Sink a bcsel below two flrp instructions that differ in exactly one source.
for op in ['flrp']:
    optimizations += [
        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))),
        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))),
        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)),
        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),
        (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, e, c, d)), (op, ('bcsel', a, b, e), c, d)),
        (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', e, c, d)), (op, ('bcsel', a, b, e), c, d)),
    ]

# Same idea for two-source operations sharing their first source.  Each
# variant requires one of the differing sources to be non-constant.
for op in ['fmulz', 'fmul', 'iadd', 'imul', 'iand', 'ior', 'ixor', 'fmin', 'fmax', 'imin', 'imax', 'umin', 'umax']:
    optimizations += [
        (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))),
        (('bcsel', a, (op + '(is_used_once)', b, 'c(is_not_const)'), (op, b, d)), (op, b, ('bcsel', a, c, d))),
        (('bcsel', a, (op, b, 'c(is_not_const)'), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
        (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))),
    ]

# fpow is listed separately: the shared source may be in either position.
for op in ['fpow']:
    optimizations += [
        (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))),
        (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))),
        (('bcsel', a, (op + '(is_used_once)', b, c), (op, d, c)), (op, ('bcsel', a, b, d), c)),
        (('bcsel', a, (op, b, c), (op + '(is_used_once)', d, c)), (op, ('bcsel', a, b, d), c)),
    ]

# Single-source float operations: select the operand, then apply the op once.
# Each opcode is listed exactly once so that no rule is generated twice.
for op in ['frcp', 'frsq', 'fsqrt', 'fexp2', 'flog2', 'fsign', 'fsin', 'fcos', 'fsin_amd', 'fcos_amd', 'fneg', 'fabs']:
    optimizations += [
        (('bcsel', c, (op + '(is_used_once)', a), (op + '(is_used_once)', b)), (op, ('bcsel', c, a, b))),
    ]

# For integer single-source operations on a select between two constants, go
# the other way and push the op into both arms.
for op in ['ineg', 'iabs', 'inot', 'isign']:
    optimizations += [
        ((op, ('bcsel', c, '#a', '#b')), ('bcsel', c, (op, a), (op, b))),
    ]

optimizations.extend([
    (('fisnormal', 'a@16'), ('ult', 0xfff, ('iadd', ('ishl', a, 1), 0x800)), 'options->lower_fisnormal'),
    (('fisnormal', 'a@32'), ('ult', 0x1ffffff, ('iadd', ('ishl', a, 1), 0x1000000)), 'options->lower_fisnormal'),
    (('fisnormal', 'a@64'), ('ult', 0x3fffffffffffff, ('iadd', ('ishl', a, 1), 0x20000000000000)), 'options->lower_fisnormal')
])

# This section contains optimizations to propagate downsizing conversions of
# constructed vectors into vectors of downsized components. Whether this is
# useful depends on the SIMD semantics of the backend. On a true SIMD machine,
# this reduces the register pressure of the vector itself and often enables the
# conversions to be eliminated via other algebraic rules or constant folding.
# In the worst case on a SIMD architecture, the propagated conversions may be
# revectorized via nir_opt_vectorize so instruction count is minimally
# impacted.
#
# On a machine with SIMD-within-a-register only, this actually
# counterintuitively hurts instruction count. These machines are the same that
# require vectorize_vec2_16bit, so we predicate the optimizations on that flag
# not being set.
#
# Finally for scalar architectures, there should be no difference in generated
# code since it all ends up scalarized at the end, but it might minimally help
# compile-times.

for i in range(2, 4 + 1):
    for T in ('f', 'u', 'i'):
        # Name of the vector constructor with i components, as a 1-tuple so
        # that the source tuples can be concatenated onto it.
        vec_inst = ('vec' + str(i),)

        indices = ['a', 'b', 'c', 'd']
        suffix_in = tuple((indices[j] + '@32') for j in range(i))

        to_16 = '{}2{}16'.format(T, T)
        to_mp = '{}2{}mp'.format(T, T)

        # The same conversion applied to each component individually.
        out_16 = tuple((to_16, indices[j]) for j in range(i))
        out_mp = tuple((to_mp, indices[j]) for j in range(i))

        optimizations += [
            ((to_16, vec_inst + suffix_in), vec_inst + out_16, '!options->vectorize_vec2_16bit'),
        ]
        # u2ump doesn't exist, because it's equal to i2imp
        if T in ['f', 'i']:
            optimizations += [
                ((to_mp, vec_inst + suffix_in), vec_inst + out_mp, '!options->vectorize_vec2_16bit')
            ]

# This section contains "late" optimizations that should be run before
# creating ffmas and calling regular optimizations for the final time.
# Optimizations should go here if they help code generation and conflict
# with the regular optimizations.
before_ffma_optimizations = [
    # Propagate constants down multiplication chains
    (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fmul', ('fmul', a, c), b)),
    (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('imul', ('imul', a, c), b)),
    (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fadd', ('fadd', a, c), b)),
    (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('iadd', ('iadd', a, c), b)),

    # Factor out a common multiplicand and cancel a value against its own
    # negation.  The float forms are marked inexact.
    (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
    (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
    (('~fadd', ('fneg', a), a), 0.0),
    (('iadd', ('ineg', a), a), 0),
    (('iadd', ('ineg', a), ('iadd', a, b)), b),
    (('iadd', a, ('iadd', ('ineg', a), b)), b),
    (('~fadd', ('fneg', a), ('fadd', a, b)), b),
    (('~fadd', a, ('fadd', ('fneg', a), b)), b),

    # Pull a common addend out of both interpolation endpoints of a flrp.
    (('~flrp', ('fadd(is_used_once)', a, -1.0), ('fadd(is_used_once)', a, 1.0), d), ('fadd', ('flrp', -1.0, 1.0, d), a)),
    (('~flrp', ('fadd(is_used_once)', a, 1.0), ('fadd(is_used_once)', a, -1.0), d), ('fadd', ('flrp', 1.0, -1.0, d), a)),
    (('~flrp', ('fadd(is_used_once)', a, '#b'), ('fadd(is_used_once)', a, '#c'), d), ('fadd', ('fmul', d, ('fadd', c, ('fneg', b))), ('fadd', a, b))),
]

# This section contains "late"
optimizations that should be run after the 2515bf215546Sopenharmony_ci# regular optimizations have finished. Optimizations should go here if 2516bf215546Sopenharmony_ci# they help code generation but do not necessarily produce code that is 2517bf215546Sopenharmony_ci# more easily optimizable. 2518bf215546Sopenharmony_cilate_optimizations = [ 2519bf215546Sopenharmony_ci # The rearrangements are fine w.r.t. NaN. However, they produce incorrect 2520bf215546Sopenharmony_ci # results if one operand is +Inf and the other is -Inf. 2521bf215546Sopenharmony_ci # 2522bf215546Sopenharmony_ci # 1. Inf + -Inf = NaN 2523bf215546Sopenharmony_ci # 2. ∀x: x + NaN = NaN and x - NaN = NaN 2524bf215546Sopenharmony_ci # 3. ∀x: x != NaN = true 2525bf215546Sopenharmony_ci # 4. ∀x, ∀ cmp ∈ {<, >, ≤, ≥, =}: x cmp NaN = false 2526bf215546Sopenharmony_ci # 2527bf215546Sopenharmony_ci # a=Inf, b=-Inf a=-Inf, b=Inf a=NaN b=NaN 2528bf215546Sopenharmony_ci # (a+b) < 0 false false false false 2529bf215546Sopenharmony_ci # a < -b false false false false 2530bf215546Sopenharmony_ci # -(a+b) < 0 false false false false 2531bf215546Sopenharmony_ci # -a < b false false false false 2532bf215546Sopenharmony_ci # (a+b) >= 0 false false false false 2533bf215546Sopenharmony_ci # a >= -b true true false false 2534bf215546Sopenharmony_ci # -(a+b) >= 0 false false false false 2535bf215546Sopenharmony_ci # -a >= b true true false false 2536bf215546Sopenharmony_ci # (a+b) == 0 false false false false 2537bf215546Sopenharmony_ci # a == -b true true false false 2538bf215546Sopenharmony_ci # (a+b) != 0 true true true true 2539bf215546Sopenharmony_ci # a != -b false false true true 2540bf215546Sopenharmony_ci (('flt', ('fadd(is_used_once)', a, b), 0.0), ('flt', a, ('fneg', b))), 2541bf215546Sopenharmony_ci (('flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('flt', ('fneg', a), b)), 2542bf215546Sopenharmony_ci (('flt', 0.0, ('fadd(is_used_once)', a, b) ), ('flt', ('fneg', a), b)), 
2543bf215546Sopenharmony_ci (('flt', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('flt', a, ('fneg', b))), 2544bf215546Sopenharmony_ci (('~fge', ('fadd(is_used_once)', a, b), 0.0), ('fge', a, ('fneg', b))), 2545bf215546Sopenharmony_ci (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('fge', ('fneg', a), b)), 2546bf215546Sopenharmony_ci (('~fge', 0.0, ('fadd(is_used_once)', a, b) ), ('fge', ('fneg', a), b)), 2547bf215546Sopenharmony_ci (('~fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fge', a, ('fneg', b))), 2548bf215546Sopenharmony_ci (('~feq', ('fadd(is_used_once)', a, b), 0.0), ('feq', a, ('fneg', b))), 2549bf215546Sopenharmony_ci (('~fneu', ('fadd(is_used_once)', a, b), 0.0), ('fneu', a, ('fneg', b))), 2550bf215546Sopenharmony_ci 2551bf215546Sopenharmony_ci # If either source must be finite, then the original (a+b) cannot produce 2552bf215546Sopenharmony_ci # NaN due to Inf-Inf. The patterns and the replacements produce the same 2553bf215546Sopenharmony_ci # result if b is NaN. Therefore, the replacements are exact. 2554bf215546Sopenharmony_ci (('fge', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('fge', a, ('fneg', b))), 2555bf215546Sopenharmony_ci (('fge', ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b)), 0.0), ('fge', ('fneg', a), b)), 2556bf215546Sopenharmony_ci (('fge', 0.0, ('fadd(is_used_once)', 'a(is_finite)', b) ), ('fge', ('fneg', a), b)), 2557bf215546Sopenharmony_ci (('fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b))), ('fge', a, ('fneg', b))), 2558bf215546Sopenharmony_ci (('feq', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('feq', a, ('fneg', b))), 2559bf215546Sopenharmony_ci (('fneu', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('fneu', a, ('fneg', b))), 2560bf215546Sopenharmony_ci 2561bf215546Sopenharmony_ci # This is how SpvOpFOrdNotEqual might be implemented. Replace it with 2562bf215546Sopenharmony_ci # SpvOpLessOrGreater. 
2563bf215546Sopenharmony_ci (('iand', ('fneu', a, b), ('iand', ('feq', a, a), ('feq', b, b))), ('ior', ('!flt', a, b), ('!flt', b, a))), 2564bf215546Sopenharmony_ci (('iand', ('fneu', a, 0.0), ('feq', a, a) ), ('!flt', 0.0, ('fabs', a))), 2565bf215546Sopenharmony_ci 2566bf215546Sopenharmony_ci # This is how SpvOpFUnordEqual might be implemented. Replace it with 2567bf215546Sopenharmony_ci # !SpvOpLessOrGreater. 2568bf215546Sopenharmony_ci (('ior', ('feq', a, b), ('ior', ('fneu', a, a), ('fneu', b, b))), ('inot', ('ior', ('!flt', a, b), ('!flt', b, a)))), 2569bf215546Sopenharmony_ci (('ior', ('feq', a, 0.0), ('fneu', a, a), ), ('inot', ('!flt', 0.0, ('fabs', a)))), 2570bf215546Sopenharmony_ci 2571bf215546Sopenharmony_ci # nir_lower_to_source_mods will collapse this, but its existence during the 2572bf215546Sopenharmony_ci # optimization loop can prevent other optimizations. 2573bf215546Sopenharmony_ci (('fneg', ('fneg', a)), a), 2574bf215546Sopenharmony_ci 2575bf215546Sopenharmony_ci # re-combine inexact mul+add to ffma. Do this before fsub so that a * b - c 2576bf215546Sopenharmony_ci # gets combined to fma(a, b, -c). 
2577bf215546Sopenharmony_ci (('~fadd@16', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma16'), 2578bf215546Sopenharmony_ci (('~fadd@32', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma32'), 2579bf215546Sopenharmony_ci (('~fadd@64', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma64'), 2580bf215546Sopenharmony_ci (('~fadd@32', ('fmulz', a, b), c), ('ffmaz', a, b, c), 'options->fuse_ffma32'), 2581bf215546Sopenharmony_ci 2582bf215546Sopenharmony_ci # Subtractions get lowered during optimization, so we need to recombine them 2583bf215546Sopenharmony_ci (('fadd@8', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'), 2584bf215546Sopenharmony_ci (('fadd@16', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'), 2585bf215546Sopenharmony_ci (('fadd@32', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'), 2586bf215546Sopenharmony_ci (('fadd@64', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub && !(options->lower_doubles_options & nir_lower_dsub)'), 2587bf215546Sopenharmony_ci 2588bf215546Sopenharmony_ci (('fneg', a), ('fmul', a, -1.0), 'options->lower_fneg'), 2589bf215546Sopenharmony_ci (('iadd', a, ('ineg', 'b')), ('isub', 'a', 'b'), 'options->has_isub || options->lower_ineg'), 2590bf215546Sopenharmony_ci (('ineg', a), ('isub', 0, a), 'options->lower_ineg'), 2591bf215546Sopenharmony_ci (('iabs', a), ('imax', a, ('ineg', a)), 'options->lower_iabs'), 2592bf215546Sopenharmony_ci 2593bf215546Sopenharmony_ci (('iadd', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, b, c), 'options->has_iadd3'), 2594bf215546Sopenharmony_ci (('iadd', ('isub(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, ('ineg', b), c), 'options->has_iadd3'), 2595bf215546Sopenharmony_ci (('isub', ('isub(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, ('ineg', b), ('ineg', c)), 'options->has_iadd3'), 
2596bf215546Sopenharmony_ci 2597bf215546Sopenharmony_ci # fneg_lo / fneg_hi 2598bf215546Sopenharmony_ci (('vec2(is_only_used_as_float)', ('fneg@16', a), b), ('fmul', ('vec2', a, b), ('vec2', -1.0, 1.0)), 'options->vectorize_vec2_16bit'), 2599bf215546Sopenharmony_ci (('vec2(is_only_used_as_float)', a, ('fneg@16', b)), ('fmul', ('vec2', a, b), ('vec2', 1.0, -1.0)), 'options->vectorize_vec2_16bit'), 2600bf215546Sopenharmony_ci 2601bf215546Sopenharmony_ci # These are duplicated from the main optimizations table. The late 2602bf215546Sopenharmony_ci # patterns that rearrange expressions like x - .5 < 0 to x < .5 can create 2603bf215546Sopenharmony_ci # new patterns like these. The patterns that compare with zero are removed 2604bf215546Sopenharmony_ci # because they are unlikely to be created in by anything in 2605bf215546Sopenharmony_ci # late_optimizations. 2606bf215546Sopenharmony_ci (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)), 2607bf215546Sopenharmony_ci (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)), 2608bf215546Sopenharmony_ci (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)), 2609bf215546Sopenharmony_ci (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)), 2610bf215546Sopenharmony_ci 2611bf215546Sopenharmony_ci (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)), 2612bf215546Sopenharmony_ci 2613bf215546Sopenharmony_ci (('~fge', ('fmin(is_used_once)', ('fadd(is_used_once)', a, b), ('fadd', c, d)), 0.0), ('iand', ('fge', a, ('fneg', b)), ('fge', c, ('fneg', d)))), 2614bf215546Sopenharmony_ci 2615bf215546Sopenharmony_ci (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)), 2616bf215546Sopenharmony_ci (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)), 2617bf215546Sopenharmony_ci (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)), 2618bf215546Sopenharmony_ci (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)), 2619bf215546Sopenharmony_ci (('flt', 
('fneg', a), -1.0), ('flt', 1.0, a)), 2620bf215546Sopenharmony_ci (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)), 2621bf215546Sopenharmony_ci (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)), 2622bf215546Sopenharmony_ci (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)), 2623bf215546Sopenharmony_ci (('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)), 2624bf215546Sopenharmony_ci (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)), 2625bf215546Sopenharmony_ci 2626bf215546Sopenharmony_ci (('ior', a, a), a), 2627bf215546Sopenharmony_ci (('iand', a, a), a), 2628bf215546Sopenharmony_ci 2629bf215546Sopenharmony_ci (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))), 2630bf215546Sopenharmony_ci 2631bf215546Sopenharmony_ci (('fdot2', a, b), ('fdot2_replicated', a, b), 'options->fdot_replicates'), 2632bf215546Sopenharmony_ci (('fdot3', a, b), ('fdot3_replicated', a, b), 'options->fdot_replicates'), 2633bf215546Sopenharmony_ci (('fdot4', a, b), ('fdot4_replicated', a, b), 'options->fdot_replicates'), 2634bf215546Sopenharmony_ci (('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'), 2635bf215546Sopenharmony_ci 2636bf215546Sopenharmony_ci (('~flrp', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)), 2637bf215546Sopenharmony_ci 2638bf215546Sopenharmony_ci # Approximate handling of fround_even for DX9 addressing from gallium nine on 2639bf215546Sopenharmony_ci # DX9-class hardware with no proper fround support. This is in 2640bf215546Sopenharmony_ci # late_optimizations so that the is_integral() opts in the main pass get a 2641bf215546Sopenharmony_ci # chance to eliminate the fround_even first. 
2642bf215546Sopenharmony_ci (('fround_even', a), ('bcsel', 2643bf215546Sopenharmony_ci ('feq', ('ffract', a), 0.5), 2644bf215546Sopenharmony_ci ('fadd', ('ffloor', ('fadd', a, 0.5)), 1.0), 2645bf215546Sopenharmony_ci ('ffloor', ('fadd', a, 0.5))), 'options->lower_fround_even'), 2646bf215546Sopenharmony_ci 2647bf215546Sopenharmony_ci # A similar operation could apply to any ffma(#a, b, #(-a/2)), but this 2648bf215546Sopenharmony_ci # particular operation is common for expanding values stored in a texture 2649bf215546Sopenharmony_ci # from [0,1] to [-1,1]. 2650bf215546Sopenharmony_ci (('~ffma@32', a, 2.0, -1.0), ('flrp', -1.0, 1.0, a ), '!options->lower_flrp32'), 2651bf215546Sopenharmony_ci (('~ffma@32', a, -2.0, -1.0), ('flrp', -1.0, 1.0, ('fneg', a)), '!options->lower_flrp32'), 2652bf215546Sopenharmony_ci (('~ffma@32', a, -2.0, 1.0), ('flrp', 1.0, -1.0, a ), '!options->lower_flrp32'), 2653bf215546Sopenharmony_ci (('~ffma@32', a, 2.0, 1.0), ('flrp', 1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'), 2654bf215546Sopenharmony_ci (('~fadd@32', ('fmul(is_used_once)', 2.0, a), -1.0), ('flrp', -1.0, 1.0, a ), '!options->lower_flrp32'), 2655bf215546Sopenharmony_ci (('~fadd@32', ('fmul(is_used_once)', -2.0, a), -1.0), ('flrp', -1.0, 1.0, ('fneg', a)), '!options->lower_flrp32'), 2656bf215546Sopenharmony_ci (('~fadd@32', ('fmul(is_used_once)', -2.0, a), 1.0), ('flrp', 1.0, -1.0, a ), '!options->lower_flrp32'), 2657bf215546Sopenharmony_ci (('~fadd@32', ('fmul(is_used_once)', 2.0, a), 1.0), ('flrp', 1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'), 2658bf215546Sopenharmony_ci 2659bf215546Sopenharmony_ci # flrp(a, b, a) 2660bf215546Sopenharmony_ci # a*(1-a) + b*a 2661bf215546Sopenharmony_ci # a + -a*a + a*b (1) 2662bf215546Sopenharmony_ci # a + a*(b - a) 2663bf215546Sopenharmony_ci # Option 1: ffma(a, (b-a), a) 2664bf215546Sopenharmony_ci # 2665bf215546Sopenharmony_ci # Alternately, after (1): 2666bf215546Sopenharmony_ci # a*(1+b) + -a*a 2667bf215546Sopenharmony_ci # a*((1+b) 
+ -a) 2668bf215546Sopenharmony_ci # 2669bf215546Sopenharmony_ci # Let b=1 2670bf215546Sopenharmony_ci # 2671bf215546Sopenharmony_ci # Option 2: ffma(a, 2, -(a*a)) 2672bf215546Sopenharmony_ci # Option 3: ffma(a, 2, (-a)*a) 2673bf215546Sopenharmony_ci # Option 4: ffma(a, -a, (2*a) 2674bf215546Sopenharmony_ci # Option 5: a * (2 - a) 2675bf215546Sopenharmony_ci # 2676bf215546Sopenharmony_ci # There are a lot of other possible combinations. 2677bf215546Sopenharmony_ci (('~ffma@32', ('fadd', b, ('fneg', a)), a, a), ('flrp', a, b, a), '!options->lower_flrp32'), 2678bf215546Sopenharmony_ci (('~ffma@32', a, 2.0, ('fneg', ('fmul', a, a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'), 2679bf215546Sopenharmony_ci (('~ffma@32', a, 2.0, ('fmul', ('fneg', a), a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'), 2680bf215546Sopenharmony_ci (('~ffma@32', a, ('fneg', a), ('fmul', 2.0, a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'), 2681bf215546Sopenharmony_ci (('~fmul@32', a, ('fadd', 2.0, ('fneg', a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'), 2682bf215546Sopenharmony_ci 2683bf215546Sopenharmony_ci # we do these late so that we don't get in the way of creating ffmas 2684bf215546Sopenharmony_ci (('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))), 2685bf215546Sopenharmony_ci (('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))), 2686bf215546Sopenharmony_ci 2687bf215546Sopenharmony_ci # Putting this in 'optimizations' interferes with the bcsel(a, op(b, c), 2688bf215546Sopenharmony_ci # op(b, d)) => op(b, bcsel(a, c, d)) transformations. I do not know why. 
2689bf215546Sopenharmony_ci (('bcsel', ('feq', ('fsqrt', 'a(is_not_negative)'), 0.0), intBitsToFloat(0x7f7fffff), ('frsq', a)), 2690bf215546Sopenharmony_ci ('fmin', ('frsq', a), intBitsToFloat(0x7f7fffff))), 2691bf215546Sopenharmony_ci 2692bf215546Sopenharmony_ci # Things that look like DPH in the source shader may get expanded to 2693bf215546Sopenharmony_ci # something that looks like dot(v1.xyz, v2.xyz) + v1.w by the time it gets 2694bf215546Sopenharmony_ci # to NIR. After FFMA is generated, this can look like: 2695bf215546Sopenharmony_ci # 2696bf215546Sopenharmony_ci # fadd(ffma(v1.z, v2.z, ffma(v1.y, v2.y, fmul(v1.x, v2.x))), v1.w) 2697bf215546Sopenharmony_ci # 2698bf215546Sopenharmony_ci # Reassociate the last addition into the first multiplication. 2699bf215546Sopenharmony_ci # 2700bf215546Sopenharmony_ci # Some shaders do not use 'invariant' in vertex and (possibly) geometry 2701bf215546Sopenharmony_ci # shader stages on some outputs that are intended to be invariant. For 2702bf215546Sopenharmony_ci # various reasons, this optimization may not be fully applied in all 2703bf215546Sopenharmony_ci # shaders used for different rendering passes of the same geometry. This 2704bf215546Sopenharmony_ci # can result in Z-fighting artifacts (at best). For now, disable this 2705bf215546Sopenharmony_ci # optimization in these stages. See bugzilla #111490. In tessellation 2706bf215546Sopenharmony_ci # stages applications seem to use 'precise' when necessary, so allow the 2707bf215546Sopenharmony_ci # optimization in those stages. 
2708bf215546Sopenharmony_ci (('~fadd', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'), 2709bf215546Sopenharmony_ci ('ffma', a, b, ('ffma', c, d, ('ffma', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 2710bf215546Sopenharmony_ci (('~fadd', ('ffma(is_used_once)', a, b, ('fmul(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'), 2711bf215546Sopenharmony_ci ('ffma', a, b, ('ffma', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 2712bf215546Sopenharmony_ci (('~fadd', ('fneg', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'), 2713bf215546Sopenharmony_ci ('ffma', ('fneg', a), b, ('ffma', ('fneg', c), d, ('ffma', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 2714bf215546Sopenharmony_ci 2715bf215546Sopenharmony_ci (('~fadd', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'), 2716bf215546Sopenharmony_ci ('ffmaz', a, b, ('ffmaz', c, d, ('ffmaz', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 2717bf215546Sopenharmony_ci (('~fadd', ('ffmaz(is_used_once)', a, b, ('fmulz(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'), 2718bf215546Sopenharmony_ci ('ffmaz', a, b, ('ffmaz', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 2719bf215546Sopenharmony_ci (('~fadd', ('fneg', ('ffmaz(is_used_once)', a, b, 
('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'), 2720bf215546Sopenharmony_ci ('ffmaz', ('fneg', a), b, ('ffmaz', ('fneg', c), d, ('ffmaz', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 2721bf215546Sopenharmony_ci 2722bf215546Sopenharmony_ci # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says: 2723bf215546Sopenharmony_ci # 2724bf215546Sopenharmony_ci # If bits is zero, the result will be zero. 2725bf215546Sopenharmony_ci # 2726bf215546Sopenharmony_ci # These prevent the next two lowerings generating incorrect results when 2727bf215546Sopenharmony_ci # count is zero. 2728bf215546Sopenharmony_ci (('ubfe', a, b, 0), 0), 2729bf215546Sopenharmony_ci (('ibfe', a, b, 0), 0), 2730bf215546Sopenharmony_ci 2731bf215546Sopenharmony_ci # On Intel GPUs, BFE is a 3-source instruction. Like all 3-source 2732bf215546Sopenharmony_ci # instructions on Intel GPUs, it cannot have an immediate values as 2733bf215546Sopenharmony_ci # sources. There are also limitations on source register strides. As a 2734bf215546Sopenharmony_ci # result, it is very easy for 3-source instruction combined with either 2735bf215546Sopenharmony_ci # loads of immediate values or copies from weird register strides to be 2736bf215546Sopenharmony_ci # more expensive than the primitive instructions it represents. 2737bf215546Sopenharmony_ci (('ubfe', a, '#b', '#c'), ('iand', ('ushr', 0xffffffff, ('ineg', c)), ('ushr', a, b)), 'options->avoid_ternary_with_two_constants'), 2738bf215546Sopenharmony_ci 2739bf215546Sopenharmony_ci # b is the lowest order bit to be extracted and c is the number of bits to 2740bf215546Sopenharmony_ci # extract. The inner shift removes the bits above b + c by shifting left 2741bf215546Sopenharmony_ci # 32 - (b + c). ishl only sees the low 5 bits of the shift count, which is 2742bf215546Sopenharmony_ci # -(b + c). 
The outer shift moves the bit that was at b to bit zero. 2743bf215546Sopenharmony_ci # After the first shift, that bit is now at b + (32 - (b + c)) or 32 - c. 2744bf215546Sopenharmony_ci # This means that it must be shifted right by 32 - c or -c bits. 2745bf215546Sopenharmony_ci (('ibfe', a, '#b', '#c'), ('ishr', ('ishl', a, ('ineg', ('iadd', b, c))), ('ineg', c)), 'options->avoid_ternary_with_two_constants'), 2746bf215546Sopenharmony_ci 2747bf215546Sopenharmony_ci # Clean up no-op shifts that may result from the bfe lowerings. 2748bf215546Sopenharmony_ci (('ishl', a, 0), a), 2749bf215546Sopenharmony_ci (('ishl', a, -32), a), 2750bf215546Sopenharmony_ci (('ishr', a, 0), a), 2751bf215546Sopenharmony_ci (('ishr', a, -32), a), 2752bf215546Sopenharmony_ci (('ushr', a, 0), a), 2753bf215546Sopenharmony_ci 2754bf215546Sopenharmony_ci (('extract_i8', ('extract_i8', a, b), 0), ('extract_i8', a, b)), 2755bf215546Sopenharmony_ci (('extract_i8', ('extract_u8', a, b), 0), ('extract_i8', a, b)), 2756bf215546Sopenharmony_ci (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)), 2757bf215546Sopenharmony_ci (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)), 2758bf215546Sopenharmony_ci] 2759bf215546Sopenharmony_ci 2760bf215546Sopenharmony_ci# A few more extract cases we'd rather leave late 2761bf215546Sopenharmony_cifor N in [16, 32]: 2762bf215546Sopenharmony_ci aN = 'a@{0}'.format(N) 2763bf215546Sopenharmony_ci u2uM = 'u2u{0}'.format(M) 2764bf215546Sopenharmony_ci i2iM = 'i2i{0}'.format(M) 2765bf215546Sopenharmony_ci 2766bf215546Sopenharmony_ci for x in ['u', 'i']: 2767bf215546Sopenharmony_ci x2xN = '{0}2{0}{1}'.format(x, N) 2768bf215546Sopenharmony_ci extract_x8 = 'extract_{0}8'.format(x) 2769bf215546Sopenharmony_ci extract_x16 = 'extract_{0}16'.format(x) 2770bf215546Sopenharmony_ci 2771bf215546Sopenharmony_ci late_optimizations.extend([ 2772bf215546Sopenharmony_ci ((x2xN, ('u2u8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'), 
2773bf215546Sopenharmony_ci ((x2xN, ('i2i8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'), 2774bf215546Sopenharmony_ci ]) 2775bf215546Sopenharmony_ci 2776bf215546Sopenharmony_ci if N > 16: 2777bf215546Sopenharmony_ci late_optimizations.extend([ 2778bf215546Sopenharmony_ci ((x2xN, ('u2u16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'), 2779bf215546Sopenharmony_ci ((x2xN, ('i2i16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'), 2780bf215546Sopenharmony_ci ]) 2781bf215546Sopenharmony_ci 2782bf215546Sopenharmony_ci# Byte insertion 2783bf215546Sopenharmony_cilate_optimizations.extend([(('ishl', ('extract_u8', 'a@32', 0), 8 * i), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)]) 2784bf215546Sopenharmony_cilate_optimizations.extend([(('iand', ('ishl', 'a@32', 8 * i), 0xff << (8 * i)), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)]) 2785bf215546Sopenharmony_cilate_optimizations.append((('ishl', 'a@32', 24), ('insert_u8', a, 3), '!options->lower_insert_byte')) 2786bf215546Sopenharmony_ci 2787bf215546Sopenharmony_cilate_optimizations += [ 2788bf215546Sopenharmony_ci # Word insertion 2789bf215546Sopenharmony_ci (('ishl', 'a@32', 16), ('insert_u16', a, 1), '!options->lower_insert_word'), 2790bf215546Sopenharmony_ci 2791bf215546Sopenharmony_ci # Extract and then insert 2792bf215546Sopenharmony_ci (('insert_u8', ('extract_u8', 'a', 0), b), ('insert_u8', a, b)), 2793bf215546Sopenharmony_ci (('insert_u16', ('extract_u16', 'a', 0), b), ('insert_u16', a, b)), 2794bf215546Sopenharmony_ci] 2795bf215546Sopenharmony_ci 2796bf215546Sopenharmony_ci# Integer sizes 2797bf215546Sopenharmony_cifor s in [8, 16, 32, 64]: 2798bf215546Sopenharmony_ci late_optimizations.extend([ 2799bf215546Sopenharmony_ci (('iand', ('ine(is_used_once)', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umin', a, b), 0)), 2800bf215546Sopenharmony_ci (('ior', ('ieq(is_used_once)', 'a@{}'.format(s), 0), ('ieq', 
'b@{}'.format(s), 0)), ('ieq', ('umin', a, b), 0)), 2801bf215546Sopenharmony_ci ]) 2802bf215546Sopenharmony_ci 2803bf215546Sopenharmony_ci# Float sizes 2804bf215546Sopenharmony_cifor s in [16, 32, 64]: 2805bf215546Sopenharmony_ci late_optimizations.extend([ 2806bf215546Sopenharmony_ci (('~fadd@{}'.format(s), 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp{}'.format(s)), 2807bf215546Sopenharmony_ci (('bcsel', a, 0, ('b2f{}'.format(s), ('inot', 'b@bool'))), ('b2f{}'.format(s), ('inot', ('ior', a, b)))), 2808bf215546Sopenharmony_ci ]) 2809bf215546Sopenharmony_ci 2810bf215546Sopenharmony_cifor op in ['fadd']: 2811bf215546Sopenharmony_ci late_optimizations += [ 2812bf215546Sopenharmony_ci (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))), 2813bf215546Sopenharmony_ci (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))), 2814bf215546Sopenharmony_ci ] 2815bf215546Sopenharmony_ci 2816bf215546Sopenharmony_cifor op in ['ffma', 'ffmaz']: 2817bf215546Sopenharmony_ci late_optimizations += [ 2818bf215546Sopenharmony_ci (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))), 2819bf215546Sopenharmony_ci (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))), 2820bf215546Sopenharmony_ci 2821bf215546Sopenharmony_ci (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)), 2822bf215546Sopenharmony_ci (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)), 2823bf215546Sopenharmony_ci ] 2824bf215546Sopenharmony_ci 2825bf215546Sopenharmony_ci# mediump: If an opcode is surrounded by conversions, remove the conversions. 
2826bf215546Sopenharmony_ci# The rationale is that type conversions + the low precision opcode are more 2827bf215546Sopenharmony_ci# expensive that the same arithmetic opcode at higher precision. 2828bf215546Sopenharmony_ci# 2829bf215546Sopenharmony_ci# This must be done in late optimizations, because we need normal optimizations to 2830bf215546Sopenharmony_ci# first eliminate temporary up-conversions such as in op1(f2fmp(f2f32(op2()))). 2831bf215546Sopenharmony_ci# 2832bf215546Sopenharmony_ci# Unary opcodes 2833bf215546Sopenharmony_cifor op in ['fabs', 'fceil', 'fcos', 'fddx', 'fddx_coarse', 'fddx_fine', 'fddy', 2834bf215546Sopenharmony_ci 'fddy_coarse', 'fddy_fine', 'fexp2', 'ffloor', 'ffract', 'flog2', 'fneg', 2835bf215546Sopenharmony_ci 'frcp', 'fround_even', 'frsq', 'fsat', 'fsign', 'fsin', 'fsqrt']: 2836bf215546Sopenharmony_ci late_optimizations += [(('~f2f32', (op, ('f2fmp', a))), (op, a))] 2837bf215546Sopenharmony_ci 2838bf215546Sopenharmony_ci# Binary opcodes 2839bf215546Sopenharmony_cifor op in ['fadd', 'fdiv', 'fmax', 'fmin', 'fmod', 'fmul', 'fpow', 'frem']: 2840bf215546Sopenharmony_ci late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b))), (op, a, b))] 2841bf215546Sopenharmony_ci 2842bf215546Sopenharmony_ci# Ternary opcodes 2843bf215546Sopenharmony_cifor op in ['ffma', 'flrp']: 2844bf215546Sopenharmony_ci late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b), ('f2fmp', c))), (op, a, b, c))] 2845bf215546Sopenharmony_ci 2846bf215546Sopenharmony_ci# Comparison opcodes 2847bf215546Sopenharmony_cifor op in ['feq', 'fge', 'flt', 'fneu']: 2848bf215546Sopenharmony_ci late_optimizations += [(('~' + op, ('f2fmp', a), ('f2fmp', b)), (op, a, b))] 2849bf215546Sopenharmony_ci 2850bf215546Sopenharmony_ci# Do this last, so that the f2fmp patterns above have effect. 2851bf215546Sopenharmony_cilate_optimizations += [ 2852bf215546Sopenharmony_ci # Convert *2*mp instructions to concrete *2*16 instructions. 
At this point 2853bf215546Sopenharmony_ci # any conversions that could have been removed will have been removed in 2854bf215546Sopenharmony_ci # nir_opt_algebraic so any remaining ones are required. 2855bf215546Sopenharmony_ci (('f2fmp', a), ('f2f16', a)), 2856bf215546Sopenharmony_ci (('f2imp', a), ('f2i16', a)), 2857bf215546Sopenharmony_ci (('f2ump', a), ('f2u16', a)), 2858bf215546Sopenharmony_ci (('i2imp', a), ('i2i16', a)), 2859bf215546Sopenharmony_ci (('i2fmp', a), ('i2f16', a)), 2860bf215546Sopenharmony_ci (('i2imp', a), ('u2u16', a)), 2861bf215546Sopenharmony_ci (('u2fmp', a), ('u2f16', a)), 2862bf215546Sopenharmony_ci (('fisfinite', a), ('flt', ('fabs', a), float("inf"))), 2863bf215546Sopenharmony_ci] 2864bf215546Sopenharmony_ci 2865bf215546Sopenharmony_cidistribute_src_mods = [ 2866bf215546Sopenharmony_ci # Try to remove some spurious negations rather than pushing them down. 2867bf215546Sopenharmony_ci (('fmul', ('fneg', a), ('fneg', b)), ('fmul', a, b)), 2868bf215546Sopenharmony_ci (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)), 2869bf215546Sopenharmony_ci (('fdot2_replicated', ('fneg', a), ('fneg', b)), ('fdot2_replicated', a, b)), 2870bf215546Sopenharmony_ci (('fdot3_replicated', ('fneg', a), ('fneg', b)), ('fdot3_replicated', a, b)), 2871bf215546Sopenharmony_ci (('fdot4_replicated', ('fneg', a), ('fneg', b)), ('fdot4_replicated', a, b)), 2872bf215546Sopenharmony_ci (('fneg', ('fneg', a)), a), 2873bf215546Sopenharmony_ci 2874bf215546Sopenharmony_ci (('fneg', ('fmul(is_used_once)', a, b)), ('fmul', ('fneg', a), b)), 2875bf215546Sopenharmony_ci (('fabs', ('fmul(is_used_once)', a, b)), ('fmul', ('fabs', a), ('fabs', b))), 2876bf215546Sopenharmony_ci 2877bf215546Sopenharmony_ci (('fneg', ('ffma(is_used_once)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))), 2878bf215546Sopenharmony_ci (('fneg', ('flrp(is_used_once)', a, b, c)), ('flrp', ('fneg', a), ('fneg', b), c)), 2879bf215546Sopenharmony_ci (('fneg', ('~fadd(is_used_once)', a, b)), 
('fadd', ('fneg', a), ('fneg', b))), 2880bf215546Sopenharmony_ci 2881bf215546Sopenharmony_ci # Note that fmin <-> fmax. I don't think there is a way to distribute 2882bf215546Sopenharmony_ci # fabs() into fmin or fmax. 2883bf215546Sopenharmony_ci (('fneg', ('fmin(is_used_once)', a, b)), ('fmax', ('fneg', a), ('fneg', b))), 2884bf215546Sopenharmony_ci (('fneg', ('fmax(is_used_once)', a, b)), ('fmin', ('fneg', a), ('fneg', b))), 2885bf215546Sopenharmony_ci 2886bf215546Sopenharmony_ci (('fneg', ('fdot2_replicated(is_used_once)', a, b)), ('fdot2_replicated', ('fneg', a), b)), 2887bf215546Sopenharmony_ci (('fneg', ('fdot3_replicated(is_used_once)', a, b)), ('fdot3_replicated', ('fneg', a), b)), 2888bf215546Sopenharmony_ci (('fneg', ('fdot4_replicated(is_used_once)', a, b)), ('fdot4_replicated', ('fneg', a), b)), 2889bf215546Sopenharmony_ci 2890bf215546Sopenharmony_ci # fdph works mostly like fdot, but to get the correct result, the negation 2891bf215546Sopenharmony_ci # must be applied to the second source. 2892bf215546Sopenharmony_ci (('fneg', ('fdph_replicated(is_used_once)', a, b)), ('fdph_replicated', a, ('fneg', b))), 2893bf215546Sopenharmony_ci 2894bf215546Sopenharmony_ci (('fneg', ('fsign(is_used_once)', a)), ('fsign', ('fneg', a))), 2895bf215546Sopenharmony_ci (('fabs', ('fsign(is_used_once)', a)), ('fsign', ('fabs', a))), 2896bf215546Sopenharmony_ci] 2897bf215546Sopenharmony_ci 2898bf215546Sopenharmony_ciprint(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render()) 2899bf215546Sopenharmony_ciprint(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma", 2900bf215546Sopenharmony_ci before_ffma_optimizations).render()) 2901bf215546Sopenharmony_ciprint(nir_algebraic.AlgebraicPass("nir_opt_algebraic_late", 2902bf215546Sopenharmony_ci late_optimizations).render()) 2903bf215546Sopenharmony_ciprint(nir_algebraic.AlgebraicPass("nir_opt_algebraic_distribute_src_mods", 2904bf215546Sopenharmony_ci distribute_src_mods).render()) 2905