# -*- coding: utf-8 -*-
#
# Copyright (C) 2014 Intel Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
# Authors:
#    Jason Ekstrand (jason@jlekstrand.net)

from collections import OrderedDict
import nir_algebraic
from nir_opcodes import type_sizes
import itertools
import struct
from math import pi
import math

# Convenience variables
a = 'a'
b = 'b'
c = 'c'
d = 'd'
e = 'e'

signed_zero_inf_nan_preserve_16 = 'nir_is_float_control_signed_zero_inf_nan_preserve(info->float_controls_execution_mode, 16)'
signed_zero_inf_nan_preserve_32 = 'nir_is_float_control_signed_zero_inf_nan_preserve(info->float_controls_execution_mode, 32)'

ignore_exact = nir_algebraic.ignore_exact

# Written in the form (<search>, <replace>) where <search> is an expression
# and <replace> is either an expression or a value. An expression is
# defined as a tuple of the form ([~]<op>, <src0>, <src1>, <src2>, <src3>)
# where each source is either an expression or a value. A value can be
# either a numeric constant or a string representing a variable name.
#
# If the opcode in a search expression is prefixed by a '~' character, this
# indicates that the operation is inexact. Such operations will only get
# applied to SSA values that do not have the exact bit set. This should be
# used by any optimizations that are not bit-for-bit exact. It should not,
# however, be used for backend-requested lowering operations as those need to
# happen regardless of precision.
#
# Variable names are specified as "[#]name[@type][(cond)][.swiz]" where:
# "#" indicates that the given variable will only match constants,
# type indicates that the given variable will only match values from ALU
#    instructions with the given output type,
# (cond) specifies an additional condition function (see nir_search_helpers.h),
# swiz is a swizzle applied to the variable (only in the <replace> expression)
#
# For constants, you have to be careful to make sure that it is the right
# type because python is unaware of the source and destination types of the
# opcodes.
#
# All expression types can have a bit-size specified. For opcodes, this
# looks like "op@32", for variables it is "a@32" or "a@uint32" to specify a
# type and size. In the search half of the expression this indicates that it
# should only match that particular bit-size. In the replace half of the
# expression this indicates that the constructed value should have that
# bit-size.
#
# If the opcode in a replacement expression is prefixed by a '!' character,
# this indicates that the new expression will be marked exact.
#
# A special condition "many-comm-expr" can be used with expressions to note
# that the expression and its subexpressions have more commutative expressions
# than nir_replace_instr can handle. If this special condition is needed with
# another condition, the two can be separated by a comma (e.g.,
# "(many-comm-expr,is_used_once)").
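
# As a purely illustrative (hypothetical, not part of the table below) example
# of the syntax described above, a rule could look like:
#
#    (('imul', a, 2), ('ishl', a, 1), '!options->lower_bitops')
#
# which reads: a multiply of any value 'a' by the constant 2 may be replaced
# with a left-shift of 'a' by 1, but only when the backend has not asked for
# bit operations to be lowered.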

# based on https://web.archive.org/web/20180105155939/http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648
def lowered_sincos(c):
    x = ('fsub', ('fmul', 2.0, ('ffract', ('fadd', ('fmul', 0.5 / pi, a), c))), 1.0)
    x = ('fmul', ('fsub', x, ('fmul', x, ('fabs', x))), 4.0)
    return ('ffma', ('ffma', x, ('fabs', x), ('fneg', x)), 0.225, x)

def intBitsToFloat(i):
    return struct.unpack('!f', struct.pack('!I', i))[0]

optimizations = [

   (('imul', a, '#b(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'),
   (('imul', 'a@8', 0x80), ('ishl', a, 7), '!options->lower_bitops'),
   (('imul', 'a@16', 0x8000), ('ishl', a, 15), '!options->lower_bitops'),
   (('imul', 'a@32', 0x80000000), ('ishl', a, 31), '!options->lower_bitops'),
   (('imul', 'a@64', 0x8000000000000000), ('ishl', a, 63), '!options->lower_bitops'),
   (('imul', a, '#b(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'),
   (('ishl', a, '#b'), ('imul', a, ('ishl', 1, b)), 'options->lower_bitops'),

   (('imul@64', a, '#b(is_bitcount2)'), ('iadd', ('ishl', a, ('ufind_msb', b)), ('ishl', a, ('find_lsb', b))),
    '!options->lower_bitops && (options->lower_int64_options & (nir_lower_imul64 | nir_lower_shift64)) == nir_lower_imul64'),

   (('unpack_64_2x32_split_x', ('imul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
   (('unpack_64_2x32_split_x', ('umul_2x32_64(is_used_once)', a, b)), ('imul', a, b)),
   (('imul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('imul_high', a, b)), 'options->lower_mul_2x32_64'),
   (('umul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('umul_high', a, b)), 'options->lower_mul_2x32_64'),
   (('udiv', a, 1), a),
   (('idiv', a, 1), a),
   (('umod', a, 1), 0),
   (('imod', a, 1), 0),
   (('imod', a, -1), 0),
   (('irem', a, 1), 0),
   (('irem', a, -1), 0),
   (('udiv', a, '#b(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b)), '!options->lower_bitops'),
   (('idiv', a, '#b(is_pos_power_of_two)'), ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', b))), '!options->lower_bitops'),
   (('idiv', a, '#b(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), '!options->lower_bitops'),
   (('umod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'),
   (('imod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'),
   (('imod', a, '#b(is_neg_power_of_two)'), ('bcsel', ('ieq', ('ior', a, b), b), 0, ('ior', a, b)), '!options->lower_bitops'),
   # 'irem(a, b)' -> 'a - ((a < 0 ? (a + b - 1) : a) & -b)'
   (('irem', a, '#b(is_pos_power_of_two)'),
    ('isub', a, ('iand', ('bcsel', ('ilt', a, 0), ('iadd', a, ('isub', b, 1)), a), ('ineg', b))),
    '!options->lower_bitops'),
   (('irem', a, '#b(is_neg_power_of_two)'), ('irem', a, ('iabs', b)), '!options->lower_bitops'),
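
   # A quick sanity check of the irem formula above (illustrative only): for
   # a = -7 and b = 4 the bcsel picks a + b - 1 = -4, and -4 & -4 = -4, so the
   # replacement computes -7 - (-4) = -3, which matches irem(-7, 4). For
   # a = 7 the bcsel picks 7, 7 & -4 = 4, and 7 - 4 = 3, which matches
   # irem(7, 4).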

   (('~fneg', ('fneg', a)), a),
   (('ineg', ('ineg', a)), a),
   (('fabs', ('fneg', a)), ('fabs', a)),
   (('fabs', ('u2f', a)), ('u2f', a)),
   (('iabs', ('iabs', a)), ('iabs', a)),
   (('iabs', ('ineg', a)), ('iabs', a)),
   (('f2b', ('fneg', a)), ('f2b', a)),
   (('i2b', ('ineg', a)), ('i2b', a)),
   (('~fadd', a, 0.0), a),
   # a + 0.0 is 'a' unless 'a' is denormal or -0.0. If it's only used by a
   # floating point instruction, it should flush any input denormals and we
   # can replace -0.0 with 0.0 if the float execution mode allows it.
   (('fadd(is_only_used_as_float)', 'a@16', 0.0), a, '!'+signed_zero_inf_nan_preserve_16),
   (('fadd(is_only_used_as_float)', 'a@32', 0.0), a, '!'+signed_zero_inf_nan_preserve_32),
   (('iadd', a, 0), a),
   (('iadd_sat', a, 0), a),
   (('isub_sat', a, 0), a),
   (('uadd_sat', a, 0), a),
   (('usub_sat', a, 0), a),
   (('usadd_4x8_vc4', a, 0), a),
   (('usadd_4x8_vc4', a, ~0), ~0),
   (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
   (('~fadd', ('fmulz', a, b), ('fmulz', a, c)), ('fmulz', a, ('fadd', b, c))),
   (('~ffma', a, b, ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),
   (('~ffma', a, b, ('fmul(is_used_once)', a, c)), ('fmul', a, ('fadd', b, c))),
   (('~fadd', ('fmul(is_used_once)', a, b), ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)),
   (('~ffma', a, ('fmul(is_used_once)', b, c), ('fmul(is_used_once)', b, d)), ('fmul', b, ('ffma', a, c, d))),
   (('~ffmaz', a, b, ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)),
   (('~ffmaz', a, b, ('fmulz(is_used_once)', a, c)), ('fmulz', a, ('fadd', b, c))),
   (('~fadd', ('fmulz(is_used_once)', a, b), ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)),
   (('~ffmaz', a, ('fmulz(is_used_once)', b, c), ('fmulz(is_used_once)', b, d)), ('fmulz', b, ('ffmaz', a, c, d))),
   (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
   (('iand', ('ior', a, b), ('ior', a, c)), ('ior', a, ('iand', b, c))),
   (('ior', ('iand', a, b), ('iand', a, c)), ('iand', a, ('ior', b, c))),
   (('~fadd', ('fneg', a), a), 0.0),
   (('iadd', ('ineg', a), a), 0),
   (('iadd', ('ineg', a), ('iadd', a, b)), b),
   (('iadd', a, ('iadd', ('ineg', a), b)), b),
   (('~fadd', ('fneg', a), ('fadd', a, b)), b),
   (('~fadd', a, ('fadd', ('fneg', a), b)), b),
   (('fadd', ('fsat', a), ('fsat', ('fneg', a))), ('fsat', ('fabs', a))),
   (('~fmul', a, 0.0), 0.0),
   # The only effect a*0.0 should have is when 'a' is infinity, -0.0 or NaN
   (('fmul', 'a@16', 0.0), 0.0, '!'+signed_zero_inf_nan_preserve_16),
   (('fmul', 'a@32', 0.0), 0.0, '!'+signed_zero_inf_nan_preserve_32),
   (('fmulz', a, 0.0), 0.0),
   (('fmulz', a, 'b(is_finite_not_zero)'), ('fmul', a, b), '!'+signed_zero_inf_nan_preserve_32),
   (('fmulz', 'a(is_finite)', 'b(is_finite)'), ('fmul', a, b)),
   (('fmulz', a, a), ('fmul', a, a)),
   (('ffmaz', a, 'b(is_finite_not_zero)', c), ('ffma', a, b, c), '!'+signed_zero_inf_nan_preserve_32),
   (('ffmaz', 'a(is_finite)', 'b(is_finite)', c), ('ffma', a, b, c)),
   (('ffmaz', a, a, b), ('ffma', a, a, b)),
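
   # The fmulz/ffmaz folds above are exact under the assumption that fmulz is
   # the legacy "zero times anything is zero" multiply (its result is 0.0
   # whenever either source is 0.0, even if the other source is Inf or NaN).
   # Under that assumption fmulz(a, 0.0) -> 0.0 needs no guard, while the
   # plain fmul fold has to be marked inexact or gated on the float-controls
   # checks.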
   (('imul', a, 0), 0),
   (('umul_unorm_4x8_vc4', a, 0), 0),
   (('umul_unorm_4x8_vc4', a, ~0), a),
   (('~fmul', a, 1.0), a),
   (('~fmulz', a, 1.0), a),
   # The only effect a*1.0 can have is flushing denormals. If it's only used by
   # a floating point instruction, it should flush any input denormals and
   # this multiplication isn't needed.
   (('fmul(is_only_used_as_float)', a, 1.0), a),
   (('imul', a, 1), a),
   (('fmul', a, -1.0), ('fneg', a)),
   (('imul', a, -1), ('ineg', a)),
   # If a < 0: fsign(a)*a*a => -1*a*a => -a*a => abs(a)*a
   # If a > 0: fsign(a)*a*a => 1*a*a => a*a => abs(a)*a
   # If a == 0: fsign(a)*a*a => 0*0*0 => abs(0)*0
   # If a != a: fsign(a)*a*a => 0*NaN*NaN => abs(NaN)*NaN
   (('fmul', ('fsign', a), ('fmul', a, a)), ('fmul', ('fabs', a), a)),
   (('fmul', ('fmul', ('fsign', a), a), a), ('fmul', ('fabs', a), a)),
   (('~ffma', 0.0, a, b), b),
   (('ffma@16(is_only_used_as_float)', 0.0, a, b), b, '!'+signed_zero_inf_nan_preserve_16),
   (('ffma@32(is_only_used_as_float)', 0.0, a, b), b, '!'+signed_zero_inf_nan_preserve_32),
   (('ffmaz', 0.0, a, b), ('fadd', 0.0, b)),
   (('~ffma', a, b, 0.0), ('fmul', a, b)),
   (('ffma@16', a, b, 0.0), ('fmul', a, b), '!'+signed_zero_inf_nan_preserve_16),
   (('ffma@32', a, b, 0.0), ('fmul', a, b), '!'+signed_zero_inf_nan_preserve_32),
   (('ffmaz', a, b, 0.0), ('fmulz', a, b), '!'+signed_zero_inf_nan_preserve_32),
   (('ffma', 1.0, a, b), ('fadd', a, b)),
   (('ffmaz', 1.0, a, b), ('fadd', a, b), '!'+signed_zero_inf_nan_preserve_32),
   (('ffma', -1.0, a, b), ('fadd', ('fneg', a), b)),
   (('ffmaz', -1.0, a, b), ('fadd', ('fneg', a), b), '!'+signed_zero_inf_nan_preserve_32),
   (('~ffma', '#a', '#b', c), ('fadd', ('fmul', a, b), c)),
   (('~ffmaz', '#a', '#b', c), ('fadd', ('fmulz', a, b), c)),
   (('~flrp', a, b, 0.0), a),
   (('~flrp', a, b, 1.0), b),
   (('~flrp', a, a, b), a),
   (('~flrp', 0.0, a, b), ('fmul', a, b)),

   # flrp(a, a + b, c) => a + flrp(0, b, c) => a + (b * c)
   (('~flrp', a, ('fadd(is_used_once)', a, b), c), ('fadd', ('fmul', b, c), a)),

   (('sdot_4x8_iadd', a, 0, b), b),
   (('udot_4x8_uadd', a, 0, b), b),
   (('sdot_4x8_iadd_sat', a, 0, b), b),
   (('udot_4x8_uadd_sat', a, 0, b), b),
   (('sdot_2x16_iadd', a, 0, b), b),
   (('udot_2x16_uadd', a, 0, b), b),
   (('sdot_2x16_iadd_sat', a, 0, b), b),
   (('udot_2x16_uadd_sat', a, 0, b), b),

   # sudot_4x8_iadd is not commutative at all, so the patterns must be
   # duplicated with zeros in each of the first two source positions.
   (('sudot_4x8_iadd', a, 0, b), b),
   (('sudot_4x8_iadd', 0, a, b), b),
   (('sudot_4x8_iadd_sat', a, 0, b), b),
   (('sudot_4x8_iadd_sat', 0, a, b), b),

   (('iadd', ('sdot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_4x8_iadd', a, b, ('iadd', c, d))),
   (('iadd', ('udot_4x8_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_4x8_uadd', a, b, ('iadd', c, d))),
   (('iadd', ('sudot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sudot_4x8_iadd', a, b, ('iadd', c, d))),
   (('iadd', ('sdot_2x16_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_2x16_iadd', a, b, ('iadd', c, d))),
   (('iadd', ('udot_2x16_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_2x16_uadd', a, b, ('iadd', c, d))),

   # Try to let constant folding eliminate the dot-product part. These are
   # safe because the dot product cannot overflow 32 bits.
   (('iadd', ('sdot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sdot_4x8_iadd', a, b, c)),
   (('iadd', ('udot_4x8_uadd', 'a(is_not_const)', b, 0), c), ('udot_4x8_uadd', a, b, c)),
   (('iadd', ('sudot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sudot_4x8_iadd', a, b, c)),
   (('iadd', ('sudot_4x8_iadd', a, 'b(is_not_const)', 0), c), ('sudot_4x8_iadd', a, b, c)),
   (('iadd', ('sdot_2x16_iadd', 'a(is_not_const)', b, 0), c), ('sdot_2x16_iadd', a, b, c)),
   (('iadd', ('udot_2x16_uadd', 'a(is_not_const)', b, 0), c), ('udot_2x16_uadd', a, b, c)),
   (('sdot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_4x8_iadd', a, b, 0), c)),
   (('udot_4x8_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_4x8_uadd', a, b, 0), c)),
   (('sudot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sudot_4x8_iadd', a, b, 0), c)),
   (('sdot_2x16_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_2x16_iadd', a, b, 0), c)),
   (('udot_2x16_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_2x16_uadd', a, b, 0), c)),
   (('sdot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
   (('udot_4x8_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_4x8_uadd', a, b, 0), c), '!options->lower_uadd_sat'),
   (('sudot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sudot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
   (('sdot_2x16_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_2x16_iadd', a, b, 0), c), '!options->lower_iadd_sat'),
   (('udot_2x16_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_2x16_uadd', a, b, 0), c), '!options->lower_uadd_sat'),

   # Optimize open-coded fmulz.
   # (b==0.0 ? 0.0 : a) * (a==0.0 ? 0.0 : b) -> fmulz(a, b)
   (('fmul@32', ('bcsel', ignore_exact('feq', b, 0.0), 0.0, a), ('bcsel', ignore_exact('feq', a, 0.0), 0.0, b)),
    ('fmulz', a, b), 'options->has_fmulz && !'+signed_zero_inf_nan_preserve_32),
   (('fmul@32', a, ('bcsel', ignore_exact('feq', a, 0.0), 0.0, '#b(is_not_const_zero)')),
    ('fmulz', a, b), 'options->has_fmulz && !'+signed_zero_inf_nan_preserve_32),

   # ffma(b==0.0 ? 0.0 : a, a==0.0 ? 0.0 : b, c) -> ffmaz(a, b, c)
   (('ffma@32', ('bcsel', ignore_exact('feq', b, 0.0), 0.0, a), ('bcsel', ignore_exact('feq', a, 0.0), 0.0, b), c),
    ('ffmaz', a, b, c), 'options->has_fmulz && !'+signed_zero_inf_nan_preserve_32),
   (('ffma@32', a, ('bcsel', ignore_exact('feq', a, 0.0), 0.0, '#b(is_not_const_zero)'), c),
    ('ffmaz', a, b, c), 'options->has_fmulz && !'+signed_zero_inf_nan_preserve_32),
]

# Shorthand for the expansion of just the dot product part of the [iu]dp4a
# instructions.
sdot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_i8', b, 0)),
                         ('imul', ('extract_i8', a, 1), ('extract_i8', b, 1))),
                ('iadd', ('imul', ('extract_i8', a, 2), ('extract_i8', b, 2)),
                         ('imul', ('extract_i8', a, 3), ('extract_i8', b, 3))))
udot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_u8', a, 0), ('extract_u8', b, 0)),
                         ('imul', ('extract_u8', a, 1), ('extract_u8', b, 1))),
                ('iadd', ('imul', ('extract_u8', a, 2), ('extract_u8', b, 2)),
                         ('imul', ('extract_u8', a, 3), ('extract_u8', b, 3))))
sudot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_u8', b, 0)),
                          ('imul', ('extract_i8', a, 1), ('extract_u8', b, 1))),
                 ('iadd', ('imul', ('extract_i8', a, 2), ('extract_u8', b, 2)),
                          ('imul', ('extract_i8', a, 3), ('extract_u8', b, 3))))
sdot_2x16_a_b = ('iadd', ('imul', ('extract_i16', a, 0), ('extract_i16', b, 0)),
                 ('imul', ('extract_i16', a, 1), ('extract_i16', b, 1)))
udot_2x16_a_b = ('iadd', ('imul', ('extract_u16', a, 0), ('extract_u16', b, 0)),
                 ('imul', ('extract_u16', a, 1), ('extract_u16', b, 1)))

optimizations.extend([
   (('sdot_4x8_iadd', a, b, c), ('iadd', sdot_4x8_a_b, c), '!options->has_sdot_4x8'),
   (('udot_4x8_uadd', a, b, c), ('iadd', udot_4x8_a_b, c), '!options->has_udot_4x8'),
   (('sudot_4x8_iadd', a, b, c), ('iadd', sudot_4x8_a_b, c), '!options->has_sudot_4x8'),
   (('sdot_2x16_iadd', a, b, c), ('iadd', sdot_2x16_a_b, c), '!options->has_dot_2x16'),
   (('udot_2x16_uadd', a, b, c), ('iadd', udot_2x16_a_b, c), '!options->has_dot_2x16'),

   # For the unsigned dot-product, the largest possible value is 4*(255*255) =
   # 0x3f804, so we don't have to worry about that intermediate result
   # overflowing. 0x100000000 - 0x3f804 = 0xfffc07fc. If c is a constant
   # that is less than 0xfffc07fc, then the result cannot overflow ever.
   (('udot_4x8_uadd_sat', a, b, '#c(is_ult_0xfffc07fc)'), ('udot_4x8_uadd', a, b, c)),
   (('udot_4x8_uadd_sat', a, b, c), ('uadd_sat', udot_4x8_a_b, c), '!options->has_udot_4x8'),

   # For the signed dot-product, the largest positive value is 4*(-128*-128) =
   # 0x10000, and the largest negative value is 4*(-128*127) = -0xfe00. We
   # don't have to worry about that intermediate result overflowing or
   # underflowing.
   (('sdot_4x8_iadd_sat', a, b, c), ('iadd_sat', sdot_4x8_a_b, c), '!options->has_sdot_4x8'),

   (('sudot_4x8_iadd_sat', a, b, c), ('iadd_sat', sudot_4x8_a_b, c), '!options->has_sudot_4x8'),

   (('udot_2x16_uadd_sat', a, b, c), ('uadd_sat', udot_2x16_a_b, c), '!options->has_dot_2x16'),
   (('sdot_2x16_iadd_sat', a, b, c), ('iadd_sat', sdot_2x16_a_b, c), '!options->has_dot_2x16'),
])

# Float sizes
for s in [16, 32, 64]:
   optimizations.extend([
      (('~flrp@{}'.format(s), a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),

      (('~flrp@{}'.format(s), a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp{}'.format(s)),
      (('~flrp@{}'.format(s), ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp{}'.format(s)),
      (('~flrp@{}'.format(s), a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp{}'.format(s)),

      (('~fadd@{}'.format(s), ('fmul', a, ('fadd', 1.0, ('fneg', c))), ('fmul', b, c)), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)),
      # These are the same as the previous three rules, but they depend on
      # 1-fsat(x) <=> fsat(1-x). See below.
      (('~fadd@{}'.format(s), ('fmul', a, ('fsat', ('fadd', 1.0, ('fneg', c)))), ('fmul', b, ('fsat', c))), ('flrp', a, b, ('fsat', c)), '!options->lower_flrp{}'.format(s)),
      (('~fadd@{}'.format(s), a, ('fmul', c, ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)),

      (('~fadd@{}'.format(s), ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1')))), ('fmul', b, ('b2f', c))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),
      (('~fadd@{}'.format(s), a, ('fmul', ('b2f', 'c@1'), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)),

      (('~ffma@{}'.format(s), a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1'))), ('fmul', b, ('b2f', 'c@1'))), ('bcsel', c, b, a)),
      (('~ffma@{}'.format(s), b, ('b2f', 'c@1'), ('ffma', ('fneg', a), ('b2f', 'c@1'), a)), ('bcsel', c, b, a)),

      # These two aren't flrp lowerings, but do appear in some shaders.
      (('~ffma@{}'.format(s), ('b2f', 'c@1'), ('fadd', b, ('fneg', a)), a), ('bcsel', c, b, a)),
      (('~ffma@{}'.format(s), ('b2f', 'c@1'), ('ffma', ('fneg', a), b, d), ('fmul', a, b)), ('bcsel', c, d, ('fmul', a, b))),

      # 1 - ((1 - a) * (1 - b))
      # 1 - (1 - a - b + a*b)
      # 1 - 1 + a + b - a*b
      # a + b - a*b
      # a + b*(1 - a)
      # b*(1 - a) + 1*a
      # flrp(b, 1, a)
      (('~fadd@{}'.format(s), 1.0, ('fneg', ('fmul', ('fadd', 1.0, ('fneg', a)), ('fadd', 1.0, ('fneg', b))))), ('flrp', b, 1.0, a), '!options->lower_flrp{}'.format(s)),
   ])

optimizations.extend([
   (('~flrp', ('fmul(is_used_once)', a, b), ('fmul(is_used_once)', a, c), d), ('fmul', ('flrp', b, c, d), a)),

   (('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)),
   (('ftrunc', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'),

   (('ffloor@16', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
   (('ffloor@32', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'),
   (('ffloor@64', a), ('fsub', a, ('ffract', a)), '(options->lower_ffloor || (options->lower_doubles_options & nir_lower_dfloor)) && !(options->lower_doubles_options & nir_lower_dfract)'),
   (('fadd@16', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
   (('fadd@32', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'),
   (('fadd@64', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor && !(options->lower_doubles_options & nir_lower_dfloor)'),
   (('ffract@16', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
   (('ffract@32', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
   (('ffract@64', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract || (options->lower_doubles_options & nir_lower_dfract)'),
   (('fceil', a), ('fneg', ('ffloor', ('fneg', a))), 'options->lower_fceil'),
   (('ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma16'),
   (('ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma32'),
   (('ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma64'),
   (('ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->lower_ffma32'),
   # Always lower inexact ffma, because it will be fused back by late
   # optimizations (nir_opt_algebraic_late).
   (('~ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma16'),
   (('~ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma32'),
   (('~ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma64'),
   (('~ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->fuse_ffma32'),

   (('~fmul', ('fadd', ('iand', ('ineg', ('b2i', 'a@bool')), ('fmul', b, c)), '#d'), '#e'),
    ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))),

   (('fdph', a, b), ('fdot4', ('vec4', 'a.x', 'a.y', 'a.z', 1.0), b), 'options->lower_fdph'),

   (('fdot4', ('vec4', a, b, c, 1.0), d), ('fdph', ('vec3', a, b, c), d), '!options->lower_fdph'),
   (('fdot4', ('vec4', a, 0.0, 0.0, 0.0), b), ('fmul', a, b)),
   (('fdot4', ('vec4', a, b, 0.0, 0.0), c), ('fdot2', ('vec2', a, b), c)),
   (('fdot4', ('vec4', a, b, c, 0.0), d), ('fdot3', ('vec3', a, b, c), d)),

   (('fdot3', ('vec3', a, 0.0, 0.0), b), ('fmul', a, b)),
   (('fdot3', ('vec3', a, b, 0.0), c), ('fdot2', ('vec2', a, b), c)),

   (('fdot2', ('vec2', a, 0.0), b), ('fmul', a, b)),
   (('fdot2', a, 1.0), ('fadd', 'a.x', 'a.y')),

   # Lower fdot to fsum when it is available
   (('fdot2', a, b), ('fsum2', ('fmul', a, b)), 'options->lower_fdot'),
   (('fdot3', a, b), ('fsum3', ('fmul', a, b)), 'options->lower_fdot'),
   (('fdot4', a, b), ('fsum4', ('fmul', a, b)), 'options->lower_fdot'),
   (('fsum2', a), ('fadd', 'a.x', 'a.y'), 'options->lower_fdot'),

   # If x >= 0 and x <= 1: fsat(1 - x) == 1 - fsat(x) trivially
   # If x < 0: 1 - fsat(x) => 1 - 0 => 1 and fsat(1 - x) => fsat(> 1) => 1
   # If x > 1: 1 - fsat(x) => 1 - 1 => 0 and fsat(1 - x) => fsat(< 0) => 0
   (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))),

   # (a * #b + #c) << #d
   # ((a * #b) << #d) + (#c << #d)
   # (a * (#b << #d)) + (#c << #d)
   (('ishl', ('iadd', ('imul', a, '#b'), '#c'), '#d'),
    ('iadd', ('imul', a, ('ishl', b, d)), ('ishl', c, d))),

   # (a * #b) << #c
   # a * (#b << #c)
   (('ishl', ('imul', a, '#b'), '#c'), ('imul', a, ('ishl', b, c))),
])

# Care must be taken here. Shifts in NIR use only the lower log2(bitsize)
# bits of the second source. These replacements must correctly handle the
# case where (b % bitsize) + (c % bitsize) >= bitsize.
for s in [8, 16, 32, 64]:
   mask = s - 1

   ishl = "ishl@{}".format(s)
   ishr = "ishr@{}".format(s)
   ushr = "ushr@{}".format(s)

   in_bounds = ('ult', ('iadd', ('iand', b, mask), ('iand', c, mask)), s)

   optimizations.extend([
      ((ishl, (ishl, a, '#b'), '#c'), ('bcsel', in_bounds, (ishl, a, ('iadd', b, c)), 0)),
      ((ushr, (ushr, a, '#b'), '#c'), ('bcsel', in_bounds, (ushr, a, ('iadd', b, c)), 0)),

      # To get -1 for large shifts of negative values, ishr must instead
      # clamp the shift count to the maximum value.
      ((ishr, (ishr, a, '#b'), '#c'),
       (ishr, a, ('imin', ('iadd', ('iand', b, mask), ('iand', c, mask)), s - 1))),
   ])
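
# Worked example (illustrative only): for 32-bit ishl, folding
# (a << 20) << 20 to a << 40 would be wrong because only the low 5 bits of
# the shift count are used (40 & 31 == 8), which would yield a << 8 instead
# of the correct result 0. With the rules above, in_bounds evaluates to
# (20 + 20) < 32 == False, so the bcsel produces 0. The ishr variant clamps
# the combined count to bitsize - 1 so large arithmetic shifts still produce
# the sign-extended 0 or -1.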

# Optimize a pattern of address calculation created by DXVK where the offset
# is divided by 4 and then multiplied by 4. This can be turned into an iand
# and the additions before can be reassociated to CSE the iand instruction.

for size, mask in ((8, 0xff), (16, 0xffff), (32, 0xffffffff), (64, 0xffffffffffffffff)):
   a_sz = 'a@{}'.format(size)

   optimizations.extend([
      # 'a >> #b << #b' -> 'a & ~((1 << #b) - 1)'
      (('ishl', ('ushr', a_sz, '#b'), b), ('iand', a, ('ishl', mask, b))),
      (('ishl', ('ishr', a_sz, '#b'), b), ('iand', a, ('ishl', mask, b))),

      # This does not trivially work with ishr.
      (('ushr', ('ishl', a_sz, '#b'), b), ('iand', a, ('ushr', mask, b))),
   ])

optimizations.extend([
   (('iand', ('ishl', 'a@32', '#b(is_first_5_bits_uge_2)'), -4), ('ishl', a, b)),
   (('iand', ('imul', a, '#b(is_unsigned_multiple_of_4)'), -4), ('imul', a, b)),
])

for log2 in range(1, 7): # powers of two from 2 to 64
   v = 1 << log2
   mask = 0xffffffff & ~(v - 1)
   b_is_multiple = '#b(is_unsigned_multiple_of_{})'.format(v)

   optimizations.extend([
      # Reassociate for improved CSE
      (('iand@32', ('iadd@32', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)),
   ])

# To save space in the state tables, reduce to the set that is known to help.
# Previously, this was range(1, 32). In addition, a couple rules inside the
# loop are commented out. Revisit someday, probably after mesa/#2635 has some
# resolution.
for i in [1, 2, 16, 24]:
   lo_mask = 0xffffffff >> i
   hi_mask = (0xffffffff << i) & 0xffffffff

   optimizations.extend([
      # This pattern seems to only help in the soft-fp64 code.
      (('ishl@32', ('iand', 'a@32', lo_mask), i), ('ishl', a, i)),
#     (('ushr@32', ('iand', 'a@32', hi_mask), i), ('ushr', a, i)),
#     (('ishr@32', ('iand', 'a@32', hi_mask), i), ('ishr', a, i)),

      (('iand', ('ishl', 'a@32', i), hi_mask), ('ishl', a, i)),
      (('iand', ('ushr', 'a@32', i), lo_mask), ('ushr', a, i)),
#     (('iand', ('ishr', 'a@32', i), lo_mask), ('ushr', a, i)), # Yes, ushr is correct
   ])
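
# Illustration (not an additional rule): with i == 16, lo_mask is 0x0000ffff
# and hi_mask is 0xffff0000, so for example ishl(iand(a, 0x0000ffff), 16)
# matches the first pattern and becomes ishl(a, 16). This is safe because the
# bits cleared by the mask are exactly the bits the shift discards.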

optimizations.extend([
   # This is common for address calculations. Reassociating may enable the
   # 'a<<c' to be CSE'd. It also helps architectures that have an ISHLADD
   # instruction or a constant offset field in load / store instructions.
   (('ishl', ('iadd', a, '#b'), '#c'), ('iadd', ('ishl', a, c), ('ishl', b, c))),

   # (a + #b) * #c => (a * #c) + (#b * #c)
   (('imul', ('iadd(is_used_once)', a, '#b'), '#c'), ('iadd', ('imul', a, c), ('imul', b, c))),

   # ((a + #b) + c) * #d => ((a + c) * #d) + (#b * #d)
   (('imul', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'),
    ('iadd', ('imul', ('iadd', a, c), d), ('imul', b, d))),
   (('ishl', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'),
    ('iadd', ('ishl', ('iadd', a, c), d), ('ishl', b, d))),

   # Comparison simplifications
   (('inot', ('flt(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('fge', a, b)),
   (('inot', ('fge(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('flt', a, b)),
   (('inot', ('feq(is_used_once)', a, b)), ('fneu', a, b)),
   (('inot', ('fneu(is_used_once)', a, b)), ('feq', a, b)),
   (('inot', ('ilt(is_used_once)', a, b)), ('ige', a, b)),
   (('inot', ('ult(is_used_once)', a, b)), ('uge', a, b)),
   (('inot', ('ige(is_used_once)', a, b)), ('ilt', a, b)),
   (('inot', ('uge(is_used_once)', a, b)), ('ult', a, b)),
   (('inot', ('ieq(is_used_once)', a, b)), ('ine', a, b)),
   (('inot', ('ine(is_used_once)', a, b)), ('ieq', a, b)),

   (('iand', ('feq', a, b), ('fneu', a, b)), False),
   (('iand', ('flt', a, b), ('flt', b, a)), False),
   (('iand', ('ieq', a, b), ('ine', a, b)), False),
   (('iand', ('ilt', a, b), ('ilt', b, a)), False),
   (('iand', ('ult', a, b), ('ult', b, a)), False),

   # This helps some shaders because, after some optimizations, they end up
   # with patterns like (-a < -b) || (b < a). In an ideal world, this sort of
   # matching would be handled by CSE.
   (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)),
   (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)),
   (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)),
   (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)),
   (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)),
   (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)),
   (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)),
   (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)),
   (('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)),
   (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)),

   # b < fsat(NaN) -> b < 0 -> false, and b < NaN -> false.
   (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)),

   # fsat(NaN) >= b -> 0 >= b -> false, and NaN >= b -> false.
   (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)),

   # b == fsat(NaN) -> b == 0 -> false, and b == NaN -> false.
   (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)),

   # b != fsat(NaN) -> b != 0 -> true, and b != NaN -> true.
   (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)),

   # fsat(NaN) >= 1 -> 0 >= 1 -> false, and NaN >= 1 -> false.
   (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)),

   # 0 < fsat(NaN) -> 0 < 0 -> false, and 0 < NaN -> false.
   (('flt', 0.0, ('fsat(is_used_once)', a)), ('flt', 0.0, a)),

   # 0.0 >= b2f(a)
   # b2f(a) <= 0.0
   # b2f(a) == 0.0 because b2f(a) can only be 0 or 1
   # inot(a)
   (('fge', 0.0, ('b2f', 'a@1')), ('inot', a)),

   (('fge', ('fneg', ('b2f', 'a@1')), 0.0), ('inot', a)),

   (('fneu', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)),
   (('fneu', ('bcsel', a, 1.0, ('b2f', 'b@1')), 0.0), ('ior', a, b)),
   (('fneu', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), ('ior', a, b)),
   (('fneu', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)),
   (('fneu', ('bcsel', a, ('b2f', 'b@1'), 0.0), 0.0), ('iand', a, b)),
   (('fneu', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ixor', a, b)),
   (('fneu', ('b2f', 'a@1'), ('b2f', 'b@1')), ('ixor', a, b)),
   (('fneu', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('ixor', a, b)),
   (('feq', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))),
   (('feq', ('bcsel', a, 1.0, ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))),
   (('feq', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), ('inot', ('ior', a, b))),
   (('feq', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('iand', a, b))),
   (('feq', ('bcsel', a, ('b2f', 'b@1'), 0.0), 0.0), ('inot', ('iand', a, b))),
   (('feq', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ieq', a, b)),
   (('feq', ('b2f', 'a@1'), ('b2f', 'b@1')), ('ieq', a, b)),
   (('feq', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('ieq', a, b)),

   # -(b2f(a) + b2f(b)) < 0
   # 0 < b2f(a) + b2f(b)
   # 0 != b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
   # a || b
   (('flt', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('ior', a, b)),
   (('flt', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('ior', a, b)),

   # -(b2f(a) + b2f(b)) >= 0
   # 0 >= b2f(a) + b2f(b)
   # 0 == b2f(a) + b2f(b)       b2f must be 0 or 1, so the sum is non-negative
   # !(a || b)
   (('fge', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('inot', ('ior', a, b))),
   (('fge', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('inot', ('ior', a, b))),

   (('flt', a, ('fneg', a)), ('flt', a, 0.0)),
   (('fge', a, ('fneg', a)), ('fge', a, 0.0)),

   # Some optimizations (below) convert things like (a < b || c < b) into
   # (min(a, c) < b). However, this interferes with the previous optimizations
   # that try to remove comparisons with negated sums of b2f. This just
   # breaks that apart.
   (('flt', ('fmin', c, ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')))), 0.0),
    ('ior', ('flt', c, 0.0), ('ior', a, b))),

   (('~flt', ('fadd', a, b), a), ('flt', b, 0.0)),
   (('~fge', ('fadd', a, b), a), ('fge', b, 0.0)),
   (('~feq', ('fadd', a, b), a), ('feq', b, 0.0)),
   (('~fneu', ('fadd', a, b), a), ('fneu', b, 0.0)),
   (('~flt', ('fadd(is_used_once)', a, '#b'), '#c'), ('flt', a, ('fadd', c, ('fneg', b)))),
   (('~flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('flt', ('fneg', ('fadd', c, b)), a)),
   (('~fge', ('fadd(is_used_once)', a, '#b'), '#c'), ('fge', a, ('fadd', c, ('fneg', b)))),
   (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fge', ('fneg', ('fadd', c, b)), a)),
   (('~feq', ('fadd(is_used_once)', a, '#b'), '#c'), ('feq', a, ('fadd', c, ('fneg', b)))),
   (('~feq', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('feq', ('fneg', ('fadd', c, b)), a)),
   (('~fneu', ('fadd(is_used_once)', a, '#b'), '#c'), ('fneu', a, ('fadd', c, ('fneg', b)))),
   (('~fneu', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fneu', ('fneg', ('fadd', c, b)), a)),

   # Cannot remove the addition from ilt or ige due to overflow.
   (('ieq', ('iadd', a, b), a), ('ieq', b, 0)),
   (('ine', ('iadd', a, b), a), ('ine', b, 0)),

   (('feq', ('b2f', 'a@1'), 0.0), ('inot', a)),
   (('fneu', ('b2f', 'a@1'), 0.0), a),
   (('ieq', ('b2i', 'a@1'), 0), ('inot', a)),
   (('ine', ('b2i', 'a@1'), 0), a),

   (('fneu', ('u2f', a), 0.0), ('ine', a, 0)),
   (('feq', ('u2f', a), 0.0), ('ieq', a, 0)),
   (('fge', ('u2f', a), 0.0), True),
   (('fge', 0.0, ('u2f', a)), ('uge', 0, a)), # ieq instead?
   (('flt', ('u2f', a), 0.0), False),
   (('flt', 0.0, ('u2f', a)), ('ult', 0, a)), # ine instead?
   (('fneu', ('i2f', a), 0.0), ('ine', a, 0)),
   (('feq', ('i2f', a), 0.0), ('ieq', a, 0)),
   (('fge', ('i2f', a), 0.0), ('ige', a, 0)),
   (('fge', 0.0, ('i2f', a)), ('ige', 0, a)),
   (('flt', ('i2f', a), 0.0), ('ilt', a, 0)),
   (('flt', 0.0, ('i2f', a)), ('ilt', 0, a)),

   # 0.0 < fabs(a)
   # fabs(a) > 0.0
   # fabs(a) != 0.0 because fabs(a) must be >= 0
   # a != 0.0
   (('~flt', 0.0, ('fabs', a)), ('fneu', a, 0.0)),

   # -fabs(a) < 0.0
   # fabs(a) > 0.0
   (('~flt', ('fneg', ('fabs', a)), 0.0), ('fneu', a, 0.0)),

   # 0.0 >= fabs(a)
   # 0.0 == fabs(a) because fabs(a) must be >= 0
   # 0.0 == a
   (('fge', 0.0, ('fabs', a)), ('feq', a, 0.0)),

   # -fabs(a) >= 0.0
   # 0.0 >= fabs(a)
   (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),

   # (a >= 0.0) && (a <= 1.0) -> fsat(a) == a
   #
   # This should be NaN safe.
   #
   # NaN >= 0 && 1 >= NaN -> false && false -> false
   #
   # vs.
670 # 671 # NaN == fsat(NaN) -> NaN == 0 -> false 672 (('iand', ('fge', a, 0.0), ('fge', 1.0, a)), ('feq', a, ('fsat', a)), '!options->lower_fsat'), 673 674 # Note: fmin(-a, -b) == -fmax(a, b) 675 (('fmax', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('ior', a, b))), 676 (('fmax', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('iand', a, b)))), 677 (('fmin', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))), 678 (('fmin', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('ior', a, b)))), 679 680 # fmin(b2f(a), b) 681 # bcsel(a, fmin(b2f(a), b), fmin(b2f(a), b)) 682 # bcsel(a, fmin(b2f(True), b), fmin(b2f(False), b)) 683 # bcsel(a, fmin(1.0, b), fmin(0.0, b)) 684 # 685 # Since b is a constant, constant folding will eliminate the fmin and the 686 # fmax. If b is > 1.0, the bcsel will be replaced with a b2f. 687 (('fmin', ('b2f', 'a@1'), '#b'), ('bcsel', a, ('fmin', b, 1.0), ('fmin', b, 0.0))), 688 689 (('flt', ('fadd(is_used_once)', a, ('fneg', b)), 0.0), ('flt', a, b)), 690 691 (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)), 692 (('~bcsel', ('flt', b, a), b, a), ('fmin', a, b)), 693 (('~bcsel', ('flt', a, b), b, a), ('fmax', a, b)), 694 (('~bcsel', ('fge', a, b), b, a), ('fmin', a, b)), 695 (('~bcsel', ('fge', b, a), b, a), ('fmax', a, b)), 696 (('bcsel', ('i2b', a), b, c), ('bcsel', ('ine', a, 0), b, c)), 697 (('bcsel', ('inot', a), b, c), ('bcsel', a, c, b)), 698 (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)), 699 (('bcsel', a, b, ('bcsel', a, c, d)), ('bcsel', a, b, d)), 700 (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))), 701 (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))), 702 (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)), 703 (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)), 704 (('bcsel', a, True, b), ('ior', a, b)), 705 (('bcsel', a, a, b), ('ior', a, b)), 706 (('bcsel', a, b, False), ('iand', a, b)), 707 (('bcsel', a, b, a), ('iand', a, b)), 708 (('~fmin', a, a), a), 709 (('~fmax', a, a), a), 710 (('imin', a, a), a), 711 (('imax', a, a), a), 712 (('umin', a, a), a), 713 (('umin', a, 0), 0), 714 (('umin', a, -1), a), 715 (('umax', a, a), a), 716 (('umax', a, 0), a), 717 (('umax', a, -1), -1), 718 (('fmax', ('fmax', a, b), b), ('fmax', a, b)), 719 (('umax', ('umax', a, b), b), ('umax', a, b)), 720 (('imax', ('imax', a, b), b), ('imax', a, b)), 721 (('fmin', ('fmin', a, b), b), ('fmin', a, b)), 722 (('umin', ('umin', a, b), b), ('umin', a, b)), 723 (('imin', ('imin', a, b), b), ('imin', a, b)), 724 (('fmax', ('fmax', ('fmax', a, b), c), a), ('fmax', ('fmax', a, b), c)), 725 (('umax', ('umax', ('umax', a, b), c), a), ('umax', ('umax', a, b), c)), 726 (('imax', ('imax', ('imax', a, b), c), a), ('imax', ('imax', a, b), c)), 727 (('fmin', ('fmin', ('fmin', a, b), c), a), ('fmin', ('fmin', a, b), c)), 728 (('umin', ('umin', ('umin', a, b), c), a), ('umin', ('umin', a, b), c)), 729 (('imin', ('imin', ('imin', a, b), c), a), ('imin', ('imin', a, b), c)), 730]) 731 732for N in [8, 16, 32, 64]: 733 b2iN = 'b2i{0}'.format(N) 734 optimizations.extend([ 735 (('ieq', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ieq', a, b)), 736 (('ine', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ine', a, b)), 737 ]) 738 739for N in [16, 32, 

for N in [16, 32, 64]:
   b2fN = 'b2f{0}'.format(N)
   optimizations.extend([
      (('feq', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ieq', a, b)),
      (('fneu', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ine', a, b)),
   ])

# Integer sizes
for s in [8, 16, 32, 64]:
   optimizations.extend([
      (('iand@{}'.format(s), a, ('inot', ('ishr', a, s - 1))), ('imax', a, 0)),

      # Simplify logic to detect sign of an integer.
      (('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0), ('ige', a, 0)),
      (('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ige', a, 0)),
      (('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0), ('ilt', a, 0)),
      (('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ilt', a, 0)),
      (('ine', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)),
      (('ieq', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)),
      (('ieq', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ilt', a, 0)),
      (('ine', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ige', a, 0)),
      (('ine', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)),
      (('ieq', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)),
      (('ieq', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ilt', a, 0)),
      (('ine', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ige', a, 0)),
   ])

optimizations.extend([
   (('fmin', a, ('fneg', a)), ('fneg', ('fabs', a))),
   (('imin', a, ('ineg', a)), ('ineg', ('iabs', a))),
   (('fmin', a, ('fneg', ('fabs', a))), ('fneg', ('fabs', a))),
   (('imin', a, ('ineg', ('iabs', a))), ('ineg', ('iabs', a))),
   (('~fmin', a, ('fabs', a)), a),
   (('imin', a, ('iabs', a)), a),
   (('~fmax', a, ('fneg', ('fabs', a))), a),
   (('imax', a, ('ineg', ('iabs', a))), a),
   (('fmax', a, ('fabs', a)), ('fabs', a)),
   (('imax', a, ('iabs', a)), ('iabs', a)),
   (('fmax', a, ('fneg', a)), ('fabs', a)),
   (('imax', a, ('ineg', a)), ('iabs', a), '!options->lower_iabs'),
   (('~fmax', ('fabs', a), 0.0), ('fabs', a)),
   (('fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
   # fmax(fmin(a, 1.0), 0.0) is inexact because it returns 1.0 on NaN, while
   # fsat(a) returns 0.0.
   (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
   # fmin(fmax(a, -1.0), 0.0) is inexact because it returns -1.0 on NaN, while
   # fneg(fsat(fneg(a))) returns -0.0 on NaN.
   (('~fmin', ('fmax', a, -1.0), 0.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
   # fmax(fmin(a, 0.0), -1.0) is inexact because it returns 0.0 on NaN, while
   # fneg(fsat(fneg(a))) returns -0.0 on NaN. This only matters if
   # SignedZeroInfNanPreserve is set, but we don't currently have any way of
   # representing this in the optimizations other than the usual ~.
   (('~fmax', ('fmin', a, 0.0), -1.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'),
   # fsat(fsign(NaN)) = fsat(0) = 0, and b2f(0 < NaN) = b2f(False) = 0. Mark
   # the new comparison precise to prevent it being changed to 'a != 0'.
   (('fsat', ('fsign', a)), ('b2f', ('!flt', 0.0, a))),
   (('fsat', ('b2f', a)), ('b2f', a)),
   (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
   (('fsat', ('fsat', a)), ('fsat', a)),
   (('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_fsat'),
   (('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_fsat'),
   (('fsat', ('fneg(is_used_once)', ('fmulz(is_used_once)', a, b))), ('fsat', ('fmulz', ('fneg', a), b)), '!options->lower_fsat && !'+signed_zero_inf_nan_preserve_32),
   (('fsat', ('fabs(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fabs', a), ('fabs', b))), '!options->lower_fsat'),
   (('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)),
   (('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)),
   (('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)),
   # Both the left and right patterns are "b" when isnan(a), so this is exact.
   (('fmax', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmax', a, b))),
   # The left pattern is 0.0 when isnan(a) (because fmin(fsat(NaN), b) ->
   # fmin(0.0, b)) while the right one is "b", so this optimization is inexact.
   (('~fmin', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmin', a, b))),

   # max(-min(b, a), b) -> max(abs(b), -a)
   # min(-max(b, a), b) -> min(-abs(b), -a)
   (('fmax', ('fneg', ('fmin', b, a)), b), ('fmax', ('fabs', b), ('fneg', a))),
   (('fmin', ('fneg', ('fmax', b, a)), b), ('fmin', ('fneg', ('fabs', b)), ('fneg', a))),

   # If a in [0,b] then b-a is also in [0,b]. Since b in [0,1], max(b-a, 0) =
   # fsat(b-a).
   #
   # If a > b, then b-a < 0 and max(b-a, 0) = fsat(b-a) = 0
   #
   # This should be NaN safe since max(NaN, 0) = fsat(NaN) = 0.
   (('fmax', ('fadd(is_used_once)', ('fneg', 'a(is_not_negative)'), '#b(is_zero_to_one)'), 0.0),
    ('fsat', ('fadd', ('fneg', a), b)), '!options->lower_fsat'),

   (('extract_u8', ('imin', ('imax', a, 0), 0xff), 0), ('imin', ('imax', a, 0), 0xff)),

   # The ior versions are exact because fmin and fmax will always pick a
   # non-NaN value, if one exists. Therefore (a < NaN) || (a < c) == a <
   # fmax(NaN, c) == a < c. Mark the fmin or fmax in the replacement as exact
   # to prevent other optimizations from ruining the "NaN cleansing" property
   # of the fmin or fmax.
   (('ior', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('!fmax', b, c))),
   (('ior', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('!fmin', a, b), c)),
   (('ior', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('!fmin', b, c))),
   (('ior', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('!fmax', a, b), c)),
   (('ior', ('flt', a, '#b'), ('flt', a, '#c')), ('flt', a, ('!fmax', b, c))),
   (('ior', ('flt', '#a', c), ('flt', '#b', c)), ('flt', ('!fmin', a, b), c)),
   (('ior', ('fge', a, '#b'), ('fge', a, '#c')), ('fge', a, ('!fmin', b, c))),
   (('ior', ('fge', '#a', c), ('fge', '#b', c)), ('fge', ('!fmax', a, b), c)),
   (('~iand', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('fmin', b, c))),
   (('~iand', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('fmax', a, b), c)),
   (('~iand', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('fmax', b, c))),
   (('~iand', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('fmin', a, b), c)),
   (('iand', ('flt', a, '#b(is_a_number)'), ('flt', a, '#c(is_a_number)')), ('flt', a, ('fmin', b, c))),
   (('iand', ('flt', '#a(is_a_number)', c), ('flt', '#b(is_a_number)', c)), ('flt', ('fmax', a, b), c)),
   (('iand', ('fge', a, '#b(is_a_number)'), ('fge', a, '#c(is_a_number)')), ('fge', a, ('fmax', b, c))),
   (('iand', ('fge', '#a(is_a_number)', c), ('fge', '#b(is_a_number)', c)), ('fge', ('fmin', a, b), c)),

   (('ior', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imax', b, c))),
   (('ior', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imin', a, b), c)),
   (('ior', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imin', b, c))),
   (('ior', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imax', a, b), c)),
   (('ior', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umax', b, c))),
   (('ior', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umin', a, b), c)),
   (('ior', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umin', b, c))),
   (('ior', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umax', a, b), c)),
   (('iand', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imin', b, c))),
   (('iand', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imax', a, b), c)),
   (('iand', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imax', b, c))),
   (('iand', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imin', a, b), c)),
   (('iand', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umin', b, c))),
   (('iand', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umax', a, b), c)),
   (('iand', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umax', b, c))),
   (('iand', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umin', a, b), c)),

   # A number of shaders contain a pattern like a.x < 0.0 || a.x > 1.0 ||
   # a.y < 0.0 || a.y > 1.0 || ...  These patterns rearrange and replace in a
   # single step. Doing just the replacement can lead to an infinite loop as
   # the pattern is repeatedly applied to the result of the previous
   # application of the pattern.
   (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, c), d), ('flt', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)),
   (('ior', ('ior(is_used_once)', ('flt', a, c), d), ('flt(is_used_once)', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)),
   (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, b), d), ('flt', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)),
   (('ior', ('ior(is_used_once)', ('flt', a, b), d), ('flt(is_used_once)', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)),

   # This is how SpvOpFOrdNotEqual might be implemented. If both values are
   # numbers, then it can be replaced with fneu.
   (('ior', ('flt', 'a(is_a_number)', 'b(is_a_number)'), ('flt', b, a)), ('fneu', a, b)),
])

# Float sizes
for s in [16, 32, 64]:
   optimizations.extend([
      # These derive from the previous patterns with the application of b < 0 <=>
      # 0 < -b. The transformation should be applied if either comparison is
      # used once as this ensures that the number of comparisons will not
      # increase. The sources to the ior and iand are not symmetric, so the
      # rules have to be duplicated to get this behavior.
      (('ior', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
      (('ior', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))),
      (('ior', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
      (('ior', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))),
      (('~iand', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
      (('~iand', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))),
      (('~iand', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),
      (('~iand', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))),

      # The (i2f32, ...) part is an open-coded fsign. When that is combined
      # with the bcsel, it's basically copysign(1.0, a). There are some
      # behavior differences between this pattern and copysign w.r.t. ±0 and
      # NaN. copysign(x, y) blindly takes the sign bit from y and applies it
      # to x, regardless of whether either or both values are NaN.
      #
      # If a != a: bcsel(False, 1.0, i2f(b2i(False) - b2i(False))) = 0,
      #            int(NaN >= 0.0) - int(NaN < 0.0) = 0 - 0 = 0
      # If a == ±0: bcsel(True, 1.0, ...) = 1.0,
      #             int(±0.0 >= 0.0) - int(±0.0 < 0.0) = 1 - 0 = 1
      #
      # For all other values of 'a', the original and replacement behave as
      # copysign.
      #
      # Marking the replacement comparisons as precise prevents any future
      # optimizations from replacing either of the comparisons with the
      # logical-not of the other.
      #
      # Note: Use b2i32 in the replacement because some platforms that
      # support fp16 don't support int16.
      (('bcsel@{}'.format(s), ('feq', a, 0.0), 1.0, ('i2f{}'.format(s), ('iadd', ('b2i{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))))),
       ('i2f{}'.format(s), ('iadd', ('b2i32', ('!fge', a, 0.0)), ('ineg', ('b2i32', ('!flt', a, 0.0)))))),

      (('bcsel', a, ('b2f(is_used_once)', 'b@{}'.format(s)), ('b2f', 'c@{}'.format(s))), ('b2f', ('bcsel', a, b, c))),

      # The C spec says, "If the value of the integral part cannot be represented
      # by the integer type, the behavior is undefined." "Undefined" can mean
      # "the conversion doesn't happen at all."
      (('~i2f{}'.format(s), ('f2i', 'a@{}'.format(s))), ('ftrunc', a)),

      # Ironically, mark these as imprecise because removing the conversions may
      # preserve more precision than doing the conversions (e.g.,
      # uint(float(0x81818181u)) == 0x81818200).
      (('~f2i{}'.format(s), ('i2f', 'a@{}'.format(s))), a),
      (('~f2i{}'.format(s), ('u2f', 'a@{}'.format(s))), a),
      (('~f2u{}'.format(s), ('i2f', 'a@{}'.format(s))), a),
      (('~f2u{}'.format(s), ('u2f', 'a@{}'.format(s))), a),

      (('fadd', ('b2f{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('fneg', ('b2f{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))), ('fsign', a), '!options->lower_fsign'),
      (('iadd', ('b2i{}'.format(s), ('flt', 0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0)))), ('f2i{}'.format(s), ('fsign', a)), '!options->lower_fsign'),

      # float? -> float? -> floatS ==> float? -> floatS
      (('~f2f{}'.format(s), ('f2f', a)), ('f2f{}'.format(s), a)),

      # int? -> float? -> floatS ==> int? -> floatS
      (('~f2f{}'.format(s), ('u2f', a)), ('u2f{}'.format(s), a)),
      (('~f2f{}'.format(s), ('i2f', a)), ('i2f{}'.format(s), a)),

      # float? -> float? -> intS ==> float? -> intS
      (('~f2u{}'.format(s), ('f2f', a)), ('f2u{}'.format(s), a)),
      (('~f2i{}'.format(s), ('f2f', a)), ('f2i{}'.format(s), a)),
   ])

   for B in [32, 64]:
      if s < B:
         optimizations.extend([
            # S = smaller, B = bigger
            # typeS -> typeB -> typeS ==> identity
            (('f2f{}'.format(s), ('f2f{}'.format(B), 'a@{}'.format(s))), a),
            (('i2i{}'.format(s), ('i2i{}'.format(B), 'a@{}'.format(s))), a),
            (('u2u{}'.format(s), ('u2u{}'.format(B), 'a@{}'.format(s))), a),

            # bool1 -> typeB -> typeS ==> bool1 -> typeS
            (('f2f{}'.format(s), ('b2f{}'.format(B), 'a@1')), ('b2f{}'.format(s), a)),
            (('i2i{}'.format(s), ('b2i{}'.format(B), 'a@1')), ('b2i{}'.format(s), a)),
            (('u2u{}'.format(s), ('b2i{}'.format(B), 'a@1')), ('b2i{}'.format(s), a)),

            # floatS -> floatB -> intB ==> floatS -> intB
            (('f2u{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2u{}'.format(B), a)),
            (('f2i{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2i{}'.format(B), a)),
            # int? -> floatB -> floatS ==> int? -> floatS
            (('f2f{}'.format(s), ('u2f{}'.format(B), a)), ('u2f{}'.format(s), a)),
            (('f2f{}'.format(s), ('i2f{}'.format(B), a)), ('i2f{}'.format(s), a)),

            # intS -> intB -> floatB ==> intS -> floatB
            (('u2f{}'.format(B), ('u2u{}'.format(B), 'a@{}'.format(s))), ('u2f{}'.format(B), a)),
            (('i2f{}'.format(B), ('i2i{}'.format(B), 'a@{}'.format(s))), ('i2f{}'.format(B), a)),
         ])

# mediump variants of the above
optimizations.extend([
   # int32 -> float32 -> float16 ==> int32 -> float16
   (('f2fmp', ('u2f32', 'a@32')), ('u2fmp', a)),
   (('f2fmp', ('i2f32', 'a@32')), ('i2fmp', a)),

   # float32 -> float16 -> int16 ==> float32 -> int16
   (('f2u16', ('f2fmp', 'a@32')), ('f2u16', a)),
   (('f2i16', ('f2fmp', 'a@32')), ('f2i16', a)),

   # float32 -> int32 -> int16 ==> float32 -> int16
   (('i2imp', ('f2u32', 'a@32')), ('f2ump', a)),
   (('i2imp', ('f2i32', 'a@32')), ('f2imp', a)),

   # int32 -> int16 -> float16 ==> int32 -> float16
   (('u2f16', ('i2imp', 'a@32')), ('u2f16', a)),
   (('i2f16', ('i2imp', 'a@32')), ('i2f16', a)),
])

# Clean up junk left from 8-bit integer to 16-bit integer lowering.
optimizations.extend([
   # The u2u16(u2u8(X)) just masks off the upper 8 bits of X. This can be
   # accomplished by masking the upper 8 bits of the immediate operand of the
   # iand instruction. Oftentimes, both patterns will end up being applied
   # to the same original expression tree.
   (('iand', ('u2u16', ('u2u8', 'a@16')), '#b'), ('iand', a, ('iand', b, 0xff))),
   (('u2u16', ('u2u8(is_used_once)', ('iand', 'a@16', '#b'))), ('iand', a, ('iand', b, 0xff))),
])

for op in ['iand', 'ior', 'ixor']:
   optimizations.extend([
      (('u2u8', (op, ('u2u16', ('u2u8', 'a@16')), ('u2u16', ('u2u8', 'b@16')))), ('u2u8', (op, a, b))),
      (('u2u8', (op, ('u2u16', ('u2u8', 'a@32')), ('u2u16', ('u2u8', 'b@32')))), ('u2u8', (op, a, b))),

      # Undistribute extract from a logic op
      ((op, ('extract_i8', a, '#b'), ('extract_i8', c, b)), ('extract_i8', (op, a, c), b)),
      ((op, ('extract_u8', a, '#b'), ('extract_u8', c, b)), ('extract_u8', (op, a, c), b)),
      ((op, ('extract_i16', a, '#b'), ('extract_i16', c, b)), ('extract_i16', (op, a, c), b)),
      ((op, ('extract_u16', a, '#b'), ('extract_u16', c, b)), ('extract_u16', (op, a, c), b)),

      # Undistribute shifts from a logic op
      ((op, ('ushr(is_used_once)', a, '#b'), ('ushr', c, b)), ('ushr', (op, a, c), b)),
      ((op, ('ishr(is_used_once)', a, '#b'), ('ishr', c, b)), ('ishr', (op, a, c), b)),
      ((op, ('ishl(is_used_once)', a, '#b'), ('ishl', c, b)), ('ishl', (op, a, c), b)),
   ])

# Integer sizes
for s in [8, 16, 32, 64]:
   last_shift_bit = int(math.log2(s)) - 1

   optimizations.extend([
      (('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('ior', a, b), 0), 'options->lower_umax'),
      (('ior', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('ior', a, b), 0), 'options->lower_umin'),
      (('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umax', a, b), 0), '!options->lower_umax'),
      (('ior', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umin', a, b), 0), '!options->lower_umin'),
      (('iand', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umin', a, b), 0), '!options->lower_umin'),
      (('ior', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umax', a, b), 0), '!options->lower_umax'),
'!options->lower_umax'), 1035 1036 # True/False are ~0 and 0 in NIR. b2i of True is 1, and -1 is ~0 (True). 1037 (('ineg', ('b2i{}'.format(s), 'a@{}'.format(s))), a), 1038 1039 # SM5 32-bit shifts are defined to use the 5 least significant bits (or 4 bits for 16 bits) 1040 (('ishl', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishl', a, b)), 1041 (('ishr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishr', a, b)), 1042 (('ushr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ushr', a, b)), 1043 (('ushr', 'a@{}'.format(s), ('ishl(is_used_once)', ('iand', b, 1), last_shift_bit)), ('ushr', a, ('ishl', b, last_shift_bit))), 1044 ]) 1045 1046 optimizations.extend([ 1047 # Common pattern like 'if (i == 0 || i == 1 || ...)' 1048 (('ior', ('ieq', a, 0), ('ieq', a, 1)), ('uge', 1, a)), 1049 (('ior', ('uge', 1, a), ('ieq', a, 2)), ('uge', 2, a)), 1050 (('ior', ('uge', 2, a), ('ieq', a, 3)), ('uge', 3, a)), 1051 1052 (('ior', a, ('ieq', a, False)), True), 1053 (('ior', a, ('inot', a)), -1), 1054 1055 (('ine', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), ('ine', a, b)), 1056 (('b2i', ('ine', 'a@1', 'b@1')), ('b2i', ('ixor', a, b))), 1057 1058 # This pattern occurs courtesy of __flt64_nonnan in the soft-fp64 code. 1059 # The first part of the iand comes from the !__feq64_nonnan. 1060 # 1061 # The second pattern is a reformulation of the first based on the relation 1062 # (a == 0 || y == 0) <=> umin(a, y) == 0, where b in the first equation 1063 # happens to be y == 0. 1064 (('iand', ('inot', ('iand', ('ior', ('ieq', a, 0), b), c)), ('ilt', a, 0)), 1065 ('iand', ('inot', ('iand', b , c)), ('ilt', a, 0))), 1066 (('iand', ('inot', ('iand', ('ieq', ('umin', a, b), 0), c)), ('ilt', a, 0)), 1067 ('iand', ('inot', ('iand', ('ieq', b , 0), c)), ('ilt', a, 0))), 1068 1069 # These patterns can result when (a < b || a < c) => (a < min(b, c)) 1070 # transformations occur before constant propagation and loop-unrolling. 1071 # 1072 # The flt versions are exact. If isnan(a), the original pattern is 1073 # trivially false, and the replacements are false too.
If isnan(b): 1074 # 1075 # a < fmax(NaN, a) => a < a => false vs a < NaN => false 1076 (('flt', a, ('fmax', b, a)), ('flt', a, b)), 1077 (('flt', ('fmin', a, b), a), ('flt', b, a)), 1078 (('~fge', a, ('fmin', b, a)), True), 1079 (('~fge', ('fmax', a, b), a), True), 1080 (('flt', a, ('fmin', b, a)), False), 1081 (('flt', ('fmax', a, b), a), False), 1082 (('~fge', a, ('fmax', b, a)), ('fge', a, b)), 1083 (('~fge', ('fmin', a, b), a), ('fge', b, a)), 1084 1085 (('ilt', a, ('imax', b, a)), ('ilt', a, b)), 1086 (('ilt', ('imin', a, b), a), ('ilt', b, a)), 1087 (('ige', a, ('imin', b, a)), True), 1088 (('ige', ('imax', a, b), a), True), 1089 (('ult', a, ('umax', b, a)), ('ult', a, b)), 1090 (('ult', ('umin', a, b), a), ('ult', b, a)), 1091 (('uge', a, ('umin', b, a)), True), 1092 (('uge', ('umax', a, b), a), True), 1093 (('ilt', a, ('imin', b, a)), False), 1094 (('ilt', ('imax', a, b), a), False), 1095 (('ige', a, ('imax', b, a)), ('ige', a, b)), 1096 (('ige', ('imin', a, b), a), ('ige', b, a)), 1097 (('ult', a, ('umin', b, a)), False), 1098 (('ult', ('umax', a, b), a), False), 1099 (('uge', a, ('umax', b, a)), ('uge', a, b)), 1100 (('uge', ('umin', a, b), a), ('uge', b, a)), 1101 (('ult', a, ('iand', b, a)), False), 1102 (('ult', ('ior', a, b), a), False), 1103 (('uge', a, ('iand', b, a)), True), 1104 (('uge', ('ior', a, b), a), True), 1105 1106 (('ilt', '#a', ('imax', '#b', c)), ('ior', ('ilt', a, b), ('ilt', a, c))), 1107 (('ilt', ('imin', '#a', b), '#c'), ('ior', ('ilt', a, c), ('ilt', b, c))), 1108 (('ige', '#a', ('imin', '#b', c)), ('ior', ('ige', a, b), ('ige', a, c))), 1109 (('ige', ('imax', '#a', b), '#c'), ('ior', ('ige', a, c), ('ige', b, c))), 1110 (('ult', '#a', ('umax', '#b', c)), ('ior', ('ult', a, b), ('ult', a, c))), 1111 (('ult', ('umin', '#a', b), '#c'), ('ior', ('ult', a, c), ('ult', b, c))), 1112 (('uge', '#a', ('umin', '#b', c)), ('ior', ('uge', a, b), ('uge', a, c))), 1113 (('uge', ('umax', '#a', b), '#c'), ('ior', ('uge', a, c), ('uge', b, c))), 1114 (('ilt', '#a', ('imin', '#b', c)), ('iand', ('ilt', a, b), ('ilt', a, c))), 1115 (('ilt', ('imax', '#a', b), '#c'), ('iand', ('ilt', a, c), ('ilt', b, c))), 1116 (('ige', '#a', ('imax', '#b', c)), ('iand', ('ige', a, b), ('ige', a, c))), 1117 (('ige', ('imin', '#a', b), '#c'), ('iand', ('ige', a, c), ('ige', b, c))), 1118 (('ult', '#a', ('umin', '#b', c)), ('iand', ('ult', a, b), ('ult', a, c))), 1119 (('ult', ('umax', '#a', b), '#c'), ('iand', ('ult', a, c), ('ult', b, c))), 1120 (('uge', '#a', ('umax', '#b', c)), ('iand', ('uge', a, b), ('uge', a, c))), 1121 (('uge', ('umin', '#a', b), '#c'), ('iand', ('uge', a, c), ('uge', b, c))), 1122 1123 # Thanks to sign extension, the ishr(a, b) is negative if and only if a is 1124 # negative. 
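# For example (taking 8-bit values purely for illustration): ishr(0x80, 3) =
# 0xf0 is still negative and ishr(0x40, 3) = 0x08 is still non-negative, so
# selecting the negated shift only when a < 0 is the same as taking iabs of
# the shifted value, which is what the next pair of rules does.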
1125 (('bcsel', ('ilt', a, 0), ('ineg', ('ishr', a, b)), ('ishr', a, b)), 1126 ('iabs', ('ishr', a, b))), 1127 (('iabs', ('ishr', ('iabs', a), b)), ('ishr', ('iabs', a), b)), 1128 1129 (('fabs', ('slt', a, b)), ('slt', a, b)), 1130 (('fabs', ('sge', a, b)), ('sge', a, b)), 1131 (('fabs', ('seq', a, b)), ('seq', a, b)), 1132 (('fabs', ('sne', a, b)), ('sne', a, b)), 1133 (('slt', a, b), ('b2f', ('flt', a, b)), 'options->lower_scmp'), 1134 (('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'), 1135 (('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'), 1136 (('sne', a, b), ('b2f', ('fneu', a, b)), 'options->lower_scmp'), 1137 (('seq', ('seq', a, b), 1.0), ('seq', a, b)), 1138 (('seq', ('sne', a, b), 1.0), ('sne', a, b)), 1139 (('seq', ('slt', a, b), 1.0), ('slt', a, b)), 1140 (('seq', ('sge', a, b), 1.0), ('sge', a, b)), 1141 (('sne', ('seq', a, b), 0.0), ('seq', a, b)), 1142 (('sne', ('sne', a, b), 0.0), ('sne', a, b)), 1143 (('sne', ('slt', a, b), 0.0), ('slt', a, b)), 1144 (('sne', ('sge', a, b), 0.0), ('sge', a, b)), 1145 (('seq', ('seq', a, b), 0.0), ('sne', a, b)), 1146 (('seq', ('sne', a, b), 0.0), ('seq', a, b)), 1147 (('seq', ('slt', a, b), 0.0), ('sge', a, b)), 1148 (('seq', ('sge', a, b), 0.0), ('slt', a, b)), 1149 (('sne', ('seq', a, b), 1.0), ('sne', a, b)), 1150 (('sne', ('sne', a, b), 1.0), ('seq', a, b)), 1151 (('sne', ('slt', a, b), 1.0), ('sge', a, b)), 1152 (('sne', ('sge', a, b), 1.0), ('slt', a, b)), 1153 (('fall_equal2', a, b), ('fmin', ('seq', 'a.x', 'b.x'), ('seq', 'a.y', 'b.y')), 'options->lower_vector_cmp'), 1154 (('fall_equal3', a, b), ('seq', ('fany_nequal3', a, b), 0.0), 'options->lower_vector_cmp'), 1155 (('fall_equal4', a, b), ('seq', ('fany_nequal4', a, b), 0.0), 'options->lower_vector_cmp'), 1156 (('fany_nequal2', a, b), ('fmax', ('sne', 'a.x', 'b.x'), ('sne', 'a.y', 'b.y')), 'options->lower_vector_cmp'), 1157 (('fany_nequal3', a, b), ('fsat', ('fdot3', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'), 1158 (('fany_nequal4', a, b), ('fsat', ('fdot4', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'), 1159 1160 (('ball_iequal2', a, b), ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), 'options->lower_vector_cmp'), 1161 (('ball_iequal3', a, b), ('iand', ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), ('ieq', 'a.z', 'b.z')), 'options->lower_vector_cmp'), 1162 (('ball_iequal4', a, b), ('iand', ('iand', ('ieq', 'a.x', 'b.x'), ('ieq', 'a.y', 'b.y')), ('iand', ('ieq', 'a.z', 'b.z'), ('ieq', 'a.w', 'b.w'))), 'options->lower_vector_cmp'), 1163 1164 (('bany_inequal2', a, b), ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), 'options->lower_vector_cmp'), 1165 (('bany_inequal3', a, b), ('ior', ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), ('ine', 'a.z', 'b.z')), 'options->lower_vector_cmp'), 1166 (('bany_inequal4', a, b), ('ior', ('ior', ('ine', 'a.x', 'b.x'), ('ine', 'a.y', 'b.y')), ('ior', ('ine', 'a.z', 'b.z'), ('ine', 'a.w', 'b.w'))), 'options->lower_vector_cmp'), 1167 1168 (('ball_fequal2', a, b), ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), 'options->lower_vector_cmp'), 1169 (('ball_fequal3', a, b), ('iand', ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), ('feq', 'a.z', 'b.z')), 'options->lower_vector_cmp'), 1170 (('ball_fequal4', a, b), ('iand', ('iand', ('feq', 'a.x', 'b.x'), ('feq', 'a.y', 'b.y')), ('iand', ('feq', 'a.z', 'b.z'), ('feq', 'a.w', 'b.w'))), 'options->lower_vector_cmp'), 1171 1172 (('bany_fnequal2', a, b), ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), 
'options->lower_vector_cmp'), 1173 (('bany_fnequal3', a, b), ('ior', ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), ('fneu', 'a.z', 'b.z')), 'options->lower_vector_cmp'), 1174 (('bany_fnequal4', a, b), ('ior', ('ior', ('fneu', 'a.x', 'b.x'), ('fneu', 'a.y', 'b.y')), ('ior', ('fneu', 'a.z', 'b.z'), ('fneu', 'a.w', 'b.w'))), 'options->lower_vector_cmp'), 1175 1176 (('feq', ('seq', a, b), 1.0), ('feq', a, b)), 1177 (('feq', ('sne', a, b), 1.0), ('fneu', a, b)), 1178 (('feq', ('slt', a, b), 1.0), ('flt', a, b)), 1179 (('feq', ('sge', a, b), 1.0), ('fge', a, b)), 1180 (('fneu', ('seq', a, b), 0.0), ('feq', a, b)), 1181 (('fneu', ('sne', a, b), 0.0), ('fneu', a, b)), 1182 (('fneu', ('slt', a, b), 0.0), ('flt', a, b)), 1183 (('fneu', ('sge', a, b), 0.0), ('fge', a, b)), 1184 (('feq', ('seq', a, b), 0.0), ('fneu', a, b)), 1185 (('feq', ('sne', a, b), 0.0), ('feq', a, b)), 1186 (('feq', ('slt', a, b), 0.0), ('fge', a, b)), 1187 (('feq', ('sge', a, b), 0.0), ('flt', a, b)), 1188 (('fneu', ('seq', a, b), 1.0), ('fneu', a, b)), 1189 (('fneu', ('sne', a, b), 1.0), ('feq', a, b)), 1190 (('fneu', ('slt', a, b), 1.0), ('fge', a, b)), 1191 (('fneu', ('sge', a, b), 1.0), ('flt', a, b)), 1192 1193 (('fneu', ('fneg', a), a), ('fneu', a, 0.0)), 1194 (('feq', ('fneg', a), a), ('feq', a, 0.0)), 1195 # Emulating booleans 1196 (('imul', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))), 1197 (('iand', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))), 1198 (('ior', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('ior', a, b))), 1199 (('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))), 1200 (('fsat', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('b2f', ('ior', a, b))), 1201 (('iand', 'a@bool16', 1.0), ('b2f', a)), 1202 (('iand', 'a@bool32', 1.0), ('b2f', a)), 1203 (('flt', ('fneg', ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF. 1204 # Comparison with the same args. Note that these are only done for the 1205 # float versions when the source must be a number. Generally, NaN cmp NaN 1206 # produces the opposite result of X cmp X. flt is the outlier. NaN < NaN 1207 # is false, and, for any number X, X < X is also false. 
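# For instance, fge(NaN, NaN) is false while fge(x, x) is true for every
# number x, which is why the fge/feq/fneu folds below carry the is_a_number
# guard; flt(a, a) is false both for NaN and for numbers, so it needs none.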
1208 (('ilt', a, a), False), 1209 (('ige', a, a), True), 1210 (('ieq', a, a), True), 1211 (('ine', a, a), False), 1212 (('ult', a, a), False), 1213 (('uge', a, a), True), 1214 (('flt', a, a), False), 1215 (('fge', 'a(is_a_number)', a), True), 1216 (('feq', 'a(is_a_number)', a), True), 1217 (('fneu', 'a(is_a_number)', a), False), 1218 # Logical and bit operations 1219 (('iand', a, a), a), 1220 (('iand', a, ~0), a), 1221 (('iand', a, 0), 0), 1222 (('ior', a, a), a), 1223 (('ior', a, 0), a), 1224 (('ior', a, True), True), 1225 (('ixor', a, a), 0), 1226 (('ixor', a, 0), a), 1227 (('ixor', a, ('ixor', a, b)), b), 1228 (('ixor', a, -1), ('inot', a)), 1229 (('inot', ('inot', a)), a), 1230 (('ior', ('iand', a, b), b), b), 1231 (('ior', ('ior', a, b), b), ('ior', a, b)), 1232 (('iand', ('ior', a, b), b), b), 1233 (('iand', ('iand', a, b), b), ('iand', a, b)), 1234 # DeMorgan's Laws 1235 (('iand', ('inot', a), ('inot', b)), ('inot', ('ior', a, b))), 1236 (('ior', ('inot', a), ('inot', b)), ('inot', ('iand', a, b))), 1237 # Shift optimizations 1238 (('ishl', 0, a), 0), 1239 (('ishl', a, 0), a), 1240 (('ishr', 0, a), 0), 1241 (('ishr', -1, a), -1), 1242 (('ishr', a, 0), a), 1243 (('ushr', 0, a), 0), 1244 (('ushr', a, 0), a), 1245 (('ior', ('ishl@16', a, b), ('ushr@16', a, ('iadd', 16, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'), 1246 (('ior', ('ishl@16', a, b), ('ushr@16', a, ('isub', 16, b))), ('urol', a, b), '!options->lower_rotate'), 1247 (('ior', ('ishl@32', a, b), ('ushr@32', a, ('iadd', 32, ('ineg', b)))), ('urol', a, b), '!options->lower_rotate'), 1248 (('ior', ('ishl@32', a, b), ('ushr@32', a, ('isub', 32, b))), ('urol', a, b), '!options->lower_rotate'), 1249 (('ior', ('ushr@16', a, b), ('ishl@16', a, ('iadd', 16, ('ineg', b)))), ('uror', a, b), '!options->lower_rotate'), 1250 (('ior', ('ushr@16', a, b), ('ishl@16', a, ('isub', 16, b))), ('uror', a, b), '!options->lower_rotate'), 1251 (('ior', ('ushr@32', a, b), ('ishl@32', a, ('iadd', 32, ('ineg', b)))), ('uror', a, b), '!options->lower_rotate'), 1252 (('ior', ('ushr@32', a, b), ('ishl@32', a, ('isub', 32, b))), ('uror', a, b), '!options->lower_rotate'), 1253 (('urol@16', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 16, b))), 'options->lower_rotate'), 1254 (('urol@32', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 32, b))), 'options->lower_rotate'), 1255 (('uror@16', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 16, b))), 'options->lower_rotate'), 1256 (('uror@32', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 32, b))), 'options->lower_rotate'), 1257 # Exponential/logarithmic identities 1258 (('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a 1259 (('~flog2', ('fexp2', a)), a), # lg2(2^a) = a 1260 (('fpow', a, b), ('fexp2', ('fmul', ('flog2', a), b)), 'options->lower_fpow'), # a^b = 2^(lg2(a)*b) 1261 (('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b 1262 (('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))), 1263 ('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) + d) = a^b * c^d 1264 (('~fexp2', ('fmul', ('flog2', a), 0.5)), ('fsqrt', a)), 1265 (('~fexp2', ('fmul', ('flog2', a), 2.0)), ('fmul', a, a)), 1266 (('~fexp2', ('fmul', ('flog2', a), 4.0)), ('fmul', ('fmul', a, a), ('fmul', a, a))), 1267 (('~fpow', a, 1.0), a), 1268 (('~fpow', a, 2.0), ('fmul', a, a)), 1269 (('~fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))), 1270 (('~fpow', 2.0, a), ('fexp2', a)), 1271 (('~fpow', ('fpow', a, 
2.2), 0.454545), a), 1272 (('~fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)), 1273 (('~fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))), 1274 (('~frcp', ('fexp2', a)), ('fexp2', ('fneg', a))), 1275 (('~frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))), 1276 (('~flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))), 1277 (('~flog2', ('frcp', a)), ('fneg', ('flog2', a))), 1278 (('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))), 1279 (('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))), 1280 (('~fmul', ('fexp2(is_used_once)', a), ('fexp2(is_used_once)', b)), ('fexp2', ('fadd', a, b))), 1281 (('bcsel', ('flt', a, 0.0), 0.0, ('fsqrt', a)), ('fsqrt', ('fmax', a, 0.0))), 1282 (('~fmul', ('fsqrt', a), ('fsqrt', a)), ('fabs',a)), 1283 (('~fmulz', ('fsqrt', a), ('fsqrt', a)), ('fabs', a)), 1284 # Division and reciprocal 1285 (('~fdiv', 1.0, a), ('frcp', a)), 1286 (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'), 1287 (('~frcp', ('frcp', a)), a), 1288 (('~frcp', ('fsqrt', a)), ('frsq', a)), 1289 (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'), 1290 (('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'), 1291 # Trig 1292 (('fsin', a), lowered_sincos(0.5), 'options->lower_sincos'), 1293 (('fcos', a), lowered_sincos(0.75), 'options->lower_sincos'), 1294 # Boolean simplifications 1295 (('i2b16(is_used_by_if)', a), ('ine16', a, 0)), 1296 (('i2b32(is_used_by_if)', a), ('ine32', a, 0)), 1297 (('i2b1(is_used_by_if)', a), ('ine', a, 0)), 1298 (('ieq', a, True), a), 1299 (('ine(is_not_used_by_if)', a, True), ('inot', a)), 1300 (('ine', a, False), a), 1301 (('ieq(is_not_used_by_if)', a, False), ('inot', 'a')), 1302 (('bcsel', a, True, False), a), 1303 (('bcsel', a, False, True), ('inot', a)), 1304 (('bcsel', True, b, c), b), 1305 (('bcsel', False, b, c), c), 1306 1307 (('bcsel@16', a, 1.0, 0.0), ('b2f', a)), 1308 (('bcsel@16', a, 0.0, 1.0), ('b2f', ('inot', a))), 1309 (('bcsel@16', a, -1.0, -0.0), ('fneg', ('b2f', a))), 1310 (('bcsel@16', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))), 1311 (('bcsel@32', a, 1.0, 0.0), ('b2f', a)), 1312 (('bcsel@32', a, 0.0, 1.0), ('b2f', ('inot', a))), 1313 (('bcsel@32', a, -1.0, -0.0), ('fneg', ('b2f', a))), 1314 (('bcsel@32', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))), 1315 (('bcsel@64', a, 1.0, 0.0), ('b2f', a), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1316 (('bcsel@64', a, 0.0, 1.0), ('b2f', ('inot', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1317 (('bcsel@64', a, -1.0, -0.0), ('fneg', ('b2f', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1318 (('bcsel@64', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a))), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1319 1320 (('bcsel', a, b, b), b), 1321 (('~fcsel', a, b, b), b), 1322 1323 # D3D Boolean emulation 1324 (('bcsel', a, -1, 0), ('ineg', ('b2i', 'a@1'))), 1325 (('bcsel', a, 0, -1), ('ineg', ('b2i', ('inot', a)))), 1326 (('bcsel', a, 1, 0), ('b2i', 'a@1')), 1327 (('bcsel', a, 0, 1), ('b2i', ('inot', a))), 1328 (('iand', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1329 ('ineg', ('b2i', ('iand', a, b)))), 1330 (('ior', ('ineg', ('b2i','a@1')), ('ineg', ('b2i', 'b@1'))), 1331 ('ineg', ('b2i', ('ior', a, b)))), 1332 (('ieq', ('ineg', ('b2i', 'a@1')), 0), ('inot', a)), 1333 (('ieq', ('ineg', ('b2i', 'a@1')), -1), a), 1334 (('ine', ('ineg', ('b2i', 'a@1')), 0), a), 1335 (('ine', ('ineg', ('b2i', 'a@1')), -1), ('inot', a)), 1336 (('ige', ('ineg', 
('b2i', 'a@1')), 0), ('inot', a)), 1337 (('ilt', ('ineg', ('b2i', 'a@1')), 0), a), 1338 (('ult', 0, ('ineg', ('b2i', 'a@1'))), a), 1339 (('iand', ('ineg', ('b2i', a)), 1.0), ('b2f', a)), 1340 (('iand', ('ineg', ('b2i', a)), 1), ('b2i', a)), 1341 1342 # With D3D booleans, imax is AND and umax is OR 1343 (('imax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1344 ('ineg', ('b2i', ('iand', a, b)))), 1345 (('imin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1346 ('ineg', ('b2i', ('ior', a, b)))), 1347 (('umax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1348 ('ineg', ('b2i', ('ior', a, b)))), 1349 (('umin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1350 ('ineg', ('b2i', ('iand', a, b)))), 1351 1352 # Conversions 1353 (('i2b16', ('b2i', 'a@16')), a), 1354 (('i2b32', ('b2i', 'a@32')), a), 1355 (('f2i', ('ftrunc', a)), ('f2i', a)), 1356 (('f2u', ('ftrunc', a)), ('f2u', a)), 1357 (('i2b', ('ineg', a)), ('i2b', a)), 1358 (('i2b', ('iabs', a)), ('i2b', a)), 1359 (('inot', ('f2b1', a)), ('feq', a, 0.0)), 1360 1361 # Conversions from 16 bits to 32 bits and back can always be removed 1362 (('f2fmp', ('f2f32', 'a@16')), a), 1363 (('i2imp', ('i2i32', 'a@16')), a), 1364 (('i2imp', ('u2u32', 'a@16')), a), 1365 1366 (('f2imp', ('f2f32', 'a@16')), ('f2i16', a)), 1367 (('f2ump', ('f2f32', 'a@16')), ('f2u16', a)), 1368 (('i2fmp', ('i2i32', 'a@16')), ('i2f16', a)), 1369 (('u2fmp', ('u2u32', 'a@16')), ('u2f16', a)), 1370 1371 (('f2fmp', ('b2f32', 'a@1')), ('b2f16', a)), 1372 (('i2imp', ('b2i32', 'a@1')), ('b2i16', a)), 1373 (('i2imp', ('b2i32', 'a@1')), ('b2i16', a)), 1374 1375 (('f2imp', ('b2f32', 'a@1')), ('b2i16', a)), 1376 (('f2ump', ('b2f32', 'a@1')), ('b2i16', a)), 1377 (('i2fmp', ('b2i32', 'a@1')), ('b2f16', a)), 1378 (('u2fmp', ('b2i32', 'a@1')), ('b2f16', a)), 1379 1380 # Conversions to 16 bits would be lossy so they should only be removed if 1381 # the instruction was generated by the precision lowering pass. 1382 (('f2f32', ('f2fmp', 'a@32')), a), 1383 (('i2i32', ('i2imp', 'a@32')), a), 1384 (('u2u32', ('i2imp', 'a@32')), a), 1385 1386 (('i2i32', ('f2imp', 'a@32')), ('f2i32', a)), 1387 (('u2u32', ('f2ump', 'a@32')), ('f2u32', a)), 1388 (('f2f32', ('i2fmp', 'a@32')), ('i2f32', a)), 1389 (('f2f32', ('u2fmp', 'a@32')), ('u2f32', a)), 1390 1391 # Conversions from float32 to float64 and back can be removed as long as 1392 # it doesn't need to be precise, since the conversion may e.g. flush denorms 1393 (('~f2f32', ('f2f64', 'a@32')), a), 1394 1395 (('ffloor', 'a(is_integral)'), a), 1396 (('fceil', 'a(is_integral)'), a), 1397 (('ftrunc', 'a(is_integral)'), a), 1398 (('fround_even', 'a(is_integral)'), a), 1399 1400 # fract(x) = x - floor(x), so fract(NaN) = NaN 1401 (('~ffract', 'a(is_integral)'), 0.0), 1402 (('fabs', 'a(is_not_negative)'), a), 1403 (('iabs', 'a(is_not_negative)'), a), 1404 (('fsat', 'a(is_not_positive)'), 0.0), 1405 1406 (('~fmin', 'a(is_not_negative)', 1.0), ('fsat', a), '!options->lower_fsat'), 1407 1408 # The result of the multiply must be in [-1, 0], so the result of the ffma 1409 # must be in [0, 1]. 
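# Sketch of the reasoning: let x = fsat(a), so x is in [0, 1].  Then
# x * -x is in [-1, 0] and x * -x + 1.0 is in [0, 1], so the flt-with-0.0
# patterns below are always false and the fmax-with-0.0 patterns are no-ops.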
1410 (('flt', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), False), 1411 (('flt', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), False), 1412 (('fmax', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0)), 1413 (('fmax', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0)), 1414 1415 (('fneu', 'a(is_not_zero)', 0.0), True), 1416 (('feq', 'a(is_not_zero)', 0.0), False), 1417 1418 # In this chart, + means value > 0 and - means value < 0. 1419 # 1420 # + >= + -> unknown 0 >= + -> false - >= + -> false 1421 # + >= 0 -> true 0 >= 0 -> true - >= 0 -> false 1422 # + >= - -> true 0 >= - -> true - >= - -> unknown 1423 # 1424 # Using grouping conceptually similar to a Karnaugh map... 1425 # 1426 # (+ >= 0, + >= -, 0 >= 0, 0 >= -) == (is_not_negative >= is_not_positive) -> true 1427 # (0 >= +, - >= +) == (is_not_positive >= gt_zero) -> false 1428 # (- >= +, - >= 0) == (lt_zero >= is_not_negative) -> false 1429 # 1430 # The flt / ilt cases just invert the expected result. 1431 # 1432 # The results expecting true, must be marked imprecise. The results 1433 # expecting false are fine because NaN compared >= or < anything is false. 1434 1435 (('fge', 'a(is_a_number_not_negative)', 'b(is_a_number_not_positive)'), True), 1436 (('fge', 'a(is_not_positive)', 'b(is_gt_zero)'), False), 1437 (('fge', 'a(is_lt_zero)', 'b(is_not_negative)'), False), 1438 1439 (('flt', 'a(is_not_negative)', 'b(is_not_positive)'), False), 1440 (('flt', 'a(is_a_number_not_positive)', 'b(is_a_number_gt_zero)'), True), 1441 (('flt', 'a(is_a_number_lt_zero)', 'b(is_a_number_not_negative)'), True), 1442 1443 (('ine', 'a(is_not_zero)', 0), True), 1444 (('ieq', 'a(is_not_zero)', 0), False), 1445 1446 (('ige', 'a(is_not_negative)', 'b(is_not_positive)'), True), 1447 (('ige', 'a(is_not_positive)', 'b(is_gt_zero)'), False), 1448 (('ige', 'a(is_lt_zero)', 'b(is_not_negative)'), False), 1449 1450 (('ilt', 'a(is_not_negative)', 'b(is_not_positive)'), False), 1451 (('ilt', 'a(is_not_positive)', 'b(is_gt_zero)'), True), 1452 (('ilt', 'a(is_lt_zero)', 'b(is_not_negative)'), True), 1453 1454 (('ult', 0, 'a(is_gt_zero)'), True), 1455 (('ult', a, 0), False), 1456 1457 # Packing and then unpacking does nothing 1458 (('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a), 1459 (('unpack_64_2x32_split_y', ('pack_64_2x32_split', a, b)), b), 1460 (('unpack_64_2x32_split_x', ('pack_64_2x32', a)), 'a.x'), 1461 (('unpack_64_2x32_split_y', ('pack_64_2x32', a)), 'a.y'), 1462 (('unpack_64_2x32', ('pack_64_2x32_split', a, b)), ('vec2', a, b)), 1463 (('unpack_64_2x32', ('pack_64_2x32', a)), a), 1464 (('unpack_double_2x32_dxil', ('pack_double_2x32_dxil', a)), a), 1465 (('pack_64_2x32_split', ('unpack_64_2x32_split_x', a), 1466 ('unpack_64_2x32_split_y', a)), a), 1467 (('pack_64_2x32', ('vec2', ('unpack_64_2x32_split_x', a), 1468 ('unpack_64_2x32_split_y', a))), a), 1469 (('pack_64_2x32', ('unpack_64_2x32', a)), a), 1470 (('pack_double_2x32_dxil', ('unpack_double_2x32_dxil', a)), a), 1471 1472 # Comparing two halves of an unpack separately. While this optimization 1473 # should be correct for non-constant values, it's less obvious that it's 1474 # useful in that case. For constant values, the pack will fold and we're 1475 # guaranteed to reduce the whole tree to one instruction. 
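# For example, with constant halves b = 0x1234 and c = 0x5678 the replacement
# below becomes ieq(a, pack_32_2x16_split(0x1234, 0x5678)), which constant
# folding should collapse to ieq(a, 0x56781234).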
1476 (('iand', ('ieq', ('unpack_32_2x16_split_x', a), '#b'), 1477 ('ieq', ('unpack_32_2x16_split_y', a), '#c')), 1478 ('ieq', a, ('pack_32_2x16_split', b, c))), 1479 1480 # Byte extraction 1481 (('ushr', 'a@16', 8), ('extract_u8', a, 1), '!options->lower_extract_byte'), 1482 (('ushr', 'a@32', 24), ('extract_u8', a, 3), '!options->lower_extract_byte'), 1483 (('ushr', 'a@64', 56), ('extract_u8', a, 7), '!options->lower_extract_byte'), 1484 (('ishr', 'a@16', 8), ('extract_i8', a, 1), '!options->lower_extract_byte'), 1485 (('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'), 1486 (('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'), 1487 (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'), 1488 1489 # Common pattern in many Vulkan CTS tests that read 8-bit integers from a 1490 # storage buffer. 1491 (('u2u8', ('extract_u16', a, 1)), ('u2u8', ('extract_u8', a, 2)), '!options->lower_extract_byte'), 1492 (('u2u8', ('ushr', a, 8)), ('u2u8', ('extract_u8', a, 1)), '!options->lower_extract_byte'), 1493 1494 # Common pattern after lowering 8-bit integers to 16-bit. 1495 (('i2i16', ('u2u8', ('extract_u8', a, b))), ('i2i16', ('extract_i8', a, b))), 1496 (('u2u16', ('u2u8', ('extract_u8', a, b))), ('u2u16', ('extract_u8', a, b))), 1497 1498 (('ubfe', a, 0, 8), ('extract_u8', a, 0), '!options->lower_extract_byte'), 1499 (('ubfe', a, 8, 8), ('extract_u8', a, 1), '!options->lower_extract_byte'), 1500 (('ubfe', a, 16, 8), ('extract_u8', a, 2), '!options->lower_extract_byte'), 1501 (('ubfe', a, 24, 8), ('extract_u8', a, 3), '!options->lower_extract_byte'), 1502 (('ibfe', a, 0, 8), ('extract_i8', a, 0), '!options->lower_extract_byte'), 1503 (('ibfe', a, 8, 8), ('extract_i8', a, 1), '!options->lower_extract_byte'), 1504 (('ibfe', a, 16, 8), ('extract_i8', a, 2), '!options->lower_extract_byte'), 1505 (('ibfe', a, 24, 8), ('extract_i8', a, 3), '!options->lower_extract_byte'), 1506 1507 (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)), 1508 (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)), 1509 1510 # Word extraction 1511 (('ushr', ('ishl', 'a@32', 16), 16), ('extract_u16', a, 0), '!options->lower_extract_word'), 1512 (('ushr', 'a@32', 16), ('extract_u16', a, 1), '!options->lower_extract_word'), 1513 (('ishr', ('ishl', 'a@32', 16), 16), ('extract_i16', a, 0), '!options->lower_extract_word'), 1514 (('ishr', 'a@32', 16), ('extract_i16', a, 1), '!options->lower_extract_word'), 1515 (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'), 1516 1517 (('ubfe', a, 0, 16), ('extract_u16', a, 0), '!options->lower_extract_word'), 1518 (('ubfe', a, 16, 16), ('extract_u16', a, 1), '!options->lower_extract_word'), 1519 (('ibfe', a, 0, 16), ('extract_i16', a, 0), '!options->lower_extract_word'), 1520 (('ibfe', a, 16, 16), ('extract_i16', a, 1), '!options->lower_extract_word'), 1521 1522 # Packing a u8vec4 to write to an SSBO. 
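# The ior/ishl chain matched below builds (a << 24) | (b << 16) | (c << 8) | d,
# so byte 0 of the packed word is d and byte 3 is a; that is why the
# replacement writes the vector as ('vec4', d, c, b, a).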
1523 (('ior', ('ishl', ('u2u32', 'a@8'), 24), ('ior', ('ishl', ('u2u32', 'b@8'), 16), ('ior', ('ishl', ('u2u32', 'c@8'), 8), ('u2u32', 'd@8')))), 1524 ('pack_32_4x8', ('vec4', d, c, b, a)), 'options->has_pack_32_4x8'), 1525 1526 (('extract_u16', ('extract_i16', a, b), 0), ('extract_u16', a, b)), 1527 (('extract_u16', ('extract_u16', a, b), 0), ('extract_u16', a, b)), 1528 1529 # Lower pack/unpack 1530 (('pack_64_2x32_split', a, b), ('ior', ('u2u64', a), ('ishl', ('u2u64', b), 32)), 'options->lower_pack_64_2x32_split'), 1531 (('pack_32_2x16_split', a, b), ('ior', ('u2u32', a), ('ishl', ('u2u32', b), 16)), 'options->lower_pack_32_2x16_split'), 1532 (('unpack_64_2x32_split_x', a), ('u2u32', a), 'options->lower_unpack_64_2x32_split'), 1533 (('unpack_64_2x32_split_y', a), ('u2u32', ('ushr', a, 32)), 'options->lower_unpack_64_2x32_split'), 1534 (('unpack_32_2x16_split_x', a), ('u2u16', a), 'options->lower_unpack_32_2x16_split'), 1535 (('unpack_32_2x16_split_y', a), ('u2u16', ('ushr', a, 16)), 'options->lower_unpack_32_2x16_split'), 1536 1537 # Useless masking before unpacking 1538 (('unpack_half_2x16_split_x', ('iand', a, 0xffff)), ('unpack_half_2x16_split_x', a)), 1539 (('unpack_32_2x16_split_x', ('iand', a, 0xffff)), ('unpack_32_2x16_split_x', a)), 1540 (('unpack_64_2x32_split_x', ('iand', a, 0xffffffff)), ('unpack_64_2x32_split_x', a)), 1541 (('unpack_half_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_half_2x16_split_y', a)), 1542 (('unpack_32_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_32_2x16_split_y', a)), 1543 (('unpack_64_2x32_split_y', ('iand', a, 0xffffffff00000000)), ('unpack_64_2x32_split_y', a)), 1544 1545 (('unpack_half_2x16_split_x', ('extract_u16', a, 0)), ('unpack_half_2x16_split_x', a)), 1546 (('unpack_half_2x16_split_x', ('extract_u16', a, 1)), ('unpack_half_2x16_split_y', a)), 1547 (('unpack_half_2x16_split_x', ('ushr', a, 16)), ('unpack_half_2x16_split_y', a)), 1548 (('unpack_32_2x16_split_x', ('extract_u16', a, 0)), ('unpack_32_2x16_split_x', a)), 1549 (('unpack_32_2x16_split_x', ('extract_u16', a, 1)), ('unpack_32_2x16_split_y', a)), 1550 1551 # Optimize half packing 1552 (('ishl', ('pack_half_2x16', ('vec2', a, 0)), 16), ('pack_half_2x16', ('vec2', 0, a))), 1553 (('ushr', ('pack_half_2x16', ('vec2', 0, a)), 16), ('pack_half_2x16', ('vec2', a, 0))), 1554 1555 (('iadd', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))), 1556 ('pack_half_2x16', ('vec2', a, b))), 1557 (('ior', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))), 1558 ('pack_half_2x16', ('vec2', a, b))), 1559 1560 (('ishl', ('pack_half_2x16_split', a, 0), 16), ('pack_half_2x16_split', 0, a)), 1561 (('ushr', ('pack_half_2x16_split', 0, a), 16), ('pack_half_2x16_split', a, 0)), 1562 (('extract_u16', ('pack_half_2x16_split', 0, a), 1), ('pack_half_2x16_split', a, 0)), 1563 1564 (('iadd', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)), 1565 (('ior', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)), 1566 1567 (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 0), ('i2i', a)), 1568 (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 1), ('i2i', b)), 1569 (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 2), ('i2i', c)), 1570 (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 3), ('i2i', d)), 1571 (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 0), ('u2u', a)), 1572 (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 1), ('u2u', b)), 1573 (('extract_u8', 
('pack_32_4x8_split', a, b, c, d), 2), ('u2u', c)), 1574 (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 3), ('u2u', d)), 1575]) 1576 1577# After the ('extract_u8', a, 0) pattern, above, triggers, there will be 1578# patterns like those below. 1579for op in ('ushr', 'ishr'): 1580 optimizations.extend([(('extract_u8', (op, 'a@16', 8), 0), ('extract_u8', a, 1))]) 1581 optimizations.extend([(('extract_u8', (op, 'a@32', 8 * i), 0), ('extract_u8', a, i)) for i in range(1, 4)]) 1582 optimizations.extend([(('extract_u8', (op, 'a@64', 8 * i), 0), ('extract_u8', a, i)) for i in range(1, 8)]) 1583 1584optimizations.extend([(('extract_u8', ('extract_u16', a, 1), 0), ('extract_u8', a, 2))]) 1585 1586# After the ('extract_[iu]8', a, 3) patterns, above, trigger, there will be 1587# patterns like those below. 1588for op in ('extract_u8', 'extract_i8'): 1589 optimizations.extend([((op, ('ishl', 'a@16', 8), 1), (op, a, 0))]) 1590 optimizations.extend([((op, ('ishl', 'a@32', 24 - 8 * i), 3), (op, a, i)) for i in range(2, -1, -1)]) 1591 optimizations.extend([((op, ('ishl', 'a@64', 56 - 8 * i), 7), (op, a, i)) for i in range(6, -1, -1)]) 1592 1593optimizations.extend([ 1594 # Subtracts 1595 (('ussub_4x8_vc4', a, 0), a), 1596 (('ussub_4x8_vc4', a, ~0), 0), 1597 # Lower all Subtractions first - they can get recombined later 1598 (('fsub', a, b), ('fadd', a, ('fneg', b))), 1599 (('isub', a, b), ('iadd', a, ('ineg', b))), 1600 (('uabs_usub', a, b), ('bcsel', ('ult', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))), 1601 # This is correct. We don't need isub_sat because the result type is unsigned, so it cannot overflow. 1602 (('uabs_isub', a, b), ('bcsel', ('ilt', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))), 1603 1604 # Propagate negation up multiplication chains 1605 (('fmul(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmul', a, b))), 1606 (('fmulz(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmulz', a, b)), '!'+signed_zero_inf_nan_preserve_32), 1607 (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)), 1608 (('ffmaz', ('fneg', a), ('fneg', b), c), ('ffmaz', a, b, c)), 1609 (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))), 1610 1611 # Propagate constants up multiplication chains 1612 (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmul', ('fmul', a, c), b)), 1613 (('~fmulz(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmulz', ('fmulz', a, c), b)), 1614 (('~fmul(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)'), ('fmulz', ('fmul', a, c), b)), 1615 (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('imul', ('imul', a, c), b)), 1616 (('~ffma', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffma', ('fmul', a, c), b, d)), 1617 (('~ffmaz', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffmaz', ('fmulz', a, c), b, d)), 1618 (('~ffma', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)', d), ('ffmaz', ('fmul', a, c), b, d)), 1619 # Prefer moving out a multiplication for more MAD/FMA-friendly code 1620 (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_fmul)'), '#c'), ('fadd', ('fadd', a, c), b)), 1621 (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fadd', ('fadd', a, c), b)), 1622 (('~fadd(is_used_once)', ('ffma(is_used_once)', 'a(is_not_const)', 
b, 'c(is_not_const)'), '#d'), ('fadd', ('ffma', a, b, d), c)), 1623 (('~fadd(is_used_once)', ('ffmaz(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffmaz', a, b, d), c)), 1624 (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('iadd', ('iadd', a, c), b)), 1625 1626 # Reassociate constants in add/mul chains so they can be folded together. 1627 # For now, we mostly only handle cases where the constants are separated by 1628 # a single non-constant. We could do better eventually. 1629 (('~fmul', '#a', ('fmul', 'b(is_not_const)', '#c')), ('fmul', ('fmul', a, c), b)), 1630 (('~fmulz', '#a', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmulz', a, c), b)), 1631 (('~fmul', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmul', a, c), b)), 1632 (('~ffma', '#a', ('fmul', 'b(is_not_const)', '#c'), d), ('ffma', ('fmul', a, c), b, d)), 1633 (('~ffmaz', '#a', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmulz', a, c), b, d)), 1634 (('~ffmaz', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmul', a, c), b, d)), 1635 (('imul', '#a', ('imul', 'b(is_not_const)', '#c')), ('imul', ('imul', a, c), b)), 1636 (('~fadd', '#a', ('fadd', 'b(is_not_const)', '#c')), ('fadd', ('fadd', a, c), b)), 1637 (('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))), 1638 (('~fadd', '#a', ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d')), ('ffma', b, c, ('fadd', a, d))), 1639 (('~fadd', '#a', ('fneg', ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffma', ('fneg', b), c, ('fadd', a, ('fneg', d)))), 1640 (('~fadd', '#a', ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d')), ('ffmaz', b, c, ('fadd', a, d))), 1641 (('~fadd', '#a', ('fneg', ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffmaz', ('fneg', b), c, ('fadd', a, ('fneg', d)))), 1642 (('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)), 1643 (('iand', '#a', ('iand', 'b(is_not_const)', '#c')), ('iand', ('iand', a, c), b)), 1644 (('ior', '#a', ('ior', 'b(is_not_const)', '#c')), ('ior', ('ior', a, c), b)), 1645 (('ixor', '#a', ('ixor', 'b(is_not_const)', '#c')), ('ixor', ('ixor', a, c), b)), 1646 1647 # Reassociate add chains for more MAD/FMA-friendly code 1648 (('~fadd', ('fadd(is_used_once)', 'a(is_fmul)', 'b(is_fmul)'), 'c(is_not_fmul)'), ('fadd', ('fadd', a, c), b)), 1649 1650 # Drop mul-div by the same value when there's no wrapping. 1651 (('idiv', ('imul(no_signed_wrap)', a, b), b), a), 1652 1653 # By definition... 
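# ...find_lsb, ifind_msb, and ufind_msb already return -1 when no bit is set,
# so a bcsel that re-selects -1 for the "not found" case is redundant and can
# be dropped, as in the rules below.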
1654 (('bcsel', ('ige', ('find_lsb', a), 0), ('find_lsb', a), -1), ('find_lsb', a)), 1655 (('bcsel', ('ige', ('ifind_msb', a), 0), ('ifind_msb', a), -1), ('ifind_msb', a)), 1656 (('bcsel', ('ige', ('ufind_msb', a), 0), ('ufind_msb', a), -1), ('ufind_msb', a)), 1657 1658 (('bcsel', ('ine', a, 0), ('find_lsb', a), -1), ('find_lsb', a)), 1659 (('bcsel', ('ine', a, 0), ('ifind_msb', a), -1), ('ifind_msb', a)), 1660 (('bcsel', ('ine', a, 0), ('ufind_msb', a), -1), ('ufind_msb', a)), 1661 1662 (('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)), 1663 1664 (('~fmul', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)), 1665 (('~fmul', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))), 1666 (('~fmulz', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)), 1667 (('~fmulz', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))), 1668 (('~bcsel', ('flt', a, 0.0), ('fneg', a), a), ('fabs', a)), 1669 1670 (('bcsel', a, ('bcsel', b, c, d), d), ('bcsel', ('iand', a, b), c, d)), 1671 (('bcsel', a, b, ('bcsel', c, b, d)), ('bcsel', ('ior', a, c), b, d)), 1672 1673 # Misc. lowering 1674 (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'), 1675 (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'), 1676 (('uadd_carry', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'), 1677 (('usub_borrow', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'), 1678 1679 (('bitfield_insert', 'base', 'insert', 'offset', 'bits'), 1680 ('bcsel', ('ult', 31, 'bits'), 'insert', 1681 ('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')), 1682 'options->lower_bitfield_insert'), 1683 (('ihadd', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'), 1684 (('uhadd', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'), 1685 (('irhadd', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'), 1686 (('urhadd', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'), 1687 (('ihadd@64', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 1688 (('uhadd@64', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 1689 (('irhadd@64', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 1690 (('urhadd@64', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 1691 1692 (('imul_32x16', a, b), ('imul', a, ('extract_i16', b, 0)), 'options->lower_mul_32x16'), 1693 (('umul_32x16', a, b), ('imul', a, ('extract_u16', b, 0)), 'options->lower_mul_32x16'), 1694 1695 (('uadd_sat@64', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_uadd_sat || (options->lower_int64_options & nir_lower_iadd64) != 0'), 1696 (('uadd_sat', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_uadd_sat'), 1697 (('usub_sat', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_usub_sat'), 1698 (('usub_sat@64', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), '(options->lower_int64_options & nir_lower_usub_sat64) != 
0'), 1699 1700 # int64_t sum = a + b; 1701 # 1702 # if (a < 0 && b < 0 && a < sum) 1703 # sum = INT64_MIN; 1704 # } else if (a >= 0 && b >= 0 && sum < a) 1705 # sum = INT64_MAX; 1706 # } 1707 # 1708 # A couple optimizations are applied. 1709 # 1710 # 1. a < sum => sum >= 0. This replacement works because it is known that 1711 # a < 0 and b < 0, so sum should also be < 0 unless there was 1712 # underflow. 1713 # 1714 # 2. sum < a => sum < 0. This replacement works because it is known that 1715 # a >= 0 and b >= 0, so sum should also be >= 0 unless there was 1716 # overflow. 1717 # 1718 # 3. Invert the second if-condition and swap the order of parameters for 1719 # the bcsel. !(a >= 0 && b >= 0 && sum < 0) becomes !(a >= 0) || !(b >= 1720 # 0) || !(sum < 0), and that becomes (a < 0) || (b < 0) || (sum >= 0) 1721 # 1722 # On Intel Gen11, this saves ~11 instructions. 1723 (('iadd_sat@64', a, b), ('bcsel', 1724 ('iand', ('iand', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)), 1725 0x8000000000000000, 1726 ('bcsel', 1727 ('ior', ('ior', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)), 1728 ('iadd', a, b), 1729 0x7fffffffffffffff)), 1730 '(options->lower_int64_options & nir_lower_iadd_sat64) != 0'), 1731 1732 # int64_t sum = a - b; 1733 # 1734 # if (a < 0 && b >= 0 && a < sum) 1735 # sum = INT64_MIN; 1736 # } else if (a >= 0 && b < 0 && a >= sum) 1737 # sum = INT64_MAX; 1738 # } 1739 # 1740 # Optimizations similar to the iadd_sat case are applied here. 1741 (('isub_sat@64', a, b), ('bcsel', 1742 ('iand', ('iand', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)), 1743 0x8000000000000000, 1744 ('bcsel', 1745 ('ior', ('ior', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)), 1746 ('isub', a, b), 1747 0x7fffffffffffffff)), 1748 '(options->lower_int64_options & nir_lower_iadd_sat64) != 0'), 1749 1750 # These are done here instead of in the backend because the int64 lowering 1751 # pass will make a mess of the patterns. The first patterns are 1752 # conditioned on nir_lower_minmax64 because it was not clear that it was 1753 # always an improvement on platforms that have real int64 support. No 1754 # shaders in shader-db hit this, so it was hard to say one way or the 1755 # other. 
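# The reduction below relies on a two's-complement fact: a 64-bit value is
# negative exactly when its high 32 bits are negative, so '< 0' / '>= 0'
# tests only need the high halves (unpack_64_2x32_split_y) of the operands.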
1756 (('ilt', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 1757 (('ilt', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 1758 (('ige', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 1759 (('ige', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 1760 (('ilt', 'a@64', 0), ('ilt', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 1761 (('ige', 'a@64', 0), ('ige', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 1762 1763 (('ine', 'a@64', 0), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 1764 (('ieq', 'a@64', 0), ('ieq', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 1765 # 0u < uint(a) <=> uint(a) != 0u 1766 (('ult', 0, 'a@64'), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 1767 1768 # Alternative lowering that doesn't rely on bfi. 1769 (('bitfield_insert', 'base', 'insert', 'offset', 'bits'), 1770 ('bcsel', ('ult', 31, 'bits'), 1771 'insert', 1772 (('ior', 1773 ('iand', 'base', ('inot', ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))), 1774 ('iand', ('ishl', 'insert', 'offset'), ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))))), 1775 'options->lower_bitfield_insert_to_shifts'), 1776 1777 # Alternative lowering that uses bitfield_select. 1778 (('bitfield_insert', 'base', 'insert', 'offset', 'bits'), 1779 ('bcsel', ('ult', 31, 'bits'), 'insert', 1780 ('bitfield_select', ('bfm', 'bits', 'offset'), ('ishl', 'insert', 'offset'), 'base')), 1781 'options->lower_bitfield_insert_to_bitfield_select'), 1782 1783 (('ibitfield_extract', 'value', 'offset', 'bits'), 1784 ('bcsel', ('ult', 31, 'bits'), 'value', 1785 ('ibfe', 'value', 'offset', 'bits')), 1786 'options->lower_bitfield_extract'), 1787 1788 (('ubitfield_extract', 'value', 'offset', 'bits'), 1789 ('bcsel', ('ult', 31, 'bits'), 'value', 1790 ('ubfe', 'value', 'offset', 'bits')), 1791 'options->lower_bitfield_extract'), 1792 1793 # (src0 & src1) | (~src0 & src2). Constant fold if src2 is 0. 
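# With src2 == 0 the (~src0 & src2) term vanishes and only src0 & src1 is
# left, which is exactly the iand produced by the first rule below.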
1794 (('bitfield_select', a, b, 0), ('iand', a, b)), 1795 (('bitfield_select', a, ('iand', a, b), c), ('bitfield_select', a, b, c)), 1796 1797 # Note that these opcodes are defined to only use the five least significant bits of 'offset' and 'bits' 1798 (('ubfe', 'value', 'offset', ('iand', 31, 'bits')), ('ubfe', 'value', 'offset', 'bits')), 1799 (('ubfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ubfe', 'value', 'offset', 'bits')), 1800 (('ibfe', 'value', 'offset', ('iand', 31, 'bits')), ('ibfe', 'value', 'offset', 'bits')), 1801 (('ibfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ibfe', 'value', 'offset', 'bits')), 1802 (('bfm', 'bits', ('iand', 31, 'offset')), ('bfm', 'bits', 'offset')), 1803 (('bfm', ('iand', 31, 'bits'), 'offset'), ('bfm', 'bits', 'offset')), 1804 1805 # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says: 1806 # 1807 # If bits is zero, the result will be zero. 1808 # 1809 # These patterns prevent other patterns from generating invalid results 1810 # when count is zero. 1811 (('ubfe', a, b, 0), 0), 1812 (('ibfe', a, b, 0), 0), 1813 1814 (('ubfe', a, 0, '#b'), ('iand', a, ('ushr', 0xffffffff, ('ineg', b)))), 1815 1816 (('b2i32', ('i2b', ('ubfe', a, b, 1))), ('ubfe', a, b, 1)), 1817 (('b2i32', ('i2b', ('ibfe', a, b, 1))), ('ubfe', a, b, 1)), # ubfe in the replacement is correct 1818 (('ine', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 1819 (('ieq', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 1820 (('ine', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 1821 (('ieq', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 1822 1823 (('ibitfield_extract', 'value', 'offset', 'bits'), 1824 ('bcsel', ('ieq', 0, 'bits'), 1825 0, 1826 ('ishr', 1827 ('ishl', 'value', ('isub', ('isub', 32, 'bits'), 'offset')), 1828 ('isub', 32, 'bits'))), 1829 'options->lower_bitfield_extract_to_shifts'), 1830 1831 (('ubitfield_extract', 'value', 'offset', 'bits'), 1832 ('iand', 1833 ('ushr', 'value', 'offset'), 1834 ('bcsel', ('ieq', 'bits', 32), 1835 0xffffffff, 1836 ('isub', ('ishl', 1, 'bits'), 1))), 1837 'options->lower_bitfield_extract_to_shifts'), 1838 1839 (('ifind_msb', 'value'), 1840 ('ufind_msb', ('bcsel', ('ilt', 'value', 0), ('inot', 'value'), 'value')), 1841 'options->lower_ifind_msb'), 1842 1843 (('ifind_msb', 'value'), 1844 ('bcsel', ('ige', ('ifind_msb_rev', 'value'), 0), 1845 ('isub', 31, ('ifind_msb_rev', 'value')), 1846 ('ifind_msb_rev', 'value')), 1847 'options->lower_find_msb_to_reverse'), 1848 1849 (('ufind_msb', 'value'), 1850 ('bcsel', ('ige', ('ufind_msb_rev', 'value'), 0), 1851 ('isub', 31, ('ufind_msb_rev', 'value')), 1852 ('ufind_msb_rev', 'value')), 1853 'options->lower_find_msb_to_reverse'), 1854 1855 (('find_lsb', 'value'), 1856 ('ufind_msb', ('iand', 'value', ('ineg', 'value'))), 1857 'options->lower_find_lsb'), 1858 1859 (('extract_i8', a, 'b@32'), 1860 ('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 24), 1861 'options->lower_extract_byte'), 1862 1863 (('extract_u8', a, 'b@32'), 1864 ('iand', ('ushr', a, ('imul', b, 8)), 0xff), 1865 'options->lower_extract_byte'), 1866 1867 (('extract_i16', a, 'b@32'), 1868 ('ishr', ('ishl', a, ('imul', ('isub', 1, b), 16)), 16), 1869 'options->lower_extract_word'), 1870 1871 (('extract_u16', a, 'b@32'), 1872 ('iand', ('ushr', a, ('imul', b, 
16)), 0xffff), 1873 'options->lower_extract_word'), 1874 1875 (('pack_unorm_2x16', 'v'), 1876 ('pack_uvec2_to_uint', 1877 ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 65535.0)))), 1878 'options->lower_pack_unorm_2x16'), 1879 1880 (('pack_unorm_4x8', 'v'), 1881 ('pack_uvec4_to_uint', 1882 ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))), 1883 'options->lower_pack_unorm_4x8'), 1884 1885 (('pack_snorm_2x16', 'v'), 1886 ('pack_uvec2_to_uint', 1887 ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 32767.0)))), 1888 'options->lower_pack_snorm_2x16'), 1889 1890 (('pack_snorm_4x8', 'v'), 1891 ('pack_uvec4_to_uint', 1892 ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))), 1893 'options->lower_pack_snorm_4x8'), 1894 1895 (('unpack_unorm_2x16', 'v'), 1896 ('fdiv', ('u2f32', ('vec2', ('extract_u16', 'v', 0), 1897 ('extract_u16', 'v', 1))), 1898 65535.0), 1899 'options->lower_unpack_unorm_2x16'), 1900 1901 (('unpack_unorm_4x8', 'v'), 1902 ('fdiv', ('u2f32', ('vec4', ('extract_u8', 'v', 0), 1903 ('extract_u8', 'v', 1), 1904 ('extract_u8', 'v', 2), 1905 ('extract_u8', 'v', 3))), 1906 255.0), 1907 'options->lower_unpack_unorm_4x8'), 1908 1909 (('unpack_snorm_2x16', 'v'), 1910 ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec2', ('extract_i16', 'v', 0), 1911 ('extract_i16', 'v', 1))), 1912 32767.0))), 1913 'options->lower_unpack_snorm_2x16'), 1914 1915 (('unpack_snorm_4x8', 'v'), 1916 ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_i8', 'v', 0), 1917 ('extract_i8', 'v', 1), 1918 ('extract_i8', 'v', 2), 1919 ('extract_i8', 'v', 3))), 1920 127.0))), 1921 'options->lower_unpack_snorm_4x8'), 1922 1923 (('pack_half_2x16_split', 'a@32', 'b@32'), 1924 ('ior', ('ishl', ('u2u32', ('f2f16', b)), 16), ('u2u32', ('f2f16', a))), 1925 'options->lower_pack_split'), 1926 1927 (('unpack_half_2x16_split_x', 'a@32'), 1928 ('f2f32', ('u2u16', a)), 1929 'options->lower_pack_split'), 1930 1931 (('unpack_half_2x16_split_y', 'a@32'), 1932 ('f2f32', ('u2u16', ('ushr', a, 16))), 1933 'options->lower_pack_split'), 1934 1935 (('pack_32_2x16_split', 'a@16', 'b@16'), 1936 ('ior', ('ishl', ('u2u32', b), 16), ('u2u32', a)), 1937 'options->lower_pack_split'), 1938 1939 (('unpack_32_2x16_split_x', 'a@32'), 1940 ('u2u16', a), 1941 'options->lower_pack_split'), 1942 1943 (('unpack_32_2x16_split_y', 'a@32'), 1944 ('u2u16', ('ushr', 'a', 16)), 1945 'options->lower_pack_split'), 1946 1947 (('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'), 1948 (('imin', ('imax', a, -1), 1), ('isign', a), '!options->lower_isign'), 1949 (('imax', ('imin', a, 1), -1), ('isign', a), '!options->lower_isign'), 1950 # float(0 < NaN) - float(NaN < 0) = float(False) - float(False) = 0 - 0 = 0 1951 # Mark the new comparisons precise to prevent them being changed to 'a != 1952 # 0' or 'a == 0'. 
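# Worked example: with the comparisons kept exact, fsign(NaN) lowers to
# 0.0 - 0.0 = 0.0; if flt(0.0, a) were later relaxed to 'a != 0.0', the same
# expression would yield 1.0 for NaN instead.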
1953 (('fsign', a), ('fsub', ('b2f', ('!flt', 0.0, a)), ('b2f', ('!flt', a, 0.0))), 'options->lower_fsign'), 1954 1955 # Address/offset calculations: 1956 # Drivers supporting imul24 should use the nir_lower_amul() pass, this 1957 # rule converts everyone else to imul: 1958 (('amul', a, b), ('imul', a, b), '!options->has_imul24'), 1959 1960 (('umul24', a, b), 1961 ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)), 1962 '!options->has_umul24'), 1963 (('umad24', a, b, c), 1964 ('iadd', ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)), c), 1965 '!options->has_umad24'), 1966 1967 # Relaxed 24bit ops 1968 (('imul24_relaxed', a, b), ('imul24', a, b), 'options->has_imul24'), 1969 (('imul24_relaxed', a, b), ('imul', a, b), '!options->has_imul24'), 1970 (('umad24_relaxed', a, b, c), ('umad24', a, b, c), 'options->has_umad24'), 1971 (('umad24_relaxed', a, b, c), ('iadd', ('umul24_relaxed', a, b), c), '!options->has_umad24'), 1972 (('umul24_relaxed', a, b), ('umul24', a, b), 'options->has_umul24'), 1973 (('umul24_relaxed', a, b), ('imul', a, b), '!options->has_umul24'), 1974 1975 (('imad24_ir3', a, b, 0), ('imul24', a, b)), 1976 (('imad24_ir3', a, 0, c), (c)), 1977 (('imad24_ir3', a, 1, c), ('iadd', a, c)), 1978 1979 # if first two srcs are const, crack apart the imad so constant folding 1980 # can clean up the imul: 1981 # TODO ffma should probably get a similar rule: 1982 (('imad24_ir3', '#a', '#b', c), ('iadd', ('imul', a, b), c)), 1983 1984 # These will turn 24b address/offset calc back into 32b shifts, but 1985 # it should be safe to get back some of the bits of precision that we 1986 # already decided were no necessary: 1987 (('imul24', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'), 1988 (('imul24', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'), 1989 (('imul24', a, 0), (0)), 1990 1991 (('fcsel', ('slt', 0, a), b, c), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"), 1992 (('fcsel', ('slt', a, 0), b, c), ('fcsel_gt', ('fneg', a), b, c), "options->has_fused_comp_and_csel"), 1993 (('fcsel', ('sge', a, 0), b, c), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"), 1994 (('fcsel', ('sge', 0, a), b, c), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"), 1995 1996 (('bcsel', ('ilt', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, b, c), "options->has_fused_comp_and_csel"), 1997 (('bcsel', ('ilt', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, c, b), "options->has_fused_comp_and_csel"), 1998 (('bcsel', ('ige', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, b, c), "options->has_fused_comp_and_csel"), 1999 (('bcsel', ('ige', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, c, b), "options->has_fused_comp_and_csel"), 2000 2001 (('bcsel', ('flt', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"), 2002 (('bcsel', ('flt', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_gt', ('fneg', a), b, c), "options->has_fused_comp_and_csel"), 2003 (('bcsel', ('fge', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"), 2004 (('bcsel', ('fge', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"), 2005 2006]) 2007 2008# bit_size dependent lowerings 2009for bit_size in [8, 16, 32, 64]: 2010 # convenience constants 2011 intmax = (1 << (bit_size - 1)) - 1 2012 intmin = 1 << (bit_size - 1) 2013 2014 optimizations += [ 2015 (('iadd_sat@' + str(bit_size), a, b), 2016 
('bcsel', ('ige', b, 1), ('bcsel', ('ilt', ('iadd', a, b), a), intmax, ('iadd', a, b)), 2017 ('bcsel', ('ilt', a, ('iadd', a, b)), intmin, ('iadd', a, b))), 'options->lower_iadd_sat'), 2018 (('isub_sat@' + str(bit_size), a, b), 2019 ('bcsel', ('ilt', b, 0), ('bcsel', ('ilt', ('isub', a, b), a), intmax, ('isub', a, b)), 2020 ('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_iadd_sat'), 2021 ] 2022 2023 invert = OrderedDict([('feq', 'fneu'), ('fneu', 'feq')]) 2024 2025 for left, right in itertools.combinations_with_replacement(invert.keys(), 2): 2026 optimizations.append((('inot', ('ior(is_used_once)', (left, a, b), (right, c, d))), 2027 ('iand', (invert[left], a, b), (invert[right], c, d)))) 2028 optimizations.append((('inot', ('iand(is_used_once)', (left, a, b), (right, c, d))), 2029 ('ior', (invert[left], a, b), (invert[right], c, d)))) 2030 2031 # Optimize x2bN(b2x(x)) -> x 2032 for size in type_sizes('bool'): 2033 aN = 'a@' + str(size) 2034 f2bN = 'f2b' + str(size) 2035 i2bN = 'i2b' + str(size) 2036 optimizations.append(((f2bN, ('b2f', aN)), a)) 2037 optimizations.append(((i2bN, ('b2i', aN)), a)) 2038 2039 # Optimize x2yN(b2x(x)) -> b2y 2040 for x, y in itertools.product(['f', 'u', 'i'], ['f', 'u', 'i']): 2041 if x != 'f' and y != 'f' and x != y: 2042 continue 2043 2044 b2x = 'b2f' if x == 'f' else 'b2i' 2045 b2y = 'b2f' if y == 'f' else 'b2i' 2046 x2yN = '{}2{}'.format(x, y) 2047 optimizations.append(((x2yN, (b2x, a)), (b2y, a))) 2048 2049 # Optimize away x2xN(a@N) 2050 for t in ['int', 'uint', 'float', 'bool']: 2051 for N in type_sizes(t): 2052 x2xN = '{0}2{0}{1}'.format(t[0], N) 2053 aN = 'a@{0}'.format(N) 2054 optimizations.append(((x2xN, aN), a)) 2055 2056 # Optimize x2xN(y2yM(a@P)) -> y2yN(a) for integers 2057 # In particular, we can optimize away everything except upcast of downcast and 2058 # upcasts where the type differs from the other cast 2059 for N, M in itertools.product(type_sizes('uint'), type_sizes('uint')): 2060 if N < M: 2061 # The outer cast is a down-cast. It doesn't matter what the size of the 2062 # argument of the inner cast is because we'll never be in the upcast 2063 # of downcast case. Regardless of types, we'll always end up with y2yN 2064 # in the end. 2065 for x, y in itertools.product(['i', 'u'], ['i', 'u']): 2066 x2xN = '{0}2{0}{1}'.format(x, N) 2067 y2yM = '{0}2{0}{1}'.format(y, M) 2068 y2yN = '{0}2{0}{1}'.format(y, N) 2069 optimizations.append(((x2xN, (y2yM, a)), (y2yN, a))) 2070 elif N > M: 2071 # If the outer cast is an up-cast, we have to be more careful about the 2072 # size of the argument of the inner cast and with types. In this case, 2073 # the type is always the type of the up-cast, which is given by the 2074 # outer cast. 2075 for P in type_sizes('uint'): 2076 # We can't optimize away up-cast of down-cast.
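# e.g. u2u32(u2u16(a@32)) masks off the upper 16 bits of a, and
# i2i32(i2i16(a@32)) sign-extends from bit 15, so neither is simply 'a'.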
         if M < P:
            continue

         # Because we're doing an up-cast of an up-cast, the types always have
         # to match between the two casts
         for x in ['i', 'u']:
            x2xN = '{0}2{0}{1}'.format(x, N)
            x2xM = '{0}2{0}{1}'.format(x, M)
            aP = 'a@{0}'.format(P)
            optimizations.append(((x2xN, (x2xM, aP)), (x2xN, a)))
   else:
      # The N == M case is handled by other optimizations
      pass

# Downcast operations should be able to see through pack
for t in ['i', 'u']:
   for N in [8, 16, 32]:
      x2xN = '{0}2{0}{1}'.format(t, N)
      optimizations += [
         ((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)),
      ]

# Optimize comparisons with up-casts
for t in ['int', 'uint', 'float']:
   for N, M in itertools.product(type_sizes(t), repeat=2):
      if N == 1 or N >= M:
         continue

      cond = 'true'
      if N == 8:
         cond = 'options->support_8bit_alu'
      elif N == 16:
         cond = 'options->support_16bit_alu'
      x2xM = '{0}2{0}{1}'.format(t[0], M)
      x2xN = '{0}2{0}{1}'.format(t[0], N)
      aN = 'a@' + str(N)
      bN = 'b@' + str(N)
      xeq = 'feq' if t == 'float' else 'ieq'
      xne = 'fneu' if t == 'float' else 'ine'
      xge = '{0}ge'.format(t[0])
      xlt = '{0}lt'.format(t[0])

      # Up-casts are lossless so for correctly signed comparisons of
      # up-casted values we can do the comparison at the largest of the two
      # original sizes and drop one or both of the casts. (We have
      # optimizations to drop the no-op casts which this may generate.)
      for P in type_sizes(t):
         if P == 1 or P > N:
            continue

         bP = 'b@' + str(P)
         optimizations += [
            ((xeq, (x2xM, aN), (x2xM, bP)), (xeq, a, (x2xN, b)), cond),
            ((xne, (x2xM, aN), (x2xM, bP)), (xne, a, (x2xN, b)), cond),
            ((xge, (x2xM, aN), (x2xM, bP)), (xge, a, (x2xN, b)), cond),
            ((xlt, (x2xM, aN), (x2xM, bP)), (xlt, a, (x2xN, b)), cond),
            ((xge, (x2xM, bP), (x2xM, aN)), (xge, (x2xN, b), a), cond),
            ((xlt, (x2xM, bP), (x2xM, aN)), (xlt, (x2xN, b), a), cond),
         ]

      # The next bit doesn't work on floats because the range checks would
      # get way too complicated.
      if t in ['int', 'uint']:
         if t == 'int':
            xN_min = -(1 << (N - 1))
            xN_max = (1 << (N - 1)) - 1
         elif t == 'uint':
            xN_min = 0
            xN_max = (1 << N) - 1
         else:
            assert False

         # If we're up-casting and comparing to a constant, we can unfold
         # the comparison into a comparison with the shrunk down constant
         # and a check that the constant fits in the smaller bit size.
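         # For example (illustrative), with t == 'uint', N == 8 and M == 32 the
         # pattern ieq(u2u32(a@8), #b) becomes
         # iand(ieq(a, u2u8(b)), ieq(u2u32(u2u8(b)), b)); the second term folds
         # to false whenever b does not fit in 8 bits (e.g. b == 300 gives
         # u2u8(b) == 44 and u2u32(44) != 300).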
         optimizations += [
            ((xeq, (x2xM, aN), '#b'),
             ('iand', (xeq, a, (x2xN, b)), (xeq, (x2xM, (x2xN, b)), b)), cond),
            ((xne, (x2xM, aN), '#b'),
             ('ior', (xne, a, (x2xN, b)), (xne, (x2xM, (x2xN, b)), b)), cond),
            ((xlt, (x2xM, aN), '#b'),
             ('iand', (xlt, xN_min, b),
                      ('ior', (xlt, xN_max, b), (xlt, a, (x2xN, b)))), cond),
            ((xlt, '#a', (x2xM, bN)),
             ('iand', (xlt, a, xN_max),
                      ('ior', (xlt, a, xN_min), (xlt, (x2xN, a), b))), cond),
            ((xge, (x2xM, aN), '#b'),
             ('iand', (xge, xN_max, b),
                      ('ior', (xge, xN_min, b), (xge, a, (x2xN, b)))), cond),
            ((xge, '#a', (x2xM, bN)),
             ('iand', (xge, a, xN_min),
                      ('ior', (xge, a, xN_max), (xge, (x2xN, a), b))), cond),
         ]

# Convert masking followed by signed downcast to just unsigned downcast
optimizations += [
   (('i2i32', ('iand', 'a@64', 0xffffffff)), ('u2u32', a)),
   (('i2i16', ('iand', 'a@32', 0xffff)), ('u2u16', a)),
   (('i2i16', ('iand', 'a@64', 0xffff)), ('u2u16', a)),
   (('i2i8', ('iand', 'a@16', 0xff)), ('u2u8', a)),
   (('i2i8', ('iand', 'a@32', 0xff)), ('u2u8', a)),
   (('i2i8', ('iand', 'a@64', 0xff)), ('u2u8', a)),
]

# Some operations such as iadd have the property that the bottom N bits of the
# output only depend on the bottom N bits of each of the inputs so we can
# remove casts
for N in [16, 32]:
   for M in [8, 16]:
      if M >= N:
         continue

      aN = 'a@' + str(N)
      u2uM = 'u2u{0}'.format(M)
      i2iM = 'i2i{0}'.format(M)

      for x in ['u', 'i']:
         x2xN = '{0}2{0}{1}'.format(x, N)
         extract_xM = 'extract_{0}{1}'.format(x, M)

         x2xN_M_bits = '{0}(only_lower_{1}_bits_used)'.format(x2xN, M)
         extract_xM_M_bits = \
            '{0}(only_lower_{1}_bits_used)'.format(extract_xM, M)
         optimizations += [
            ((x2xN_M_bits, (u2uM, aN)), a),
            ((extract_xM_M_bits, aN, 0), a),
         ]

         bcsel_M_bits = 'bcsel(only_lower_{0}_bits_used)'.format(M)
         optimizations += [
            ((bcsel_M_bits, c, (x2xN, (u2uM, aN)), b), ('bcsel', c, a, b)),
            ((bcsel_M_bits, c, (x2xN, (i2iM, aN)), b), ('bcsel', c, a, b)),
            ((bcsel_M_bits, c, (extract_xM, aN, 0), b), ('bcsel', c, a, b)),
         ]

         for op in ['iadd', 'imul', 'iand', 'ior', 'ixor']:
            op_M_bits = '{0}(only_lower_{1}_bits_used)'.format(op, M)
            optimizations += [
               ((op_M_bits, (x2xN, (u2uM, aN)), b), (op, a, b)),
               ((op_M_bits, (x2xN, (i2iM, aN)), b), (op, a, b)),
               ((op_M_bits, (extract_xM, aN, 0), b), (op, a, b)),
            ]

def fexp2i(exp, bits):
   # Generate an expression which constructs value 2.0^exp or 0.0.
   #
   # We assume that exp is already in a valid range:
   #
   # * [-15, 15] for 16-bit float
   # * [-127, 127] for 32-bit float
   # * [-1023, 1023] for 64-bit float
   #
   # If exp is the lowest value in the valid range, a value of 0.0 is
   # constructed. Otherwise, the value 2.0^exp is constructed.
   if bits == 16:
      return ('i2i16', ('ishl', ('iadd', exp, 15), 10))
   elif bits == 32:
      return ('ishl', ('iadd', exp, 127), 23)
   elif bits == 64:
      return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20))
   else:
      assert False

def ldexp(f, exp, bits):
   # The maximum possible range for a normal exponent is [-126, 127] and,
   # throwing in denormals, you get a maximum range of [-149, 127]. This
   # means that we can potentially have a swing of +-276.
If you start with 2245 # FLT_MAX, you actually have to do ldexp(FLT_MAX, -278) to get it to flush 2246 # all the way to zero. The GLSL spec only requires that we handle a subset 2247 # of this range. From version 4.60 of the spec: 2248 # 2249 # "If exp is greater than +128 (single-precision) or +1024 2250 # (double-precision), the value returned is undefined. If exp is less 2251 # than -126 (single-precision) or -1022 (double-precision), the value 2252 # returned may be flushed to zero. Additionally, splitting the value 2253 # into a significand and exponent using frexp() and then reconstructing 2254 # a floating-point value using ldexp() should yield the original input 2255 # for zero and all finite non-denormalized values." 2256 # 2257 # The SPIR-V spec has similar language. 2258 # 2259 # In order to handle the maximum value +128 using the fexp2i() helper 2260 # above, we have to split the exponent in half and do two multiply 2261 # operations. 2262 # 2263 # First, we clamp exp to a reasonable range. Specifically, we clamp to 2264 # twice the full range that is valid for the fexp2i() function above. If 2265 # exp/2 is the bottom value of that range, the fexp2i() expression will 2266 # yield 0.0f which, when multiplied by f, will flush it to zero which is 2267 # allowed by the GLSL and SPIR-V specs for low exponent values. If the 2268 # value is clamped from above, then it must have been above the supported 2269 # range of the GLSL built-in and therefore any return value is acceptable. 2270 if bits == 16: 2271 exp = ('imin', ('imax', exp, -30), 30) 2272 elif bits == 32: 2273 exp = ('imin', ('imax', exp, -254), 254) 2274 elif bits == 64: 2275 exp = ('imin', ('imax', exp, -2046), 2046) 2276 else: 2277 assert False 2278 2279 # Now we compute two powers of 2, one for exp/2 and one for exp-exp/2. 2280 # (We use ishr which isn't the same for -1, but the -1 case still works 2281 # since we use exp-exp/2 as the second exponent.) While the spec 2282 # technically defines ldexp as f * 2.0^exp, simply multiplying once doesn't 2283 # work with denormals and doesn't allow for the full swing in exponents 2284 # that you can get with normalized values. Instead, we create two powers 2285 # of two and multiply by them each in turn. That way the effective range 2286 # of our exponent is doubled. 
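   # As an informal worked example with bits == 32: for exp == 200 (inside the
   # clamped range), exp >> 1 == 100 and exp - (exp >> 1) == 100, so the
   # result is (f * 2^100) * 2^100 == f * 2^200 even though 2^200 by itself is
   # not representable as a finite 32-bit float. For a negative odd value such
   # as exp == -5, ishr gives -3 and the second exponent is -5 - (-3) == -2,
   # so the product is still f * 2^-5.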
2287 pow2_1 = fexp2i(('ishr', exp, 1), bits) 2288 pow2_2 = fexp2i(('isub', exp, ('ishr', exp, 1)), bits) 2289 return ('fmul', ('fmul', f, pow2_1), pow2_2) 2290 2291optimizations += [ 2292 (('ldexp@16', 'x', 'exp'), ldexp('x', 'exp', 16), 'options->lower_ldexp'), 2293 (('ldexp@32', 'x', 'exp'), ldexp('x', 'exp', 32), 'options->lower_ldexp'), 2294 (('ldexp@64', 'x', 'exp'), ldexp('x', 'exp', 64), 'options->lower_ldexp'), 2295] 2296 2297# Unreal Engine 4 demo applications open-codes bitfieldReverse() 2298def bitfield_reverse_ue4(u): 2299 step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16)) 2300 step2 = ('ior', ('ishl', ('iand', step1, 0x00ff00ff), 8), ('ushr', ('iand', step1, 0xff00ff00), 8)) 2301 step3 = ('ior', ('ishl', ('iand', step2, 0x0f0f0f0f), 4), ('ushr', ('iand', step2, 0xf0f0f0f0), 4)) 2302 step4 = ('ior', ('ishl', ('iand', step3, 0x33333333), 2), ('ushr', ('iand', step3, 0xcccccccc), 2)) 2303 step5 = ('ior(many-comm-expr)', ('ishl', ('iand', step4, 0x55555555), 1), ('ushr', ('iand', step4, 0xaaaaaaaa), 1)) 2304 2305 return step5 2306 2307# Cyberpunk 2077 open-codes bitfieldReverse() 2308def bitfield_reverse_cp2077(u): 2309 step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16)) 2310 step2 = ('ior', ('iand', ('ishl', step1, 1), 0xaaaaaaaa), ('iand', ('ushr', step1, 1), 0x55555555)) 2311 step3 = ('ior', ('iand', ('ishl', step2, 2), 0xcccccccc), ('iand', ('ushr', step2, 2), 0x33333333)) 2312 step4 = ('ior', ('iand', ('ishl', step3, 4), 0xf0f0f0f0), ('iand', ('ushr', step3, 4), 0x0f0f0f0f)) 2313 step5 = ('ior(many-comm-expr)', ('iand', ('ishl', step4, 8), 0xff00ff00), ('iand', ('ushr', step4, 8), 0x00ff00ff)) 2314 2315 return step5 2316 2317optimizations += [(bitfield_reverse_ue4('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')] 2318optimizations += [(bitfield_reverse_cp2077('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')] 2319 2320# "all_equal(eq(a, b), vec(~0))" is the same as "all_equal(a, b)" 2321# "any_nequal(neq(a, b), vec(0))" is the same as "any_nequal(a, b)" 2322for ncomp in [2, 3, 4, 8, 16]: 2323 optimizations += [ 2324 (('ball_iequal' + str(ncomp), ('ieq', a, b), ~0), ('ball_iequal' + str(ncomp), a, b)), 2325 (('ball_iequal' + str(ncomp), ('feq', a, b), ~0), ('ball_fequal' + str(ncomp), a, b)), 2326 (('bany_inequal' + str(ncomp), ('ine', a, b), 0), ('bany_inequal' + str(ncomp), a, b)), 2327 (('bany_inequal' + str(ncomp), ('fneu', a, b), 0), ('bany_fnequal' + str(ncomp), a, b)), 2328 ] 2329 2330# For any float comparison operation, "cmp", if you have "a == a && a cmp b" 2331# then the "a == a" is redundant because it's equivalent to "a is not NaN" 2332# and, if a is a NaN then the second comparison will fail anyway. 2333for op in ['flt', 'fge', 'feq']: 2334 optimizations += [ 2335 (('iand', ('feq', a, a), (op, a, b)), ('!' + op, a, b)), 2336 (('iand', ('feq', a, a), (op, b, a)), ('!' + op, b, a)), 2337 ] 2338 2339# Add optimizations to handle the case where the result of a ternary is 2340# compared to a constant. This way we can take things like 2341# 2342# (a ? 0 : 1) > 0 2343# 2344# and turn it into 2345# 2346# a ? (0 > 0) : (1 > 0) 2347# 2348# which constant folding will eat for lunch. The resulting ternary will 2349# further get cleaned up by the boolean reductions above and we will be 2350# left with just the original variable "a". 
2351for op in ['feq', 'fneu', 'ieq', 'ine']: 2352 optimizations += [ 2353 ((op, ('bcsel', 'a', '#b', '#c'), '#d'), 2354 ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))), 2355 ] 2356 2357for op in ['flt', 'fge', 'ilt', 'ige', 'ult', 'uge']: 2358 optimizations += [ 2359 ((op, ('bcsel', 'a', '#b', '#c'), '#d'), 2360 ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))), 2361 ((op, '#d', ('bcsel', a, '#b', '#c')), 2362 ('bcsel', 'a', (op, 'd', 'b'), (op, 'd', 'c'))), 2363 ] 2364 2365 2366# For example, this converts things like 2367# 2368# 1 + mix(0, a - 1, condition) 2369# 2370# into 2371# 2372# mix(1, (a-1)+1, condition) 2373# 2374# Other optimizations will rearrange the constants. 2375for op in ['fadd', 'fmul', 'fmulz', 'iadd', 'imul']: 2376 optimizations += [ 2377 ((op, ('bcsel(is_used_once)', a, '#b', c), '#d'), ('bcsel', a, (op, b, d), (op, c, d))) 2378 ] 2379 2380# For derivatives in compute shaders, GLSL_NV_compute_shader_derivatives 2381# states: 2382# 2383# If neither layout qualifier is specified, derivatives in compute shaders 2384# return zero, which is consistent with the handling of built-in texture 2385# functions like texture() in GLSL 4.50 compute shaders. 2386for op in ['fddx', 'fddx_fine', 'fddx_coarse', 2387 'fddy', 'fddy_fine', 'fddy_coarse']: 2388 optimizations += [ 2389 ((op, 'a'), 0.0, 'info->stage == MESA_SHADER_COMPUTE && info->cs.derivative_group == DERIVATIVE_GROUP_NONE') 2390] 2391 2392# Some optimizations for ir3-specific instructions. 2393optimizations += [ 2394 # 'al * bl': If either 'al' or 'bl' is zero, return zero. 2395 (('umul_low', '#a(is_lower_half_zero)', 'b'), (0)), 2396 # '(ah * bl) << 16 + c': If either 'ah' or 'bl' is zero, return 'c'. 2397 (('imadsh_mix16', '#a@32(is_lower_half_zero)', 'b@32', 'c@32'), ('c')), 2398 (('imadsh_mix16', 'a@32', '#b@32(is_upper_half_zero)', 'c@32'), ('c')), 2399] 2400 2401# These kinds of sequences can occur after nir_opt_peephole_select. 2402# 2403# NOTE: fadd is not handled here because that gets in the way of ffma 2404# generation in the i965 driver. Instead, fadd and ffma are handled in 2405# late_optimizations. 
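# For instance (illustrative), a shader that computes
#
#    x = cond ? mix(u, v, s) : mix(u, v, t)
#
# reaches NIR as bcsel(cond, flrp(u, v, s), flrp(u, v, t)); the rules below
# rewrite it to flrp(u, v, bcsel(cond, s, t)) so that only one flrp remains.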
2406 2407for op in ['flrp']: 2408 optimizations += [ 2409 (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))), 2410 (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))), 2411 (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)), 2412 (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)), 2413 (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, e, c, d)), (op, ('bcsel', a, b, e), c, d)), 2414 (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', e, c, d)), (op, ('bcsel', a, b, e), c, d)), 2415 ] 2416 2417for op in ['fmulz', 'fmul', 'iadd', 'imul', 'iand', 'ior', 'ixor', 'fmin', 'fmax', 'imin', 'imax', 'umin', 'umax']: 2418 optimizations += [ 2419 (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))), 2420 (('bcsel', a, (op + '(is_used_once)', b, 'c(is_not_const)'), (op, b, d)), (op, b, ('bcsel', a, c, d))), 2421 (('bcsel', a, (op, b, 'c(is_not_const)'), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))), 2422 (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))), 2423 ] 2424 2425for op in ['fpow']: 2426 optimizations += [ 2427 (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))), 2428 (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))), 2429 (('bcsel', a, (op + '(is_used_once)', b, c), (op, d, c)), (op, ('bcsel', a, b, d), c)), 2430 (('bcsel', a, (op, b, c), (op + '(is_used_once)', d, c)), (op, ('bcsel', a, b, d), c)), 2431 ] 2432 2433for op in ['frcp', 'frsq', 'fsqrt', 'fexp2', 'flog2', 'fsign', 'fsin', 'fcos', 'fsin_amd', 'fcos_amd', 'fneg', 'fabs', 'fsign']: 2434 optimizations += [ 2435 (('bcsel', c, (op + '(is_used_once)', a), (op + '(is_used_once)', b)), (op, ('bcsel', c, a, b))), 2436 ] 2437 2438for op in ['ineg', 'iabs', 'inot', 'isign']: 2439 optimizations += [ 2440 ((op, ('bcsel', c, '#a', '#b')), ('bcsel', c, (op, a), (op, b))), 2441 ] 2442 2443optimizations.extend([ 2444 (('fisnormal', 'a@16'), ('ult', 0xfff, ('iadd', ('ishl', a, 1), 0x800)), 'options->lower_fisnormal'), 2445 (('fisnormal', 'a@32'), ('ult', 0x1ffffff, ('iadd', ('ishl', a, 1), 0x1000000)), 'options->lower_fisnormal'), 2446 (('fisnormal', 'a@64'), ('ult', 0x3fffffffffffff, ('iadd', ('ishl', a, 1), 0x20000000000000)), 'options->lower_fisnormal') 2447 ]) 2448 2449# This section contains optimizations to propagate downsizing conversions of 2450# constructed vectors into vectors of downsized components. Whether this is 2451# useful depends on the SIMD semantics of the backend. On a true SIMD machine, 2452# this reduces the register pressure of the vector itself and often enables the 2453# conversions to be eliminated via other algebraic rules or constant folding. 2454# In the worst case on a SIMD architecture, the propagated conversions may be 2455# revectorized via nir_opt_vectorize so instruction count is minimally 2456# impacted. 2457# 2458# On a machine with SIMD-within-a-register only, this actually 2459# counterintuitively hurts instruction count. These machines are the same that 2460# require vectorize_vec2_16bit, so we predicate the optimizations on that flag 2461# not being set. 2462# 2463# Finally for scalar architectures, there should be no difference in generated 2464# code since it all ends up scalarized at the end, but it might minimally help 2465# compile-times. 
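# Concretely, the rules generated below rewrite e.g. f2f16(vec2(a@32, b@32))
# into vec2(f2f16(a), f2f16(b)) so the conversions sit directly on the scalar
# sources.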
2466 2467for i in range(2, 4 + 1): 2468 for T in ('f', 'u', 'i'): 2469 vec_inst = ('vec' + str(i),) 2470 2471 indices = ['a', 'b', 'c', 'd'] 2472 suffix_in = tuple((indices[j] + '@32') for j in range(i)) 2473 2474 to_16 = '{}2{}16'.format(T, T) 2475 to_mp = '{}2{}mp'.format(T, T) 2476 2477 out_16 = tuple((to_16, indices[j]) for j in range(i)) 2478 out_mp = tuple((to_mp, indices[j]) for j in range(i)) 2479 2480 optimizations += [ 2481 ((to_16, vec_inst + suffix_in), vec_inst + out_16, '!options->vectorize_vec2_16bit'), 2482 ] 2483 # u2ump doesn't exist, because it's equal to i2imp 2484 if T in ['f', 'i']: 2485 optimizations += [ 2486 ((to_mp, vec_inst + suffix_in), vec_inst + out_mp, '!options->vectorize_vec2_16bit') 2487 ] 2488 2489# This section contains "late" optimizations that should be run before 2490# creating ffmas and calling regular optimizations for the final time. 2491# Optimizations should go here if they help code generation and conflict 2492# with the regular optimizations. 2493before_ffma_optimizations = [ 2494 # Propagate constants down multiplication chains 2495 (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fmul', ('fmul', a, c), b)), 2496 (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('imul', ('imul', a, c), b)), 2497 (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fadd', ('fadd', a, c), b)), 2498 (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('iadd', ('iadd', a, c), b)), 2499 2500 (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))), 2501 (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))), 2502 (('~fadd', ('fneg', a), a), 0.0), 2503 (('iadd', ('ineg', a), a), 0), 2504 (('iadd', ('ineg', a), ('iadd', a, b)), b), 2505 (('iadd', a, ('iadd', ('ineg', a), b)), b), 2506 (('~fadd', ('fneg', a), ('fadd', a, b)), b), 2507 (('~fadd', a, ('fadd', ('fneg', a), b)), b), 2508 2509 (('~flrp', ('fadd(is_used_once)', a, -1.0), ('fadd(is_used_once)', a, 1.0), d), ('fadd', ('flrp', -1.0, 1.0, d), a)), 2510 (('~flrp', ('fadd(is_used_once)', a, 1.0), ('fadd(is_used_once)', a, -1.0), d), ('fadd', ('flrp', 1.0, -1.0, d), a)), 2511 (('~flrp', ('fadd(is_used_once)', a, '#b'), ('fadd(is_used_once)', a, '#c'), d), ('fadd', ('fmul', d, ('fadd', c, ('fneg', b))), ('fadd', a, b))), 2512] 2513 2514# This section contains "late" optimizations that should be run after the 2515# regular optimizations have finished. Optimizations should go here if 2516# they help code generation but do not necessarily produce code that is 2517# more easily optimizable. 2518late_optimizations = [ 2519 # The rearrangements are fine w.r.t. NaN. However, they produce incorrect 2520 # results if one operand is +Inf and the other is -Inf. 2521 # 2522 # 1. Inf + -Inf = NaN 2523 # 2. ∀x: x + NaN = NaN and x - NaN = NaN 2524 # 3. ∀x: x != NaN = true 2525 # 4. 
∀x, ∀ cmp ∈ {<, >, ≤, ≥, =}: x cmp NaN = false 2526 # 2527 # a=Inf, b=-Inf a=-Inf, b=Inf a=NaN b=NaN 2528 # (a+b) < 0 false false false false 2529 # a < -b false false false false 2530 # -(a+b) < 0 false false false false 2531 # -a < b false false false false 2532 # (a+b) >= 0 false false false false 2533 # a >= -b true true false false 2534 # -(a+b) >= 0 false false false false 2535 # -a >= b true true false false 2536 # (a+b) == 0 false false false false 2537 # a == -b true true false false 2538 # (a+b) != 0 true true true true 2539 # a != -b false false true true 2540 (('flt', ('fadd(is_used_once)', a, b), 0.0), ('flt', a, ('fneg', b))), 2541 (('flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('flt', ('fneg', a), b)), 2542 (('flt', 0.0, ('fadd(is_used_once)', a, b) ), ('flt', ('fneg', a), b)), 2543 (('flt', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('flt', a, ('fneg', b))), 2544 (('~fge', ('fadd(is_used_once)', a, b), 0.0), ('fge', a, ('fneg', b))), 2545 (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('fge', ('fneg', a), b)), 2546 (('~fge', 0.0, ('fadd(is_used_once)', a, b) ), ('fge', ('fneg', a), b)), 2547 (('~fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fge', a, ('fneg', b))), 2548 (('~feq', ('fadd(is_used_once)', a, b), 0.0), ('feq', a, ('fneg', b))), 2549 (('~fneu', ('fadd(is_used_once)', a, b), 0.0), ('fneu', a, ('fneg', b))), 2550 2551 # If either source must be finite, then the original (a+b) cannot produce 2552 # NaN due to Inf-Inf. The patterns and the replacements produce the same 2553 # result if b is NaN. Therefore, the replacements are exact. 2554 (('fge', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('fge', a, ('fneg', b))), 2555 (('fge', ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b)), 0.0), ('fge', ('fneg', a), b)), 2556 (('fge', 0.0, ('fadd(is_used_once)', 'a(is_finite)', b) ), ('fge', ('fneg', a), b)), 2557 (('fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b))), ('fge', a, ('fneg', b))), 2558 (('feq', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('feq', a, ('fneg', b))), 2559 (('fneu', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('fneu', a, ('fneg', b))), 2560 2561 # This is how SpvOpFOrdNotEqual might be implemented. Replace it with 2562 # SpvOpLessOrGreater. 2563 (('iand', ('fneu', a, b), ('iand', ('feq', a, a), ('feq', b, b))), ('ior', ('!flt', a, b), ('!flt', b, a))), 2564 (('iand', ('fneu', a, 0.0), ('feq', a, a) ), ('!flt', 0.0, ('fabs', a))), 2565 2566 # This is how SpvOpFUnordEqual might be implemented. Replace it with 2567 # !SpvOpLessOrGreater. 2568 (('ior', ('feq', a, b), ('ior', ('fneu', a, a), ('fneu', b, b))), ('inot', ('ior', ('!flt', a, b), ('!flt', b, a)))), 2569 (('ior', ('feq', a, 0.0), ('fneu', a, a), ), ('inot', ('!flt', 0.0, ('fabs', a)))), 2570 2571 # nir_lower_to_source_mods will collapse this, but its existence during the 2572 # optimization loop can prevent other optimizations. 2573 (('fneg', ('fneg', a)), a), 2574 2575 # re-combine inexact mul+add to ffma. Do this before fsub so that a * b - c 2576 # gets combined to fma(a, b, -c). 
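   # (If the fsub recombination below were applied first, fadd(fmul(a, b),
   # fneg(c)) would already have become fsub(fmul(a, b), c), which the ~fadd
   # patterns here no longer match.)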
   (('~fadd@16', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma16'),
   (('~fadd@32', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma32'),
   (('~fadd@64', ('fmul', a, b), c), ('ffma', a, b, c), 'options->fuse_ffma64'),
   (('~fadd@32', ('fmulz', a, b), c), ('ffmaz', a, b, c), 'options->fuse_ffma32'),

   # Subtractions get lowered during optimization, so we need to recombine them
   (('fadd@8', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'),
   (('fadd@16', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'),
   (('fadd@32', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'),
   (('fadd@64', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub && !(options->lower_doubles_options & nir_lower_dsub)'),

   (('fneg', a), ('fmul', a, -1.0), 'options->lower_fneg'),
   (('iadd', a, ('ineg', 'b')), ('isub', 'a', 'b'), 'options->has_isub || options->lower_ineg'),
   (('ineg', a), ('isub', 0, a), 'options->lower_ineg'),
   (('iabs', a), ('imax', a, ('ineg', a)), 'options->lower_iabs'),

   (('iadd', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, b, c), 'options->has_iadd3'),
   (('iadd', ('isub(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, ('ineg', b), c), 'options->has_iadd3'),
   (('isub', ('isub(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, ('ineg', b), ('ineg', c)), 'options->has_iadd3'),

   # fneg_lo / fneg_hi
   (('vec2(is_only_used_as_float)', ('fneg@16', a), b), ('fmul', ('vec2', a, b), ('vec2', -1.0, 1.0)), 'options->vectorize_vec2_16bit'),
   (('vec2(is_only_used_as_float)', a, ('fneg@16', b)), ('fmul', ('vec2', a, b), ('vec2', 1.0, -1.0)), 'options->vectorize_vec2_16bit'),

   # These are duplicated from the main optimizations table. The late
   # patterns that rearrange expressions like x - .5 < 0 to x < .5 can create
   # new patterns like these. The patterns that compare with zero are removed
   # because they are unlikely to be created by anything in
   # late_optimizations.
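   # For example, fge(fadd(fsat(a), -0.5), 0.0) can be rewritten above to
   # fge(fsat(a), 0.5), which the rules below then reduce to fge(a, 0.5).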
2606 (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)), 2607 (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)), 2608 (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)), 2609 (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)), 2610 2611 (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)), 2612 2613 (('~fge', ('fmin(is_used_once)', ('fadd(is_used_once)', a, b), ('fadd', c, d)), 0.0), ('iand', ('fge', a, ('fneg', b)), ('fge', c, ('fneg', d)))), 2614 2615 (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)), 2616 (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)), 2617 (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)), 2618 (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)), 2619 (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)), 2620 (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)), 2621 (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)), 2622 (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)), 2623 (('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)), 2624 (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)), 2625 2626 (('ior', a, a), a), 2627 (('iand', a, a), a), 2628 2629 (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))), 2630 2631 (('fdot2', a, b), ('fdot2_replicated', a, b), 'options->fdot_replicates'), 2632 (('fdot3', a, b), ('fdot3_replicated', a, b), 'options->fdot_replicates'), 2633 (('fdot4', a, b), ('fdot4_replicated', a, b), 'options->fdot_replicates'), 2634 (('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'), 2635 2636 (('~flrp', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)), 2637 2638 # Approximate handling of fround_even for DX9 addressing from gallium nine on 2639 # DX9-class hardware with no proper fround support. This is in 2640 # late_optimizations so that the is_integral() opts in the main pass get a 2641 # chance to eliminate the fround_even first. 2642 (('fround_even', a), ('bcsel', 2643 ('feq', ('ffract', a), 0.5), 2644 ('fadd', ('ffloor', ('fadd', a, 0.5)), 1.0), 2645 ('ffloor', ('fadd', a, 0.5))), 'options->lower_fround_even'), 2646 2647 # A similar operation could apply to any ffma(#a, b, #(-a/2)), but this 2648 # particular operation is common for expanding values stored in a texture 2649 # from [0,1] to [-1,1]. 
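   # The identity being used here: flrp(-1.0, 1.0, a) == -1.0*(1 - a) + 1.0*a
   # == 2.0*a - 1.0, which is exactly ffma(a, 2.0, -1.0).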
   (('~ffma@32', a, 2.0, -1.0), ('flrp', -1.0, 1.0, a ), '!options->lower_flrp32'),
   (('~ffma@32', a, -2.0, -1.0), ('flrp', -1.0, 1.0, ('fneg', a)), '!options->lower_flrp32'),
   (('~ffma@32', a, -2.0, 1.0), ('flrp', 1.0, -1.0, a ), '!options->lower_flrp32'),
   (('~ffma@32', a, 2.0, 1.0), ('flrp', 1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'),
   (('~fadd@32', ('fmul(is_used_once)', 2.0, a), -1.0), ('flrp', -1.0, 1.0, a ), '!options->lower_flrp32'),
   (('~fadd@32', ('fmul(is_used_once)', -2.0, a), -1.0), ('flrp', -1.0, 1.0, ('fneg', a)), '!options->lower_flrp32'),
   (('~fadd@32', ('fmul(is_used_once)', -2.0, a), 1.0), ('flrp', 1.0, -1.0, a ), '!options->lower_flrp32'),
   (('~fadd@32', ('fmul(is_used_once)', 2.0, a), 1.0), ('flrp', 1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'),

   # flrp(a, b, a)
   # a*(1-a) + b*a
   # a + -a*a + a*b (1)
   # a + a*(b - a)
   # Option 1: ffma(a, (b-a), a)
   #
   # Alternately, after (1):
   # a*(1+b) + -a*a
   # a*((1+b) + -a)
   #
   # Let b=1
   #
   # Option 2: ffma(a, 2, -(a*a))
   # Option 3: ffma(a, 2, (-a)*a)
   # Option 4: ffma(a, -a, (2*a))
   # Option 5: a * (2 - a)
   #
   # There are a lot of other possible combinations.
   (('~ffma@32', ('fadd', b, ('fneg', a)), a, a), ('flrp', a, b, a), '!options->lower_flrp32'),
   (('~ffma@32', a, 2.0, ('fneg', ('fmul', a, a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
   (('~ffma@32', a, 2.0, ('fmul', ('fneg', a), a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
   (('~ffma@32', a, ('fneg', a), ('fmul', 2.0, a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'),
   (('~fmul@32', a, ('fadd', 2.0, ('fneg', a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'),

   # we do these late so that we don't get in the way of creating ffmas
   (('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))),
   (('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))),

   # Putting this in 'optimizations' interferes with the bcsel(a, op(b, c),
   # op(b, d)) => op(b, bcsel(a, c, d)) transformations. I do not know why.
   (('bcsel', ('feq', ('fsqrt', 'a(is_not_negative)'), 0.0), intBitsToFloat(0x7f7fffff), ('frsq', a)),
    ('fmin', ('frsq', a), intBitsToFloat(0x7f7fffff))),

   # Things that look like DPH in the source shader may get expanded to
   # something that looks like dot(v1.xyz, v2.xyz) + v1.w by the time it gets
   # to NIR. After FFMA is generated, this can look like:
   #
   # fadd(ffma(v1.z, v2.z, ffma(v1.y, v2.y, fmul(v1.x, v2.x))), v1.w)
   #
   # Reassociate the last addition into the first multiplication.
   #
   # Some shaders do not use 'invariant' in vertex and (possibly) geometry
   # shader stages on some outputs that are intended to be invariant. For
   # various reasons, this optimization may not be fully applied in all
   # shaders used for different rendering passes of the same geometry. This
   # can result in Z-fighting artifacts (at best). For now, disable this
   # optimization in these stages. See bugzilla #111490. In tessellation
   # stages applications seem to use 'precise' when necessary, so allow the
   # optimization in those stages.
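   # In the example above, this turns
   #
   #    fadd(ffma(v1.z, v2.z, ffma(v1.y, v2.y, fmul(v1.x, v2.x))), v1.w)
   #
   # into
   #
   #    ffma(v1.z, v2.z, ffma(v1.y, v2.y, ffma(v1.x, v2.x, v1.w)))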
   (('~fadd', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'),
    ('ffma', a, b, ('ffma', c, d, ('ffma', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
   (('~fadd', ('ffma(is_used_once)', a, b, ('fmul(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'),
    ('ffma', a, b, ('ffma', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
   (('~fadd', ('fneg', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'),
    ('ffma', ('fneg', a), b, ('ffma', ('fneg', c), d, ('ffma', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),

   (('~fadd', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'),
    ('ffmaz', a, b, ('ffmaz', c, d, ('ffmaz', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
   (('~fadd', ('ffmaz(is_used_once)', a, b, ('fmulz(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'),
    ('ffmaz', a, b, ('ffmaz', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),
   (('~fadd', ('fneg', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'),
    ('ffmaz', ('fneg', a), b, ('ffmaz', ('fneg', c), d, ('ffmaz', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'),

   # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says:
   #
   # If bits is zero, the result will be zero.
   #
   # These prevent the next two lowerings generating incorrect results when
   # count is zero.
   (('ubfe', a, b, 0), 0),
   (('ibfe', a, b, 0), 0),

   # On Intel GPUs, BFE is a 3-source instruction. Like all 3-source
   # instructions on Intel GPUs, it cannot have immediate values as
   # sources. There are also limitations on source register strides. As a
   # result, it is very easy for a 3-source instruction combined with either
   # loads of immediate values or copies from weird register strides to be
   # more expensive than the primitive instructions it represents.
   (('ubfe', a, '#b', '#c'), ('iand', ('ushr', 0xffffffff, ('ineg', c)), ('ushr', a, b)), 'options->avoid_ternary_with_two_constants'),

   # b is the lowest order bit to be extracted and c is the number of bits to
   # extract. The inner shift removes the bits above b + c by shifting left
   # 32 - (b + c). ishl only sees the low 5 bits of the shift count, which is
   # -(b + c). The outer shift moves the bit that was at b to bit zero.
   # After the first shift, that bit is now at b + (32 - (b + c)) or 32 - c.
   # This means that it must be shifted right by 32 - c or -c bits.
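   # Worked example: with b == 8 and c == 4 on a 32-bit source, the inner
   # shift count is -(8 + 4) == -12, i.e. 20 after masking to 5 bits, which
   # moves bit 11 (the top bit of the field) up to bit 31; the outer ishr
   # count is -4, i.e. 28, which brings the field back down to bits 3..0 with
   # sign extension.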
2745 (('ibfe', a, '#b', '#c'), ('ishr', ('ishl', a, ('ineg', ('iadd', b, c))), ('ineg', c)), 'options->avoid_ternary_with_two_constants'), 2746 2747 # Clean up no-op shifts that may result from the bfe lowerings. 2748 (('ishl', a, 0), a), 2749 (('ishl', a, -32), a), 2750 (('ishr', a, 0), a), 2751 (('ishr', a, -32), a), 2752 (('ushr', a, 0), a), 2753 2754 (('extract_i8', ('extract_i8', a, b), 0), ('extract_i8', a, b)), 2755 (('extract_i8', ('extract_u8', a, b), 0), ('extract_i8', a, b)), 2756 (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)), 2757 (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)), 2758] 2759 2760# A few more extract cases we'd rather leave late 2761for N in [16, 32]: 2762 aN = 'a@{0}'.format(N) 2763 u2uM = 'u2u{0}'.format(M) 2764 i2iM = 'i2i{0}'.format(M) 2765 2766 for x in ['u', 'i']: 2767 x2xN = '{0}2{0}{1}'.format(x, N) 2768 extract_x8 = 'extract_{0}8'.format(x) 2769 extract_x16 = 'extract_{0}16'.format(x) 2770 2771 late_optimizations.extend([ 2772 ((x2xN, ('u2u8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'), 2773 ((x2xN, ('i2i8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'), 2774 ]) 2775 2776 if N > 16: 2777 late_optimizations.extend([ 2778 ((x2xN, ('u2u16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'), 2779 ((x2xN, ('i2i16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'), 2780 ]) 2781 2782# Byte insertion 2783late_optimizations.extend([(('ishl', ('extract_u8', 'a@32', 0), 8 * i), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)]) 2784late_optimizations.extend([(('iand', ('ishl', 'a@32', 8 * i), 0xff << (8 * i)), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)]) 2785late_optimizations.append((('ishl', 'a@32', 24), ('insert_u8', a, 3), '!options->lower_insert_byte')) 2786 2787late_optimizations += [ 2788 # Word insertion 2789 (('ishl', 'a@32', 16), ('insert_u16', a, 1), '!options->lower_insert_word'), 2790 2791 # Extract and then insert 2792 (('insert_u8', ('extract_u8', 'a', 0), b), ('insert_u8', a, b)), 2793 (('insert_u16', ('extract_u16', 'a', 0), b), ('insert_u16', a, b)), 2794] 2795 2796# Integer sizes 2797for s in [8, 16, 32, 64]: 2798 late_optimizations.extend([ 2799 (('iand', ('ine(is_used_once)', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umin', a, b), 0)), 2800 (('ior', ('ieq(is_used_once)', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umin', a, b), 0)), 2801 ]) 2802 2803# Float sizes 2804for s in [16, 32, 64]: 2805 late_optimizations.extend([ 2806 (('~fadd@{}'.format(s), 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp{}'.format(s)), 2807 (('bcsel', a, 0, ('b2f{}'.format(s), ('inot', 'b@bool'))), ('b2f{}'.format(s), ('inot', ('ior', a, b)))), 2808 ]) 2809 2810for op in ['fadd']: 2811 late_optimizations += [ 2812 (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))), 2813 (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))), 2814 ] 2815 2816for op in ['ffma', 'ffmaz']: 2817 late_optimizations += [ 2818 (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))), 2819 (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))), 2820 2821 (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)), 2822 (('bcsel', a, (op, b, c, d), (op + 
'(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)),
   ]

# mediump: If an opcode is surrounded by conversions, remove the conversions.
# The rationale is that type conversions + the low precision opcode are more
# expensive than the same arithmetic opcode at higher precision.
#
# This must be done in late optimizations, because we need normal optimizations to
# first eliminate temporary up-conversions such as in op1(f2fmp(f2f32(op2()))).
#
# Unary opcodes
for op in ['fabs', 'fceil', 'fcos', 'fddx', 'fddx_coarse', 'fddx_fine', 'fddy',
           'fddy_coarse', 'fddy_fine', 'fexp2', 'ffloor', 'ffract', 'flog2', 'fneg',
           'frcp', 'fround_even', 'frsq', 'fsat', 'fsign', 'fsin', 'fsqrt']:
   late_optimizations += [(('~f2f32', (op, ('f2fmp', a))), (op, a))]

# Binary opcodes
for op in ['fadd', 'fdiv', 'fmax', 'fmin', 'fmod', 'fmul', 'fpow', 'frem']:
   late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b))), (op, a, b))]

# Ternary opcodes
for op in ['ffma', 'flrp']:
   late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b), ('f2fmp', c))), (op, a, b, c))]

# Comparison opcodes
for op in ['feq', 'fge', 'flt', 'fneu']:
   late_optimizations += [(('~' + op, ('f2fmp', a), ('f2fmp', b)), (op, a, b))]

# Do this last, so that the f2fmp patterns above have effect.
late_optimizations += [
   # Convert *2*mp instructions to concrete *2*16 instructions. At this point
   # any conversions that could have been removed will have been removed in
   # nir_opt_algebraic so any remaining ones are required.
   (('f2fmp', a), ('f2f16', a)),
   (('f2imp', a), ('f2i16', a)),
   (('f2ump', a), ('f2u16', a)),
   (('i2imp', a), ('i2i16', a)),
   (('i2fmp', a), ('i2f16', a)),
   (('i2imp', a), ('u2u16', a)),
   (('u2fmp', a), ('u2f16', a)),
   (('fisfinite', a), ('flt', ('fabs', a), float("inf"))),
]

distribute_src_mods = [
   # Try to remove some spurious negations rather than pushing them down.
   (('fmul', ('fneg', a), ('fneg', b)), ('fmul', a, b)),
   (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)),
   (('fdot2_replicated', ('fneg', a), ('fneg', b)), ('fdot2_replicated', a, b)),
   (('fdot3_replicated', ('fneg', a), ('fneg', b)), ('fdot3_replicated', a, b)),
   (('fdot4_replicated', ('fneg', a), ('fneg', b)), ('fdot4_replicated', a, b)),
   (('fneg', ('fneg', a)), a),

   (('fneg', ('fmul(is_used_once)', a, b)), ('fmul', ('fneg', a), b)),
   (('fabs', ('fmul(is_used_once)', a, b)), ('fmul', ('fabs', a), ('fabs', b))),

   (('fneg', ('ffma(is_used_once)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))),
   (('fneg', ('flrp(is_used_once)', a, b, c)), ('flrp', ('fneg', a), ('fneg', b), c)),
   (('fneg', ('~fadd(is_used_once)', a, b)), ('fadd', ('fneg', a), ('fneg', b))),

   # Note that fmin <-> fmax. I don't think there is a way to distribute
   # fabs() into fmin or fmax.
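   # (Counterexamples: with a == -3.0, b == 1.0, fabs(fmin(a, b)) == 3.0 but
   # fmin(fabs(a), fabs(b)) == 1.0; with a == -1.0, b == 2.0 it is 1.0 while
   # fmax(fabs(a), fabs(b)) == 2.0.)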
2883 (('fneg', ('fmin(is_used_once)', a, b)), ('fmax', ('fneg', a), ('fneg', b))), 2884 (('fneg', ('fmax(is_used_once)', a, b)), ('fmin', ('fneg', a), ('fneg', b))), 2885 2886 (('fneg', ('fdot2_replicated(is_used_once)', a, b)), ('fdot2_replicated', ('fneg', a), b)), 2887 (('fneg', ('fdot3_replicated(is_used_once)', a, b)), ('fdot3_replicated', ('fneg', a), b)), 2888 (('fneg', ('fdot4_replicated(is_used_once)', a, b)), ('fdot4_replicated', ('fneg', a), b)), 2889 2890 # fdph works mostly like fdot, but to get the correct result, the negation 2891 # must be applied to the second source. 2892 (('fneg', ('fdph_replicated(is_used_once)', a, b)), ('fdph_replicated', a, ('fneg', b))), 2893 2894 (('fneg', ('fsign(is_used_once)', a)), ('fsign', ('fneg', a))), 2895 (('fabs', ('fsign(is_used_once)', a)), ('fsign', ('fabs', a))), 2896] 2897 2898print(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render()) 2899print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma", 2900 before_ffma_optimizations).render()) 2901print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_late", 2902 late_optimizations).render()) 2903print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_distribute_src_mods", 2904 distribute_src_mods).render()) 2905