1cb93a386Sopenharmony_ci/*
2cb93a386Sopenharmony_ci * Copyright 2019 Google LLC
3cb93a386Sopenharmony_ci *
4cb93a386Sopenharmony_ci * Use of this source code is governed by a BSD-style license that can be
5cb93a386Sopenharmony_ci * found in the LICENSE file.
6cb93a386Sopenharmony_ci */
7cb93a386Sopenharmony_ci
8cb93a386Sopenharmony_ci#include "include/core/SkStream.h"
9cb93a386Sopenharmony_ci#include "include/core/SkString.h"
10cb93a386Sopenharmony_ci#include "include/private/SkHalf.h"
11cb93a386Sopenharmony_ci#include "include/private/SkTFitsIn.h"
12cb93a386Sopenharmony_ci#include "include/private/SkThreadID.h"
13cb93a386Sopenharmony_ci#include "src/core/SkColorSpacePriv.h"
14cb93a386Sopenharmony_ci#include "src/core/SkColorSpaceXformSteps.h"
15cb93a386Sopenharmony_ci#include "src/core/SkCpu.h"
16cb93a386Sopenharmony_ci#include "src/core/SkEnumerate.h"
17cb93a386Sopenharmony_ci#include "src/core/SkOpts.h"
18cb93a386Sopenharmony_ci#include "src/core/SkVM.h"
19cb93a386Sopenharmony_ci#include <algorithm>
20cb93a386Sopenharmony_ci#include <atomic>
21cb93a386Sopenharmony_ci#include <queue>
22cb93a386Sopenharmony_ci
23cb93a386Sopenharmony_ci#if defined(SKVM_LLVM)
24cb93a386Sopenharmony_ci    #include <future>
25cb93a386Sopenharmony_ci    #include <llvm/Bitcode/BitcodeWriter.h>
26cb93a386Sopenharmony_ci    #include <llvm/ExecutionEngine/ExecutionEngine.h>
27cb93a386Sopenharmony_ci    #include <llvm/IR/IRBuilder.h>
28cb93a386Sopenharmony_ci    #include <llvm/IR/Verifier.h>
29cb93a386Sopenharmony_ci    #include <llvm/Support/TargetSelect.h>
30cb93a386Sopenharmony_ci    #include <llvm/Support/Host.h>
31cb93a386Sopenharmony_ci
32cb93a386Sopenharmony_ci    // Platform-specific intrinsics got their own files in LLVM 10.
33cb93a386Sopenharmony_ci    #if __has_include(<llvm/IR/IntrinsicsX86.h>)
34cb93a386Sopenharmony_ci        #include <llvm/IR/IntrinsicsX86.h>
35cb93a386Sopenharmony_ci    #endif
36cb93a386Sopenharmony_ci#endif
37cb93a386Sopenharmony_ci
38cb93a386Sopenharmony_ci// #define SKVM_LLVM_WAIT_FOR_COMPILATION
39cb93a386Sopenharmony_ci
// Process-wide JIT switches; both start out disabled.
// NOTE(review): going by the names, these gate whether JIT is used at all and whether it is
// routed through a dylib -- confirm against the code that reads them.
bool gSkVMAllowJIT    = false;
bool gSkVMJITViaDylib = false;
42cb93a386Sopenharmony_ci
43cb93a386Sopenharmony_ci#if defined(SKVM_JIT)
44cb93a386Sopenharmony_ci    #if defined(SK_BUILD_FOR_WIN)
45cb93a386Sopenharmony_ci        #include "src/core/SkLeanWindows.h"
46cb93a386Sopenharmony_ci        #include <memoryapi.h>
47cb93a386Sopenharmony_ci
48cb93a386Sopenharmony_ci        static void* alloc_jit_buffer(size_t* len) {
49cb93a386Sopenharmony_ci            return VirtualAlloc(NULL, *len, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
50cb93a386Sopenharmony_ci        }
51cb93a386Sopenharmony_ci        static void remap_as_executable(void* ptr, size_t len) {
52cb93a386Sopenharmony_ci            DWORD old;
53cb93a386Sopenharmony_ci            VirtualProtect(ptr, len, PAGE_EXECUTE_READ, &old);
54cb93a386Sopenharmony_ci            SkASSERT(old == PAGE_READWRITE);
55cb93a386Sopenharmony_ci        }
56cb93a386Sopenharmony_ci        #if !defined(SKVM_LLVM)
57cb93a386Sopenharmony_ci        static void unmap_jit_buffer(void* ptr, size_t len) {
58cb93a386Sopenharmony_ci            VirtualFree(ptr, 0, MEM_RELEASE);
59cb93a386Sopenharmony_ci        }
60cb93a386Sopenharmony_ci        static void close_dylib(void* dylib) {
61cb93a386Sopenharmony_ci            SkASSERT(false);  // TODO?  For now just assert we never make one.
62cb93a386Sopenharmony_ci        }
63cb93a386Sopenharmony_ci        #endif
64cb93a386Sopenharmony_ci    #else
65cb93a386Sopenharmony_ci        #include <dlfcn.h>
66cb93a386Sopenharmony_ci        #include <sys/mman.h>
67cb93a386Sopenharmony_ci
68cb93a386Sopenharmony_ci        static void* alloc_jit_buffer(size_t* len) {
69cb93a386Sopenharmony_ci            // While mprotect and VirtualAlloc both work at page granularity,
70cb93a386Sopenharmony_ci            // mprotect doesn't round up for you, and instead requires *len is at page granularity.
71cb93a386Sopenharmony_ci            const size_t page = sysconf(_SC_PAGESIZE);
72cb93a386Sopenharmony_ci            *len = ((*len + page - 1) / page) * page;
73cb93a386Sopenharmony_ci            return mmap(nullptr,*len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1,0);
74cb93a386Sopenharmony_ci        }
75cb93a386Sopenharmony_ci        static void remap_as_executable(void* ptr, size_t len) {
76cb93a386Sopenharmony_ci            mprotect(ptr, len, PROT_READ|PROT_EXEC);
77cb93a386Sopenharmony_ci            __builtin___clear_cache((char*)ptr,
78cb93a386Sopenharmony_ci                                    (char*)ptr + len);
79cb93a386Sopenharmony_ci        }
80cb93a386Sopenharmony_ci        #if !defined(SKVM_LLVM)
81cb93a386Sopenharmony_ci        static void unmap_jit_buffer(void* ptr, size_t len) {
82cb93a386Sopenharmony_ci            munmap(ptr, len);
83cb93a386Sopenharmony_ci        }
84cb93a386Sopenharmony_ci        static void close_dylib(void* dylib) {
85cb93a386Sopenharmony_ci            dlclose(dylib);
86cb93a386Sopenharmony_ci        }
87cb93a386Sopenharmony_ci        #endif
88cb93a386Sopenharmony_ci    #endif
89cb93a386Sopenharmony_ci
90cb93a386Sopenharmony_ci    #if defined(SKVM_JIT_VTUNE)
91cb93a386Sopenharmony_ci        #include <jitprofiling.h>
92cb93a386Sopenharmony_ci        static void notify_vtune(const char* name, void* addr, size_t len) {
93cb93a386Sopenharmony_ci            if (iJIT_IsProfilingActive() == iJIT_SAMPLING_ON) {
94cb93a386Sopenharmony_ci                iJIT_Method_Load event;
95cb93a386Sopenharmony_ci                memset(&event, 0, sizeof(event));
96cb93a386Sopenharmony_ci                event.method_id           = iJIT_GetNewMethodID();
97cb93a386Sopenharmony_ci                event.method_name         = const_cast<char*>(name);
98cb93a386Sopenharmony_ci                event.method_load_address = addr;
99cb93a386Sopenharmony_ci                event.method_size         = len;
100cb93a386Sopenharmony_ci                iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, &event);
101cb93a386Sopenharmony_ci            }
102cb93a386Sopenharmony_ci        }
103cb93a386Sopenharmony_ci    #else
104cb93a386Sopenharmony_ci        static void notify_vtune(const char* name, void* addr, size_t len) {}
105cb93a386Sopenharmony_ci    #endif
106cb93a386Sopenharmony_ci#endif
107cb93a386Sopenharmony_ci
108cb93a386Sopenharmony_ci// JIT code isn't MSAN-instrumented, so we won't see when it uses
109cb93a386Sopenharmony_ci// uninitialized memory, and we'll not see the writes it makes as properly
110cb93a386Sopenharmony_ci// initializing memory.  Instead force the interpreter, which should let
111cb93a386Sopenharmony_ci// MSAN see everything our programs do properly.
112cb93a386Sopenharmony_ci//
113cb93a386Sopenharmony_ci// Similarly, we can't get ASAN's checks unless we let it instrument our interpreter.
114cb93a386Sopenharmony_ci#if defined(__has_feature)
115cb93a386Sopenharmony_ci    #if __has_feature(memory_sanitizer) || __has_feature(address_sanitizer)
116cb93a386Sopenharmony_ci        #define SKVM_JIT_BUT_IGNORE_IT
117cb93a386Sopenharmony_ci    #endif
118cb93a386Sopenharmony_ci#endif
119cb93a386Sopenharmony_ci
120cb93a386Sopenharmony_ci#if defined(SKSL_STANDALONE)
121cb93a386Sopenharmony_ci    // skslc needs to link against this module (for the VM code generator). This module pulls in
122cb93a386Sopenharmony_ci    // color-space code, but attempting to add those transitive dependencies to skslc gets out of
123cb93a386Sopenharmony_ci    // hand. So we terminate the chain here with stub functions. Note that skslc's usage of SkVM
124cb93a386Sopenharmony_ci    // never cares about color management.
125cb93a386Sopenharmony_ci    skvm::F32 sk_program_transfer_fn(
126cb93a386Sopenharmony_ci        skvm::F32 v, TFKind tf_kind,
127cb93a386Sopenharmony_ci        skvm::F32 G, skvm::F32 A, skvm::F32 B, skvm::F32 C, skvm::F32 D, skvm::F32 E, skvm::F32 F) {
128cb93a386Sopenharmony_ci            return v;
129cb93a386Sopenharmony_ci    }
130cb93a386Sopenharmony_ci
131cb93a386Sopenharmony_ci    const skcms_TransferFunction* skcms_sRGB_TransferFunction() { return nullptr; }
132cb93a386Sopenharmony_ci    const skcms_TransferFunction* skcms_sRGB_Inverse_TransferFunction() { return nullptr; }
133cb93a386Sopenharmony_ci#endif
134cb93a386Sopenharmony_ci
135cb93a386Sopenharmony_cinamespace skvm {
136cb93a386Sopenharmony_ci
137cb93a386Sopenharmony_ci    static Features detect_features() {
138cb93a386Sopenharmony_ci        static const bool fma =
139cb93a386Sopenharmony_ci        #if defined(SK_CPU_X86)
140cb93a386Sopenharmony_ci            SkCpu::Supports(SkCpu::HSW);
141cb93a386Sopenharmony_ci        #elif defined(SK_CPU_ARM64)
142cb93a386Sopenharmony_ci            true;
143cb93a386Sopenharmony_ci        #else
144cb93a386Sopenharmony_ci            false;
145cb93a386Sopenharmony_ci        #endif
146cb93a386Sopenharmony_ci
147cb93a386Sopenharmony_ci        static const bool fp16 = false;  // TODO
148cb93a386Sopenharmony_ci
149cb93a386Sopenharmony_ci        return { fma, fp16 };
150cb93a386Sopenharmony_ci    }
151cb93a386Sopenharmony_ci
152cb93a386Sopenharmony_ci    Builder::Builder()                  : fFeatures(detect_features()) {}
153cb93a386Sopenharmony_ci    Builder::Builder(Features features) : fFeatures(features         ) {}
154cb93a386Sopenharmony_ci
155cb93a386Sopenharmony_ci
156cb93a386Sopenharmony_ci    struct Program::Impl {
157cb93a386Sopenharmony_ci        std::vector<InterpreterInstruction> instructions;
158cb93a386Sopenharmony_ci        int regs = 0;
159cb93a386Sopenharmony_ci        int loop = 0;
160cb93a386Sopenharmony_ci        std::vector<int> strides;
161cb93a386Sopenharmony_ci
162cb93a386Sopenharmony_ci        std::atomic<void*> jit_entry{nullptr};   // TODO: minimal std::memory_orders
163cb93a386Sopenharmony_ci        size_t jit_size = 0;
164cb93a386Sopenharmony_ci        void*  dylib    = nullptr;
165cb93a386Sopenharmony_ci
166cb93a386Sopenharmony_ci    #if defined(SKVM_LLVM)
167cb93a386Sopenharmony_ci        std::unique_ptr<llvm::LLVMContext>     llvm_ctx;
168cb93a386Sopenharmony_ci        std::unique_ptr<llvm::ExecutionEngine> llvm_ee;
169cb93a386Sopenharmony_ci        std::future<void>                      llvm_compiling;
170cb93a386Sopenharmony_ci    #endif
171cb93a386Sopenharmony_ci    };
172cb93a386Sopenharmony_ci
173cb93a386Sopenharmony_ci    // Debugging tools, mostly for printing various data structures out to a stream.
174cb93a386Sopenharmony_ci
175cb93a386Sopenharmony_ci    namespace {
176cb93a386Sopenharmony_ci        class SkDebugfStream final : public SkWStream {
177cb93a386Sopenharmony_ci            size_t fBytesWritten = 0;
178cb93a386Sopenharmony_ci
179cb93a386Sopenharmony_ci            bool write(const void* buffer, size_t size) override {
180cb93a386Sopenharmony_ci                SkDebugf("%.*s", (int)size, (const char*)buffer);
181cb93a386Sopenharmony_ci                fBytesWritten += size;
182cb93a386Sopenharmony_ci                return true;
183cb93a386Sopenharmony_ci            }
184cb93a386Sopenharmony_ci
185cb93a386Sopenharmony_ci            size_t bytesWritten() const override {
186cb93a386Sopenharmony_ci                return fBytesWritten;
187cb93a386Sopenharmony_ci            }
188cb93a386Sopenharmony_ci        };
189cb93a386Sopenharmony_ci
190cb93a386Sopenharmony_ci        struct V { Val id; };
191cb93a386Sopenharmony_ci        struct R { Reg id; };
192cb93a386Sopenharmony_ci        struct Shift { int bits; };
193cb93a386Sopenharmony_ci        struct Splat { int bits; };
194cb93a386Sopenharmony_ci        struct Hex   { int bits; };
195cb93a386Sopenharmony_ci        // For op `trace_line` or `trace_call`
196cb93a386Sopenharmony_ci        struct Line  { int bits; };
197cb93a386Sopenharmony_ci        // For op `trace_var`
198cb93a386Sopenharmony_ci        struct VarSlot { int bits; };
199cb93a386Sopenharmony_ci        struct VarType { int bits; };
200cb93a386Sopenharmony_ci        static constexpr VarType kVarTypeInt{0};
201cb93a386Sopenharmony_ci        static constexpr VarType kVarTypeFloat{1};
202cb93a386Sopenharmony_ci        static constexpr VarType kVarTypeBool{2};
203cb93a386Sopenharmony_ci        // For op `trace_call`
204cb93a386Sopenharmony_ci        struct CallType { int bits; };
205cb93a386Sopenharmony_ci        static constexpr CallType kCallTypeEnter{1};
206cb93a386Sopenharmony_ci        static constexpr CallType kCallTypeExit{0};
207cb93a386Sopenharmony_ci
208cb93a386Sopenharmony_ci        static void write(SkWStream* o, const char* s) {
209cb93a386Sopenharmony_ci            o->writeText(s);
210cb93a386Sopenharmony_ci        }
211cb93a386Sopenharmony_ci
212cb93a386Sopenharmony_ci        static const char* name(Op op) {
213cb93a386Sopenharmony_ci            switch (op) {
214cb93a386Sopenharmony_ci            #define M(x) case Op::x: return #x;
215cb93a386Sopenharmony_ci                SKVM_OPS(M)
216cb93a386Sopenharmony_ci            #undef M
217cb93a386Sopenharmony_ci            }
218cb93a386Sopenharmony_ci            return "unknown op";
219cb93a386Sopenharmony_ci        }
220cb93a386Sopenharmony_ci
221cb93a386Sopenharmony_ci        static void write(SkWStream* o, Op op) {
222cb93a386Sopenharmony_ci            o->writeText(name(op));
223cb93a386Sopenharmony_ci        }
224cb93a386Sopenharmony_ci        static void write(SkWStream* o, Ptr p) {
225cb93a386Sopenharmony_ci            write(o, "ptr");
226cb93a386Sopenharmony_ci            o->writeDecAsText(p.ix);
227cb93a386Sopenharmony_ci        }
228cb93a386Sopenharmony_ci        static void write(SkWStream* o, V v) {
229cb93a386Sopenharmony_ci            write(o, "v");
230cb93a386Sopenharmony_ci            o->writeDecAsText(v.id);
231cb93a386Sopenharmony_ci        }
232cb93a386Sopenharmony_ci        static void write(SkWStream* o, R r) {
233cb93a386Sopenharmony_ci            write(o, "r");
234cb93a386Sopenharmony_ci            o->writeDecAsText(r.id);
235cb93a386Sopenharmony_ci        }
236cb93a386Sopenharmony_ci        static void write(SkWStream* o, Shift s) {
237cb93a386Sopenharmony_ci            o->writeDecAsText(s.bits);
238cb93a386Sopenharmony_ci        }
239cb93a386Sopenharmony_ci        static void write(SkWStream* o, Splat s) {
240cb93a386Sopenharmony_ci            float f;
241cb93a386Sopenharmony_ci            memcpy(&f, &s.bits, 4);
242cb93a386Sopenharmony_ci            o->writeHexAsText(s.bits);
243cb93a386Sopenharmony_ci            write(o, " (");
244cb93a386Sopenharmony_ci            o->writeScalarAsText(f);
245cb93a386Sopenharmony_ci            write(o, ")");
246cb93a386Sopenharmony_ci        }
247cb93a386Sopenharmony_ci        static void write(SkWStream* o, Hex h) {
248cb93a386Sopenharmony_ci            o->writeHexAsText(h.bits);
249cb93a386Sopenharmony_ci        }
250cb93a386Sopenharmony_ci        static void write(SkWStream* o, Line d) {
251cb93a386Sopenharmony_ci            write(o, "L");
252cb93a386Sopenharmony_ci            o->writeDecAsText(d.bits);
253cb93a386Sopenharmony_ci        }
254cb93a386Sopenharmony_ci        static void write(SkWStream* o, VarSlot s) {
255cb93a386Sopenharmony_ci            write(o, "$");
256cb93a386Sopenharmony_ci            o->writeDecAsText(s.bits);
257cb93a386Sopenharmony_ci        }
258cb93a386Sopenharmony_ci        static void write(SkWStream* o, VarType n) {
259cb93a386Sopenharmony_ci            if (n.bits == kVarTypeFloat.bits) {
260cb93a386Sopenharmony_ci                write(o, "(F32)");
261cb93a386Sopenharmony_ci            } else if (n.bits == kVarTypeInt.bits) {
262cb93a386Sopenharmony_ci                write(o, "(I32)");
263cb93a386Sopenharmony_ci            } else if (n.bits == kVarTypeBool.bits) {
264cb93a386Sopenharmony_ci                write(o, "(bool)");
265cb93a386Sopenharmony_ci            } else {
266cb93a386Sopenharmony_ci                write(o, "???");
267cb93a386Sopenharmony_ci            }
268cb93a386Sopenharmony_ci        }
269cb93a386Sopenharmony_ci        static void write(SkWStream* o, CallType n) {
270cb93a386Sopenharmony_ci            if (n.bits == kCallTypeEnter.bits) {
271cb93a386Sopenharmony_ci                write(o, "(enter)");
272cb93a386Sopenharmony_ci            } else if (n.bits == kCallTypeExit.bits) {
273cb93a386Sopenharmony_ci                write(o, "(exit)");
274cb93a386Sopenharmony_ci            } else {
275cb93a386Sopenharmony_ci                write(o, "???");
276cb93a386Sopenharmony_ci            }
277cb93a386Sopenharmony_ci        }
278cb93a386Sopenharmony_ci
279cb93a386Sopenharmony_ci        template <typename T, typename... Ts>
280cb93a386Sopenharmony_ci        static void write(SkWStream* o, T first, Ts... rest) {
281cb93a386Sopenharmony_ci            write(o, first);
282cb93a386Sopenharmony_ci            write(o, " ");
283cb93a386Sopenharmony_ci            write(o, rest...);
284cb93a386Sopenharmony_ci        }
285cb93a386Sopenharmony_ci    }  // namespace
286cb93a386Sopenharmony_ci
287cb93a386Sopenharmony_ci    static void write_one_instruction(Val id, const OptimizedInstruction& inst, SkWStream* o) {
288cb93a386Sopenharmony_ci        Op  op = inst.op;
289cb93a386Sopenharmony_ci        Val  x = inst.x,
290cb93a386Sopenharmony_ci             y = inst.y,
291cb93a386Sopenharmony_ci             z = inst.z,
292cb93a386Sopenharmony_ci             w = inst.w;
293cb93a386Sopenharmony_ci        int immA = inst.immA,
294cb93a386Sopenharmony_ci            immB = inst.immB,
295cb93a386Sopenharmony_ci            immC = inst.immC;
296cb93a386Sopenharmony_ci        switch (op) {
297cb93a386Sopenharmony_ci            case Op::assert_true: write(o, op, V{x}, V{y}); break;
298cb93a386Sopenharmony_ci
299cb93a386Sopenharmony_ci            case Op::trace_line: write(o, op, V{x}, Line{immA}); break;
300cb93a386Sopenharmony_ci            case Op::trace_var:  write(o, op, V{x}, VarSlot{immA}, "=", V{y}, VarType{immB}); break;
301cb93a386Sopenharmony_ci            case Op::trace_call: write(o, op, V{x}, Line{immA}, CallType{immB}); break;
302cb93a386Sopenharmony_ci
303cb93a386Sopenharmony_ci            case Op::store8:   write(o, op, Ptr{immA}, V{x}               ); break;
304cb93a386Sopenharmony_ci            case Op::store16:  write(o, op, Ptr{immA}, V{x}               ); break;
305cb93a386Sopenharmony_ci            case Op::store32:  write(o, op, Ptr{immA}, V{x}               ); break;
306cb93a386Sopenharmony_ci            case Op::store64:  write(o, op, Ptr{immA}, V{x},V{y}          ); break;
307cb93a386Sopenharmony_ci            case Op::store128: write(o, op, Ptr{immA}, V{x},V{y},V{z},V{w}); break;
308cb93a386Sopenharmony_ci
309cb93a386Sopenharmony_ci            case Op::index: write(o, V{id}, "=", op); break;
310cb93a386Sopenharmony_ci
311cb93a386Sopenharmony_ci            case Op::load8:   write(o, V{id}, "=", op, Ptr{immA}); break;
312cb93a386Sopenharmony_ci            case Op::load16:  write(o, V{id}, "=", op, Ptr{immA}); break;
313cb93a386Sopenharmony_ci            case Op::load32:  write(o, V{id}, "=", op, Ptr{immA}); break;
314cb93a386Sopenharmony_ci            case Op::load64:  write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break;
315cb93a386Sopenharmony_ci            case Op::load128: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break;
316cb93a386Sopenharmony_ci
317cb93a386Sopenharmony_ci            case Op::gather8:  write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break;
318cb93a386Sopenharmony_ci            case Op::gather16: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break;
319cb93a386Sopenharmony_ci            case Op::gather32: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break;
320cb93a386Sopenharmony_ci
321cb93a386Sopenharmony_ci            case Op::uniform32: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break;
322cb93a386Sopenharmony_ci            case Op::array32:   write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, Hex{immC}); break;
323cb93a386Sopenharmony_ci
324cb93a386Sopenharmony_ci            case Op::splat: write(o, V{id}, "=", op, Splat{immA}); break;
325cb93a386Sopenharmony_ci
326cb93a386Sopenharmony_ci            case Op:: add_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
327cb93a386Sopenharmony_ci            case Op:: sub_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
328cb93a386Sopenharmony_ci            case Op:: mul_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
329cb93a386Sopenharmony_ci            case Op:: div_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
330cb93a386Sopenharmony_ci            case Op:: min_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
331cb93a386Sopenharmony_ci            case Op:: max_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
332cb93a386Sopenharmony_ci            case Op:: fma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
333cb93a386Sopenharmony_ci            case Op:: fms_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
334cb93a386Sopenharmony_ci            case Op::fnma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
335cb93a386Sopenharmony_ci
336cb93a386Sopenharmony_ci
337cb93a386Sopenharmony_ci            case Op::sqrt_f32: write(o, V{id}, "=", op, V{x}); break;
338cb93a386Sopenharmony_ci
339cb93a386Sopenharmony_ci            case Op:: eq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
340cb93a386Sopenharmony_ci            case Op::neq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
341cb93a386Sopenharmony_ci            case Op:: gt_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
342cb93a386Sopenharmony_ci            case Op::gte_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
343cb93a386Sopenharmony_ci
344cb93a386Sopenharmony_ci
345cb93a386Sopenharmony_ci            case Op::add_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
346cb93a386Sopenharmony_ci            case Op::sub_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
347cb93a386Sopenharmony_ci            case Op::mul_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
348cb93a386Sopenharmony_ci
349cb93a386Sopenharmony_ci            case Op::shl_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break;
350cb93a386Sopenharmony_ci            case Op::shr_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break;
351cb93a386Sopenharmony_ci            case Op::sra_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break;
352cb93a386Sopenharmony_ci
353cb93a386Sopenharmony_ci            case Op::eq_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
354cb93a386Sopenharmony_ci            case Op::gt_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
355cb93a386Sopenharmony_ci
356cb93a386Sopenharmony_ci
357cb93a386Sopenharmony_ci            case Op::bit_and  : write(o, V{id}, "=", op, V{x}, V{y}); break;
358cb93a386Sopenharmony_ci            case Op::bit_or   : write(o, V{id}, "=", op, V{x}, V{y}); break;
359cb93a386Sopenharmony_ci            case Op::bit_xor  : write(o, V{id}, "=", op, V{x}, V{y}); break;
360cb93a386Sopenharmony_ci            case Op::bit_clear: write(o, V{id}, "=", op, V{x}, V{y}); break;
361cb93a386Sopenharmony_ci
362cb93a386Sopenharmony_ci            case Op::select: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
363cb93a386Sopenharmony_ci
364cb93a386Sopenharmony_ci            case Op::ceil:      write(o, V{id}, "=", op, V{x}); break;
365cb93a386Sopenharmony_ci            case Op::floor:     write(o, V{id}, "=", op, V{x}); break;
366cb93a386Sopenharmony_ci            case Op::to_f32:    write(o, V{id}, "=", op, V{x}); break;
367cb93a386Sopenharmony_ci            case Op::to_fp16:   write(o, V{id}, "=", op, V{x}); break;
368cb93a386Sopenharmony_ci            case Op::from_fp16: write(o, V{id}, "=", op, V{x}); break;
369cb93a386Sopenharmony_ci            case Op::trunc:     write(o, V{id}, "=", op, V{x}); break;
370cb93a386Sopenharmony_ci            case Op::round:     write(o, V{id}, "=", op, V{x}); break;
371cb93a386Sopenharmony_ci        }
372cb93a386Sopenharmony_ci
373cb93a386Sopenharmony_ci        write(o, "\n");
374cb93a386Sopenharmony_ci    }
375cb93a386Sopenharmony_ci
376cb93a386Sopenharmony_ci    void Builder::dump(SkWStream* o) const {
377cb93a386Sopenharmony_ci        SkDebugfStream debug;
378cb93a386Sopenharmony_ci        if (!o) { o = &debug; }
379cb93a386Sopenharmony_ci
380cb93a386Sopenharmony_ci        std::vector<OptimizedInstruction> optimized = this->optimize();
381cb93a386Sopenharmony_ci        o->writeDecAsText(optimized.size());
382cb93a386Sopenharmony_ci        o->writeText(" values (originally ");
383cb93a386Sopenharmony_ci        o->writeDecAsText(fProgram.size());
384cb93a386Sopenharmony_ci        o->writeText("):\n");
385cb93a386Sopenharmony_ci        for (Val id = 0; id < (Val)optimized.size(); id++) {
386cb93a386Sopenharmony_ci            const OptimizedInstruction& inst = optimized[id];
387cb93a386Sopenharmony_ci            write(o, inst.can_hoist ? "↑ " : "  ");
388cb93a386Sopenharmony_ci            write_one_instruction(id, inst, o);
389cb93a386Sopenharmony_ci        }
390cb93a386Sopenharmony_ci    }
391cb93a386Sopenharmony_ci
392cb93a386Sopenharmony_ci    void Program::dump(SkWStream* o) const {
393cb93a386Sopenharmony_ci        SkDebugfStream debug;
394cb93a386Sopenharmony_ci        if (!o) { o = &debug; }
395cb93a386Sopenharmony_ci
396cb93a386Sopenharmony_ci        o->writeDecAsText(fImpl->regs);
397cb93a386Sopenharmony_ci        o->writeText(" registers, ");
398cb93a386Sopenharmony_ci        o->writeDecAsText(fImpl->instructions.size());
399cb93a386Sopenharmony_ci        o->writeText(" instructions:\n");
400cb93a386Sopenharmony_ci        for (Val i = 0; i < (Val)fImpl->instructions.size(); i++) {
401cb93a386Sopenharmony_ci            if (i == fImpl->loop) { write(o, "loop:\n"); }
402cb93a386Sopenharmony_ci            o->writeDecAsText(i);
403cb93a386Sopenharmony_ci            o->writeText("\t");
404cb93a386Sopenharmony_ci            if (i >= fImpl->loop) { write(o, "    "); }
405cb93a386Sopenharmony_ci            const InterpreterInstruction& inst = fImpl->instructions[i];
406cb93a386Sopenharmony_ci            Op   op = inst.op;
407cb93a386Sopenharmony_ci            Reg   d = inst.d,
408cb93a386Sopenharmony_ci                  x = inst.x,
409cb93a386Sopenharmony_ci                  y = inst.y,
410cb93a386Sopenharmony_ci                  z = inst.z,
411cb93a386Sopenharmony_ci                  w = inst.w;
412cb93a386Sopenharmony_ci            int immA = inst.immA,
413cb93a386Sopenharmony_ci                immB = inst.immB,
414cb93a386Sopenharmony_ci                immC = inst.immC;
415cb93a386Sopenharmony_ci            switch (op) {
416cb93a386Sopenharmony_ci                case Op::assert_true: write(o, op, R{x}, R{y}); break;
417cb93a386Sopenharmony_ci
418cb93a386Sopenharmony_ci                case Op::trace_line: write(o, op, R{x}, Line{immA}); break;
419cb93a386Sopenharmony_ci                case Op::trace_var: write(o, op, R{x}, VarSlot{immA}, "=", R{y}, VarType{immB});
420cb93a386Sopenharmony_ci                                    break;
421cb93a386Sopenharmony_ci                case Op::trace_call: write(o, op, R{x}, Line{immA}, CallType{immB}); break;
422cb93a386Sopenharmony_ci
423cb93a386Sopenharmony_ci                case Op::store8:   write(o, op, Ptr{immA}, R{x}                  ); break;
424cb93a386Sopenharmony_ci                case Op::store16:  write(o, op, Ptr{immA}, R{x}                  ); break;
425cb93a386Sopenharmony_ci                case Op::store32:  write(o, op, Ptr{immA}, R{x}                  ); break;
426cb93a386Sopenharmony_ci                case Op::store64:  write(o, op, Ptr{immA}, R{x}, R{y}            ); break;
427cb93a386Sopenharmony_ci                case Op::store128: write(o, op, Ptr{immA}, R{x}, R{y}, R{z}, R{w}); break;
428cb93a386Sopenharmony_ci
429cb93a386Sopenharmony_ci                case Op::index: write(o, R{d}, "=", op); break;
430cb93a386Sopenharmony_ci
431cb93a386Sopenharmony_ci                case Op::load8:   write(o, R{d}, "=", op, Ptr{immA}); break;
432cb93a386Sopenharmony_ci                case Op::load16:  write(o, R{d}, "=", op, Ptr{immA}); break;
433cb93a386Sopenharmony_ci                case Op::load32:  write(o, R{d}, "=", op, Ptr{immA}); break;
434cb93a386Sopenharmony_ci                case Op::load64:  write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break;
435cb93a386Sopenharmony_ci                case Op::load128: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break;
436cb93a386Sopenharmony_ci
437cb93a386Sopenharmony_ci                case Op::gather8:  write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break;
438cb93a386Sopenharmony_ci                case Op::gather16: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break;
439cb93a386Sopenharmony_ci                case Op::gather32: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break;
440cb93a386Sopenharmony_ci
441cb93a386Sopenharmony_ci                case Op::uniform32: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break;
442cb93a386Sopenharmony_ci                case Op::array32:   write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, Hex{immC}); break;
443cb93a386Sopenharmony_ci
444cb93a386Sopenharmony_ci                case Op::splat:     write(o, R{d}, "=", op, Splat{immA}); break;
445cb93a386Sopenharmony_ci
446cb93a386Sopenharmony_ci                case Op::add_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
447cb93a386Sopenharmony_ci                case Op::sub_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
448cb93a386Sopenharmony_ci                case Op::mul_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
449cb93a386Sopenharmony_ci                case Op::div_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
450cb93a386Sopenharmony_ci                case Op::min_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
451cb93a386Sopenharmony_ci                case Op::max_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
452cb93a386Sopenharmony_ci                case Op::fma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
453cb93a386Sopenharmony_ci                case Op::fms_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
454cb93a386Sopenharmony_ci                case Op::fnma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
455cb93a386Sopenharmony_ci
456cb93a386Sopenharmony_ci                case Op::sqrt_f32: write(o, R{d}, "=", op, R{x}); break;
457cb93a386Sopenharmony_ci
458cb93a386Sopenharmony_ci                case Op:: eq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
459cb93a386Sopenharmony_ci                case Op::neq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
460cb93a386Sopenharmony_ci                case Op:: gt_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
461cb93a386Sopenharmony_ci                case Op::gte_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
462cb93a386Sopenharmony_ci
463cb93a386Sopenharmony_ci
464cb93a386Sopenharmony_ci                case Op::add_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
465cb93a386Sopenharmony_ci                case Op::sub_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
466cb93a386Sopenharmony_ci                case Op::mul_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
467cb93a386Sopenharmony_ci
468cb93a386Sopenharmony_ci                case Op::shl_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break;
469cb93a386Sopenharmony_ci                case Op::shr_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break;
470cb93a386Sopenharmony_ci                case Op::sra_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break;
471cb93a386Sopenharmony_ci
472cb93a386Sopenharmony_ci                case Op::eq_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
473cb93a386Sopenharmony_ci                case Op::gt_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
474cb93a386Sopenharmony_ci
475cb93a386Sopenharmony_ci                case Op::bit_and  : write(o, R{d}, "=", op, R{x}, R{y}); break;
476cb93a386Sopenharmony_ci                case Op::bit_or   : write(o, R{d}, "=", op, R{x}, R{y}); break;
477cb93a386Sopenharmony_ci                case Op::bit_xor  : write(o, R{d}, "=", op, R{x}, R{y}); break;
478cb93a386Sopenharmony_ci                case Op::bit_clear: write(o, R{d}, "=", op, R{x}, R{y}); break;
479cb93a386Sopenharmony_ci
480cb93a386Sopenharmony_ci                case Op::select: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
481cb93a386Sopenharmony_ci
482cb93a386Sopenharmony_ci                case Op::ceil:      write(o, R{d}, "=", op, R{x}); break;
483cb93a386Sopenharmony_ci                case Op::floor:     write(o, R{d}, "=", op, R{x}); break;
484cb93a386Sopenharmony_ci                case Op::to_f32:    write(o, R{d}, "=", op, R{x}); break;
485cb93a386Sopenharmony_ci                case Op::to_fp16:   write(o, R{d}, "=", op, R{x}); break;
486cb93a386Sopenharmony_ci                case Op::from_fp16: write(o, R{d}, "=", op, R{x}); break;
487cb93a386Sopenharmony_ci                case Op::trunc:     write(o, R{d}, "=", op, R{x}); break;
488cb93a386Sopenharmony_ci                case Op::round:     write(o, R{d}, "=", op, R{x}); break;
489cb93a386Sopenharmony_ci            }
490cb93a386Sopenharmony_ci            write(o, "\n");
491cb93a386Sopenharmony_ci        }
492cb93a386Sopenharmony_ci    }
493cb93a386Sopenharmony_ci
494cb93a386Sopenharmony_ci    std::vector<Instruction> eliminate_dead_code(std::vector<Instruction> program) {
495cb93a386Sopenharmony_ci        // Determine which Instructions are live by working back from side effects.
496cb93a386Sopenharmony_ci        std::vector<bool> live(program.size(), false);
497cb93a386Sopenharmony_ci        for (Val id = program.size(); id--;) {
498cb93a386Sopenharmony_ci            if (live[id] || has_side_effect(program[id].op)) {
499cb93a386Sopenharmony_ci                live[id] = true;
500cb93a386Sopenharmony_ci                const Instruction& inst = program[id];
501cb93a386Sopenharmony_ci                for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
502cb93a386Sopenharmony_ci                    if (arg != NA) { live[arg] = true; }
503cb93a386Sopenharmony_ci                }
504cb93a386Sopenharmony_ci            }
505cb93a386Sopenharmony_ci        }
506cb93a386Sopenharmony_ci
507cb93a386Sopenharmony_ci        // After removing non-live instructions, we can be left with redundant back-to-back
508cb93a386Sopenharmony_ci        // trace_line instructions. (e.g. one line could have multiple statements on it.)
509cb93a386Sopenharmony_ci        // Eliminate any duplicate ops.
510cb93a386Sopenharmony_ci        int lastId = -1;
511cb93a386Sopenharmony_ci        for (Val id = 0; id < (Val)program.size(); id++) {
512cb93a386Sopenharmony_ci            if (!live[id]) {
513cb93a386Sopenharmony_ci                continue;
514cb93a386Sopenharmony_ci            }
515cb93a386Sopenharmony_ci            const Instruction& inst = program[id];
516cb93a386Sopenharmony_ci            if (inst.op != Op::trace_line) {
517cb93a386Sopenharmony_ci                lastId = -1;
518cb93a386Sopenharmony_ci                continue;
519cb93a386Sopenharmony_ci            }
520cb93a386Sopenharmony_ci            if (lastId >= 0) {
521cb93a386Sopenharmony_ci                const Instruction& last = program[lastId];
522cb93a386Sopenharmony_ci                if (inst.immA == last.immA && inst.x == last.x) {
523cb93a386Sopenharmony_ci                    // Found two matching trace_lines in a row. Mark the first one as dead.
524cb93a386Sopenharmony_ci                    live[lastId] = false;
525cb93a386Sopenharmony_ci                }
526cb93a386Sopenharmony_ci            }
527cb93a386Sopenharmony_ci            lastId = id;
528cb93a386Sopenharmony_ci        }
529cb93a386Sopenharmony_ci
530cb93a386Sopenharmony_ci        // Rewrite the program with only live Instructions:
531cb93a386Sopenharmony_ci        //   - remap IDs in live Instructions to what they'll be once dead Instructions are removed;
532cb93a386Sopenharmony_ci        //   - then actually remove the dead Instructions.
533cb93a386Sopenharmony_ci        std::vector<Val> new_id(program.size(), NA);
534cb93a386Sopenharmony_ci        for (Val id = 0, next = 0; id < (Val)program.size(); id++) {
535cb93a386Sopenharmony_ci            if (live[id]) {
536cb93a386Sopenharmony_ci                Instruction& inst = program[id];
537cb93a386Sopenharmony_ci                for (Val* arg : {&inst.x, &inst.y, &inst.z, &inst.w}) {
538cb93a386Sopenharmony_ci                    if (*arg != NA) {
539cb93a386Sopenharmony_ci                        *arg = new_id[*arg];
540cb93a386Sopenharmony_ci                        SkASSERT(*arg != NA);
541cb93a386Sopenharmony_ci                    }
542cb93a386Sopenharmony_ci                }
543cb93a386Sopenharmony_ci                new_id[id] = next++;
544cb93a386Sopenharmony_ci            }
545cb93a386Sopenharmony_ci        }
546cb93a386Sopenharmony_ci
547cb93a386Sopenharmony_ci        // Eliminate any non-live ops.
548cb93a386Sopenharmony_ci        auto it = std::remove_if(program.begin(), program.end(), [&](const Instruction& inst) {
549cb93a386Sopenharmony_ci            Val id = (Val)(&inst - program.data());
550cb93a386Sopenharmony_ci            return !live[id];
551cb93a386Sopenharmony_ci        });
552cb93a386Sopenharmony_ci        program.erase(it, program.end());
553cb93a386Sopenharmony_ci
554cb93a386Sopenharmony_ci        return program;
555cb93a386Sopenharmony_ci    }
556cb93a386Sopenharmony_ci
557cb93a386Sopenharmony_ci    std::vector<OptimizedInstruction> finalize(const std::vector<Instruction> program) {
558cb93a386Sopenharmony_ci        std::vector<OptimizedInstruction> optimized(program.size());
559cb93a386Sopenharmony_ci        for (Val id = 0; id < (Val)program.size(); id++) {
560cb93a386Sopenharmony_ci            Instruction inst = program[id];
561cb93a386Sopenharmony_ci            optimized[id] = {inst.op, inst.x,inst.y,inst.z,inst.w,
562cb93a386Sopenharmony_ci                             inst.immA,inst.immB,inst.immC,
563cb93a386Sopenharmony_ci                             /*death=*/id, /*can_hoist=*/true};
564cb93a386Sopenharmony_ci        }
565cb93a386Sopenharmony_ci
566cb93a386Sopenharmony_ci        // Each Instruction's inputs need to live at least until that Instruction issues.
567cb93a386Sopenharmony_ci        for (Val id = 0; id < (Val)optimized.size(); id++) {
568cb93a386Sopenharmony_ci            OptimizedInstruction& inst = optimized[id];
569cb93a386Sopenharmony_ci            for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
570cb93a386Sopenharmony_ci                // (We're walking in order, so this is the same as max()ing with the existing Val.)
571cb93a386Sopenharmony_ci                if (arg != NA) { optimized[arg].death = id; }
572cb93a386Sopenharmony_ci            }
573cb93a386Sopenharmony_ci        }
574cb93a386Sopenharmony_ci
575cb93a386Sopenharmony_ci        // Mark which values don't depend on the loop and can be hoisted.
576cb93a386Sopenharmony_ci        for (OptimizedInstruction& inst : optimized) {
577cb93a386Sopenharmony_ci            // Varying loads (and gathers) and stores cannot be hoisted out of the loop.
578cb93a386Sopenharmony_ci            if (is_always_varying(inst.op) || is_trace(inst.op)) {
579cb93a386Sopenharmony_ci                inst.can_hoist = false;
580cb93a386Sopenharmony_ci            }
581cb93a386Sopenharmony_ci
582cb93a386Sopenharmony_ci            // If any of an instruction's inputs can't be hoisted, it can't be hoisted itself.
583cb93a386Sopenharmony_ci            if (inst.can_hoist) {
584cb93a386Sopenharmony_ci                for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
585cb93a386Sopenharmony_ci                    if (arg != NA) { inst.can_hoist &= optimized[arg].can_hoist; }
586cb93a386Sopenharmony_ci                }
587cb93a386Sopenharmony_ci            }
588cb93a386Sopenharmony_ci        }
589cb93a386Sopenharmony_ci
590cb93a386Sopenharmony_ci        // Extend the lifetime of any hoisted value that's used in the loop to infinity.
591cb93a386Sopenharmony_ci        for (OptimizedInstruction& inst : optimized) {
592cb93a386Sopenharmony_ci            if (!inst.can_hoist /*i.e. we're in the loop, so the arguments are used-in-loop*/) {
593cb93a386Sopenharmony_ci                for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
594cb93a386Sopenharmony_ci                    if (arg != NA && optimized[arg].can_hoist) {
595cb93a386Sopenharmony_ci                        optimized[arg].death = (Val)program.size();
596cb93a386Sopenharmony_ci                    }
597cb93a386Sopenharmony_ci                }
598cb93a386Sopenharmony_ci            }
599cb93a386Sopenharmony_ci        }
600cb93a386Sopenharmony_ci
601cb93a386Sopenharmony_ci        return optimized;
602cb93a386Sopenharmony_ci    }
603cb93a386Sopenharmony_ci
604cb93a386Sopenharmony_ci    std::vector<OptimizedInstruction> Builder::optimize() const {
605cb93a386Sopenharmony_ci        std::vector<Instruction> program = this->program();
606cb93a386Sopenharmony_ci        program = eliminate_dead_code(std::move(program));
607cb93a386Sopenharmony_ci        return    finalize           (std::move(program));
608cb93a386Sopenharmony_ci    }
609cb93a386Sopenharmony_ci
610cb93a386Sopenharmony_ci    Program Builder::done(const char* debug_name, bool allow_jit) const {
611cb93a386Sopenharmony_ci        char buf[64] = "skvm-jit-";
612cb93a386Sopenharmony_ci        if (!debug_name) {
613cb93a386Sopenharmony_ci            *SkStrAppendU32(buf+9, this->hash()) = '\0';
614cb93a386Sopenharmony_ci            debug_name = buf;
615cb93a386Sopenharmony_ci        }
616cb93a386Sopenharmony_ci
617cb93a386Sopenharmony_ci        return {this->optimize(), fStrides, debug_name, allow_jit};
618cb93a386Sopenharmony_ci    }
619cb93a386Sopenharmony_ci
620cb93a386Sopenharmony_ci    uint64_t Builder::hash() const {
621cb93a386Sopenharmony_ci        uint32_t lo = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 0),
622cb93a386Sopenharmony_ci                 hi = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 1);
623cb93a386Sopenharmony_ci        return (uint64_t)lo | (uint64_t)hi << 32;
624cb93a386Sopenharmony_ci    }
625cb93a386Sopenharmony_ci
626cb93a386Sopenharmony_ci    bool operator!=(Ptr a, Ptr b) { return a.ix != b.ix; }
627cb93a386Sopenharmony_ci
628cb93a386Sopenharmony_ci    bool operator==(const Instruction& a, const Instruction& b) {
629cb93a386Sopenharmony_ci        return a.op   == b.op
630cb93a386Sopenharmony_ci            && a.x    == b.x
631cb93a386Sopenharmony_ci            && a.y    == b.y
632cb93a386Sopenharmony_ci            && a.z    == b.z
633cb93a386Sopenharmony_ci            && a.w    == b.w
634cb93a386Sopenharmony_ci            && a.immA == b.immA
635cb93a386Sopenharmony_ci            && a.immB == b.immB
636cb93a386Sopenharmony_ci            && a.immC == b.immC;
637cb93a386Sopenharmony_ci    }
638cb93a386Sopenharmony_ci
639cb93a386Sopenharmony_ci    uint32_t InstructionHash::operator()(const Instruction& inst, uint32_t seed) const {
640cb93a386Sopenharmony_ci        return SkOpts::hash(&inst, sizeof(inst), seed);
641cb93a386Sopenharmony_ci    }
642cb93a386Sopenharmony_ci
643cb93a386Sopenharmony_ci
644cb93a386Sopenharmony_ci    // Most instructions produce a value and return it by ID,
645cb93a386Sopenharmony_ci    // the value-producing instruction's own index in the program vector.
646cb93a386Sopenharmony_ci    Val Builder::push(Instruction inst) {
647cb93a386Sopenharmony_ci        // Basic common subexpression elimination:
648cb93a386Sopenharmony_ci        // if we've already seen this exact Instruction, use it instead of creating a new one.
649cb93a386Sopenharmony_ci        //
650cb93a386Sopenharmony_ci        // But we never dedup loads or stores: an intervening store could change that memory.
651cb93a386Sopenharmony_ci        // Uniforms and gathers touch only uniform memory, so they're fine to dedup,
652cb93a386Sopenharmony_ci        // and index is varying but doesn't touch memory, so it's fine to dedup too.
653cb93a386Sopenharmony_ci        if (!touches_varying_memory(inst.op) && !is_trace(inst.op)) {
654cb93a386Sopenharmony_ci            if (Val* id = fIndex.find(inst)) {
655cb93a386Sopenharmony_ci                return *id;
656cb93a386Sopenharmony_ci            }
657cb93a386Sopenharmony_ci        }
658cb93a386Sopenharmony_ci        Val id = static_cast<Val>(fProgram.size());
659cb93a386Sopenharmony_ci        fProgram.push_back(inst);
660cb93a386Sopenharmony_ci        fIndex.set(inst, id);
661cb93a386Sopenharmony_ci        return id;
662cb93a386Sopenharmony_ci    }
663cb93a386Sopenharmony_ci
664cb93a386Sopenharmony_ci    Ptr Builder::arg(int stride) {
665cb93a386Sopenharmony_ci        int ix = (int)fStrides.size();
666cb93a386Sopenharmony_ci        fStrides.push_back(stride);
667cb93a386Sopenharmony_ci        return {ix};
668cb93a386Sopenharmony_ci    }
669cb93a386Sopenharmony_ci
670cb93a386Sopenharmony_ci    void Builder::assert_true(I32 cond, I32 debug) {
671cb93a386Sopenharmony_ci    #ifdef SK_DEBUG
672cb93a386Sopenharmony_ci        int imm;
673cb93a386Sopenharmony_ci        if (this->allImm(cond.id,&imm)) { SkASSERT(imm); return; }
674cb93a386Sopenharmony_ci        (void)push(Op::assert_true, cond.id, debug.id);
675cb93a386Sopenharmony_ci    #endif
676cb93a386Sopenharmony_ci    }
677cb93a386Sopenharmony_ci
678cb93a386Sopenharmony_ci    void Builder::trace_line(I32 mask, int line) {
679cb93a386Sopenharmony_ci        if (this->isImm(mask.id, 0)) { return; }
680cb93a386Sopenharmony_ci        (void)push(Op::trace_line, mask.id,NA,NA,NA, line);
681cb93a386Sopenharmony_ci    }
682cb93a386Sopenharmony_ci    void Builder::trace_var(I32 mask, int slot, I32 val) {
683cb93a386Sopenharmony_ci        if (this->isImm(mask.id, 0)) { return; }
684cb93a386Sopenharmony_ci        (void)push(Op::trace_var, mask.id,val.id,NA,NA, slot, kVarTypeInt.bits);
685cb93a386Sopenharmony_ci    }
686cb93a386Sopenharmony_ci    void Builder::trace_var(I32 mask, int slot, F32 val) {
687cb93a386Sopenharmony_ci        if (this->isImm(mask.id, 0)) { return; }
688cb93a386Sopenharmony_ci        (void)push(Op::trace_var, mask.id,val.id,NA,NA, slot, kVarTypeFloat.bits);
689cb93a386Sopenharmony_ci    }
690cb93a386Sopenharmony_ci    void Builder::trace_var(I32 mask, int slot, bool b) {
691cb93a386Sopenharmony_ci        if (this->isImm(mask.id, 0)) { return; }
692cb93a386Sopenharmony_ci        I32 val = b ? this->splat(1) : this->splat(0);
693cb93a386Sopenharmony_ci        (void)push(Op::trace_var, mask.id,val.id,NA,NA, slot, kVarTypeBool.bits);
694cb93a386Sopenharmony_ci    }
695cb93a386Sopenharmony_ci    void Builder::trace_call_enter(I32 mask, int line) {
696cb93a386Sopenharmony_ci        if (this->isImm(mask.id, 0)) { return; }
697cb93a386Sopenharmony_ci        (void)push(Op::trace_call, mask.id,NA,NA,NA, line, kCallTypeEnter.bits);
698cb93a386Sopenharmony_ci    }
699cb93a386Sopenharmony_ci    void Builder::trace_call_exit(I32 mask, int line) {
700cb93a386Sopenharmony_ci        if (this->isImm(mask.id, 0)) { return; }
701cb93a386Sopenharmony_ci        (void)push(Op::trace_call, mask.id,NA,NA,NA, line, kCallTypeExit.bits);
702cb93a386Sopenharmony_ci    }
703cb93a386Sopenharmony_ci
704cb93a386Sopenharmony_ci    void Builder::store8 (Ptr ptr, I32 val) { (void)push(Op::store8 , val.id,NA,NA,NA, ptr.ix); }
705cb93a386Sopenharmony_ci    void Builder::store16(Ptr ptr, I32 val) { (void)push(Op::store16, val.id,NA,NA,NA, ptr.ix); }
706cb93a386Sopenharmony_ci    void Builder::store32(Ptr ptr, I32 val) { (void)push(Op::store32, val.id,NA,NA,NA, ptr.ix); }
707cb93a386Sopenharmony_ci    void Builder::store64(Ptr ptr, I32 lo, I32 hi) {
708cb93a386Sopenharmony_ci        (void)push(Op::store64, lo.id,hi.id,NA,NA, ptr.ix);
709cb93a386Sopenharmony_ci    }
710cb93a386Sopenharmony_ci    void Builder::store128(Ptr ptr, I32 x, I32 y, I32 z, I32 w) {
711cb93a386Sopenharmony_ci        (void)push(Op::store128, x.id,y.id,z.id,w.id, ptr.ix);
712cb93a386Sopenharmony_ci    }
713cb93a386Sopenharmony_ci
714cb93a386Sopenharmony_ci    I32 Builder::index() { return {this, push(Op::index)}; }
715cb93a386Sopenharmony_ci
716cb93a386Sopenharmony_ci    I32 Builder::load8 (Ptr ptr) { return {this, push(Op::load8 , NA,NA,NA,NA, ptr.ix) }; }
717cb93a386Sopenharmony_ci    I32 Builder::load16(Ptr ptr) { return {this, push(Op::load16, NA,NA,NA,NA, ptr.ix) }; }
718cb93a386Sopenharmony_ci    I32 Builder::load32(Ptr ptr) { return {this, push(Op::load32, NA,NA,NA,NA, ptr.ix) }; }
719cb93a386Sopenharmony_ci    I32 Builder::load64(Ptr ptr, int lane) {
720cb93a386Sopenharmony_ci        return {this, push(Op::load64 , NA,NA,NA,NA, ptr.ix,lane) };
721cb93a386Sopenharmony_ci    }
722cb93a386Sopenharmony_ci    I32 Builder::load128(Ptr ptr, int lane) {
723cb93a386Sopenharmony_ci        return {this, push(Op::load128, NA,NA,NA,NA, ptr.ix,lane) };
724cb93a386Sopenharmony_ci    }
725cb93a386Sopenharmony_ci
726cb93a386Sopenharmony_ci    I32 Builder::gather8 (UPtr ptr, int offset, I32 index) {
727cb93a386Sopenharmony_ci        return {this, push(Op::gather8 , index.id,NA,NA,NA, ptr.ix,offset)};
728cb93a386Sopenharmony_ci    }
729cb93a386Sopenharmony_ci    I32 Builder::gather16(UPtr ptr, int offset, I32 index) {
730cb93a386Sopenharmony_ci        return {this, push(Op::gather16, index.id,NA,NA,NA, ptr.ix,offset)};
731cb93a386Sopenharmony_ci    }
732cb93a386Sopenharmony_ci    I32 Builder::gather32(UPtr ptr, int offset, I32 index) {
733cb93a386Sopenharmony_ci        return {this, push(Op::gather32, index.id,NA,NA,NA, ptr.ix,offset)};
734cb93a386Sopenharmony_ci    }
735cb93a386Sopenharmony_ci
736cb93a386Sopenharmony_ci    I32 Builder::uniform32(UPtr ptr, int offset) {
737cb93a386Sopenharmony_ci        return {this, push(Op::uniform32, NA,NA,NA,NA, ptr.ix, offset)};
738cb93a386Sopenharmony_ci    }
739cb93a386Sopenharmony_ci
740cb93a386Sopenharmony_ci    // Note: this converts the array index into a byte offset for the op.
741cb93a386Sopenharmony_ci    I32 Builder::array32  (UPtr ptr, int offset, int index) {
742cb93a386Sopenharmony_ci        return {this, push(Op::array32, NA,NA,NA,NA, ptr.ix, offset, index * sizeof(int))};
743cb93a386Sopenharmony_ci    }
744cb93a386Sopenharmony_ci
745cb93a386Sopenharmony_ci    I32 Builder::splat(int n) { return {this, push(Op::splat, NA,NA,NA,NA, n) }; }
746cb93a386Sopenharmony_ci
747cb93a386Sopenharmony_ci    // Be careful peepholing float math!  Transformations you might expect to
748cb93a386Sopenharmony_ci    // be legal can fail in the face of NaN/Inf, e.g. 0*x is not always 0.
749cb93a386Sopenharmony_ci    // Float peepholes must pass this equivalence test for all ~4B floats:
750cb93a386Sopenharmony_ci    //
751cb93a386Sopenharmony_ci    //     bool equiv(float x, float y) { return (x == y) || (isnanf(x) && isnanf(y)); }
752cb93a386Sopenharmony_ci    //
753cb93a386Sopenharmony_ci    //     unsigned bits = 0;
754cb93a386Sopenharmony_ci    //     do {
755cb93a386Sopenharmony_ci    //        float f;
756cb93a386Sopenharmony_ci    //        memcpy(&f, &bits, 4);
757cb93a386Sopenharmony_ci    //        if (!equiv(f, ...)) {
758cb93a386Sopenharmony_ci    //           abort();
759cb93a386Sopenharmony_ci    //        }
760cb93a386Sopenharmony_ci    //     } while (++bits != 0);
761cb93a386Sopenharmony_ci
762cb93a386Sopenharmony_ci    F32 Builder::add(F32 x, F32 y) {
763cb93a386Sopenharmony_ci        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); }
764cb93a386Sopenharmony_ci        if (this->isImm(y.id, 0.0f)) { return x; }   // x+0 == x
765cb93a386Sopenharmony_ci        if (this->isImm(x.id, 0.0f)) { return y; }   // 0+y == y
766cb93a386Sopenharmony_ci
767cb93a386Sopenharmony_ci        if (fFeatures.fma) {
768cb93a386Sopenharmony_ci            if (fProgram[x.id].op == Op::mul_f32) {
769cb93a386Sopenharmony_ci                return {this, this->push(Op::fma_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)};
770cb93a386Sopenharmony_ci            }
771cb93a386Sopenharmony_ci            if (fProgram[y.id].op == Op::mul_f32) {
772cb93a386Sopenharmony_ci                return {this, this->push(Op::fma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)};
773cb93a386Sopenharmony_ci            }
774cb93a386Sopenharmony_ci        }
775cb93a386Sopenharmony_ci        return {this, this->push(Op::add_f32, x.id, y.id)};
776cb93a386Sopenharmony_ci    }
777cb93a386Sopenharmony_ci
778cb93a386Sopenharmony_ci    F32 Builder::sub(F32 x, F32 y) {
779cb93a386Sopenharmony_ci        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); }
780cb93a386Sopenharmony_ci        if (this->isImm(y.id, 0.0f)) { return x; }   // x-0 == x
781cb93a386Sopenharmony_ci        if (fFeatures.fma) {
782cb93a386Sopenharmony_ci            if (fProgram[x.id].op == Op::mul_f32) {
783cb93a386Sopenharmony_ci                return {this, this->push(Op::fms_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)};
784cb93a386Sopenharmony_ci            }
785cb93a386Sopenharmony_ci            if (fProgram[y.id].op == Op::mul_f32) {
786cb93a386Sopenharmony_ci                return {this, this->push(Op::fnma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)};
787cb93a386Sopenharmony_ci            }
788cb93a386Sopenharmony_ci        }
789cb93a386Sopenharmony_ci        return {this, this->push(Op::sub_f32, x.id, y.id)};
790cb93a386Sopenharmony_ci    }
791cb93a386Sopenharmony_ci
792cb93a386Sopenharmony_ci    F32 Builder::mul(F32 x, F32 y) {
793cb93a386Sopenharmony_ci        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); }
794cb93a386Sopenharmony_ci        if (this->isImm(y.id, 1.0f)) { return x; }  // x*1 == x
795cb93a386Sopenharmony_ci        if (this->isImm(x.id, 1.0f)) { return y; }  // 1*y == y
796cb93a386Sopenharmony_ci        return {this, this->push(Op::mul_f32, x.id, y.id)};
797cb93a386Sopenharmony_ci    }
798cb93a386Sopenharmony_ci
799cb93a386Sopenharmony_ci    F32 Builder::fast_mul(F32 x, F32 y) {
800cb93a386Sopenharmony_ci        if (this->isImm(x.id, 0.0f) || this->isImm(y.id, 0.0f)) { return splat(0.0f); }
801cb93a386Sopenharmony_ci        return mul(x,y);
802cb93a386Sopenharmony_ci    }
803cb93a386Sopenharmony_ci
804cb93a386Sopenharmony_ci    F32 Builder::div(F32 x, F32 y) {
805cb93a386Sopenharmony_ci        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(sk_ieee_float_divide(X,Y)); }
806cb93a386Sopenharmony_ci        if (this->isImm(y.id, 1.0f)) { return x; }  // x/1 == x
807cb93a386Sopenharmony_ci        return {this, this->push(Op::div_f32, x.id, y.id)};
808cb93a386Sopenharmony_ci    }
809cb93a386Sopenharmony_ci
810cb93a386Sopenharmony_ci    F32 Builder::sqrt(F32 x) {
811cb93a386Sopenharmony_ci        if (float X; this->allImm(x.id,&X)) { return splat(std::sqrt(X)); }
812cb93a386Sopenharmony_ci        return {this, this->push(Op::sqrt_f32, x.id)};
813cb93a386Sopenharmony_ci    }
814cb93a386Sopenharmony_ci
815cb93a386Sopenharmony_ci    // See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html.
816cb93a386Sopenharmony_ci    F32 Builder::approx_log2(F32 x) {
817cb93a386Sopenharmony_ci        // e - 127 is a fair approximation of log2(x) in its own right...
818cb93a386Sopenharmony_ci        F32 e = mul(to_F32(pun_to_I32(x)), splat(1.0f / (1<<23)));
819cb93a386Sopenharmony_ci
820cb93a386Sopenharmony_ci        // ... but using the mantissa to refine its error is _much_ better.
821cb93a386Sopenharmony_ci        F32 m = pun_to_F32(bit_or(bit_and(pun_to_I32(x), 0x007fffff),
822cb93a386Sopenharmony_ci                                0x3f000000));
823cb93a386Sopenharmony_ci        F32 approx = sub(e,        124.225514990f);
824cb93a386Sopenharmony_ci            approx = sub(approx, mul(1.498030302f, m));
825cb93a386Sopenharmony_ci            approx = sub(approx, div(1.725879990f, add(0.3520887068f, m)));
826cb93a386Sopenharmony_ci
827cb93a386Sopenharmony_ci        return approx;
828cb93a386Sopenharmony_ci    }
829cb93a386Sopenharmony_ci
830cb93a386Sopenharmony_ci    F32 Builder::approx_pow2(F32 x) {
831cb93a386Sopenharmony_ci        F32 f = fract(x);
832cb93a386Sopenharmony_ci        F32 approx = add(x,         121.274057500f);
833cb93a386Sopenharmony_ci            approx = sub(approx, mul( 1.490129070f, f));
834cb93a386Sopenharmony_ci            approx = add(approx, div(27.728023300f, sub(4.84252568f, f)));
835cb93a386Sopenharmony_ci
836cb93a386Sopenharmony_ci        return pun_to_F32(round(mul(1.0f * (1<<23), approx)));
837cb93a386Sopenharmony_ci    }
838cb93a386Sopenharmony_ci
839cb93a386Sopenharmony_ci    F32 Builder::approx_powf(F32 x, F32 y) {
840cb93a386Sopenharmony_ci        // TODO: assert this instead?  Sometimes x is very slightly negative.  See skia:10210.
841cb93a386Sopenharmony_ci        x = max(0.0f, x);
842cb93a386Sopenharmony_ci
843cb93a386Sopenharmony_ci        auto is_x = bit_or(eq(x, 0.0f),
844cb93a386Sopenharmony_ci                           eq(x, 1.0f));
845cb93a386Sopenharmony_ci        return select(is_x, x, approx_pow2(mul(approx_log2(x), y)));
846cb93a386Sopenharmony_ci    }
847cb93a386Sopenharmony_ci
848cb93a386Sopenharmony_ci    // Bhaskara I's sine approximation
849cb93a386Sopenharmony_ci    // 16x(pi - x) / (5*pi^2 - 4x(pi - x)
850cb93a386Sopenharmony_ci    // ... divide by 4
851cb93a386Sopenharmony_ci    // 4x(pi - x) / 5*pi^2/4 - x(pi - x)
852cb93a386Sopenharmony_ci    //
853cb93a386Sopenharmony_ci    // This is a good approximation only for 0 <= x <= pi, so we use symmetries to get
854cb93a386Sopenharmony_ci    // radians into that range first.
855cb93a386Sopenharmony_ci    //
856cb93a386Sopenharmony_ci    F32 Builder::approx_sin(F32 radians) {
857cb93a386Sopenharmony_ci        constexpr float Pi = SK_ScalarPI;
858cb93a386Sopenharmony_ci        // x = radians mod 2pi
859cb93a386Sopenharmony_ci        F32 x = fract(radians * (0.5f/Pi)) * (2*Pi);
860cb93a386Sopenharmony_ci        I32 neg = x > Pi;   // are we pi < x < 2pi --> need to negate result
861cb93a386Sopenharmony_ci        x = select(neg, x - Pi, x);
862cb93a386Sopenharmony_ci
863cb93a386Sopenharmony_ci        F32 pair = x * (Pi - x);
864cb93a386Sopenharmony_ci        x = 4.0f * pair / ((5*Pi*Pi/4) - pair);
865cb93a386Sopenharmony_ci        x = select(neg, -x, x);
866cb93a386Sopenharmony_ci        return x;
867cb93a386Sopenharmony_ci    }
868cb93a386Sopenharmony_ci
869cb93a386Sopenharmony_ci    /*  "GENERATING ACCURATE VALUES FOR THE TANGENT FUNCTION"
870cb93a386Sopenharmony_ci         https://mae.ufl.edu/~uhk/ACCURATE-TANGENT.pdf
871cb93a386Sopenharmony_ci
872cb93a386Sopenharmony_ci        approx = x + (1/3)x^3 + (2/15)x^5 + (17/315)x^7 + (62/2835)x^9
873cb93a386Sopenharmony_ci
874cb93a386Sopenharmony_ci        Some simplifications:
875cb93a386Sopenharmony_ci        1. tan(x) is periodic, -PI/2 < x < PI/2
876cb93a386Sopenharmony_ci        2. tan(x) is odd, so tan(-x) = -tan(x)
877cb93a386Sopenharmony_ci        3. Our polynomial approximation is best near zero, so we use the following identity
878cb93a386Sopenharmony_ci                        tan(x) + tan(y)
879cb93a386Sopenharmony_ci           tan(x + y) = -----------------
880cb93a386Sopenharmony_ci                       1 - tan(x)*tan(y)
881cb93a386Sopenharmony_ci           tan(PI/4) = 1
882cb93a386Sopenharmony_ci
883cb93a386Sopenharmony_ci           So for x > PI/8, we do the following refactor:
884cb93a386Sopenharmony_ci           x' = x - PI/4
885cb93a386Sopenharmony_ci
886cb93a386Sopenharmony_ci                    1 + tan(x')
887cb93a386Sopenharmony_ci           tan(x) = ------------
888cb93a386Sopenharmony_ci                    1 - tan(x')
889cb93a386Sopenharmony_ci     */
890cb93a386Sopenharmony_ci    F32 Builder::approx_tan(F32 x) {
891cb93a386Sopenharmony_ci        constexpr float Pi = SK_ScalarPI;
892cb93a386Sopenharmony_ci        // periodic between -pi/2 ... pi/2
893cb93a386Sopenharmony_ci        // shift to 0...Pi, scale 1/Pi to get into 0...1, then fract, scale-up, shift-back
894cb93a386Sopenharmony_ci        x = fract((1/Pi)*x + 0.5f) * Pi - (Pi/2);
895cb93a386Sopenharmony_ci
896cb93a386Sopenharmony_ci        I32 neg = (x < 0.0f);
897cb93a386Sopenharmony_ci        x = select(neg, -x, x);
898cb93a386Sopenharmony_ci
899cb93a386Sopenharmony_ci        // minimize total error by shifting if x > pi/8
900cb93a386Sopenharmony_ci        I32 use_quotient = (x > (Pi/8));
901cb93a386Sopenharmony_ci        x = select(use_quotient, x - (Pi/4), x);
902cb93a386Sopenharmony_ci
903cb93a386Sopenharmony_ci        // 9th order poly = 4th order(x^2) * x
904cb93a386Sopenharmony_ci        x = poly(x*x, 62/2835.0f, 17/315.0f, 2/15.0f, 1/3.0f, 1.0f) * x;
905cb93a386Sopenharmony_ci        x = select(use_quotient, (1+x)/(1-x), x);
906cb93a386Sopenharmony_ci        x = select(neg, -x, x);
907cb93a386Sopenharmony_ci        return x;
908cb93a386Sopenharmony_ci    }
909cb93a386Sopenharmony_ci
910cb93a386Sopenharmony_ci     // http://mathforum.org/library/drmath/view/54137.html
911cb93a386Sopenharmony_ci     // referencing Handbook of Mathematical Functions,
912cb93a386Sopenharmony_ci     //             by Milton Abramowitz and Irene Stegun
913cb93a386Sopenharmony_ci     F32 Builder::approx_asin(F32 x) {
914cb93a386Sopenharmony_ci         I32 neg = (x < 0.0f);
915cb93a386Sopenharmony_ci         x = select(neg, -x, x);
916cb93a386Sopenharmony_ci         x = SK_ScalarPI/2 - sqrt(1-x) * poly(x, -0.0187293f, 0.0742610f, -0.2121144f, 1.5707288f);
917cb93a386Sopenharmony_ci         x = select(neg, -x, x);
918cb93a386Sopenharmony_ci         return x;
919cb93a386Sopenharmony_ci     }
920cb93a386Sopenharmony_ci
921cb93a386Sopenharmony_ci    /*  Use 4th order polynomial approximation from https://arachnoid.com/polysolve/
922cb93a386Sopenharmony_ci     *      with 129 values of x,atan(x) for x:[0...1]
923cb93a386Sopenharmony_ci     *  This only works for 0 <= x <= 1
924cb93a386Sopenharmony_ci     */
925cb93a386Sopenharmony_ci    static F32 approx_atan_unit(F32 x) {
926cb93a386Sopenharmony_ci        // for now we might be given NaN... let that through
927cb93a386Sopenharmony_ci        x->assert_true((x != x) | ((x >= 0) & (x <= 1)));
928cb93a386Sopenharmony_ci        return poly(x, 0.14130025741326729f,
929cb93a386Sopenharmony_ci                      -0.34312835980675116f,
930cb93a386Sopenharmony_ci                      -0.016172900528248768f,
931cb93a386Sopenharmony_ci                       1.0037696976200385f,
932cb93a386Sopenharmony_ci                      -0.00014758242182738969f);
933cb93a386Sopenharmony_ci    }
934cb93a386Sopenharmony_ci
935cb93a386Sopenharmony_ci    /*  Use identity atan(x) = pi/2 - atan(1/x) for x > 1
936cb93a386Sopenharmony_ci     */
937cb93a386Sopenharmony_ci    F32 Builder::approx_atan(F32 x) {
938cb93a386Sopenharmony_ci        I32 neg = (x < 0.0f);
939cb93a386Sopenharmony_ci        x = select(neg, -x, x);
940cb93a386Sopenharmony_ci        I32 flip = (x > 1.0f);
941cb93a386Sopenharmony_ci        x = select(flip, 1/x, x);
942cb93a386Sopenharmony_ci        x = approx_atan_unit(x);
943cb93a386Sopenharmony_ci        x = select(flip, SK_ScalarPI/2 - x, x);
944cb93a386Sopenharmony_ci        x = select(neg, -x, x);
945cb93a386Sopenharmony_ci        return x;
946cb93a386Sopenharmony_ci    }
947cb93a386Sopenharmony_ci
948cb93a386Sopenharmony_ci    /*  Use identity atan(x) = pi/2 - atan(1/x) for x > 1
949cb93a386Sopenharmony_ci     *  By swapping y,x to ensure the ratio is <= 1, we can safely call atan_unit()
950cb93a386Sopenharmony_ci     *  which avoids a 2nd divide instruction if we had instead called atan().
951cb93a386Sopenharmony_ci     */
952cb93a386Sopenharmony_ci    F32 Builder::approx_atan2(F32 y0, F32 x0) {
953cb93a386Sopenharmony_ci
954cb93a386Sopenharmony_ci        I32 flip = (abs(y0) > abs(x0));
955cb93a386Sopenharmony_ci        F32 y = select(flip, x0, y0);
956cb93a386Sopenharmony_ci        F32 x = select(flip, y0, x0);
957cb93a386Sopenharmony_ci        F32 arg = y/x;
958cb93a386Sopenharmony_ci
959cb93a386Sopenharmony_ci        I32 neg = (arg < 0.0f);
960cb93a386Sopenharmony_ci        arg = select(neg, -arg, arg);
961cb93a386Sopenharmony_ci
962cb93a386Sopenharmony_ci        F32 r = approx_atan_unit(arg);
963cb93a386Sopenharmony_ci        r = select(flip, SK_ScalarPI/2 - r, r);
964cb93a386Sopenharmony_ci        r = select(neg, -r, r);
965cb93a386Sopenharmony_ci
966cb93a386Sopenharmony_ci        // handle quadrant distinctions
967cb93a386Sopenharmony_ci        r = select((y0 >= 0) & (x0  < 0), r + SK_ScalarPI, r);
968cb93a386Sopenharmony_ci        r = select((y0  < 0) & (x0 <= 0), r - SK_ScalarPI, r);
969cb93a386Sopenharmony_ci        // Note: we don't try to handle 0,0 or infinities (yet)
970cb93a386Sopenharmony_ci        return r;
971cb93a386Sopenharmony_ci    }
972cb93a386Sopenharmony_ci
973cb93a386Sopenharmony_ci    F32 Builder::min(F32 x, F32 y) {
974cb93a386Sopenharmony_ci        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::min(X,Y)); }
975cb93a386Sopenharmony_ci        return {this, this->push(Op::min_f32, x.id, y.id)};
976cb93a386Sopenharmony_ci    }
977cb93a386Sopenharmony_ci    F32 Builder::max(F32 x, F32 y) {
978cb93a386Sopenharmony_ci        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::max(X,Y)); }
979cb93a386Sopenharmony_ci        return {this, this->push(Op::max_f32, x.id, y.id)};
980cb93a386Sopenharmony_ci    }
981cb93a386Sopenharmony_ci
982cb93a386Sopenharmony_ci    SK_ATTRIBUTE(no_sanitize("signed-integer-overflow"))
983cb93a386Sopenharmony_ci    I32 Builder::add(I32 x, I32 y) {
984cb93a386Sopenharmony_ci        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); }
985cb93a386Sopenharmony_ci        if (this->isImm(x.id, 0)) { return y; }
986cb93a386Sopenharmony_ci        if (this->isImm(y.id, 0)) { return x; }
987cb93a386Sopenharmony_ci        return {this, this->push(Op::add_i32, x.id, y.id)};
988cb93a386Sopenharmony_ci    }
989cb93a386Sopenharmony_ci    SK_ATTRIBUTE(no_sanitize("signed-integer-overflow"))
990cb93a386Sopenharmony_ci    I32 Builder::sub(I32 x, I32 y) {
991cb93a386Sopenharmony_ci        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); }
992cb93a386Sopenharmony_ci        if (this->isImm(y.id, 0)) { return x; }
993cb93a386Sopenharmony_ci        return {this, this->push(Op::sub_i32, x.id, y.id)};
994cb93a386Sopenharmony_ci    }
995cb93a386Sopenharmony_ci    SK_ATTRIBUTE(no_sanitize("signed-integer-overflow"))
996cb93a386Sopenharmony_ci    I32 Builder::mul(I32 x, I32 y) {
997cb93a386Sopenharmony_ci        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); }
998cb93a386Sopenharmony_ci        if (this->isImm(x.id, 0)) { return splat(0); }
999cb93a386Sopenharmony_ci        if (this->isImm(y.id, 0)) { return splat(0); }
1000cb93a386Sopenharmony_ci        if (this->isImm(x.id, 1)) { return y; }
1001cb93a386Sopenharmony_ci        if (this->isImm(y.id, 1)) { return x; }
1002cb93a386Sopenharmony_ci        return {this, this->push(Op::mul_i32, x.id, y.id)};
1003cb93a386Sopenharmony_ci    }
1004cb93a386Sopenharmony_ci
1005cb93a386Sopenharmony_ci    SK_ATTRIBUTE(no_sanitize("shift"))
1006cb93a386Sopenharmony_ci    I32 Builder::shl(I32 x, int bits) {
1007cb93a386Sopenharmony_ci        if (bits == 0) { return x; }
1008cb93a386Sopenharmony_ci        if (int X; this->allImm(x.id,&X)) { return splat(X << bits); }
1009cb93a386Sopenharmony_ci        return {this, this->push(Op::shl_i32, x.id,NA,NA,NA, bits)};
1010cb93a386Sopenharmony_ci    }
1011cb93a386Sopenharmony_ci    I32 Builder::shr(I32 x, int bits) {
1012cb93a386Sopenharmony_ci        if (bits == 0) { return x; }
1013cb93a386Sopenharmony_ci        if (int X; this->allImm(x.id,&X)) { return splat(unsigned(X) >> bits); }
1014cb93a386Sopenharmony_ci        return {this, this->push(Op::shr_i32, x.id,NA,NA,NA, bits)};
1015cb93a386Sopenharmony_ci    }
1016cb93a386Sopenharmony_ci    I32 Builder::sra(I32 x, int bits) {
1017cb93a386Sopenharmony_ci        if (bits == 0) { return x; }
1018cb93a386Sopenharmony_ci        if (int X; this->allImm(x.id,&X)) { return splat(X >> bits); }
1019cb93a386Sopenharmony_ci        return {this, this->push(Op::sra_i32, x.id,NA,NA,NA, bits)};
1020cb93a386Sopenharmony_ci    }
1021cb93a386Sopenharmony_ci
1022cb93a386Sopenharmony_ci    I32 Builder:: eq(F32 x, F32 y) {
1023cb93a386Sopenharmony_ci        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X==Y ? ~0 : 0); }
1024cb93a386Sopenharmony_ci        return {this, this->push(Op::eq_f32, x.id, y.id)};
1025cb93a386Sopenharmony_ci    }
1026cb93a386Sopenharmony_ci    I32 Builder::neq(F32 x, F32 y) {
1027cb93a386Sopenharmony_ci        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X!=Y ? ~0 : 0); }
1028cb93a386Sopenharmony_ci        return {this, this->push(Op::neq_f32, x.id, y.id)};
1029cb93a386Sopenharmony_ci    }
1030cb93a386Sopenharmony_ci    I32 Builder::lt(F32 x, F32 y) {
1031cb93a386Sopenharmony_ci        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y> X ? ~0 : 0); }
1032cb93a386Sopenharmony_ci        return {this, this->push(Op::gt_f32, y.id, x.id)};
1033cb93a386Sopenharmony_ci    }
1034cb93a386Sopenharmony_ci    I32 Builder::lte(F32 x, F32 y) {
1035cb93a386Sopenharmony_ci        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y>=X ? ~0 : 0); }
1036cb93a386Sopenharmony_ci        return {this, this->push(Op::gte_f32, y.id, x.id)};
1037cb93a386Sopenharmony_ci    }
1038cb93a386Sopenharmony_ci    I32 Builder::gt(F32 x, F32 y) {
1039cb93a386Sopenharmony_ci        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X> Y ? ~0 : 0); }
1040cb93a386Sopenharmony_ci        return {this, this->push(Op::gt_f32, x.id, y.id)};
1041cb93a386Sopenharmony_ci    }
1042cb93a386Sopenharmony_ci    I32 Builder::gte(F32 x, F32 y) {
1043cb93a386Sopenharmony_ci        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X>=Y ? ~0 : 0); }
1044cb93a386Sopenharmony_ci        return {this, this->push(Op::gte_f32, x.id, y.id)};
1045cb93a386Sopenharmony_ci    }
1046cb93a386Sopenharmony_ci
1047cb93a386Sopenharmony_ci    I32 Builder:: eq(I32 x, I32 y) {
1048cb93a386Sopenharmony_ci        if (x.id == y.id) { return splat(~0); }
1049cb93a386Sopenharmony_ci        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X==Y ? ~0 : 0); }
1050cb93a386Sopenharmony_ci        return {this, this->push(Op:: eq_i32, x.id, y.id)};
1051cb93a386Sopenharmony_ci    }
1052cb93a386Sopenharmony_ci    I32 Builder::neq(I32 x, I32 y) {
1053cb93a386Sopenharmony_ci        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X!=Y ? ~0 : 0); }
1054cb93a386Sopenharmony_ci        return ~(x == y);
1055cb93a386Sopenharmony_ci    }
1056cb93a386Sopenharmony_ci    I32 Builder:: gt(I32 x, I32 y) {
1057cb93a386Sopenharmony_ci        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X> Y ? ~0 : 0); }
1058cb93a386Sopenharmony_ci        return {this, this->push(Op:: gt_i32, x.id, y.id)};
1059cb93a386Sopenharmony_ci    }
1060cb93a386Sopenharmony_ci    I32 Builder::gte(I32 x, I32 y) {
1061cb93a386Sopenharmony_ci        if (x.id == y.id) { return splat(~0); }
1062cb93a386Sopenharmony_ci        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X>=Y ? ~0 : 0); }
1063cb93a386Sopenharmony_ci        return ~(x < y);
1064cb93a386Sopenharmony_ci    }
1065cb93a386Sopenharmony_ci    I32 Builder:: lt(I32 x, I32 y) { return y>x; }
1066cb93a386Sopenharmony_ci    I32 Builder::lte(I32 x, I32 y) { return y>=x; }
1067cb93a386Sopenharmony_ci
1068cb93a386Sopenharmony_ci    I32 Builder::bit_and(I32 x, I32 y) {
1069cb93a386Sopenharmony_ci        if (x.id == y.id) { return x; }
1070cb93a386Sopenharmony_ci        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&Y); }
1071cb93a386Sopenharmony_ci        if (this->isImm(y.id, 0)) { return splat(0); }   // (x & false) == false
1072cb93a386Sopenharmony_ci        if (this->isImm(x.id, 0)) { return splat(0); }   // (false & y) == false
1073cb93a386Sopenharmony_ci        if (this->isImm(y.id,~0)) { return x; }          // (x & true) == x
1074cb93a386Sopenharmony_ci        if (this->isImm(x.id,~0)) { return y; }          // (true & y) == y
1075cb93a386Sopenharmony_ci        return {this, this->push(Op::bit_and, x.id, y.id)};
1076cb93a386Sopenharmony_ci    }
1077cb93a386Sopenharmony_ci    I32 Builder::bit_or(I32 x, I32 y) {
1078cb93a386Sopenharmony_ci        if (x.id == y.id) { return x; }
1079cb93a386Sopenharmony_ci        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X|Y); }
1080cb93a386Sopenharmony_ci        if (this->isImm(y.id, 0)) { return x; }           // (x | false) == x
1081cb93a386Sopenharmony_ci        if (this->isImm(x.id, 0)) { return y; }           // (false | y) == y
1082cb93a386Sopenharmony_ci        if (this->isImm(y.id,~0)) { return splat(~0); }   // (x | true) == true
1083cb93a386Sopenharmony_ci        if (this->isImm(x.id,~0)) { return splat(~0); }   // (true | y) == true
1084cb93a386Sopenharmony_ci        return {this, this->push(Op::bit_or, x.id, y.id)};
1085cb93a386Sopenharmony_ci    }
1086cb93a386Sopenharmony_ci    I32 Builder::bit_xor(I32 x, I32 y) {
1087cb93a386Sopenharmony_ci        if (x.id == y.id) { return splat(0); }
1088cb93a386Sopenharmony_ci        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X^Y); }
1089cb93a386Sopenharmony_ci        if (this->isImm(y.id, 0)) { return x; }   // (x ^ false) == x
1090cb93a386Sopenharmony_ci        if (this->isImm(x.id, 0)) { return y; }   // (false ^ y) == y
1091cb93a386Sopenharmony_ci        return {this, this->push(Op::bit_xor, x.id, y.id)};
1092cb93a386Sopenharmony_ci    }
1093cb93a386Sopenharmony_ci
1094cb93a386Sopenharmony_ci    I32 Builder::bit_clear(I32 x, I32 y) {
1095cb93a386Sopenharmony_ci        if (x.id == y.id) { return splat(0); }
1096cb93a386Sopenharmony_ci        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&~Y); }
1097cb93a386Sopenharmony_ci        if (this->isImm(y.id, 0)) { return x; }          // (x & ~false) == x
1098cb93a386Sopenharmony_ci        if (this->isImm(y.id,~0)) { return splat(0); }   // (x & ~true) == false
1099cb93a386Sopenharmony_ci        if (this->isImm(x.id, 0)) { return splat(0); }   // (false & ~y) == false
1100cb93a386Sopenharmony_ci        return {this, this->push(Op::bit_clear, x.id, y.id)};
1101cb93a386Sopenharmony_ci    }
1102cb93a386Sopenharmony_ci
1103cb93a386Sopenharmony_ci    I32 Builder::select(I32 x, I32 y, I32 z) {
1104cb93a386Sopenharmony_ci        if (y.id == z.id) { return y; }
1105cb93a386Sopenharmony_ci        if (int X,Y,Z; this->allImm(x.id,&X, y.id,&Y, z.id,&Z)) { return splat(X?Y:Z); }
1106cb93a386Sopenharmony_ci        if (this->isImm(x.id,~0)) { return y; }               // true  ? y : z == y
1107cb93a386Sopenharmony_ci        if (this->isImm(x.id, 0)) { return z; }               // false ? y : z == z
1108cb93a386Sopenharmony_ci        if (this->isImm(y.id, 0)) { return bit_clear(z,x); }  //     x ? 0 : z == ~x&z
1109cb93a386Sopenharmony_ci        if (this->isImm(z.id, 0)) { return bit_and  (y,x); }  //     x ? y : 0 ==  x&y
1110cb93a386Sopenharmony_ci        return {this, this->push(Op::select, x.id, y.id, z.id)};
1111cb93a386Sopenharmony_ci    }
1112cb93a386Sopenharmony_ci
1113cb93a386Sopenharmony_ci    I32 Builder::extract(I32 x, int bits, I32 z) {
1114cb93a386Sopenharmony_ci        if (unsigned Z; this->allImm(z.id,&Z) && (~0u>>bits) == Z) { return shr(x, bits); }
1115cb93a386Sopenharmony_ci        return bit_and(z, shr(x, bits));
1116cb93a386Sopenharmony_ci    }
1117cb93a386Sopenharmony_ci
1118cb93a386Sopenharmony_ci    I32 Builder::pack(I32 x, I32 y, int bits) {
1119cb93a386Sopenharmony_ci        return bit_or(x, shl(y, bits));
1120cb93a386Sopenharmony_ci    }
1121cb93a386Sopenharmony_ci
1122cb93a386Sopenharmony_ci    F32 Builder::ceil(F32 x) {
1123cb93a386Sopenharmony_ci        if (float X; this->allImm(x.id,&X)) { return splat(ceilf(X)); }
1124cb93a386Sopenharmony_ci        return {this, this->push(Op::ceil, x.id)};
1125cb93a386Sopenharmony_ci    }
1126cb93a386Sopenharmony_ci    F32 Builder::floor(F32 x) {
1127cb93a386Sopenharmony_ci        if (float X; this->allImm(x.id,&X)) { return splat(floorf(X)); }
1128cb93a386Sopenharmony_ci        return {this, this->push(Op::floor, x.id)};
1129cb93a386Sopenharmony_ci    }
1130cb93a386Sopenharmony_ci    F32 Builder::to_F32(I32 x) {
1131cb93a386Sopenharmony_ci        if (int X; this->allImm(x.id,&X)) { return splat((float)X); }
1132cb93a386Sopenharmony_ci        return {this, this->push(Op::to_f32, x.id)};
1133cb93a386Sopenharmony_ci    }
1134cb93a386Sopenharmony_ci    I32 Builder::trunc(F32 x) {
1135cb93a386Sopenharmony_ci        if (float X; this->allImm(x.id,&X)) { return splat((int)X); }
1136cb93a386Sopenharmony_ci        return {this, this->push(Op::trunc, x.id)};
1137cb93a386Sopenharmony_ci    }
1138cb93a386Sopenharmony_ci    I32 Builder::round(F32 x) {
1139cb93a386Sopenharmony_ci        if (float X; this->allImm(x.id,&X)) { return splat((int)lrintf(X)); }
1140cb93a386Sopenharmony_ci        return {this, this->push(Op::round, x.id)};
1141cb93a386Sopenharmony_ci    }
1142cb93a386Sopenharmony_ci
1143cb93a386Sopenharmony_ci    I32 Builder::to_fp16(F32 x) {
1144cb93a386Sopenharmony_ci        if (float X; this->allImm(x.id,&X)) { return splat((int)SkFloatToHalf(X)); }
1145cb93a386Sopenharmony_ci        return {this, this->push(Op::to_fp16, x.id)};
1146cb93a386Sopenharmony_ci    }
1147cb93a386Sopenharmony_ci    F32 Builder::from_fp16(I32 x) {
1148cb93a386Sopenharmony_ci        if (int X; this->allImm(x.id,&X)) { return splat(SkHalfToFloat(X)); }
1149cb93a386Sopenharmony_ci        return {this, this->push(Op::from_fp16, x.id)};
1150cb93a386Sopenharmony_ci    }
1151cb93a386Sopenharmony_ci
1152cb93a386Sopenharmony_ci    F32 Builder::from_unorm(int bits, I32 x) {
1153cb93a386Sopenharmony_ci        F32 limit = splat(1 / ((1<<bits)-1.0f));
1154cb93a386Sopenharmony_ci        return mul(to_F32(x), limit);
1155cb93a386Sopenharmony_ci    }
1156cb93a386Sopenharmony_ci    I32 Builder::to_unorm(int bits, F32 x) {
1157cb93a386Sopenharmony_ci        F32 limit = splat((1<<bits)-1.0f);
1158cb93a386Sopenharmony_ci        return round(mul(x, limit));
1159cb93a386Sopenharmony_ci    }
1160cb93a386Sopenharmony_ci
1161cb93a386Sopenharmony_ci    PixelFormat SkColorType_to_PixelFormat(SkColorType ct) {
1162cb93a386Sopenharmony_ci        auto UNORM = PixelFormat::UNORM,
1163cb93a386Sopenharmony_ci             SRGB  = PixelFormat::SRGB,
1164cb93a386Sopenharmony_ci             FLOAT = PixelFormat::FLOAT;
1165cb93a386Sopenharmony_ci        switch (ct) {
1166cb93a386Sopenharmony_ci            case kUnknown_SkColorType: break;
1167cb93a386Sopenharmony_ci
1168cb93a386Sopenharmony_ci            case kRGBA_F32_SkColorType: return {FLOAT,32,32,32,32, 0,32,64,96};
1169cb93a386Sopenharmony_ci
1170cb93a386Sopenharmony_ci            case kRGBA_F16Norm_SkColorType:       return {FLOAT,16,16,16,16, 0,16,32,48};
1171cb93a386Sopenharmony_ci            case kRGBA_F16_SkColorType:           return {FLOAT,16,16,16,16, 0,16,32,48};
1172cb93a386Sopenharmony_ci            case kR16G16B16A16_unorm_SkColorType: return {UNORM,16,16,16,16, 0,16,32,48};
1173cb93a386Sopenharmony_ci
1174cb93a386Sopenharmony_ci            case kA16_float_SkColorType:    return {FLOAT,  0, 0,0,16, 0, 0,0,0};
1175cb93a386Sopenharmony_ci            case kR16G16_float_SkColorType: return {FLOAT, 16,16,0, 0, 0,16,0,0};
1176cb93a386Sopenharmony_ci
1177cb93a386Sopenharmony_ci            case kAlpha_8_SkColorType: return {UNORM, 0,0,0,8, 0,0,0,0};
1178cb93a386Sopenharmony_ci            case kGray_8_SkColorType:  return {UNORM, 8,8,8,0, 0,0,0,0};  // Subtle.
1179cb93a386Sopenharmony_ci
1180cb93a386Sopenharmony_ci            case kRGB_565_SkColorType:   return {UNORM, 5,6,5,0, 11,5,0,0};  // (BGR)
1181cb93a386Sopenharmony_ci            case kARGB_4444_SkColorType: return {UNORM, 4,4,4,4, 12,8,4,0};  // (ABGR)
1182cb93a386Sopenharmony_ci
1183cb93a386Sopenharmony_ci            case kRGBA_8888_SkColorType:  return {UNORM, 8,8,8,8,  0,8,16,24};
1184cb93a386Sopenharmony_ci            case kRGB_888x_SkColorType:   return {UNORM, 8,8,8,0,  0,8,16,32};  // 32-bit
1185cb93a386Sopenharmony_ci            case kBGRA_8888_SkColorType:  return {UNORM, 8,8,8,8, 16,8, 0,24};
1186cb93a386Sopenharmony_ci            case kSRGBA_8888_SkColorType: return { SRGB, 8,8,8,8,  0,8,16,24};
1187cb93a386Sopenharmony_ci
1188cb93a386Sopenharmony_ci            case kRGBA_1010102_SkColorType: return {UNORM, 10,10,10,2,  0,10,20,30};
1189cb93a386Sopenharmony_ci            case kBGRA_1010102_SkColorType: return {UNORM, 10,10,10,2, 20,10, 0,30};
1190cb93a386Sopenharmony_ci            case kRGB_101010x_SkColorType:  return {UNORM, 10,10,10,0,  0,10,20, 0};
1191cb93a386Sopenharmony_ci            case kBGR_101010x_SkColorType:  return {UNORM, 10,10,10,0, 20,10, 0, 0};
1192cb93a386Sopenharmony_ci
1193cb93a386Sopenharmony_ci            case kR8G8_unorm_SkColorType:   return {UNORM,  8, 8,0, 0, 0, 8,0,0};
1194cb93a386Sopenharmony_ci            case kR16G16_unorm_SkColorType: return {UNORM, 16,16,0, 0, 0,16,0,0};
1195cb93a386Sopenharmony_ci            case kA16_unorm_SkColorType:    return {UNORM,  0, 0,0,16, 0, 0,0,0};
1196cb93a386Sopenharmony_ci        }
1197cb93a386Sopenharmony_ci        SkASSERT(false);
1198cb93a386Sopenharmony_ci        return {UNORM, 0,0,0,0, 0,0,0,0};
1199cb93a386Sopenharmony_ci    }
1200cb93a386Sopenharmony_ci
1201cb93a386Sopenharmony_ci    static int byte_size(PixelFormat f) {
1202cb93a386Sopenharmony_ci        // What's the highest bit we read?
1203cb93a386Sopenharmony_ci        int bits = std::max(f.r_bits + f.r_shift,
1204cb93a386Sopenharmony_ci                   std::max(f.g_bits + f.g_shift,
1205cb93a386Sopenharmony_ci                   std::max(f.b_bits + f.b_shift,
1206cb93a386Sopenharmony_ci                            f.a_bits + f.a_shift)));
1207cb93a386Sopenharmony_ci        // Round up to bytes.
1208cb93a386Sopenharmony_ci        return (bits + 7) / 8;
1209cb93a386Sopenharmony_ci    }
1210cb93a386Sopenharmony_ci
1211cb93a386Sopenharmony_ci    static Color unpack(PixelFormat f, I32 x) {
1212cb93a386Sopenharmony_ci        SkASSERT(byte_size(f) <= 4);
1213cb93a386Sopenharmony_ci
1214cb93a386Sopenharmony_ci        auto from_srgb = [](int bits, I32 channel) -> F32 {
1215cb93a386Sopenharmony_ci            const skcms_TransferFunction* tf = skcms_sRGB_TransferFunction();
1216cb93a386Sopenharmony_ci            F32 v = from_unorm(bits, channel);
1217cb93a386Sopenharmony_ci            return sk_program_transfer_fn(v, sRGBish_TF,
1218cb93a386Sopenharmony_ci                                          v->splat(tf->g),
1219cb93a386Sopenharmony_ci                                          v->splat(tf->a),
1220cb93a386Sopenharmony_ci                                          v->splat(tf->b),
1221cb93a386Sopenharmony_ci                                          v->splat(tf->c),
1222cb93a386Sopenharmony_ci                                          v->splat(tf->d),
1223cb93a386Sopenharmony_ci                                          v->splat(tf->e),
1224cb93a386Sopenharmony_ci                                          v->splat(tf->f));
1225cb93a386Sopenharmony_ci        };
1226cb93a386Sopenharmony_ci
1227cb93a386Sopenharmony_ci        auto unpack_rgb = [=](int bits, int shift) -> F32 {
1228cb93a386Sopenharmony_ci            I32 channel = extract(x, shift, (1<<bits)-1);
1229cb93a386Sopenharmony_ci            switch (f.encoding) {
1230cb93a386Sopenharmony_ci                case PixelFormat::UNORM: return from_unorm(bits, channel);
1231cb93a386Sopenharmony_ci                case PixelFormat:: SRGB: return from_srgb (bits, channel);
1232cb93a386Sopenharmony_ci                case PixelFormat::FLOAT: return from_fp16 (      channel);
1233cb93a386Sopenharmony_ci            }
1234cb93a386Sopenharmony_ci            SkUNREACHABLE;
1235cb93a386Sopenharmony_ci        };
1236cb93a386Sopenharmony_ci        auto unpack_alpha = [=](int bits, int shift) -> F32 {
1237cb93a386Sopenharmony_ci            I32 channel = extract(x, shift, (1<<bits)-1);
1238cb93a386Sopenharmony_ci            switch (f.encoding) {
1239cb93a386Sopenharmony_ci                case PixelFormat::UNORM:
1240cb93a386Sopenharmony_ci                case PixelFormat:: SRGB: return from_unorm(bits, channel);
1241cb93a386Sopenharmony_ci                case PixelFormat::FLOAT: return from_fp16 (      channel);
1242cb93a386Sopenharmony_ci            }
1243cb93a386Sopenharmony_ci            SkUNREACHABLE;
1244cb93a386Sopenharmony_ci        };
1245cb93a386Sopenharmony_ci        return {
1246cb93a386Sopenharmony_ci            f.r_bits ? unpack_rgb  (f.r_bits, f.r_shift) : x->splat(0.0f),
1247cb93a386Sopenharmony_ci            f.g_bits ? unpack_rgb  (f.g_bits, f.g_shift) : x->splat(0.0f),
1248cb93a386Sopenharmony_ci            f.b_bits ? unpack_rgb  (f.b_bits, f.b_shift) : x->splat(0.0f),
1249cb93a386Sopenharmony_ci            f.a_bits ? unpack_alpha(f.a_bits, f.a_shift) : x->splat(1.0f),
1250cb93a386Sopenharmony_ci        };
1251cb93a386Sopenharmony_ci    }
1252cb93a386Sopenharmony_ci
1253cb93a386Sopenharmony_ci    static void split_disjoint_8byte_format(PixelFormat f, PixelFormat* lo, PixelFormat* hi) {
1254cb93a386Sopenharmony_ci        SkASSERT(byte_size(f) == 8);
1255cb93a386Sopenharmony_ci        // We assume some of the channels are in the low 32 bits, some in the high 32 bits.
1256cb93a386Sopenharmony_ci        // The assert on byte_size(lo) will trigger if this assumption is violated.
1257cb93a386Sopenharmony_ci        *lo = f;
1258cb93a386Sopenharmony_ci        if (f.r_shift >= 32) { lo->r_bits = 0; lo->r_shift = 32; }
1259cb93a386Sopenharmony_ci        if (f.g_shift >= 32) { lo->g_bits = 0; lo->g_shift = 32; }
1260cb93a386Sopenharmony_ci        if (f.b_shift >= 32) { lo->b_bits = 0; lo->b_shift = 32; }
1261cb93a386Sopenharmony_ci        if (f.a_shift >= 32) { lo->a_bits = 0; lo->a_shift = 32; }
1262cb93a386Sopenharmony_ci        SkASSERT(byte_size(*lo) == 4);
1263cb93a386Sopenharmony_ci
1264cb93a386Sopenharmony_ci        *hi = f;
1265cb93a386Sopenharmony_ci        if (f.r_shift < 32) { hi->r_bits = 0; hi->r_shift = 32; } else { hi->r_shift -= 32; }
1266cb93a386Sopenharmony_ci        if (f.g_shift < 32) { hi->g_bits = 0; hi->g_shift = 32; } else { hi->g_shift -= 32; }
1267cb93a386Sopenharmony_ci        if (f.b_shift < 32) { hi->b_bits = 0; hi->b_shift = 32; } else { hi->b_shift -= 32; }
1268cb93a386Sopenharmony_ci        if (f.a_shift < 32) { hi->a_bits = 0; hi->a_shift = 32; } else { hi->a_shift -= 32; }
1269cb93a386Sopenharmony_ci        SkASSERT(byte_size(*hi) == 4);
1270cb93a386Sopenharmony_ci    }
1271cb93a386Sopenharmony_ci
1272cb93a386Sopenharmony_ci    // The only 16-byte format we support today is RGBA F32,
1273cb93a386Sopenharmony_ci    // though, TODO, we could generalize that to any swizzle, and to allow UNORM too.
1274cb93a386Sopenharmony_ci    static void assert_16byte_is_rgba_f32(PixelFormat f) {
1275cb93a386Sopenharmony_ci    #if defined(SK_DEBUG)
1276cb93a386Sopenharmony_ci        SkASSERT(byte_size(f) == 16);
1277cb93a386Sopenharmony_ci        PixelFormat rgba_f32 = SkColorType_to_PixelFormat(kRGBA_F32_SkColorType);
1278cb93a386Sopenharmony_ci
1279cb93a386Sopenharmony_ci        SkASSERT(f.encoding == rgba_f32.encoding);
1280cb93a386Sopenharmony_ci
1281cb93a386Sopenharmony_ci        SkASSERT(f.r_bits == rgba_f32.r_bits);
1282cb93a386Sopenharmony_ci        SkASSERT(f.g_bits == rgba_f32.g_bits);
1283cb93a386Sopenharmony_ci        SkASSERT(f.b_bits == rgba_f32.b_bits);
1284cb93a386Sopenharmony_ci        SkASSERT(f.a_bits == rgba_f32.a_bits);
1285cb93a386Sopenharmony_ci
1286cb93a386Sopenharmony_ci        SkASSERT(f.r_shift == rgba_f32.r_shift);
1287cb93a386Sopenharmony_ci        SkASSERT(f.g_shift == rgba_f32.g_shift);
1288cb93a386Sopenharmony_ci        SkASSERT(f.b_shift == rgba_f32.b_shift);
1289cb93a386Sopenharmony_ci        SkASSERT(f.a_shift == rgba_f32.a_shift);
1290cb93a386Sopenharmony_ci    #endif
1291cb93a386Sopenharmony_ci    }
1292cb93a386Sopenharmony_ci
1293cb93a386Sopenharmony_ci    Color Builder::load(PixelFormat f, Ptr ptr) {
1294cb93a386Sopenharmony_ci        switch (byte_size(f)) {
1295cb93a386Sopenharmony_ci            case 1: return unpack(f, load8 (ptr));
1296cb93a386Sopenharmony_ci            case 2: return unpack(f, load16(ptr));
1297cb93a386Sopenharmony_ci            case 4: return unpack(f, load32(ptr));
1298cb93a386Sopenharmony_ci            case 8: {
1299cb93a386Sopenharmony_ci                PixelFormat lo,hi;
1300cb93a386Sopenharmony_ci                split_disjoint_8byte_format(f, &lo,&hi);
1301cb93a386Sopenharmony_ci                Color l = unpack(lo, load64(ptr, 0)),
1302cb93a386Sopenharmony_ci                      h = unpack(hi, load64(ptr, 1));
1303cb93a386Sopenharmony_ci                return {
1304cb93a386Sopenharmony_ci                    lo.r_bits ? l.r : h.r,
1305cb93a386Sopenharmony_ci                    lo.g_bits ? l.g : h.g,
1306cb93a386Sopenharmony_ci                    lo.b_bits ? l.b : h.b,
1307cb93a386Sopenharmony_ci                    lo.a_bits ? l.a : h.a,
1308cb93a386Sopenharmony_ci                };
1309cb93a386Sopenharmony_ci            }
1310cb93a386Sopenharmony_ci            case 16: {
1311cb93a386Sopenharmony_ci                assert_16byte_is_rgba_f32(f);
1312cb93a386Sopenharmony_ci                return {
1313cb93a386Sopenharmony_ci                    pun_to_F32(load128(ptr, 0)),
1314cb93a386Sopenharmony_ci                    pun_to_F32(load128(ptr, 1)),
1315cb93a386Sopenharmony_ci                    pun_to_F32(load128(ptr, 2)),
1316cb93a386Sopenharmony_ci                    pun_to_F32(load128(ptr, 3)),
1317cb93a386Sopenharmony_ci                };
1318cb93a386Sopenharmony_ci            }
1319cb93a386Sopenharmony_ci            default: SkUNREACHABLE;
1320cb93a386Sopenharmony_ci        }
1321cb93a386Sopenharmony_ci        return {};
1322cb93a386Sopenharmony_ci    }
1323cb93a386Sopenharmony_ci
1324cb93a386Sopenharmony_ci    Color Builder::gather(PixelFormat f, UPtr ptr, int offset, I32 index) {
1325cb93a386Sopenharmony_ci        switch (byte_size(f)) {
1326cb93a386Sopenharmony_ci            case 1: return unpack(f, gather8 (ptr, offset, index));
1327cb93a386Sopenharmony_ci            case 2: return unpack(f, gather16(ptr, offset, index));
1328cb93a386Sopenharmony_ci            case 4: return unpack(f, gather32(ptr, offset, index));
1329cb93a386Sopenharmony_ci            case 8: {
1330cb93a386Sopenharmony_ci                PixelFormat lo,hi;
1331cb93a386Sopenharmony_ci                split_disjoint_8byte_format(f, &lo,&hi);
1332cb93a386Sopenharmony_ci                Color l = unpack(lo, gather32(ptr, offset, (index<<1)+0)),
1333cb93a386Sopenharmony_ci                      h = unpack(hi, gather32(ptr, offset, (index<<1)+1));
1334cb93a386Sopenharmony_ci                return {
1335cb93a386Sopenharmony_ci                    lo.r_bits ? l.r : h.r,
1336cb93a386Sopenharmony_ci                    lo.g_bits ? l.g : h.g,
1337cb93a386Sopenharmony_ci                    lo.b_bits ? l.b : h.b,
1338cb93a386Sopenharmony_ci                    lo.a_bits ? l.a : h.a,
1339cb93a386Sopenharmony_ci                };
1340cb93a386Sopenharmony_ci            }
1341cb93a386Sopenharmony_ci            case 16: {
1342cb93a386Sopenharmony_ci                assert_16byte_is_rgba_f32(f);
1343cb93a386Sopenharmony_ci                return {
1344cb93a386Sopenharmony_ci                    gatherF(ptr, offset, (index<<2)+0),
1345cb93a386Sopenharmony_ci                    gatherF(ptr, offset, (index<<2)+1),
1346cb93a386Sopenharmony_ci                    gatherF(ptr, offset, (index<<2)+2),
1347cb93a386Sopenharmony_ci                    gatherF(ptr, offset, (index<<2)+3),
1348cb93a386Sopenharmony_ci                };
1349cb93a386Sopenharmony_ci            }
1350cb93a386Sopenharmony_ci            default: SkUNREACHABLE;
1351cb93a386Sopenharmony_ci        }
1352cb93a386Sopenharmony_ci        return {};
1353cb93a386Sopenharmony_ci    }
1354cb93a386Sopenharmony_ci
1355cb93a386Sopenharmony_ci    static I32 pack32(PixelFormat f, Color c) {
1356cb93a386Sopenharmony_ci        SkASSERT(byte_size(f) <= 4);
1357cb93a386Sopenharmony_ci
1358cb93a386Sopenharmony_ci        auto to_srgb = [](int bits, F32 v) {
1359cb93a386Sopenharmony_ci            const skcms_TransferFunction* tf = skcms_sRGB_Inverse_TransferFunction();
1360cb93a386Sopenharmony_ci            return to_unorm(bits, sk_program_transfer_fn(v, sRGBish_TF,
1361cb93a386Sopenharmony_ci                                                         v->splat(tf->g),
1362cb93a386Sopenharmony_ci                                                         v->splat(tf->a),
1363cb93a386Sopenharmony_ci                                                         v->splat(tf->b),
1364cb93a386Sopenharmony_ci                                                         v->splat(tf->c),
1365cb93a386Sopenharmony_ci                                                         v->splat(tf->d),
1366cb93a386Sopenharmony_ci                                                         v->splat(tf->e),
1367cb93a386Sopenharmony_ci                                                         v->splat(tf->f)));
1368cb93a386Sopenharmony_ci        };
1369cb93a386Sopenharmony_ci
1370cb93a386Sopenharmony_ci        I32 packed = c->splat(0);
1371cb93a386Sopenharmony_ci        auto pack_rgb = [&](F32 channel, int bits, int shift) {
1372cb93a386Sopenharmony_ci            I32 encoded;
1373cb93a386Sopenharmony_ci            switch (f.encoding) {
1374cb93a386Sopenharmony_ci                case PixelFormat::UNORM: encoded = to_unorm(bits, channel); break;
1375cb93a386Sopenharmony_ci                case PixelFormat:: SRGB: encoded = to_srgb (bits, channel); break;
1376cb93a386Sopenharmony_ci                case PixelFormat::FLOAT: encoded = to_fp16 (      channel); break;
1377cb93a386Sopenharmony_ci            }
1378cb93a386Sopenharmony_ci            packed = pack(packed, encoded, shift);
1379cb93a386Sopenharmony_ci        };
1380cb93a386Sopenharmony_ci        auto pack_alpha = [&](F32 channel, int bits, int shift) {
1381cb93a386Sopenharmony_ci            I32 encoded;
1382cb93a386Sopenharmony_ci            switch (f.encoding) {
1383cb93a386Sopenharmony_ci                case PixelFormat::UNORM:
1384cb93a386Sopenharmony_ci                case PixelFormat:: SRGB: encoded = to_unorm(bits, channel); break;
1385cb93a386Sopenharmony_ci                case PixelFormat::FLOAT: encoded = to_fp16 (      channel); break;
1386cb93a386Sopenharmony_ci            }
1387cb93a386Sopenharmony_ci            packed = pack(packed, encoded, shift);
1388cb93a386Sopenharmony_ci        };
1389cb93a386Sopenharmony_ci        if (f.r_bits) { pack_rgb  (c.r, f.r_bits, f.r_shift); }
1390cb93a386Sopenharmony_ci        if (f.g_bits) { pack_rgb  (c.g, f.g_bits, f.g_shift); }
1391cb93a386Sopenharmony_ci        if (f.b_bits) { pack_rgb  (c.b, f.b_bits, f.b_shift); }
1392cb93a386Sopenharmony_ci        if (f.a_bits) { pack_alpha(c.a, f.a_bits, f.a_shift); }
1393cb93a386Sopenharmony_ci        return packed;
1394cb93a386Sopenharmony_ci    }
1395cb93a386Sopenharmony_ci
1396cb93a386Sopenharmony_ci    void Builder::store(PixelFormat f, Ptr ptr, Color c) {
1397cb93a386Sopenharmony_ci        // Detect a grayscale PixelFormat: r,g,b bit counts and shifts all equal.
1398cb93a386Sopenharmony_ci        if (f.r_bits  == f.g_bits  && f.g_bits  == f.b_bits &&
1399cb93a386Sopenharmony_ci            f.r_shift == f.g_shift && f.g_shift == f.b_shift) {
1400cb93a386Sopenharmony_ci
1401cb93a386Sopenharmony_ci            // TODO: pull these coefficients from an SkColorSpace?  This is sRGB luma/luminance.
1402cb93a386Sopenharmony_ci            c.r = c.r * 0.2126f
1403cb93a386Sopenharmony_ci                + c.g * 0.7152f
1404cb93a386Sopenharmony_ci                + c.b * 0.0722f;
1405cb93a386Sopenharmony_ci            f.g_bits = f.b_bits = 0;
1406cb93a386Sopenharmony_ci        }
1407cb93a386Sopenharmony_ci
1408cb93a386Sopenharmony_ci        switch (byte_size(f)) {
1409cb93a386Sopenharmony_ci            case 1: store8 (ptr, pack32(f,c)); break;
1410cb93a386Sopenharmony_ci            case 2: store16(ptr, pack32(f,c)); break;
1411cb93a386Sopenharmony_ci            case 4: store32(ptr, pack32(f,c)); break;
1412cb93a386Sopenharmony_ci            case 8: {
1413cb93a386Sopenharmony_ci                PixelFormat lo,hi;
1414cb93a386Sopenharmony_ci                split_disjoint_8byte_format(f, &lo,&hi);
1415cb93a386Sopenharmony_ci                store64(ptr, pack32(lo,c)
1416cb93a386Sopenharmony_ci                           , pack32(hi,c));
1417cb93a386Sopenharmony_ci                break;
1418cb93a386Sopenharmony_ci            }
1419cb93a386Sopenharmony_ci            case 16: {
1420cb93a386Sopenharmony_ci                assert_16byte_is_rgba_f32(f);
1421cb93a386Sopenharmony_ci                store128(ptr, pun_to_I32(c.r), pun_to_I32(c.g), pun_to_I32(c.b), pun_to_I32(c.a));
1422cb93a386Sopenharmony_ci                break;
1423cb93a386Sopenharmony_ci            }
1424cb93a386Sopenharmony_ci            default: SkUNREACHABLE;
1425cb93a386Sopenharmony_ci        }
1426cb93a386Sopenharmony_ci    }
1427cb93a386Sopenharmony_ci
1428cb93a386Sopenharmony_ci    void Builder::unpremul(F32* r, F32* g, F32* b, F32 a) {
1429cb93a386Sopenharmony_ci        skvm::F32 invA = 1.0f / a,
1430cb93a386Sopenharmony_ci                  inf  = pun_to_F32(splat(0x7f800000));
1431cb93a386Sopenharmony_ci        // If a is 0, so are *r,*g,*b, so set invA to 0 to avoid 0*inf=NaN (instead 0*0 = 0).
1432cb93a386Sopenharmony_ci        invA = select(invA < inf, invA
1433cb93a386Sopenharmony_ci                                , 0.0f);
1434cb93a386Sopenharmony_ci        *r *= invA;
1435cb93a386Sopenharmony_ci        *g *= invA;
1436cb93a386Sopenharmony_ci        *b *= invA;
1437cb93a386Sopenharmony_ci    }
1438cb93a386Sopenharmony_ci
1439cb93a386Sopenharmony_ci    void Builder::premul(F32* r, F32* g, F32* b, F32 a) {
1440cb93a386Sopenharmony_ci        *r *= a;
1441cb93a386Sopenharmony_ci        *g *= a;
1442cb93a386Sopenharmony_ci        *b *= a;
1443cb93a386Sopenharmony_ci    }
1444cb93a386Sopenharmony_ci
1445cb93a386Sopenharmony_ci    Color Builder::uniformColor(SkColor4f color, Uniforms* uniforms) {
1446cb93a386Sopenharmony_ci        auto [r,g,b,a] = color;
1447cb93a386Sopenharmony_ci        return {
1448cb93a386Sopenharmony_ci            uniformF(uniforms->pushF(r)),
1449cb93a386Sopenharmony_ci            uniformF(uniforms->pushF(g)),
1450cb93a386Sopenharmony_ci            uniformF(uniforms->pushF(b)),
1451cb93a386Sopenharmony_ci            uniformF(uniforms->pushF(a)),
1452cb93a386Sopenharmony_ci        };
1453cb93a386Sopenharmony_ci    }
1454cb93a386Sopenharmony_ci
1455cb93a386Sopenharmony_ci    F32 Builder::lerp(F32 lo, F32 hi, F32 t) {
1456cb93a386Sopenharmony_ci        if (this->isImm(t.id, 0.0f)) { return lo; }
1457cb93a386Sopenharmony_ci        if (this->isImm(t.id, 1.0f)) { return hi; }
1458cb93a386Sopenharmony_ci        return mad(sub(hi, lo), t, lo);
1459cb93a386Sopenharmony_ci    }
1460cb93a386Sopenharmony_ci
1461cb93a386Sopenharmony_ci    Color Builder::lerp(Color lo, Color hi, F32 t) {
1462cb93a386Sopenharmony_ci        return {
1463cb93a386Sopenharmony_ci            lerp(lo.r, hi.r, t),
1464cb93a386Sopenharmony_ci            lerp(lo.g, hi.g, t),
1465cb93a386Sopenharmony_ci            lerp(lo.b, hi.b, t),
1466cb93a386Sopenharmony_ci            lerp(lo.a, hi.a, t),
1467cb93a386Sopenharmony_ci        };
1468cb93a386Sopenharmony_ci    }
1469cb93a386Sopenharmony_ci
1470cb93a386Sopenharmony_ci    HSLA Builder::to_hsla(Color c) {
1471cb93a386Sopenharmony_ci        F32 mx = max(max(c.r,c.g),c.b),
1472cb93a386Sopenharmony_ci            mn = min(min(c.r,c.g),c.b),
1473cb93a386Sopenharmony_ci             d = mx - mn,
1474cb93a386Sopenharmony_ci          invd = 1.0f / d,
1475cb93a386Sopenharmony_ci        g_lt_b = select(c.g < c.b, splat(6.0f)
1476cb93a386Sopenharmony_ci                                 , splat(0.0f));
1477cb93a386Sopenharmony_ci
1478cb93a386Sopenharmony_ci        F32 h = (1/6.0f) * select(mx == mn,  0.0f,
1479cb93a386Sopenharmony_ci                           select(mx == c.r, invd * (c.g - c.b) + g_lt_b,
1480cb93a386Sopenharmony_ci                           select(mx == c.g, invd * (c.b - c.r) + 2.0f
1481cb93a386Sopenharmony_ci                                           , invd * (c.r - c.g) + 4.0f)));
1482cb93a386Sopenharmony_ci
1483cb93a386Sopenharmony_ci        F32 sum = mx + mn,
1484cb93a386Sopenharmony_ci              l = sum * 0.5f,
1485cb93a386Sopenharmony_ci              s = select(mx == mn, 0.0f
1486cb93a386Sopenharmony_ci                                 , d / select(l > 0.5f, 2.0f - sum
1487cb93a386Sopenharmony_ci                                                      , sum));
1488cb93a386Sopenharmony_ci        return {h, s, l, c.a};
1489cb93a386Sopenharmony_ci    }
1490cb93a386Sopenharmony_ci
1491cb93a386Sopenharmony_ci    Color Builder::to_rgba(HSLA c) {
1492cb93a386Sopenharmony_ci        // See GrRGBToHSLFilterEffect.fp
1493cb93a386Sopenharmony_ci
1494cb93a386Sopenharmony_ci        auto [h,s,l,a] = c;
1495cb93a386Sopenharmony_ci        F32 x = s * (1.0f - abs(l + l - 1.0f));
1496cb93a386Sopenharmony_ci
1497cb93a386Sopenharmony_ci        auto hue_to_rgb = [&,l=l](auto hue) {
1498cb93a386Sopenharmony_ci            auto q = abs(6.0f * fract(hue) - 3.0f) - 1.0f;
1499cb93a386Sopenharmony_ci            return x * (clamp01(q) - 0.5f) + l;
1500cb93a386Sopenharmony_ci        };
1501cb93a386Sopenharmony_ci
1502cb93a386Sopenharmony_ci        return {
1503cb93a386Sopenharmony_ci            hue_to_rgb(h + 0/3.0f),
1504cb93a386Sopenharmony_ci            hue_to_rgb(h + 2/3.0f),
1505cb93a386Sopenharmony_ci            hue_to_rgb(h + 1/3.0f),
1506cb93a386Sopenharmony_ci            c.a,
1507cb93a386Sopenharmony_ci        };
1508cb93a386Sopenharmony_ci    }
1509cb93a386Sopenharmony_ci
1510cb93a386Sopenharmony_ci    // We're basing our implementation of non-separable blend modes on
1511cb93a386Sopenharmony_ci    //   https://www.w3.org/TR/compositing-1/#blendingnonseparable.
1512cb93a386Sopenharmony_ci    // and
1513cb93a386Sopenharmony_ci    //   https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
1514cb93a386Sopenharmony_ci    // They're equivalent, but ES' math has been better simplified.
1515cb93a386Sopenharmony_ci    //
1516cb93a386Sopenharmony_ci    // Anything extra we add beyond that is to make the math work with premul inputs.
1517cb93a386Sopenharmony_ci
1518cb93a386Sopenharmony_ci    static skvm::F32 saturation(skvm::F32 r, skvm::F32 g, skvm::F32 b) {
1519cb93a386Sopenharmony_ci        return max(r, max(g, b))
1520cb93a386Sopenharmony_ci             - min(r, min(g, b));
1521cb93a386Sopenharmony_ci    }
1522cb93a386Sopenharmony_ci
1523cb93a386Sopenharmony_ci    static skvm::F32 luminance(skvm::F32 r, skvm::F32 g, skvm::F32 b) {
1524cb93a386Sopenharmony_ci        return r*0.30f + g*0.59f + b*0.11f;
1525cb93a386Sopenharmony_ci    }
1526cb93a386Sopenharmony_ci
1527cb93a386Sopenharmony_ci    static void set_sat(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 s) {
1528cb93a386Sopenharmony_ci        F32 mn  = min(*r, min(*g, *b)),
1529cb93a386Sopenharmony_ci            mx  = max(*r, max(*g, *b)),
1530cb93a386Sopenharmony_ci            sat = mx - mn;
1531cb93a386Sopenharmony_ci
1532cb93a386Sopenharmony_ci        // Map min channel to 0, max channel to s, and scale the middle proportionally.
1533cb93a386Sopenharmony_ci        auto scale = [&](skvm::F32 c) {
1534cb93a386Sopenharmony_ci            auto scaled = ((c - mn) * s) / sat;
1535cb93a386Sopenharmony_ci            return select(is_finite(scaled), scaled, 0.0f);
1536cb93a386Sopenharmony_ci        };
1537cb93a386Sopenharmony_ci        *r = scale(*r);
1538cb93a386Sopenharmony_ci        *g = scale(*g);
1539cb93a386Sopenharmony_ci        *b = scale(*b);
1540cb93a386Sopenharmony_ci    }
1541cb93a386Sopenharmony_ci
1542cb93a386Sopenharmony_ci    static void set_lum(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 lu) {
1543cb93a386Sopenharmony_ci        auto diff = lu - luminance(*r, *g, *b);
1544cb93a386Sopenharmony_ci        *r += diff;
1545cb93a386Sopenharmony_ci        *g += diff;
1546cb93a386Sopenharmony_ci        *b += diff;
1547cb93a386Sopenharmony_ci    }
1548cb93a386Sopenharmony_ci
1549cb93a386Sopenharmony_ci    static void clip_color(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 a) {
1550cb93a386Sopenharmony_ci        F32 mn  = min(*r, min(*g, *b)),
1551cb93a386Sopenharmony_ci            mx  = max(*r, max(*g, *b)),
1552cb93a386Sopenharmony_ci            lu = luminance(*r, *g, *b);
1553cb93a386Sopenharmony_ci
1554cb93a386Sopenharmony_ci        auto clip = [&](auto c) {
1555cb93a386Sopenharmony_ci            c = select(mn >= 0, c
1556cb93a386Sopenharmony_ci                              , lu + ((c-lu)*(  lu)) / (lu-mn));
1557cb93a386Sopenharmony_ci            c = select(mx >  a, lu + ((c-lu)*(a-lu)) / (mx-lu)
1558cb93a386Sopenharmony_ci                              , c);
1559cb93a386Sopenharmony_ci            return clamp01(c);  // May be a little negative, or worse, NaN.
1560cb93a386Sopenharmony_ci        };
1561cb93a386Sopenharmony_ci        *r = clip(*r);
1562cb93a386Sopenharmony_ci        *g = clip(*g);
1563cb93a386Sopenharmony_ci        *b = clip(*b);
1564cb93a386Sopenharmony_ci    }
1565cb93a386Sopenharmony_ci
1566cb93a386Sopenharmony_ci    Color Builder::blend(SkBlendMode mode, Color src, Color dst) {
1567cb93a386Sopenharmony_ci        auto mma = [](skvm::F32 x, skvm::F32 y, skvm::F32 z, skvm::F32 w) {
1568cb93a386Sopenharmony_ci            return x*y + z*w;
1569cb93a386Sopenharmony_ci        };
1570cb93a386Sopenharmony_ci
1571cb93a386Sopenharmony_ci        auto two = [](skvm::F32 x) { return x+x; };
1572cb93a386Sopenharmony_ci
1573cb93a386Sopenharmony_ci        auto apply_rgba = [&](auto fn) {
1574cb93a386Sopenharmony_ci            return Color {
1575cb93a386Sopenharmony_ci                fn(src.r, dst.r),
1576cb93a386Sopenharmony_ci                fn(src.g, dst.g),
1577cb93a386Sopenharmony_ci                fn(src.b, dst.b),
1578cb93a386Sopenharmony_ci                fn(src.a, dst.a),
1579cb93a386Sopenharmony_ci            };
1580cb93a386Sopenharmony_ci        };
1581cb93a386Sopenharmony_ci
1582cb93a386Sopenharmony_ci        auto apply_rgb_srcover_a = [&](auto fn) {
1583cb93a386Sopenharmony_ci            return Color {
1584cb93a386Sopenharmony_ci                fn(src.r, dst.r),
1585cb93a386Sopenharmony_ci                fn(src.g, dst.g),
1586cb93a386Sopenharmony_ci                fn(src.b, dst.b),
1587cb93a386Sopenharmony_ci                mad(dst.a, 1-src.a, src.a),   // srcover for alpha
1588cb93a386Sopenharmony_ci            };
1589cb93a386Sopenharmony_ci        };
1590cb93a386Sopenharmony_ci
1591cb93a386Sopenharmony_ci        auto non_sep = [&](auto R, auto G, auto B) {
1592cb93a386Sopenharmony_ci            return Color{
1593cb93a386Sopenharmony_ci                R + mma(src.r, 1-dst.a,  dst.r, 1-src.a),
1594cb93a386Sopenharmony_ci                G + mma(src.g, 1-dst.a,  dst.g, 1-src.a),
1595cb93a386Sopenharmony_ci                B + mma(src.b, 1-dst.a,  dst.b, 1-src.a),
1596cb93a386Sopenharmony_ci                mad(dst.a, 1-src.a, src.a),   // srcover for alpha
1597cb93a386Sopenharmony_ci            };
1598cb93a386Sopenharmony_ci        };
1599cb93a386Sopenharmony_ci
1600cb93a386Sopenharmony_ci        switch (mode) {
1601cb93a386Sopenharmony_ci            default:
1602cb93a386Sopenharmony_ci                SkASSERT(false);
1603cb93a386Sopenharmony_ci                [[fallthrough]]; /*but also, for safety, fallthrough*/
1604cb93a386Sopenharmony_ci
1605cb93a386Sopenharmony_ci            case SkBlendMode::kClear: return { splat(0.0f), splat(0.0f), splat(0.0f), splat(0.0f) };
1606cb93a386Sopenharmony_ci
1607cb93a386Sopenharmony_ci            case SkBlendMode::kSrc: return src;
1608cb93a386Sopenharmony_ci            case SkBlendMode::kDst: return dst;
1609cb93a386Sopenharmony_ci
1610cb93a386Sopenharmony_ci            case SkBlendMode::kDstOver: std::swap(src, dst); [[fallthrough]];
1611cb93a386Sopenharmony_ci            case SkBlendMode::kSrcOver:
1612cb93a386Sopenharmony_ci                return apply_rgba([&](auto s, auto d) {
1613cb93a386Sopenharmony_ci                    return mad(d,1-src.a, s);
1614cb93a386Sopenharmony_ci                });
1615cb93a386Sopenharmony_ci
1616cb93a386Sopenharmony_ci            case SkBlendMode::kDstIn: std::swap(src, dst); [[fallthrough]];
1617cb93a386Sopenharmony_ci            case SkBlendMode::kSrcIn:
1618cb93a386Sopenharmony_ci                return apply_rgba([&](auto s, auto d) {
1619cb93a386Sopenharmony_ci                    return s * dst.a;
1620cb93a386Sopenharmony_ci                });
1621cb93a386Sopenharmony_ci
1622cb93a386Sopenharmony_ci            case SkBlendMode::kDstOut: std::swap(src, dst); [[fallthrough]];
1623cb93a386Sopenharmony_ci
1624cb93a386Sopenharmony_ci            case SkBlendMode::kSrcOut:
1625cb93a386Sopenharmony_ci                return apply_rgba([&](auto s, auto d) {
1626cb93a386Sopenharmony_ci                    return s * (1-dst.a);
1627cb93a386Sopenharmony_ci                });
1628cb93a386Sopenharmony_ci
1629cb93a386Sopenharmony_ci            case SkBlendMode::kDstATop: std::swap(src, dst); [[fallthrough]];
1630cb93a386Sopenharmony_ci            case SkBlendMode::kSrcATop:
1631cb93a386Sopenharmony_ci                return apply_rgba([&](auto s, auto d) {
1632cb93a386Sopenharmony_ci                    return mma(s, dst.a,  d, 1-src.a);
1633cb93a386Sopenharmony_ci                });
1634cb93a386Sopenharmony_ci
1635cb93a386Sopenharmony_ci            case SkBlendMode::kXor:
1636cb93a386Sopenharmony_ci                return apply_rgba([&](auto s, auto d) {
1637cb93a386Sopenharmony_ci                    return mma(s, 1-dst.a,  d, 1-src.a);
1638cb93a386Sopenharmony_ci                });
1639cb93a386Sopenharmony_ci
1640cb93a386Sopenharmony_ci            case SkBlendMode::kPlus:
1641cb93a386Sopenharmony_ci                return apply_rgba([&](auto s, auto d) {
1642cb93a386Sopenharmony_ci                    return min(s+d, 1.0f);
1643cb93a386Sopenharmony_ci                });
1644cb93a386Sopenharmony_ci
1645cb93a386Sopenharmony_ci            case SkBlendMode::kModulate:
1646cb93a386Sopenharmony_ci                return apply_rgba([&](auto s, auto d) {
1647cb93a386Sopenharmony_ci                    return s * d;
1648cb93a386Sopenharmony_ci                });
1649cb93a386Sopenharmony_ci
1650cb93a386Sopenharmony_ci            case SkBlendMode::kScreen:
1651cb93a386Sopenharmony_ci                // (s+d)-(s*d) gave us trouble with our "r,g,b <= after blending" asserts.
1652cb93a386Sopenharmony_ci                // It's kind of plausible that s + (d - sd) keeps more precision?
1653cb93a386Sopenharmony_ci                return apply_rgba([&](auto s, auto d) {
1654cb93a386Sopenharmony_ci                    return s + (d - s*d);
1655cb93a386Sopenharmony_ci                });
1656cb93a386Sopenharmony_ci
1657cb93a386Sopenharmony_ci            case SkBlendMode::kDarken:
1658cb93a386Sopenharmony_ci                return apply_rgb_srcover_a([&](auto s, auto d) {
1659cb93a386Sopenharmony_ci                    return s + (d - max(s * dst.a,
1660cb93a386Sopenharmony_ci                                        d * src.a));
1661cb93a386Sopenharmony_ci                });
1662cb93a386Sopenharmony_ci
1663cb93a386Sopenharmony_ci            case SkBlendMode::kLighten:
1664cb93a386Sopenharmony_ci                return apply_rgb_srcover_a([&](auto s, auto d) {
1665cb93a386Sopenharmony_ci                    return s + (d - min(s * dst.a,
1666cb93a386Sopenharmony_ci                                        d * src.a));
1667cb93a386Sopenharmony_ci                });
1668cb93a386Sopenharmony_ci
1669cb93a386Sopenharmony_ci            case SkBlendMode::kDifference:
1670cb93a386Sopenharmony_ci                return apply_rgb_srcover_a([&](auto s, auto d) {
1671cb93a386Sopenharmony_ci                    return s + (d - two(min(s * dst.a,
1672cb93a386Sopenharmony_ci                                            d * src.a)));
1673cb93a386Sopenharmony_ci                });
1674cb93a386Sopenharmony_ci
1675cb93a386Sopenharmony_ci            case SkBlendMode::kExclusion:
1676cb93a386Sopenharmony_ci                return apply_rgb_srcover_a([&](auto s, auto d) {
1677cb93a386Sopenharmony_ci                    return s + (d - two(s * d));
1678cb93a386Sopenharmony_ci                });
1679cb93a386Sopenharmony_ci
1680cb93a386Sopenharmony_ci            case SkBlendMode::kColorBurn:
1681cb93a386Sopenharmony_ci                return apply_rgb_srcover_a([&](auto s, auto d) {
1682cb93a386Sopenharmony_ci                    auto mn   = min(dst.a,
1683cb93a386Sopenharmony_ci                                    src.a * (dst.a - d) / s),
1684cb93a386Sopenharmony_ci                         burn = src.a * (dst.a - mn) + mma(s, 1-dst.a, d, 1-src.a);
1685cb93a386Sopenharmony_ci                    return select(d == dst.a     , s * (1-dst.a) + d,
1686cb93a386Sopenharmony_ci                           select(is_finite(burn), burn
1687cb93a386Sopenharmony_ci                                                 , d * (1-src.a) + s));
1688cb93a386Sopenharmony_ci                });
1689cb93a386Sopenharmony_ci
1690cb93a386Sopenharmony_ci            case SkBlendMode::kColorDodge:
1691cb93a386Sopenharmony_ci                return apply_rgb_srcover_a([&](auto s, auto d) {
1692cb93a386Sopenharmony_ci                    auto dodge = src.a * min(dst.a,
1693cb93a386Sopenharmony_ci                                             d * src.a / (src.a - s))
1694cb93a386Sopenharmony_ci                                       + mma(s, 1-dst.a, d, 1-src.a);
1695cb93a386Sopenharmony_ci                    return select(d == 0.0f       , s * (1-dst.a) + d,
1696cb93a386Sopenharmony_ci                           select(is_finite(dodge), dodge
1697cb93a386Sopenharmony_ci                                                  , d * (1-src.a) + s));
1698cb93a386Sopenharmony_ci                });
1699cb93a386Sopenharmony_ci
1700cb93a386Sopenharmony_ci            case SkBlendMode::kHardLight:
1701cb93a386Sopenharmony_ci                return apply_rgb_srcover_a([&](auto s, auto d) {
1702cb93a386Sopenharmony_ci                    return mma(s, 1-dst.a, d, 1-src.a) +
1703cb93a386Sopenharmony_ci                           select(two(s) <= src.a,
1704cb93a386Sopenharmony_ci                                  two(s * d),
1705cb93a386Sopenharmony_ci                                  src.a * dst.a - two((dst.a - d) * (src.a - s)));
1706cb93a386Sopenharmony_ci                });
1707cb93a386Sopenharmony_ci
1708cb93a386Sopenharmony_ci            case SkBlendMode::kOverlay:
1709cb93a386Sopenharmony_ci                return apply_rgb_srcover_a([&](auto s, auto d) {
1710cb93a386Sopenharmony_ci                    return mma(s, 1-dst.a, d, 1-src.a) +
1711cb93a386Sopenharmony_ci                           select(two(d) <= dst.a,
1712cb93a386Sopenharmony_ci                                  two(s * d),
1713cb93a386Sopenharmony_ci                                  src.a * dst.a - two((dst.a - d) * (src.a - s)));
1714cb93a386Sopenharmony_ci                });
1715cb93a386Sopenharmony_ci
1716cb93a386Sopenharmony_ci            case SkBlendMode::kMultiply:
1717cb93a386Sopenharmony_ci                return apply_rgba([&](auto s, auto d) {
1718cb93a386Sopenharmony_ci                    return mma(s, 1-dst.a, d, 1-src.a) + s * d;
1719cb93a386Sopenharmony_ci                });
1720cb93a386Sopenharmony_ci
1721cb93a386Sopenharmony_ci            case SkBlendMode::kSoftLight:
1722cb93a386Sopenharmony_ci                return apply_rgb_srcover_a([&](auto s, auto d) {
1723cb93a386Sopenharmony_ci                    auto  m = select(dst.a > 0.0f, d / dst.a
1724cb93a386Sopenharmony_ci                                                 , 0.0f),
1725cb93a386Sopenharmony_ci                         s2 = two(s),
1726cb93a386Sopenharmony_ci                         m4 = 4*m;
1727cb93a386Sopenharmony_ci
1728cb93a386Sopenharmony_ci                         // The logic forks three ways:
1729cb93a386Sopenharmony_ci                         //    1. dark src?
1730cb93a386Sopenharmony_ci                         //    2. light src, dark dst?
1731cb93a386Sopenharmony_ci                         //    3. light src, light dst?
1732cb93a386Sopenharmony_ci
1733cb93a386Sopenharmony_ci                         // Used in case 1
1734cb93a386Sopenharmony_ci                    auto darkSrc = d * ((s2-src.a) * (1-m) + src.a),
1735cb93a386Sopenharmony_ci                         // Used in case 2
1736cb93a386Sopenharmony_ci                         darkDst = (m4 * m4 + m4) * (m-1) + 7*m,
1737cb93a386Sopenharmony_ci                         // Used in case 3.
1738cb93a386Sopenharmony_ci                         liteDst = sqrt(m) - m,
1739cb93a386Sopenharmony_ci                         // Used in 2 or 3?
1740cb93a386Sopenharmony_ci                         liteSrc = dst.a * (s2 - src.a) * select(4*d <= dst.a, darkDst
1741cb93a386Sopenharmony_ci                                                                             , liteDst)
1742cb93a386Sopenharmony_ci                                   + d * src.a;
1743cb93a386Sopenharmony_ci                    return s * (1-dst.a) + d * (1-src.a) + select(s2 <= src.a, darkSrc
1744cb93a386Sopenharmony_ci                                                                             , liteSrc);
1745cb93a386Sopenharmony_ci                });
1746cb93a386Sopenharmony_ci
1747cb93a386Sopenharmony_ci            case SkBlendMode::kHue: {
1748cb93a386Sopenharmony_ci                skvm::F32 R = src.r * src.a,
1749cb93a386Sopenharmony_ci                          G = src.g * src.a,
1750cb93a386Sopenharmony_ci                          B = src.b * src.a;
1751cb93a386Sopenharmony_ci
1752cb93a386Sopenharmony_ci                set_sat   (&R, &G, &B, src.a * saturation(dst.r, dst.g, dst.b));
1753cb93a386Sopenharmony_ci                set_lum   (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b));
1754cb93a386Sopenharmony_ci                clip_color(&R, &G, &B, src.a * dst.a);
1755cb93a386Sopenharmony_ci
1756cb93a386Sopenharmony_ci                return non_sep(R, G, B);
1757cb93a386Sopenharmony_ci            }
1758cb93a386Sopenharmony_ci
1759cb93a386Sopenharmony_ci            case SkBlendMode::kSaturation: {
1760cb93a386Sopenharmony_ci                skvm::F32 R = dst.r * src.a,
1761cb93a386Sopenharmony_ci                          G = dst.g * src.a,
1762cb93a386Sopenharmony_ci                          B = dst.b * src.a;
1763cb93a386Sopenharmony_ci
1764cb93a386Sopenharmony_ci                set_sat   (&R, &G, &B, dst.a * saturation(src.r, src.g, src.b));
1765cb93a386Sopenharmony_ci                set_lum   (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b));
1766cb93a386Sopenharmony_ci                clip_color(&R, &G, &B, src.a * dst.a);
1767cb93a386Sopenharmony_ci
1768cb93a386Sopenharmony_ci                return non_sep(R, G, B);
1769cb93a386Sopenharmony_ci            }
1770cb93a386Sopenharmony_ci
1771cb93a386Sopenharmony_ci            case SkBlendMode::kColor: {
1772cb93a386Sopenharmony_ci                skvm::F32 R = src.r * dst.a,
1773cb93a386Sopenharmony_ci                          G = src.g * dst.a,
1774cb93a386Sopenharmony_ci                          B = src.b * dst.a;
1775cb93a386Sopenharmony_ci
1776cb93a386Sopenharmony_ci                set_lum   (&R, &G, &B, src.a * luminance(dst.r, dst.g, dst.b));
1777cb93a386Sopenharmony_ci                clip_color(&R, &G, &B, src.a * dst.a);
1778cb93a386Sopenharmony_ci
1779cb93a386Sopenharmony_ci                return non_sep(R, G, B);
1780cb93a386Sopenharmony_ci            }
1781cb93a386Sopenharmony_ci
1782cb93a386Sopenharmony_ci            case SkBlendMode::kLuminosity: {
1783cb93a386Sopenharmony_ci                skvm::F32 R = dst.r * src.a,
1784cb93a386Sopenharmony_ci                          G = dst.g * src.a,
1785cb93a386Sopenharmony_ci                          B = dst.b * src.a;
1786cb93a386Sopenharmony_ci
1787cb93a386Sopenharmony_ci                set_lum   (&R, &G, &B, dst.a * luminance(src.r, src.g, src.b));
1788cb93a386Sopenharmony_ci                clip_color(&R, &G, &B, dst.a * src.a);
1789cb93a386Sopenharmony_ci
1790cb93a386Sopenharmony_ci                return non_sep(R, G, B);
1791cb93a386Sopenharmony_ci            }
1792cb93a386Sopenharmony_ci        }
1793cb93a386Sopenharmony_ci    }
1794cb93a386Sopenharmony_ci
1795cb93a386Sopenharmony_ci    // ~~~~ Program::eval() and co. ~~~~ //
1796cb93a386Sopenharmony_ci
1797cb93a386Sopenharmony_ci    // Handy references for x86-64 instruction encoding:
1798cb93a386Sopenharmony_ci    // https://wiki.osdev.org/X86-64_Instruction_Encoding
1799cb93a386Sopenharmony_ci    // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x64.htm
1800cb93a386Sopenharmony_ci    // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x86.htm
1801cb93a386Sopenharmony_ci    // http://ref.x86asm.net/coder64.html
1802cb93a386Sopenharmony_ci
1803cb93a386Sopenharmony_ci    // Used for ModRM / immediate instruction encoding.
1804cb93a386Sopenharmony_ci    static uint8_t _233(int a, int b, int c) {
1805cb93a386Sopenharmony_ci        return (a & 3) << 6
1806cb93a386Sopenharmony_ci             | (b & 7) << 3
1807cb93a386Sopenharmony_ci             | (c & 7) << 0;
1808cb93a386Sopenharmony_ci    }
1809cb93a386Sopenharmony_ci
1810cb93a386Sopenharmony_ci    // ModRM byte encodes the arguments of an opcode.
1811cb93a386Sopenharmony_ci    enum class Mod { Indirect, OneByteImm, FourByteImm, Direct };
1812cb93a386Sopenharmony_ci    static uint8_t mod_rm(Mod mod, int reg, int rm) {
1813cb93a386Sopenharmony_ci        return _233((int)mod, reg, rm);
1814cb93a386Sopenharmony_ci    }
1815cb93a386Sopenharmony_ci
1816cb93a386Sopenharmony_ci    static Mod mod(int imm) {
1817cb93a386Sopenharmony_ci        if (imm == 0)               { return Mod::Indirect; }
1818cb93a386Sopenharmony_ci        if (SkTFitsIn<int8_t>(imm)) { return Mod::OneByteImm; }
1819cb93a386Sopenharmony_ci        return Mod::FourByteImm;
1820cb93a386Sopenharmony_ci    }
1821cb93a386Sopenharmony_ci
1822cb93a386Sopenharmony_ci    static int imm_bytes(Mod mod) {
1823cb93a386Sopenharmony_ci        switch (mod) {
1824cb93a386Sopenharmony_ci            case Mod::Indirect:    return 0;
1825cb93a386Sopenharmony_ci            case Mod::OneByteImm:  return 1;
1826cb93a386Sopenharmony_ci            case Mod::FourByteImm: return 4;
1827cb93a386Sopenharmony_ci            case Mod::Direct: SkUNREACHABLE;
1828cb93a386Sopenharmony_ci        }
1829cb93a386Sopenharmony_ci        SkUNREACHABLE;
1830cb93a386Sopenharmony_ci    }
1831cb93a386Sopenharmony_ci
1832cb93a386Sopenharmony_ci    // SIB byte encodes a memory address, base + (index * scale).
1833cb93a386Sopenharmony_ci    static uint8_t sib(Assembler::Scale scale, int index, int base) {
1834cb93a386Sopenharmony_ci        return _233((int)scale, index, base);
1835cb93a386Sopenharmony_ci    }
1836cb93a386Sopenharmony_ci
1837cb93a386Sopenharmony_ci    // The REX prefix is used to extend most old 32-bit instructions to 64-bit.
1838cb93a386Sopenharmony_ci    static uint8_t rex(bool W,   // If set, operation is 64-bit, otherwise default, usually 32-bit.
1839cb93a386Sopenharmony_ci                       bool R,   // Extra top bit to select ModRM reg, registers 8-15.
1840cb93a386Sopenharmony_ci                       bool X,   // Extra top bit for SIB index register.
1841cb93a386Sopenharmony_ci                       bool B) { // Extra top bit for SIB base or ModRM rm register.
1842cb93a386Sopenharmony_ci        return 0b01000000   // Fixed 0100 for top four bits.
1843cb93a386Sopenharmony_ci             | (W << 3)
1844cb93a386Sopenharmony_ci             | (R << 2)
1845cb93a386Sopenharmony_ci             | (X << 1)
1846cb93a386Sopenharmony_ci             | (B << 0);
1847cb93a386Sopenharmony_ci    }
1848cb93a386Sopenharmony_ci
1849cb93a386Sopenharmony_ci
1850cb93a386Sopenharmony_ci    // The VEX prefix extends SSE operations to AVX.  Used generally, even with XMM.
1851cb93a386Sopenharmony_ci    struct VEX {
1852cb93a386Sopenharmony_ci        int     len;
1853cb93a386Sopenharmony_ci        uint8_t bytes[3];
1854cb93a386Sopenharmony_ci    };
1855cb93a386Sopenharmony_ci
1856cb93a386Sopenharmony_ci    static VEX vex(bool  WE,   // Like REX W for int operations, or opcode extension for float?
1857cb93a386Sopenharmony_ci                   bool   R,   // Same as REX R.  Pass high bit of dst register, dst>>3.
1858cb93a386Sopenharmony_ci                   bool   X,   // Same as REX X.
1859cb93a386Sopenharmony_ci                   bool   B,   // Same as REX B.  Pass y>>3 for 3-arg ops, x>>3 for 2-arg.
1860cb93a386Sopenharmony_ci                   int  map,   // SSE opcode map selector: 0x0f, 0x380f, 0x3a0f.
1861cb93a386Sopenharmony_ci                   int vvvv,   // 4-bit second operand register.  Pass our x for 3-arg ops.
1862cb93a386Sopenharmony_ci                   bool   L,   // Set for 256-bit ymm operations, off for 128-bit xmm.
1863cb93a386Sopenharmony_ci                   int   pp) { // SSE mandatory prefix: 0x66, 0xf3, 0xf2, else none.
1864cb93a386Sopenharmony_ci
1865cb93a386Sopenharmony_ci        // Pack x86 opcode map selector to 5-bit VEX encoding.
1866cb93a386Sopenharmony_ci        map = [map]{
1867cb93a386Sopenharmony_ci            switch (map) {
1868cb93a386Sopenharmony_ci                case   0x0f: return 0b00001;
1869cb93a386Sopenharmony_ci                case 0x380f: return 0b00010;
1870cb93a386Sopenharmony_ci                case 0x3a0f: return 0b00011;
1871cb93a386Sopenharmony_ci                // Several more cases only used by XOP / TBM.
1872cb93a386Sopenharmony_ci            }
1873cb93a386Sopenharmony_ci            SkUNREACHABLE;
1874cb93a386Sopenharmony_ci        }();
1875cb93a386Sopenharmony_ci
1876cb93a386Sopenharmony_ci        // Pack  mandatory SSE opcode prefix byte to 2-bit VEX encoding.
1877cb93a386Sopenharmony_ci        pp = [pp]{
1878cb93a386Sopenharmony_ci            switch (pp) {
1879cb93a386Sopenharmony_ci                case 0x66: return 0b01;
1880cb93a386Sopenharmony_ci                case 0xf3: return 0b10;
1881cb93a386Sopenharmony_ci                case 0xf2: return 0b11;
1882cb93a386Sopenharmony_ci            }
1883cb93a386Sopenharmony_ci            return 0b00;
1884cb93a386Sopenharmony_ci        }();
1885cb93a386Sopenharmony_ci
1886cb93a386Sopenharmony_ci        VEX vex = {0, {0,0,0}};
1887cb93a386Sopenharmony_ci        if (X == 0 && B == 0 && WE == 0 && map == 0b00001) {
1888cb93a386Sopenharmony_ci            // With these conditions met, we can optionally compress VEX to 2-byte.
1889cb93a386Sopenharmony_ci            vex.len = 2;
1890cb93a386Sopenharmony_ci            vex.bytes[0] = 0xc5;
1891cb93a386Sopenharmony_ci            vex.bytes[1] = (pp      &  3) << 0
1892cb93a386Sopenharmony_ci                         | (L       &  1) << 2
1893cb93a386Sopenharmony_ci                         | (~vvvv   & 15) << 3
1894cb93a386Sopenharmony_ci                         | (~(int)R &  1) << 7;
1895cb93a386Sopenharmony_ci        } else {
1896cb93a386Sopenharmony_ci            // We could use this 3-byte VEX prefix all the time if we like.
1897cb93a386Sopenharmony_ci            vex.len = 3;
1898cb93a386Sopenharmony_ci            vex.bytes[0] = 0xc4;
1899cb93a386Sopenharmony_ci            vex.bytes[1] = (map     & 31) << 0
1900cb93a386Sopenharmony_ci                         | (~(int)B &  1) << 5
1901cb93a386Sopenharmony_ci                         | (~(int)X &  1) << 6
1902cb93a386Sopenharmony_ci                         | (~(int)R &  1) << 7;
1903cb93a386Sopenharmony_ci            vex.bytes[2] = (pp    &  3) << 0
1904cb93a386Sopenharmony_ci                         | (L     &  1) << 2
1905cb93a386Sopenharmony_ci                         | (~vvvv & 15) << 3
1906cb93a386Sopenharmony_ci                         | (WE    &  1) << 7;
1907cb93a386Sopenharmony_ci        }
1908cb93a386Sopenharmony_ci        return vex;
1909cb93a386Sopenharmony_ci    }
1910cb93a386Sopenharmony_ci
1911cb93a386Sopenharmony_ci    Assembler::Assembler(void* buf) : fCode((uint8_t*)buf), fSize(0) {}
1912cb93a386Sopenharmony_ci
    // Number of bytes emitted so far (counted even when there is no buffer to write to).
1913cb93a386Sopenharmony_ci    size_t Assembler::size() const { return fSize; }
1914cb93a386Sopenharmony_ci
    // Appends n bytes starting at p to the output.  With a null fCode nothing is
    // copied, but fSize still advances so size() keeps counting.
    // No bounds check happens here.
    // NOTE(review): presumably the caller sizes the buffer beforehand -- confirm.
1915cb93a386Sopenharmony_ci    void Assembler::bytes(const void* p, int n) {
1916cb93a386Sopenharmony_ci        if (fCode) {
1917cb93a386Sopenharmony_ci            memcpy(fCode+fSize, p, n);
1918cb93a386Sopenharmony_ci        }
1919cb93a386Sopenharmony_ci        fSize += n;
1920cb93a386Sopenharmony_ci    }
1921cb93a386Sopenharmony_ci
    // byte() appends one byte; word() appends the four bytes of w exactly as they
    // sit in memory, i.e. in host byte order.
1922cb93a386Sopenharmony_ci    void Assembler::byte(uint8_t b) { this->bytes(&b, 1); }
1923cb93a386Sopenharmony_ci    void Assembler::word(uint32_t w) { this->bytes(&w, 4); }
1924cb93a386Sopenharmony_ci
    // Pads the output with 0x00 bytes until size() is a multiple of mod.
    // mod is used directly as a modulus, so it must not be zero.
1925cb93a386Sopenharmony_ci    void Assembler::align(int mod) {
1926cb93a386Sopenharmony_ci        while (this->size() % mod) {
1927cb93a386Sopenharmony_ci            this->byte(0x00);
1928cb93a386Sopenharmony_ci        }
1929cb93a386Sopenharmony_ci    }
1930cb93a386Sopenharmony_ci
    // Emits the single byte 0xcc, the one-byte x86 int3 (breakpoint) encoding.
1931cb93a386Sopenharmony_ci    void Assembler::int3() {
1932cb93a386Sopenharmony_ci        this->byte(0xcc);
1933cb93a386Sopenharmony_ci    }
1934cb93a386Sopenharmony_ci
    // Fixed-encoding instructions with no operands: vzeroupper is written out as
    // the literal bytes c5 f8 77, and ret as the single byte c3.
1935cb93a386Sopenharmony_ci    void Assembler::vzeroupper() {
1936cb93a386Sopenharmony_ci        this->byte(0xc5);
1937cb93a386Sopenharmony_ci        this->byte(0xf8);
1938cb93a386Sopenharmony_ci        this->byte(0x77);
1939cb93a386Sopenharmony_ci    }
1940cb93a386Sopenharmony_ci    void Assembler::ret() { this->byte(0xc3); }
1941cb93a386Sopenharmony_ci
    // Encodes a general-purpose instruction as: REX byte (always built with W1),
    // a 1- or 2-byte opcode, ModRM, then for memory operands an optional SIB byte
    // and a displacement.  x is encoded in the ModRM reg field; dst is either a
    // register or a memory operand.  Label operands are not supported here.
1942cb93a386Sopenharmony_ci    void Assembler::op(int opcode, Operand dst, GP64 x) {
1943cb93a386Sopenharmony_ci        if (dst.kind == Operand::REG) {
1944cb93a386Sopenharmony_ci            this->byte(rex(W1,x>>3,0,dst.reg>>3));
            // Opcodes that don't fit in one byte are packed into the int (e.g. 0xB60F)
            // and copied as two bytes in memory order.
1945cb93a386Sopenharmony_ci            this->bytes(&opcode, SkTFitsIn<uint8_t>(opcode) ? 1 : 2);
            // NOTE(review): x is passed unmasked here but as x&7 in the memory path;
            // equivalent only if mod_rm() masks its reg argument -- confirm.
1946cb93a386Sopenharmony_ci            this->byte(mod_rm(Mod::Direct, x, dst.reg&7));
1947cb93a386Sopenharmony_ci        } else {
1948cb93a386Sopenharmony_ci            SkASSERT(dst.kind == Operand::MEM);
1949cb93a386Sopenharmony_ci            const Mem& m = dst.mem;
            // An SIB byte is needed when the base's low three bits equal rsp's
            // (rm == rsp is what signals "SIB follows"), or when an index is present.
            // rsp in the index slot appears to mean "no index" -- TODO confirm against Mem.
1950cb93a386Sopenharmony_ci            const bool need_SIB = (m.base&7) == rsp
1951cb93a386Sopenharmony_ci                               || m.index != rsp;
1952cb93a386Sopenharmony_ci
1953cb93a386Sopenharmony_ci            this->byte(rex(W1,x>>3,m.index>>3,m.base>>3));
1954cb93a386Sopenharmony_ci            this->bytes(&opcode, SkTFitsIn<uint8_t>(opcode) ? 1 : 2);
1955cb93a386Sopenharmony_ci            this->byte(mod_rm(mod(m.disp), x&7, (need_SIB ? rsp : m.base)&7));
1956cb93a386Sopenharmony_ci            if (need_SIB) {
1957cb93a386Sopenharmony_ci                this->byte(sib(m.scale, m.index&7, m.base&7));
1958cb93a386Sopenharmony_ci            }
            // The displacement width is whatever mod(m.disp) selected for the ModRM byte.
1959cb93a386Sopenharmony_ci            this->bytes(&m.disp, imm_bytes(mod(m.disp)));
1960cb93a386Sopenharmony_ci        }
1961cb93a386Sopenharmony_ci    }
1962cb93a386Sopenharmony_ci
    // Encodes an instruction taking an immediate.  opcode_ext is passed to the
    // register/memory op() in the x slot, so it lands in the ModRM reg field.
    // The immediate is emitted as 1 byte when it fits in int8_t, else as 4 bytes.
1963cb93a386Sopenharmony_ci    void Assembler::op(int opcode, int opcode_ext, Operand dst, int imm) {
1964cb93a386Sopenharmony_ci        opcode |= 0b1000'0000;   // top bit set for instructions with any immediate
1965cb93a386Sopenharmony_ci
        // Note this local shares its name with the imm_bytes() helper used by other op()s.
1966cb93a386Sopenharmony_ci        int imm_bytes = 4;
1967cb93a386Sopenharmony_ci        if (SkTFitsIn<int8_t>(imm)) {
1968cb93a386Sopenharmony_ci            imm_bytes = 1;
1969cb93a386Sopenharmony_ci            opcode |= 0b0000'0010;  // second bit set for 8-bit immediate, else 32-bit.
1970cb93a386Sopenharmony_ci        }
1971cb93a386Sopenharmony_ci
1972cb93a386Sopenharmony_ci        this->op(opcode, dst, (GP64)opcode_ext);
        // Copies the low imm_bytes bytes of imm as stored in memory (host byte order).
1973cb93a386Sopenharmony_ci        this->bytes(&imm, imm_bytes);
1974cb93a386Sopenharmony_ci    }
1975cb93a386Sopenharmony_ci
    // Immediate forms.  The base opcode 0x01 becomes 0x81 (4-byte immediate) or
    // 0x83 (1-byte immediate) inside op(); the second argument is the opcode
    // extension that selects add, sub, or cmp.
1976cb93a386Sopenharmony_ci    void Assembler::add(Operand dst, int imm) { this->op(0x01,0b000, dst,imm); }
1977cb93a386Sopenharmony_ci    void Assembler::sub(Operand dst, int imm) { this->op(0x01,0b101, dst,imm); }
1978cb93a386Sopenharmony_ci    void Assembler::cmp(Operand dst, int imm) { this->op(0x01,0b111, dst,imm); }
1979cb93a386Sopenharmony_ci
1980cb93a386Sopenharmony_ci    // These don't work quite like the other instructions with immediates:
1981cb93a386Sopenharmony_ci    // these immediates are always fixed size at 4 bytes or 1 byte.
    // mov emits opcode 0xC7 with extension 0 followed by all four bytes of imm;
    // movb emits opcode 0xC6 with extension 0 followed by imm truncated to one byte.
1982cb93a386Sopenharmony_ci    void Assembler::mov(Operand dst, int imm) {
1983cb93a386Sopenharmony_ci        this->op(0xC7,dst,(GP64)0b000);
1984cb93a386Sopenharmony_ci        this->word(imm);
1985cb93a386Sopenharmony_ci    }
1986cb93a386Sopenharmony_ci    void Assembler::movb(Operand dst, int imm) {
1987cb93a386Sopenharmony_ci        this->op(0xC6,dst,(GP64)0b000);
1988cb93a386Sopenharmony_ci        this->byte(imm);
1989cb93a386Sopenharmony_ci    }
1990cb93a386Sopenharmony_ci
    // Forms whose destination is an Operand (register or memory) and whose source
    // is a register: dst goes in ModRM r/m, x in ModRM reg.
1991cb93a386Sopenharmony_ci    void Assembler::add (Operand dst, GP64 x) { this->op(0x01, dst,x); }
1992cb93a386Sopenharmony_ci    void Assembler::sub (Operand dst, GP64 x) { this->op(0x29, dst,x); }
1993cb93a386Sopenharmony_ci    void Assembler::cmp (Operand dst, GP64 x) { this->op(0x39, dst,x); }
1994cb93a386Sopenharmony_ci    void Assembler::mov (Operand dst, GP64 x) { this->op(0x89, dst,x); }
1995cb93a386Sopenharmony_ci    void Assembler::movb(Operand dst, GP64 x) { this->op(0x88, dst,x); }
1996cb93a386Sopenharmony_ci
    // Forms whose destination is a register and whose source is an Operand.  The
    // arguments to op() are swapped on purpose: the Operand still goes in ModRM r/m
    // and the register in ModRM reg; only the opcode differs from the forms above.
1997cb93a386Sopenharmony_ci    void Assembler::add (GP64 dst, Operand x) { this->op(0x03, x,dst); }
1998cb93a386Sopenharmony_ci    void Assembler::sub (GP64 dst, Operand x) { this->op(0x2B, x,dst); }
1999cb93a386Sopenharmony_ci    void Assembler::cmp (GP64 dst, Operand x) { this->op(0x3B, x,dst); }
2000cb93a386Sopenharmony_ci    void Assembler::mov (GP64 dst, Operand x) { this->op(0x8B, x,dst); }
2001cb93a386Sopenharmony_ci    void Assembler::movb(GP64 dst, Operand x) { this->op(0x8A, x,dst); }
2002cb93a386Sopenharmony_ci
    // Two-byte opcodes, packed so that op() copies 0x0F first and then 0xB6 / 0xB7
    // (the copy is in memory order, so this relies on a little-endian host).
2003cb93a386Sopenharmony_ci    void Assembler::movzbq(GP64 dst, Operand x) { this->op(0xB60F, x,dst); }
2004cb93a386Sopenharmony_ci    void Assembler::movzwq(GP64 dst, Operand x) { this->op(0xB70F, x,dst); }
2005cb93a386Sopenharmony_ci
    // 32-bit-lane integer arithmetic.  Each call forwards (prefix, map, opcode)
    // plus the three operands to the VEX op() encoder.
2006cb93a386Sopenharmony_ci    void Assembler::vpaddd (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xfe, dst,x,y); }
2007cb93a386Sopenharmony_ci    void Assembler::vpsubd (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xfa, dst,x,y); }
2008cb93a386Sopenharmony_ci    void Assembler::vpmulld(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x40, dst,x,y); }
2009cb93a386Sopenharmony_ci
    // 16-bit-lane integer operations, encoded the same way; only the map
    // (0x0f vs. 0x380f) and opcode byte differ between them.
2010cb93a386Sopenharmony_ci    void Assembler::vpaddw   (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xfd, dst,x,y); }
2011cb93a386Sopenharmony_ci    void Assembler::vpsubw   (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xf9, dst,x,y); }
2012cb93a386Sopenharmony_ci    void Assembler::vpmullw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xd5, dst,x,y); }
2013cb93a386Sopenharmony_ci    void Assembler::vpavgw   (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xe3, dst,x,y); }
2014cb93a386Sopenharmony_ci    void Assembler::vpmulhrsw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x0b, dst,x,y); }
2015cb93a386Sopenharmony_ci    void Assembler::vpminsw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xea, dst,x,y); }
2016cb93a386Sopenharmony_ci    void Assembler::vpmaxsw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xee, dst,x,y); }
2017cb93a386Sopenharmony_ci    void Assembler::vpminuw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x3a, dst,x,y); }
2018cb93a386Sopenharmony_ci    void Assembler::vpmaxuw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x3e, dst,x,y); }
2019cb93a386Sopenharmony_ci
    // Two-operand form: uses an op() overload taking only (dst, x), which is not
    // defined in this part of the file (presumably declared in the header).
2020cb93a386Sopenharmony_ci    void Assembler::vpabsw(Ymm dst, Operand x) { this->op(0x66,0x380f,0x1d, dst,x); }
2021cb93a386Sopenharmony_ci
2022cb93a386Sopenharmony_ci
    // Bitwise operations, all with prefix 0x66 and map 0x0f.
2023cb93a386Sopenharmony_ci    void Assembler::vpand (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdb, dst,x,y); }
2024cb93a386Sopenharmony_ci    void Assembler::vpor  (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xeb, dst,x,y); }
2025cb93a386Sopenharmony_ci    void Assembler::vpxor (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xef, dst,x,y); }
2026cb93a386Sopenharmony_ci    void Assembler::vpandn(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdf, dst,x,y); }
2027cb93a386Sopenharmony_ci
    // Single-precision float arithmetic: no mandatory prefix (0), map 0x0f.
2028cb93a386Sopenharmony_ci    void Assembler::vaddps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x58, dst,x,y); }
2029cb93a386Sopenharmony_ci    void Assembler::vsubps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5c, dst,x,y); }
2030cb93a386Sopenharmony_ci    void Assembler::vmulps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x59, dst,x,y); }
2031cb93a386Sopenharmony_ci    void Assembler::vdivps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5e, dst,x,y); }
2032cb93a386Sopenharmony_ci    void Assembler::vminps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5d, dst,x,y); }
2033cb93a386Sopenharmony_ci    void Assembler::vmaxps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5f, dst,x,y); }
2034cb93a386Sopenharmony_ci
    // Fused multiply-add family, all with prefix 0x66 and map 0x380f.  Within each
    // group the 132/213/231 variants step the opcode by 0x10 (0x9_, 0xa_, 0xb_);
    // the low nibble (8, a, c) selects fmadd, fmsub, or fnmadd.
2035cb93a386Sopenharmony_ci    void Assembler::vfmadd132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x98, dst,x,y); }
2036cb93a386Sopenharmony_ci    void Assembler::vfmadd213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xa8, dst,x,y); }
2037cb93a386Sopenharmony_ci    void Assembler::vfmadd231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xb8, dst,x,y); }
2038cb93a386Sopenharmony_ci
2039cb93a386Sopenharmony_ci    void Assembler::vfmsub132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x9a, dst,x,y); }
2040cb93a386Sopenharmony_ci    void Assembler::vfmsub213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xaa, dst,x,y); }
2041cb93a386Sopenharmony_ci    void Assembler::vfmsub231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xba, dst,x,y); }
2042cb93a386Sopenharmony_ci
2043cb93a386Sopenharmony_ci    void Assembler::vfnmadd132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x9c, dst,x,y); }
2044cb93a386Sopenharmony_ci    void Assembler::vfnmadd213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xac, dst,x,y); }
2045cb93a386Sopenharmony_ci    void Assembler::vfnmadd231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xbc, dst,x,y); }
2046cb93a386Sopenharmony_ci
    // Pack instructions.  Note the two live in different opcode maps.
2047cb93a386Sopenharmony_ci    void Assembler::vpackusdw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x2b, dst,x,y); }
2048cb93a386Sopenharmony_ci    void Assembler::vpackuswb(Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0x67, dst,x,y); }
2049cb93a386Sopenharmony_ci
    // Unpack (interleave) low / high 32-bit lanes; prefix 0x66, map 0x0f.
2050cb93a386Sopenharmony_ci    void Assembler::vpunpckldq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x62, dst,x,y); }
2051cb93a386Sopenharmony_ci    void Assembler::vpunpckhdq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x6a, dst,x,y); }
2052cb93a386Sopenharmony_ci
    // Integer lane comparisons (equal / greater-than, 32- and 16-bit lanes).
2053cb93a386Sopenharmony_ci    void Assembler::vpcmpeqd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x76, dst,x,y); }
2054cb93a386Sopenharmony_ci    void Assembler::vpcmpeqw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x75, dst,x,y); }
2055cb93a386Sopenharmony_ci    void Assembler::vpcmpgtd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x66, dst,x,y); }
2056cb93a386Sopenharmony_ci    void Assembler::vpcmpgtw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x65, dst,x,y); }
2057cb93a386Sopenharmony_ci
2058cb93a386Sopenharmony_ci
    // Appends a one-byte immediate that follows an already-encoded operand.
    // Must be called right after the operand was emitted: for label operands it
    // rewrites the last four bytes of output, assuming they are the displacement.
2059cb93a386Sopenharmony_ci    void Assembler::imm_byte_after_operand(const Operand& operand, int imm) {
2060cb93a386Sopenharmony_ci        // When we've embedded a label displacement in the middle of an instruction,
2061cb93a386Sopenharmony_ci        // we need to tweak it a little so that the resolved displacement starts
2062cb93a386Sopenharmony_ci        // from the end of the instruction and not the end of the displacement.
        // Skipped when fCode is null, since then there are no bytes to patch.
2063cb93a386Sopenharmony_ci        if (operand.kind == Operand::LABEL && fCode) {
2064cb93a386Sopenharmony_ci            int disp;
2065cb93a386Sopenharmony_ci            memcpy(&disp, fCode+fSize-4, 4);
            // One less, to account for the immediate byte appended below.
2066cb93a386Sopenharmony_ci            disp--;
2067cb93a386Sopenharmony_ci            memcpy(fCode+fSize-4, &disp, 4);
2068cb93a386Sopenharmony_ci        }
2069cb93a386Sopenharmony_ci        this->byte(imm);
2070cb93a386Sopenharmony_ci    }
2071cb93a386Sopenharmony_ci
    // Float comparison; imm is emitted as a trailing byte after the operand
    // (it selects the comparison predicate in the x86 encoding).
2072cb93a386Sopenharmony_ci    void Assembler::vcmpps(Ymm dst, Ymm x, Operand y, int imm) {
2073cb93a386Sopenharmony_ci        this->op(0,0x0f,0xc2, dst,x,y);
2074cb93a386Sopenharmony_ci        this->imm_byte_after_operand(y, imm);
2075cb93a386Sopenharmony_ci    }
2076cb93a386Sopenharmony_ci
    // Four-operand blend: the fourth register z is carried in the upper four bits
    // of the trailing immediate byte (z << 4).
2077cb93a386Sopenharmony_ci    void Assembler::vpblendvb(Ymm dst, Ymm x, Operand y, Ymm z) {
2078cb93a386Sopenharmony_ci        this->op(0x66,0x3a0f,0x4c, dst,x,y);
2079cb93a386Sopenharmony_ci        this->imm_byte_after_operand(y, z << 4);
2080cb93a386Sopenharmony_ci    }
2081cb93a386Sopenharmony_ci
2082cb93a386Sopenharmony_ci    // Shift instructions encode their opcode extension as "dst", dst as x, and x as y.
    // Opcode 0x72 is used for the 32-bit-lane shifts and 0x71 for the 16-bit-lane
    // ones; the extension is 6 for left, 2 for logical right, 4 for arithmetic right.
    // The shift count is emitted as one trailing byte (x is always a register here,
    // so no label displacement can need adjusting).
2083cb93a386Sopenharmony_ci    void Assembler::vpslld(Ymm dst, Ymm x, int imm) {
2084cb93a386Sopenharmony_ci        this->op(0x66,0x0f,0x72,(Ymm)6, dst,x);
2085cb93a386Sopenharmony_ci        this->byte(imm);
2086cb93a386Sopenharmony_ci    }
2087cb93a386Sopenharmony_ci    void Assembler::vpsrld(Ymm dst, Ymm x, int imm) {
2088cb93a386Sopenharmony_ci        this->op(0x66,0x0f,0x72,(Ymm)2, dst,x);
2089cb93a386Sopenharmony_ci        this->byte(imm);
2090cb93a386Sopenharmony_ci    }
2091cb93a386Sopenharmony_ci    void Assembler::vpsrad(Ymm dst, Ymm x, int imm) {
2092cb93a386Sopenharmony_ci        this->op(0x66,0x0f,0x72,(Ymm)4, dst,x);
2093cb93a386Sopenharmony_ci        this->byte(imm);
2094cb93a386Sopenharmony_ci    }
2095cb93a386Sopenharmony_ci    void Assembler::vpsllw(Ymm dst, Ymm x, int imm) {
2096cb93a386Sopenharmony_ci        this->op(0x66,0x0f,0x71,(Ymm)6, dst,x);
2097cb93a386Sopenharmony_ci        this->byte(imm);
2098cb93a386Sopenharmony_ci    }
2099cb93a386Sopenharmony_ci    void Assembler::vpsrlw(Ymm dst, Ymm x, int imm) {
2100cb93a386Sopenharmony_ci        this->op(0x66,0x0f,0x71,(Ymm)2, dst,x);
2101cb93a386Sopenharmony_ci        this->byte(imm);
2102cb93a386Sopenharmony_ci    }
2103cb93a386Sopenharmony_ci    void Assembler::vpsraw(Ymm dst, Ymm x, int imm) {
2104cb93a386Sopenharmony_ci        this->op(0x66,0x0f,0x71,(Ymm)4, dst,x);
2105cb93a386Sopenharmony_ci        this->byte(imm);
2106cb93a386Sopenharmony_ci    }
2107cb93a386Sopenharmony_ci
    // Permute with a one-byte immediate.  Calls an op() overload that takes
    // (dst, x, W), which is not defined in this part of the file.
2108cb93a386Sopenharmony_ci    void Assembler::vpermq(Ymm dst, Operand x, int imm) {
2109cb93a386Sopenharmony_ci        // A bit unusual among the instructions we use, this is 64-bit operation, so we set W.
2110cb93a386Sopenharmony_ci        this->op(0x66,0x3a0f,0x00, dst,x,W1);
2111cb93a386Sopenharmony_ci        this->imm_byte_after_operand(x, imm);
2112cb93a386Sopenharmony_ci    }
2113cb93a386Sopenharmony_ci
    // Three-operand permute followed by a one-byte immediate selector.
2114cb93a386Sopenharmony_ci    void Assembler::vperm2f128(Ymm dst, Ymm x, Operand y, int imm) {
2115cb93a386Sopenharmony_ci        this->op(0x66,0x3a0f,0x06, dst,x,y);
2116cb93a386Sopenharmony_ci        this->imm_byte_after_operand(y, imm);
2117cb93a386Sopenharmony_ci    }
2118cb93a386Sopenharmony_ci
    // Permute by register indices: ix is passed in op()'s x slot and src in its
    // y slot; there is no immediate.
2119cb93a386Sopenharmony_ci    void Assembler::vpermps(Ymm dst, Ymm ix, Operand src) {
2120cb93a386Sopenharmony_ci        this->op(0x66,0x380f,0x16, dst,ix,src);
2121cb93a386Sopenharmony_ci    }
2122cb93a386Sopenharmony_ci
    // Float rounding; the Rounding value is emitted as the trailing immediate byte.
2123cb93a386Sopenharmony_ci    void Assembler::vroundps(Ymm dst, Operand x, Rounding imm) {
2124cb93a386Sopenharmony_ci        this->op(0x66,0x3a0f,0x08, dst,x);
2125cb93a386Sopenharmony_ci        this->imm_byte_after_operand(x, imm);
2126cb93a386Sopenharmony_ci    }
2127cb93a386Sopenharmony_ci
    // Vector moves.  Loads use opcode 0x10 (or 0x6f for vmovdqa); stores use 0x11
    // and swap the argument order so the register stays in op()'s dst slot and the
    // Operand in its last slot.
2128cb93a386Sopenharmony_ci    void Assembler::vmovdqa(Ymm dst, Operand src) { this->op(0x66,0x0f,0x6f, dst,src); }
2129cb93a386Sopenharmony_ci    void Assembler::vmovups(Ymm dst, Operand src) { this->op(   0,0x0f,0x10, dst,src); }
2130cb93a386Sopenharmony_ci    void Assembler::vmovups(Xmm dst, Operand src) { this->op(   0,0x0f,0x10, dst,src); }
2131cb93a386Sopenharmony_ci    void Assembler::vmovups(Operand dst, Ymm src) { this->op(   0,0x0f,0x11, src,dst); }
2132cb93a386Sopenharmony_ci    void Assembler::vmovups(Operand dst, Xmm src) { this->op(   0,0x0f,0x11, src,dst); }
2133cb93a386Sopenharmony_ci
    // Conversions and square root.  The three conversions share map 0x0f and
    // opcode 0x5b and differ only in the prefix (0, 0xf3, 0x66).
2134cb93a386Sopenharmony_ci    void Assembler::vcvtdq2ps (Ymm dst, Operand x) { this->op(   0,0x0f,0x5b, dst,x); }
2135cb93a386Sopenharmony_ci    void Assembler::vcvttps2dq(Ymm dst, Operand x) { this->op(0xf3,0x0f,0x5b, dst,x); }
2136cb93a386Sopenharmony_ci    void Assembler::vcvtps2dq (Ymm dst, Operand x) { this->op(0x66,0x0f,0x5b, dst,x); }
2137cb93a386Sopenharmony_ci    void Assembler::vsqrtps   (Ymm dst, Operand x) { this->op(   0,0x0f,0x51, dst,x); }
2138cb93a386Sopenharmony_ci
    // Float <-> half conversions.  vcvtps2ph writes to an Operand, so the source
    // register x is passed in op()'s dst slot and the rounding mode follows as an
    // immediate byte; vcvtph2ps is an ordinary two-operand load-style encoding.
2139cb93a386Sopenharmony_ci    void Assembler::vcvtps2ph(Operand dst, Ymm x, Rounding imm) {
2140cb93a386Sopenharmony_ci        this->op(0x66,0x3a0f,0x1d, x,dst);
2141cb93a386Sopenharmony_ci        this->imm_byte_after_operand(dst, imm);
2142cb93a386Sopenharmony_ci    }
2143cb93a386Sopenharmony_ci    void Assembler::vcvtph2ps(Ymm dst, Operand x) {
2144cb93a386Sopenharmony_ci        this->op(0x66,0x380f,0x13, dst,x);
2145cb93a386Sopenharmony_ci    }
2146cb93a386Sopenharmony_ci
    // Registers a reference to label l at the current output position and returns
    // the displacement to the label's current offset, in 4-byte instruction units.
    // A label may only be used with one displacement kind (asserted below).
    // NOTE(review): presumably l->references is used later to patch this site once
    // the label's final offset is known -- that code is not in this part of the file.
2147cb93a386Sopenharmony_ci    int Assembler::disp19(Label* l) {
2148cb93a386Sopenharmony_ci        SkASSERT(l->kind == Label::NotYetSet ||
2149cb93a386Sopenharmony_ci                 l->kind == Label::ARMDisp19);
2150cb93a386Sopenharmony_ci        int here = (int)this->size();
2151cb93a386Sopenharmony_ci        l->kind = Label::ARMDisp19;
2152cb93a386Sopenharmony_ci        l->references.push_back(here);
2153cb93a386Sopenharmony_ci        // ARM 19-bit instruction count, from the beginning of this instruction.
2154cb93a386Sopenharmony_ci        return (l->offset - here) / 4;
2155cb93a386Sopenharmony_ci    }
2156cb93a386Sopenharmony_ci
    // Registers a reference to label l at the current output position and returns
    // the byte displacement to the label's current offset, measured from four
    // bytes past the current position (i.e. the end of the displacement field the
    // caller is about to write).  Callers that append more bytes after the
    // displacement must correct for them; see imm_byte_after_operand().
2157cb93a386Sopenharmony_ci    int Assembler::disp32(Label* l) {
2158cb93a386Sopenharmony_ci        SkASSERT(l->kind == Label::NotYetSet ||
2159cb93a386Sopenharmony_ci                 l->kind == Label::X86Disp32);
2160cb93a386Sopenharmony_ci        int here = (int)this->size();
2161cb93a386Sopenharmony_ci        l->kind = Label::X86Disp32;
2162cb93a386Sopenharmony_ci        l->references.push_back(here);
2163cb93a386Sopenharmony_ci        // x86 32-bit byte count, from the end of this instruction.
2164cb93a386Sopenharmony_ci        return l->offset - (here + 4);
2165cb93a386Sopenharmony_ci    }
2166cb93a386Sopenharmony_ci
    // Encodes a VEX-prefixed instruction: the VEX prefix built by vex(), one opcode
    // byte, a ModRM byte, and then whatever addressing bytes the operand kind needs.
    //   prefix, map - forwarded to vex(), which folds them into the VEX prefix
    //   opcode      - the single opcode byte written after the prefix
    //   dst         - register number; low 3 bits go in ModRM reg, bit 3 goes to vex()
    //   x           - register number forwarded to vex() (the non-ModRM source)
    //   y           - register, memory, or label operand, encoded through ModRM r/m
    //   w, l        - forwarded to vex() unchanged
2167cb93a386Sopenharmony_ci    void Assembler::op(int prefix, int map, int opcode, int dst, int x, Operand y, W w, L l) {
2168cb93a386Sopenharmony_ci        switch (y.kind) {
2169cb93a386Sopenharmony_ci            case Operand::REG: {
2170cb93a386Sopenharmony_ci                VEX v = vex(w, dst>>3, 0, y.reg>>3,
2171cb93a386Sopenharmony_ci                            map, x, l, prefix);
2172cb93a386Sopenharmony_ci                this->bytes(v.bytes, v.len);
2173cb93a386Sopenharmony_ci                this->byte(opcode);
2174cb93a386Sopenharmony_ci                this->byte(mod_rm(Mod::Direct, dst&7, y.reg&7));
2175cb93a386Sopenharmony_ci            } return;
2176cb93a386Sopenharmony_ci
2177cb93a386Sopenharmony_ci            case Operand::MEM: {
2178cb93a386Sopenharmony_ci                // Passing rsp as the rm argument to mod_rm() signals an SIB byte follows;
2179cb93a386Sopenharmony_ci                // without an SIB byte, that's where the base register would usually go.
2180cb93a386Sopenharmony_ci                // This means we have to use an SIB byte if we want to use rsp as a base register.
                // Only the low three bits of the base reach the ModRM byte, so compare
                // those rather than the whole register number: a base numbered 12 (r12)
                // has the same low bits as rsp and would otherwise be emitted without
                // the SIB byte its ModRM promises.  This matches the GP64 op() overload.
2181cb93a386Sopenharmony_ci                const Mem& m = y.mem;
2182cb93a386Sopenharmony_ci                const bool need_SIB = (m.base&7) == rsp
2183cb93a386Sopenharmony_ci                                   || m.index != rsp;
2184cb93a386Sopenharmony_ci
2185cb93a386Sopenharmony_ci                VEX v = vex(w, dst>>3, m.index>>3, m.base>>3,
2186cb93a386Sopenharmony_ci                            map, x, l, prefix);
2187cb93a386Sopenharmony_ci                this->bytes(v.bytes, v.len);
2188cb93a386Sopenharmony_ci                this->byte(opcode);
2189cb93a386Sopenharmony_ci                this->byte(mod_rm(mod(m.disp), dst&7, (need_SIB ? rsp : m.base)&7));
2190cb93a386Sopenharmony_ci                if (need_SIB) {
2191cb93a386Sopenharmony_ci                    this->byte(sib(m.scale, m.index&7, m.base&7));
2192cb93a386Sopenharmony_ci                }
                // Displacement width matches the mode chosen for the ModRM byte above.
2193cb93a386Sopenharmony_ci                this->bytes(&m.disp, imm_bytes(mod(m.disp)));
2194cb93a386Sopenharmony_ci            } return;
2195cb93a386Sopenharmony_ci
2196cb93a386Sopenharmony_ci            case Operand::LABEL: {
2197cb93a386Sopenharmony_ci                // IP-relative addressing uses Mod::Indirect with the R/M encoded as-if rbp or r13.
2198cb93a386Sopenharmony_ci                const int rip = rbp;
2199cb93a386Sopenharmony_ci
2200cb93a386Sopenharmony_ci                VEX v = vex(w, dst>>3, 0, rip>>3,
2201cb93a386Sopenharmony_ci                            map, x, l, prefix);
2202cb93a386Sopenharmony_ci                this->bytes(v.bytes, v.len);
2203cb93a386Sopenharmony_ci                this->byte(opcode);
2204cb93a386Sopenharmony_ci                this->byte(mod_rm(Mod::Indirect, dst&7, rip&7));
                // The displacement is the last thing written here; instructions that
                // append an immediate afterwards fix it up in imm_byte_after_operand().
2205cb93a386Sopenharmony_ci                this->word(this->disp32(y.label));
2206cb93a386Sopenharmony_ci            } return;
2207cb93a386Sopenharmony_ci        }
2208cb93a386Sopenharmony_ci    }
2209cb93a386Sopenharmony_ci
    // More map-0x380f instructions.  vptest and vbroadcastss take only two
    // operands and use the (dst, x) form of op().
2210cb93a386Sopenharmony_ci    void Assembler::vpshufb(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x00, dst,x,y); }
2211cb93a386Sopenharmony_ci
2212cb93a386Sopenharmony_ci    void Assembler::vptest(Ymm x, Operand y) { this->op(0x66, 0x380f, 0x17, x,y); }
2213cb93a386Sopenharmony_ci
2214cb93a386Sopenharmony_ci    void Assembler::vbroadcastss(Ymm dst, Operand y) { this->op(0x66,0x380f,0x18, dst,y); }
2215cb93a386Sopenharmony_ci
    // Emits a conditional near jump to label l: 0x0f, the condition byte, then a
    // 4-byte displacement obtained from disp32().  The wrappers below just supply
    // the condition byte.
2216cb93a386Sopenharmony_ci    void Assembler::jump(uint8_t condition, Label* l) {
2217cb93a386Sopenharmony_ci        // These conditional jumps can be either 2 bytes (short) or 6 bytes (near):
2218cb93a386Sopenharmony_ci        //    7?     one-byte-disp
2219cb93a386Sopenharmony_ci        //    0F 8? four-byte-disp
2220cb93a386Sopenharmony_ci        // We always use the near displacement to make updating labels simpler (no resizing).
2221cb93a386Sopenharmony_ci        this->byte(0x0f);
2222cb93a386Sopenharmony_ci        this->byte(condition);
2223cb93a386Sopenharmony_ci        this->word(this->disp32(l));
2224cb93a386Sopenharmony_ci    }
2225cb93a386Sopenharmony_ci    void Assembler::je (Label* l) { this->jump(0x84, l); }
2226cb93a386Sopenharmony_ci    void Assembler::jne(Label* l) { this->jump(0x85, l); }
2227cb93a386Sopenharmony_ci    void Assembler::jl (Label* l) { this->jump(0x8c, l); }
2228cb93a386Sopenharmony_ci    void Assembler::jc (Label* l) { this->jump(0x82, l); }
2229cb93a386Sopenharmony_ci
    // Unconditional jump to label l: opcode 0xe9 followed by a 4-byte displacement.
2230cb93a386Sopenharmony_ci    void Assembler::jmp(Label* l) {
2231cb93a386Sopenharmony_ci        // Like above in jump(), we could use 8-bit displacement here, but always use 32-bit.
2232cb93a386Sopenharmony_ci        this->byte(0xe9);
2233cb93a386Sopenharmony_ci        this->word(this->disp32(l));
2234cb93a386Sopenharmony_ci    }
2235cb93a386Sopenharmony_ci
    // Zero-extending widening loads (16->32 and 8->32 bit lanes); map 0x380f.
2236cb93a386Sopenharmony_ci    void Assembler::vpmovzxwd(Ymm dst, Operand src) { this->op(0x66,0x380f,0x33, dst,src); }
2237cb93a386Sopenharmony_ci    void Assembler::vpmovzxbd(Ymm dst, Operand src) { this->op(0x66,0x380f,0x31, dst,src); }
2238cb93a386Sopenharmony_ci
    // Scalar moves between an Xmm register and an Operand.  Store forms pass the
    // source register first so it lands in op()'s register slot.
2239cb93a386Sopenharmony_ci    void Assembler::vmovq(Operand dst, Xmm src) { this->op(0x66,0x0f,0xd6, src,dst); }
2240cb93a386Sopenharmony_ci
2241cb93a386Sopenharmony_ci    void Assembler::vmovd(Operand dst, Xmm src) { this->op(0x66,0x0f,0x7e, src,dst); }
2242cb93a386Sopenharmony_ci    void Assembler::vmovd(Xmm dst, Operand src) { this->op(0x66,0x0f,0x6e, dst,src); }
2243cb93a386Sopenharmony_ci
    // Lane inserts: encode (dst, src, y) and then append imm as a trailing byte,
    // going through imm_byte_after_operand() because y may be a label.
    // Note vpinsrw lives in map 0x0f while the other two use map 0x3a0f.
2244cb93a386Sopenharmony_ci    void Assembler::vpinsrd(Xmm dst, Xmm src, Operand y, int imm) {
2245cb93a386Sopenharmony_ci        this->op(0x66,0x3a0f,0x22, dst,src,y);
2246cb93a386Sopenharmony_ci        this->imm_byte_after_operand(y, imm);
2247cb93a386Sopenharmony_ci    }
2248cb93a386Sopenharmony_ci    void Assembler::vpinsrw(Xmm dst, Xmm src, Operand y, int imm) {
2249cb93a386Sopenharmony_ci        this->op(0x66,0x0f,0xc4, dst,src,y);
2250cb93a386Sopenharmony_ci        this->imm_byte_after_operand(y, imm);
2251cb93a386Sopenharmony_ci    }
2252cb93a386Sopenharmony_ci    void Assembler::vpinsrb(Xmm dst, Xmm src, Operand y, int imm) {
2253cb93a386Sopenharmony_ci        this->op(0x66,0x3a0f,0x20, dst,src,y);
2254cb93a386Sopenharmony_ci        this->imm_byte_after_operand(y, imm);
2255cb93a386Sopenharmony_ci    }
2256cb93a386Sopenharmony_ci
    // Lane extracts: the source register goes in op()'s register slot and dst in
    // its Operand slot, followed by imm as a trailing byte.  These use plain byte()
    // rather than imm_byte_after_operand(), which is only correct because dst is
    // asserted not to be a label (so there is no displacement to adjust).
2257cb93a386Sopenharmony_ci    void Assembler::vextracti128(Operand dst, Ymm src, int imm) {
2258cb93a386Sopenharmony_ci        this->op(0x66,0x3a0f,0x39, src,dst);
2259cb93a386Sopenharmony_ci        SkASSERT(dst.kind != Operand::LABEL);
2260cb93a386Sopenharmony_ci        this->byte(imm);
2261cb93a386Sopenharmony_ci    }
2262cb93a386Sopenharmony_ci    void Assembler::vpextrd(Operand dst, Xmm src, int imm) {
2263cb93a386Sopenharmony_ci        this->op(0x66,0x3a0f,0x16, src,dst);
2264cb93a386Sopenharmony_ci        SkASSERT(dst.kind != Operand::LABEL);
2265cb93a386Sopenharmony_ci        this->byte(imm);
2266cb93a386Sopenharmony_ci    }
2267cb93a386Sopenharmony_ci    void Assembler::vpextrw(Operand dst, Xmm src, int imm) {
2268cb93a386Sopenharmony_ci        this->op(0x66,0x3a0f,0x15, src,dst);
2269cb93a386Sopenharmony_ci        SkASSERT(dst.kind != Operand::LABEL);
2270cb93a386Sopenharmony_ci        this->byte(imm);
2271cb93a386Sopenharmony_ci    }
2272cb93a386Sopenharmony_ci    void Assembler::vpextrb(Operand dst, Xmm src, int imm) {
2273cb93a386Sopenharmony_ci        this->op(0x66,0x3a0f,0x14, src,dst);
2274cb93a386Sopenharmony_ci        SkASSERT(dst.kind != Operand::LABEL);
2275cb93a386Sopenharmony_ci        this->byte(imm);
2276cb93a386Sopenharmony_ci    }
2277cb93a386Sopenharmony_ci
    // Encodes a gather.  This is written out by hand rather than through op():
    // the index vector ix sits in the SIB index field, base in the SIB base field,
    // and mask is forwarded to vex() in the slot other instructions use for x.
    // No displacement is emitted (Mod::Indirect).
    // NOTE(review): in x86 encoding, an SIB base whose low three bits are 0b101
    // combined with Mod::Indirect means "disp32, no base"; callers presumably never
    // pass such a base (rbp / r13) -- confirm.
2278cb93a386Sopenharmony_ci    void Assembler::vgatherdps(Ymm dst, Scale scale, Ymm ix, GP64 base, Ymm mask) {
2279cb93a386Sopenharmony_ci        // Unlike most instructions, no aliasing is permitted here.
2280cb93a386Sopenharmony_ci        SkASSERT(dst != ix);
2281cb93a386Sopenharmony_ci        SkASSERT(dst != mask);
2282cb93a386Sopenharmony_ci        SkASSERT(mask != ix);
2283cb93a386Sopenharmony_ci
2284cb93a386Sopenharmony_ci        int prefix = 0x66,
2285cb93a386Sopenharmony_ci            map    = 0x380f,
2286cb93a386Sopenharmony_ci            opcode = 0x92;
2287cb93a386Sopenharmony_ci        VEX v = vex(0, dst>>3, ix>>3, base>>3,
2288cb93a386Sopenharmony_ci                    map, mask, /*ymm?*/1, prefix);
2289cb93a386Sopenharmony_ci        this->bytes(v.bytes, v.len);
2290cb93a386Sopenharmony_ci        this->byte(opcode);
2291cb93a386Sopenharmony_ci        this->byte(mod_rm(Mod::Indirect, dst&7, rsp/*use SIB*/));
2292cb93a386Sopenharmony_ci        this->byte(sib(scale, ix&7, base&7));
2293cb93a386Sopenharmony_ci    }
2294cb93a386Sopenharmony_ci
2295cb93a386Sopenharmony_ci    // https://static.docs.arm.com/ddi0596/a/DDI_0596_ARM_a64_instruction_set_architecture.pdf
2296cb93a386Sopenharmony_ci
    // User-defined literal: N_mask is an int with the low N bits set, e.g.
    // 5_mask == 0b11111.  N must be small enough that 1<<N fits in an int.
2297cb93a386Sopenharmony_ci    static int operator"" _mask(unsigned long long bits) { return (1<<(int)bits)-1; }
2298cb93a386Sopenharmony_ci
    // Assembles one 32-bit ARM instruction word from its fields and emits it.
    // Each field is masked to its width before being shifted into place:
    //   hi: 11 bits at bit 21, m: 5 bits at bit 16, lo: 6 bits at bit 10,
    //   n:   5 bits at bit 5,  d: 5 bits at bit 0.
2299cb93a386Sopenharmony_ci    void Assembler::op(uint32_t hi, V m, uint32_t lo, V n, V d) {
2300cb93a386Sopenharmony_ci        this->word( (hi & 11_mask) << 21
2301cb93a386Sopenharmony_ci                  | (m  &  5_mask) << 16
2302cb93a386Sopenharmony_ci                  | (lo &  6_mask) << 10
2303cb93a386Sopenharmony_ci                  | (n  &  5_mask) <<  5
2304cb93a386Sopenharmony_ci                  | (d  &  5_mask) <<  0);
2305cb93a386Sopenharmony_ci    }
    // Variant with a 22-bit opcode at bit 10 plus two registers.  imm is OR'd in
    // as-is, so the caller must already have masked and shifted it into position.
    // Many callers omit imm; presumably it defaults to 0 in the declaration.
2306cb93a386Sopenharmony_ci    void Assembler::op(uint32_t op22, V n, V d, int imm) {
2307cb93a386Sopenharmony_ci        this->word( (op22 & 22_mask) << 10
2308cb93a386Sopenharmony_ci                  | imm  // size and location depends on the instruction
2309cb93a386Sopenharmony_ci                  | (n    &  5_mask) <<  5
2310cb93a386Sopenharmony_ci                  | (d    &  5_mask) <<  0);
2311cb93a386Sopenharmony_ci    }
2312cb93a386Sopenharmony_ci
    // Bitwise operations.  The binary literals use ' digit separators to
    // mirror the bit fields of the instruction encoding.  not16b takes a single
    // source and so uses the 22-bit-opcode form of op().
2313cb93a386Sopenharmony_ci    void Assembler::and16b(V d, V n, V m) { this->op(0b0'1'0'01110'00'1, m, 0b00011'1, n, d); }
2314cb93a386Sopenharmony_ci    void Assembler::orr16b(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b00011'1, n, d); }
2315cb93a386Sopenharmony_ci    void Assembler::eor16b(V d, V n, V m) { this->op(0b0'1'1'01110'00'1, m, 0b00011'1, n, d); }
2316cb93a386Sopenharmony_ci    void Assembler::bic16b(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b00011'1, n, d); }
2317cb93a386Sopenharmony_ci    void Assembler::bsl16b(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b00011'1, n, d); }
2318cb93a386Sopenharmony_ci    void Assembler::not16b(V d, V n)      { this->op(0b0'1'1'01110'00'10000'00101'10,  n, d); }
2319cb93a386Sopenharmony_ci
    // Integer arithmetic, three-register form.
2320cb93a386Sopenharmony_ci    void Assembler::add4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10000'1, n, d); }
2321cb93a386Sopenharmony_ci    void Assembler::sub4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10000'1, n, d); }
2322cb93a386Sopenharmony_ci    void Assembler::mul4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10011'1, n, d); }
2323cb93a386Sopenharmony_ci
    // Integer comparisons (equal, greater-than).
2324cb93a386Sopenharmony_ci    void Assembler::cmeq4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10001'1, n, d); }
2325cb93a386Sopenharmony_ci    void Assembler::cmgt4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b0011'0'1, n, d); }
2326cb93a386Sopenharmony_ci
    // 8h variants: same low opcode fields as sub4s / mul4s, with the two-bit field
    // before the trailing 1 in the high word set to 01 instead of 10.
2327cb93a386Sopenharmony_ci    void Assembler::sub8h(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b10000'1, n, d); }
2328cb93a386Sopenharmony_ci    void Assembler::mul8h(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b10011'1, n, d); }
2329cb93a386Sopenharmony_ci
    // Float arithmetic, three-register form.
2330cb93a386Sopenharmony_ci    void Assembler::fadd4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11010'1, n, d); }
2331cb93a386Sopenharmony_ci    void Assembler::fsub4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11010'1, n, d); }
2332cb93a386Sopenharmony_ci    void Assembler::fmul4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11011'1, n, d); }
2333cb93a386Sopenharmony_ci    void Assembler::fdiv4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11111'1, n, d); }
2334cb93a386Sopenharmony_ci    void Assembler::fmin4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11110'1, n, d); }
2335cb93a386Sopenharmony_ci    void Assembler::fmax4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11110'1, n, d); }
2336cb93a386Sopenharmony_ci
    // Unary float operations: single source, 22-bit-opcode form of op().
2337cb93a386Sopenharmony_ci    void Assembler::fneg4s (V d, V n) { this->op(0b0'1'1'01110'1'0'10000'01111'10, n,d); }
2338cb93a386Sopenharmony_ci    void Assembler::fsqrt4s(V d, V n) { this->op(0b0'1'1'01110'1'0'10000'11111'10, n,d); }
2339cb93a386Sopenharmony_ci
    // Float comparisons.  All three share the low opcode field 0b1110'0'1 and
    // differ only in bits of the high word.
2340cb93a386Sopenharmony_ci    void Assembler::fcmeq4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b1110'0'1, n, d); }
2341cb93a386Sopenharmony_ci    void Assembler::fcmgt4s(V d, V n, V m) { this->op(0b0'1'1'01110'1'0'1, m, 0b1110'0'1, n, d); }
2342cb93a386Sopenharmony_ci    void Assembler::fcmge4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b1110'0'1, n, d); }
2343cb93a386Sopenharmony_ci
    // Fused multiply-accumulate / multiply-subtract; they differ in a single bit
    // of the high word.
2344cb93a386Sopenharmony_ci    void Assembler::fmla4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11001'1, n, d); }
2345cb93a386Sopenharmony_ci    void Assembler::fmls4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11001'1, n, d); }
2346cb93a386Sopenharmony_ci
    // Table lookup, three-register form with an all-zero low opcode field.
2347cb93a386Sopenharmony_ci    void Assembler::tbl(V d, V n, V m) { this->op(0b0'1'001110'00'0, m, 0b0'00'0'00, n, d); }
2348cb93a386Sopenharmony_ci
// Unzip / zip encoders. All four share the high constant 0b0'1'001110'10'0; the low constant
// selects the variant (uzp1, uzp2, zip1, zip2).
// NOTE(review): presumably UZP1/UZP2/ZIP1/ZIP2 on Vd.4S -- confirm against the Arm ARM.
2349cb93a386Sopenharmony_ci    void Assembler::uzp14s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'0'01'10, n, d); }
2350cb93a386Sopenharmony_ci    void Assembler::uzp24s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'1'01'10, n, d); }
2351cb93a386Sopenharmony_ci    void Assembler::zip14s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'0'11'10, n, d); }
2352cb93a386Sopenharmony_ci    void Assembler::zip24s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'1'11'10, n, d); }
2353cb93a386Sopenharmony_ci
// Shift-by-immediate encoders. The shift amount is masked to the field width (5 bits for the
// 4s forms, 4 bits for ushr8h), moved to bit 16 and passed as op()'s immediate argument, so
// out-of-range amounts are silently truncated rather than rejected.
// sli4s / shl4s pass the amount as given; sshr4s / ushr4s / ushr8h pass the negated amount.
// NOTE(review): the negation presumably matches the hardware's "element size minus shift"
// encoding for right shifts -- confirm against the Arm ARM.
2354cb93a386Sopenharmony_ci    void Assembler::sli4s(V d, V n, int imm5) {
2355cb93a386Sopenharmony_ci        this->op(0b0'1'1'011110'0100'000'01010'1,    n, d, ( imm5 & 5_mask)<<16);
2356cb93a386Sopenharmony_ci    }
2357cb93a386Sopenharmony_ci    void Assembler::shl4s(V d, V n, int imm5) {
2358cb93a386Sopenharmony_ci        this->op(0b0'1'0'011110'0100'000'01010'1,    n, d, ( imm5 & 5_mask)<<16);
2359cb93a386Sopenharmony_ci    }
2360cb93a386Sopenharmony_ci    void Assembler::sshr4s(V d, V n, int imm5) {
2361cb93a386Sopenharmony_ci        this->op(0b0'1'0'011110'0100'000'00'0'0'0'1, n, d, (-imm5 & 5_mask)<<16);
2362cb93a386Sopenharmony_ci    }
2363cb93a386Sopenharmony_ci    void Assembler::ushr4s(V d, V n, int imm5) {
2364cb93a386Sopenharmony_ci        this->op(0b0'1'1'011110'0100'000'00'0'0'0'1, n, d, (-imm5 & 5_mask)<<16);
2365cb93a386Sopenharmony_ci    }
2366cb93a386Sopenharmony_ci    void Assembler::ushr8h(V d, V n, int imm4) {
2367cb93a386Sopenharmony_ci        this->op(0b0'1'1'011110'0010'000'00'0'0'0'1, n, d, (-imm4 & 4_mask)<<16);
2368cb93a386Sopenharmony_ci    }
2369cb93a386Sopenharmony_ci
// Two-operand conversion / rounding encoders: one fixed opcode constant plus n and d.
// NOTE(review): presumably SCVTF (int -> float), FCVTZS / FCVTNS (float -> int, toward zero /
// to nearest) and FRINTP / FRINTM (round up / down) on Vd.4S -- confirm against the Arm ARM.
2370cb93a386Sopenharmony_ci    void Assembler::scvtf4s (V d, V n) { this->op(0b0'1'0'01110'0'0'10000'11101'10, n,d); }
2371cb93a386Sopenharmony_ci    void Assembler::fcvtzs4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1101'1'10, n,d); }
2372cb93a386Sopenharmony_ci    void Assembler::fcvtns4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1101'0'10, n,d); }
2373cb93a386Sopenharmony_ci    void Assembler::frintp4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1100'0'10, n,d); }
2374cb93a386Sopenharmony_ci    void Assembler::frintm4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1100'1'10, n,d); }
2375cb93a386Sopenharmony_ci
// fcvtn / fcvtl: the two opcode constants differ in a single bit (10110 vs 10111).
// NOTE(review): presumably the float narrowing / lengthening conversions (f32 <-> f16) --
// confirm the element sizes against the Arm ARM.
2376cb93a386Sopenharmony_ci    void Assembler::fcvtn(V d, V n) { this->op(0b0'0'0'01110'0'0'10000'10110'10, n,d); }
2377cb93a386Sopenharmony_ci    void Assembler::fcvtl(V d, V n) { this->op(0b0'0'0'01110'0'0'10000'10111'10, n,d); }
2378cb93a386Sopenharmony_ci
// xtns2h / xtnh2b: identical opcode constants except for the two-bit field (01 vs 00).
// NOTE(review): presumably XTN narrowing 32->16 bit and 16->8 bit lanes -- confirm.
2379cb93a386Sopenharmony_ci    void Assembler::xtns2h(V d, V n) { this->op(0b0'0'0'01110'01'10000'10010'10, n,d); }
2380cb93a386Sopenharmony_ci    void Assembler::xtnh2b(V d, V n) { this->op(0b0'0'0'01110'00'10000'10010'10, n,d); }
2381cb93a386Sopenharmony_ci
// uxtlb2h / uxtlh2s: identical opcode constants except for the four-bit field (0001 vs 0010).
// NOTE(review): presumably UXTL widening 8->16 bit and 16->32 bit lanes -- confirm.
2382cb93a386Sopenharmony_ci    void Assembler::uxtlb2h(V d, V n) { this->op(0b0'0'1'011110'0001'000'10100'1, n,d); }
2383cb93a386Sopenharmony_ci    void Assembler::uxtlh2s(V d, V n) { this->op(0b0'0'1'011110'0010'000'10100'1, n,d); }
2384cb93a386Sopenharmony_ci
// uminv4s: one fixed opcode constant plus registers n and d go to op().
// NOTE(review): presumably UMINV (unsigned minimum across the four 32-bit lanes) -- confirm.
2385cb93a386Sopenharmony_ci    void Assembler::uminv4s(V d, V n) { this->op(0b0'1'1'01110'10'11000'1'1010'10, n,d); }
2386cb93a386Sopenharmony_ci
// brk: passes a fixed opcode pattern to op() together with imm16 masked to 16 bits and
// shifted left by 5. Bits of imm16 above the low 16 are discarded.
2387cb93a386Sopenharmony_ci    void Assembler::brk(int imm16) {
2388cb93a386Sopenharmony_ci        this->op(0b11010100'001'00000000000, (imm16 & 16_mask) << 5);
2389cb93a386Sopenharmony_ci    }
2390cb93a386Sopenharmony_ci
// ret: passes register n and a zero register value, (X)0, to op() after the fixed opcode.
// NOTE(review): presumably "RET Xn", with n holding the return address -- confirm.
2391cb93a386Sopenharmony_ci    void Assembler::ret(X n) { this->op(0b1101011'0'0'10'11111'0000'0'0, n, (X)0); }
2392cb93a386Sopenharmony_ci
// Immediate add / sub / subs on X registers. imm12 is masked to 12 bits and shifted to bit 10
// before being passed to op(), so values outside [0, 4095] are silently truncated.
// The three opcode constants differ only in their second and third leading bits.
// NOTE(review): subs presumably also sets the condition flags -- confirm against the Arm ARM.
2393cb93a386Sopenharmony_ci    void Assembler::add(X d, X n, int imm12) {
2394cb93a386Sopenharmony_ci        this->op(0b1'0'0'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10);
2395cb93a386Sopenharmony_ci    }
2396cb93a386Sopenharmony_ci    void Assembler::sub(X d, X n, int imm12) {
2397cb93a386Sopenharmony_ci        this->op(0b1'1'0'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10);
2398cb93a386Sopenharmony_ci    }
2399cb93a386Sopenharmony_ci    void Assembler::subs(X d, X n, int imm12) {
2400cb93a386Sopenharmony_ci        this->op(0b1'1'1'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10);
2401cb93a386Sopenharmony_ci    }
2402cb93a386Sopenharmony_ci
// Register add with a shifted second operand. Asserts that shift is not ROR, then packs
// imm6 (bits 0-5), register m (bits 6-10), a zero bit (bit 11) and shift (bits 12-13) into
// one value that is moved up by 10 bits and passed as op()'s immediate argument.
// NOTE(review): presumably "ADD Xd, Xn, Xm, <shift> #imm6" -- confirm against the Arm ARM.
2403cb93a386Sopenharmony_ci    void Assembler::add(X d, X n, X m, Shift shift, int imm6) {
2404cb93a386Sopenharmony_ci        SkASSERT(shift != ROR);
2405cb93a386Sopenharmony_ci
2406cb93a386Sopenharmony_ci        int imm = (imm6  & 6_mask) << 0
2407cb93a386Sopenharmony_ci                | (m     & 5_mask) << 6
2408cb93a386Sopenharmony_ci                | (0     & 1_mask) << 11
2409cb93a386Sopenharmony_ci                | (shift & 2_mask) << 12;
2410cb93a386Sopenharmony_ci        this->op(0b1'0'0'01011'00'0'00000'000000, n,d, imm << 10);
2411cb93a386Sopenharmony_ci    }
2412cb93a386Sopenharmony_ci
// Label-relative branches. Each asks disp19(l) for the displacement to label l, masks it to
// 19 bits and passes it shifted left by 5 as op()'s immediate argument.
// b() passes the condition code, cast to V, in the position other encoders use for the
// destination register; cbz / cbnz pass the tested register t there instead.
// cbz and cbnz differ in a single opcode bit.
2413cb93a386Sopenharmony_ci    void Assembler::b(Condition cond, Label* l) {
2414cb93a386Sopenharmony_ci        const int imm19 = this->disp19(l);
2415cb93a386Sopenharmony_ci        this->op(0b0101010'0'00000000000000, (X)0, (V)cond, (imm19 & 19_mask) << 5);
2416cb93a386Sopenharmony_ci    }
2417cb93a386Sopenharmony_ci    void Assembler::cbz(X t, Label* l) {
2418cb93a386Sopenharmony_ci        const int imm19 = this->disp19(l);
2419cb93a386Sopenharmony_ci        this->op(0b1'011010'0'00000000000000, (X)0, t, (imm19 & 19_mask) << 5);
2420cb93a386Sopenharmony_ci    }
2421cb93a386Sopenharmony_ci    void Assembler::cbnz(X t, Label* l) {
2422cb93a386Sopenharmony_ci        const int imm19 = this->disp19(l);
2423cb93a386Sopenharmony_ci        this->op(0b1'011010'1'00000000000000, (X)0, t, (imm19 & 19_mask) << 5);
2424cb93a386Sopenharmony_ci    }
2425cb93a386Sopenharmony_ci
// Loads into an X register from the address in src. The leading two bits of the opcode
// constant select the variant (11 = ldrd, 10 = ldrs, 01 = ldrh, 00 = ldrb); imm12 is masked
// to 12 bits and shifted to bit 10, so out-of-range offsets are silently truncated.
// NOTE(review): presumably imm12 is an unsigned offset scaled by the access size (8/4/2/1
// bytes) -- confirm against the Arm ARM and the callers.
2426cb93a386Sopenharmony_ci    void Assembler::ldrd(X dst, X src, int imm12) {
2427cb93a386Sopenharmony_ci        this->op(0b11'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
2428cb93a386Sopenharmony_ci    }
2429cb93a386Sopenharmony_ci    void Assembler::ldrs(X dst, X src, int imm12) {
2430cb93a386Sopenharmony_ci        this->op(0b10'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
2431cb93a386Sopenharmony_ci    }
2432cb93a386Sopenharmony_ci    void Assembler::ldrh(X dst, X src, int imm12) {
2433cb93a386Sopenharmony_ci        this->op(0b01'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
2434cb93a386Sopenharmony_ci    }
2435cb93a386Sopenharmony_ci    void Assembler::ldrb(X dst, X src, int imm12) {
2436cb93a386Sopenharmony_ci        this->op(0b00'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
2437cb93a386Sopenharmony_ci    }
2438cb93a386Sopenharmony_ci
// Loads into a V register from the address in X register src. These use the same layout as
// the X-register loads but with the sixth opcode bit set; ldrq additionally uses 00 leading
// bits with an 11 field where the others use 01. imm12 is masked to 12 bits and shifted to
// bit 10, so out-of-range offsets are silently truncated.
// NOTE(review): presumably imm12 is an unsigned offset scaled by the access size (16/8/4/2/1
// bytes) -- confirm against the Arm ARM and the callers.
2439cb93a386Sopenharmony_ci    void Assembler::ldrq(V dst, X src, int imm12) {
2440cb93a386Sopenharmony_ci        this->op(0b00'111'1'01'11'000000000000, src, dst, (imm12 & 12_mask) << 10);
2441cb93a386Sopenharmony_ci    }
2442cb93a386Sopenharmony_ci    void Assembler::ldrd(V dst, X src, int imm12) {
2443cb93a386Sopenharmony_ci        this->op(0b11'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
2444cb93a386Sopenharmony_ci    }
2445cb93a386Sopenharmony_ci    void Assembler::ldrs(V dst, X src, int imm12) {
2446cb93a386Sopenharmony_ci        this->op(0b10'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
2447cb93a386Sopenharmony_ci    }
2448cb93a386Sopenharmony_ci    void Assembler::ldrh(V dst, X src, int imm12) {
2449cb93a386Sopenharmony_ci        this->op(0b01'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
2450cb93a386Sopenharmony_ci    }
2451cb93a386Sopenharmony_ci    void Assembler::ldrb(V dst, X src, int imm12) {
2452cb93a386Sopenharmony_ci        this->op(0b00'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
2453cb93a386Sopenharmony_ci    }
2454cb93a386Sopenharmony_ci
// Store from X register src to the address in dst. Note the argument order handed to op():
// the address register dst comes first and the data register src second, mirroring the
// loads. imm12 is masked to 12 bits and shifted to bit 10.
// NOTE(review): presumably a 32-bit store with imm12 scaled by 4 -- confirm.
2455cb93a386Sopenharmony_ci    void Assembler::strs(X src, X dst, int imm12) {
2456cb93a386Sopenharmony_ci        this->op(0b10'111'0'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
2457cb93a386Sopenharmony_ci    }
2458cb93a386Sopenharmony_ci
// Stores from a V register to the address in X register dst. As with the loads, the address
// register is passed to op() first and the data register second. The opcode constants match
// the V-register loads except for the two-bit field before the immediate (00 here, 10 for
// strq, versus 01 / 11 in the loads). imm12 is masked to 12 bits and shifted to bit 10, so
// out-of-range offsets are silently truncated.
// NOTE(review): presumably imm12 is an unsigned offset scaled by the access size -- confirm.
2459cb93a386Sopenharmony_ci    void Assembler::strq(V src, X dst, int imm12) {
2460cb93a386Sopenharmony_ci        this->op(0b00'111'1'01'10'000000000000, dst, src, (imm12 & 12_mask) << 10);
2461cb93a386Sopenharmony_ci    }
2462cb93a386Sopenharmony_ci    void Assembler::strd(V src, X dst, int imm12) {
2463cb93a386Sopenharmony_ci        this->op(0b11'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
2464cb93a386Sopenharmony_ci    }
2465cb93a386Sopenharmony_ci    void Assembler::strs(V src, X dst, int imm12) {
2466cb93a386Sopenharmony_ci        this->op(0b10'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
2467cb93a386Sopenharmony_ci    }
2468cb93a386Sopenharmony_ci    void Assembler::strh(V src, X dst, int imm12) {
2469cb93a386Sopenharmony_ci        this->op(0b01'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
2470cb93a386Sopenharmony_ci    }
2471cb93a386Sopenharmony_ci    void Assembler::strb(V src, X dst, int imm12) {
2472cb93a386Sopenharmony_ci        this->op(0b00'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
2473cb93a386Sopenharmony_ci    }
2474cb93a386Sopenharmony_ci
// Lane moves between a V register and an X register. Both build imm5 = (lane << 3) | 0b100,
// mask it to 5 bits and pass it shifted to bit 16. Because of the mask only the low two bits
// of lane survive, so lane values outside 0..3 alias onto 0..3.
// NOTE(review): presumably movs copies 32-bit lane `lane` of src into dst, and inss inserts
// src into 32-bit lane `lane` of dst -- confirm against the Arm ARM.
2475cb93a386Sopenharmony_ci    void Assembler::movs(X dst, V src, int lane) {
2476cb93a386Sopenharmony_ci        int imm5 = (lane << 3) | 0b100;
2477cb93a386Sopenharmony_ci        this->op(0b0'0'0'01110000'00000'0'01'1'1'1, src, dst, (imm5 & 5_mask) << 16);
2478cb93a386Sopenharmony_ci    }
2479cb93a386Sopenharmony_ci    void Assembler::inss(V dst, X src, int lane) {
2480cb93a386Sopenharmony_ci        int imm5 = (lane << 3) | 0b100;
2481cb93a386Sopenharmony_ci        this->op(0b0'1'0'01110000'00000'0'0011'1, src, dst, (imm5 & 5_mask) << 16);
2482cb93a386Sopenharmony_ci    }
2483cb93a386Sopenharmony_ci
2484cb93a386Sopenharmony_ci
// Label-relative load into V register dst. The displacement comes from disp19(l), is masked
// to 19 bits and passed shifted left by 5; the first register argument to op() is (V)0.
// NOTE(review): presumably a 128-bit PC-relative literal load -- confirm against the Arm ARM.
2485cb93a386Sopenharmony_ci    void Assembler::ldrq(V dst, Label* l) {
2486cb93a386Sopenharmony_ci        const int imm19 = this->disp19(l);
2487cb93a386Sopenharmony_ci        this->op(0b10'011'1'00'00000000000000, (V)0, dst, (imm19 & 19_mask) << 5);
2488cb93a386Sopenharmony_ci    }
2489cb93a386Sopenharmony_ci
// dup4s: passes a fixed opcode constant plus X register src and V register dst to op().
// NOTE(review): presumably broadcasts the low 32 bits of src to all four lanes of dst --
// confirm against the Arm ARM.
2490cb93a386Sopenharmony_ci    void Assembler::dup4s(V dst, X src) {
2491cb93a386Sopenharmony_ci        this->op(0b0'1'0'01110000'00100'0'0001'1, src, dst);
2492cb93a386Sopenharmony_ci    }
2493cb93a386Sopenharmony_ci
// ld1r encoders. The three opcode constants are identical except for the trailing two bits
// (10 for 4s, 01 for 8h, 00 for 16b); src is the X register passed first, dst the V register.
// NOTE(review): presumably LD1R, loading one element from [src] and replicating it to every
// lane of dst -- confirm against the Arm ARM.
2494cb93a386Sopenharmony_ci    void Assembler::ld1r4s(V dst, X src) {
2495cb93a386Sopenharmony_ci        this->op(0b0'1'0011010'1'0'00000'110'0'10, src, dst);
2496cb93a386Sopenharmony_ci    }
2497cb93a386Sopenharmony_ci    void Assembler::ld1r8h(V dst, X src) {
2498cb93a386Sopenharmony_ci        this->op(0b0'1'0011010'1'0'00000'110'0'01, src, dst);
2499cb93a386Sopenharmony_ci    }
2500cb93a386Sopenharmony_ci    void Assembler::ld1r16b(V dst, X src) {
2501cb93a386Sopenharmony_ci        this->op(0b0'1'0011010'1'0'00000'110'0'00, src, dst);
2502cb93a386Sopenharmony_ci    }
2503cb93a386Sopenharmony_ci
// Whole-register ld2 / ld4 / st2 / st4 encoders. Loads and stores differ in one opcode bit
// (1 vs 0 after 0011000); the 2- and 4-register forms differ in the four-bit field
// (1000 vs 0000). In every case the X address register is passed to op() before the V register.
// NOTE(review): presumably the de-interleaving / interleaving multi-structure forms operating
// on consecutive V registers starting at the one given -- confirm against the Arm ARM.
2504cb93a386Sopenharmony_ci    void Assembler::ld24s(V dst, X src) { this->op(0b0'1'0011000'1'000000'1000'10, src, dst); }
2505cb93a386Sopenharmony_ci    void Assembler::ld44s(V dst, X src) { this->op(0b0'1'0011000'1'000000'0000'10, src, dst); }
2506cb93a386Sopenharmony_ci    void Assembler::st24s(V src, X dst) { this->op(0b0'1'0011000'0'000000'1000'10, dst, src); }
2507cb93a386Sopenharmony_ci    void Assembler::st44s(V src, X dst) { this->op(0b0'1'0011000'0'000000'0000'10, dst, src); }
2508cb93a386Sopenharmony_ci
// Single-lane ld2 / ld4 encoders. The lane index is split into Q (bit 1 of lane) and S (bit 0
// of lane), which are passed to op() at bit 30 and bit 12 respectively; higher bits of lane
// are ignored. The two opcode constants differ in one bit of the three-bit field (100 vs 101).
// NOTE(review): presumably loads one 32-bit element per register into lane `lane` of
// consecutive V registers starting at dst -- confirm against the Arm ARM.
2509cb93a386Sopenharmony_ci    void Assembler::ld24s(V dst, X src, int lane) {
2510cb93a386Sopenharmony_ci        int Q = (lane & 2)>>1,
2511cb93a386Sopenharmony_ci            S = (lane & 1);
2512cb93a386Sopenharmony_ci                 /*  Q                       S */
2513cb93a386Sopenharmony_ci        this->op(0b0'0'0011010'1'1'00000'100'0'00, src, dst, (Q<<30)|(S<<12));
2514cb93a386Sopenharmony_ci    }
2515cb93a386Sopenharmony_ci    void Assembler::ld44s(V dst, X src, int lane) {
2516cb93a386Sopenharmony_ci        int Q = (lane & 2)>>1,
2517cb93a386Sopenharmony_ci            S = (lane & 1);
2518cb93a386Sopenharmony_ci        this->op(0b0'0'0011010'1'1'00000'101'0'00, src, dst, (Q<<30)|(S<<12));
2519cb93a386Sopenharmony_ci    }
2520cb93a386Sopenharmony_ci
// Binds label l to the current end of the code buffer and re-targets every recorded reference.
// While fCode is null nothing happens, and l->offset is left untouched.
// Each reference already encodes a displacement relative to the old l->offset, so it is
// adjusted by the distance the label moved: counted in 4-byte instructions for ARMDisp19
// labels, and in bytes for X86Disp32 labels.
2521cb93a386Sopenharmony_ci    void Assembler::label(Label* l) {
2522cb93a386Sopenharmony_ci        if (fCode) {
2523cb93a386Sopenharmony_ci            // The instructions all currently point to l->offset.
2524cb93a386Sopenharmony_ci            // We'll want to add a delta to point them to here.
2525cb93a386Sopenharmony_ci            int here = (int)this->size();
2526cb93a386Sopenharmony_ci            int delta = here - l->offset;
2527cb93a386Sopenharmony_ci            l->offset = here;
2528cb93a386Sopenharmony_ci
2529cb93a386Sopenharmony_ci            if (l->kind == Label::ARMDisp19) {
2530cb93a386Sopenharmony_ci                for (int ref : l->references) {
2531cb93a386Sopenharmony_ci                    // ref points to a 32-bit instruction with 19-bit displacement in instructions.
2532cb93a386Sopenharmony_ci                    uint32_t inst;
2533cb93a386Sopenharmony_ci                    memcpy(&inst, fCode + ref, 4);
2534cb93a386Sopenharmony_ci
2535cb93a386Sopenharmony_ci                    // [ 8 bits to preserve] [ 19 bit signed displacement ] [ 5 bits to preserve ]
2536cb93a386Sopenharmony_ci                    int disp = (int)(inst << 8) >> 13;
2537cb93a386Sopenharmony_ci
2538cb93a386Sopenharmony_ci                    disp += delta/4;  // delta is in bytes, we want instructions.
2539cb93a386Sopenharmony_ci
2540cb93a386Sopenharmony_ci                    // Put it all back together, preserving the high 8 bits and low 5.
                    // disp is negative for backward branches; shift it as unsigned because
                    // left-shifting a negative signed value is undefined before C++20.
2541cb93a386Sopenharmony_ci                    inst = (((uint32_t)disp << 5) &  (19_mask << 5))
2542cb93a386Sopenharmony_ci                         | ((inst     ) & ~(19_mask << 5));
2543cb93a386Sopenharmony_ci                    memcpy(fCode + ref, &inst, 4);
2544cb93a386Sopenharmony_ci                }
2545cb93a386Sopenharmony_ci            }
2546cb93a386Sopenharmony_ci
2547cb93a386Sopenharmony_ci            if (l->kind == Label::X86Disp32) {
2548cb93a386Sopenharmony_ci                for (int ref : l->references) {
2549cb93a386Sopenharmony_ci                    // ref points to a 32-bit displacement in bytes.
2550cb93a386Sopenharmony_ci                    int disp;
2551cb93a386Sopenharmony_ci                    memcpy(&disp, fCode + ref, 4);
2552cb93a386Sopenharmony_ci
2553cb93a386Sopenharmony_ci                    disp += delta;
2554cb93a386Sopenharmony_ci
2555cb93a386Sopenharmony_ci                    memcpy(fCode + ref, &disp, 4);
2556cb93a386Sopenharmony_ci                }
2557cb93a386Sopenharmony_ci            }
2558cb93a386Sopenharmony_ci        }
2559cb93a386Sopenharmony_ci    }
2560cb93a386Sopenharmony_ci
// Runs this program over n elements using the argument pointers in args.
// Unless SKVM_JIT_BUT_IGNORE_IT is defined, a non-null fImpl->jit_entry is called directly
// when gSkVMAllowJIT is set, with n followed by one pointer per entry in fImpl->strides
// (0 to 7 pointers). In every other case the call falls through to SkOpts::interpret_skvm().
// The SKVM_JIT_STATS block is compiled out by default; when enabled it counts calls and
// elements and prints the JIT hit rates at process exit.
2561cb93a386Sopenharmony_ci    void Program::eval(int n, void* args[]) const {
2562cb93a386Sopenharmony_ci    #define SKVM_JIT_STATS 0
2563cb93a386Sopenharmony_ci    #if SKVM_JIT_STATS
2564cb93a386Sopenharmony_ci        static std::atomic<int64_t>  calls{0}, jits{0},
2565cb93a386Sopenharmony_ci                                    pixels{0}, fast{0};
2566cb93a386Sopenharmony_ci        pixels += n;
2567cb93a386Sopenharmony_ci        if (0 == calls++) {
2568cb93a386Sopenharmony_ci            atexit([]{
2569cb93a386Sopenharmony_ci                int64_t num = jits .load(),
2570cb93a386Sopenharmony_ci                        den = calls.load();
                // int64_t is not necessarily long long, so cast to match the %lld specifier.
2571cb93a386Sopenharmony_ci                SkDebugf("%.3g%% of %lld eval() calls went through JIT.\n",
                         (100.0 * num)/den, (long long)den);
2572cb93a386Sopenharmony_ci                num = fast  .load();
2573cb93a386Sopenharmony_ci                den = pixels.load();
2574cb93a386Sopenharmony_ci                SkDebugf("%.3g%% of %lld pixels went through JIT.\n",
                         (100.0 * num)/den, (long long)den);
2575cb93a386Sopenharmony_ci            });
2576cb93a386Sopenharmony_ci        }
2577cb93a386Sopenharmony_ci    #endif
2578cb93a386Sopenharmony_ci
2579cb93a386Sopenharmony_ci    #if !defined(SKVM_JIT_BUT_IGNORE_IT)
2580cb93a386Sopenharmony_ci        const void* jit_entry = fImpl->jit_entry.load();
2581cb93a386Sopenharmony_ci        // jit_entry may be null either simply because we can't JIT, or when using LLVM
2582cb93a386Sopenharmony_ci        // if the work represented by fImpl->llvm_compiling hasn't finished yet.
2583cb93a386Sopenharmony_ci        //
2584cb93a386Sopenharmony_ci        // Ordinarily we'd never find ourselves with non-null jit_entry and !gSkVMAllowJIT, but it
2585cb93a386Sopenharmony_ci        // can happen during interactive programs like Viewer that toggle gSkVMAllowJIT on and off,
2586cb93a386Sopenharmony_ci        // due to timing or program caching.
2587cb93a386Sopenharmony_ci        if (jit_entry != nullptr && gSkVMAllowJIT) {
2588cb93a386Sopenharmony_ci        #if SKVM_JIT_STATS
2589cb93a386Sopenharmony_ci            jits++;
2590cb93a386Sopenharmony_ci            fast += n;
2591cb93a386Sopenharmony_ci        #endif
2592cb93a386Sopenharmony_ci            void** a = args;
2593cb93a386Sopenharmony_ci            switch (fImpl->strides.size()) {
2594cb93a386Sopenharmony_ci                case 0: return ((void(*)(int                        ))jit_entry)(n               );
2595cb93a386Sopenharmony_ci                case 1: return ((void(*)(int,void*                  ))jit_entry)(n,a[0]          );
2596cb93a386Sopenharmony_ci                case 2: return ((void(*)(int,void*,void*            ))jit_entry)(n,a[0],a[1]     );
2597cb93a386Sopenharmony_ci                case 3: return ((void(*)(int,void*,void*,void*      ))jit_entry)(n,a[0],a[1],a[2]);
2598cb93a386Sopenharmony_ci                case 4: return ((void(*)(int,void*,void*,void*,void*))jit_entry)
2599cb93a386Sopenharmony_ci                                (n,a[0],a[1],a[2],a[3]);
2600cb93a386Sopenharmony_ci                case 5: return ((void(*)(int,void*,void*,void*,void*,void*))jit_entry)
2601cb93a386Sopenharmony_ci                                (n,a[0],a[1],a[2],a[3],a[4]);
2602cb93a386Sopenharmony_ci                case 6: return ((void(*)(int,void*,void*,void*,void*,void*,void*))jit_entry)
2603cb93a386Sopenharmony_ci                                (n,a[0],a[1],a[2],a[3],a[4],a[5]);
2604cb93a386Sopenharmony_ci                case 7: return ((void(*)(int,void*,void*,void*,void*,void*,void*,void*))jit_entry)
2605cb93a386Sopenharmony_ci                                (n,a[0],a[1],a[2],a[3],a[4],a[5],a[6]);
2606cb93a386Sopenharmony_ci                default: break;  // No call signature for more than 7 pointers; use the interpreter.
2607cb93a386Sopenharmony_ci            }
2608cb93a386Sopenharmony_ci        }
2609cb93a386Sopenharmony_ci    #endif
2610cb93a386Sopenharmony_ci
2611cb93a386Sopenharmony_ci        // So we'll sometimes use the interpreter here even if later calls will use the JIT.
2612cb93a386Sopenharmony_ci        SkOpts::interpret_skvm(fImpl->instructions.data(), (int)fImpl->instructions.size(),
2613cb93a386Sopenharmony_ci                               this->nregs(), this->loop(), fImpl->strides.data(), this->nargs(),
2614cb93a386Sopenharmony_ci                               n, args);
2615cb93a386Sopenharmony_ci    }
2616cb93a386Sopenharmony_ci
2617cb93a386Sopenharmony_ci    #if defined(SKVM_LLVM)
2618cb93a386Sopenharmony_ci    // -- SKVM_LLVM --------------------------------------------------------------------------------
2619cb93a386Sopenharmony_ci    void Program::setupLLVM(const std::vector<OptimizedInstruction>& instructions,
2620cb93a386Sopenharmony_ci                            const char* debug_name) {
2621cb93a386Sopenharmony_ci        auto ctx = std::make_unique<llvm::LLVMContext>();
2622cb93a386Sopenharmony_ci
2623cb93a386Sopenharmony_ci        auto mod = std::make_unique<llvm::Module>("", *ctx);
2624cb93a386Sopenharmony_ci        // All the scary bare pointers from here on are owned by ctx or mod, I think.
2625cb93a386Sopenharmony_ci
2626cb93a386Sopenharmony_ci        // Everything I've tested runs faster at K=8 (using ymm) than K=16 (zmm) on SKX machines.
2627cb93a386Sopenharmony_ci        const int K = (true && SkCpu::Supports(SkCpu::HSW)) ? 8 : 4;
2628cb93a386Sopenharmony_ci
2629cb93a386Sopenharmony_ci        llvm::Type *ptr = llvm::Type::getInt8Ty(*ctx)->getPointerTo(),
2630cb93a386Sopenharmony_ci                   *i32 = llvm::Type::getInt32Ty(*ctx);
2631cb93a386Sopenharmony_ci
2632cb93a386Sopenharmony_ci        std::vector<llvm::Type*> arg_types = { i32 };
2633cb93a386Sopenharmony_ci        for (size_t i = 0; i < fImpl->strides.size(); i++) {
2634cb93a386Sopenharmony_ci            arg_types.push_back(ptr);
2635cb93a386Sopenharmony_ci        }
2636cb93a386Sopenharmony_ci
2637cb93a386Sopenharmony_ci        llvm::FunctionType* fn_type = llvm::FunctionType::get(llvm::Type::getVoidTy(*ctx),
2638cb93a386Sopenharmony_ci                                                              arg_types, /*vararg?=*/false);
2639cb93a386Sopenharmony_ci        llvm::Function* fn
2640cb93a386Sopenharmony_ci            = llvm::Function::Create(fn_type, llvm::GlobalValue::ExternalLinkage, debug_name, *mod);
2641cb93a386Sopenharmony_ci        for (size_t i = 0; i < fImpl->strides.size(); i++) {
2642cb93a386Sopenharmony_ci            fn->addParamAttr(i+1, llvm::Attribute::NoAlias);
2643cb93a386Sopenharmony_ci        }
2644cb93a386Sopenharmony_ci
2645cb93a386Sopenharmony_ci        llvm::BasicBlock *enter  = llvm::BasicBlock::Create(*ctx, "enter" , fn),
2646cb93a386Sopenharmony_ci                         *hoistK = llvm::BasicBlock::Create(*ctx, "hoistK", fn),
2647cb93a386Sopenharmony_ci                         *testK  = llvm::BasicBlock::Create(*ctx, "testK" , fn),
2648cb93a386Sopenharmony_ci                         *loopK  = llvm::BasicBlock::Create(*ctx, "loopK" , fn),
2649cb93a386Sopenharmony_ci                         *hoist1 = llvm::BasicBlock::Create(*ctx, "hoist1", fn),
2650cb93a386Sopenharmony_ci                         *test1  = llvm::BasicBlock::Create(*ctx, "test1" , fn),
2651cb93a386Sopenharmony_ci                         *loop1  = llvm::BasicBlock::Create(*ctx, "loop1" , fn),
2652cb93a386Sopenharmony_ci                         *leave  = llvm::BasicBlock::Create(*ctx, "leave" , fn);
2653cb93a386Sopenharmony_ci
2654cb93a386Sopenharmony_ci        using IRBuilder = llvm::IRBuilder<>;
2655cb93a386Sopenharmony_ci
2656cb93a386Sopenharmony_ci        llvm::PHINode*                 n;
2657cb93a386Sopenharmony_ci        std::vector<llvm::PHINode*> args;
2658cb93a386Sopenharmony_ci        std::vector<llvm::Value*> vals(instructions.size());
2659cb93a386Sopenharmony_ci
2660cb93a386Sopenharmony_ci        auto emit = [&](size_t i, bool scalar, IRBuilder* b) {
2661cb93a386Sopenharmony_ci            auto [op, x,y,z,w, immA,immB,immC, death,can_hoist] = instructions[i];
2662cb93a386Sopenharmony_ci
2663cb93a386Sopenharmony_ci            llvm::Type *i1    = llvm::Type::getInt1Ty (*ctx),
2664cb93a386Sopenharmony_ci                       *i8    = llvm::Type::getInt8Ty (*ctx),
2665cb93a386Sopenharmony_ci                       *i16   = llvm::Type::getInt16Ty(*ctx),
2666cb93a386Sopenharmony_ci                       *f32   = llvm::Type::getFloatTy(*ctx),
2667cb93a386Sopenharmony_ci                       *I1    = scalar ? i1    : llvm::VectorType::get(i1 , K, false  ),
2668cb93a386Sopenharmony_ci                       *I8    = scalar ? i8    : llvm::VectorType::get(i8 , K, false  ),
2669cb93a386Sopenharmony_ci                       *I16   = scalar ? i16   : llvm::VectorType::get(i16, K, false  ),
2670cb93a386Sopenharmony_ci                       *I32   = scalar ? i32   : llvm::VectorType::get(i32, K, false  ),
2671cb93a386Sopenharmony_ci                       *F32   = scalar ? f32   : llvm::VectorType::get(f32, K, false  );
2672cb93a386Sopenharmony_ci
2673cb93a386Sopenharmony_ci            auto I  = [&](llvm::Value* v) { return b->CreateBitCast(v, I32  ); };
2674cb93a386Sopenharmony_ci            auto F  = [&](llvm::Value* v) { return b->CreateBitCast(v, F32  ); };
2675cb93a386Sopenharmony_ci
2676cb93a386Sopenharmony_ci            auto S = [&](llvm::Type* dst, llvm::Value* v) { return b->CreateSExt(v, dst); };
2677cb93a386Sopenharmony_ci
2678cb93a386Sopenharmony_ci            llvm::Type* vt = nullptr;
2679cb93a386Sopenharmony_ci            switch (llvm::Type* t = nullptr; op) {
2680cb93a386Sopenharmony_ci                default:
2681cb93a386Sopenharmony_ci                    SkDebugf("can't llvm %s (%d)\n", name(op), op);
2682cb93a386Sopenharmony_ci                    return false;
2683cb93a386Sopenharmony_ci
2684cb93a386Sopenharmony_ci                case Op::assert_true: /*TODO*/ break;
2685cb93a386Sopenharmony_ci
2686cb93a386Sopenharmony_ci                case Op::trace_line:
2687cb93a386Sopenharmony_ci                case Op::trace_var:
2688cb93a386Sopenharmony_ci                case Op::trace_call:
2689cb93a386Sopenharmony_ci                    /* Only supported in the interpreter. */
2690cb93a386Sopenharmony_ci                    break;
2691cb93a386Sopenharmony_ci
2692cb93a386Sopenharmony_ci                case Op::index:
2693cb93a386Sopenharmony_ci                    if (I32->isVectorTy()) {
2694cb93a386Sopenharmony_ci                        std::vector<llvm::Constant*> iota(K);
2695cb93a386Sopenharmony_ci                        for (int j = 0; j < K; j++) {
2696cb93a386Sopenharmony_ci                            iota[j] = b->getInt32(j);
2697cb93a386Sopenharmony_ci                        }
2698cb93a386Sopenharmony_ci                        vals[i] = b->CreateSub(b->CreateVectorSplat(K, n),
2699cb93a386Sopenharmony_ci                                               llvm::ConstantVector::get(iota));
2700cb93a386Sopenharmony_ci                    } else {
2701cb93a386Sopenharmony_ci                        vals[i] = n;
2702cb93a386Sopenharmony_ci                    } break;
2703cb93a386Sopenharmony_ci
2704cb93a386Sopenharmony_ci                case Op::load8:  t = I8 ; goto load;
2705cb93a386Sopenharmony_ci                case Op::load16: t = I16; goto load;
2706cb93a386Sopenharmony_ci                case Op::load32: t = I32; goto load;
2707cb93a386Sopenharmony_ci                load: {
2708cb93a386Sopenharmony_ci                    llvm::Value* ptr = b->CreateBitCast(args[immA], t->getPointerTo());
2709cb93a386Sopenharmony_ci                    vals[i] = b->CreateZExt(
2710cb93a386Sopenharmony_ci                            b->CreateAlignedLoad(t, ptr, llvm::MaybeAlign{1}), I32);
2711cb93a386Sopenharmony_ci                } break;
2712cb93a386Sopenharmony_ci
2713cb93a386Sopenharmony_ci
2714cb93a386Sopenharmony_ci                case Op::splat: vals[i] = llvm::ConstantInt::get(I32, immA); break;
2715cb93a386Sopenharmony_ci
2716cb93a386Sopenharmony_ci                case Op::uniform32: {
2717cb93a386Sopenharmony_ci                    llvm::Value* ptr = b->CreateBitCast(
2718cb93a386Sopenharmony_ci                            b->CreateConstInBoundsGEP1_32(i8, args[immA], immB),
2719cb93a386Sopenharmony_ci                            i32->getPointerTo());
2720cb93a386Sopenharmony_ci                    llvm::Value* val = b->CreateZExt(
2721cb93a386Sopenharmony_ci                            b->CreateAlignedLoad(i32, ptr, llvm::MaybeAlign{1}), i32);
2722cb93a386Sopenharmony_ci                    vals[i] = I32->isVectorTy() ? b->CreateVectorSplat(K, val)
2723cb93a386Sopenharmony_ci                                                : val;
2724cb93a386Sopenharmony_ci                } break;
2725cb93a386Sopenharmony_ci
2726cb93a386Sopenharmony_ci                case Op::gather8:  t = i8 ; vt = I8; goto gather;
2727cb93a386Sopenharmony_ci                case Op::gather16: t = i16; vt = I16; goto gather;
2728cb93a386Sopenharmony_ci                case Op::gather32: t = i32; vt = I32; goto gather;
2729cb93a386Sopenharmony_ci                gather: {
2730cb93a386Sopenharmony_ci                    // Our gather base pointer is immB bytes off of uniform immA.
2731cb93a386Sopenharmony_ci                    llvm::Value* base =
2732cb93a386Sopenharmony_ci                        b->CreateLoad(b->CreateBitCast(
2733cb93a386Sopenharmony_ci                                b->CreateConstInBoundsGEP1_32(i8, args[immA],immB),
2734cb93a386Sopenharmony_ci                                t->getPointerTo()->getPointerTo()));
2735cb93a386Sopenharmony_ci
2736cb93a386Sopenharmony_ci                    llvm::Value* ptr = b->CreateInBoundsGEP(t, base, vals[x]);
2737cb93a386Sopenharmony_ci                    llvm::Value* gathered;
2738cb93a386Sopenharmony_ci                    if (ptr->getType()->isVectorTy()) {
2739cb93a386Sopenharmony_ci                        gathered = b->CreateMaskedGather(
2740cb93a386Sopenharmony_ci                                vt,
2741cb93a386Sopenharmony_ci                                ptr,
2742cb93a386Sopenharmony_ci                                llvm::Align{1});
2743cb93a386Sopenharmony_ci                    } else {
2744cb93a386Sopenharmony_ci                        gathered = b->CreateAlignedLoad(vt, ptr, llvm::MaybeAlign{1});
2745cb93a386Sopenharmony_ci                    }
2746cb93a386Sopenharmony_ci                    vals[i] = b->CreateZExt(gathered, I32);
2747cb93a386Sopenharmony_ci                } break;
2748cb93a386Sopenharmony_ci
2749cb93a386Sopenharmony_ci                case Op::store8:  t = I8 ; goto store;
2750cb93a386Sopenharmony_ci                case Op::store16: t = I16; goto store;
2751cb93a386Sopenharmony_ci                case Op::store32: t = I32; goto store;
2752cb93a386Sopenharmony_ci                store: {
2753cb93a386Sopenharmony_ci                    llvm::Value* val = b->CreateTrunc(vals[x], t);
2754cb93a386Sopenharmony_ci                    llvm::Value* ptr = b->CreateBitCast(args[immA],
2755cb93a386Sopenharmony_ci                                                        val->getType()->getPointerTo());
2756cb93a386Sopenharmony_ci                    vals[i] = b->CreateAlignedStore(val, ptr, llvm::MaybeAlign{1});
2757cb93a386Sopenharmony_ci                } break;
2758cb93a386Sopenharmony_ci
2759cb93a386Sopenharmony_ci                case Op::bit_and:   vals[i] = b->CreateAnd(vals[x], vals[y]); break;
2760cb93a386Sopenharmony_ci                case Op::bit_or :   vals[i] = b->CreateOr (vals[x], vals[y]); break;
2761cb93a386Sopenharmony_ci                case Op::bit_xor:   vals[i] = b->CreateXor(vals[x], vals[y]); break;
2762cb93a386Sopenharmony_ci                case Op::bit_clear: vals[i] = b->CreateAnd(vals[x], b->CreateNot(vals[y])); break;
2763cb93a386Sopenharmony_ci
2764cb93a386Sopenharmony_ci                case Op::select:
2765cb93a386Sopenharmony_ci                    vals[i] = b->CreateSelect(b->CreateTrunc(vals[x], I1), vals[y], vals[z]);
2766cb93a386Sopenharmony_ci                    break;
2767cb93a386Sopenharmony_ci
2768cb93a386Sopenharmony_ci                case Op::add_i32: vals[i] = b->CreateAdd(vals[x], vals[y]); break;
2769cb93a386Sopenharmony_ci                case Op::sub_i32: vals[i] = b->CreateSub(vals[x], vals[y]); break;
2770cb93a386Sopenharmony_ci                case Op::mul_i32: vals[i] = b->CreateMul(vals[x], vals[y]); break;
2771cb93a386Sopenharmony_ci
2772cb93a386Sopenharmony_ci                case Op::shl_i32: vals[i] = b->CreateShl (vals[x], immA); break;
2773cb93a386Sopenharmony_ci                case Op::sra_i32: vals[i] = b->CreateAShr(vals[x], immA); break;
2774cb93a386Sopenharmony_ci                case Op::shr_i32: vals[i] = b->CreateLShr(vals[x], immA); break;
2775cb93a386Sopenharmony_ci
2776cb93a386Sopenharmony_ci                case Op:: eq_i32: vals[i] = S(I32, b->CreateICmpEQ (vals[x], vals[y])); break;
2777cb93a386Sopenharmony_ci                case Op:: gt_i32: vals[i] = S(I32, b->CreateICmpSGT(vals[x], vals[y])); break;
2778cb93a386Sopenharmony_ci
2779cb93a386Sopenharmony_ci                case Op::add_f32: vals[i] = I(b->CreateFAdd(F(vals[x]), F(vals[y]))); break;
2780cb93a386Sopenharmony_ci                case Op::sub_f32: vals[i] = I(b->CreateFSub(F(vals[x]), F(vals[y]))); break;
2781cb93a386Sopenharmony_ci                case Op::mul_f32: vals[i] = I(b->CreateFMul(F(vals[x]), F(vals[y]))); break;
2782cb93a386Sopenharmony_ci                case Op::div_f32: vals[i] = I(b->CreateFDiv(F(vals[x]), F(vals[y]))); break;
2783cb93a386Sopenharmony_ci
2784cb93a386Sopenharmony_ci                case Op:: eq_f32: vals[i] = S(I32, b->CreateFCmpOEQ(F(vals[x]), F(vals[y]))); break;
2785cb93a386Sopenharmony_ci                case Op::neq_f32: vals[i] = S(I32, b->CreateFCmpUNE(F(vals[x]), F(vals[y]))); break;
2786cb93a386Sopenharmony_ci                case Op:: gt_f32: vals[i] = S(I32, b->CreateFCmpOGT(F(vals[x]), F(vals[y]))); break;
2787cb93a386Sopenharmony_ci                case Op::gte_f32: vals[i] = S(I32, b->CreateFCmpOGE(F(vals[x]), F(vals[y]))); break;
2788cb93a386Sopenharmony_ci
2789cb93a386Sopenharmony_ci                case Op::fma_f32:
2790cb93a386Sopenharmony_ci                    vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
2791cb93a386Sopenharmony_ci                                                   {F(vals[x]), F(vals[y]), F(vals[z])}));
2792cb93a386Sopenharmony_ci                    break;
2793cb93a386Sopenharmony_ci
2794cb93a386Sopenharmony_ci                case Op::fms_f32:
2795cb93a386Sopenharmony_ci                    vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
2796cb93a386Sopenharmony_ci                                                   {F(vals[x]), F(vals[y]),
2797cb93a386Sopenharmony_ci                                                    b->CreateFNeg(F(vals[z]))}));
2798cb93a386Sopenharmony_ci                    break;
2799cb93a386Sopenharmony_ci
2800cb93a386Sopenharmony_ci                case Op::fnma_f32:
2801cb93a386Sopenharmony_ci                    vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
2802cb93a386Sopenharmony_ci                                                   {b->CreateFNeg(F(vals[x])), F(vals[y]),
2803cb93a386Sopenharmony_ci                                                    F(vals[z])}));
2804cb93a386Sopenharmony_ci                    break;
2805cb93a386Sopenharmony_ci
2806cb93a386Sopenharmony_ci                case Op::ceil:
2807cb93a386Sopenharmony_ci                    vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::ceil, F(vals[x])));
2808cb93a386Sopenharmony_ci                    break;
2809cb93a386Sopenharmony_ci                case Op::floor:
2810cb93a386Sopenharmony_ci                    vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::floor, F(vals[x])));
2811cb93a386Sopenharmony_ci                    break;
2812cb93a386Sopenharmony_ci
2813cb93a386Sopenharmony_ci                case Op::max_f32:
2814cb93a386Sopenharmony_ci                    vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[x]), F(vals[y])),
2815cb93a386Sopenharmony_ci                                                F(vals[y]), F(vals[x])));
2816cb93a386Sopenharmony_ci                    break;
2817cb93a386Sopenharmony_ci                case Op::min_f32:
2818cb93a386Sopenharmony_ci                    vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[y]), F(vals[x])),
2819cb93a386Sopenharmony_ci                                                F(vals[y]), F(vals[x])));
2820cb93a386Sopenharmony_ci                    break;
2821cb93a386Sopenharmony_ci
2822cb93a386Sopenharmony_ci                case Op::sqrt_f32:
2823cb93a386Sopenharmony_ci                    vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, F(vals[x])));
2824cb93a386Sopenharmony_ci                    break;
2825cb93a386Sopenharmony_ci
2826cb93a386Sopenharmony_ci                case Op::to_f32: vals[i] = I(b->CreateSIToFP(  vals[x] , F32)); break;
2827cb93a386Sopenharmony_ci                case Op::trunc : vals[i] =   b->CreateFPToSI(F(vals[x]), I32) ; break;
2828cb93a386Sopenharmony_ci                case Op::round : {
2829cb93a386Sopenharmony_ci                    // Basic impl when we can't use cvtps2dq and co.
2830cb93a386Sopenharmony_ci                    auto round = b->CreateUnaryIntrinsic(llvm::Intrinsic::rint, F(vals[x]));
2831cb93a386Sopenharmony_ci                    vals[i] = b->CreateFPToSI(round, I32);
2832cb93a386Sopenharmony_ci
2833cb93a386Sopenharmony_ci                #if 1 && defined(SK_CPU_X86)
2834cb93a386Sopenharmony_ci                    // Using b->CreateIntrinsic(..., {}, {...}) to avoid name mangling.
2835cb93a386Sopenharmony_ci                    if (scalar) {
2836cb93a386Sopenharmony_ci                        // cvtss2si is float x4 -> int, ignoring input lanes 1,2,3.  ¯\_(ツ)_/¯
2837cb93a386Sopenharmony_ci                        llvm::Value* v = llvm::UndefValue::get(
2838cb93a386Sopenharmony_ci                                llvm::VectorType::get(f32, 4, false));
2839cb93a386Sopenharmony_ci                        v = b->CreateInsertElement(v, F(vals[x]), (uint64_t)0);
2840cb93a386Sopenharmony_ci                        vals[i] = b->CreateIntrinsic(llvm::Intrinsic::x86_sse_cvtss2si, {}, {v});
2841cb93a386Sopenharmony_ci                    } else {
2842cb93a386Sopenharmony_ci                        SkASSERT(K == 4  || K == 8);
2843cb93a386Sopenharmony_ci                        auto intr = K == 4 ?   llvm::Intrinsic::x86_sse2_cvtps2dq :
2844cb93a386Sopenharmony_ci                                 /* K == 8 ?*/ llvm::Intrinsic::x86_avx_cvt_ps2dq_256;
2845cb93a386Sopenharmony_ci                        vals[i] = b->CreateIntrinsic(intr, {}, {F(vals[x])});
2846cb93a386Sopenharmony_ci                    }
2847cb93a386Sopenharmony_ci                #endif
2848cb93a386Sopenharmony_ci                } break;
2849cb93a386Sopenharmony_ci
2850cb93a386Sopenharmony_ci            }
2851cb93a386Sopenharmony_ci            return true;
2852cb93a386Sopenharmony_ci        };
2853cb93a386Sopenharmony_ci
2854cb93a386Sopenharmony_ci        {
2855cb93a386Sopenharmony_ci            IRBuilder b(enter);
2856cb93a386Sopenharmony_ci            b.CreateBr(hoistK);
2857cb93a386Sopenharmony_ci        }
2858cb93a386Sopenharmony_ci
2859cb93a386Sopenharmony_ci        // hoistK: emit each hoistable vector instruction; goto testK;
2860cb93a386Sopenharmony_ci        // LLVM can do this sort of thing itself, but we've got the information cheap,
2861cb93a386Sopenharmony_ci        // and pointer aliasing makes it easier to manually hoist than teach LLVM it's safe.
2862cb93a386Sopenharmony_ci        {
2863cb93a386Sopenharmony_ci            IRBuilder b(hoistK);
2864cb93a386Sopenharmony_ci
2865cb93a386Sopenharmony_ci            // Hoisted instructions will need args (think, uniforms), so set that up now.
2866cb93a386Sopenharmony_ci            // These phi nodes are degenerate... they'll always be the passed-in args from enter.
2867cb93a386Sopenharmony_ci            // Later on when we start looping the phi nodes will start looking useful.
2868cb93a386Sopenharmony_ci            llvm::Argument* arg = fn->arg_begin();
2869cb93a386Sopenharmony_ci            (void)arg++;  // Leave n as nullptr... it'd be a bug to use n in a hoisted instruction.
2870cb93a386Sopenharmony_ci            for (size_t i = 0; i < fImpl->strides.size(); i++) {
2871cb93a386Sopenharmony_ci                args.push_back(b.CreatePHI(arg->getType(), 1));
2872cb93a386Sopenharmony_ci                args.back()->addIncoming(arg++, enter);
2873cb93a386Sopenharmony_ci            }
2874cb93a386Sopenharmony_ci
2875cb93a386Sopenharmony_ci            for (size_t i = 0; i < instructions.size(); i++) {
2876cb93a386Sopenharmony_ci                if (instructions[i].can_hoist && !emit(i, false, &b)) {
2877cb93a386Sopenharmony_ci                    return;
2878cb93a386Sopenharmony_ci                }
2879cb93a386Sopenharmony_ci            }
2880cb93a386Sopenharmony_ci
2881cb93a386Sopenharmony_ci            b.CreateBr(testK);
2882cb93a386Sopenharmony_ci        }
2883cb93a386Sopenharmony_ci
2884cb93a386Sopenharmony_ci        // testK:  if (N >= K) goto loopK; else goto hoist1;
2885cb93a386Sopenharmony_ci        {
2886cb93a386Sopenharmony_ci            IRBuilder b(testK);
2887cb93a386Sopenharmony_ci
2888cb93a386Sopenharmony_ci            // New phi nodes for `n` and each pointer argument from hoistK; later we'll add loopK.
2889cb93a386Sopenharmony_ci            // These also start as the initial function arguments; hoistK can't have changed them.
2890cb93a386Sopenharmony_ci            llvm::Argument* arg = fn->arg_begin();
2891cb93a386Sopenharmony_ci
2892cb93a386Sopenharmony_ci            n = b.CreatePHI(arg->getType(), 2);
2893cb93a386Sopenharmony_ci            n->addIncoming(arg++, hoistK);
2894cb93a386Sopenharmony_ci
2895cb93a386Sopenharmony_ci            for (size_t i = 0; i < fImpl->strides.size(); i++) {
2896cb93a386Sopenharmony_ci                args[i] = b.CreatePHI(arg->getType(), 2);
2897cb93a386Sopenharmony_ci                args[i]->addIncoming(arg++, hoistK);
2898cb93a386Sopenharmony_ci            }
2899cb93a386Sopenharmony_ci
2900cb93a386Sopenharmony_ci            b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(K)), loopK, hoist1);
2901cb93a386Sopenharmony_ci        }
2902cb93a386Sopenharmony_ci
2903cb93a386Sopenharmony_ci        // loopK:  ... insts on K x T vectors; N -= K, args += K*stride; goto testK;
2904cb93a386Sopenharmony_ci        {
2905cb93a386Sopenharmony_ci            IRBuilder b(loopK);
2906cb93a386Sopenharmony_ci            for (size_t i = 0; i < instructions.size(); i++) {
2907cb93a386Sopenharmony_ci                if (!instructions[i].can_hoist && !emit(i, false, &b)) {
2908cb93a386Sopenharmony_ci                    return;
2909cb93a386Sopenharmony_ci                }
2910cb93a386Sopenharmony_ci            }
2911cb93a386Sopenharmony_ci
2912cb93a386Sopenharmony_ci            // n -= K
2913cb93a386Sopenharmony_ci            llvm::Value* n_next = b.CreateSub(n, b.getInt32(K));
2914cb93a386Sopenharmony_ci            n->addIncoming(n_next, loopK);
2915cb93a386Sopenharmony_ci
2916cb93a386Sopenharmony_ci            // Each arg ptr += K
2917cb93a386Sopenharmony_ci            for (size_t i = 0; i < fImpl->strides.size(); i++) {
2918cb93a386Sopenharmony_ci                llvm::Value* arg_next
2919cb93a386Sopenharmony_ci                    = b.CreateConstInBoundsGEP1_32(
2920cb93a386Sopenharmony_ci                            llvm::Type::getInt8Ty (*ctx),
2921cb93a386Sopenharmony_ci                            args[i],
2922cb93a386Sopenharmony_ci                            K*fImpl->strides[i]);
2923cb93a386Sopenharmony_ci                args[i]->addIncoming(arg_next, loopK);
2924cb93a386Sopenharmony_ci            }
2925cb93a386Sopenharmony_ci            b.CreateBr(testK);
2926cb93a386Sopenharmony_ci        }
2927cb93a386Sopenharmony_ci
2928cb93a386Sopenharmony_ci        // hoist1: emit each hoistable scalar instruction; goto test1;
2929cb93a386Sopenharmony_ci        {
2930cb93a386Sopenharmony_ci            IRBuilder b(hoist1);
2931cb93a386Sopenharmony_ci            for (size_t i = 0; i < instructions.size(); i++) {
2932cb93a386Sopenharmony_ci                if (instructions[i].can_hoist && !emit(i, true, &b)) {
2933cb93a386Sopenharmony_ci                    return;
2934cb93a386Sopenharmony_ci                }
2935cb93a386Sopenharmony_ci            }
2936cb93a386Sopenharmony_ci            b.CreateBr(test1);
2937cb93a386Sopenharmony_ci        }
2938cb93a386Sopenharmony_ci
2939cb93a386Sopenharmony_ci        // test1:  if (N >= 1) goto loop1; else goto leave;
2940cb93a386Sopenharmony_ci        {
2941cb93a386Sopenharmony_ci            IRBuilder b(test1);
2942cb93a386Sopenharmony_ci
2943cb93a386Sopenharmony_ci            // Set up new phi nodes for `n` and each pointer argument, now from hoist1 and loop1.
2944cb93a386Sopenharmony_ci            llvm::PHINode* n_new = b.CreatePHI(n->getType(), 2);
2945cb93a386Sopenharmony_ci            n_new->addIncoming(n, hoist1);
2946cb93a386Sopenharmony_ci            n = n_new;
2947cb93a386Sopenharmony_ci
2948cb93a386Sopenharmony_ci            for (size_t i = 0; i < fImpl->strides.size(); i++) {
2949cb93a386Sopenharmony_ci                llvm::PHINode* arg_new = b.CreatePHI(args[i]->getType(), 2);
2950cb93a386Sopenharmony_ci                arg_new->addIncoming(args[i], hoist1);
2951cb93a386Sopenharmony_ci                args[i] = arg_new;
2952cb93a386Sopenharmony_ci            }
2953cb93a386Sopenharmony_ci
2954cb93a386Sopenharmony_ci            b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(1)), loop1, leave);
2955cb93a386Sopenharmony_ci        }
2956cb93a386Sopenharmony_ci
2957cb93a386Sopenharmony_ci        // loop1:  ... insts on scalars; N -= 1, args += stride; goto test1;
2958cb93a386Sopenharmony_ci        {
2959cb93a386Sopenharmony_ci            IRBuilder b(loop1);
2960cb93a386Sopenharmony_ci            for (size_t i = 0; i < instructions.size(); i++) {
2961cb93a386Sopenharmony_ci                if (!instructions[i].can_hoist && !emit(i, true, &b)) {
2962cb93a386Sopenharmony_ci                    return;
2963cb93a386Sopenharmony_ci                }
2964cb93a386Sopenharmony_ci            }
2965cb93a386Sopenharmony_ci
2966cb93a386Sopenharmony_ci            // n -= 1
2967cb93a386Sopenharmony_ci            llvm::Value* n_next = b.CreateSub(n, b.getInt32(1));
2968cb93a386Sopenharmony_ci            n->addIncoming(n_next, loop1);
2969cb93a386Sopenharmony_ci
2970cb93a386Sopenharmony_ci            // Each arg ptr += 1
2971cb93a386Sopenharmony_ci            for (size_t i = 0; i < fImpl->strides.size(); i++) {
2972cb93a386Sopenharmony_ci                llvm::Value* arg_next
2973cb93a386Sopenharmony_ci                    = b.CreateConstInBoundsGEP1_32(
2974cb93a386Sopenharmony_ci                            llvm::Type::getInt8Ty (*ctx), args[i], fImpl->strides[i]);
2975cb93a386Sopenharmony_ci                args[i]->addIncoming(arg_next, loop1);
2976cb93a386Sopenharmony_ci            }
2977cb93a386Sopenharmony_ci            b.CreateBr(test1);
2978cb93a386Sopenharmony_ci        }
2979cb93a386Sopenharmony_ci
2980cb93a386Sopenharmony_ci        // leave:  ret
2981cb93a386Sopenharmony_ci        {
2982cb93a386Sopenharmony_ci            IRBuilder b(leave);
2983cb93a386Sopenharmony_ci            b.CreateRetVoid();
2984cb93a386Sopenharmony_ci        }
2985cb93a386Sopenharmony_ci
2986cb93a386Sopenharmony_ci        SkASSERT(false == llvm::verifyModule(*mod, &llvm::outs()));
2987cb93a386Sopenharmony_ci
2988cb93a386Sopenharmony_ci        if (true) {
2989cb93a386Sopenharmony_ci            SkString path = SkStringPrintf("/tmp/%s.bc", debug_name);
2990cb93a386Sopenharmony_ci            std::error_code err;
2991cb93a386Sopenharmony_ci            llvm::raw_fd_ostream os(path.c_str(), err);
2992cb93a386Sopenharmony_ci            if (err) {
2993cb93a386Sopenharmony_ci                return;
2994cb93a386Sopenharmony_ci            }
2995cb93a386Sopenharmony_ci            llvm::WriteBitcodeToFile(*mod, os);
2996cb93a386Sopenharmony_ci        }
2997cb93a386Sopenharmony_ci
2998cb93a386Sopenharmony_ci        static SkOnce once;
2999cb93a386Sopenharmony_ci        once([]{
3000cb93a386Sopenharmony_ci            SkAssertResult(false == llvm::InitializeNativeTarget());
3001cb93a386Sopenharmony_ci            SkAssertResult(false == llvm::InitializeNativeTargetAsmPrinter());
3002cb93a386Sopenharmony_ci        });
3003cb93a386Sopenharmony_ci
3004cb93a386Sopenharmony_ci        if (llvm::ExecutionEngine* ee = llvm::EngineBuilder(std::move(mod))
3005cb93a386Sopenharmony_ci                                            .setEngineKind(llvm::EngineKind::JIT)
3006cb93a386Sopenharmony_ci                                            .setMCPU(llvm::sys::getHostCPUName())
3007cb93a386Sopenharmony_ci                                            .create()) {
3008cb93a386Sopenharmony_ci            fImpl->llvm_ctx = std::move(ctx);
3009cb93a386Sopenharmony_ci            fImpl->llvm_ee.reset(ee);
3010cb93a386Sopenharmony_ci
3011cb93a386Sopenharmony_ci            #if defined(SKVM_LLVM_WAIT_FOR_COMPILATION)
3012cb93a386Sopenharmony_ci            // Wait for llvm to compile
3013cb93a386Sopenharmony_ci            void* function = (void*)ee->getFunctionAddress(debug_name);
3014cb93a386Sopenharmony_ci            fImpl->jit_entry.store(function);
3015cb93a386Sopenharmony_ci            // We have to be careful here about what we close over and how, in case fImpl moves.
3016cb93a386Sopenharmony_ci            // fImpl itself may change, but its pointee fields won't, so close over them by value.
3017cb93a386Sopenharmony_ci            // Also, debug_name will almost certainly leave scope, so copy it.
3018cb93a386Sopenharmony_ci            #else
3019cb93a386Sopenharmony_ci            fImpl->llvm_compiling = std::async(std::launch::async, [dst  = &fImpl->jit_entry,
3020cb93a386Sopenharmony_ci                                                                    ee   =  fImpl->llvm_ee.get(),
3021cb93a386Sopenharmony_ci                                                                    name = std::string(debug_name)]{
3022cb93a386Sopenharmony_ci                // std::atomic<void*>*    dst;
3023cb93a386Sopenharmony_ci                // llvm::ExecutionEngine* ee;
3024cb93a386Sopenharmony_ci                // std::string            name;
3025cb93a386Sopenharmony_ci                dst->store( (void*)ee->getFunctionAddress(name.c_str()) );
3026cb93a386Sopenharmony_ci            });
3027cb93a386Sopenharmony_ci            #endif
3028cb93a386Sopenharmony_ci        }
3029cb93a386Sopenharmony_ci    }
3030cb93a386Sopenharmony_ci    #endif  // SKVM_LLVM
3031cb93a386Sopenharmony_ci
3032cb93a386Sopenharmony_ci    void Program::waitForLLVM() const {
3033cb93a386Sopenharmony_ci    #if defined(SKVM_LLVM) && !defined(SKVM_LLVM_WAIT_FOR_COMPILATION)
3034cb93a386Sopenharmony_ci        if (fImpl->llvm_compiling.valid()) {
3035cb93a386Sopenharmony_ci            fImpl->llvm_compiling.wait();
3036cb93a386Sopenharmony_ci        }
3037cb93a386Sopenharmony_ci    #endif
3038cb93a386Sopenharmony_ci    }
3039cb93a386Sopenharmony_ci
3040cb93a386Sopenharmony_ci    bool Program::hasJIT() const {
3041cb93a386Sopenharmony_ci        // Program::hasJIT() is really just a debugging / test aid,
3042cb93a386Sopenharmony_ci        // so we don't mind adding a sync point here to wait for compilation.
3043cb93a386Sopenharmony_ci        this->waitForLLVM();
3044cb93a386Sopenharmony_ci
3045cb93a386Sopenharmony_ci        return fImpl->jit_entry.load() != nullptr;
3046cb93a386Sopenharmony_ci    }
3047cb93a386Sopenharmony_ci
3048cb93a386Sopenharmony_ci    void Program::dropJIT() {
3049cb93a386Sopenharmony_ci    #if defined(SKVM_LLVM)
3050cb93a386Sopenharmony_ci        this->waitForLLVM();
3051cb93a386Sopenharmony_ci        fImpl->llvm_ee .reset(nullptr);
3052cb93a386Sopenharmony_ci        fImpl->llvm_ctx.reset(nullptr);
3053cb93a386Sopenharmony_ci    #elif defined(SKVM_JIT)
3054cb93a386Sopenharmony_ci        if (fImpl->dylib) {
3055cb93a386Sopenharmony_ci            close_dylib(fImpl->dylib);
3056cb93a386Sopenharmony_ci        } else if (auto jit_entry = fImpl->jit_entry.load()) {
3057cb93a386Sopenharmony_ci            unmap_jit_buffer(jit_entry, fImpl->jit_size);
3058cb93a386Sopenharmony_ci        }
3059cb93a386Sopenharmony_ci    #else
3060cb93a386Sopenharmony_ci        SkASSERT(!this->hasJIT());
3061cb93a386Sopenharmony_ci    #endif
3062cb93a386Sopenharmony_ci
3063cb93a386Sopenharmony_ci        fImpl->jit_entry.store(nullptr);
3064cb93a386Sopenharmony_ci        fImpl->jit_size  = 0;
3065cb93a386Sopenharmony_ci        fImpl->dylib     = nullptr;
3066cb93a386Sopenharmony_ci    }
3067cb93a386Sopenharmony_ci
3068cb93a386Sopenharmony_ci    Program::Program() : fImpl(std::make_unique<Impl>()) {}
3069cb93a386Sopenharmony_ci
3070cb93a386Sopenharmony_ci    Program::~Program() {
3071cb93a386Sopenharmony_ci        // Moved-from Programs may have fImpl == nullptr.
3072cb93a386Sopenharmony_ci        if (fImpl) {
3073cb93a386Sopenharmony_ci            this->dropJIT();
3074cb93a386Sopenharmony_ci        }
3075cb93a386Sopenharmony_ci    }
3076cb93a386Sopenharmony_ci
3077cb93a386Sopenharmony_ci    Program::Program(Program&& other) : fImpl(std::move(other.fImpl)) {}
3078cb93a386Sopenharmony_ci
3079cb93a386Sopenharmony_ci    Program& Program::operator=(Program&& other) {
3080cb93a386Sopenharmony_ci        fImpl = std::move(other.fImpl);
3081cb93a386Sopenharmony_ci        return *this;
3082cb93a386Sopenharmony_ci    }
3083cb93a386Sopenharmony_ci
3084cb93a386Sopenharmony_ci    Program::Program(const std::vector<OptimizedInstruction>& instructions,
3085cb93a386Sopenharmony_ci                     const std::vector<int>& strides,
3086cb93a386Sopenharmony_ci                     const char* debug_name, bool allow_jit) : Program() {
3087cb93a386Sopenharmony_ci        fImpl->strides = strides;
3088cb93a386Sopenharmony_ci        if (gSkVMAllowJIT && allow_jit) {
3089cb93a386Sopenharmony_ci        #if 1 && defined(SKVM_LLVM)
3090cb93a386Sopenharmony_ci            this->setupLLVM(instructions, debug_name);
3091cb93a386Sopenharmony_ci        #elif 1 && defined(SKVM_JIT)
3092cb93a386Sopenharmony_ci            this->setupJIT(instructions, debug_name);
3093cb93a386Sopenharmony_ci        #endif
3094cb93a386Sopenharmony_ci        }
3095cb93a386Sopenharmony_ci
3096cb93a386Sopenharmony_ci        // Might as well do this after setupLLVM() to get a little more time to compile.
3097cb93a386Sopenharmony_ci        this->setupInterpreter(instructions);
3098cb93a386Sopenharmony_ci    }
3099cb93a386Sopenharmony_ci
3100cb93a386Sopenharmony_ci    std::vector<InterpreterInstruction> Program::instructions() const { return fImpl->instructions; }
3101cb93a386Sopenharmony_ci    int  Program::nargs() const { return (int)fImpl->strides.size(); }
3102cb93a386Sopenharmony_ci    int  Program::nregs() const { return fImpl->regs; }
3103cb93a386Sopenharmony_ci    int  Program::loop () const { return fImpl->loop; }
3104cb93a386Sopenharmony_ci    bool Program::empty() const { return fImpl->instructions.empty(); }
3105cb93a386Sopenharmony_ci
3106cb93a386Sopenharmony_ci    // Translate OptimizedInstructions to InterpreterInstructions.
3107cb93a386Sopenharmony_ci    void Program::setupInterpreter(const std::vector<OptimizedInstruction>& instructions) {
3108cb93a386Sopenharmony_ci        // Register each instruction is assigned to.
3109cb93a386Sopenharmony_ci        std::vector<Reg> reg(instructions.size());
3110cb93a386Sopenharmony_ci
3111cb93a386Sopenharmony_ci        // This next bit is a bit more complicated than strictly necessary;
3112cb93a386Sopenharmony_ci        // we could just assign every instruction to its own register.
3113cb93a386Sopenharmony_ci        //
3114cb93a386Sopenharmony_ci        // But recycling registers is fairly cheap, and good practice for the
3115cb93a386Sopenharmony_ci        // JITs where minimizing register pressure really is important.
3116cb93a386Sopenharmony_ci        //
3117cb93a386Sopenharmony_ci        // We have effectively infinite registers, so we hoist any value we can.
3118cb93a386Sopenharmony_ci        // (The JIT may choose a more complex policy to reduce register pressure.)
3119cb93a386Sopenharmony_ci
3120cb93a386Sopenharmony_ci        fImpl->regs = 0;
3121cb93a386Sopenharmony_ci        std::vector<Reg> avail;
3122cb93a386Sopenharmony_ci
3123cb93a386Sopenharmony_ci        // Assign this value to a register, recycling them where we can.
3124cb93a386Sopenharmony_ci        auto assign_register = [&](Val id) {
3125cb93a386Sopenharmony_ci            const OptimizedInstruction& inst = instructions[id];
3126cb93a386Sopenharmony_ci
3127cb93a386Sopenharmony_ci            // If this is a real input and it's lifetime ends at this instruction,
3128cb93a386Sopenharmony_ci            // we can recycle the register it's occupying.
3129cb93a386Sopenharmony_ci            auto maybe_recycle_register = [&](Val input) {
3130cb93a386Sopenharmony_ci                if (input != NA && instructions[input].death == id) {
3131cb93a386Sopenharmony_ci                    avail.push_back(reg[input]);
3132cb93a386Sopenharmony_ci                }
3133cb93a386Sopenharmony_ci            };
3134cb93a386Sopenharmony_ci
3135cb93a386Sopenharmony_ci            // Take care to not recycle the same register twice.
3136cb93a386Sopenharmony_ci            const Val x = inst.x, y = inst.y, z = inst.z, w = inst.w;
3137cb93a386Sopenharmony_ci            if (true                      ) { maybe_recycle_register(x); }
3138cb93a386Sopenharmony_ci            if (y != x                    ) { maybe_recycle_register(y); }
3139cb93a386Sopenharmony_ci            if (z != x && z != y          ) { maybe_recycle_register(z); }
3140cb93a386Sopenharmony_ci            if (w != x && w != y && w != z) { maybe_recycle_register(w); }
3141cb93a386Sopenharmony_ci
3142cb93a386Sopenharmony_ci            // Instructions that die at themselves (stores) don't need a register.
3143cb93a386Sopenharmony_ci            if (inst.death != id) {
3144cb93a386Sopenharmony_ci                // Allocate a register if we have to, preferring to reuse anything available.
3145cb93a386Sopenharmony_ci                if (avail.empty()) {
3146cb93a386Sopenharmony_ci                    reg[id] = fImpl->regs++;
3147cb93a386Sopenharmony_ci                } else {
3148cb93a386Sopenharmony_ci                    reg[id] = avail.back();
3149cb93a386Sopenharmony_ci                    avail.pop_back();
3150cb93a386Sopenharmony_ci                }
3151cb93a386Sopenharmony_ci            }
3152cb93a386Sopenharmony_ci        };
3153cb93a386Sopenharmony_ci
3154cb93a386Sopenharmony_ci        // Assign a register to each hoisted instruction, then each non-hoisted loop instruction.
3155cb93a386Sopenharmony_ci        for (Val id = 0; id < (Val)instructions.size(); id++) {
3156cb93a386Sopenharmony_ci            if ( instructions[id].can_hoist) { assign_register(id); }
3157cb93a386Sopenharmony_ci        }
3158cb93a386Sopenharmony_ci        for (Val id = 0; id < (Val)instructions.size(); id++) {
3159cb93a386Sopenharmony_ci            if (!instructions[id].can_hoist) { assign_register(id); }
3160cb93a386Sopenharmony_ci        }
3161cb93a386Sopenharmony_ci
3162cb93a386Sopenharmony_ci        // Translate OptimizedInstructions to InterpreterIstructions by mapping values to
3163cb93a386Sopenharmony_ci        // registers.  This will be two passes, first hoisted instructions, then inside the loop.
3164cb93a386Sopenharmony_ci
3165cb93a386Sopenharmony_ci        // The loop begins at the fImpl->loop'th Instruction.
3166cb93a386Sopenharmony_ci        fImpl->loop = 0;
3167cb93a386Sopenharmony_ci        fImpl->instructions.reserve(instructions.size());
3168cb93a386Sopenharmony_ci
3169cb93a386Sopenharmony_ci        // Add a mapping for the N/A sentinel Val to any arbitrary register
3170cb93a386Sopenharmony_ci        // so lookups don't have to know which arguments are used by which Ops.
3171cb93a386Sopenharmony_ci        auto lookup_register = [&](Val id) {
3172cb93a386Sopenharmony_ci            return id == NA ? (Reg)0
3173cb93a386Sopenharmony_ci                            : reg[id];
3174cb93a386Sopenharmony_ci        };
3175cb93a386Sopenharmony_ci
3176cb93a386Sopenharmony_ci        auto push_instruction = [&](Val id, const OptimizedInstruction& inst) {
3177cb93a386Sopenharmony_ci            InterpreterInstruction pinst{
3178cb93a386Sopenharmony_ci                inst.op,
3179cb93a386Sopenharmony_ci                lookup_register(id),
3180cb93a386Sopenharmony_ci                lookup_register(inst.x),
3181cb93a386Sopenharmony_ci                lookup_register(inst.y),
3182cb93a386Sopenharmony_ci                lookup_register(inst.z),
3183cb93a386Sopenharmony_ci                lookup_register(inst.w),
3184cb93a386Sopenharmony_ci                inst.immA,
3185cb93a386Sopenharmony_ci                inst.immB,
3186cb93a386Sopenharmony_ci                inst.immC,
3187cb93a386Sopenharmony_ci            };
3188cb93a386Sopenharmony_ci            fImpl->instructions.push_back(pinst);
3189cb93a386Sopenharmony_ci        };
3190cb93a386Sopenharmony_ci
3191cb93a386Sopenharmony_ci        for (Val id = 0; id < (Val)instructions.size(); id++) {
3192cb93a386Sopenharmony_ci            const OptimizedInstruction& inst = instructions[id];
3193cb93a386Sopenharmony_ci            if (inst.can_hoist) {
3194cb93a386Sopenharmony_ci                push_instruction(id, inst);
3195cb93a386Sopenharmony_ci                fImpl->loop++;
3196cb93a386Sopenharmony_ci            }
3197cb93a386Sopenharmony_ci        }
3198cb93a386Sopenharmony_ci        for (Val id = 0; id < (Val)instructions.size(); id++) {
3199cb93a386Sopenharmony_ci            const OptimizedInstruction& inst = instructions[id];
3200cb93a386Sopenharmony_ci            if (!inst.can_hoist) {
3201cb93a386Sopenharmony_ci                push_instruction(id, inst);
3202cb93a386Sopenharmony_ci            }
3203cb93a386Sopenharmony_ci        }
3204cb93a386Sopenharmony_ci    }
3205cb93a386Sopenharmony_ci
3206cb93a386Sopenharmony_ci#if defined(SKVM_JIT)
3207cb93a386Sopenharmony_ci
3208cb93a386Sopenharmony_ci    namespace SkVMJitTypes {
3209cb93a386Sopenharmony_ci    #if defined(__x86_64__) || defined(_M_X64)
3210cb93a386Sopenharmony_ci        using Reg = Assembler::Ymm;
3211cb93a386Sopenharmony_ci    #elif defined(__aarch64__)
3212cb93a386Sopenharmony_ci        using Reg = Assembler::V;
3213cb93a386Sopenharmony_ci    #endif
3214cb93a386Sopenharmony_ci    }  // namespace SkVMJitTypes
3215cb93a386Sopenharmony_ci
3216cb93a386Sopenharmony_ci    bool Program::jit(const std::vector<OptimizedInstruction>& instructions,
3217cb93a386Sopenharmony_ci                      int* stack_hint,
3218cb93a386Sopenharmony_ci                      uint32_t* registers_used,
3219cb93a386Sopenharmony_ci                      Assembler* a) const {
3220cb93a386Sopenharmony_ci        using A = Assembler;
3221cb93a386Sopenharmony_ci        using SkVMJitTypes::Reg;
3222cb93a386Sopenharmony_ci
3223cb93a386Sopenharmony_ci        SkTHashMap<int, A::Label> constants;    // Constants (mostly splats) share the same pool.
3224cb93a386Sopenharmony_ci        A::Label                  iota;         // Varies per lane, for Op::index.
3225cb93a386Sopenharmony_ci        A::Label                  load64_index; // Used to load low or high half of 64-bit lanes.
3226cb93a386Sopenharmony_ci
3227cb93a386Sopenharmony_ci        // The `regs` array tracks everything we know about each register's state:
3228cb93a386Sopenharmony_ci        //   - NA:   empty
3229cb93a386Sopenharmony_ci        //   - RES:  reserved by ABI
3230cb93a386Sopenharmony_ci        //   - TMP:  holding a temporary
3231cb93a386Sopenharmony_ci        //   - id:   holding Val id
3232cb93a386Sopenharmony_ci        constexpr Val RES = NA-1,
3233cb93a386Sopenharmony_ci                      TMP = RES-1;
3234cb93a386Sopenharmony_ci
3235cb93a386Sopenharmony_ci        // Map val -> stack slot.
3236cb93a386Sopenharmony_ci        std::vector<int> stack_slot(instructions.size(), NA);
3237cb93a386Sopenharmony_ci        int next_stack_slot = 0;
3238cb93a386Sopenharmony_ci
3239cb93a386Sopenharmony_ci        const int nstack_slots = *stack_hint >= 0 ? *stack_hint
3240cb93a386Sopenharmony_ci                                                  : stack_slot.size();
3241cb93a386Sopenharmony_ci    #if defined(__x86_64__) || defined(_M_X64)
3242cb93a386Sopenharmony_ci        if (!SkCpu::Supports(SkCpu::HSW)) {
3243cb93a386Sopenharmony_ci            return false;
3244cb93a386Sopenharmony_ci        }
3245cb93a386Sopenharmony_ci        const int K = 8;
3246cb93a386Sopenharmony_ci        #if defined(_M_X64)  // Important to check this first; clang-cl defines both.
3247cb93a386Sopenharmony_ci            const A::GP64 N = A::rcx,
3248cb93a386Sopenharmony_ci                        GP0 = A::rax,
3249cb93a386Sopenharmony_ci                        GP1 = A::r11,
3250cb93a386Sopenharmony_ci                        arg[]    = { A::rdx, A::r8, A::r9, A::r10, A::rdi, A::rsi };
3251cb93a386Sopenharmony_ci
            // xmm6-15 are callee-saved.
3253cb93a386Sopenharmony_ci            std::array<Val,16> regs = {
3254cb93a386Sopenharmony_ci                 NA, NA, NA, NA,  NA, NA,RES,RES,
3255cb93a386Sopenharmony_ci                RES,RES,RES,RES, RES,RES,RES,RES,
3256cb93a386Sopenharmony_ci            };
3257cb93a386Sopenharmony_ci            const uint32_t incoming_registers_used = *registers_used;
3258cb93a386Sopenharmony_ci
3259cb93a386Sopenharmony_ci            auto enter = [&]{  // Emit the prologue: load stack-passed args, reserve stack, save xmm6-15.
3260cb93a386Sopenharmony_ci                // rcx,rdx,r8,r9 are all already holding their correct values.
3261cb93a386Sopenharmony_ci                // Load caller-saved r10 from rsp+40 if there's a fourth arg.
3262cb93a386Sopenharmony_ci                if (fImpl->strides.size() >= 4) {
3263cb93a386Sopenharmony_ci                    a->mov(A::r10, A::Mem{A::rsp, 40});
3264cb93a386Sopenharmony_ci                }
3265cb93a386Sopenharmony_ci                // Load callee-saved rdi from rsp+48 if there's a fifth arg,
3266cb93a386Sopenharmony_ci                // first saving it to ABI reserved shadow area rsp+8.
3267cb93a386Sopenharmony_ci                if (fImpl->strides.size() >= 5) {
3268cb93a386Sopenharmony_ci                    a->mov(A::Mem{A::rsp, 8}, A::rdi);
3269cb93a386Sopenharmony_ci                    a->mov(A::rdi, A::Mem{A::rsp, 48});
3270cb93a386Sopenharmony_ci                }
3271cb93a386Sopenharmony_ci                // Load callee-saved rsi from rsp+56 if there's a sixth arg,
3272cb93a386Sopenharmony_ci                // first saving it to ABI reserved shadow area rsp+16.
3273cb93a386Sopenharmony_ci                if (fImpl->strides.size() >= 6) {
3274cb93a386Sopenharmony_ci                    a->mov(A::Mem{A::rsp, 16}, A::rsi);
3275cb93a386Sopenharmony_ci                    a->mov(A::rsi, A::Mem{A::rsp, 56});
3276cb93a386Sopenharmony_ci                }
3277cb93a386Sopenharmony_ci
3278cb93a386Sopenharmony_ci                // Allocate stack for our values and callee-saved xmm6-15.
3279cb93a386Sopenharmony_ci                int stack_needed = nstack_slots*K*4;  // Each value slot is K*4 bytes.
3280cb93a386Sopenharmony_ci                for (int r = 6; r < 16; r++) {
3281cb93a386Sopenharmony_ci                    if (incoming_registers_used & (1<<r)) {  // Bit r of *registers_used as it was on entry.
3282cb93a386Sopenharmony_ci                        stack_needed += 16;  // 16 bytes per saved xmm register.
3283cb93a386Sopenharmony_ci                    }
3284cb93a386Sopenharmony_ci                }
3285cb93a386Sopenharmony_ci                if (stack_needed) { a->sub(A::rsp, stack_needed); }
3286cb93a386Sopenharmony_ci
3287cb93a386Sopenharmony_ci                int next_saved_xmm = nstack_slots*K*4;  // Saved xmm registers go just past the value slots.
3288cb93a386Sopenharmony_ci                for (int r = 6; r < 16; r++) {
3289cb93a386Sopenharmony_ci                    if (incoming_registers_used & (1<<r)) {
3290cb93a386Sopenharmony_ci                        a->vmovups(A::Mem{A::rsp, next_saved_xmm}, (A::Xmm)r);
3291cb93a386Sopenharmony_ci                        next_saved_xmm += 16;
3292cb93a386Sopenharmony_ci                        regs[r] = NA;  // Saved now, so it is no longer RES and may be allocated.
3293cb93a386Sopenharmony_ci                    }
3294cb93a386Sopenharmony_ci                }
3295cb93a386Sopenharmony_ci            };
3296cb93a386Sopenharmony_ci            auto exit  = [&]{  // Emit the epilogue: undo what enter() set up, then return.
3297cb93a386Sopenharmony_ci                // The second pass of jit() shouldn't use any register it didn't in the first pass.
3298cb93a386Sopenharmony_ci                SkASSERT((*registers_used & incoming_registers_used) == *registers_used);
3299cb93a386Sopenharmony_ci
3300cb93a386Sopenharmony_ci                // Restore callee-saved xmm6-15 and the stack pointer.
3301cb93a386Sopenharmony_ci                int stack_used = nstack_slots*K*4;  // Offset of the first saved xmm; grows to the full frame size.
3302cb93a386Sopenharmony_ci                for (int r = 6; r < 16; r++) {
3303cb93a386Sopenharmony_ci                    if (incoming_registers_used & (1<<r)) {  // Same test and order as the saves in enter().
3304cb93a386Sopenharmony_ci                        a->vmovups((A::Xmm)r, A::Mem{A::rsp, stack_used});
3305cb93a386Sopenharmony_ci                        stack_used += 16;
3306cb93a386Sopenharmony_ci                    }
3307cb93a386Sopenharmony_ci                }
3308cb93a386Sopenharmony_ci                if (stack_used) { a->add(A::rsp, stack_used); }
3309cb93a386Sopenharmony_ci
3310cb93a386Sopenharmony_ci                // Restore callee-saved rdi/rsi if we used them.
3311cb93a386Sopenharmony_ci                if (fImpl->strides.size() >= 5) {
3312cb93a386Sopenharmony_ci                    a->mov(A::rdi, A::Mem{A::rsp, 8});  // rsp is back at its entry value, so this is enter()'s slot.
3313cb93a386Sopenharmony_ci                }
3314cb93a386Sopenharmony_ci                if (fImpl->strides.size() >= 6) {
3315cb93a386Sopenharmony_ci                    a->mov(A::rsi, A::Mem{A::rsp, 16});
3316cb93a386Sopenharmony_ci                }
3317cb93a386Sopenharmony_ci
3318cb93a386Sopenharmony_ci                a->vzeroupper();
3319cb93a386Sopenharmony_ci                a->ret();
3320cb93a386Sopenharmony_ci            };
3321cb93a386Sopenharmony_ci        #elif defined(__x86_64__)
3322cb93a386Sopenharmony_ci            const A::GP64 N = A::rdi,
3323cb93a386Sopenharmony_ci                        GP0 = A::rax,
3324cb93a386Sopenharmony_ci                        GP1 = A::r11,
3325cb93a386Sopenharmony_ci                        arg[]    = { A::rsi, A::rdx, A::rcx, A::r8, A::r9, A::r10 };
3326cb93a386Sopenharmony_ci
3327cb93a386Sopenharmony_ci            // All 16 ymm registers are available to use.
3328cb93a386Sopenharmony_ci            std::array<Val,16> regs = {
3329cb93a386Sopenharmony_ci                NA,NA,NA,NA, NA,NA,NA,NA,
3330cb93a386Sopenharmony_ci                NA,NA,NA,NA, NA,NA,NA,NA,
3331cb93a386Sopenharmony_ci            };
3332cb93a386Sopenharmony_ci
3333cb93a386Sopenharmony_ci            auto enter = [&]{  // Emit the prologue: fetch the one stack-passed arg, then reserve value slots.
3334cb93a386Sopenharmony_ci                // Load caller-saved r10 from rsp+8 if there's a sixth arg.
3335cb93a386Sopenharmony_ci                if (fImpl->strides.size() >= 6) {
3336cb93a386Sopenharmony_ci                    a->mov(A::r10, A::Mem{A::rsp, 8});  // Done before rsp moves, so the offset is relative to entry rsp.
3337cb93a386Sopenharmony_ci                }
3338cb93a386Sopenharmony_ci                if (nstack_slots) { a->sub(A::rsp, nstack_slots*K*4); }  // Each value slot is K*4 bytes.
3339cb93a386Sopenharmony_ci            };
3340cb93a386Sopenharmony_ci            auto exit  = [&]{  // Emit the epilogue: release the value slots reserved by enter(), then return.
3341cb93a386Sopenharmony_ci                if (nstack_slots) { a->add(A::rsp, nstack_slots*K*4); }  // Mirrors the sub in enter().
3342cb93a386Sopenharmony_ci                a->vzeroupper();
3343cb93a386Sopenharmony_ci                a->ret();
3344cb93a386Sopenharmony_ci            };
3345cb93a386Sopenharmony_ci        #endif
3346cb93a386Sopenharmony_ci
3347cb93a386Sopenharmony_ci        auto load_from_memory = [&](Reg r, Val v) {  // Emit code that materializes Val v in register r.
3348cb93a386Sopenharmony_ci            if (instructions[v].op == Op::splat) {  // Splats come from the constant pool (or xor for zero), not a stack slot.
3349cb93a386Sopenharmony_ci                if (instructions[v].immA == 0) {
3350cb93a386Sopenharmony_ci                    a->vpxor(r,r,r);  // r^r == 0, so no load is needed.
3351cb93a386Sopenharmony_ci                } else {
3352cb93a386Sopenharmony_ci                    a->vmovups(r, constants.find(instructions[v].immA));
3353cb93a386Sopenharmony_ci                }
3354cb93a386Sopenharmony_ci            } else {
3355cb93a386Sopenharmony_ci                SkASSERT(stack_slot[v] != NA);  // Anything else must have been spilled earlier.
3356cb93a386Sopenharmony_ci                a->vmovups(r, A::Mem{A::rsp, stack_slot[v]*K*4});  // Slot i lives at rsp + i*K*4.
3357cb93a386Sopenharmony_ci            }
3358cb93a386Sopenharmony_ci        };
3359cb93a386Sopenharmony_ci        auto store_to_stack = [&](Reg r, Val v) {  // Emit a spill of register r, giving Val v the next unused stack slot.
3360cb93a386Sopenharmony_ci            SkASSERT(next_stack_slot < nstack_slots);  // enter() only reserved nstack_slots slots.
3361cb93a386Sopenharmony_ci            stack_slot[v] = next_stack_slot++;  // Slots are handed out in increasing order.
3362cb93a386Sopenharmony_ci            a->vmovups(A::Mem{A::rsp, stack_slot[v]*K*4}, r);
3363cb93a386Sopenharmony_ci        };
3364cb93a386Sopenharmony_ci    #elif defined(__aarch64__)
3365cb93a386Sopenharmony_ci        const int K = 4;
3366cb93a386Sopenharmony_ci        const A::X N     = A::x0,
3367cb93a386Sopenharmony_ci                   GP0   = A::x8,
3368cb93a386Sopenharmony_ci                   GP1   = A::x9,
3369cb93a386Sopenharmony_ci                   arg[] = { A::x1, A::x2, A::x3, A::x4, A::x5, A::x6, A::x7 };
3370cb93a386Sopenharmony_ci
3371cb93a386Sopenharmony_ci        // We can use v0-v7 and v16-v31 freely; we'd need to preserve v8-v15 in enter/exit.
3372cb93a386Sopenharmony_ci        std::array<Val,32> regs = {
3373cb93a386Sopenharmony_ci             NA, NA, NA, NA,  NA, NA, NA, NA,
3374cb93a386Sopenharmony_ci            RES,RES,RES,RES, RES,RES,RES,RES,
3375cb93a386Sopenharmony_ci             NA, NA, NA, NA,  NA, NA, NA, NA,
3376cb93a386Sopenharmony_ci             NA, NA, NA, NA,  NA, NA, NA, NA,
3377cb93a386Sopenharmony_ci        };
3378cb93a386Sopenharmony_ci
3379cb93a386Sopenharmony_ci        auto enter = [&]{ if (nstack_slots) { a->sub(A::sp, A::sp, nstack_slots*K*4); } };  // Prologue: reserve K*4 bytes per value slot.
3380cb93a386Sopenharmony_ci        auto exit  = [&]{ if (nstack_slots) { a->add(A::sp, A::sp, nstack_slots*K*4); }  // Epilogue: release what enter() reserved,
3381cb93a386Sopenharmony_ci                          a->ret(A::x30); };  // then return through x30.
3382cb93a386Sopenharmony_ci
3383cb93a386Sopenharmony_ci        auto load_from_memory = [&](Reg r, Val v) {  // Emit code that materializes Val v in register r.
3384cb93a386Sopenharmony_ci            if (instructions[v].op == Op::splat) {  // Splats come from the constant pool (or eor for zero), not a stack slot.
3385cb93a386Sopenharmony_ci                if (instructions[v].immA == 0) {
3386cb93a386Sopenharmony_ci                    a->eor16b(r,r,r);  // r^r == 0, so no load is needed.
3387cb93a386Sopenharmony_ci                } else {
3388cb93a386Sopenharmony_ci                    a->ldrq(r, constants.find(instructions[v].immA));
3389cb93a386Sopenharmony_ci                }
3390cb93a386Sopenharmony_ci            } else {
3391cb93a386Sopenharmony_ci                SkASSERT(stack_slot[v] != NA);  // Anything else must have been spilled earlier.
3392cb93a386Sopenharmony_ci                a->ldrq(r, A::sp, stack_slot[v]);  // NOTE(review): passes the slot index, not a byte offset; presumably ldrq scales it - confirm.
3393cb93a386Sopenharmony_ci            }
3394cb93a386Sopenharmony_ci        };
3395cb93a386Sopenharmony_ci        auto store_to_stack  = [&](Reg r, Val v) {  // Emit a spill of register r, giving Val v the next unused stack slot.
3396cb93a386Sopenharmony_ci            SkASSERT(next_stack_slot < nstack_slots);  // enter() only reserved nstack_slots slots.
3397cb93a386Sopenharmony_ci            stack_slot[v] = next_stack_slot++;  // Slots are handed out in increasing order.
3398cb93a386Sopenharmony_ci            a->strq(r, A::sp, stack_slot[v]);  // NOTE(review): passes the slot index, not a byte offset; presumably strq scales it - confirm.
3399cb93a386Sopenharmony_ci        };
3400cb93a386Sopenharmony_ci    #endif
3401cb93a386Sopenharmony_ci
3402cb93a386Sopenharmony_ci        *registers_used = 0;  // We'll update this as we go.
3403cb93a386Sopenharmony_ci
3404cb93a386Sopenharmony_ci        if (SK_ARRAY_COUNT(arg) < fImpl->strides.size()) {
3405cb93a386Sopenharmony_ci            return false;
3406cb93a386Sopenharmony_ci        }
3407cb93a386Sopenharmony_ci
3408cb93a386Sopenharmony_ci        auto emit = [&](Val id, bool scalar) {
3409cb93a386Sopenharmony_ci            const int active_lanes = scalar ? 1 : K;
3410cb93a386Sopenharmony_ci            const OptimizedInstruction& inst = instructions[id];
3411cb93a386Sopenharmony_ci            const Op op = inst.op;
3412cb93a386Sopenharmony_ci            const Val x = inst.x,
3413cb93a386Sopenharmony_ci                      y = inst.y,
3414cb93a386Sopenharmony_ci                      z = inst.z,
3415cb93a386Sopenharmony_ci                      w = inst.w;
3416cb93a386Sopenharmony_ci            const int immA = inst.immA,
3417cb93a386Sopenharmony_ci                      immB = inst.immB,
3418cb93a386Sopenharmony_ci                      immC = inst.immC;
3419cb93a386Sopenharmony_ci
3420cb93a386Sopenharmony_ci            // alloc_tmp() returns the first of N adjacent temporary registers,
3421cb93a386Sopenharmony_ci            // each freed manually with free_tmp() or noted as our result with mark_tmp_as_dst().
3422cb93a386Sopenharmony_ci            auto alloc_tmp = [&](int N=1) -> Reg {  // NB: inside this lambda, N shadows the outer register constant N.
3423cb93a386Sopenharmony_ci                auto needs_spill = [&](Val v) -> bool {  // Would evicting v require a store to the stack?
3424cb93a386Sopenharmony_ci                    SkASSERT(v >= 0);   // {NA,TMP,RES} need to be handled before calling this.
3425cb93a386Sopenharmony_ci                    return stack_slot[v] == NA               // We haven't spilled it already?
3426cb93a386Sopenharmony_ci                        && instructions[v].op != Op::splat;  // No need to spill constants.
3427cb93a386Sopenharmony_ci                };
3428cb93a386Sopenharmony_ci
3429cb93a386Sopenharmony_ci                // We want to find a block of N adjacent registers requiring the fewest spills.
3430cb93a386Sopenharmony_ci                int best_block = -1,  // First register of the best block seen so far; -1 until one is found.
3431cb93a386Sopenharmony_ci                    min_spills = 0x7fff'ffff;  // The same value marks a block as unusable below.
3432cb93a386Sopenharmony_ci                for (int block = 0; block+N <= (int)regs.size(); block++) {
3433cb93a386Sopenharmony_ci                    int spills = 0;
3434cb93a386Sopenharmony_ci                    for (int r = block; r < block+N; r++) {
3435cb93a386Sopenharmony_ci                        Val v = regs[r];
3436cb93a386Sopenharmony_ci                        // Registers holding NA (nothing) are ideal, nothing to spill.
3437cb93a386Sopenharmony_ci                        if (v == NA) {
3438cb93a386Sopenharmony_ci                            continue;
3439cb93a386Sopenharmony_ci                        }
3440cb93a386Sopenharmony_ci                        // We can't spill anything REServed or that we'll need this instruction.
3441cb93a386Sopenharmony_ci                        if (v == RES ||
3442cb93a386Sopenharmony_ci                            v == TMP || v == id || v == x || v == y || v == z || v == w) {
3443cb93a386Sopenharmony_ci                            spills = 0x7fff'ffff;
3444cb93a386Sopenharmony_ci                            block  = r;   // (optimization) continue outer loop at next register.
3445cb93a386Sopenharmony_ci                            break;
3446cb93a386Sopenharmony_ci                        }
3447cb93a386Sopenharmony_ci                        // Usually here we've got a value v that we'd have to spill to the stack
3448cb93a386Sopenharmony_ci                        // before reusing its register, but sometimes even now we get a freebie.
3449cb93a386Sopenharmony_ci                        spills += needs_spill(v) ? 1 : 0;
3450cb93a386Sopenharmony_ci                    }
3451cb93a386Sopenharmony_ci
3452cb93a386Sopenharmony_ci                    // TODO: non-arbitrary tie-breaking?
3453cb93a386Sopenharmony_ci                    if (min_spills > spills) {  // Strict: the earliest block wins ties, and unusable blocks never win.
3454cb93a386Sopenharmony_ci                        min_spills = spills;
3455cb93a386Sopenharmony_ci                        best_block = block;
3456cb93a386Sopenharmony_ci                    }
3457cb93a386Sopenharmony_ci                    if (min_spills == 0) {
3458cb93a386Sopenharmony_ci                        break;  // (optimization) stop early if we find an unbeatable block.
3459cb93a386Sopenharmony_ci                    }
3460cb93a386Sopenharmony_ci                }
3461cb93a386Sopenharmony_ci
3462cb93a386Sopenharmony_ci                // TODO: our search's success isn't obviously guaranteed... it depends on N
3463cb93a386Sopenharmony_ci                // and the number and relative position in regs of any unspillable values.
3464cb93a386Sopenharmony_ci                // I think we should be able to get away with N≤2 on x86-64 and N≤4 on arm64;
3465cb93a386Sopenharmony_ci                // we'll need to revisit this logic should this assert fire.
3466cb93a386Sopenharmony_ci                SkASSERT(min_spills <= N);  // NOTE(review): if no block was usable, best_block is still -1 at this point.
3467cb93a386Sopenharmony_ci
3468cb93a386Sopenharmony_ci                // Spill what needs spilling, and mark the block all as TMP.
3469cb93a386Sopenharmony_ci                for (int r = best_block; r < best_block+N; r++) {
3470cb93a386Sopenharmony_ci                    Val& v = regs[r];
3471cb93a386Sopenharmony_ci                    *registers_used |= (1<<r);  // Record every register we hand out.
3472cb93a386Sopenharmony_ci
3473cb93a386Sopenharmony_ci                    SkASSERT(v == NA || v >= 0);  // Blocks containing RES/TMP were rejected by the search above.
3474cb93a386Sopenharmony_ci                    if (v >= 0 && needs_spill(v)) {
3475cb93a386Sopenharmony_ci                        store_to_stack((Reg)r, v);
3476cb93a386Sopenharmony_ci                        SkASSERT(!needs_spill(v));
3477cb93a386Sopenharmony_ci                        min_spills--;  // Count down so the assert after the loop can confirm the predicted spill count.
3478cb93a386Sopenharmony_ci                    }
3479cb93a386Sopenharmony_ci
3480cb93a386Sopenharmony_ci                    v = TMP;  // v is a reference, so this updates regs[r].
3481cb93a386Sopenharmony_ci                }
3482cb93a386Sopenharmony_ci                SkASSERT(min_spills == 0);
3483cb93a386Sopenharmony_ci                return (Reg)best_block;
3484cb93a386Sopenharmony_ci            };
3485cb93a386Sopenharmony_ci
3486cb93a386Sopenharmony_ci            auto free_tmp = [&](Reg r) {  // Return a register marked TMP (as alloc_tmp() leaves it) to the empty state.
3487cb93a386Sopenharmony_ci                SkASSERT(regs[r] == TMP);  // Only temporaries may be freed this way.
3488cb93a386Sopenharmony_ci                regs[r] = NA;
3489cb93a386Sopenharmony_ci            };
3490cb93a386Sopenharmony_ci
3491cb93a386Sopenharmony_ci            // Which register holds dst,x,y,z,w for this instruction?  NA if none does yet.
3492cb93a386Sopenharmony_ci            int rd = NA,
3493cb93a386Sopenharmony_ci                rx = NA,
3494cb93a386Sopenharmony_ci                ry = NA,
3495cb93a386Sopenharmony_ci                rz = NA,
3496cb93a386Sopenharmony_ci                rw = NA;
3497cb93a386Sopenharmony_ci
3498cb93a386Sopenharmony_ci            auto update_regs = [&](Reg r, Val v) {  // Record in rd/rx/ry/rz/rw that register r holds v; returns r unchanged.
3499cb93a386Sopenharmony_ci                if (v == id) { rd = r; }  // Independent ifs (not else-if): v may match more than one of id/x/y/z/w.
3500cb93a386Sopenharmony_ci                if (v ==  x) { rx = r; }
3501cb93a386Sopenharmony_ci                if (v ==  y) { ry = r; }
3502cb93a386Sopenharmony_ci                if (v ==  z) { rz = r; }
3503cb93a386Sopenharmony_ci                if (v ==  w) { rw = r; }
3504cb93a386Sopenharmony_ci                return r;
3505cb93a386Sopenharmony_ci            };
3506cb93a386Sopenharmony_ci
3507cb93a386Sopenharmony_ci            auto find_existing_reg = [&](Val v) -> int {  // Register currently holding v, or NA; emits no code.
3508cb93a386Sopenharmony_ci                // Quick-check our working registers.
3509cb93a386Sopenharmony_ci                if (v == id && rd != NA) { return rd; }
3510cb93a386Sopenharmony_ci                if (v ==  x && rx != NA) { return rx; }
3511cb93a386Sopenharmony_ci                if (v ==  y && ry != NA) { return ry; }
3512cb93a386Sopenharmony_ci                if (v ==  z && rz != NA) { return rz; }
3513cb93a386Sopenharmony_ci                if (v ==  w && rw != NA) { return rw; }
3514cb93a386Sopenharmony_ci
3515cb93a386Sopenharmony_ci                // Search inter-instruction register map.
3516cb93a386Sopenharmony_ci                for (auto [r,val] : SkMakeEnumerate(regs)) {
3517cb93a386Sopenharmony_ci                    if (val == v) {
3518cb93a386Sopenharmony_ci                        return update_regs((Reg)r, v);  // Remember the hit in rd/rx/ry/rz/rw for later quick-checks.
3519cb93a386Sopenharmony_ci                    }
3520cb93a386Sopenharmony_ci                }
3521cb93a386Sopenharmony_ci                return NA;  // v is not in any register right now.
3522cb93a386Sopenharmony_ci            };
3523cb93a386Sopenharmony_ci
3524cb93a386Sopenharmony_ci            // Return a register for Val, holding that value if it already exists.
3525cb93a386Sopenharmony_ci            // During this instruction all calls to r(v) will return the same register.
3526cb93a386Sopenharmony_ci            auto r = [&](Val v) -> Reg {
3527cb93a386Sopenharmony_ci                SkASSERT(v >= 0);  // NA/RES/TMP are not real values.
3528cb93a386Sopenharmony_ci
3529cb93a386Sopenharmony_ci                if (int found = find_existing_reg(v); found != NA) {
3530cb93a386Sopenharmony_ci                    return (Reg)found;  // Already resident: no code is emitted.
3531cb93a386Sopenharmony_ci                }
3532cb93a386Sopenharmony_ci
3533cb93a386Sopenharmony_ci                Reg r = alloc_tmp();  // A local Reg named r, distinct from the lambda being defined; may emit a spill.
3534cb93a386Sopenharmony_ci                SkASSERT(regs[r] == TMP);
3535cb93a386Sopenharmony_ci
3536cb93a386Sopenharmony_ci                SkASSERT(v <= id);  // We never ask for a Val with an id beyond the current instruction's.
3537cb93a386Sopenharmony_ci                if (v < id) {
3538cb93a386Sopenharmony_ci                    // If v < id, we're loading one of this instruction's inputs.
3539cb93a386Sopenharmony_ci                    // If v == id we're just allocating its destination register.
3540cb93a386Sopenharmony_ci                    load_from_memory(r, v);
3541cb93a386Sopenharmony_ci                }
3542cb93a386Sopenharmony_ci                regs[r] = v;  // Replace the TMP marker: the register now holds v.
3543cb93a386Sopenharmony_ci                return update_regs(r, v);
3544cb93a386Sopenharmony_ci            };
3545cb93a386Sopenharmony_ci
3546cb93a386Sopenharmony_ci            auto dies_here = [&](Val v) -> bool {  // Is the current instruction recorded as v's death (presumably its last use)?
3547cb93a386Sopenharmony_ci                SkASSERT(v >= 0);
3548cb93a386Sopenharmony_ci                return instructions[v].death == id;
3549cb93a386Sopenharmony_ci            };
3550cb93a386Sopenharmony_ci
3551cb93a386Sopenharmony_ci            // Alias dst() to r(v) if dies_here(v).
3552cb93a386Sopenharmony_ci            auto try_alias = [&](Val v) -> bool {  // Returns true if rd now shares v's register, false if nothing changed.
3553cb93a386Sopenharmony_ci                SkASSERT(v == x || v == y || v == z || v == w);  // Only this instruction's inputs can be aliased.
3554cb93a386Sopenharmony_ci                if (dies_here(v)) {
3555cb93a386Sopenharmony_ci                    rd = r(v);      // Vals v and id share a register for this instruction.
3556cb93a386Sopenharmony_ci                    regs[rd] = id;  // Next instruction, Val id will be in the register, not Val v.
3557cb93a386Sopenharmony_ci                    return true;
3558cb93a386Sopenharmony_ci                }
3559cb93a386Sopenharmony_ci                return false;
3560cb93a386Sopenharmony_ci            };
3561cb93a386Sopenharmony_ci
3562cb93a386Sopenharmony_ci            // Generally r(id),
3563cb93a386Sopenharmony_ci            // but with a hint, try to alias dst() to r(v) if dies_here(v).
3564cb93a386Sopenharmony_ci            auto dst = [&](Val hint1 = NA, Val hint2 = NA) -> Reg {
3565cb93a386Sopenharmony_ci                if (hint1 != NA && try_alias(hint1)) { return r(id); }  // try_alias() just set rd, so r(id) returns it.
3566cb93a386Sopenharmony_ci                if (hint2 != NA && try_alias(hint2)) { return r(id); }  // hint2 is only tried when hint1 did not alias.
3567cb93a386Sopenharmony_ci                return r(id);  // No alias: reuse id's register if it has one, else allocate one.
3568cb93a386Sopenharmony_ci            };
3569cb93a386Sopenharmony_ci
3570cb93a386Sopenharmony_ci        #if defined(__aarch64__)  // Nothing sneaky, just unused on x86-64.
3571cb93a386Sopenharmony_ci            auto mark_tmp_as_dst = [&](Reg tmp) {  // Adopt a register currently marked TMP as this instruction's destination.
3572cb93a386Sopenharmony_ci                SkASSERT(regs[tmp] == TMP);
3573cb93a386Sopenharmony_ci                rd = tmp;
3574cb93a386Sopenharmony_ci                regs[rd] = id;  // From now on the register map says tmp holds Val id.
3575cb93a386Sopenharmony_ci                SkASSERT(dst() == tmp);  // Sanity check: dst() must now resolve to tmp.
3576cb93a386Sopenharmony_ci            };
3577cb93a386Sopenharmony_ci        #endif
3578cb93a386Sopenharmony_ci
3579cb93a386Sopenharmony_ci        #if defined(__x86_64__) || defined(_M_X64)
3580cb93a386Sopenharmony_ci            // On x86 we can work with many values directly from the stack or program constant pool.
3581cb93a386Sopenharmony_ci            auto any = [&](Val v) -> A::Operand {  // Operand for input v wherever it lives now; allocates no register.
3582cb93a386Sopenharmony_ci                SkASSERT(v >= 0);
3583cb93a386Sopenharmony_ci                SkASSERT(v < id);  // Inputs only: v must have a smaller id than the current instruction.
3584cb93a386Sopenharmony_ci
3585cb93a386Sopenharmony_ci                if (int found = find_existing_reg(v); found != NA) {
3586cb93a386Sopenharmony_ci                    return (Reg)found;  // Prefer a register if v is already in one.
3587cb93a386Sopenharmony_ci                }
3588cb93a386Sopenharmony_ci                if (instructions[v].op == Op::splat) {
3589cb93a386Sopenharmony_ci                    return constants.find(instructions[v].immA);  // Otherwise splats come from the constant pool.
3590cb93a386Sopenharmony_ci                }
3591cb93a386Sopenharmony_ci                return A::Mem{A::rsp, stack_slot[v]*K*4};  // NOTE(review): relies on v having a stack slot; unlike load_from_memory(), not asserted here.
3592cb93a386Sopenharmony_ci            };
3593cb93a386Sopenharmony_ci
3594cb93a386Sopenharmony_ci            // This is never really worth asking except when any() might be used;
3595cb93a386Sopenharmony_ci            // if we need this value in ARM, might as well just call r(v) to get it into a register.
3596cb93a386Sopenharmony_ci            auto in_reg = [&](Val v) -> bool {  // True if v is already in a register; may update rd/rx/ry/rz/rw via find_existing_reg().
3597cb93a386Sopenharmony_ci                return find_existing_reg(v) != NA;
3598cb93a386Sopenharmony_ci            };
3599cb93a386Sopenharmony_ci        #endif
3600cb93a386Sopenharmony_ci
3601cb93a386Sopenharmony_ci            switch (op) {
3602cb93a386Sopenharmony_ci                // Make sure splat constants can be found by load_from_memory() or any().
3603cb93a386Sopenharmony_ci                case Op::splat:
3604cb93a386Sopenharmony_ci                    (void)constants[immA];
3605cb93a386Sopenharmony_ci                    break;
3606cb93a386Sopenharmony_ci
3607cb93a386Sopenharmony_ci            #if defined(__x86_64__) || defined(_M_X64)
3608cb93a386Sopenharmony_ci                case Op::assert_true: {
3609cb93a386Sopenharmony_ci                    a->vptest (r(x), &constants[0xffffffff]);
3610cb93a386Sopenharmony_ci                    A::Label all_true;
3611cb93a386Sopenharmony_ci                    a->jc(&all_true);
3612cb93a386Sopenharmony_ci                    a->int3();
3613cb93a386Sopenharmony_ci                    a->label(&all_true);
3614cb93a386Sopenharmony_ci                } break;
3615cb93a386Sopenharmony_ci
3616cb93a386Sopenharmony_ci                case Op::trace_line:
3617cb93a386Sopenharmony_ci                case Op::trace_var:
3618cb93a386Sopenharmony_ci                case Op::trace_call:
3619cb93a386Sopenharmony_ci                    /* Only supported in the interpreter. */
3620cb93a386Sopenharmony_ci                    break;
3621cb93a386Sopenharmony_ci
3622cb93a386Sopenharmony_ci                case Op::store8:
3623cb93a386Sopenharmony_ci                    if (scalar) {
3624cb93a386Sopenharmony_ci                        a->vpextrb(A::Mem{arg[immA]}, (A::Xmm)r(x), 0);
3625cb93a386Sopenharmony_ci                    } else {
3626cb93a386Sopenharmony_ci                        a->vpackusdw(dst(x), r(x), r(x));
3627cb93a386Sopenharmony_ci                        a->vpermq   (dst(), dst(), 0xd8);
3628cb93a386Sopenharmony_ci                        a->vpackuswb(dst(), dst(), dst());
3629cb93a386Sopenharmony_ci                        a->vmovq    (A::Mem{arg[immA]}, (A::Xmm)dst());
3630cb93a386Sopenharmony_ci                    } break;
3631cb93a386Sopenharmony_ci
3632cb93a386Sopenharmony_ci                case Op::store16:
3633cb93a386Sopenharmony_ci                    if (scalar) {
3634cb93a386Sopenharmony_ci                        a->vpextrw(A::Mem{arg[immA]}, (A::Xmm)r(x), 0);
3635cb93a386Sopenharmony_ci                    } else {
3636cb93a386Sopenharmony_ci                        a->vpackusdw(dst(x), r(x), r(x));
3637cb93a386Sopenharmony_ci                        a->vpermq   (dst(), dst(), 0xd8);
3638cb93a386Sopenharmony_ci                        a->vmovups  (A::Mem{arg[immA]}, (A::Xmm)dst());
3639cb93a386Sopenharmony_ci                    } break;
3640cb93a386Sopenharmony_ci
3641cb93a386Sopenharmony_ci                case Op::store32: if (scalar) { a->vmovd  (A::Mem{arg[immA]}, (A::Xmm)r(x)); }
3642cb93a386Sopenharmony_ci                                  else        { a->vmovups(A::Mem{arg[immA]},         r(x)); }
3643cb93a386Sopenharmony_ci                                  break;
3644cb93a386Sopenharmony_ci
3645cb93a386Sopenharmony_ci                case Op::store64: if (scalar) {
3646cb93a386Sopenharmony_ci                                      a->vmovd(A::Mem{arg[immA],0}, (A::Xmm)r(x));
3647cb93a386Sopenharmony_ci                                      a->vmovd(A::Mem{arg[immA],4}, (A::Xmm)r(y));
3648cb93a386Sopenharmony_ci                                  } else {
3649cb93a386Sopenharmony_ci                                      // r(x) = {a,b,c,d|e,f,g,h}
3650cb93a386Sopenharmony_ci                                      // r(y) = {i,j,k,l|m,n,o,p}
3651cb93a386Sopenharmony_ci                                      // We want to write a,i,b,j,c,k,d,l,e,m...
3652cb93a386Sopenharmony_ci                                      A::Ymm L = alloc_tmp(),
3653cb93a386Sopenharmony_ci                                             H = alloc_tmp();
3654cb93a386Sopenharmony_ci                                      a->vpunpckldq(L, r(x), any(y));  // L = {a,i,b,j|e,m,f,n}
3655cb93a386Sopenharmony_ci                                      a->vpunpckhdq(H, r(x), any(y));  // H = {c,k,d,l|g,o,h,p}
3656cb93a386Sopenharmony_ci                                      a->vperm2f128(dst(), L,H, 0x20); //   = {a,i,b,j|c,k,d,l}
3657cb93a386Sopenharmony_ci                                      a->vmovups(A::Mem{arg[immA], 0}, dst());
3658cb93a386Sopenharmony_ci                                      a->vperm2f128(dst(), L,H, 0x31); //   = {e,m,f,n|g,o,h,p}
3659cb93a386Sopenharmony_ci                                      a->vmovups(A::Mem{arg[immA],32}, dst());
3660cb93a386Sopenharmony_ci                                      free_tmp(L);
3661cb93a386Sopenharmony_ci                                      free_tmp(H);
3662cb93a386Sopenharmony_ci                                  } break;
3663cb93a386Sopenharmony_ci
3664cb93a386Sopenharmony_ci                case Op::store128: {
3665cb93a386Sopenharmony_ci                    // TODO: >32-bit stores
3666cb93a386Sopenharmony_ci                    a->vmovd  (A::Mem{arg[immA], 0*16 +  0}, (A::Xmm)r(x)   );
3667cb93a386Sopenharmony_ci                    a->vmovd  (A::Mem{arg[immA], 0*16 +  4}, (A::Xmm)r(y)   );
3668cb93a386Sopenharmony_ci                    a->vmovd  (A::Mem{arg[immA], 0*16 +  8}, (A::Xmm)r(z)   );
3669cb93a386Sopenharmony_ci                    a->vmovd  (A::Mem{arg[immA], 0*16 + 12}, (A::Xmm)r(w)   );
3670cb93a386Sopenharmony_ci                    if (scalar) { break; }
3671cb93a386Sopenharmony_ci
3672cb93a386Sopenharmony_ci                    a->vpextrd(A::Mem{arg[immA], 1*16 +  0}, (A::Xmm)r(x), 1);
3673cb93a386Sopenharmony_ci                    a->vpextrd(A::Mem{arg[immA], 1*16 +  4}, (A::Xmm)r(y), 1);
3674cb93a386Sopenharmony_ci                    a->vpextrd(A::Mem{arg[immA], 1*16 +  8}, (A::Xmm)r(z), 1);
3675cb93a386Sopenharmony_ci                    a->vpextrd(A::Mem{arg[immA], 1*16 + 12}, (A::Xmm)r(w), 1);
3676cb93a386Sopenharmony_ci
3677cb93a386Sopenharmony_ci                    a->vpextrd(A::Mem{arg[immA], 2*16 +  0}, (A::Xmm)r(x), 2);
3678cb93a386Sopenharmony_ci                    a->vpextrd(A::Mem{arg[immA], 2*16 +  4}, (A::Xmm)r(y), 2);
3679cb93a386Sopenharmony_ci                    a->vpextrd(A::Mem{arg[immA], 2*16 +  8}, (A::Xmm)r(z), 2);
3680cb93a386Sopenharmony_ci                    a->vpextrd(A::Mem{arg[immA], 2*16 + 12}, (A::Xmm)r(w), 2);
3681cb93a386Sopenharmony_ci
3682cb93a386Sopenharmony_ci                    a->vpextrd(A::Mem{arg[immA], 3*16 +  0}, (A::Xmm)r(x), 3);
3683cb93a386Sopenharmony_ci                    a->vpextrd(A::Mem{arg[immA], 3*16 +  4}, (A::Xmm)r(y), 3);
3684cb93a386Sopenharmony_ci                    a->vpextrd(A::Mem{arg[immA], 3*16 +  8}, (A::Xmm)r(z), 3);
3685cb93a386Sopenharmony_ci                    a->vpextrd(A::Mem{arg[immA], 3*16 + 12}, (A::Xmm)r(w), 3);
3686cb93a386Sopenharmony_ci                    // Now we need to store the upper 128 bits of x,y,z,w.
3687cb93a386Sopenharmony_ci                    // Storing in this order rather than interlacing minimizes temporaries.
3688cb93a386Sopenharmony_ci                    a->vextracti128(dst(), r(x), 1);
3689cb93a386Sopenharmony_ci                    a->vmovd  (A::Mem{arg[immA], 4*16 +  0}, (A::Xmm)dst()   );
3690cb93a386Sopenharmony_ci                    a->vpextrd(A::Mem{arg[immA], 5*16 +  0}, (A::Xmm)dst(), 1);
3691cb93a386Sopenharmony_ci                    a->vpextrd(A::Mem{arg[immA], 6*16 +  0}, (A::Xmm)dst(), 2);
3692cb93a386Sopenharmony_ci                    a->vpextrd(A::Mem{arg[immA], 7*16 +  0}, (A::Xmm)dst(), 3);
3693cb93a386Sopenharmony_ci
3694cb93a386Sopenharmony_ci                    a->vextracti128(dst(), r(y), 1);
3695cb93a386Sopenharmony_ci                    a->vmovd  (A::Mem{arg[immA], 4*16 +  4}, (A::Xmm)dst()   );
3696cb93a386Sopenharmony_ci                    a->vpextrd(A::Mem{arg[immA], 5*16 +  4}, (A::Xmm)dst(), 1);
3697cb93a386Sopenharmony_ci                    a->vpextrd(A::Mem{arg[immA], 6*16 +  4}, (A::Xmm)dst(), 2);
3698cb93a386Sopenharmony_ci                    a->vpextrd(A::Mem{arg[immA], 7*16 +  4}, (A::Xmm)dst(), 3);
3699cb93a386Sopenharmony_ci
3700cb93a386Sopenharmony_ci                    a->vextracti128(dst(), r(z), 1);
3701cb93a386Sopenharmony_ci                    a->vmovd  (A::Mem{arg[immA], 4*16 +  8}, (A::Xmm)dst()   );
3702cb93a386Sopenharmony_ci                    a->vpextrd(A::Mem{arg[immA], 5*16 +  8}, (A::Xmm)dst(), 1);
3703cb93a386Sopenharmony_ci                    a->vpextrd(A::Mem{arg[immA], 6*16 +  8}, (A::Xmm)dst(), 2);
3704cb93a386Sopenharmony_ci                    a->vpextrd(A::Mem{arg[immA], 7*16 +  8}, (A::Xmm)dst(), 3);
3705cb93a386Sopenharmony_ci
3706cb93a386Sopenharmony_ci                    a->vextracti128(dst(), r(w), 1);
3707cb93a386Sopenharmony_ci                    a->vmovd  (A::Mem{arg[immA], 4*16 + 12}, (A::Xmm)dst()   );
3708cb93a386Sopenharmony_ci                    a->vpextrd(A::Mem{arg[immA], 5*16 + 12}, (A::Xmm)dst(), 1);
3709cb93a386Sopenharmony_ci                    a->vpextrd(A::Mem{arg[immA], 6*16 + 12}, (A::Xmm)dst(), 2);
3710cb93a386Sopenharmony_ci                    a->vpextrd(A::Mem{arg[immA], 7*16 + 12}, (A::Xmm)dst(), 3);
3711cb93a386Sopenharmony_ci                } break;
3712cb93a386Sopenharmony_ci
3713cb93a386Sopenharmony_ci                case Op::load8:  if (scalar) {
3714cb93a386Sopenharmony_ci                                     a->vpxor  (dst(), dst(), dst());
3715cb93a386Sopenharmony_ci                                     a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{arg[immA]}, 0);
3716cb93a386Sopenharmony_ci                                 } else {
3717cb93a386Sopenharmony_ci                                     a->vpmovzxbd(dst(), A::Mem{arg[immA]});
3718cb93a386Sopenharmony_ci                                 } break;
3719cb93a386Sopenharmony_ci
3720cb93a386Sopenharmony_ci                case Op::load16: if (scalar) {
3721cb93a386Sopenharmony_ci                                     a->vpxor  (dst(), dst(), dst());
3722cb93a386Sopenharmony_ci                                     a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{arg[immA]}, 0);
3723cb93a386Sopenharmony_ci                                 } else {
3724cb93a386Sopenharmony_ci                                     a->vpmovzxwd(dst(), A::Mem{arg[immA]});
3725cb93a386Sopenharmony_ci                                 } break;
3726cb93a386Sopenharmony_ci
3727cb93a386Sopenharmony_ci                case Op::load32: if (scalar) { a->vmovd  ((A::Xmm)dst(), A::Mem{arg[immA]}); }
3728cb93a386Sopenharmony_ci                                 else        { a->vmovups(        dst(), A::Mem{arg[immA]}); }
3729cb93a386Sopenharmony_ci                                 break;
3730cb93a386Sopenharmony_ci
3731cb93a386Sopenharmony_ci                case Op::load64: if (scalar) {
3732cb93a386Sopenharmony_ci                                    a->vmovd((A::Xmm)dst(), A::Mem{arg[immA], 4*immB});
3733cb93a386Sopenharmony_ci                                 } else {
3734cb93a386Sopenharmony_ci                                    A::Ymm tmp = alloc_tmp();
3735cb93a386Sopenharmony_ci                                    a->vmovups(tmp, &load64_index);
3736cb93a386Sopenharmony_ci                                    a->vpermps(dst(), tmp, A::Mem{arg[immA],  0});
3737cb93a386Sopenharmony_ci                                    a->vpermps(  tmp, tmp, A::Mem{arg[immA], 32});
3738cb93a386Sopenharmony_ci                                    // Low 128 bits holds immB=0 lanes, high 128 bits holds immB=1.
3739cb93a386Sopenharmony_ci                                    a->vperm2f128(dst(), dst(),tmp, immB ? 0x31 : 0x20);
3740cb93a386Sopenharmony_ci                                    free_tmp(tmp);
3741cb93a386Sopenharmony_ci                                 } break;
3742cb93a386Sopenharmony_ci
3743cb93a386Sopenharmony_ci                case Op::load128: if (scalar) {
3744cb93a386Sopenharmony_ci                                      a->vmovd((A::Xmm)dst(), A::Mem{arg[immA], 4*immB});
3745cb93a386Sopenharmony_ci                                  } else {
3746cb93a386Sopenharmony_ci                                      // Load 4 low values into xmm tmp,
3747cb93a386Sopenharmony_ci                                      A::Ymm tmp = alloc_tmp();
3748cb93a386Sopenharmony_ci                                      A::Xmm t = (A::Xmm)tmp;
3749cb93a386Sopenharmony_ci                                      a->vmovd  (t,   A::Mem{arg[immA], 0*16 + 4*immB}   );
3750cb93a386Sopenharmony_ci                                      a->vpinsrd(t,t, A::Mem{arg[immA], 1*16 + 4*immB}, 1);
3751cb93a386Sopenharmony_ci                                      a->vpinsrd(t,t, A::Mem{arg[immA], 2*16 + 4*immB}, 2);
3752cb93a386Sopenharmony_ci                                      a->vpinsrd(t,t, A::Mem{arg[immA], 3*16 + 4*immB}, 3);
3753cb93a386Sopenharmony_ci
3754cb93a386Sopenharmony_ci                                      // Load 4 high values into xmm dst(),
3755cb93a386Sopenharmony_ci                                      A::Xmm d = (A::Xmm)dst();
3756cb93a386Sopenharmony_ci                                      a->vmovd  (d,   A::Mem{arg[immA], 4*16 + 4*immB}   );
3757cb93a386Sopenharmony_ci                                      a->vpinsrd(d,d, A::Mem{arg[immA], 5*16 + 4*immB}, 1);
3758cb93a386Sopenharmony_ci                                      a->vpinsrd(d,d, A::Mem{arg[immA], 6*16 + 4*immB}, 2);
3759cb93a386Sopenharmony_ci                                      a->vpinsrd(d,d, A::Mem{arg[immA], 7*16 + 4*immB}, 3);
3760cb93a386Sopenharmony_ci
3761cb93a386Sopenharmony_ci                                      // Merge the two, ymm dst() = {xmm tmp|xmm dst()}
3762cb93a386Sopenharmony_ci                                      a->vperm2f128(dst(), tmp,dst(), 0x20);
3763cb93a386Sopenharmony_ci                                      free_tmp(tmp);
3764cb93a386Sopenharmony_ci                                  } break;
3765cb93a386Sopenharmony_ci
3766cb93a386Sopenharmony_ci                case Op::gather8: {
3767cb93a386Sopenharmony_ci                    // As usual, the gather base pointer is immB bytes off of uniform immA.
3768cb93a386Sopenharmony_ci                    a->mov(GP0, A::Mem{arg[immA], immB});
3769cb93a386Sopenharmony_ci
3770cb93a386Sopenharmony_ci                    A::Ymm tmp = alloc_tmp();
3771cb93a386Sopenharmony_ci                    a->vmovups(tmp, any(x));
3772cb93a386Sopenharmony_ci
3773cb93a386Sopenharmony_ci                    for (int i = 0; i < active_lanes; i++) {
3774cb93a386Sopenharmony_ci                        if (i == 4) {
3775cb93a386Sopenharmony_ci                            // vpextrd can only pluck indices out from an Xmm register,
3776cb93a386Sopenharmony_ci                            // so we manually swap over to the top when we're halfway through.
3777cb93a386Sopenharmony_ci                            a->vextracti128((A::Xmm)tmp, tmp, 1);
3778cb93a386Sopenharmony_ci                        }
3779cb93a386Sopenharmony_ci                        a->vpextrd(GP1, (A::Xmm)tmp, i%4);
3780cb93a386Sopenharmony_ci                        a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{GP0,0,GP1,A::ONE}, i);
3781cb93a386Sopenharmony_ci                    }
3782cb93a386Sopenharmony_ci                    a->vpmovzxbd(dst(), dst());
3783cb93a386Sopenharmony_ci                    free_tmp(tmp);
3784cb93a386Sopenharmony_ci                } break;
3785cb93a386Sopenharmony_ci
3786cb93a386Sopenharmony_ci                case Op::gather16: {
3787cb93a386Sopenharmony_ci                    // Just as gather8 except vpinsrb->vpinsrw, ONE->TWO, and vpmovzxbd->vpmovzxwd.
3788cb93a386Sopenharmony_ci                    a->mov(GP0, A::Mem{arg[immA], immB});
3789cb93a386Sopenharmony_ci
3790cb93a386Sopenharmony_ci                    A::Ymm tmp = alloc_tmp();
3791cb93a386Sopenharmony_ci                    a->vmovups(tmp, any(x));
3792cb93a386Sopenharmony_ci
3793cb93a386Sopenharmony_ci                    for (int i = 0; i < active_lanes; i++) {
3794cb93a386Sopenharmony_ci                        if (i == 4) {
3795cb93a386Sopenharmony_ci                            a->vextracti128((A::Xmm)tmp, tmp, 1);
3796cb93a386Sopenharmony_ci                        }
3797cb93a386Sopenharmony_ci                        a->vpextrd(GP1, (A::Xmm)tmp, i%4);
3798cb93a386Sopenharmony_ci                        a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{GP0,0,GP1,A::TWO}, i);
3799cb93a386Sopenharmony_ci                    }
3800cb93a386Sopenharmony_ci                    a->vpmovzxwd(dst(), dst());
3801cb93a386Sopenharmony_ci                    free_tmp(tmp);
3802cb93a386Sopenharmony_ci                } break;
3803cb93a386Sopenharmony_ci
3804cb93a386Sopenharmony_ci                case Op::gather32:
3805cb93a386Sopenharmony_ci                if (scalar) {
3806cb93a386Sopenharmony_ci                    // Our gather base pointer is immB bytes off of uniform immA.
3807cb93a386Sopenharmony_ci                    a->mov(GP0, A::Mem{arg[immA], immB});
3808cb93a386Sopenharmony_ci
3809cb93a386Sopenharmony_ci                    // Grab our index from lane 0 of the index argument.
3810cb93a386Sopenharmony_ci                    a->vmovd(GP1, (A::Xmm)r(x));
3811cb93a386Sopenharmony_ci
3812cb93a386Sopenharmony_ci                    // dst = *(base + 4*index)
3813cb93a386Sopenharmony_ci                    a->vmovd((A::Xmm)dst(x), A::Mem{GP0, 0, GP1, A::FOUR});
3814cb93a386Sopenharmony_ci                } else {
3815cb93a386Sopenharmony_ci                    a->mov(GP0, A::Mem{arg[immA], immB});
3816cb93a386Sopenharmony_ci
3817cb93a386Sopenharmony_ci                    A::Ymm mask = alloc_tmp();
3818cb93a386Sopenharmony_ci                    a->vpcmpeqd(mask, mask, mask);   // (All lanes enabled.)
3819cb93a386Sopenharmony_ci
3820cb93a386Sopenharmony_ci                    a->vgatherdps(dst(), A::FOUR, r(x), GP0, mask);
3821cb93a386Sopenharmony_ci                    free_tmp(mask);
3822cb93a386Sopenharmony_ci                }
3823cb93a386Sopenharmony_ci                break;
3824cb93a386Sopenharmony_ci
3825cb93a386Sopenharmony_ci                case Op::uniform32: a->vbroadcastss(dst(), A::Mem{arg[immA], immB});
3826cb93a386Sopenharmony_ci                                    break;
3827cb93a386Sopenharmony_ci
3828cb93a386Sopenharmony_ci                case Op::array32: a->mov(GP0, A::Mem{arg[immA], immB});
3829cb93a386Sopenharmony_ci                                  a->vbroadcastss(dst(), A::Mem{GP0, immC});
3830cb93a386Sopenharmony_ci                                  break;
3831cb93a386Sopenharmony_ci
3832cb93a386Sopenharmony_ci                case Op::index: a->vmovd((A::Xmm)dst(), N);
3833cb93a386Sopenharmony_ci                                a->vbroadcastss(dst(), dst());
3834cb93a386Sopenharmony_ci                                a->vpsubd(dst(), dst(), &iota);
3835cb93a386Sopenharmony_ci                                break;
3836cb93a386Sopenharmony_ci
3837cb93a386Sopenharmony_ci                // We can swap the arguments of symmetric instructions to make better use of any().
3838cb93a386Sopenharmony_ci                case Op::add_f32:
3839cb93a386Sopenharmony_ci                    if (in_reg(x)) { a->vaddps(dst(x), r(x), any(y)); }
3840cb93a386Sopenharmony_ci                    else           { a->vaddps(dst(y), r(y), any(x)); }
3841cb93a386Sopenharmony_ci                                     break;
3842cb93a386Sopenharmony_ci
3843cb93a386Sopenharmony_ci                case Op::mul_f32:
3844cb93a386Sopenharmony_ci                    if (in_reg(x)) { a->vmulps(dst(x), r(x), any(y)); }
3845cb93a386Sopenharmony_ci                    else           { a->vmulps(dst(y), r(y), any(x)); }
3846cb93a386Sopenharmony_ci                                     break;
3847cb93a386Sopenharmony_ci
3848cb93a386Sopenharmony_ci                case Op::sub_f32: a->vsubps(dst(x), r(x), any(y)); break;
3849cb93a386Sopenharmony_ci                case Op::div_f32: a->vdivps(dst(x), r(x), any(y)); break;
3850cb93a386Sopenharmony_ci                case Op::min_f32: a->vminps(dst(y), r(y), any(x)); break;  // Order matters,
3851cb93a386Sopenharmony_ci                case Op::max_f32: a->vmaxps(dst(y), r(y), any(x)); break;  // see test SkVM_min_max.
3852cb93a386Sopenharmony_ci
3853cb93a386Sopenharmony_ci                case Op::fma_f32:
3854cb93a386Sopenharmony_ci                    if (try_alias(x)) { a->vfmadd132ps(dst(x), r(z), any(y)); } else
3855cb93a386Sopenharmony_ci                    if (try_alias(y)) { a->vfmadd213ps(dst(y), r(x), any(z)); } else
3856cb93a386Sopenharmony_ci                    if (try_alias(z)) { a->vfmadd231ps(dst(z), r(x), any(y)); } else
3857cb93a386Sopenharmony_ci                                      { a->vmovups    (dst(), any(x));
3858cb93a386Sopenharmony_ci                                        a->vfmadd132ps(dst(), r(z), any(y)); }
3859cb93a386Sopenharmony_ci                                        break;
3860cb93a386Sopenharmony_ci
3861cb93a386Sopenharmony_ci                case Op::fms_f32:
3862cb93a386Sopenharmony_ci                    if (try_alias(x)) { a->vfmsub132ps(dst(x), r(z), any(y)); } else
3863cb93a386Sopenharmony_ci                    if (try_alias(y)) { a->vfmsub213ps(dst(y), r(x), any(z)); } else
3864cb93a386Sopenharmony_ci                    if (try_alias(z)) { a->vfmsub231ps(dst(z), r(x), any(y)); } else
3865cb93a386Sopenharmony_ci                                      { a->vmovups    (dst(), any(x));
3866cb93a386Sopenharmony_ci                                        a->vfmsub132ps(dst(), r(z), any(y)); }
3867cb93a386Sopenharmony_ci                                        break;
3868cb93a386Sopenharmony_ci
3869cb93a386Sopenharmony_ci                case Op::fnma_f32:
3870cb93a386Sopenharmony_ci                    if (try_alias(x)) { a->vfnmadd132ps(dst(x), r(z), any(y)); } else
3871cb93a386Sopenharmony_ci                    if (try_alias(y)) { a->vfnmadd213ps(dst(y), r(x), any(z)); } else
3872cb93a386Sopenharmony_ci                    if (try_alias(z)) { a->vfnmadd231ps(dst(z), r(x), any(y)); } else
3873cb93a386Sopenharmony_ci                                      { a->vmovups     (dst(), any(x));
3874cb93a386Sopenharmony_ci                                        a->vfnmadd132ps(dst(), r(z), any(y)); }
3875cb93a386Sopenharmony_ci                                        break;
3876cb93a386Sopenharmony_ci
3877cb93a386Sopenharmony_ci                // In situations like this we want to try aliasing dst(x) when x is
3878cb93a386Sopenharmony_ci                // already in a register, but not if we'd have to load it from the stack
3879cb93a386Sopenharmony_ci                // just to alias it.  That's done better directly into the new register.
3880cb93a386Sopenharmony_ci                case Op::sqrt_f32:
3881cb93a386Sopenharmony_ci                    if (in_reg(x)) { a->vsqrtps(dst(x),  r(x)); }
3882cb93a386Sopenharmony_ci                    else           { a->vsqrtps(dst(), any(x)); }
3883cb93a386Sopenharmony_ci                                     break;
3884cb93a386Sopenharmony_ci
3885cb93a386Sopenharmony_ci                case Op::add_i32:
3886cb93a386Sopenharmony_ci                    if (in_reg(x)) { a->vpaddd(dst(x), r(x), any(y)); }
3887cb93a386Sopenharmony_ci                    else           { a->vpaddd(dst(y), r(y), any(x)); }
3888cb93a386Sopenharmony_ci                                     break;
3889cb93a386Sopenharmony_ci
3890cb93a386Sopenharmony_ci                case Op::mul_i32:
3891cb93a386Sopenharmony_ci                    if (in_reg(x)) { a->vpmulld(dst(x), r(x), any(y)); }
3892cb93a386Sopenharmony_ci                    else           { a->vpmulld(dst(y), r(y), any(x)); }
3893cb93a386Sopenharmony_ci                                     break;
3894cb93a386Sopenharmony_ci
3895cb93a386Sopenharmony_ci                case Op::sub_i32: a->vpsubd(dst(x), r(x), any(y)); break;
3896cb93a386Sopenharmony_ci
3897cb93a386Sopenharmony_ci                case Op::bit_and:
3898cb93a386Sopenharmony_ci                    if (in_reg(x)) { a->vpand(dst(x), r(x), any(y)); }
3899cb93a386Sopenharmony_ci                    else           { a->vpand(dst(y), r(y), any(x)); }
3900cb93a386Sopenharmony_ci                                     break;
3901cb93a386Sopenharmony_ci                case Op::bit_or:
3902cb93a386Sopenharmony_ci                    if (in_reg(x)) { a->vpor(dst(x), r(x), any(y)); }
3903cb93a386Sopenharmony_ci                    else           { a->vpor(dst(y), r(y), any(x)); }
3904cb93a386Sopenharmony_ci                                     break;
3905cb93a386Sopenharmony_ci                case Op::bit_xor:
3906cb93a386Sopenharmony_ci                    if (in_reg(x)) { a->vpxor(dst(x), r(x), any(y)); }
3907cb93a386Sopenharmony_ci                    else           { a->vpxor(dst(y), r(y), any(x)); }
3908cb93a386Sopenharmony_ci                                     break;
3909cb93a386Sopenharmony_ci
3910cb93a386Sopenharmony_ci                case Op::bit_clear: a->vpandn(dst(y), r(y), any(x)); break; // Notice, y then x.
3911cb93a386Sopenharmony_ci
3912cb93a386Sopenharmony_ci                case Op::select:
3913cb93a386Sopenharmony_ci                    if (try_alias(z)) { a->vpblendvb(dst(z), r(z), any(y), r(x)); }
3914cb93a386Sopenharmony_ci                    else              { a->vpblendvb(dst(x), r(z), any(y), r(x)); }
3915cb93a386Sopenharmony_ci                                        break;
3916cb93a386Sopenharmony_ci
3917cb93a386Sopenharmony_ci                case Op::shl_i32: a->vpslld(dst(x), r(x), immA); break;
3918cb93a386Sopenharmony_ci                case Op::shr_i32: a->vpsrld(dst(x), r(x), immA); break;
3919cb93a386Sopenharmony_ci                case Op::sra_i32: a->vpsrad(dst(x), r(x), immA); break;
3920cb93a386Sopenharmony_ci
3921cb93a386Sopenharmony_ci                case Op::eq_i32:
3922cb93a386Sopenharmony_ci                    if (in_reg(x)) { a->vpcmpeqd(dst(x), r(x), any(y)); }
3923cb93a386Sopenharmony_ci                    else           { a->vpcmpeqd(dst(y), r(y), any(x)); }
3924cb93a386Sopenharmony_ci                                     break;
3925cb93a386Sopenharmony_ci
3926cb93a386Sopenharmony_ci                case Op::gt_i32: a->vpcmpgtd(dst(), r(x), any(y)); break;
3927cb93a386Sopenharmony_ci
3928cb93a386Sopenharmony_ci                case Op::eq_f32:
3929cb93a386Sopenharmony_ci                    if (in_reg(x)) { a->vcmpeqps(dst(x), r(x), any(y)); }
3930cb93a386Sopenharmony_ci                    else           { a->vcmpeqps(dst(y), r(y), any(x)); }
3931cb93a386Sopenharmony_ci                                     break;
3932cb93a386Sopenharmony_ci                case Op::neq_f32:
3933cb93a386Sopenharmony_ci                    if (in_reg(x)) { a->vcmpneqps(dst(x), r(x), any(y)); }
3934cb93a386Sopenharmony_ci                    else           { a->vcmpneqps(dst(y), r(y), any(x)); }
3935cb93a386Sopenharmony_ci                                     break;
3936cb93a386Sopenharmony_ci
3937cb93a386Sopenharmony_ci                case Op:: gt_f32: a->vcmpltps (dst(y), r(y), any(x)); break;
3938cb93a386Sopenharmony_ci                case Op::gte_f32: a->vcmpleps (dst(y), r(y), any(x)); break;
3939cb93a386Sopenharmony_ci
3940cb93a386Sopenharmony_ci                case Op::ceil:
3941cb93a386Sopenharmony_ci                    if (in_reg(x)) { a->vroundps(dst(x),  r(x), Assembler::CEIL); }
3942cb93a386Sopenharmony_ci                    else           { a->vroundps(dst(), any(x), Assembler::CEIL); }
3943cb93a386Sopenharmony_ci                                     break;
3944cb93a386Sopenharmony_ci
3945cb93a386Sopenharmony_ci                case Op::floor:
3946cb93a386Sopenharmony_ci                    if (in_reg(x)) { a->vroundps(dst(x),  r(x), Assembler::FLOOR); }
3947cb93a386Sopenharmony_ci                    else           { a->vroundps(dst(), any(x), Assembler::FLOOR); }
3948cb93a386Sopenharmony_ci                                     break;
3949cb93a386Sopenharmony_ci
3950cb93a386Sopenharmony_ci                case Op::to_f32:
3951cb93a386Sopenharmony_ci                    if (in_reg(x)) { a->vcvtdq2ps(dst(x),  r(x)); }
3952cb93a386Sopenharmony_ci                    else           { a->vcvtdq2ps(dst(), any(x)); }
3953cb93a386Sopenharmony_ci                                     break;
3954cb93a386Sopenharmony_ci
3955cb93a386Sopenharmony_ci                case Op::trunc:
3956cb93a386Sopenharmony_ci                    if (in_reg(x)) { a->vcvttps2dq(dst(x),  r(x)); }
3957cb93a386Sopenharmony_ci                    else           { a->vcvttps2dq(dst(), any(x)); }
3958cb93a386Sopenharmony_ci                                     break;
3959cb93a386Sopenharmony_ci
3960cb93a386Sopenharmony_ci                case Op::round:
3961cb93a386Sopenharmony_ci                    if (in_reg(x)) { a->vcvtps2dq(dst(x),  r(x)); }
3962cb93a386Sopenharmony_ci                    else           { a->vcvtps2dq(dst(), any(x)); }
3963cb93a386Sopenharmony_ci                                     break;
3964cb93a386Sopenharmony_ci
3965cb93a386Sopenharmony_ci                case Op::to_fp16:
3966cb93a386Sopenharmony_ci                    a->vcvtps2ph(dst(x), r(x), A::CURRENT);  // f32 ymm -> f16 xmm
3967cb93a386Sopenharmony_ci                    a->vpmovzxwd(dst(), dst());              // f16 xmm -> f16 ymm
3968cb93a386Sopenharmony_ci                    break;
3969cb93a386Sopenharmony_ci
3970cb93a386Sopenharmony_ci                case Op::from_fp16:
3971cb93a386Sopenharmony_ci                    a->vpackusdw(dst(x), r(x), r(x));  // f16 ymm -> f16 xmm
3972cb93a386Sopenharmony_ci                    a->vpermq   (dst(), dst(), 0xd8);  // swap middle two 64-bit lanes
3973cb93a386Sopenharmony_ci                    a->vcvtph2ps(dst(), dst());        // f16 xmm -> f32 ymm
3974cb93a386Sopenharmony_ci                    break;
3975cb93a386Sopenharmony_ci
3976cb93a386Sopenharmony_ci            #elif defined(__aarch64__)
3977cb93a386Sopenharmony_ci                case Op::assert_true: {
3978cb93a386Sopenharmony_ci                    a->uminv4s(dst(), r(x));   // uminv acts like an all() across the vector.
3979cb93a386Sopenharmony_ci                    a->movs(GP0, dst(), 0);
3980cb93a386Sopenharmony_ci                    A::Label all_true;
3981cb93a386Sopenharmony_ci                    a->cbnz(GP0, &all_true);
3982cb93a386Sopenharmony_ci                    a->brk(0);
3983cb93a386Sopenharmony_ci                    a->label(&all_true);
3984cb93a386Sopenharmony_ci                } break;
3985cb93a386Sopenharmony_ci
3986cb93a386Sopenharmony_ci                case Op::trace_line:
3987cb93a386Sopenharmony_ci                case Op::trace_var:
3988cb93a386Sopenharmony_ci                case Op::trace_call:
3989cb93a386Sopenharmony_ci                    /* Only supported in the interpreter. */
3990cb93a386Sopenharmony_ci                    break;
3991cb93a386Sopenharmony_ci
3992cb93a386Sopenharmony_ci                case Op::index: {
3993cb93a386Sopenharmony_ci                    A::V tmp = alloc_tmp();
3994cb93a386Sopenharmony_ci                    a->ldrq (tmp, &iota);
3995cb93a386Sopenharmony_ci                    a->dup4s(dst(), N);
3996cb93a386Sopenharmony_ci                    a->sub4s(dst(), dst(), tmp);
3997cb93a386Sopenharmony_ci                    free_tmp(tmp);
3998cb93a386Sopenharmony_ci                } break;
3999cb93a386Sopenharmony_ci
4000cb93a386Sopenharmony_ci                case Op::store8: a->xtns2h(dst(x), r(x));
4001cb93a386Sopenharmony_ci                                 a->xtnh2b(dst(), dst());
4002cb93a386Sopenharmony_ci                   if (scalar) { a->strb  (dst(), arg[immA]); }
4003cb93a386Sopenharmony_ci                   else        { a->strs  (dst(), arg[immA]); }
4004cb93a386Sopenharmony_ci                                 break;
4005cb93a386Sopenharmony_ci
4006cb93a386Sopenharmony_ci                case Op::store16: a->xtns2h(dst(x), r(x));
4007cb93a386Sopenharmony_ci                    if (scalar) { a->strh  (dst(), arg[immA]); }
4008cb93a386Sopenharmony_ci                    else        { a->strd  (dst(), arg[immA]); }
4009cb93a386Sopenharmony_ci                                  break;
4010cb93a386Sopenharmony_ci
4011cb93a386Sopenharmony_ci                case Op::store32: if (scalar) { a->strs(r(x), arg[immA]); }
4012cb93a386Sopenharmony_ci                                  else        { a->strq(r(x), arg[immA]); }
4013cb93a386Sopenharmony_ci                                                break;
4014cb93a386Sopenharmony_ci
4015cb93a386Sopenharmony_ci                case Op::store64: if (scalar) {
4016cb93a386Sopenharmony_ci                                      a->strs(r(x), arg[immA], 0);
4017cb93a386Sopenharmony_ci                                      a->strs(r(y), arg[immA], 1);
4018cb93a386Sopenharmony_ci                                  } else if (r(y) == r(x)+1) {
4019cb93a386Sopenharmony_ci                                      a->st24s(r(x), arg[immA]);
4020cb93a386Sopenharmony_ci                                  } else {
4021cb93a386Sopenharmony_ci                                      Reg tmp0 = alloc_tmp(2),
4022cb93a386Sopenharmony_ci                                          tmp1 = (Reg)(tmp0+1);
4023cb93a386Sopenharmony_ci                                      a->orr16b(tmp0, r(x), r(x));
4024cb93a386Sopenharmony_ci                                      a->orr16b(tmp1, r(y), r(y));
4025cb93a386Sopenharmony_ci                                      a-> st24s(tmp0, arg[immA]);
4026cb93a386Sopenharmony_ci                                      free_tmp(tmp0);
4027cb93a386Sopenharmony_ci                                      free_tmp(tmp1);
4028cb93a386Sopenharmony_ci                                  } break;
4029cb93a386Sopenharmony_ci
4030cb93a386Sopenharmony_ci                case Op::store128:
4031cb93a386Sopenharmony_ci                    if (scalar) {
4032cb93a386Sopenharmony_ci                        a->strs(r(x), arg[immA], 0);
4033cb93a386Sopenharmony_ci                        a->strs(r(y), arg[immA], 1);
4034cb93a386Sopenharmony_ci                        a->strs(r(z), arg[immA], 2);
4035cb93a386Sopenharmony_ci                        a->strs(r(w), arg[immA], 3);
4036cb93a386Sopenharmony_ci                    } else if (r(y) == r(x)+1 &&
4037cb93a386Sopenharmony_ci                               r(z) == r(x)+2 &&
4038cb93a386Sopenharmony_ci                               r(w) == r(x)+3) {
4039cb93a386Sopenharmony_ci                        a->st44s(r(x), arg[immA]);
4040cb93a386Sopenharmony_ci                    } else {
4041cb93a386Sopenharmony_ci                        Reg tmp0 = alloc_tmp(4),
4042cb93a386Sopenharmony_ci                            tmp1 = (Reg)(tmp0+1),
4043cb93a386Sopenharmony_ci                            tmp2 = (Reg)(tmp0+2),
4044cb93a386Sopenharmony_ci                            tmp3 = (Reg)(tmp0+3);
4045cb93a386Sopenharmony_ci                        a->orr16b(tmp0, r(x), r(x));
4046cb93a386Sopenharmony_ci                        a->orr16b(tmp1, r(y), r(y));
4047cb93a386Sopenharmony_ci                        a->orr16b(tmp2, r(z), r(z));
4048cb93a386Sopenharmony_ci                        a->orr16b(tmp3, r(w), r(w));
4049cb93a386Sopenharmony_ci                        a-> st44s(tmp0, arg[immA]);
4050cb93a386Sopenharmony_ci                        free_tmp(tmp0);
4051cb93a386Sopenharmony_ci                        free_tmp(tmp1);
4052cb93a386Sopenharmony_ci                        free_tmp(tmp2);
4053cb93a386Sopenharmony_ci                        free_tmp(tmp3);
4054cb93a386Sopenharmony_ci                    } break;
4055cb93a386Sopenharmony_ci
4056cb93a386Sopenharmony_ci
4057cb93a386Sopenharmony_ci                case Op::load8: if (scalar) { a->ldrb(dst(), arg[immA]); }
4058cb93a386Sopenharmony_ci                                else        { a->ldrs(dst(), arg[immA]); }
4059cb93a386Sopenharmony_ci                                              a->uxtlb2h(dst(), dst());
4060cb93a386Sopenharmony_ci                                              a->uxtlh2s(dst(), dst());
4061cb93a386Sopenharmony_ci                                              break;
4062cb93a386Sopenharmony_ci
4063cb93a386Sopenharmony_ci                case Op::load16: if (scalar) { a->ldrh(dst(), arg[immA]); }
4064cb93a386Sopenharmony_ci                                 else        { a->ldrd(dst(), arg[immA]); }
4065cb93a386Sopenharmony_ci                                               a->uxtlh2s(dst(), dst());
4066cb93a386Sopenharmony_ci                                               break;
4067cb93a386Sopenharmony_ci
4068cb93a386Sopenharmony_ci                case Op::load32: if (scalar) { a->ldrs(dst(), arg[immA]); }
4069cb93a386Sopenharmony_ci                                 else        { a->ldrq(dst(), arg[immA]); }
4070cb93a386Sopenharmony_ci                                               break;
4071cb93a386Sopenharmony_ci
4072cb93a386Sopenharmony_ci                case Op::load64: if (scalar) {
4073cb93a386Sopenharmony_ci                                    a->ldrs(dst(), arg[immA], immB);
4074cb93a386Sopenharmony_ci                                 } else {
4075cb93a386Sopenharmony_ci                                    Reg tmp0 = alloc_tmp(2),
4076cb93a386Sopenharmony_ci                                        tmp1 = (Reg)(tmp0+1);
4077cb93a386Sopenharmony_ci                                    a->ld24s(tmp0, arg[immA]);
4078cb93a386Sopenharmony_ci                                    // TODO: return both
4079cb93a386Sopenharmony_ci                                    switch (immB) {
4080cb93a386Sopenharmony_ci                                        case 0: mark_tmp_as_dst(tmp0); free_tmp(tmp1); break;
4081cb93a386Sopenharmony_ci                                        case 1: mark_tmp_as_dst(tmp1); free_tmp(tmp0); break;
4082cb93a386Sopenharmony_ci                                    }
4083cb93a386Sopenharmony_ci                                 } break;
4084cb93a386Sopenharmony_ci
4085cb93a386Sopenharmony_ci                case Op::load128: if (scalar) {
4086cb93a386Sopenharmony_ci                                      a->ldrs(dst(), arg[immA], immB);
4087cb93a386Sopenharmony_ci                                  } else {
4088cb93a386Sopenharmony_ci                                      Reg tmp0 = alloc_tmp(4),
4089cb93a386Sopenharmony_ci                                          tmp1 = (Reg)(tmp0+1),
4090cb93a386Sopenharmony_ci                                          tmp2 = (Reg)(tmp0+2),
4091cb93a386Sopenharmony_ci                                          tmp3 = (Reg)(tmp0+3);
4092cb93a386Sopenharmony_ci                                      a->ld44s(tmp0, arg[immA]);
4093cb93a386Sopenharmony_ci                                      // TODO: return all four
4094cb93a386Sopenharmony_ci                                      switch (immB) {
4095cb93a386Sopenharmony_ci                                          case 0: mark_tmp_as_dst(tmp0); break;
4096cb93a386Sopenharmony_ci                                          case 1: mark_tmp_as_dst(tmp1); break;
4097cb93a386Sopenharmony_ci                                          case 2: mark_tmp_as_dst(tmp2); break;
4098cb93a386Sopenharmony_ci                                          case 3: mark_tmp_as_dst(tmp3); break;
4099cb93a386Sopenharmony_ci                                      }
4100cb93a386Sopenharmony_ci                                      if (immB != 0) { free_tmp(tmp0); }
4101cb93a386Sopenharmony_ci                                      if (immB != 1) { free_tmp(tmp1); }
4102cb93a386Sopenharmony_ci                                      if (immB != 2) { free_tmp(tmp2); }
4103cb93a386Sopenharmony_ci                                      if (immB != 3) { free_tmp(tmp3); }
4104cb93a386Sopenharmony_ci                                  } break;
4105cb93a386Sopenharmony_ci
4106cb93a386Sopenharmony_ci                case Op::uniform32: a->add(GP0, arg[immA], immB);
4107cb93a386Sopenharmony_ci                                    a->ld1r4s(dst(), GP0);
4108cb93a386Sopenharmony_ci                                    break;
4109cb93a386Sopenharmony_ci
4110cb93a386Sopenharmony_ci                case Op::array32: a->add(GP0, arg[immA], immB);
4111cb93a386Sopenharmony_ci                                  a->ldrd(GP0, GP0);
4112cb93a386Sopenharmony_ci                                  a->add(GP0, GP0, immC);
4113cb93a386Sopenharmony_ci                                  a->ld1r4s(dst(), GP0);
4114cb93a386Sopenharmony_ci                                  break;
4115cb93a386Sopenharmony_ci
4116cb93a386Sopenharmony_ci                case Op::gather8: {
4117cb93a386Sopenharmony_ci                    // As usual, the gather base pointer is immB bytes off of uniform immA.
4118cb93a386Sopenharmony_ci                    a->add (GP0, arg[immA], immB);  // GP0 = &(gather base pointer)
4119cb93a386Sopenharmony_ci                    a->ldrd(GP0, GP0);              // GP0 =   gather base pointer
4120cb93a386Sopenharmony_ci
4121cb93a386Sopenharmony_ci                    for (int i = 0; i < active_lanes; i++) {
4122cb93a386Sopenharmony_ci                        a->movs(GP1, r(x), i);    // Extract index lane i into GP1.
4123cb93a386Sopenharmony_ci                        a->add (GP1, GP0, GP1);   // Add the gather base pointer.
4124cb93a386Sopenharmony_ci                        a->ldrb(GP1, GP1);        // Load that byte.
4125cb93a386Sopenharmony_ci                        a->inss(dst(x), GP1, i);  // Insert it into dst() lane i.
4126cb93a386Sopenharmony_ci                    }
4127cb93a386Sopenharmony_ci                } break;
4128cb93a386Sopenharmony_ci
4129cb93a386Sopenharmony_ci                // See gather8 for general idea; comments here only where gather16 differs.
4130cb93a386Sopenharmony_ci                case Op::gather16: {
4131cb93a386Sopenharmony_ci                    a->add (GP0, arg[immA], immB);
4132cb93a386Sopenharmony_ci                    a->ldrd(GP0, GP0);
4133cb93a386Sopenharmony_ci                    for (int i = 0; i < active_lanes; i++) {
4134cb93a386Sopenharmony_ci                        a->movs(GP1, r(x), i);
4135cb93a386Sopenharmony_ci                        a->add (GP1, GP0, GP1, A::LSL, 1);  // Scale index 2x into a byte offset.
4136cb93a386Sopenharmony_ci                        a->ldrh(GP1, GP1);                  // 2-byte load.
4137cb93a386Sopenharmony_ci                        a->inss(dst(x), GP1, i);
4138cb93a386Sopenharmony_ci                    }
4139cb93a386Sopenharmony_ci                } break;
4140cb93a386Sopenharmony_ci
4141cb93a386Sopenharmony_ci                // See gather8 for general idea; comments here only where gather32 differs.
4142cb93a386Sopenharmony_ci                case Op::gather32: {
4143cb93a386Sopenharmony_ci                    a->add (GP0, arg[immA], immB);
4144cb93a386Sopenharmony_ci                    a->ldrd(GP0, GP0);
4145cb93a386Sopenharmony_ci                    for (int i = 0; i < active_lanes; i++) {
4146cb93a386Sopenharmony_ci                        a->movs(GP1, r(x), i);
4147cb93a386Sopenharmony_ci                        a->add (GP1, GP0, GP1, A::LSL, 2);  // Scale index 4x into a byte offset.
4148cb93a386Sopenharmony_ci                        a->ldrs(GP1, GP1);                  // 4-byte load.
4149cb93a386Sopenharmony_ci                        a->inss(dst(x), GP1, i);
4150cb93a386Sopenharmony_ci                    }
4151cb93a386Sopenharmony_ci                } break;
4152cb93a386Sopenharmony_ci
4153cb93a386Sopenharmony_ci                case Op::add_f32: a->fadd4s(dst(x,y), r(x), r(y)); break;
4154cb93a386Sopenharmony_ci                case Op::sub_f32: a->fsub4s(dst(x,y), r(x), r(y)); break;
4155cb93a386Sopenharmony_ci                case Op::mul_f32: a->fmul4s(dst(x,y), r(x), r(y)); break;
4156cb93a386Sopenharmony_ci                case Op::div_f32: a->fdiv4s(dst(x,y), r(x), r(y)); break;
4157cb93a386Sopenharmony_ci
4158cb93a386Sopenharmony_ci                case Op::sqrt_f32: a->fsqrt4s(dst(x), r(x)); break;
4159cb93a386Sopenharmony_ci
4160cb93a386Sopenharmony_ci                case Op::fma_f32: // fmla.4s is z += x*y
4161cb93a386Sopenharmony_ci                    if (try_alias(z)) { a->fmla4s( r(z), r(x), r(y)); }
4162cb93a386Sopenharmony_ci                    else              { a->orr16b(dst(), r(z), r(z));
4163cb93a386Sopenharmony_ci                                        a->fmla4s(dst(), r(x), r(y)); }
4164cb93a386Sopenharmony_ci                                        break;
4165cb93a386Sopenharmony_ci
4166cb93a386Sopenharmony_ci                case Op::fnma_f32:  // fmls.4s is z -= x*y
4167cb93a386Sopenharmony_ci                    if (try_alias(z)) { a->fmls4s( r(z), r(x), r(y)); }
4168cb93a386Sopenharmony_ci                    else              { a->orr16b(dst(), r(z), r(z));
4169cb93a386Sopenharmony_ci                                        a->fmls4s(dst(), r(x), r(y)); }
4170cb93a386Sopenharmony_ci                                        break;
4171cb93a386Sopenharmony_ci
4172cb93a386Sopenharmony_ci                case Op::fms_f32:   // calculate z - xy, then negate to xy - z
4173cb93a386Sopenharmony_ci                    if (try_alias(z)) { a->fmls4s( r(z), r(x), r(y)); }
4174cb93a386Sopenharmony_ci                    else              { a->orr16b(dst(), r(z), r(z));
4175cb93a386Sopenharmony_ci                                        a->fmls4s(dst(), r(x), r(y)); }
4176cb93a386Sopenharmony_ci                                        a->fneg4s(dst(), dst());
4177cb93a386Sopenharmony_ci                                        break;
4178cb93a386Sopenharmony_ci
4179cb93a386Sopenharmony_ci                case Op:: gt_f32: a->fcmgt4s (dst(x,y), r(x), r(y)); break;
4180cb93a386Sopenharmony_ci                case Op::gte_f32: a->fcmge4s (dst(x,y), r(x), r(y)); break;
4181cb93a386Sopenharmony_ci                case Op:: eq_f32: a->fcmeq4s (dst(x,y), r(x), r(y)); break;
4182cb93a386Sopenharmony_ci                case Op::neq_f32: a->fcmeq4s (dst(x,y), r(x), r(y));
4183cb93a386Sopenharmony_ci                                  a->not16b  (dst(), dst());         break;
4184cb93a386Sopenharmony_ci
4185cb93a386Sopenharmony_ci
4186cb93a386Sopenharmony_ci                case Op::add_i32: a->add4s(dst(x,y), r(x), r(y)); break;
4187cb93a386Sopenharmony_ci                case Op::sub_i32: a->sub4s(dst(x,y), r(x), r(y)); break;
4188cb93a386Sopenharmony_ci                case Op::mul_i32: a->mul4s(dst(x,y), r(x), r(y)); break;
4189cb93a386Sopenharmony_ci
4190cb93a386Sopenharmony_ci                case Op::bit_and  : a->and16b(dst(x,y), r(x), r(y)); break;
4191cb93a386Sopenharmony_ci                case Op::bit_or   : a->orr16b(dst(x,y), r(x), r(y)); break;
4192cb93a386Sopenharmony_ci                case Op::bit_xor  : a->eor16b(dst(x,y), r(x), r(y)); break;
4193cb93a386Sopenharmony_ci                case Op::bit_clear: a->bic16b(dst(x,y), r(x), r(y)); break;
4194cb93a386Sopenharmony_ci
4195cb93a386Sopenharmony_ci                case Op::select: // bsl16b is x = x ? y : z
4196cb93a386Sopenharmony_ci                    if (try_alias(x)) { a->bsl16b( r(x), r(y), r(z)); }
4197cb93a386Sopenharmony_ci                    else              { a->orr16b(dst(), r(x), r(x));
4198cb93a386Sopenharmony_ci                                        a->bsl16b(dst(), r(y), r(z)); }
4199cb93a386Sopenharmony_ci                                        break;
4200cb93a386Sopenharmony_ci
4201cb93a386Sopenharmony_ci                // fmin4s and fmax4s don't work the way we want with NaN,
4202cb93a386Sopenharmony_ci                // so we write them the long way:
4203cb93a386Sopenharmony_ci                case Op::min_f32: // min(x,y) = y<x ? y : x
4204cb93a386Sopenharmony_ci                                  a->fcmgt4s(dst(), r(x), r(y));
4205cb93a386Sopenharmony_ci                                  a->bsl16b (dst(), r(y), r(x));
4206cb93a386Sopenharmony_ci                                  break;
4207cb93a386Sopenharmony_ci
4208cb93a386Sopenharmony_ci                case Op::max_f32: // max(x,y) = x<y ? y : x
4209cb93a386Sopenharmony_ci                                  a->fcmgt4s(dst(), r(y), r(x));
4210cb93a386Sopenharmony_ci                                  a->bsl16b (dst(), r(y), r(x));
4211cb93a386Sopenharmony_ci                                  break;
4212cb93a386Sopenharmony_ci
4213cb93a386Sopenharmony_ci                case Op::shl_i32: a-> shl4s(dst(x), r(x), immA); break;
4214cb93a386Sopenharmony_ci                case Op::shr_i32: a->ushr4s(dst(x), r(x), immA); break;
4215cb93a386Sopenharmony_ci                case Op::sra_i32: a->sshr4s(dst(x), r(x), immA); break;
4216cb93a386Sopenharmony_ci
4217cb93a386Sopenharmony_ci                case Op::eq_i32: a->cmeq4s(dst(x,y), r(x), r(y)); break;
4218cb93a386Sopenharmony_ci                case Op::gt_i32: a->cmgt4s(dst(x,y), r(x), r(y)); break;
4219cb93a386Sopenharmony_ci
4220cb93a386Sopenharmony_ci                case Op::to_f32: a->scvtf4s (dst(x), r(x)); break;
4221cb93a386Sopenharmony_ci                case Op::trunc:  a->fcvtzs4s(dst(x), r(x)); break;
4222cb93a386Sopenharmony_ci                case Op::round:  a->fcvtns4s(dst(x), r(x)); break;
4223cb93a386Sopenharmony_ci                case Op::ceil:   a->frintp4s(dst(x), r(x)); break;
4224cb93a386Sopenharmony_ci                case Op::floor:  a->frintm4s(dst(x), r(x)); break;
4225cb93a386Sopenharmony_ci
4226cb93a386Sopenharmony_ci                case Op::to_fp16:
4227cb93a386Sopenharmony_ci                    a->fcvtn  (dst(x), r(x));    // 4x f32 -> 4x f16 in bottom four lanes
4228cb93a386Sopenharmony_ci                    a->uxtlh2s(dst(), dst());    // expand to 4x f16 in even 16-bit lanes
4229cb93a386Sopenharmony_ci                    break;
4230cb93a386Sopenharmony_ci
4231cb93a386Sopenharmony_ci                case Op::from_fp16:
4232cb93a386Sopenharmony_ci                    a->xtns2h(dst(x), r(x));     // pack even 16-bit lanes into bottom four lanes
4233cb93a386Sopenharmony_ci                    a->fcvtl (dst(), dst());     // 4x f16 -> 4x f32
4234cb93a386Sopenharmony_ci                    break;
4235cb93a386Sopenharmony_ci            #endif
4236cb93a386Sopenharmony_ci            }
4237cb93a386Sopenharmony_ci
4238cb93a386Sopenharmony_ci            // Proactively free the registers holding any value that dies here.
4239cb93a386Sopenharmony_ci            if (rd != NA &&                   dies_here(regs[rd])) { regs[rd] = NA; }
4240cb93a386Sopenharmony_ci            if (rx != NA && regs[rx] != NA && dies_here(regs[rx])) { regs[rx] = NA; }
4241cb93a386Sopenharmony_ci            if (ry != NA && regs[ry] != NA && dies_here(regs[ry])) { regs[ry] = NA; }
4242cb93a386Sopenharmony_ci            if (rz != NA && regs[rz] != NA && dies_here(regs[rz])) { regs[rz] = NA; }
4243cb93a386Sopenharmony_ci            if (rw != NA && regs[rw] != NA && dies_here(regs[rw])) { regs[rw] = NA; }
4244cb93a386Sopenharmony_ci            return true;
4245cb93a386Sopenharmony_ci        };
4246cb93a386Sopenharmony_ci
4247cb93a386Sopenharmony_ci        #if defined(__x86_64__) || defined(_M_X64)
4248cb93a386Sopenharmony_ci            auto jump_if_less = [&](A::Label* l) { a->jl (l); };
4249cb93a386Sopenharmony_ci            auto jump         = [&](A::Label* l) { a->jmp(l); };
4250cb93a386Sopenharmony_ci
4251cb93a386Sopenharmony_ci            auto add = [&](A::GP64 gp, int imm) { a->add(gp, imm); };
4252cb93a386Sopenharmony_ci            auto sub = [&](A::GP64 gp, int imm) { a->sub(gp, imm); };
4253cb93a386Sopenharmony_ci        #elif defined(__aarch64__)
4254cb93a386Sopenharmony_ci            auto jump_if_less = [&](A::Label* l) { a->blt(l); };
4255cb93a386Sopenharmony_ci            auto jump         = [&](A::Label* l) { a->b  (l); };
4256cb93a386Sopenharmony_ci
4257cb93a386Sopenharmony_ci            auto add = [&](A::X gp, int imm) { a->add(gp, gp, imm); };
4258cb93a386Sopenharmony_ci            auto sub = [&](A::X gp, int imm) { a->sub(gp, gp, imm); };
4259cb93a386Sopenharmony_ci        #endif
4260cb93a386Sopenharmony_ci
4261cb93a386Sopenharmony_ci        A::Label body,
4262cb93a386Sopenharmony_ci                 tail,
4263cb93a386Sopenharmony_ci                 done;
4264cb93a386Sopenharmony_ci
4265cb93a386Sopenharmony_ci        enter();
4266cb93a386Sopenharmony_ci        for (Val id = 0; id < (Val)instructions.size(); id++) {
4267cb93a386Sopenharmony_ci            if (instructions[id].can_hoist && !emit(id, /*scalar=*/false)) {
4268cb93a386Sopenharmony_ci                return false;
4269cb93a386Sopenharmony_ci            }
4270cb93a386Sopenharmony_ci        }
4271cb93a386Sopenharmony_ci
4272cb93a386Sopenharmony_ci        // This point marks a kind of canonical fixed point for register contents: if loop
4273cb93a386Sopenharmony_ci        // code is generated as if these registers are holding these values, the next time
4274cb93a386Sopenharmony_ci        // the loop comes around we'd better find those same registers holding those same values.
4275cb93a386Sopenharmony_ci        auto restore_incoming_regs = [&,incoming=regs,saved_stack_slot=stack_slot,
4276cb93a386Sopenharmony_ci                                      saved_next_stack_slot=next_stack_slot]{
4277cb93a386Sopenharmony_ci            for (int r = 0; r < (int)regs.size(); r++) {
4278cb93a386Sopenharmony_ci                if (regs[r] != incoming[r]) {
4279cb93a386Sopenharmony_ci                    regs[r]  = incoming[r];
4280cb93a386Sopenharmony_ci                    if (regs[r] >= 0) {
4281cb93a386Sopenharmony_ci                        load_from_memory((Reg)r, regs[r]);
4282cb93a386Sopenharmony_ci                    }
4283cb93a386Sopenharmony_ci                }
4284cb93a386Sopenharmony_ci            }
4285cb93a386Sopenharmony_ci            *stack_hint = std::max(*stack_hint, next_stack_slot);
4286cb93a386Sopenharmony_ci            stack_slot = saved_stack_slot;
4287cb93a386Sopenharmony_ci            next_stack_slot = saved_next_stack_slot;
4288cb93a386Sopenharmony_ci        };
4289cb93a386Sopenharmony_ci
4290cb93a386Sopenharmony_ci        a->label(&body);
4291cb93a386Sopenharmony_ci        {
4292cb93a386Sopenharmony_ci            a->cmp(N, K);
4293cb93a386Sopenharmony_ci            jump_if_less(&tail);
4294cb93a386Sopenharmony_ci            for (Val id = 0; id < (Val)instructions.size(); id++) {
4295cb93a386Sopenharmony_ci                if (!instructions[id].can_hoist && !emit(id, /*scalar=*/false)) {
4296cb93a386Sopenharmony_ci                    return false;
4297cb93a386Sopenharmony_ci                }
4298cb93a386Sopenharmony_ci            }
4299cb93a386Sopenharmony_ci            restore_incoming_regs();
4300cb93a386Sopenharmony_ci            for (int i = 0; i < (int)fImpl->strides.size(); i++) {
4301cb93a386Sopenharmony_ci                if (fImpl->strides[i]) {
4302cb93a386Sopenharmony_ci                    add(arg[i], K*fImpl->strides[i]);
4303cb93a386Sopenharmony_ci                }
4304cb93a386Sopenharmony_ci            }
4305cb93a386Sopenharmony_ci            sub(N, K);
4306cb93a386Sopenharmony_ci            jump(&body);
4307cb93a386Sopenharmony_ci        }
4308cb93a386Sopenharmony_ci
4309cb93a386Sopenharmony_ci        a->label(&tail);
4310cb93a386Sopenharmony_ci        {
4311cb93a386Sopenharmony_ci            a->cmp(N, 1);
4312cb93a386Sopenharmony_ci            jump_if_less(&done);
4313cb93a386Sopenharmony_ci            for (Val id = 0; id < (Val)instructions.size(); id++) {
4314cb93a386Sopenharmony_ci                if (!instructions[id].can_hoist && !emit(id, /*scalar=*/true)) {
4315cb93a386Sopenharmony_ci                    return false;
4316cb93a386Sopenharmony_ci                }
4317cb93a386Sopenharmony_ci            }
4318cb93a386Sopenharmony_ci            restore_incoming_regs();
4319cb93a386Sopenharmony_ci            for (int i = 0; i < (int)fImpl->strides.size(); i++) {
4320cb93a386Sopenharmony_ci                if (fImpl->strides[i]) {
4321cb93a386Sopenharmony_ci                    add(arg[i], 1*fImpl->strides[i]);
4322cb93a386Sopenharmony_ci                }
4323cb93a386Sopenharmony_ci            }
4324cb93a386Sopenharmony_ci            sub(N, 1);
4325cb93a386Sopenharmony_ci            jump(&tail);
4326cb93a386Sopenharmony_ci        }
4327cb93a386Sopenharmony_ci
4328cb93a386Sopenharmony_ci        a->label(&done);
4329cb93a386Sopenharmony_ci        {
4330cb93a386Sopenharmony_ci            exit();
4331cb93a386Sopenharmony_ci        }
4332cb93a386Sopenharmony_ci
4333cb93a386Sopenharmony_ci        // Except for explicit aligned load and store instructions, AVX allows
4334cb93a386Sopenharmony_ci        // memory operands to be unaligned.  So even though we're creating 16
4335cb93a386Sopenharmony_ci        // byte patterns on ARM or 32-byte patterns on x86, we only need to
4336cb93a386Sopenharmony_ci        // align to 4 bytes, the element size and alignment requirement.
4337cb93a386Sopenharmony_ci
4338cb93a386Sopenharmony_ci        constants.foreach([&](int imm, A::Label* label) {
4339cb93a386Sopenharmony_ci            a->align(4);
4340cb93a386Sopenharmony_ci            a->label(label);
4341cb93a386Sopenharmony_ci            for (int i = 0; i < K; i++) {
4342cb93a386Sopenharmony_ci                a->word(imm);
4343cb93a386Sopenharmony_ci            }
4344cb93a386Sopenharmony_ci        });
4345cb93a386Sopenharmony_ci
4346cb93a386Sopenharmony_ci        if (!iota.references.empty()) {
4347cb93a386Sopenharmony_ci            a->align(4);
4348cb93a386Sopenharmony_ci            a->label(&iota);        // 0,1,2,3,4,...
4349cb93a386Sopenharmony_ci            for (int i = 0; i < K; i++) {
4350cb93a386Sopenharmony_ci                a->word(i);
4351cb93a386Sopenharmony_ci            }
4352cb93a386Sopenharmony_ci        }
4353cb93a386Sopenharmony_ci
4354cb93a386Sopenharmony_ci        if (!load64_index.references.empty()) {
4355cb93a386Sopenharmony_ci            a->align(4);
4356cb93a386Sopenharmony_ci            a->label(&load64_index);  // {0,2,4,6|1,3,5,7}
4357cb93a386Sopenharmony_ci            a->word(0); a->word(2); a->word(4); a->word(6);
4358cb93a386Sopenharmony_ci            a->word(1); a->word(3); a->word(5); a->word(7);
4359cb93a386Sopenharmony_ci        }
4360cb93a386Sopenharmony_ci
4361cb93a386Sopenharmony_ci        return true;
4362cb93a386Sopenharmony_ci    }
4363cb93a386Sopenharmony_ci
4364cb93a386Sopenharmony_ci    void Program::setupJIT(const std::vector<OptimizedInstruction>& instructions,
4365cb93a386Sopenharmony_ci                           const char* debug_name) {
4366cb93a386Sopenharmony_ci        // Assemble with no buffer to determine a.size() (the number of bytes we'll assemble)
4367cb93a386Sopenharmony_ci        // and stack_hint/registers_used to feed forward into the next jit() call.
4368cb93a386Sopenharmony_ci        Assembler a{nullptr};
4369cb93a386Sopenharmony_ci        int stack_hint = -1;
4370cb93a386Sopenharmony_ci        uint32_t registers_used = 0xffff'ffff;  // Start conservatively with all.
4371cb93a386Sopenharmony_ci        if (!this->jit(instructions, &stack_hint, &registers_used, &a)) {
4372cb93a386Sopenharmony_ci            return;
4373cb93a386Sopenharmony_ci        }
4374cb93a386Sopenharmony_ci
4375cb93a386Sopenharmony_ci        fImpl->jit_size = a.size();
4376cb93a386Sopenharmony_ci        void* jit_entry = alloc_jit_buffer(&fImpl->jit_size);
4377cb93a386Sopenharmony_ci        fImpl->jit_entry.store(jit_entry);
4378cb93a386Sopenharmony_ci
4379cb93a386Sopenharmony_ci        // Assemble the program for real with stack_hint/registers_used as feedback from first call.
4380cb93a386Sopenharmony_ci        a = Assembler{jit_entry};
4381cb93a386Sopenharmony_ci        SkAssertResult(this->jit(instructions, &stack_hint, &registers_used, &a));
4382cb93a386Sopenharmony_ci        SkASSERT(a.size() <= fImpl->jit_size);
4383cb93a386Sopenharmony_ci
4384cb93a386Sopenharmony_ci        // Remap as executable, and flush caches on platforms that need that.
4385cb93a386Sopenharmony_ci        remap_as_executable(jit_entry, fImpl->jit_size);
4386cb93a386Sopenharmony_ci
4387cb93a386Sopenharmony_ci        notify_vtune(debug_name, jit_entry, fImpl->jit_size);
4388cb93a386Sopenharmony_ci
4389cb93a386Sopenharmony_ci    #if !defined(SK_BUILD_FOR_WIN)
4390cb93a386Sopenharmony_ci        // For profiling and debugging, it's helpful to have this code loaded
4391cb93a386Sopenharmony_ci        // dynamically rather than just jumping info fImpl->jit_entry.
4392cb93a386Sopenharmony_ci        if (gSkVMJITViaDylib) {
4393cb93a386Sopenharmony_ci            // Dump the raw program binary.
4394cb93a386Sopenharmony_ci            SkString path = SkStringPrintf("/tmp/%s.XXXXXX", debug_name);
4395cb93a386Sopenharmony_ci            int fd = mkstemp(path.writable_str());
4396cb93a386Sopenharmony_ci            ::write(fd, jit_entry, a.size());
4397cb93a386Sopenharmony_ci            close(fd);
4398cb93a386Sopenharmony_ci
4399cb93a386Sopenharmony_ci            this->dropJIT();  // (unmap and null out fImpl->jit_entry.)
4400cb93a386Sopenharmony_ci
4401cb93a386Sopenharmony_ci            // Convert it in-place to a dynamic library with a single symbol "skvm_jit":
4402cb93a386Sopenharmony_ci            SkString cmd = SkStringPrintf(
4403cb93a386Sopenharmony_ci                    "echo '.global _skvm_jit\n_skvm_jit: .incbin \"%s\"'"
4404cb93a386Sopenharmony_ci                    " | clang -x assembler -shared - -o %s",
4405cb93a386Sopenharmony_ci                    path.c_str(), path.c_str());
4406cb93a386Sopenharmony_ci            system(cmd.c_str());
4407cb93a386Sopenharmony_ci
4408cb93a386Sopenharmony_ci            // Load that dynamic library and look up skvm_jit().
4409cb93a386Sopenharmony_ci            fImpl->dylib = dlopen(path.c_str(), RTLD_NOW|RTLD_LOCAL);
4410cb93a386Sopenharmony_ci            void* sym = nullptr;
4411cb93a386Sopenharmony_ci            for (const char* name : {"skvm_jit", "_skvm_jit"} ) {
4412cb93a386Sopenharmony_ci                if (!sym) { sym = dlsym(fImpl->dylib, name); }
4413cb93a386Sopenharmony_ci            }
4414cb93a386Sopenharmony_ci            fImpl->jit_entry.store(sym);
4415cb93a386Sopenharmony_ci        }
4416cb93a386Sopenharmony_ci    #endif
4417cb93a386Sopenharmony_ci    }
4418cb93a386Sopenharmony_ci
4419cb93a386Sopenharmony_ci    void Program::disassemble(SkWStream* o) const {
4420cb93a386Sopenharmony_ci    #if !defined(SK_BUILD_FOR_WIN)
4421cb93a386Sopenharmony_ci        SkDebugfStream debug;
4422cb93a386Sopenharmony_ci        if (!o) { o = &debug; }
4423cb93a386Sopenharmony_ci
4424cb93a386Sopenharmony_ci        const void* jit_entry = fImpl->jit_entry.load();
4425cb93a386Sopenharmony_ci        size_t jit_size = fImpl->jit_size;
4426cb93a386Sopenharmony_ci
4427cb93a386Sopenharmony_ci        if (!jit_entry) {
4428cb93a386Sopenharmony_ci            o->writeText("Program not JIT'd. Did you pass --jit?\n");
4429cb93a386Sopenharmony_ci            return;
4430cb93a386Sopenharmony_ci        }
4431cb93a386Sopenharmony_ci
4432cb93a386Sopenharmony_ci        char path[] = "/tmp/skvm-jit.XXXXXX";
4433cb93a386Sopenharmony_ci        int fd = mkstemp(path);
4434cb93a386Sopenharmony_ci        ::write(fd, jit_entry, jit_size);
4435cb93a386Sopenharmony_ci        close(fd);
4436cb93a386Sopenharmony_ci
4437cb93a386Sopenharmony_ci        // Convert it in-place to a dynamic library with a single symbol "skvm_jit":
4438cb93a386Sopenharmony_ci        SkString cmd = SkStringPrintf(
4439cb93a386Sopenharmony_ci                "echo '.global _skvm_jit\n_skvm_jit: .incbin \"%s\"'"
4440cb93a386Sopenharmony_ci                " | clang -x assembler -shared - -o %s",
4441cb93a386Sopenharmony_ci                path, path);
4442cb93a386Sopenharmony_ci        system(cmd.c_str());
4443cb93a386Sopenharmony_ci
4444cb93a386Sopenharmony_ci        // Now objdump to disassemble our function:
4445cb93a386Sopenharmony_ci        // TODO: We could trim this down to just our code using '--disassemble=<symbol name>`,
4446cb93a386Sopenharmony_ci        // but the symbol name varies with OS, and that option may be missing from objdump on some
4447cb93a386Sopenharmony_ci        // machines? There also apears to be quite a bit of junk after the end of the JIT'd code.
4448cb93a386Sopenharmony_ci        // Trimming that would let us pass '--visualize-jumps' and get the loop annotated.
4449cb93a386Sopenharmony_ci        // With the junk, we tend to end up with a bunch of stray jumps that pollute the ASCII art.
4450cb93a386Sopenharmony_ci        cmd = SkStringPrintf("objdump -D %s", path);
4451cb93a386Sopenharmony_ci    #if defined(SK_BUILD_FOR_UNIX)
4452cb93a386Sopenharmony_ci        cmd.append(" --section=.text");
4453cb93a386Sopenharmony_ci    #endif
4454cb93a386Sopenharmony_ci        FILE* fp = popen(cmd.c_str(), "r");
4455cb93a386Sopenharmony_ci        if (!fp) {
4456cb93a386Sopenharmony_ci            o->writeText("objdump failed\n");
4457cb93a386Sopenharmony_ci            return;
4458cb93a386Sopenharmony_ci        }
4459cb93a386Sopenharmony_ci
4460cb93a386Sopenharmony_ci        char line[1024];
4461cb93a386Sopenharmony_ci        while (fgets(line, sizeof(line), fp)) {
4462cb93a386Sopenharmony_ci            o->writeText(line);
4463cb93a386Sopenharmony_ci        }
4464cb93a386Sopenharmony_ci
4465cb93a386Sopenharmony_ci        pclose(fp);
4466cb93a386Sopenharmony_ci    #endif
4467cb93a386Sopenharmony_ci    }
4468cb93a386Sopenharmony_ci
4469cb93a386Sopenharmony_ci#endif
4470cb93a386Sopenharmony_ci
4471cb93a386Sopenharmony_ci}  // namespace skvm
4472