xref: /third_party/skia/src/core/SkVM.cpp (revision cb93a386)
1/*
2 * Copyright 2019 Google LLC
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#include "include/core/SkStream.h"
9#include "include/core/SkString.h"
10#include "include/private/SkHalf.h"
11#include "include/private/SkTFitsIn.h"
12#include "include/private/SkThreadID.h"
13#include "src/core/SkColorSpacePriv.h"
14#include "src/core/SkColorSpaceXformSteps.h"
15#include "src/core/SkCpu.h"
16#include "src/core/SkEnumerate.h"
17#include "src/core/SkOpts.h"
18#include "src/core/SkVM.h"
19#include <algorithm>
20#include <atomic>
21#include <queue>
22
23#if defined(SKVM_LLVM)
24    #include <future>
25    #include <llvm/Bitcode/BitcodeWriter.h>
26    #include <llvm/ExecutionEngine/ExecutionEngine.h>
27    #include <llvm/IR/IRBuilder.h>
28    #include <llvm/IR/Verifier.h>
29    #include <llvm/Support/TargetSelect.h>
30    #include <llvm/Support/Host.h>
31
32    // Platform-specific intrinsics got their own files in LLVM 10.
33    #if __has_include(<llvm/IR/IntrinsicsX86.h>)
34        #include <llvm/IR/IntrinsicsX86.h>
35    #endif
36#endif
37
38// #define SKVM_LLVM_WAIT_FOR_COMPILATION
39
// Global switches, default off; flipped by clients/tests elsewhere.
bool gSkVMAllowJIT{false};     // when false, Programs always run via the interpreter
bool gSkVMJITViaDylib{false};  // route JIT output through a dylib (see close_dylib) — presumably for debugging/profiling; confirm with callers
42
#if defined(SKVM_JIT)
    #if defined(SK_BUILD_FOR_WIN)
        #include "src/core/SkLeanWindows.h"
        #include <memoryapi.h>

        // Reserve and commit a read-write buffer to hold JIT'd code.
        // NOTE(review): unlike the mmap path below, *len is not rounded up
        // here — VirtualAlloc itself works at allocation granularity.
        static void* alloc_jit_buffer(size_t* len) {
            return VirtualAlloc(NULL, *len, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
        }
        // Flip a filled buffer from read-write to read-execute (W^X).
        static void remap_as_executable(void* ptr, size_t len) {
            DWORD old;
            VirtualProtect(ptr, len, PAGE_EXECUTE_READ, &old);
            SkASSERT(old == PAGE_READWRITE);
        }
        #if !defined(SKVM_LLVM)
        // Release a buffer from alloc_jit_buffer (len unused on Windows).
        static void unmap_jit_buffer(void* ptr, size_t len) {
            VirtualFree(ptr, 0, MEM_RELEASE);
        }
        static void close_dylib(void* dylib) {
            SkASSERT(false);  // TODO?  For now just assert we never make one.
        }
        #endif
    #else
        #include <dlfcn.h>
        #include <sys/mman.h>

        // Allocate a read-write buffer to hold JIT'd code, rounding *len up
        // to whole pages so the later mprotect() calls are legal.
        static void* alloc_jit_buffer(size_t* len) {
            // While mprotect and VirtualAlloc both work at page granularity,
            // mprotect doesn't round up for you, and instead requires *len is at page granularity.
            const size_t page = sysconf(_SC_PAGESIZE);
            *len = ((*len + page - 1) / page) * page;
            return mmap(nullptr,*len, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1,0);
        }
        // Flip a filled buffer to read-execute and flush the instruction
        // cache so the freshly written code is visible to the CPU.
        static void remap_as_executable(void* ptr, size_t len) {
            mprotect(ptr, len, PROT_READ|PROT_EXEC);
            __builtin___clear_cache((char*)ptr,
                                    (char*)ptr + len);
        }
        #if !defined(SKVM_LLVM)
        static void unmap_jit_buffer(void* ptr, size_t len) {
            munmap(ptr, len);
        }
        static void close_dylib(void* dylib) {
            dlclose(dylib);
        }
        #endif
    #endif

    #if defined(SKVM_JIT_VTUNE)
        #include <jitprofiling.h>
        // Report a freshly JIT'd method to VTune so profiles can symbolize it.
        static void notify_vtune(const char* name, void* addr, size_t len) {
            if (iJIT_IsProfilingActive() == iJIT_SAMPLING_ON) {
                iJIT_Method_Load event;
                memset(&event, 0, sizeof(event));
                event.method_id           = iJIT_GetNewMethodID();
                event.method_name         = const_cast<char*>(name);
                event.method_load_address = addr;
                event.method_size         = len;
                iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, &event);
            }
        }
    #else
        // No VTune: notification is a no-op.
        static void notify_vtune(const char* name, void* addr, size_t len) {}
    #endif
#endif
107
108// JIT code isn't MSAN-instrumented, so we won't see when it uses
109// uninitialized memory, and we'll not see the writes it makes as properly
110// initializing memory.  Instead force the interpreter, which should let
111// MSAN see everything our programs do properly.
112//
113// Similarly, we can't get ASAN's checks unless we let it instrument our interpreter.
114#if defined(__has_feature)
115    #if __has_feature(memory_sanitizer) || __has_feature(address_sanitizer)
116        #define SKVM_JIT_BUT_IGNORE_IT
117    #endif
118#endif
119
#if defined(SKSL_STANDALONE)
    // skslc needs to link against this module (for the VM code generator). This module pulls in
    // color-space code, but attempting to add those transitive dependencies to skslc gets out of
    // hand. So we terminate the chain here with stub functions. Note that skslc's usage of SkVM
    // never cares about color management.

    // Stub: identity transfer function — ignores the kind and all coefficients.
    skvm::F32 sk_program_transfer_fn(
        skvm::F32 v, TFKind tf_kind,
        skvm::F32 G, skvm::F32 A, skvm::F32 B, skvm::F32 C, skvm::F32 D, skvm::F32 E, skvm::F32 F) {
            return v;
    }

    // Stubs returning nullptr.  NOTE(review): safe only if no skslc code path
    // ever dereferences these results — confirm against callers.
    const skcms_TransferFunction* skcms_sRGB_TransferFunction() { return nullptr; }
    const skcms_TransferFunction* skcms_sRGB_Inverse_TransferFunction() { return nullptr; }
#endif
134
135namespace skvm {
136
    // Probe the host CPU once for the optional features SkVM cares about.
    static Features detect_features() {
        static const bool fma =
        #if defined(SK_CPU_X86)
            SkCpu::Supports(SkCpu::HSW);   // fma is gated on Haswell support here
        #elif defined(SK_CPU_ARM64)
            true;                          // unconditionally available on ARM64
        #else
            false;
        #endif

        static const bool fp16 = false;  // TODO

        return { fma, fp16 };
    }
151
    // Default Builder uses the detected host features; the overload lets a
    // caller pin a specific Features set instead.
    Builder::Builder()                  : fFeatures(detect_features()) {}
    Builder::Builder(Features features) : fFeatures(features         ) {}
154
155
    // Pimpl state behind Program: the interpreter form of the program plus
    // any JIT artifacts produced for it.
    struct Program::Impl {
        std::vector<InterpreterInstruction> instructions;  // register-form program for the interpreter
        int regs = 0;   // number of registers the interpreter program uses
        int loop = 0;   // index of the first instruction of the loop body (earlier ones run once)
        std::vector<int> strides;   // one stride per Builder::arg() pointer argument

        std::atomic<void*> jit_entry{nullptr};   // TODO: minimal std::memory_orders
        size_t jit_size = 0;    // size of the JIT mapping — presumably for unmapping; confirm at teardown
        void*  dylib    = nullptr;  // non-null when the JIT went via a dylib (see gSkVMJITViaDylib)

    #if defined(SKVM_LLVM)
        std::unique_ptr<llvm::LLVMContext>     llvm_ctx;
        std::unique_ptr<llvm::ExecutionEngine> llvm_ee;
        std::future<void>                      llvm_compiling;
    #endif
    };
172
173    // Debugging tools, mostly for printing various data structures out to a stream.
174
    namespace {
        // An SkWStream that forwards everything to SkDebugf; the dump()
        // methods fall back to this when no stream is supplied.
        class SkDebugfStream final : public SkWStream {
            size_t fBytesWritten = 0;

            bool write(const void* buffer, size_t size) override {
                SkDebugf("%.*s", (int)size, (const char*)buffer);
                fBytesWritten += size;
                return true;
            }

            size_t bytesWritten() const override {
                return fBytesWritten;
            }
        };

        // Tiny wrapper types so the overloaded write() helpers below can
        // format each operand kind distinctly.
        struct V { Val id; };        // value id, printed "vN"
        struct R { Reg id; };        // interpreter register, printed "rN"
        struct Shift { int bits; };  // shift amount, decimal
        struct Splat { int bits; };  // 32-bit constant, hex plus float reinterpretation
        struct Hex   { int bits; };  // raw immediate, hex
        // For op `trace_line` or `trace_call`
        struct Line  { int bits; };
        // For op `trace_var`
        struct VarSlot { int bits; };
        struct VarType { int bits; };
        static constexpr VarType kVarTypeInt{0};
        static constexpr VarType kVarTypeFloat{1};
        static constexpr VarType kVarTypeBool{2};
        // For op `trace_call`
        struct CallType { int bits; };
        static constexpr CallType kCallTypeEnter{1};
        static constexpr CallType kCallTypeExit{0};

        static void write(SkWStream* o, const char* s) {
            o->writeText(s);
        }

        // Op -> name, generated from the SKVM_OPS X-macro.
        static const char* name(Op op) {
            switch (op) {
            #define M(x) case Op::x: return #x;
                SKVM_OPS(M)
            #undef M
            }
            return "unknown op";
        }

        static void write(SkWStream* o, Op op) {
            o->writeText(name(op));
        }
        static void write(SkWStream* o, Ptr p) {
            write(o, "ptr");
            o->writeDecAsText(p.ix);
        }
        static void write(SkWStream* o, V v) {
            write(o, "v");
            o->writeDecAsText(v.id);
        }
        static void write(SkWStream* o, R r) {
            write(o, "r");
            o->writeDecAsText(r.id);
        }
        static void write(SkWStream* o, Shift s) {
            o->writeDecAsText(s.bits);
        }
        // Splats show both the raw bits (hex) and the float those bits encode.
        static void write(SkWStream* o, Splat s) {
            float f;
            memcpy(&f, &s.bits, 4);
            o->writeHexAsText(s.bits);
            write(o, " (");
            o->writeScalarAsText(f);
            write(o, ")");
        }
        static void write(SkWStream* o, Hex h) {
            o->writeHexAsText(h.bits);
        }
        static void write(SkWStream* o, Line d) {
            write(o, "L");
            o->writeDecAsText(d.bits);
        }
        static void write(SkWStream* o, VarSlot s) {
            write(o, "$");
            o->writeDecAsText(s.bits);
        }
        static void write(SkWStream* o, VarType n) {
            if (n.bits == kVarTypeFloat.bits) {
                write(o, "(F32)");
            } else if (n.bits == kVarTypeInt.bits) {
                write(o, "(I32)");
            } else if (n.bits == kVarTypeBool.bits) {
                write(o, "(bool)");
            } else {
                write(o, "???");
            }
        }
        static void write(SkWStream* o, CallType n) {
            if (n.bits == kCallTypeEnter.bits) {
                write(o, "(enter)");
            } else if (n.bits == kCallTypeExit.bits) {
                write(o, "(exit)");
            } else {
                write(o, "???");
            }
        }

        // Variadic write(): each operand formatted by its overload above,
        // separated by single spaces.
        template <typename T, typename... Ts>
        static void write(SkWStream* o, T first, Ts... rest) {
            write(o, first);
            write(o, " ");
            write(o, rest...);
        }
    }  // namespace
286
    // Print one optimized instruction to o in SSA style, e.g. "v2 = add_f32 v0 v1".
    // Side-effecting ops (stores, traces, assert_true) print with no "vN =" head.
    static void write_one_instruction(Val id, const OptimizedInstruction& inst, SkWStream* o) {
        Op  op = inst.op;
        Val  x = inst.x,
             y = inst.y,
             z = inst.z,
             w = inst.w;
        int immA = inst.immA,
            immB = inst.immB,
            immC = inst.immC;
        switch (op) {
            case Op::assert_true: write(o, op, V{x}, V{y}); break;

            case Op::trace_line: write(o, op, V{x}, Line{immA}); break;
            case Op::trace_var:  write(o, op, V{x}, VarSlot{immA}, "=", V{y}, VarType{immB}); break;
            case Op::trace_call: write(o, op, V{x}, Line{immA}, CallType{immB}); break;

            case Op::store8:   write(o, op, Ptr{immA}, V{x}               ); break;
            case Op::store16:  write(o, op, Ptr{immA}, V{x}               ); break;
            case Op::store32:  write(o, op, Ptr{immA}, V{x}               ); break;
            case Op::store64:  write(o, op, Ptr{immA}, V{x},V{y}          ); break;
            case Op::store128: write(o, op, Ptr{immA}, V{x},V{y},V{z},V{w}); break;

            case Op::index: write(o, V{id}, "=", op); break;

            case Op::load8:   write(o, V{id}, "=", op, Ptr{immA}); break;
            case Op::load16:  write(o, V{id}, "=", op, Ptr{immA}); break;
            case Op::load32:  write(o, V{id}, "=", op, Ptr{immA}); break;
            case Op::load64:  write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break;
            case Op::load128: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break;

            case Op::gather8:  write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break;
            case Op::gather16: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break;
            case Op::gather32: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, V{x}); break;

            case Op::uniform32: write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}); break;
            case Op::array32:   write(o, V{id}, "=", op, Ptr{immA}, Hex{immB}, Hex{immC}); break;

            case Op::splat: write(o, V{id}, "=", op, Splat{immA}); break;

            case Op:: add_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: sub_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: mul_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: div_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: min_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: max_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
            case Op:: fma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
            case Op:: fms_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
            case Op::fnma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;


            case Op::sqrt_f32: write(o, V{id}, "=", op, V{x}); break;

            case Op:: eq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::neq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op:: gt_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::gte_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;


            case Op::add_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::sub_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::mul_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;

            case Op::shl_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break;
            case Op::shr_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break;
            case Op::sra_i32: write(o, V{id}, "=", op, V{x}, Shift{immA}); break;

            case Op::eq_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::gt_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;


            case Op::bit_and  : write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::bit_or   : write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::bit_xor  : write(o, V{id}, "=", op, V{x}, V{y}); break;
            case Op::bit_clear: write(o, V{id}, "=", op, V{x}, V{y}); break;

            case Op::select: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;

            case Op::ceil:      write(o, V{id}, "=", op, V{x}); break;
            case Op::floor:     write(o, V{id}, "=", op, V{x}); break;
            case Op::to_f32:    write(o, V{id}, "=", op, V{x}); break;
            case Op::to_fp16:   write(o, V{id}, "=", op, V{x}); break;
            case Op::from_fp16: write(o, V{id}, "=", op, V{x}); break;
            case Op::trunc:     write(o, V{id}, "=", op, V{x}); break;
            case Op::round:     write(o, V{id}, "=", op, V{x}); break;
        }

        write(o, "\n");
    }
375
376    void Builder::dump(SkWStream* o) const {
377        SkDebugfStream debug;
378        if (!o) { o = &debug; }
379
380        std::vector<OptimizedInstruction> optimized = this->optimize();
381        o->writeDecAsText(optimized.size());
382        o->writeText(" values (originally ");
383        o->writeDecAsText(fProgram.size());
384        o->writeText("):\n");
385        for (Val id = 0; id < (Val)optimized.size(); id++) {
386            const OptimizedInstruction& inst = optimized[id];
387            write(o, inst.can_hoist ? "↑ " : "  ");
388            write_one_instruction(id, inst, o);
389        }
390    }
391
    // Dump the interpreter (register) form of this Program, one instruction
    // per line.  Instructions at or after fImpl->loop are the per-iteration
    // loop body and print extra-indented under a "loop:" label; earlier ones
    // run once up front.  Passing nullptr dumps via SkDebugf.
    void Program::dump(SkWStream* o) const {
        SkDebugfStream debug;
        if (!o) { o = &debug; }

        o->writeDecAsText(fImpl->regs);
        o->writeText(" registers, ");
        o->writeDecAsText(fImpl->instructions.size());
        o->writeText(" instructions:\n");
        for (Val i = 0; i < (Val)fImpl->instructions.size(); i++) {
            if (i == fImpl->loop) { write(o, "loop:\n"); }
            o->writeDecAsText(i);
            o->writeText("\t");
            if (i >= fImpl->loop) { write(o, "    "); }
            const InterpreterInstruction& inst = fImpl->instructions[i];
            Op   op = inst.op;
            Reg   d = inst.d,
                  x = inst.x,
                  y = inst.y,
                  z = inst.z,
                  w = inst.w;
            int immA = inst.immA,
                immB = inst.immB,
                immC = inst.immC;
            switch (op) {
                case Op::assert_true: write(o, op, R{x}, R{y}); break;

                case Op::trace_line: write(o, op, R{x}, Line{immA}); break;
                case Op::trace_var: write(o, op, R{x}, VarSlot{immA}, "=", R{y}, VarType{immB});
                                    break;
                case Op::trace_call: write(o, op, R{x}, Line{immA}, CallType{immB}); break;

                case Op::store8:   write(o, op, Ptr{immA}, R{x}                  ); break;
                case Op::store16:  write(o, op, Ptr{immA}, R{x}                  ); break;
                case Op::store32:  write(o, op, Ptr{immA}, R{x}                  ); break;
                case Op::store64:  write(o, op, Ptr{immA}, R{x}, R{y}            ); break;
                case Op::store128: write(o, op, Ptr{immA}, R{x}, R{y}, R{z}, R{w}); break;

                case Op::index: write(o, R{d}, "=", op); break;

                case Op::load8:   write(o, R{d}, "=", op, Ptr{immA}); break;
                case Op::load16:  write(o, R{d}, "=", op, Ptr{immA}); break;
                case Op::load32:  write(o, R{d}, "=", op, Ptr{immA}); break;
                case Op::load64:  write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break;
                case Op::load128: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break;

                case Op::gather8:  write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break;
                case Op::gather16: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break;
                case Op::gather32: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, R{x}); break;

                case Op::uniform32: write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}); break;
                case Op::array32:   write(o, R{d}, "=", op, Ptr{immA}, Hex{immB}, Hex{immC}); break;

                case Op::splat:     write(o, R{d}, "=", op, Splat{immA}); break;

                case Op::add_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::sub_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::mul_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::div_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::min_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::max_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::fma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
                case Op::fms_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
                case Op::fnma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;

                case Op::sqrt_f32: write(o, R{d}, "=", op, R{x}); break;

                case Op:: eq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::neq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op:: gt_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::gte_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;


                case Op::add_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::sub_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::mul_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::shl_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break;
                case Op::shr_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break;
                case Op::sra_i32: write(o, R{d}, "=", op, R{x}, Shift{immA}); break;

                case Op::eq_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::gt_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::bit_and  : write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::bit_or   : write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::bit_xor  : write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::bit_clear: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::select: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;

                case Op::ceil:      write(o, R{d}, "=", op, R{x}); break;
                case Op::floor:     write(o, R{d}, "=", op, R{x}); break;
                case Op::to_f32:    write(o, R{d}, "=", op, R{x}); break;
                case Op::to_fp16:   write(o, R{d}, "=", op, R{x}); break;
                case Op::from_fp16: write(o, R{d}, "=", op, R{x}); break;
                case Op::trunc:     write(o, R{d}, "=", op, R{x}); break;
                case Op::round:     write(o, R{d}, "=", op, R{x}); break;
            }
            write(o, "\n");
        }
    }
493
494    std::vector<Instruction> eliminate_dead_code(std::vector<Instruction> program) {
495        // Determine which Instructions are live by working back from side effects.
496        std::vector<bool> live(program.size(), false);
497        for (Val id = program.size(); id--;) {
498            if (live[id] || has_side_effect(program[id].op)) {
499                live[id] = true;
500                const Instruction& inst = program[id];
501                for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
502                    if (arg != NA) { live[arg] = true; }
503                }
504            }
505        }
506
507        // After removing non-live instructions, we can be left with redundant back-to-back
508        // trace_line instructions. (e.g. one line could have multiple statements on it.)
509        // Eliminate any duplicate ops.
510        int lastId = -1;
511        for (Val id = 0; id < (Val)program.size(); id++) {
512            if (!live[id]) {
513                continue;
514            }
515            const Instruction& inst = program[id];
516            if (inst.op != Op::trace_line) {
517                lastId = -1;
518                continue;
519            }
520            if (lastId >= 0) {
521                const Instruction& last = program[lastId];
522                if (inst.immA == last.immA && inst.x == last.x) {
523                    // Found two matching trace_lines in a row. Mark the first one as dead.
524                    live[lastId] = false;
525                }
526            }
527            lastId = id;
528        }
529
530        // Rewrite the program with only live Instructions:
531        //   - remap IDs in live Instructions to what they'll be once dead Instructions are removed;
532        //   - then actually remove the dead Instructions.
533        std::vector<Val> new_id(program.size(), NA);
534        for (Val id = 0, next = 0; id < (Val)program.size(); id++) {
535            if (live[id]) {
536                Instruction& inst = program[id];
537                for (Val* arg : {&inst.x, &inst.y, &inst.z, &inst.w}) {
538                    if (*arg != NA) {
539                        *arg = new_id[*arg];
540                        SkASSERT(*arg != NA);
541                    }
542                }
543                new_id[id] = next++;
544            }
545        }
546
547        // Eliminate any non-live ops.
548        auto it = std::remove_if(program.begin(), program.end(), [&](const Instruction& inst) {
549            Val id = (Val)(&inst - program.data());
550            return !live[id];
551        });
552        program.erase(it, program.end());
553
554        return program;
555    }
556
557    std::vector<OptimizedInstruction> finalize(const std::vector<Instruction> program) {
558        std::vector<OptimizedInstruction> optimized(program.size());
559        for (Val id = 0; id < (Val)program.size(); id++) {
560            Instruction inst = program[id];
561            optimized[id] = {inst.op, inst.x,inst.y,inst.z,inst.w,
562                             inst.immA,inst.immB,inst.immC,
563                             /*death=*/id, /*can_hoist=*/true};
564        }
565
566        // Each Instruction's inputs need to live at least until that Instruction issues.
567        for (Val id = 0; id < (Val)optimized.size(); id++) {
568            OptimizedInstruction& inst = optimized[id];
569            for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
570                // (We're walking in order, so this is the same as max()ing with the existing Val.)
571                if (arg != NA) { optimized[arg].death = id; }
572            }
573        }
574
575        // Mark which values don't depend on the loop and can be hoisted.
576        for (OptimizedInstruction& inst : optimized) {
577            // Varying loads (and gathers) and stores cannot be hoisted out of the loop.
578            if (is_always_varying(inst.op) || is_trace(inst.op)) {
579                inst.can_hoist = false;
580            }
581
582            // If any of an instruction's inputs can't be hoisted, it can't be hoisted itself.
583            if (inst.can_hoist) {
584                for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
585                    if (arg != NA) { inst.can_hoist &= optimized[arg].can_hoist; }
586                }
587            }
588        }
589
590        // Extend the lifetime of any hoisted value that's used in the loop to infinity.
591        for (OptimizedInstruction& inst : optimized) {
592            if (!inst.can_hoist /*i.e. we're in the loop, so the arguments are used-in-loop*/) {
593                for (Val arg : {inst.x, inst.y, inst.z, inst.w}) {
594                    if (arg != NA && optimized[arg].can_hoist) {
595                        optimized[arg].death = (Val)program.size();
596                    }
597                }
598            }
599        }
600
601        return optimized;
602    }
603
604    std::vector<OptimizedInstruction> Builder::optimize() const {
605        std::vector<Instruction> program = this->program();
606        program = eliminate_dead_code(std::move(program));
607        return    finalize           (std::move(program));
608    }
609
610    Program Builder::done(const char* debug_name, bool allow_jit) const {
611        char buf[64] = "skvm-jit-";
612        if (!debug_name) {
613            *SkStrAppendU32(buf+9, this->hash()) = '\0';
614            debug_name = buf;
615        }
616
617        return {this->optimize(), fStrides, debug_name, allow_jit};
618    }
619
620    uint64_t Builder::hash() const {
621        uint32_t lo = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 0),
622                 hi = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 1);
623        return (uint64_t)lo | (uint64_t)hi << 32;
624    }
625
626    bool operator!=(Ptr a, Ptr b) { return a.ix != b.ix; }
627
628    bool operator==(const Instruction& a, const Instruction& b) {
629        return a.op   == b.op
630            && a.x    == b.x
631            && a.y    == b.y
632            && a.z    == b.z
633            && a.w    == b.w
634            && a.immA == b.immA
635            && a.immB == b.immB
636            && a.immC == b.immC;
637    }
638
    // Hash an Instruction bit-for-bit; pairs with operator== above for the
    // Builder's CSE table.  NOTE(review): hashing raw bytes assumes any
    // padding in Instruction is consistently zeroed — confirm, or equal
    // Instructions could hash differently.
    uint32_t InstructionHash::operator()(const Instruction& inst, uint32_t seed) const {
        return SkOpts::hash(&inst, sizeof(inst), seed);
    }
642
643
644    // Most instructions produce a value and return it by ID,
645    // the value-producing instruction's own index in the program vector.
646    Val Builder::push(Instruction inst) {
647        // Basic common subexpression elimination:
648        // if we've already seen this exact Instruction, use it instead of creating a new one.
649        //
650        // But we never dedup loads or stores: an intervening store could change that memory.
651        // Uniforms and gathers touch only uniform memory, so they're fine to dedup,
652        // and index is varying but doesn't touch memory, so it's fine to dedup too.
653        if (!touches_varying_memory(inst.op) && !is_trace(inst.op)) {
654            if (Val* id = fIndex.find(inst)) {
655                return *id;
656            }
657        }
658        Val id = static_cast<Val>(fProgram.size());
659        fProgram.push_back(inst);
660        fIndex.set(inst, id);
661        return id;
662    }
663
664    Ptr Builder::arg(int stride) {
665        int ix = (int)fStrides.size();
666        fStrides.push_back(stride);
667        return {ix};
668    }
669
    // Emit a debug-only runtime assertion that cond is true, with `debug` as
    // an extra value to inspect on failure.  If cond is a compile-time
    // immediate, assert it right now instead of emitting an op.  In non-debug
    // builds this whole method is a no-op.
    void Builder::assert_true(I32 cond, I32 debug) {
    #ifdef SK_DEBUG
        int imm;
        if (this->allImm(cond.id,&imm)) { SkASSERT(imm); return; }
        (void)push(Op::assert_true, cond.id, debug.id);
    #endif
    }
677
    // Each trace_* helper emits a debug-trace op guarded by an execution
    // mask; when the mask is a known-zero immediate the trace could never
    // fire, so nothing is emitted.
    void Builder::trace_line(I32 mask, int line) {
        if (this->isImm(mask.id, 0)) { return; }
        (void)push(Op::trace_line, mask.id,NA,NA,NA, line);
    }
    // Record that variable `slot` now holds int value `val`.
    void Builder::trace_var(I32 mask, int slot, I32 val) {
        if (this->isImm(mask.id, 0)) { return; }
        (void)push(Op::trace_var, mask.id,val.id,NA,NA, slot, kVarTypeInt.bits);
    }
    // Record that variable `slot` now holds float value `val`.
    void Builder::trace_var(I32 mask, int slot, F32 val) {
        if (this->isImm(mask.id, 0)) { return; }
        (void)push(Op::trace_var, mask.id,val.id,NA,NA, slot, kVarTypeFloat.bits);
    }
    // Record that variable `slot` now holds bool `b` (encoded as splat 0/1).
    void Builder::trace_var(I32 mask, int slot, bool b) {
        if (this->isImm(mask.id, 0)) { return; }
        I32 val = b ? this->splat(1) : this->splat(0);
        (void)push(Op::trace_var, mask.id,val.id,NA,NA, slot, kVarTypeBool.bits);
    }
    // Record entry into a function call site at `line`.
    void Builder::trace_call_enter(I32 mask, int line) {
        if (this->isImm(mask.id, 0)) { return; }
        (void)push(Op::trace_call, mask.id,NA,NA,NA, line, kCallTypeEnter.bits);
    }
    // Record exit from a function call site at `line`.
    void Builder::trace_call_exit(I32 mask, int line) {
        if (this->isImm(mask.id, 0)) { return; }
        (void)push(Op::trace_call, mask.id,NA,NA,NA, line, kCallTypeExit.bits);
    }
703
    // Stores are pure side effects: they produce no value, so push()'s result
    // is discarded.  store64/store128 take the value split into 32-bit parts.
    void Builder::store8 (Ptr ptr, I32 val) { (void)push(Op::store8 , val.id,NA,NA,NA, ptr.ix); }
    void Builder::store16(Ptr ptr, I32 val) { (void)push(Op::store16, val.id,NA,NA,NA, ptr.ix); }
    void Builder::store32(Ptr ptr, I32 val) { (void)push(Op::store32, val.id,NA,NA,NA, ptr.ix); }
    void Builder::store64(Ptr ptr, I32 lo, I32 hi) {
        (void)push(Op::store64, lo.id,hi.id,NA,NA, ptr.ix);
    }
    void Builder::store128(Ptr ptr, I32 x, I32 y, I32 z, I32 w) {
        (void)push(Op::store128, x.id,y.id,z.id,w.id, ptr.ix);
    }
713
    // index takes no arguments or immediates (see Op::index).
    I32 Builder::index() { return {this, push(Op::index)}; }

    // Varying loads from a Ptr argument; the wider 64/128-bit loads also take
    // which 32-bit lane of each element to produce.
    I32 Builder::load8 (Ptr ptr) { return {this, push(Op::load8 , NA,NA,NA,NA, ptr.ix) }; }
    I32 Builder::load16(Ptr ptr) { return {this, push(Op::load16, NA,NA,NA,NA, ptr.ix) }; }
    I32 Builder::load32(Ptr ptr) { return {this, push(Op::load32, NA,NA,NA,NA, ptr.ix) }; }
    I32 Builder::load64(Ptr ptr, int lane) {
        return {this, push(Op::load64 , NA,NA,NA,NA, ptr.ix,lane) };
    }
    I32 Builder::load128(Ptr ptr, int lane) {
        return {this, push(Op::load128, NA,NA,NA,NA, ptr.ix,lane) };
    }
725
726    I32 Builder::gather8 (UPtr ptr, int offset, I32 index) {
727        return {this, push(Op::gather8 , index.id,NA,NA,NA, ptr.ix,offset)};
728    }
729    I32 Builder::gather16(UPtr ptr, int offset, I32 index) {
730        return {this, push(Op::gather16, index.id,NA,NA,NA, ptr.ix,offset)};
731    }
732    I32 Builder::gather32(UPtr ptr, int offset, I32 index) {
733        return {this, push(Op::gather32, index.id,NA,NA,NA, ptr.ix,offset)};
734    }
735
736    I32 Builder::uniform32(UPtr ptr, int offset) {
737        return {this, push(Op::uniform32, NA,NA,NA,NA, ptr.ix, offset)};
738    }
739
740    // Note: this converts the array index into a byte offset for the op.
741    I32 Builder::array32  (UPtr ptr, int offset, int index) {
742        return {this, push(Op::array32, NA,NA,NA,NA, ptr.ix, offset, index * sizeof(int))};
743    }
744
745    I32 Builder::splat(int n) { return {this, push(Op::splat, NA,NA,NA,NA, n) }; }
746
747    // Be careful peepholing float math!  Transformations you might expect to
748    // be legal can fail in the face of NaN/Inf, e.g. 0*x is not always 0.
749    // Float peepholes must pass this equivalence test for all ~4B floats:
750    //
751    //     bool equiv(float x, float y) { return (x == y) || (isnanf(x) && isnanf(y)); }
752    //
753    //     unsigned bits = 0;
754    //     do {
755    //        float f;
756    //        memcpy(&f, &bits, 4);
757    //        if (!equiv(f, ...)) {
758    //           abort();
759    //        }
760    //     } while (++bits != 0);
761
    // Float add with peepholes: constant-fold, strip the additive identity, and
    // fuse a multiply feeding either operand into an FMA when the target supports it.
    F32 Builder::add(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); }
        if (this->isImm(y.id, 0.0f)) { return x; }   // x+0 == x
        if (this->isImm(x.id, 0.0f)) { return y; }   // 0+y == y

        if (fFeatures.fma) {
            if (fProgram[x.id].op == Op::mul_f32) {
                // (a*b) + y  ->  fma(a,b,y)
                return {this, this->push(Op::fma_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)};
            }
            if (fProgram[y.id].op == Op::mul_f32) {
                // x + (a*b)  ->  fma(a,b,x)
                return {this, this->push(Op::fma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)};
            }
        }
        return {this, this->push(Op::add_f32, x.id, y.id)};
    }

    // Float subtract, with the same constant-fold and FMA-fusion peepholes as add().
    F32 Builder::sub(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); }
        if (this->isImm(y.id, 0.0f)) { return x; }   // x-0 == x
        if (fFeatures.fma) {
            if (fProgram[x.id].op == Op::mul_f32) {
                // (a*b) - y  ->  fms(a,b,y)
                return {this, this->push(Op::fms_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)};
            }
            if (fProgram[y.id].op == Op::mul_f32) {
                // x - (a*b)  ->  fnma(a,b,x)
                return {this, this->push(Op::fnma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)};
            }
        }
        return {this, this->push(Op::sub_f32, x.id, y.id)};
    }

    // Float multiply.  x*0 is deliberately NOT folded to 0: 0*inf and 0*NaN are NaN
    // (see the peephole-equivalence note above).
    F32 Builder::mul(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); }
        if (this->isImm(y.id, 1.0f)) { return x; }  // x*1 == x
        if (this->isImm(x.id, 1.0f)) { return y; }  // 1*y == y
        return {this, this->push(Op::mul_f32, x.id, y.id)};
    }

    // Like mul(), but additionally folds x*0 == 0, knowingly ignoring inf/NaN inputs.
    F32 Builder::fast_mul(F32 x, F32 y) {
        if (this->isImm(x.id, 0.0f) || this->isImm(y.id, 0.0f)) { return splat(0.0f); }
        return mul(x,y);
    }

    // Float divide; constants fold via sk_ieee_float_divide(), which defines x/0.
    F32 Builder::div(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(sk_ieee_float_divide(X,Y)); }
        if (this->isImm(y.id, 1.0f)) { return x; }  // x/1 == x
        return {this, this->push(Op::div_f32, x.id, y.id)};
    }

    // Float square root, constant-folded when the argument is a known splat.
    F32 Builder::sqrt(F32 x) {
        if (float X; this->allImm(x.id,&X)) { return splat(std::sqrt(X)); }
        return {this, this->push(Op::sqrt_f32, x.id)};
    }
814
    // See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html.
    F32 Builder::approx_log2(F32 x) {
        // e - 127 is a fair approximation of log2(x) in its own right...
        F32 e = mul(to_F32(pun_to_I32(x)), splat(1.0f / (1<<23)));

        // ... but using the mantissa to refine its error is _much_ better.
        F32 m = pun_to_F32(bit_or(bit_and(pun_to_I32(x), 0x007fffff),  // keep only mantissa bits,
                                0x3f000000));                          // with 0.5's exponent.
        F32 approx = sub(e,        124.225514990f);
            approx = sub(approx, mul(1.498030302f, m));
            approx = sub(approx, div(1.725879990f, add(0.3520887068f, m)));

        return approx;
    }

    // Inverse of approx_log2(): approximate 2^x via the same float bit trick.
    F32 Builder::approx_pow2(F32 x) {
        F32 f = fract(x);
        F32 approx = add(x,         121.274057500f);
            approx = sub(approx, mul( 1.490129070f, f));
            approx = add(approx, div(27.728023300f, sub(4.84252568f, f)));

        // Scale up into exponent position (2^23), round, reinterpret the bits as a float.
        return pun_to_F32(round(mul(1.0f * (1<<23), approx)));
    }

    // x^y == 2^(y * log2(x)), with x == 0 and x == 1 passed through exactly.
    F32 Builder::approx_powf(F32 x, F32 y) {
        // TODO: assert this instead?  Sometimes x is very slightly negative.  See skia:10210.
        x = max(0.0f, x);

        // When x is exactly 0 or 1, return x itself rather than the approximation.
        auto is_x = bit_or(eq(x, 0.0f),
                           eq(x, 1.0f));
        return select(is_x, x, approx_pow2(mul(approx_log2(x), y)));
    }
847
    // Bhaskara I's sine approximation
    // 16x(pi - x) / (5*pi^2 - 4x(pi - x))
    // ... divide by 4
    // 4x(pi - x) / (5*pi^2/4 - x(pi - x))
    //
    // This is a good approximation only for 0 <= x <= pi, so we use symmetries to get
    // radians into that range first.
    //
    F32 Builder::approx_sin(F32 radians) {
        constexpr float Pi = SK_ScalarPI;
        // x = radians mod 2pi
        F32 x = fract(radians * (0.5f/Pi)) * (2*Pi);
        I32 neg = x > Pi;   // are we pi < x < 2pi --> need to negate result
        x = select(neg, x - Pi, x);

        // Evaluate the (divided-by-4) rational approximation above on [0,pi].
        F32 pair = x * (Pi - x);
        x = 4.0f * pair / ((5*Pi*Pi/4) - pair);
        x = select(neg, -x, x);
        return x;
    }
868
869    /*  "GENERATING ACCURATE VALUES FOR THE TANGENT FUNCTION"
870         https://mae.ufl.edu/~uhk/ACCURATE-TANGENT.pdf
871
872        approx = x + (1/3)x^3 + (2/15)x^5 + (17/315)x^7 + (62/2835)x^9
873
874        Some simplifications:
875        1. tan(x) is periodic, -PI/2 < x < PI/2
876        2. tan(x) is odd, so tan(-x) = -tan(x)
877        3. Our polynomial approximation is best near zero, so we use the following identity
878                        tan(x) + tan(y)
879           tan(x + y) = -----------------
880                       1 - tan(x)*tan(y)
881           tan(PI/4) = 1
882
883           So for x > PI/8, we do the following refactor:
884           x' = x - PI/4
885
886                    1 + tan(x')
887           tan(x) = ------------
888                    1 - tan(x')
889     */
890    F32 Builder::approx_tan(F32 x) {
891        constexpr float Pi = SK_ScalarPI;
892        // periodic between -pi/2 ... pi/2
893        // shift to 0...Pi, scale 1/Pi to get into 0...1, then fract, scale-up, shift-back
894        x = fract((1/Pi)*x + 0.5f) * Pi - (Pi/2);
895
896        I32 neg = (x < 0.0f);
897        x = select(neg, -x, x);
898
899        // minimize total error by shifting if x > pi/8
900        I32 use_quotient = (x > (Pi/8));
901        x = select(use_quotient, x - (Pi/4), x);
902
903        // 9th order poly = 4th order(x^2) * x
904        x = poly(x*x, 62/2835.0f, 17/315.0f, 2/15.0f, 1/3.0f, 1.0f) * x;
905        x = select(use_quotient, (1+x)/(1-x), x);
906        x = select(neg, -x, x);
907        return x;
908    }
909
     // http://mathforum.org/library/drmath/view/54137.html
     // referencing Handbook of Mathematical Functions,
     //             by Milton Abramowitz and Irene Stegun
     F32 Builder::approx_asin(F32 x) {
         // asin(-x) == -asin(x): work with |x| and restore the sign at the end.
         I32 neg = (x < 0.0f);
         x = select(neg, -x, x);
         // Abramowitz & Stegun-style polynomial approximation, valid for 0 <= x <= 1.
         x = SK_ScalarPI/2 - sqrt(1-x) * poly(x, -0.0187293f, 0.0742610f, -0.2121144f, 1.5707288f);
         x = select(neg, -x, x);
         return x;
     }
920
    /*  Use 4th order polynomial approximation from https://arachnoid.com/polysolve/
     *      with 129 values of x,atan(x) for x:[0...1]
     *  This only works for 0 <= x <= 1
     */
    static F32 approx_atan_unit(F32 x) {
        // for now we might be given NaN... let that through
        x->assert_true((x != x) | ((x >= 0) & (x <= 1)));
        // poly() takes coefficients highest-order first.
        return poly(x, 0.14130025741326729f,
                      -0.34312835980675116f,
                      -0.016172900528248768f,
                       1.0037696976200385f,
                      -0.00014758242182738969f);
    }

    /*  Use identity atan(x) = pi/2 - atan(1/x) for x > 1
     */
    F32 Builder::approx_atan(F32 x) {
        // atan(-x) == -atan(x): reduce to x >= 0, restore the sign at the end.
        I32 neg = (x < 0.0f);
        x = select(neg, -x, x);
        // Reduce to [0,1] with the reciprocal identity so approx_atan_unit() applies.
        I32 flip = (x > 1.0f);
        x = select(flip, 1/x, x);
        x = approx_atan_unit(x);
        x = select(flip, SK_ScalarPI/2 - x, x);
        x = select(neg, -x, x);
        return x;
    }
947
    /*  Use identity atan(x) = pi/2 - atan(1/x) for x > 1
     *  By swapping y,x to ensure the ratio is <= 1, we can safely call atan_unit()
     *  which avoids a 2nd divide instruction if we had instead called atan().
     */
    F32 Builder::approx_atan2(F32 y0, F32 x0) {

        // Keep |y/x| <= 1 by swapping the operands whenever |y0| > |x0|.
        I32 flip = (abs(y0) > abs(x0));
        F32 y = select(flip, x0, y0);
        F32 x = select(flip, y0, x0);
        F32 arg = y/x;

        // atan is odd: reduce to arg >= 0 and restore the sign below.
        I32 neg = (arg < 0.0f);
        arg = select(neg, -arg, arg);

        F32 r = approx_atan_unit(arg);
        r = select(flip, SK_ScalarPI/2 - r, r);   // undo the swap: atan(x) = pi/2 - atan(1/x)
        r = select(neg, -r, r);

        // handle quadrant distinctions
        r = select((y0 >= 0) & (x0  < 0), r + SK_ScalarPI, r);
        r = select((y0  < 0) & (x0 <= 0), r - SK_ScalarPI, r);
        // Note: we don't try to handle 0,0 or infinities (yet)
        return r;
    }
972
973    F32 Builder::min(F32 x, F32 y) {
974        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::min(X,Y)); }
975        return {this, this->push(Op::min_f32, x.id, y.id)};
976    }
977    F32 Builder::max(F32 x, F32 y) {
978        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(std::max(X,Y)); }
979        return {this, this->push(Op::max_f32, x.id, y.id)};
980    }
981
    // Integer add.  Constant folding may wrap on overflow, intentionally (no_sanitize).
    SK_ATTRIBUTE(no_sanitize("signed-integer-overflow"))
    I32 Builder::add(I32 x, I32 y) {
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X+Y); }
        if (this->isImm(x.id, 0)) { return y; }   // 0+y == y
        if (this->isImm(y.id, 0)) { return x; }   // x+0 == x
        return {this, this->push(Op::add_i32, x.id, y.id)};
    }
    // Integer subtract, same wrapping caveat as add().
    SK_ATTRIBUTE(no_sanitize("signed-integer-overflow"))
    I32 Builder::sub(I32 x, I32 y) {
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X-Y); }
        if (this->isImm(y.id, 0)) { return x; }   // x-0 == x
        return {this, this->push(Op::sub_i32, x.id, y.id)};
    }
    // Integer multiply.  Unlike the float version, x*0 == 0 is a safe fold here.
    SK_ATTRIBUTE(no_sanitize("signed-integer-overflow"))
    I32 Builder::mul(I32 x, I32 y) {
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X*Y); }
        if (this->isImm(x.id, 0)) { return splat(0); }   // 0*y == 0
        if (this->isImm(y.id, 0)) { return splat(0); }   // x*0 == 0
        if (this->isImm(x.id, 1)) { return y; }          // 1*y == y
        if (this->isImm(y.id, 1)) { return x; }          // x*1 == x
        return {this, this->push(Op::mul_i32, x.id, y.id)};
    }

    // Shift left by a compile-time constant bit count.
    SK_ATTRIBUTE(no_sanitize("shift"))
    I32 Builder::shl(I32 x, int bits) {
        if (bits == 0) { return x; }
        if (int X; this->allImm(x.id,&X)) { return splat(X << bits); }
        return {this, this->push(Op::shl_i32, x.id,NA,NA,NA, bits)};
    }
    // Logical (zero-filling) shift right; note the unsigned cast in the fold.
    I32 Builder::shr(I32 x, int bits) {
        if (bits == 0) { return x; }
        if (int X; this->allImm(x.id,&X)) { return splat(unsigned(X) >> bits); }
        return {this, this->push(Op::shr_i32, x.id,NA,NA,NA, bits)};
    }
    // Arithmetic (sign-extending) shift right.
    I32 Builder::sra(I32 x, int bits) {
        if (bits == 0) { return x; }
        if (int X; this->allImm(x.id,&X)) { return splat(X >> bits); }
        return {this, this->push(Op::sra_i32, x.id,NA,NA,NA, bits)};
    }
1021
    // Float comparisons produce a full-lane mask: ~0 (all bits set) if true, 0 if false.
    // Note: no x.id == y.id shortcut here, since NaN == NaN must be false.
    I32 Builder:: eq(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X==Y ? ~0 : 0); }
        return {this, this->push(Op::eq_f32, x.id, y.id)};
    }
    I32 Builder::neq(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X!=Y ? ~0 : 0); }
        return {this, this->push(Op::neq_f32, x.id, y.id)};
    }
    // lt/lte have no dedicated ops; they emit gt/gte with the operands swapped.
    I32 Builder::lt(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y> X ? ~0 : 0); }
        return {this, this->push(Op::gt_f32, y.id, x.id)};
    }
    I32 Builder::lte(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(Y>=X ? ~0 : 0); }
        return {this, this->push(Op::gte_f32, y.id, x.id)};
    }
    I32 Builder::gt(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X> Y ? ~0 : 0); }
        return {this, this->push(Op::gt_f32, x.id, y.id)};
    }
    I32 Builder::gte(F32 x, F32 y) {
        if (float X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X>=Y ? ~0 : 0); }
        return {this, this->push(Op::gte_f32, x.id, y.id)};
    }
1046
    // Integer comparisons use the same all-bits mask convention as the float versions.
    I32 Builder:: eq(I32 x, I32 y) {
        if (x.id == y.id) { return splat(~0); }   // Same value: trivially equal (no NaN for ints).
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X==Y ? ~0 : 0); }
        return {this, this->push(Op:: eq_i32, x.id, y.id)};
    }
    // neq has no dedicated op; it is the bitwise complement of eq.
    I32 Builder::neq(I32 x, I32 y) {
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X!=Y ? ~0 : 0); }
        return ~(x == y);
    }
    I32 Builder:: gt(I32 x, I32 y) {
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X> Y ? ~0 : 0); }
        return {this, this->push(Op:: gt_i32, x.id, y.id)};
    }
    // gte is the complement of lt; lt/lte below just swap operands of gt/gte.
    I32 Builder::gte(I32 x, I32 y) {
        if (x.id == y.id) { return splat(~0); }
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X>=Y ? ~0 : 0); }
        return ~(x < y);
    }
    I32 Builder:: lt(I32 x, I32 y) { return y>x; }
    I32 Builder::lte(I32 x, I32 y) { return y>=x; }
1067
    // Bitwise ops.  These double as logical ops on comparison masks (~0 / 0).
    I32 Builder::bit_and(I32 x, I32 y) {
        if (x.id == y.id) { return x; }                  // x & x == x
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&Y); }
        if (this->isImm(y.id, 0)) { return splat(0); }   // (x & false) == false
        if (this->isImm(x.id, 0)) { return splat(0); }   // (false & y) == false
        if (this->isImm(y.id,~0)) { return x; }          // (x & true) == x
        if (this->isImm(x.id,~0)) { return y; }          // (true & y) == y
        return {this, this->push(Op::bit_and, x.id, y.id)};
    }
    I32 Builder::bit_or(I32 x, I32 y) {
        if (x.id == y.id) { return x; }                   // x | x == x
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X|Y); }
        if (this->isImm(y.id, 0)) { return x; }           // (x | false) == x
        if (this->isImm(x.id, 0)) { return y; }           // (false | y) == y
        if (this->isImm(y.id,~0)) { return splat(~0); }   // (x | true) == true
        if (this->isImm(x.id,~0)) { return splat(~0); }   // (true | y) == true
        return {this, this->push(Op::bit_or, x.id, y.id)};
    }
    I32 Builder::bit_xor(I32 x, I32 y) {
        if (x.id == y.id) { return splat(0); }    // x ^ x == 0
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X^Y); }
        if (this->isImm(y.id, 0)) { return x; }   // (x ^ false) == x
        if (this->isImm(x.id, 0)) { return y; }   // (false ^ y) == y
        return {this, this->push(Op::bit_xor, x.id, y.id)};
    }

    // x & ~y, i.e. clear from x the bits that are set in y.
    I32 Builder::bit_clear(I32 x, I32 y) {
        if (x.id == y.id) { return splat(0); }           // x & ~x == 0
        if (int X,Y; this->allImm(x.id,&X, y.id,&Y)) { return splat(X&~Y); }
        if (this->isImm(y.id, 0)) { return x; }          // (x & ~false) == x
        if (this->isImm(y.id,~0)) { return splat(0); }   // (x & ~true) == false
        if (this->isImm(x.id, 0)) { return splat(0); }   // (false & ~y) == false
        return {this, this->push(Op::bit_clear, x.id, y.id)};
    }
1102
    // Bitwise select: x is an all-bits mask; produce y where x is true, z where false.
    I32 Builder::select(I32 x, I32 y, I32 z) {
        if (y.id == z.id) { return y; }                       // both arms equal: mask irrelevant
        if (int X,Y,Z; this->allImm(x.id,&X, y.id,&Y, z.id,&Z)) { return splat(X?Y:Z); }
        if (this->isImm(x.id,~0)) { return y; }               // true  ? y : z == y
        if (this->isImm(x.id, 0)) { return z; }               // false ? y : z == z
        if (this->isImm(y.id, 0)) { return bit_clear(z,x); }  //     x ? 0 : z == ~x&z
        if (this->isImm(z.id, 0)) { return bit_and  (y,x); }  //     x ? y : 0 ==  x&y
        return {this, this->push(Op::select, x.id, y.id, z.id)};
    }

    // Extract a bit field: (x >> bits) & z, dropping the mask when z already
    // covers every bit that can survive the shift.
    I32 Builder::extract(I32 x, int bits, I32 z) {
        if (unsigned Z; this->allImm(z.id,&Z) && (~0u>>bits) == Z) { return shr(x, bits); }
        return bit_and(z, shr(x, bits));
    }

    // Merge a bit field: x | (y << bits).  Assumes the fields don't overlap (not checked).
    I32 Builder::pack(I32 x, I32 y, int bits) {
        return bit_or(x, shl(y, bits));
    }
1121
    // Round up toward +inf.
    F32 Builder::ceil(F32 x) {
        if (float X; this->allImm(x.id,&X)) { return splat(ceilf(X)); }
        return {this, this->push(Op::ceil, x.id)};
    }
    // Round down toward -inf.
    F32 Builder::floor(F32 x) {
        if (float X; this->allImm(x.id,&X)) { return splat(floorf(X)); }
        return {this, this->push(Op::floor, x.id)};
    }
    // int -> float conversion.
    F32 Builder::to_F32(I32 x) {
        if (int X; this->allImm(x.id,&X)) { return splat((float)X); }
        return {this, this->push(Op::to_f32, x.id)};
    }
    // float -> int, truncating toward zero.
    I32 Builder::trunc(F32 x) {
        if (float X; this->allImm(x.id,&X)) { return splat((int)X); }
        return {this, this->push(Op::trunc, x.id)};
    }
    // float -> int, rounding to nearest (constant fold uses lrintf's rounding mode).
    I32 Builder::round(F32 x) {
        if (float X; this->allImm(x.id,&X)) { return splat((int)lrintf(X)); }
        return {this, this->push(Op::round, x.id)};
    }

    // float -> half (f16) bits, held in the low 16 bits of each lane.
    I32 Builder::to_fp16(F32 x) {
        if (float X; this->allImm(x.id,&X)) { return splat((int)SkFloatToHalf(X)); }
        return {this, this->push(Op::to_fp16, x.id)};
    }
    // half (f16) bits -> float.
    F32 Builder::from_fp16(I32 x) {
        if (int X; this->allImm(x.id,&X)) { return splat(SkHalfToFloat(X)); }
        return {this, this->push(Op::from_fp16, x.id)};
    }

    // Decode a `bits`-wide unorm value to a float in [0,1].
    F32 Builder::from_unorm(int bits, I32 x) {
        F32 limit = splat(1 / ((1<<bits)-1.0f));
        return mul(to_F32(x), limit);
    }
    // Encode a float in [0,1] as a `bits`-wide unorm value, rounding to nearest.
    I32 Builder::to_unorm(int bits, F32 x) {
        F32 limit = splat((1<<bits)-1.0f);
        return round(mul(x, limit));
    }
1160
    // Map an SkColorType to a PixelFormat:
    //   {encoding, r,g,b,a bit counts, r,g,b,a bit shifts from the low bit}.
    PixelFormat SkColorType_to_PixelFormat(SkColorType ct) {
        auto UNORM = PixelFormat::UNORM,
             SRGB  = PixelFormat::SRGB,
             FLOAT = PixelFormat::FLOAT;
        switch (ct) {
            case kUnknown_SkColorType: break;

            case kRGBA_F32_SkColorType: return {FLOAT,32,32,32,32, 0,32,64,96};

            case kRGBA_F16Norm_SkColorType:       return {FLOAT,16,16,16,16, 0,16,32,48};
            case kRGBA_F16_SkColorType:           return {FLOAT,16,16,16,16, 0,16,32,48};
            case kR16G16B16A16_unorm_SkColorType: return {UNORM,16,16,16,16, 0,16,32,48};

            case kA16_float_SkColorType:    return {FLOAT,  0, 0,0,16, 0, 0,0,0};
            case kR16G16_float_SkColorType: return {FLOAT, 16,16,0, 0, 0,16,0,0};

            case kAlpha_8_SkColorType: return {UNORM, 0,0,0,8, 0,0,0,0};
            case kGray_8_SkColorType:  return {UNORM, 8,8,8,0, 0,0,0,0};  // Subtle.

            case kRGB_565_SkColorType:   return {UNORM, 5,6,5,0, 11,5,0,0};  // (BGR)
            case kARGB_4444_SkColorType: return {UNORM, 4,4,4,4, 12,8,4,0};  // (ABGR)

            case kRGBA_8888_SkColorType:  return {UNORM, 8,8,8,8,  0,8,16,24};
            case kRGB_888x_SkColorType:   return {UNORM, 8,8,8,0,  0,8,16,32};  // 32-bit
            case kBGRA_8888_SkColorType:  return {UNORM, 8,8,8,8, 16,8, 0,24};
            case kSRGBA_8888_SkColorType: return { SRGB, 8,8,8,8,  0,8,16,24};

            case kRGBA_1010102_SkColorType: return {UNORM, 10,10,10,2,  0,10,20,30};
            case kBGRA_1010102_SkColorType: return {UNORM, 10,10,10,2, 20,10, 0,30};
            case kRGB_101010x_SkColorType:  return {UNORM, 10,10,10,0,  0,10,20, 0};
            case kBGR_101010x_SkColorType:  return {UNORM, 10,10,10,0, 20,10, 0, 0};

            case kR8G8_unorm_SkColorType:   return {UNORM,  8, 8,0, 0, 0, 8,0,0};
            case kR16G16_unorm_SkColorType: return {UNORM, 16,16,0, 0, 0,16,0,0};
            case kA16_unorm_SkColorType:    return {UNORM,  0, 0,0,16, 0, 0,0,0};
        }
        // Every supported color type is handled above; reaching here is a programming error.
        SkASSERT(false);
        return {UNORM, 0,0,0,0, 0,0,0,0};
    }
1200
1201    static int byte_size(PixelFormat f) {
1202        // What's the highest bit we read?
1203        int bits = std::max(f.r_bits + f.r_shift,
1204                   std::max(f.g_bits + f.g_shift,
1205                   std::max(f.b_bits + f.b_shift,
1206                            f.a_bits + f.a_shift)));
1207        // Round up to bytes.
1208        return (bits + 7) / 8;
1209    }
1210
    // Decode one packed pixel value `x` (format f, at most 4 bytes) into a float Color.
    static Color unpack(PixelFormat f, I32 x) {
        SkASSERT(byte_size(f) <= 4);

        // sRGB channel: decode as unorm, then apply the sRGB transfer function,
        // with its coefficients splatted in as constants.
        auto from_srgb = [](int bits, I32 channel) -> F32 {
            const skcms_TransferFunction* tf = skcms_sRGB_TransferFunction();
            F32 v = from_unorm(bits, channel);
            return sk_program_transfer_fn(v, sRGBish_TF,
                                          v->splat(tf->g),
                                          v->splat(tf->a),
                                          v->splat(tf->b),
                                          v->splat(tf->c),
                                          v->splat(tf->d),
                                          v->splat(tf->e),
                                          v->splat(tf->f));
        };

        // Extract a color channel's bit field and decode it per the format's encoding.
        auto unpack_rgb = [=](int bits, int shift) -> F32 {
            I32 channel = extract(x, shift, (1<<bits)-1);
            switch (f.encoding) {
                case PixelFormat::UNORM: return from_unorm(bits, channel);
                case PixelFormat:: SRGB: return from_srgb (bits, channel);
                case PixelFormat::FLOAT: return from_fp16 (      channel);
            }
            SkUNREACHABLE;
        };
        // Alpha stays linear even in SRGB formats, so it decodes as plain unorm.
        auto unpack_alpha = [=](int bits, int shift) -> F32 {
            I32 channel = extract(x, shift, (1<<bits)-1);
            switch (f.encoding) {
                case PixelFormat::UNORM:
                case PixelFormat:: SRGB: return from_unorm(bits, channel);
                case PixelFormat::FLOAT: return from_fp16 (      channel);
            }
            SkUNREACHABLE;
        };
        // Channels the format doesn't carry default to 0 (rgb) or 1 (alpha).
        return {
            f.r_bits ? unpack_rgb  (f.r_bits, f.r_shift) : x->splat(0.0f),
            f.g_bits ? unpack_rgb  (f.g_bits, f.g_shift) : x->splat(0.0f),
            f.b_bits ? unpack_rgb  (f.b_bits, f.b_shift) : x->splat(0.0f),
            f.a_bits ? unpack_alpha(f.a_bits, f.a_shift) : x->splat(1.0f),
        };
    }
1252
    // Split an 8-byte format into two 4-byte formats, one per 32-bit half:
    // each channel is zeroed out of the half that doesn't contain it.
    static void split_disjoint_8byte_format(PixelFormat f, PixelFormat* lo, PixelFormat* hi) {
        SkASSERT(byte_size(f) == 8);
        // We assume some of the channels are in the low 32 bits, some in the high 32 bits.
        // The assert on byte_size(lo) will trigger if this assumption is violated.
        *lo = f;
        // Drop high-half channels from lo (shift set to 32 so byte_size stays consistent).
        if (f.r_shift >= 32) { lo->r_bits = 0; lo->r_shift = 32; }
        if (f.g_shift >= 32) { lo->g_bits = 0; lo->g_shift = 32; }
        if (f.b_shift >= 32) { lo->b_bits = 0; lo->b_shift = 32; }
        if (f.a_shift >= 32) { lo->a_bits = 0; lo->a_shift = 32; }
        SkASSERT(byte_size(*lo) == 4);

        *hi = f;
        // Drop low-half channels from hi, and rebase the remaining shifts to the high word.
        if (f.r_shift < 32) { hi->r_bits = 0; hi->r_shift = 32; } else { hi->r_shift -= 32; }
        if (f.g_shift < 32) { hi->g_bits = 0; hi->g_shift = 32; } else { hi->g_shift -= 32; }
        if (f.b_shift < 32) { hi->b_bits = 0; hi->b_shift = 32; } else { hi->b_shift -= 32; }
        if (f.a_shift < 32) { hi->a_bits = 0; hi->a_shift = 32; } else { hi->a_shift -= 32; }
        SkASSERT(byte_size(*hi) == 4);
    }
1271
    // The only 16-byte format we support today is RGBA F32,
    // though, TODO, we could generalize that to any swizzle, and to allow UNORM too.
    // Debug-only sanity check; compiles to nothing in release builds.
    static void assert_16byte_is_rgba_f32(PixelFormat f) {
    #if defined(SK_DEBUG)
        SkASSERT(byte_size(f) == 16);
        PixelFormat rgba_f32 = SkColorType_to_PixelFormat(kRGBA_F32_SkColorType);

        // f must agree with canonical RGBA F32 field-for-field.
        SkASSERT(f.encoding == rgba_f32.encoding);

        SkASSERT(f.r_bits == rgba_f32.r_bits);
        SkASSERT(f.g_bits == rgba_f32.g_bits);
        SkASSERT(f.b_bits == rgba_f32.b_bits);
        SkASSERT(f.a_bits == rgba_f32.a_bits);

        SkASSERT(f.r_shift == rgba_f32.r_shift);
        SkASSERT(f.g_shift == rgba_f32.g_shift);
        SkASSERT(f.b_shift == rgba_f32.b_shift);
        SkASSERT(f.a_shift == rgba_f32.a_shift);
    #endif
    }
1292
    // Load one pixel of format f from a varying pointer, dispatching on pixel size.
    Color Builder::load(PixelFormat f, Ptr ptr) {
        switch (byte_size(f)) {
            case 1: return unpack(f, load8 (ptr));
            case 2: return unpack(f, load16(ptr));
            case 4: return unpack(f, load32(ptr));
            case 8: {
                // Split into two 4-byte half-formats, unpack each 32-bit half,
                // and take each channel from whichever half actually holds it.
                PixelFormat lo,hi;
                split_disjoint_8byte_format(f, &lo,&hi);
                Color l = unpack(lo, load64(ptr, 0)),
                      h = unpack(hi, load64(ptr, 1));
                return {
                    lo.r_bits ? l.r : h.r,
                    lo.g_bits ? l.g : h.g,
                    lo.b_bits ? l.b : h.b,
                    lo.a_bits ? l.a : h.a,
                };
            }
            case 16: {
                // Only RGBA F32 here: reinterpret each 32-bit word directly as a float.
                assert_16byte_is_rgba_f32(f);
                return {
                    pun_to_F32(load128(ptr, 0)),
                    pun_to_F32(load128(ptr, 1)),
                    pun_to_F32(load128(ptr, 2)),
                    pun_to_F32(load128(ptr, 3)),
                };
            }
            default: SkUNREACHABLE;
        }
        return {};
    }
1323
    // Gather one pixel of format f per lane, at per-lane `index` into a uniform buffer.
    Color Builder::gather(PixelFormat f, UPtr ptr, int offset, I32 index) {
        switch (byte_size(f)) {
            case 1: return unpack(f, gather8 (ptr, offset, index));
            case 2: return unpack(f, gather16(ptr, offset, index));
            case 4: return unpack(f, gather32(ptr, offset, index));
            case 8: {
                // Two 32-bit gathers per pixel: word (2*index) is the low half, (2*index)+1 the high.
                PixelFormat lo,hi;
                split_disjoint_8byte_format(f, &lo,&hi);
                Color l = unpack(lo, gather32(ptr, offset, (index<<1)+0)),
                      h = unpack(hi, gather32(ptr, offset, (index<<1)+1));
                return {
                    lo.r_bits ? l.r : h.r,
                    lo.g_bits ? l.g : h.g,
                    lo.b_bits ? l.b : h.b,
                    lo.a_bits ? l.a : h.a,
                };
            }
            case 16: {
                // Only RGBA F32: four float gathers at words (4*index)+0..3.
                assert_16byte_is_rgba_f32(f);
                return {
                    gatherF(ptr, offset, (index<<2)+0),
                    gatherF(ptr, offset, (index<<2)+1),
                    gatherF(ptr, offset, (index<<2)+2),
                    gatherF(ptr, offset, (index<<2)+3),
                };
            }
            default: SkUNREACHABLE;
        }
        return {};
    }
1354
    // Encode Color c as a packed pixel value of format f (at most 4 bytes wide).
    static I32 pack32(PixelFormat f, Color c) {
        SkASSERT(byte_size(f) <= 4);

        // sRGB channel: apply the inverse sRGB transfer function, then encode as unorm.
        auto to_srgb = [](int bits, F32 v) {
            const skcms_TransferFunction* tf = skcms_sRGB_Inverse_TransferFunction();
            return to_unorm(bits, sk_program_transfer_fn(v, sRGBish_TF,
                                                         v->splat(tf->g),
                                                         v->splat(tf->a),
                                                         v->splat(tf->b),
                                                         v->splat(tf->c),
                                                         v->splat(tf->d),
                                                         v->splat(tf->e),
                                                         v->splat(tf->f)));
        };

        // Accumulate each present channel into `packed` at its bit position.
        I32 packed = c->splat(0);
        auto pack_rgb = [&](F32 channel, int bits, int shift) {
            I32 encoded;
            switch (f.encoding) {
                case PixelFormat::UNORM: encoded = to_unorm(bits, channel); break;
                case PixelFormat:: SRGB: encoded = to_srgb (bits, channel); break;
                case PixelFormat::FLOAT: encoded = to_fp16 (      channel); break;
            }
            packed = pack(packed, encoded, shift);
        };
        // Alpha stays linear even for SRGB formats, so it's always unorm (or f16).
        auto pack_alpha = [&](F32 channel, int bits, int shift) {
            I32 encoded;
            switch (f.encoding) {
                case PixelFormat::UNORM:
                case PixelFormat:: SRGB: encoded = to_unorm(bits, channel); break;
                case PixelFormat::FLOAT: encoded = to_fp16 (      channel); break;
            }
            packed = pack(packed, encoded, shift);
        };
        if (f.r_bits) { pack_rgb  (c.r, f.r_bits, f.r_shift); }
        if (f.g_bits) { pack_rgb  (c.g, f.g_bits, f.g_shift); }
        if (f.b_bits) { pack_rgb  (c.b, f.b_bits, f.b_shift); }
        if (f.a_bits) { pack_alpha(c.a, f.a_bits, f.a_shift); }
        return packed;
    }
1395
    // Store Color c as format f through a varying pointer, dispatching on pixel size.
    void Builder::store(PixelFormat f, Ptr ptr, Color c) {
        // Detect a grayscale PixelFormat: r,g,b bit counts and shifts all equal.
        if (f.r_bits  == f.g_bits  && f.g_bits  == f.b_bits &&
            f.r_shift == f.g_shift && f.g_shift == f.b_shift) {

            // TODO: pull these coefficients from an SkColorSpace?  This is sRGB luma/luminance.
            c.r = c.r * 0.2126f
                + c.g * 0.7152f
                + c.b * 0.0722f;
            // Only the (luma) r channel is packed; suppress g and b.
            f.g_bits = f.b_bits = 0;
        }

        switch (byte_size(f)) {
            case 1: store8 (ptr, pack32(f,c)); break;
            case 2: store16(ptr, pack32(f,c)); break;
            case 4: store32(ptr, pack32(f,c)); break;
            case 8: {
                // Pack each 32-bit half independently via the split half-formats.
                PixelFormat lo,hi;
                split_disjoint_8byte_format(f, &lo,&hi);
                store64(ptr, pack32(lo,c)
                           , pack32(hi,c));
                break;
            }
            case 16: {
                // Only RGBA F32: store the raw float bits of each channel.
                assert_16byte_is_rgba_f32(f);
                store128(ptr, pun_to_I32(c.r), pun_to_I32(c.g), pun_to_I32(c.b), pun_to_I32(c.a));
                break;
            }
            default: SkUNREACHABLE;
        }
    }
1427
1428    void Builder::unpremul(F32* r, F32* g, F32* b, F32 a) {
1429        skvm::F32 invA = 1.0f / a,
1430                  inf  = pun_to_F32(splat(0x7f800000));
1431        // If a is 0, so are *r,*g,*b, so set invA to 0 to avoid 0*inf=NaN (instead 0*0 = 0).
1432        invA = select(invA < inf, invA
1433                                , 0.0f);
1434        *r *= invA;
1435        *g *= invA;
1436        *b *= invA;
1437    }
1438
1439    void Builder::premul(F32* r, F32* g, F32* b, F32 a) {
1440        *r *= a;
1441        *g *= a;
1442        *b *= a;
1443    }
1444
1445    Color Builder::uniformColor(SkColor4f color, Uniforms* uniforms) {
1446        auto [r,g,b,a] = color;
1447        return {
1448            uniformF(uniforms->pushF(r)),
1449            uniformF(uniforms->pushF(g)),
1450            uniformF(uniforms->pushF(b)),
1451            uniformF(uniforms->pushF(a)),
1452        };
1453    }
1454
1455    F32 Builder::lerp(F32 lo, F32 hi, F32 t) {
1456        if (this->isImm(t.id, 0.0f)) { return lo; }
1457        if (this->isImm(t.id, 1.0f)) { return hi; }
1458        return mad(sub(hi, lo), t, lo);
1459    }
1460
1461    Color Builder::lerp(Color lo, Color hi, F32 t) {
1462        return {
1463            lerp(lo.r, hi.r, t),
1464            lerp(lo.g, hi.g, t),
1465            lerp(lo.b, hi.b, t),
1466            lerp(lo.a, hi.a, t),
1467        };
1468    }
1469
    // Convert an RGBA color to HSLA (hue, saturation, lightness, alpha).
    HSLA Builder::to_hsla(Color c) {
        F32 mx = max(max(c.r,c.g),c.b),   // brightest channel
            mn = min(min(c.r,c.g),c.b),   // darkest channel
             d = mx - mn,                 // chroma
          invd = 1.0f / d,
        g_lt_b = select(c.g < c.b, splat(6.0f)
                                 , splat(0.0f));

        // Hue: each select picks the sector of the max channel; the +6 wrap in
        // g_lt_b keeps the red sector non-negative before the final /6.
        F32 h = (1/6.0f) * select(mx == mn,  0.0f,
                           select(mx == c.r, invd * (c.g - c.b) + g_lt_b,
                           select(mx == c.g, invd * (c.b - c.r) + 2.0f
                                           , invd * (c.r - c.g) + 4.0f)));

        // Lightness is the average of max and min; saturation divides chroma by
        // 2-sum or sum depending on which side of mid-lightness we're on.
        F32 sum = mx + mn,
              l = sum * 0.5f,
              s = select(mx == mn, 0.0f
                                 , d / select(l > 0.5f, 2.0f - sum
                                                      , sum));
        return {h, s, l, c.a};
    }
1490
    // Convert an HSLA color back to RGBA.
    Color Builder::to_rgba(HSLA c) {
        // See GrRGBToHSLFilterEffect.fp

        // (The bound 'a' is unused; alpha passes straight through as c.a below.)
        auto [h,s,l,a] = c;
        F32 x = s * (1.0f - abs(l + l - 1.0f));   // chroma-like scale factor

        // l=l init-capture: structured bindings can't be captured by lambdas in C++17.
        auto hue_to_rgb = [&,l=l](auto hue) {
            auto q = abs(6.0f * fract(hue) - 3.0f) - 1.0f;
            return x * (clamp01(q) - 0.5f) + l;
        };

        // Note the hue offsets (0, 2/3, 1/3) match the referenced .fp; fract() wraps them.
        return {
            hue_to_rgb(h + 0/3.0f),
            hue_to_rgb(h + 2/3.0f),
            hue_to_rgb(h + 1/3.0f),
            c.a,
        };
    }
1509
1510    // We're basing our implementation of non-separable blend modes on
1511    //   https://www.w3.org/TR/compositing-1/#blendingnonseparable.
1512    // and
1513    //   https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
1514    // They're equivalent, but ES' math has been better simplified.
1515    //
1516    // Anything extra we add beyond that is to make the math work with premul inputs.
1517
1518    static skvm::F32 saturation(skvm::F32 r, skvm::F32 g, skvm::F32 b) {
1519        return max(r, max(g, b))
1520             - min(r, min(g, b));
1521    }
1522
1523    static skvm::F32 luminance(skvm::F32 r, skvm::F32 g, skvm::F32 b) {
1524        return r*0.30f + g*0.59f + b*0.11f;
1525    }
1526
1527    static void set_sat(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 s) {
1528        F32 mn  = min(*r, min(*g, *b)),
1529            mx  = max(*r, max(*g, *b)),
1530            sat = mx - mn;
1531
1532        // Map min channel to 0, max channel to s, and scale the middle proportionally.
1533        auto scale = [&](skvm::F32 c) {
1534            auto scaled = ((c - mn) * s) / sat;
1535            return select(is_finite(scaled), scaled, 0.0f);
1536        };
1537        *r = scale(*r);
1538        *g = scale(*g);
1539        *b = scale(*b);
1540    }
1541
1542    static void set_lum(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 lu) {
1543        auto diff = lu - luminance(*r, *g, *b);
1544        *r += diff;
1545        *g += diff;
1546        *b += diff;
1547    }
1548
    // Pull out-of-range channels back toward the luminance, then clamp to [0,1].
    static void clip_color(skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 a) {
        F32 mn  = min(*r, min(*g, *b)),
            mx  = max(*r, max(*g, *b)),
            lu = luminance(*r, *g, *b);

        auto clip = [&](auto c) {
            // If any channel dips below 0, rescale (c-lu) so the minimum maps to 0.
            c = select(mn >= 0, c
                              , lu + ((c-lu)*(  lu)) / (lu-mn));
            // If any channel exceeds alpha, rescale so the maximum maps to a.
            c = select(mx >  a, lu + ((c-lu)*(a-lu)) / (mx-lu)
                              , c);
            return clamp01(c);  // May be a little negative, or worse, NaN.
        };
        *r = clip(*r);
        *g = clip(*g);
        *b = clip(*b);
    }
1565
    // Emit one SkBlendMode's math for premul src and dst, returning the blended
    // premul color.  (See the notes above: separable modes follow Porter-Duff,
    // non-separable modes the W3C/OpenGL ES math, adapted for premul inputs.)
    Color Builder::blend(SkBlendMode mode, Color src, Color dst) {
        // Many modes are sums of two products; mma() is that common shape.
        auto mma = [](skvm::F32 x, skvm::F32 y, skvm::F32 z, skvm::F32 w) {
            return x*y + z*w;
        };

        auto two = [](skvm::F32 x) { return x+x; };

        // Apply fn independently to all four channels.
        auto apply_rgba = [&](auto fn) {
            return Color {
                fn(src.r, dst.r),
                fn(src.g, dst.g),
                fn(src.b, dst.b),
                fn(src.a, dst.a),
            };
        };

        // Apply fn to r,g,b; alpha always blends as srcover.
        auto apply_rgb_srcover_a = [&](auto fn) {
            return Color {
                fn(src.r, dst.r),
                fn(src.g, dst.g),
                fn(src.b, dst.b),
                mad(dst.a, 1-src.a, src.a),   // srcover for alpha
            };
        };

        // Shared postlude for the non-separable HSL modes at the bottom of the switch.
        auto non_sep = [&](auto R, auto G, auto B) {
            return Color{
                R + mma(src.r, 1-dst.a,  dst.r, 1-src.a),
                G + mma(src.g, 1-dst.a,  dst.g, 1-src.a),
                B + mma(src.b, 1-dst.a,  dst.b, 1-src.a),
                mad(dst.a, 1-src.a, src.a),   // srcover for alpha
            };
        };

        switch (mode) {
            default:
                SkASSERT(false);
                [[fallthrough]]; /*but also, for safety, fallthrough*/

            case SkBlendMode::kClear: return { splat(0.0f), splat(0.0f), splat(0.0f), splat(0.0f) };

            case SkBlendMode::kSrc: return src;
            case SkBlendMode::kDst: return dst;

            // The Over/In/Out/ATop pairs are mirror images: swap src and dst, then share code.
            case SkBlendMode::kDstOver: std::swap(src, dst); [[fallthrough]];
            case SkBlendMode::kSrcOver:
                return apply_rgba([&](auto s, auto d) {
                    return mad(d,1-src.a, s);
                });

            case SkBlendMode::kDstIn: std::swap(src, dst); [[fallthrough]];
            case SkBlendMode::kSrcIn:
                return apply_rgba([&](auto s, auto d) {
                    return s * dst.a;
                });

            case SkBlendMode::kDstOut: std::swap(src, dst); [[fallthrough]];

            case SkBlendMode::kSrcOut:
                return apply_rgba([&](auto s, auto d) {
                    return s * (1-dst.a);
                });

            case SkBlendMode::kDstATop: std::swap(src, dst); [[fallthrough]];
            case SkBlendMode::kSrcATop:
                return apply_rgba([&](auto s, auto d) {
                    return mma(s, dst.a,  d, 1-src.a);
                });

            case SkBlendMode::kXor:
                return apply_rgba([&](auto s, auto d) {
                    return mma(s, 1-dst.a,  d, 1-src.a);
                });

            case SkBlendMode::kPlus:
                return apply_rgba([&](auto s, auto d) {
                    return min(s+d, 1.0f);
                });

            case SkBlendMode::kModulate:
                return apply_rgba([&](auto s, auto d) {
                    return s * d;
                });

            case SkBlendMode::kScreen:
                // (s+d)-(s*d) gave us trouble with our "r,g,b <= after blending" asserts.
                // It's kind of plausible that s + (d - sd) keeps more precision?
                return apply_rgba([&](auto s, auto d) {
                    return s + (d - s*d);
                });

            case SkBlendMode::kDarken:
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    return s + (d - max(s * dst.a,
                                        d * src.a));
                });

            case SkBlendMode::kLighten:
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    return s + (d - min(s * dst.a,
                                        d * src.a));
                });

            case SkBlendMode::kDifference:
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    return s + (d - two(min(s * dst.a,
                                            d * src.a)));
                });

            case SkBlendMode::kExclusion:
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    return s + (d - two(s * d));
                });

            // Burn and dodge divide, so both guard against non-finite intermediates.
            case SkBlendMode::kColorBurn:
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    auto mn   = min(dst.a,
                                    src.a * (dst.a - d) / s),
                         burn = src.a * (dst.a - mn) + mma(s, 1-dst.a, d, 1-src.a);
                    return select(d == dst.a     , s * (1-dst.a) + d,
                           select(is_finite(burn), burn
                                                 , d * (1-src.a) + s));
                });

            case SkBlendMode::kColorDodge:
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    auto dodge = src.a * min(dst.a,
                                             d * src.a / (src.a - s))
                                       + mma(s, 1-dst.a, d, 1-src.a);
                    return select(d == 0.0f       , s * (1-dst.a) + d,
                           select(is_finite(dodge), dodge
                                                  , d * (1-src.a) + s));
                });

            // HardLight and Overlay share one formula with s and d's roles in the
            // select condition swapped.
            case SkBlendMode::kHardLight:
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    return mma(s, 1-dst.a, d, 1-src.a) +
                           select(two(s) <= src.a,
                                  two(s * d),
                                  src.a * dst.a - two((dst.a - d) * (src.a - s)));
                });

            case SkBlendMode::kOverlay:
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    return mma(s, 1-dst.a, d, 1-src.a) +
                           select(two(d) <= dst.a,
                                  two(s * d),
                                  src.a * dst.a - two((dst.a - d) * (src.a - s)));
                });

            case SkBlendMode::kMultiply:
                return apply_rgba([&](auto s, auto d) {
                    return mma(s, 1-dst.a, d, 1-src.a) + s * d;
                });

            case SkBlendMode::kSoftLight:
                return apply_rgb_srcover_a([&](auto s, auto d) {
                    auto  m = select(dst.a > 0.0f, d / dst.a
                                                 , 0.0f),
                         s2 = two(s),
                         m4 = 4*m;

                         // The logic forks three ways:
                         //    1. dark src?
                         //    2. light src, dark dst?
                         //    3. light src, light dst?

                         // Used in case 1
                    auto darkSrc = d * ((s2-src.a) * (1-m) + src.a),
                         // Used in case 2
                         darkDst = (m4 * m4 + m4) * (m-1) + 7*m,
                         // Used in case 3.
                         liteDst = sqrt(m) - m,
                         // Used in 2 or 3?
                         liteSrc = dst.a * (s2 - src.a) * select(4*d <= dst.a, darkDst
                                                                             , liteDst)
                                   + d * src.a;
                    return s * (1-dst.a) + d * (1-src.a) + select(s2 <= src.a, darkSrc
                                                                             , liteSrc);
                });

            // The four non-separable modes mix one color's hue/saturation with the
            // other's luminance, all scaled into premul space first.
            case SkBlendMode::kHue: {
                skvm::F32 R = src.r * src.a,
                          G = src.g * src.a,
                          B = src.b * src.a;

                set_sat   (&R, &G, &B, src.a * saturation(dst.r, dst.g, dst.b));
                set_lum   (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b));
                clip_color(&R, &G, &B, src.a * dst.a);

                return non_sep(R, G, B);
            }

            case SkBlendMode::kSaturation: {
                skvm::F32 R = dst.r * src.a,
                          G = dst.g * src.a,
                          B = dst.b * src.a;

                set_sat   (&R, &G, &B, dst.a * saturation(src.r, src.g, src.b));
                set_lum   (&R, &G, &B, src.a * luminance (dst.r, dst.g, dst.b));
                clip_color(&R, &G, &B, src.a * dst.a);

                return non_sep(R, G, B);
            }

            case SkBlendMode::kColor: {
                skvm::F32 R = src.r * dst.a,
                          G = src.g * dst.a,
                          B = src.b * dst.a;

                set_lum   (&R, &G, &B, src.a * luminance(dst.r, dst.g, dst.b));
                clip_color(&R, &G, &B, src.a * dst.a);

                return non_sep(R, G, B);
            }

            case SkBlendMode::kLuminosity: {
                skvm::F32 R = dst.r * src.a,
                          G = dst.g * src.a,
                          B = dst.b * src.a;

                set_lum   (&R, &G, &B, dst.a * luminance(src.r, src.g, src.b));
                clip_color(&R, &G, &B, dst.a * src.a);

                return non_sep(R, G, B);
            }
        }
    }
1794
1795    // ~~~~ Program::eval() and co. ~~~~ //
1796
1797    // Handy references for x86-64 instruction encoding:
1798    // https://wiki.osdev.org/X86-64_Instruction_Encoding
1799    // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x64.htm
1800    // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x86.htm
1801    // http://ref.x86asm.net/coder64.html
1802
1803    // Used for ModRM / immediate instruction encoding.
1804    static uint8_t _233(int a, int b, int c) {
1805        return (a & 3) << 6
1806             | (b & 7) << 3
1807             | (c & 7) << 0;
1808    }
1809
1810    // ModRM byte encodes the arguments of an opcode.
1811    enum class Mod { Indirect, OneByteImm, FourByteImm, Direct };
1812    static uint8_t mod_rm(Mod mod, int reg, int rm) {
1813        return _233((int)mod, reg, rm);
1814    }
1815
1816    static Mod mod(int imm) {
1817        if (imm == 0)               { return Mod::Indirect; }
1818        if (SkTFitsIn<int8_t>(imm)) { return Mod::OneByteImm; }
1819        return Mod::FourByteImm;
1820    }
1821
1822    static int imm_bytes(Mod mod) {
1823        switch (mod) {
1824            case Mod::Indirect:    return 0;
1825            case Mod::OneByteImm:  return 1;
1826            case Mod::FourByteImm: return 4;
1827            case Mod::Direct: SkUNREACHABLE;
1828        }
1829        SkUNREACHABLE;
1830    }
1831
1832    // SIB byte encodes a memory address, base + (index * scale).
1833    static uint8_t sib(Assembler::Scale scale, int index, int base) {
1834        return _233((int)scale, index, base);
1835    }
1836
1837    // The REX prefix is used to extend most old 32-bit instructions to 64-bit.
1838    static uint8_t rex(bool W,   // If set, operation is 64-bit, otherwise default, usually 32-bit.
1839                       bool R,   // Extra top bit to select ModRM reg, registers 8-15.
1840                       bool X,   // Extra top bit for SIB index register.
1841                       bool B) { // Extra top bit for SIB base or ModRM rm register.
1842        return 0b01000000   // Fixed 0100 for top four bits.
1843             | (W << 3)
1844             | (R << 2)
1845             | (X << 1)
1846             | (B << 0);
1847    }
1848
1849
1850    // The VEX prefix extends SSE operations to AVX.  Used generally, even with XMM.
1851    struct VEX {
1852        int     len;
1853        uint8_t bytes[3];
1854    };
1855
1856    static VEX vex(bool  WE,   // Like REX W for int operations, or opcode extension for float?
1857                   bool   R,   // Same as REX R.  Pass high bit of dst register, dst>>3.
1858                   bool   X,   // Same as REX X.
1859                   bool   B,   // Same as REX B.  Pass y>>3 for 3-arg ops, x>>3 for 2-arg.
1860                   int  map,   // SSE opcode map selector: 0x0f, 0x380f, 0x3a0f.
1861                   int vvvv,   // 4-bit second operand register.  Pass our x for 3-arg ops.
1862                   bool   L,   // Set for 256-bit ymm operations, off for 128-bit xmm.
1863                   int   pp) { // SSE mandatory prefix: 0x66, 0xf3, 0xf2, else none.
1864
1865        // Pack x86 opcode map selector to 5-bit VEX encoding.
1866        map = [map]{
1867            switch (map) {
1868                case   0x0f: return 0b00001;
1869                case 0x380f: return 0b00010;
1870                case 0x3a0f: return 0b00011;
1871                // Several more cases only used by XOP / TBM.
1872            }
1873            SkUNREACHABLE;
1874        }();
1875
1876        // Pack  mandatory SSE opcode prefix byte to 2-bit VEX encoding.
1877        pp = [pp]{
1878            switch (pp) {
1879                case 0x66: return 0b01;
1880                case 0xf3: return 0b10;
1881                case 0xf2: return 0b11;
1882            }
1883            return 0b00;
1884        }();
1885
1886        VEX vex = {0, {0,0,0}};
1887        if (X == 0 && B == 0 && WE == 0 && map == 0b00001) {
1888            // With these conditions met, we can optionally compress VEX to 2-byte.
1889            vex.len = 2;
1890            vex.bytes[0] = 0xc5;
1891            vex.bytes[1] = (pp      &  3) << 0
1892                         | (L       &  1) << 2
1893                         | (~vvvv   & 15) << 3
1894                         | (~(int)R &  1) << 7;
1895        } else {
1896            // We could use this 3-byte VEX prefix all the time if we like.
1897            vex.len = 3;
1898            vex.bytes[0] = 0xc4;
1899            vex.bytes[1] = (map     & 31) << 0
1900                         | (~(int)B &  1) << 5
1901                         | (~(int)X &  1) << 6
1902                         | (~(int)R &  1) << 7;
1903            vex.bytes[2] = (pp    &  3) << 0
1904                         | (L     &  1) << 2
1905                         | (~vvvv & 15) << 3
1906                         | (WE    &  1) << 7;
1907        }
1908        return vex;
1909    }
1910
    // A null buf means "count bytes only": bytes() below skips the memcpy but
    // still advances fSize, so a first pass can size the buffer for a real pass.
    Assembler::Assembler(void* buf) : fCode((uint8_t*)buf), fSize(0) {}

    // Number of bytes emitted (or counted) so far.
    size_t Assembler::size() const { return fSize; }
1914
1915    void Assembler::bytes(const void* p, int n) {
1916        if (fCode) {
1917            memcpy(fCode+fSize, p, n);
1918        }
1919        fSize += n;
1920    }
1921
    void Assembler::byte(uint8_t b) { this->bytes(&b, 1); }    // emit one byte
    void Assembler::word(uint32_t w) { this->bytes(&w, 4); }   // emit 4 bytes, host byte order
1924
1925    void Assembler::align(int mod) {
1926        while (this->size() % mod) {
1927            this->byte(0x00);
1928        }
1929    }
1930
    // int3 breakpoint (0xcc), handy for trapping into a debugger from JIT code.
    void Assembler::int3() {
        this->byte(0xcc);
    }

    // vzeroupper (c5 f8 77): zero the upper ymm lanes, avoiding AVX/SSE
    // transition stalls when mixed with SSE-encoded code.
    void Assembler::vzeroupper() {
        this->byte(0xc5);
        this->byte(0xf8);
        this->byte(0x77);
    }
    void Assembler::ret() { this->byte(0xc3); }   // near return
1941
    // Emit a 64-bit integer instruction with register or memory destination:
    // REX prefix, 1- or 2-byte opcode, ModRM, optional SIB, optional displacement.
    // Multi-byte opcodes keep 0x0f in their low byte (e.g. 0xB60F), so writing
    // two bytes in host (little-endian) order emits the 0x0f escape first.
    void Assembler::op(int opcode, Operand dst, GP64 x) {
        if (dst.kind == Operand::REG) {
            this->byte(rex(W1,x>>3,0,dst.reg>>3));
            this->bytes(&opcode, SkTFitsIn<uint8_t>(opcode) ? 1 : 2);
            this->byte(mod_rm(Mod::Direct, x, dst.reg&7));
        } else {
            SkASSERT(dst.kind == Operand::MEM);
            const Mem& m = dst.mem;
            // rsp in the ModRM rm slot signals "SIB byte follows" (see the note in
            // the VEX op() overload below), so we need SIB when the base looks
            // like rsp or when there's a real index register.
            const bool need_SIB = (m.base&7) == rsp
                               || m.index != rsp;

            this->byte(rex(W1,x>>3,m.index>>3,m.base>>3));
            this->bytes(&opcode, SkTFitsIn<uint8_t>(opcode) ? 1 : 2);
            this->byte(mod_rm(mod(m.disp), x&7, (need_SIB ? rsp : m.base)&7));
            if (need_SIB) {
                this->byte(sib(m.scale, m.index&7, m.base&7));
            }
            this->bytes(&m.disp, imm_bytes(mod(m.disp)));
        }
    }
1962
    // Emit an integer instruction whose source is an immediate.  opcode_ext is
    // the "/digit" opcode extension; it rides in ModRM's reg field by being
    // passed through the GP64 parameter of op() above.
    void Assembler::op(int opcode, int opcode_ext, Operand dst, int imm) {
        opcode |= 0b1000'0000;   // top bit set for instructions with any immediate

        int imm_bytes = 4;
        if (SkTFitsIn<int8_t>(imm)) {
            imm_bytes = 1;
            opcode |= 0b0000'0010;  // second bit set for 8-bit immediate, else 32-bit.
        }

        this->op(opcode, dst, (GP64)opcode_ext);
        this->bytes(&imm, imm_bytes);
    }
1975
    // r/m (op)= imm forms.  The immediate op() above morphs opcode 0x01 into the
    // 0x81/0x83 immediate group; the 3-bit second argument is the /digit extension.
    void Assembler::add(Operand dst, int imm) { this->op(0x01,0b000, dst,imm); }
    void Assembler::sub(Operand dst, int imm) { this->op(0x01,0b101, dst,imm); }
    void Assembler::cmp(Operand dst, int imm) { this->op(0x01,0b111, dst,imm); }
1979
    // These don't work quite like the other instructions with immediates:
    // these immediates are always fixed size at 4 bytes or 1 byte.
    void Assembler::mov(Operand dst, int imm) {    // mov r/m, imm32 (0xC7 /0)
        this->op(0xC7,dst,(GP64)0b000);
        this->word(imm);
    }
    void Assembler::movb(Operand dst, int imm) {   // mov r/m8, imm8 (0xC6 /0)
        this->op(0xC6,dst,(GP64)0b000);
        this->byte(imm);
    }
1990
    // Same integer ops in both directions: first group writes to an r/m
    // destination from a register, second group the reverse (operands swapped).
    void Assembler::add (Operand dst, GP64 x) { this->op(0x01, dst,x); }
    void Assembler::sub (Operand dst, GP64 x) { this->op(0x29, dst,x); }
    void Assembler::cmp (Operand dst, GP64 x) { this->op(0x39, dst,x); }
    void Assembler::mov (Operand dst, GP64 x) { this->op(0x89, dst,x); }
    void Assembler::movb(Operand dst, GP64 x) { this->op(0x88, dst,x); }

    void Assembler::add (GP64 dst, Operand x) { this->op(0x03, x,dst); }
    void Assembler::sub (GP64 dst, Operand x) { this->op(0x2B, x,dst); }
    void Assembler::cmp (GP64 dst, Operand x) { this->op(0x3B, x,dst); }
    void Assembler::mov (GP64 dst, Operand x) { this->op(0x8B, x,dst); }
    void Assembler::movb(GP64 dst, Operand x) { this->op(0x8A, x,dst); }

    // Zero-extending byte/word loads; 0xB60F emits as 0f b6 (see op() above).
    void Assembler::movzbq(GP64 dst, Operand x) { this->op(0xB60F, x,dst); }
    void Assembler::movzwq(GP64 dst, Operand x) { this->op(0xB70F, x,dst); }
2005
    // ~~~~ AVX/AVX2 emitters ~~~~
    // Each routes through the VEX-encoded op() overload below with its
    // (mandatory prefix, opcode map, opcode) triple from the ISA tables.

    // 32-bit integer lane math.
    void Assembler::vpaddd (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xfe, dst,x,y); }
    void Assembler::vpsubd (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xfa, dst,x,y); }
    void Assembler::vpmulld(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x40, dst,x,y); }

    // 16-bit integer lane math.
    void Assembler::vpaddw   (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xfd, dst,x,y); }
    void Assembler::vpsubw   (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xf9, dst,x,y); }
    void Assembler::vpmullw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xd5, dst,x,y); }
    void Assembler::vpavgw   (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xe3, dst,x,y); }
    void Assembler::vpmulhrsw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x0b, dst,x,y); }
    void Assembler::vpminsw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xea, dst,x,y); }
    void Assembler::vpmaxsw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0xee, dst,x,y); }
    void Assembler::vpminuw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x3a, dst,x,y); }
    void Assembler::vpmaxuw  (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x3e, dst,x,y); }

    void Assembler::vpabsw(Ymm dst, Operand x) { this->op(0x66,0x380f,0x1d, dst,x); }


    // Bitwise logic.
    void Assembler::vpand (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdb, dst,x,y); }
    void Assembler::vpor  (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xeb, dst,x,y); }
    void Assembler::vpxor (Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xef, dst,x,y); }
    void Assembler::vpandn(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0xdf, dst,x,y); }

    // Packed single-precision float math.
    void Assembler::vaddps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x58, dst,x,y); }
    void Assembler::vsubps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5c, dst,x,y); }
    void Assembler::vmulps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x59, dst,x,y); }
    void Assembler::vdivps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5e, dst,x,y); }
    void Assembler::vminps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5d, dst,x,y); }
    void Assembler::vmaxps(Ymm dst, Ymm x, Operand y) { this->op(0,0x0f,0x5f, dst,x,y); }

    // Fused multiply-add family; 132/213/231 pick which operand is overwritten.
    void Assembler::vfmadd132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x98, dst,x,y); }
    void Assembler::vfmadd213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xa8, dst,x,y); }
    void Assembler::vfmadd231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xb8, dst,x,y); }

    void Assembler::vfmsub132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x9a, dst,x,y); }
    void Assembler::vfmsub213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xaa, dst,x,y); }
    void Assembler::vfmsub231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xba, dst,x,y); }

    void Assembler::vfnmadd132ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x9c, dst,x,y); }
    void Assembler::vfnmadd213ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xac, dst,x,y); }
    void Assembler::vfnmadd231ps(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0xbc, dst,x,y); }

    // Narrowing packs and lane interleaves.
    void Assembler::vpackusdw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x2b, dst,x,y); }
    void Assembler::vpackuswb(Ymm dst, Ymm x, Operand y) { this->op(0x66,  0x0f,0x67, dst,x,y); }

    void Assembler::vpunpckldq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x62, dst,x,y); }
    void Assembler::vpunpckhdq(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x6a, dst,x,y); }

    // Integer compares, producing all-ones / all-zeros lane masks.
    void Assembler::vpcmpeqd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x76, dst,x,y); }
    void Assembler::vpcmpeqw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x75, dst,x,y); }
    void Assembler::vpcmpgtd(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x66, dst,x,y); }
    void Assembler::vpcmpgtw(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x0f,0x65, dst,x,y); }
2057
2058
    // Append a 1-byte immediate after an instruction whose final operand may
    // have been a label displacement.
    void Assembler::imm_byte_after_operand(const Operand& operand, int imm) {
        // When we've embedded a label displacement in the middle of an instruction,
        // we need to tweak it a little so that the resolved displacement starts
        // from the end of the instruction and not the end of the displacement.
        if (operand.kind == Operand::LABEL && fCode) {
            int disp;
            memcpy(&disp, fCode+fSize-4, 4);
            disp--;   // the extra immediate byte below lengthens the instruction by one
            memcpy(fCode+fSize-4, &disp, 4);
        }
        this->byte(imm);
    }
2071
    // Packed float compare; the comparison predicate rides in the trailing immediate.
    void Assembler::vcmpps(Ymm dst, Ymm x, Operand y, int imm) {
        this->op(0,0x0f,0xc2, dst,x,y);
        this->imm_byte_after_operand(y, imm);
    }

    // Byte-wise blend; the mask register z is encoded in the immediate's top 4 bits.
    void Assembler::vpblendvb(Ymm dst, Ymm x, Operand y, Ymm z) {
        this->op(0x66,0x3a0f,0x4c, dst,x,y);
        this->imm_byte_after_operand(y, z << 4);
    }
2081
    // Shift instructions encode their opcode extension as "dst", dst as x, and x as y.
    // Extensions: /6 = shift left, /2 = logical shift right, /4 = arithmetic shift right;
    // the shift amount follows as a 1-byte immediate.
    void Assembler::vpslld(Ymm dst, Ymm x, int imm) {
        this->op(0x66,0x0f,0x72,(Ymm)6, dst,x);
        this->byte(imm);
    }
    void Assembler::vpsrld(Ymm dst, Ymm x, int imm) {
        this->op(0x66,0x0f,0x72,(Ymm)2, dst,x);
        this->byte(imm);
    }
    void Assembler::vpsrad(Ymm dst, Ymm x, int imm) {
        this->op(0x66,0x0f,0x72,(Ymm)4, dst,x);
        this->byte(imm);
    }
    void Assembler::vpsllw(Ymm dst, Ymm x, int imm) {
        this->op(0x66,0x0f,0x71,(Ymm)6, dst,x);
        this->byte(imm);
    }
    void Assembler::vpsrlw(Ymm dst, Ymm x, int imm) {
        this->op(0x66,0x0f,0x71,(Ymm)2, dst,x);
        this->byte(imm);
    }
    void Assembler::vpsraw(Ymm dst, Ymm x, int imm) {
        this->op(0x66,0x0f,0x71,(Ymm)4, dst,x);
        this->byte(imm);
    }
2107
    void Assembler::vpermq(Ymm dst, Operand x, int imm) {
        // A bit unusual among the instructions we use, this is 64-bit operation, so we set W.
        this->op(0x66,0x3a0f,0x00, dst,x,W1);
        this->imm_byte_after_operand(x, imm);
    }

    // Shuffle 128-bit lanes; the lane selector rides in the immediate.
    void Assembler::vperm2f128(Ymm dst, Ymm x, Operand y, int imm) {
        this->op(0x66,0x3a0f,0x06, dst,x,y);
        this->imm_byte_after_operand(y, imm);
    }

    // Permute 32-bit lanes of src by the indices held in ix.
    void Assembler::vpermps(Ymm dst, Ymm ix, Operand src) {
        this->op(0x66,0x380f,0x16, dst,ix,src);
    }

    // Round packed floats; the Rounding mode is passed as the immediate.
    void Assembler::vroundps(Ymm dst, Operand x, Rounding imm) {
        this->op(0x66,0x3a0f,0x08, dst,x);
        this->imm_byte_after_operand(x, imm);
    }
2127
    // Vector loads and stores (0x10 loads, 0x11 stores — note the swapped src/dst).
    void Assembler::vmovdqa(Ymm dst, Operand src) { this->op(0x66,0x0f,0x6f, dst,src); }
    void Assembler::vmovups(Ymm dst, Operand src) { this->op(   0,0x0f,0x10, dst,src); }
    void Assembler::vmovups(Xmm dst, Operand src) { this->op(   0,0x0f,0x10, dst,src); }
    void Assembler::vmovups(Operand dst, Ymm src) { this->op(   0,0x0f,0x11, src,dst); }
    void Assembler::vmovups(Operand dst, Xmm src) { this->op(   0,0x0f,0x11, src,dst); }

    // int <-> float conversions and square root.
    void Assembler::vcvtdq2ps (Ymm dst, Operand x) { this->op(   0,0x0f,0x5b, dst,x); }
    void Assembler::vcvttps2dq(Ymm dst, Operand x) { this->op(0xf3,0x0f,0x5b, dst,x); }
    void Assembler::vcvtps2dq (Ymm dst, Operand x) { this->op(0x66,0x0f,0x5b, dst,x); }
    void Assembler::vsqrtps   (Ymm dst, Operand x) { this->op(   0,0x0f,0x51, dst,x); }

    // float -> half; the destination is the r/m operand, hence the x,dst order.
    void Assembler::vcvtps2ph(Operand dst, Ymm x, Rounding imm) {
        this->op(0x66,0x3a0f,0x1d, x,dst);
        this->imm_byte_after_operand(dst, imm);
    }
    // half -> float.
    void Assembler::vcvtph2ps(Ymm dst, Operand x) {
        this->op(0x66,0x380f,0x13, dst,x);
    }
2146
2147    int Assembler::disp19(Label* l) {
2148        SkASSERT(l->kind == Label::NotYetSet ||
2149                 l->kind == Label::ARMDisp19);
2150        int here = (int)this->size();
2151        l->kind = Label::ARMDisp19;
2152        l->references.push_back(here);
2153        // ARM 19-bit instruction count, from the beginning of this instruction.
2154        return (l->offset - here) / 4;
2155    }
2156
2157    int Assembler::disp32(Label* l) {
2158        SkASSERT(l->kind == Label::NotYetSet ||
2159                 l->kind == Label::X86Disp32);
2160        int here = (int)this->size();
2161        l->kind = Label::X86Disp32;
2162        l->references.push_back(here);
2163        // x86 32-bit byte count, from the end of this instruction.
2164        return l->offset - (here + 4);
2165    }
2166
    // Core VEX emitter for instructions whose r/m operand `y` may be a
    // register, a memory address, or a label (RIP-relative).  `dst` lands in
    // ModRM.reg, `x` is the extra VEX source register (vvvv), `w`/`l` are the
    // VEX.W and VEX.L bits.  Byte order matters: VEX prefix, opcode, ModRM,
    // then optional SIB and displacement.
    void Assembler::op(int prefix, int map, int opcode, int dst, int x, Operand y, W w, L l) {
        switch (y.kind) {
            case Operand::REG: {
                VEX v = vex(w, dst>>3, 0, y.reg>>3,
                            map, x, l, prefix);
                this->bytes(v.bytes, v.len);
                this->byte(opcode);
                this->byte(mod_rm(Mod::Direct, dst&7, y.reg&7));
            } return;

            case Operand::MEM: {
                // Passing rsp as the rm argument to mod_rm() signals an SIB byte follows;
                // without an SIB byte, that's where the base register would usually go.
                // This means we have to use an SIB byte if we want to use rsp as a base register.
                const Mem& m = y.mem;
                const bool need_SIB = m.base  == rsp
                                   || m.index != rsp;

                VEX v = vex(w, dst>>3, m.index>>3, m.base>>3,
                            map, x, l, prefix);
                this->bytes(v.bytes, v.len);
                this->byte(opcode);
                this->byte(mod_rm(mod(m.disp), dst&7, (need_SIB ? rsp : m.base)&7));
                if (need_SIB) {
                    this->byte(sib(m.scale, m.index&7, m.base&7));
                }
                // Displacement width (0/1/4 bytes) is chosen by mod(m.disp).
                this->bytes(&m.disp, imm_bytes(mod(m.disp)));
            } return;

            case Operand::LABEL: {
                // IP-relative addressing uses Mod::Indirect with the R/M encoded as-if rbp or r13.
                const int rip = rbp;

                VEX v = vex(w, dst>>3, 0, rip>>3,
                            map, x, l, prefix);
                this->bytes(v.bytes, v.len);
                this->byte(opcode);
                this->byte(mod_rm(Mod::Indirect, dst&7, rip&7));
                this->word(this->disp32(y.label));
            } return;
        }
    }
2209
    // Byte shuffle, vector test, and scalar broadcast; straight table entries
    // into the generic VEX op().
    void Assembler::vpshufb(Ymm dst, Ymm x, Operand y) { this->op(0x66,0x380f,0x00, dst,x,y); }

    void Assembler::vptest(Ymm x, Operand y) { this->op(0x66, 0x380f, 0x17, x,y); }

    void Assembler::vbroadcastss(Ymm dst, Operand y) { this->op(0x66,0x380f,0x18, dst,y); }
2215
    // Conditional jump with the given 0x8? condition byte, always in near form.
    void Assembler::jump(uint8_t condition, Label* l) {
        // These conditional jumps can be either 2 bytes (short) or 6 bytes (near):
        //    7?     one-byte-disp
        //    0F 8? four-byte-disp
        // We always use the near displacement to make updating labels simpler (no resizing).
        this->byte(0x0f);
        this->byte(condition);
        this->word(this->disp32(l));
    }
    // Common condition codes: equal, not-equal, less (signed), carry/below.
    void Assembler::je (Label* l) { this->jump(0x84, l); }
    void Assembler::jne(Label* l) { this->jump(0x85, l); }
    void Assembler::jl (Label* l) { this->jump(0x8c, l); }
    void Assembler::jc (Label* l) { this->jump(0x82, l); }
2229
    // Unconditional jump, always with a 32-bit displacement (opcode E9).
    void Assembler::jmp(Label* l) {
        // Like above in jump(), we could use 8-bit displacement here, but always use 32-bit.
        this->byte(0xe9);
        this->word(this->disp32(l));
    }
2235
    // Zero-extending widens, and narrow stores of the low 64/32 bits of an xmm.
    // The store forms pass (src,dst) so the register lands in ModRM.reg.
    void Assembler::vpmovzxwd(Ymm dst, Operand src) { this->op(0x66,0x380f,0x33, dst,src); }
    void Assembler::vpmovzxbd(Ymm dst, Operand src) { this->op(0x66,0x380f,0x31, dst,src); }

    void Assembler::vmovq(Operand dst, Xmm src) { this->op(0x66,0x0f,0xd6, src,dst); }

    void Assembler::vmovd(Operand dst, Xmm src) { this->op(0x66,0x0f,0x7e, src,dst); }
    void Assembler::vmovd(Xmm dst, Operand src) { this->op(0x66,0x0f,0x6e, dst,src); }
2243
    // Insert a dword/word/byte from `y` into lane `imm` of `src`, writing `dst`.
    // The lane selector is a trailing imm8 placed after any memory displacement.
    void Assembler::vpinsrd(Xmm dst, Xmm src, Operand y, int imm) {
        this->op(0x66,0x3a0f,0x22, dst,src,y);
        this->imm_byte_after_operand(y, imm);
    }
    void Assembler::vpinsrw(Xmm dst, Xmm src, Operand y, int imm) {
        this->op(0x66,0x0f,0xc4, dst,src,y);
        this->imm_byte_after_operand(y, imm);
    }
    void Assembler::vpinsrb(Xmm dst, Xmm src, Operand y, int imm) {
        this->op(0x66,0x3a0f,0x20, dst,src,y);
        this->imm_byte_after_operand(y, imm);
    }
2256
    // Lane/element extracts.  These emit the imm8 with a plain byte() rather
    // than imm_byte_after_operand(), and assert the destination is not a
    // label.  NOTE(review): presumably a LABEL operand is rejected because the
    // trailing imm8 would land after the label's 32-bit displacement and skew
    // the fixup — confirm against imm_byte_after_operand().
    void Assembler::vextracti128(Operand dst, Ymm src, int imm) {
        this->op(0x66,0x3a0f,0x39, src,dst);
        SkASSERT(dst.kind != Operand::LABEL);
        this->byte(imm);
    }
    void Assembler::vpextrd(Operand dst, Xmm src, int imm) {
        this->op(0x66,0x3a0f,0x16, src,dst);
        SkASSERT(dst.kind != Operand::LABEL);
        this->byte(imm);
    }
    void Assembler::vpextrw(Operand dst, Xmm src, int imm) {
        this->op(0x66,0x3a0f,0x15, src,dst);
        SkASSERT(dst.kind != Operand::LABEL);
        this->byte(imm);
    }
    void Assembler::vpextrb(Operand dst, Xmm src, int imm) {
        this->op(0x66,0x3a0f,0x14, src,dst);
        SkASSERT(dst.kind != Operand::LABEL);
        this->byte(imm);
    }
2277
    // VPGATHERDPS: gather floats from base + ix*scale under `mask`.  Emitted
    // by hand (not via the generic op()) because the mask register rides in
    // the VEX.vvvv slot and the addressing always needs an SIB byte.
    void Assembler::vgatherdps(Ymm dst, Scale scale, Ymm ix, GP64 base, Ymm mask) {
        // Unlike most instructions, no aliasing is permitted here.
        SkASSERT(dst != ix);
        SkASSERT(dst != mask);
        SkASSERT(mask != ix);

        int prefix = 0x66,
            map    = 0x380f,
            opcode = 0x92;
        VEX v = vex(0, dst>>3, ix>>3, base>>3,
                    map, mask, /*ymm?*/1, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        // rsp in the r/m slot signals "SIB byte follows".
        this->byte(mod_rm(Mod::Indirect, dst&7, rsp/*use SIB*/));
        this->byte(sib(scale, ix&7, base&7));
    }
2294
2295    // https://static.docs.arm.com/ddi0596/a/DDI_0596_ARM_a64_instruction_set_architecture.pdf
2296
2297    static int operator"" _mask(unsigned long long bits) { return (1<<(int)bits)-1; }
2298
2299    void Assembler::op(uint32_t hi, V m, uint32_t lo, V n, V d) {
2300        this->word( (hi & 11_mask) << 21
2301                  | (m  &  5_mask) << 16
2302                  | (lo &  6_mask) << 10
2303                  | (n  &  5_mask) <<  5
2304                  | (d  &  5_mask) <<  0);
2305    }
2306    void Assembler::op(uint32_t op22, V n, V d, int imm) {
2307        this->word( (op22 & 22_mask) << 10
2308                  | imm  // size and location depends on the instruction
2309                  | (n    &  5_mask) <<  5
2310                  | (d    &  5_mask) <<  0);
2311    }
2312
    // AArch64 ASIMD three- and two-register ops.  The digit-separated binary
    // literals mirror the bitfield breakdown in the ARM a64 ISA reference
    // linked above; do not reformat them.

    // Bitwise ops on the full 128-bit vector.
    void Assembler::and16b(V d, V n, V m) { this->op(0b0'1'0'01110'00'1, m, 0b00011'1, n, d); }
    void Assembler::orr16b(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b00011'1, n, d); }
    void Assembler::eor16b(V d, V n, V m) { this->op(0b0'1'1'01110'00'1, m, 0b00011'1, n, d); }
    void Assembler::bic16b(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b00011'1, n, d); }
    void Assembler::bsl16b(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b00011'1, n, d); }
    void Assembler::not16b(V d, V n)      { this->op(0b0'1'1'01110'00'10000'00101'10,  n, d); }

    // 32-bit integer lanes (4s) ...
    void Assembler::add4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10000'1, n, d); }
    void Assembler::sub4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10000'1, n, d); }
    void Assembler::mul4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10011'1, n, d); }

    void Assembler::cmeq4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10001'1, n, d); }
    void Assembler::cmgt4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b0011'0'1, n, d); }

    // ... and 16-bit integer lanes (8h).
    void Assembler::sub8h(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b10000'1, n, d); }
    void Assembler::mul8h(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b10011'1, n, d); }

    // 32-bit float lanes: arithmetic, min/max, unary, compares, fused multiply-add/sub.
    void Assembler::fadd4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11010'1, n, d); }
    void Assembler::fsub4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11010'1, n, d); }
    void Assembler::fmul4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11011'1, n, d); }
    void Assembler::fdiv4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11111'1, n, d); }
    void Assembler::fmin4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11110'1, n, d); }
    void Assembler::fmax4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11110'1, n, d); }

    void Assembler::fneg4s (V d, V n) { this->op(0b0'1'1'01110'1'0'10000'01111'10, n,d); }
    void Assembler::fsqrt4s(V d, V n) { this->op(0b0'1'1'01110'1'0'10000'11111'10, n,d); }

    void Assembler::fcmeq4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b1110'0'1, n, d); }
    void Assembler::fcmgt4s(V d, V n, V m) { this->op(0b0'1'1'01110'1'0'1, m, 0b1110'0'1, n, d); }
    void Assembler::fcmge4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b1110'0'1, n, d); }

    void Assembler::fmla4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11001'1, n, d); }
    void Assembler::fmls4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11001'1, n, d); }

    // Table lookup and unzip/zip lane shuffles.
    void Assembler::tbl(V d, V n, V m) { this->op(0b0'1'001110'00'0, m, 0b0'00'0'00, n, d); }

    void Assembler::uzp14s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'0'01'10, n, d); }
    void Assembler::uzp24s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'1'01'10, n, d); }
    void Assembler::zip14s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'0'11'10, n, d); }
    void Assembler::zip24s(V d, V n, V m) { this->op(0b0'1'001110'10'0, m, 0b0'1'11'10, n, d); }
2353
    // Immediate vector shifts.  The shift amount rides in an immh:immb field
    // at bit 16; right shifts encode the amount negated (-imm), matching the
    // ARM encoding where the field holds (2*esize - shift).
    void Assembler::sli4s(V d, V n, int imm5) {
        this->op(0b0'1'1'011110'0100'000'01010'1,    n, d, ( imm5 & 5_mask)<<16);
    }
    void Assembler::shl4s(V d, V n, int imm5) {
        this->op(0b0'1'0'011110'0100'000'01010'1,    n, d, ( imm5 & 5_mask)<<16);
    }
    void Assembler::sshr4s(V d, V n, int imm5) {
        this->op(0b0'1'0'011110'0100'000'00'0'0'0'1, n, d, (-imm5 & 5_mask)<<16);
    }
    void Assembler::ushr4s(V d, V n, int imm5) {
        this->op(0b0'1'1'011110'0100'000'00'0'0'0'1, n, d, (-imm5 & 5_mask)<<16);
    }
    void Assembler::ushr8h(V d, V n, int imm4) {
        this->op(0b0'1'1'011110'0010'000'00'0'0'0'1, n, d, (-imm4 & 4_mask)<<16);
    }
2369
    // int <-> float conversions, rounding, f32<->f16, narrowing/widening
    // moves, and an unsigned-min horizontal reduction.
    void Assembler::scvtf4s (V d, V n) { this->op(0b0'1'0'01110'0'0'10000'11101'10, n,d); }
    void Assembler::fcvtzs4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1101'1'10, n,d); }
    void Assembler::fcvtns4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1101'0'10, n,d); }
    void Assembler::frintp4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1100'0'10, n,d); }
    void Assembler::frintm4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1100'1'10, n,d); }

    void Assembler::fcvtn(V d, V n) { this->op(0b0'0'0'01110'0'0'10000'10110'10, n,d); }
    void Assembler::fcvtl(V d, V n) { this->op(0b0'0'0'01110'0'0'10000'10111'10, n,d); }

    void Assembler::xtns2h(V d, V n) { this->op(0b0'0'0'01110'01'10000'10010'10, n,d); }
    void Assembler::xtnh2b(V d, V n) { this->op(0b0'0'0'01110'00'10000'10010'10, n,d); }

    void Assembler::uxtlb2h(V d, V n) { this->op(0b0'0'1'011110'0001'000'10100'1, n,d); }
    void Assembler::uxtlh2s(V d, V n) { this->op(0b0'0'1'011110'0010'000'10100'1, n,d); }

    void Assembler::uminv4s(V d, V n) { this->op(0b0'1'1'01110'10'11000'1'1010'10, n,d); }
2386
    // BRK: software breakpoint carrying a 16-bit comment immediate.
    void Assembler::brk(int imm16) {
        this->op(0b11010100'001'00000000000, (imm16 & 16_mask) << 5);
    }

    // RET: return to the address in register n (d field unused, encoded as 0).
    void Assembler::ret(X n) { this->op(0b1101011'0'0'10'11111'0000'0'0, n, (X)0); }
2392
    // 64-bit scalar add/sub with a 12-bit unsigned immediate; subs also sets
    // condition flags.
    void Assembler::add(X d, X n, int imm12) {
        this->op(0b1'0'0'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10);
    }
    void Assembler::sub(X d, X n, int imm12) {
        this->op(0b1'1'0'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10);
    }
    void Assembler::subs(X d, X n, int imm12) {
        this->op(0b1'1'1'10001'00'000000000000, n,d, (imm12 & 12_mask) << 10);
    }

    // Add with an optionally shifted register operand: d = n + (m SHIFT imm6).
    // ROR is not a valid shift for this encoding.
    void Assembler::add(X d, X n, X m, Shift shift, int imm6) {
        SkASSERT(shift != ROR);

        // Assemble the [15:6] field: imm6, m, a zero bit, and the shift kind.
        int imm = (imm6  & 6_mask) << 0
                | (m     & 5_mask) << 6
                | (0     & 1_mask) << 11
                | (shift & 2_mask) << 12;
        this->op(0b1'0'0'01011'00'0'00000'000000, n,d, imm << 10);
    }
2412
    // PC-relative branches; disp19() records the label reference and returns
    // the current 19-bit instruction-count displacement for bits [23:5].
    void Assembler::b(Condition cond, Label* l) {
        const int imm19 = this->disp19(l);
        this->op(0b0101010'0'00000000000000, (X)0, (V)cond, (imm19 & 19_mask) << 5);
    }
    void Assembler::cbz(X t, Label* l) {
        const int imm19 = this->disp19(l);
        this->op(0b1'011010'0'00000000000000, (X)0, t, (imm19 & 19_mask) << 5);
    }
    void Assembler::cbnz(X t, Label* l) {
        const int imm19 = this->disp19(l);
        this->op(0b1'011010'1'00000000000000, (X)0, t, (imm19 & 19_mask) << 5);
    }
2425
    // Unsigned-offset loads.  The leading two bits select the access size
    // (11=64-bit, 10=32, 01=16, 00=8); the X overloads load general registers,
    // the V overloads load SIMD registers (128-bit ldrq included).
    // imm12 is scaled by the access size per the ARM encoding.
    void Assembler::ldrd(X dst, X src, int imm12) {
        this->op(0b11'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }
    void Assembler::ldrs(X dst, X src, int imm12) {
        this->op(0b10'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }
    void Assembler::ldrh(X dst, X src, int imm12) {
        this->op(0b01'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }
    void Assembler::ldrb(X dst, X src, int imm12) {
        this->op(0b00'111'0'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }

    void Assembler::ldrq(V dst, X src, int imm12) {
        this->op(0b00'111'1'01'11'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }
    void Assembler::ldrd(V dst, X src, int imm12) {
        this->op(0b11'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }
    void Assembler::ldrs(V dst, X src, int imm12) {
        this->op(0b10'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }
    void Assembler::ldrh(V dst, X src, int imm12) {
        this->op(0b01'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }
    void Assembler::ldrb(V dst, X src, int imm12) {
        this->op(0b00'111'1'01'01'000000000000, src, dst, (imm12 & 12_mask) << 10);
    }
2454
    // Unsigned-offset stores, mirroring the loads above (opc field 00/10
    // instead of 01/11).
    void Assembler::strs(X src, X dst, int imm12) {
        this->op(0b10'111'0'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
    }

    void Assembler::strq(V src, X dst, int imm12) {
        this->op(0b00'111'1'01'10'000000000000, dst, src, (imm12 & 12_mask) << 10);
    }
    void Assembler::strd(V src, X dst, int imm12) {
        this->op(0b11'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
    }
    void Assembler::strs(V src, X dst, int imm12) {
        this->op(0b10'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
    }
    void Assembler::strh(V src, X dst, int imm12) {
        this->op(0b01'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
    }
    void Assembler::strb(V src, X dst, int imm12) {
        this->op(0b00'111'1'01'00'000000000000, dst, src, (imm12 & 12_mask) << 10);
    }
2474
    // Move a 32-bit lane between a vector and a general register.  The imm5
    // field encodes both the element size (the 0b100 bit => 32-bit) and the
    // lane index in its upper bits.
    void Assembler::movs(X dst, V src, int lane) {
        int imm5 = (lane << 3) | 0b100;
        this->op(0b0'0'0'01110000'00000'0'01'1'1'1, src, dst, (imm5 & 5_mask) << 16);
    }
    void Assembler::inss(V dst, X src, int lane) {
        int imm5 = (lane << 3) | 0b100;
        this->op(0b0'1'0'01110000'00000'0'0011'1, src, dst, (imm5 & 5_mask) << 16);
    }


    // PC-relative 128-bit literal load; used to fetch constants placed at labels.
    void Assembler::ldrq(V dst, Label* l) {
        const int imm19 = this->disp19(l);
        this->op(0b10'011'1'00'00000000000000, (V)0, dst, (imm19 & 19_mask) << 5);
    }
2489
    // Broadcast a general register into all 32-bit lanes.
    void Assembler::dup4s(V dst, X src) {
        this->op(0b0'1'0'01110000'00100'0'0001'1, src, dst);
    }

    // Load one element from memory and replicate it to every lane
    // (4x32-bit, 8x16-bit, 16x8-bit variants).
    void Assembler::ld1r4s(V dst, X src) {
        this->op(0b0'1'0011010'1'0'00000'110'0'10, src, dst);
    }
    void Assembler::ld1r8h(V dst, X src) {
        this->op(0b0'1'0011010'1'0'00000'110'0'01, src, dst);
    }
    void Assembler::ld1r16b(V dst, X src) {
        this->op(0b0'1'0011010'1'0'00000'110'0'00, src, dst);
    }

    // De-interleaving loads / interleaving stores of 2 or 4 vectors of 32-bit lanes.
    void Assembler::ld24s(V dst, X src) { this->op(0b0'1'0011000'1'000000'1000'10, src, dst); }
    void Assembler::ld44s(V dst, X src) { this->op(0b0'1'0011000'1'000000'0000'10, src, dst); }
    void Assembler::st24s(V src, X dst) { this->op(0b0'1'0011000'0'000000'1000'10, dst, src); }
    void Assembler::st44s(V src, X dst) { this->op(0b0'1'0011000'0'000000'0000'10, dst, src); }

    // Single-lane forms: the lane index splits into the instruction's Q (bit 30)
    // and S (bit 12) fields.
    void Assembler::ld24s(V dst, X src, int lane) {
        int Q = (lane & 2)>>1,
            S = (lane & 1);
                 /*  Q                       S */
        this->op(0b0'0'0011010'1'1'00000'100'0'00, src, dst, (Q<<30)|(S<<12));
    }
    void Assembler::ld44s(V dst, X src, int lane) {
        int Q = (lane & 2)>>1,
            S = (lane & 1);
        this->op(0b0'0'0011010'1'1'00000'101'0'00, src, dst, (Q<<30)|(S<<12));
    }
2520
    // Bind label `l` to the current position and retro-patch every
    // previously-recorded reference (from disp19()/disp32()) by the amount
    // the label just moved.  No-op while only measuring size (fCode == null).
    void Assembler::label(Label* l) {
        if (fCode) {
            // The instructions all currently point to l->offset.
            // We'll want to add a delta to point them to here.
            int here = (int)this->size();
            int delta = here - l->offset;
            l->offset = here;

            if (l->kind == Label::ARMDisp19) {
                for (int ref : l->references) {
                    // ref points to a 32-bit instruction with 19-bit displacement in instructions.
                    uint32_t inst;
                    memcpy(&inst, fCode + ref, 4);

                    // [ 8 bits to preserve] [ 19 bit signed displacement ] [ 5 bits to preserve ]
                    // Shift left then arithmetic-shift right to sign-extend the 19-bit field.
                    int disp = (int)(inst << 8) >> 13;

                    disp += delta/4;  // delta is in bytes, we want instructions.

                    // Put it all back together, preserving the high 8 bits and low 5.
                    inst = ((disp << 5) &  (19_mask << 5))
                         | ((inst     ) & ~(19_mask << 5));
                    memcpy(fCode + ref, &inst, 4);
                }
            }

            if (l->kind == Label::X86Disp32) {
                for (int ref : l->references) {
                    // ref points to a 32-bit displacement in bytes.
                    int disp;
                    memcpy(&disp, fCode + ref, 4);

                    disp += delta;

                    memcpy(fCode + ref, &disp, 4);
                }
            }
        }
    }
2560
    // Run the program over n lanes with the given argument pointers,
    // preferring the JIT entry point when one is ready and allowed, and
    // falling back to the portable interpreter otherwise.
    void Program::eval(int n, void* args[]) const {
    #define SKVM_JIT_STATS 0
    #if SKVM_JIT_STATS
        // Optional instrumentation: count how many eval() calls and pixels
        // took the JIT path, reported once at process exit.
        static std::atomic<int64_t>  calls{0}, jits{0},
                                    pixels{0}, fast{0};
        pixels += n;
        if (0 == calls++) {
            atexit([]{
                int64_t num = jits .load(),
                        den = calls.load();
                SkDebugf("%.3g%% of %lld eval() calls went through JIT.\n", (100.0 * num)/den, den);
                num = fast  .load();
                den = pixels.load();
                SkDebugf("%.3g%% of %lld pixels went through JIT.\n", (100.0 * num)/den, den);
            });
        }
    #endif

    #if !defined(SKVM_JIT_BUT_IGNORE_IT)
        const void* jit_entry = fImpl->jit_entry.load();
        // jit_entry may be null either simply because we can't JIT, or when using LLVM
        // if the work represented by fImpl->llvm_compiling hasn't finished yet.
        //
        // Ordinarily we'd never find ourselves with non-null jit_entry and !gSkVMAllowJIT, but it
        // can happen during interactive programs like Viewer that toggle gSkVMAllowJIT on and off,
        // due to timing or program caching.
        if (jit_entry != nullptr && gSkVMAllowJIT) {
        #if SKVM_JIT_STATS
            jits++;
            fast += n;
        #endif
            // The JIT entry takes (n, one pointer per argument); cast to the
            // matching signature for however many arguments this program has.
            void** a = args;
            switch (fImpl->strides.size()) {
                case 0: return ((void(*)(int                        ))jit_entry)(n               );
                case 1: return ((void(*)(int,void*                  ))jit_entry)(n,a[0]          );
                case 2: return ((void(*)(int,void*,void*            ))jit_entry)(n,a[0],a[1]     );
                case 3: return ((void(*)(int,void*,void*,void*      ))jit_entry)(n,a[0],a[1],a[2]);
                case 4: return ((void(*)(int,void*,void*,void*,void*))jit_entry)
                                (n,a[0],a[1],a[2],a[3]);
                case 5: return ((void(*)(int,void*,void*,void*,void*,void*))jit_entry)
                                (n,a[0],a[1],a[2],a[3],a[4]);
                case 6: return ((void(*)(int,void*,void*,void*,void*,void*,void*))jit_entry)
                                (n,a[0],a[1],a[2],a[3],a[4],a[5]);
                case 7: return ((void(*)(int,void*,void*,void*,void*,void*,void*,void*))jit_entry)
                                (n,a[0],a[1],a[2],a[3],a[4],a[5],a[6]);
                default: break; //SkASSERT(fImpl->strides.size() <= 7);
            }
        }
    #endif

        // So we'll sometimes use the interpreter here even if later calls will use the JIT.
        SkOpts::interpret_skvm(fImpl->instructions.data(), (int)fImpl->instructions.size(),
                               this->nregs(), this->loop(), fImpl->strides.data(), this->nargs(),
                               n, args);
    }
2616
2617    #if defined(SKVM_LLVM)
2618    // -- SKVM_LLVM --------------------------------------------------------------------------------
2619    void Program::setupLLVM(const std::vector<OptimizedInstruction>& instructions,
2620                            const char* debug_name) {
2621        auto ctx = std::make_unique<llvm::LLVMContext>();
2622
2623        auto mod = std::make_unique<llvm::Module>("", *ctx);
2624        // All the scary bare pointers from here on are owned by ctx or mod, I think.
2625
2626        // Everything I've tested runs faster at K=8 (using ymm) than K=16 (zmm) on SKX machines.
2627        const int K = (true && SkCpu::Supports(SkCpu::HSW)) ? 8 : 4;
2628
2629        llvm::Type *ptr = llvm::Type::getInt8Ty(*ctx)->getPointerTo(),
2630                   *i32 = llvm::Type::getInt32Ty(*ctx);
2631
2632        std::vector<llvm::Type*> arg_types = { i32 };
2633        for (size_t i = 0; i < fImpl->strides.size(); i++) {
2634            arg_types.push_back(ptr);
2635        }
2636
2637        llvm::FunctionType* fn_type = llvm::FunctionType::get(llvm::Type::getVoidTy(*ctx),
2638                                                              arg_types, /*vararg?=*/false);
2639        llvm::Function* fn
2640            = llvm::Function::Create(fn_type, llvm::GlobalValue::ExternalLinkage, debug_name, *mod);
2641        for (size_t i = 0; i < fImpl->strides.size(); i++) {
2642            fn->addParamAttr(i+1, llvm::Attribute::NoAlias);
2643        }
2644
2645        llvm::BasicBlock *enter  = llvm::BasicBlock::Create(*ctx, "enter" , fn),
2646                         *hoistK = llvm::BasicBlock::Create(*ctx, "hoistK", fn),
2647                         *testK  = llvm::BasicBlock::Create(*ctx, "testK" , fn),
2648                         *loopK  = llvm::BasicBlock::Create(*ctx, "loopK" , fn),
2649                         *hoist1 = llvm::BasicBlock::Create(*ctx, "hoist1", fn),
2650                         *test1  = llvm::BasicBlock::Create(*ctx, "test1" , fn),
2651                         *loop1  = llvm::BasicBlock::Create(*ctx, "loop1" , fn),
2652                         *leave  = llvm::BasicBlock::Create(*ctx, "leave" , fn);
2653
2654        using IRBuilder = llvm::IRBuilder<>;
2655
2656        llvm::PHINode*                 n;
2657        std::vector<llvm::PHINode*> args;
2658        std::vector<llvm::Value*> vals(instructions.size());
2659
2660        auto emit = [&](size_t i, bool scalar, IRBuilder* b) {
2661            auto [op, x,y,z,w, immA,immB,immC, death,can_hoist] = instructions[i];
2662
2663            llvm::Type *i1    = llvm::Type::getInt1Ty (*ctx),
2664                       *i8    = llvm::Type::getInt8Ty (*ctx),
2665                       *i16   = llvm::Type::getInt16Ty(*ctx),
2666                       *f32   = llvm::Type::getFloatTy(*ctx),
2667                       *I1    = scalar ? i1    : llvm::VectorType::get(i1 , K, false  ),
2668                       *I8    = scalar ? i8    : llvm::VectorType::get(i8 , K, false  ),
2669                       *I16   = scalar ? i16   : llvm::VectorType::get(i16, K, false  ),
2670                       *I32   = scalar ? i32   : llvm::VectorType::get(i32, K, false  ),
2671                       *F32   = scalar ? f32   : llvm::VectorType::get(f32, K, false  );
2672
2673            auto I  = [&](llvm::Value* v) { return b->CreateBitCast(v, I32  ); };
2674            auto F  = [&](llvm::Value* v) { return b->CreateBitCast(v, F32  ); };
2675
2676            auto S = [&](llvm::Type* dst, llvm::Value* v) { return b->CreateSExt(v, dst); };
2677
2678            llvm::Type* vt = nullptr;
2679            switch (llvm::Type* t = nullptr; op) {
2680                default:
2681                    SkDebugf("can't llvm %s (%d)\n", name(op), op);
2682                    return false;
2683
2684                case Op::assert_true: /*TODO*/ break;
2685
2686                case Op::trace_line:
2687                case Op::trace_var:
2688                case Op::trace_call:
2689                    /* Only supported in the interpreter. */
2690                    break;
2691
2692                case Op::index:
2693                    if (I32->isVectorTy()) {
2694                        std::vector<llvm::Constant*> iota(K);
2695                        for (int j = 0; j < K; j++) {
2696                            iota[j] = b->getInt32(j);
2697                        }
2698                        vals[i] = b->CreateSub(b->CreateVectorSplat(K, n),
2699                                               llvm::ConstantVector::get(iota));
2700                    } else {
2701                        vals[i] = n;
2702                    } break;
2703
2704                case Op::load8:  t = I8 ; goto load;
2705                case Op::load16: t = I16; goto load;
2706                case Op::load32: t = I32; goto load;
2707                load: {
2708                    llvm::Value* ptr = b->CreateBitCast(args[immA], t->getPointerTo());
2709                    vals[i] = b->CreateZExt(
2710                            b->CreateAlignedLoad(t, ptr, llvm::MaybeAlign{1}), I32);
2711                } break;
2712
2713
2714                case Op::splat: vals[i] = llvm::ConstantInt::get(I32, immA); break;
2715
2716                case Op::uniform32: {
2717                    llvm::Value* ptr = b->CreateBitCast(
2718                            b->CreateConstInBoundsGEP1_32(i8, args[immA], immB),
2719                            i32->getPointerTo());
2720                    llvm::Value* val = b->CreateZExt(
2721                            b->CreateAlignedLoad(i32, ptr, llvm::MaybeAlign{1}), i32);
2722                    vals[i] = I32->isVectorTy() ? b->CreateVectorSplat(K, val)
2723                                                : val;
2724                } break;
2725
2726                case Op::gather8:  t = i8 ; vt = I8; goto gather;
2727                case Op::gather16: t = i16; vt = I16; goto gather;
2728                case Op::gather32: t = i32; vt = I32; goto gather;
2729                gather: {
2730                    // Our gather base pointer is immB bytes off of uniform immA.
2731                    llvm::Value* base =
2732                        b->CreateLoad(b->CreateBitCast(
2733                                b->CreateConstInBoundsGEP1_32(i8, args[immA],immB),
2734                                t->getPointerTo()->getPointerTo()));
2735
2736                    llvm::Value* ptr = b->CreateInBoundsGEP(t, base, vals[x]);
2737                    llvm::Value* gathered;
2738                    if (ptr->getType()->isVectorTy()) {
2739                        gathered = b->CreateMaskedGather(
2740                                vt,
2741                                ptr,
2742                                llvm::Align{1});
2743                    } else {
2744                        gathered = b->CreateAlignedLoad(vt, ptr, llvm::MaybeAlign{1});
2745                    }
2746                    vals[i] = b->CreateZExt(gathered, I32);
2747                } break;
2748
2749                case Op::store8:  t = I8 ; goto store;
2750                case Op::store16: t = I16; goto store;
2751                case Op::store32: t = I32; goto store;
2752                store: {
2753                    llvm::Value* val = b->CreateTrunc(vals[x], t);
2754                    llvm::Value* ptr = b->CreateBitCast(args[immA],
2755                                                        val->getType()->getPointerTo());
2756                    vals[i] = b->CreateAlignedStore(val, ptr, llvm::MaybeAlign{1});
2757                } break;
2758
2759                case Op::bit_and:   vals[i] = b->CreateAnd(vals[x], vals[y]); break;
2760                case Op::bit_or :   vals[i] = b->CreateOr (vals[x], vals[y]); break;
2761                case Op::bit_xor:   vals[i] = b->CreateXor(vals[x], vals[y]); break;
2762                case Op::bit_clear: vals[i] = b->CreateAnd(vals[x], b->CreateNot(vals[y])); break;
2763
2764                case Op::select:
2765                    vals[i] = b->CreateSelect(b->CreateTrunc(vals[x], I1), vals[y], vals[z]);
2766                    break;
2767
2768                case Op::add_i32: vals[i] = b->CreateAdd(vals[x], vals[y]); break;
2769                case Op::sub_i32: vals[i] = b->CreateSub(vals[x], vals[y]); break;
2770                case Op::mul_i32: vals[i] = b->CreateMul(vals[x], vals[y]); break;
2771
2772                case Op::shl_i32: vals[i] = b->CreateShl (vals[x], immA); break;
2773                case Op::sra_i32: vals[i] = b->CreateAShr(vals[x], immA); break;
2774                case Op::shr_i32: vals[i] = b->CreateLShr(vals[x], immA); break;
2775
2776                case Op:: eq_i32: vals[i] = S(I32, b->CreateICmpEQ (vals[x], vals[y])); break;
2777                case Op:: gt_i32: vals[i] = S(I32, b->CreateICmpSGT(vals[x], vals[y])); break;
2778
2779                case Op::add_f32: vals[i] = I(b->CreateFAdd(F(vals[x]), F(vals[y]))); break;
2780                case Op::sub_f32: vals[i] = I(b->CreateFSub(F(vals[x]), F(vals[y]))); break;
2781                case Op::mul_f32: vals[i] = I(b->CreateFMul(F(vals[x]), F(vals[y]))); break;
2782                case Op::div_f32: vals[i] = I(b->CreateFDiv(F(vals[x]), F(vals[y]))); break;
2783
2784                case Op:: eq_f32: vals[i] = S(I32, b->CreateFCmpOEQ(F(vals[x]), F(vals[y]))); break;
2785                case Op::neq_f32: vals[i] = S(I32, b->CreateFCmpUNE(F(vals[x]), F(vals[y]))); break;
2786                case Op:: gt_f32: vals[i] = S(I32, b->CreateFCmpOGT(F(vals[x]), F(vals[y]))); break;
2787                case Op::gte_f32: vals[i] = S(I32, b->CreateFCmpOGE(F(vals[x]), F(vals[y]))); break;
2788
2789                case Op::fma_f32:
2790                    vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
2791                                                   {F(vals[x]), F(vals[y]), F(vals[z])}));
2792                    break;
2793
2794                case Op::fms_f32:
2795                    vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
2796                                                   {F(vals[x]), F(vals[y]),
2797                                                    b->CreateFNeg(F(vals[z]))}));
2798                    break;
2799
2800                case Op::fnma_f32:
2801                    vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
2802                                                   {b->CreateFNeg(F(vals[x])), F(vals[y]),
2803                                                    F(vals[z])}));
2804                    break;
2805
2806                case Op::ceil:
2807                    vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::ceil, F(vals[x])));
2808                    break;
2809                case Op::floor:
2810                    vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::floor, F(vals[x])));
2811                    break;
2812
2813                case Op::max_f32:
2814                    vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[x]), F(vals[y])),
2815                                                F(vals[y]), F(vals[x])));
2816                    break;
2817                case Op::min_f32:
2818                    vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[y]), F(vals[x])),
2819                                                F(vals[y]), F(vals[x])));
2820                    break;
2821
2822                case Op::sqrt_f32:
2823                    vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, F(vals[x])));
2824                    break;
2825
2826                case Op::to_f32: vals[i] = I(b->CreateSIToFP(  vals[x] , F32)); break;
2827                case Op::trunc : vals[i] =   b->CreateFPToSI(F(vals[x]), I32) ; break;
2828                case Op::round : {
2829                    // Basic impl when we can't use cvtps2dq and co.
2830                    auto round = b->CreateUnaryIntrinsic(llvm::Intrinsic::rint, F(vals[x]));
2831                    vals[i] = b->CreateFPToSI(round, I32);
2832
2833                #if 1 && defined(SK_CPU_X86)
2834                    // Using b->CreateIntrinsic(..., {}, {...}) to avoid name mangling.
2835                    if (scalar) {
2836                        // cvtss2si is float x4 -> int, ignoring input lanes 1,2,3.  ¯\_(ツ)_/¯
2837                        llvm::Value* v = llvm::UndefValue::get(
2838                                llvm::VectorType::get(f32, 4, false));
2839                        v = b->CreateInsertElement(v, F(vals[x]), (uint64_t)0);
2840                        vals[i] = b->CreateIntrinsic(llvm::Intrinsic::x86_sse_cvtss2si, {}, {v});
2841                    } else {
2842                        SkASSERT(K == 4  || K == 8);
2843                        auto intr = K == 4 ?   llvm::Intrinsic::x86_sse2_cvtps2dq :
2844                                 /* K == 8 ?*/ llvm::Intrinsic::x86_avx_cvt_ps2dq_256;
2845                        vals[i] = b->CreateIntrinsic(intr, {}, {F(vals[x])});
2846                    }
2847                #endif
2848                } break;
2849
2850            }
2851            return true;
2852        };
2853
2854        {
2855            IRBuilder b(enter);
2856            b.CreateBr(hoistK);
2857        }
2858
2859        // hoistK: emit each hoistable vector instruction; goto testK;
2860        // LLVM can do this sort of thing itself, but we've got the information cheap,
2861        // and pointer aliasing makes it easier to manually hoist than teach LLVM it's safe.
2862        {
2863            IRBuilder b(hoistK);
2864
2865            // Hoisted instructions will need args (think, uniforms), so set that up now.
2866            // These phi nodes are degenerate... they'll always be the passed-in args from enter.
2867            // Later on when we start looping the phi nodes will start looking useful.
2868            llvm::Argument* arg = fn->arg_begin();
2869            (void)arg++;  // Leave n as nullptr... it'd be a bug to use n in a hoisted instruction.
2870            for (size_t i = 0; i < fImpl->strides.size(); i++) {
2871                args.push_back(b.CreatePHI(arg->getType(), 1));
2872                args.back()->addIncoming(arg++, enter);
2873            }
2874
2875            for (size_t i = 0; i < instructions.size(); i++) {
2876                if (instructions[i].can_hoist && !emit(i, false, &b)) {
2877                    return;
2878                }
2879            }
2880
2881            b.CreateBr(testK);
2882        }
2883
2884        // testK:  if (N >= K) goto loopK; else goto hoist1;
2885        {
2886            IRBuilder b(testK);
2887
2888            // New phi nodes for `n` and each pointer argument from hoistK; later we'll add loopK.
2889            // These also start as the initial function arguments; hoistK can't have changed them.
2890            llvm::Argument* arg = fn->arg_begin();
2891
2892            n = b.CreatePHI(arg->getType(), 2);
2893            n->addIncoming(arg++, hoistK);
2894
2895            for (size_t i = 0; i < fImpl->strides.size(); i++) {
2896                args[i] = b.CreatePHI(arg->getType(), 2);
2897                args[i]->addIncoming(arg++, hoistK);
2898            }
2899
2900            b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(K)), loopK, hoist1);
2901        }
2902
2903        // loopK:  ... insts on K x T vectors; N -= K, args += K*stride; goto testK;
2904        {
2905            IRBuilder b(loopK);
2906            for (size_t i = 0; i < instructions.size(); i++) {
2907                if (!instructions[i].can_hoist && !emit(i, false, &b)) {
2908                    return;
2909                }
2910            }
2911
2912            // n -= K
2913            llvm::Value* n_next = b.CreateSub(n, b.getInt32(K));
2914            n->addIncoming(n_next, loopK);
2915
2916            // Each arg ptr += K
2917            for (size_t i = 0; i < fImpl->strides.size(); i++) {
2918                llvm::Value* arg_next
2919                    = b.CreateConstInBoundsGEP1_32(
2920                            llvm::Type::getInt8Ty (*ctx),
2921                            args[i],
2922                            K*fImpl->strides[i]);
2923                args[i]->addIncoming(arg_next, loopK);
2924            }
2925            b.CreateBr(testK);
2926        }
2927
2928        // hoist1: emit each hoistable scalar instruction; goto test1;
2929        {
2930            IRBuilder b(hoist1);
2931            for (size_t i = 0; i < instructions.size(); i++) {
2932                if (instructions[i].can_hoist && !emit(i, true, &b)) {
2933                    return;
2934                }
2935            }
2936            b.CreateBr(test1);
2937        }
2938
2939        // test1:  if (N >= 1) goto loop1; else goto leave;
2940        {
2941            IRBuilder b(test1);
2942
2943            // Set up new phi nodes for `n` and each pointer argument, now from hoist1 and loop1.
2944            llvm::PHINode* n_new = b.CreatePHI(n->getType(), 2);
2945            n_new->addIncoming(n, hoist1);
2946            n = n_new;
2947
2948            for (size_t i = 0; i < fImpl->strides.size(); i++) {
2949                llvm::PHINode* arg_new = b.CreatePHI(args[i]->getType(), 2);
2950                arg_new->addIncoming(args[i], hoist1);
2951                args[i] = arg_new;
2952            }
2953
2954            b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(1)), loop1, leave);
2955        }
2956
2957        // loop1:  ... insts on scalars; N -= 1, args += stride; goto test1;
2958        {
2959            IRBuilder b(loop1);
2960            for (size_t i = 0; i < instructions.size(); i++) {
2961                if (!instructions[i].can_hoist && !emit(i, true, &b)) {
2962                    return;
2963                }
2964            }
2965
2966            // n -= 1
2967            llvm::Value* n_next = b.CreateSub(n, b.getInt32(1));
2968            n->addIncoming(n_next, loop1);
2969
2970            // Each arg ptr += 1
2971            for (size_t i = 0; i < fImpl->strides.size(); i++) {
2972                llvm::Value* arg_next
2973                    = b.CreateConstInBoundsGEP1_32(
2974                            llvm::Type::getInt8Ty (*ctx), args[i], fImpl->strides[i]);
2975                args[i]->addIncoming(arg_next, loop1);
2976            }
2977            b.CreateBr(test1);
2978        }
2979
2980        // leave:  ret
2981        {
2982            IRBuilder b(leave);
2983            b.CreateRetVoid();
2984        }
2985
2986        SkASSERT(false == llvm::verifyModule(*mod, &llvm::outs()));
2987
2988        if (true) {
2989            SkString path = SkStringPrintf("/tmp/%s.bc", debug_name);
2990            std::error_code err;
2991            llvm::raw_fd_ostream os(path.c_str(), err);
2992            if (err) {
2993                return;
2994            }
2995            llvm::WriteBitcodeToFile(*mod, os);
2996        }
2997
2998        static SkOnce once;
2999        once([]{
3000            SkAssertResult(false == llvm::InitializeNativeTarget());
3001            SkAssertResult(false == llvm::InitializeNativeTargetAsmPrinter());
3002        });
3003
3004        if (llvm::ExecutionEngine* ee = llvm::EngineBuilder(std::move(mod))
3005                                            .setEngineKind(llvm::EngineKind::JIT)
3006                                            .setMCPU(llvm::sys::getHostCPUName())
3007                                            .create()) {
3008            fImpl->llvm_ctx = std::move(ctx);
3009            fImpl->llvm_ee.reset(ee);
3010
3011            #if defined(SKVM_LLVM_WAIT_FOR_COMPILATION)
3012            // Wait for llvm to compile
3013            void* function = (void*)ee->getFunctionAddress(debug_name);
3014            fImpl->jit_entry.store(function);
3015            // We have to be careful here about what we close over and how, in case fImpl moves.
3016            // fImpl itself may change, but its pointee fields won't, so close over them by value.
3017            // Also, debug_name will almost certainly leave scope, so copy it.
3018            #else
3019            fImpl->llvm_compiling = std::async(std::launch::async, [dst  = &fImpl->jit_entry,
3020                                                                    ee   =  fImpl->llvm_ee.get(),
3021                                                                    name = std::string(debug_name)]{
3022                // std::atomic<void*>*    dst;
3023                // llvm::ExecutionEngine* ee;
3024                // std::string            name;
3025                dst->store( (void*)ee->getFunctionAddress(name.c_str()) );
3026            });
3027            #endif
3028        }
3029    }
3030    #endif  // SKVM_LLVM
3031
3032    void Program::waitForLLVM() const {
3033    #if defined(SKVM_LLVM) && !defined(SKVM_LLVM_WAIT_FOR_COMPILATION)
3034        if (fImpl->llvm_compiling.valid()) {
3035            fImpl->llvm_compiling.wait();
3036        }
3037    #endif
3038    }
3039
3040    bool Program::hasJIT() const {
3041        // Program::hasJIT() is really just a debugging / test aid,
3042        // so we don't mind adding a sync point here to wait for compilation.
3043        this->waitForLLVM();
3044
3045        return fImpl->jit_entry.load() != nullptr;
3046    }
3047
    // Release all JIT resources (compiled code, engines, mappings), leaving
    // only the interpreter path usable.  Safe to call on a Program that never JITted.
    void Program::dropJIT() {
    #if defined(SKVM_LLVM)
        // Don't tear down the ExecutionEngine while the async compile may still
        // be using it; wait for compilation to finish first.
        this->waitForLLVM();
        // Reset the engine before the context it was built from.
        fImpl->llvm_ee .reset(nullptr);
        fImpl->llvm_ctx.reset(nullptr);
    #elif defined(SKVM_JIT)
        if (fImpl->dylib) {
            // Dylib-backed JIT: the dylib owns the code; closing it unloads everything.
            close_dylib(fImpl->dylib);
        } else if (auto jit_entry = fImpl->jit_entry.load()) {
            // Otherwise the code lives in a buffer we mapped ourselves.
            unmap_jit_buffer(jit_entry, fImpl->jit_size);
        }
    #else
        // No JIT compiled in, so there should be nothing to drop.
        SkASSERT(!this->hasJIT());
    #endif

        // Clear all JIT bookkeeping regardless of which path (if any) ran above.
        fImpl->jit_entry.store(nullptr);
        fImpl->jit_size  = 0;
        fImpl->dylib     = nullptr;
    }
3067
3068    Program::Program() : fImpl(std::make_unique<Impl>()) {}
3069
    Program::~Program() {
        // Moved-from Programs may have fImpl == nullptr.
        if (fImpl) {
            // Tear down any JIT state (compiled code, mappings) before fImpl dies.
            this->dropJIT();
        }
    }
3076
3077    Program::Program(Program&& other) : fImpl(std::move(other.fImpl)) {}
3078
    Program& Program::operator=(Program&& other) {
        // The overwritten fImpl (if any) is destroyed by unique_ptr assignment.
        // NOTE(review): unlike ~Program, this path does not call dropJIT() on the
        // old fImpl first — presumably Impl's own members clean up after
        // themselves; confirm a JIT'd left-hand side doesn't leak its mapping.
        fImpl = std::move(other.fImpl);
        return *this;
    }
3083
    // Build a runnable Program from optimized instructions: try a JIT backend
    // first when allowed (LLVM if built in, else the in-house JIT), and always
    // build the interpreter program as the portable fallback.
    Program::Program(const std::vector<OptimizedInstruction>& instructions,
                     const std::vector<int>& strides,
                     const char* debug_name, bool allow_jit) : Program() {
        fImpl->strides = strides;
        // Both the global flag and the per-Program flag must permit JITting.
        if (gSkVMAllowJIT && allow_jit) {
        #if 1 && defined(SKVM_LLVM)
            this->setupLLVM(instructions, debug_name);
        #elif 1 && defined(SKVM_JIT)
            this->setupJIT(instructions, debug_name);
        #endif
        }

        // Might as well do this after setupLLVM() to get a little more time to compile.
        this->setupInterpreter(instructions);
    }
3099
    // Cheap accessors into the interpreter program built by setupInterpreter().
    std::vector<InterpreterInstruction> Program::instructions() const { return fImpl->instructions; }
    int  Program::nargs() const { return (int)fImpl->strides.size(); }
    int  Program::nregs() const { return fImpl->regs; }
    int  Program::loop () const { return fImpl->loop; }
    bool Program::empty() const { return fImpl->instructions.empty(); }
3105
    // Translate OptimizedInstructions to InterpreterInstructions.
    void Program::setupInterpreter(const std::vector<OptimizedInstruction>& instructions) {
        // Register each instruction is assigned to.
        std::vector<Reg> reg(instructions.size());

        // This next bit is a bit more complicated than strictly necessary;
        // we could just assign every instruction to its own register.
        //
        // But recycling registers is fairly cheap, and good practice for the
        // JITs where minimizing register pressure really is important.
        //
        // We have effectively infinite registers, so we hoist any value we can.
        // (The JIT may choose a more complex policy to reduce register pressure.)

        fImpl->regs = 0;
        std::vector<Reg> avail;   // Registers whose values have died, free for reuse.

        // Assign this value to a register, recycling them where we can.
        auto assign_register = [&](Val id) {
            const OptimizedInstruction& inst = instructions[id];

            // If this is a real input and its lifetime ends at this instruction,
            // we can recycle the register it's occupying.
            auto maybe_recycle_register = [&](Val input) {
                if (input != NA && instructions[input].death == id) {
                    avail.push_back(reg[input]);
                }
            };

            // Take care to not recycle the same register twice.
            const Val x = inst.x, y = inst.y, z = inst.z, w = inst.w;
            if (true                      ) { maybe_recycle_register(x); }
            if (y != x                    ) { maybe_recycle_register(y); }
            if (z != x && z != y          ) { maybe_recycle_register(z); }
            if (w != x && w != y && w != z) { maybe_recycle_register(w); }

            // Instructions that die at themselves (stores) don't need a register.
            if (inst.death != id) {
                // Allocate a register if we have to, preferring to reuse anything available.
                if (avail.empty()) {
                    reg[id] = fImpl->regs++;
                } else {
                    reg[id] = avail.back();
                    avail.pop_back();
                }
            }
        };

        // Assign a register to each hoisted instruction, then each non-hoisted loop instruction.
        // (This mirrors the two-pass emission order below.)
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            if ( instructions[id].can_hoist) { assign_register(id); }
        }
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            if (!instructions[id].can_hoist) { assign_register(id); }
        }

        // Translate OptimizedInstructions to InterpreterIstructions by mapping values to
        // registers.  This will be two passes, first hoisted instructions, then inside the loop.

        // The loop begins at the fImpl->loop'th Instruction.
        fImpl->loop = 0;
        fImpl->instructions.reserve(instructions.size());

        // Add a mapping for the N/A sentinel Val to any arbitrary register
        // so lookups don't have to know which arguments are used by which Ops.
        auto lookup_register = [&](Val id) {
            return id == NA ? (Reg)0
                            : reg[id];
        };

        // Convert one instruction, mapping each of its Val operands to its register.
        auto push_instruction = [&](Val id, const OptimizedInstruction& inst) {
            InterpreterInstruction pinst{
                inst.op,
                lookup_register(id),
                lookup_register(inst.x),
                lookup_register(inst.y),
                lookup_register(inst.z),
                lookup_register(inst.w),
                inst.immA,
                inst.immB,
                inst.immC,
            };
            fImpl->instructions.push_back(pinst);
        };

        // Pass 1: hoisted instructions; each one pushes the loop start later.
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            const OptimizedInstruction& inst = instructions[id];
            if (inst.can_hoist) {
                push_instruction(id, inst);
                fImpl->loop++;
            }
        }
        // Pass 2: the loop body proper.
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            const OptimizedInstruction& inst = instructions[id];
            if (!inst.can_hoist) {
                push_instruction(id, inst);
            }
        }
    }
3205
3206#if defined(SKVM_JIT)
3207
    // Per-architecture vector register type used by the JIT below.
    namespace SkVMJitTypes {
    #if defined(__x86_64__) || defined(_M_X64)
        // x86-64: 256-bit ymm registers.
        using Reg = Assembler::Ymm;
    #elif defined(__aarch64__)
        // aarch64: 128-bit NEON V registers.
        using Reg = Assembler::V;
    #endif
    }  // namespace SkVMJitTypes
3215
3216    bool Program::jit(const std::vector<OptimizedInstruction>& instructions,
3217                      int* stack_hint,
3218                      uint32_t* registers_used,
3219                      Assembler* a) const {
3220        using A = Assembler;
3221        using SkVMJitTypes::Reg;
3222
3223        SkTHashMap<int, A::Label> constants;    // Constants (mostly splats) share the same pool.
3224        A::Label                  iota;         // Varies per lane, for Op::index.
3225        A::Label                  load64_index; // Used to load low or high half of 64-bit lanes.
3226
3227        // The `regs` array tracks everything we know about each register's state:
3228        //   - NA:   empty
3229        //   - RES:  reserved by ABI
3230        //   - TMP:  holding a temporary
3231        //   - id:   holding Val id
3232        constexpr Val RES = NA-1,
3233                      TMP = RES-1;
3234
3235        // Map val -> stack slot.
3236        std::vector<int> stack_slot(instructions.size(), NA);
3237        int next_stack_slot = 0;
3238
3239        const int nstack_slots = *stack_hint >= 0 ? *stack_hint
3240                                                  : stack_slot.size();
3241    #if defined(__x86_64__) || defined(_M_X64)
3242        if (!SkCpu::Supports(SkCpu::HSW)) {
3243            return false;
3244        }
3245        const int K = 8;
3246        #if defined(_M_X64)  // Important to check this first; clang-cl defines both.
3247            const A::GP64 N = A::rcx,
3248                        GP0 = A::rax,
3249                        GP1 = A::r11,
3250                        arg[]    = { A::rdx, A::r8, A::r9, A::r10, A::rdi, A::rsi };
3251
3252            // xmm6-15 need are callee-saved.
3253            std::array<Val,16> regs = {
3254                 NA, NA, NA, NA,  NA, NA,RES,RES,
3255                RES,RES,RES,RES, RES,RES,RES,RES,
3256            };
3257            const uint32_t incoming_registers_used = *registers_used;
3258
3259            auto enter = [&]{
3260                // rcx,rdx,r8,r9 are all already holding their correct values.
3261                // Load caller-saved r10 from rsp+40 if there's a fourth arg.
3262                if (fImpl->strides.size() >= 4) {
3263                    a->mov(A::r10, A::Mem{A::rsp, 40});
3264                }
3265                // Load callee-saved rdi from rsp+48 if there's a fifth arg,
3266                // first saving it to ABI reserved shadow area rsp+8.
3267                if (fImpl->strides.size() >= 5) {
3268                    a->mov(A::Mem{A::rsp, 8}, A::rdi);
3269                    a->mov(A::rdi, A::Mem{A::rsp, 48});
3270                }
3271                // Load callee-saved rsi from rsp+56 if there's a sixth arg,
3272                // first saving it to ABI reserved shadow area rsp+16.
3273                if (fImpl->strides.size() >= 6) {
3274                    a->mov(A::Mem{A::rsp, 16}, A::rsi);
3275                    a->mov(A::rsi, A::Mem{A::rsp, 56});
3276                }
3277
3278                // Allocate stack for our values and callee-saved xmm6-15.
3279                int stack_needed = nstack_slots*K*4;
3280                for (int r = 6; r < 16; r++) {
3281                    if (incoming_registers_used & (1<<r)) {
3282                        stack_needed += 16;
3283                    }
3284                }
3285                if (stack_needed) { a->sub(A::rsp, stack_needed); }
3286
3287                int next_saved_xmm = nstack_slots*K*4;
3288                for (int r = 6; r < 16; r++) {
3289                    if (incoming_registers_used & (1<<r)) {
3290                        a->vmovups(A::Mem{A::rsp, next_saved_xmm}, (A::Xmm)r);
3291                        next_saved_xmm += 16;
3292                        regs[r] = NA;
3293                    }
3294                }
3295            };
3296            auto exit  = [&]{
3297                // The second pass of jit() shouldn't use any register it didn't in the first pass.
3298                SkASSERT((*registers_used & incoming_registers_used) == *registers_used);
3299
3300                // Restore callee-saved xmm6-15 and the stack pointer.
3301                int stack_used = nstack_slots*K*4;
3302                for (int r = 6; r < 16; r++) {
3303                    if (incoming_registers_used & (1<<r)) {
3304                        a->vmovups((A::Xmm)r, A::Mem{A::rsp, stack_used});
3305                        stack_used += 16;
3306                    }
3307                }
3308                if (stack_used) { a->add(A::rsp, stack_used); }
3309
3310                // Restore callee-saved rdi/rsi if we used them.
3311                if (fImpl->strides.size() >= 5) {
3312                    a->mov(A::rdi, A::Mem{A::rsp, 8});
3313                }
3314                if (fImpl->strides.size() >= 6) {
3315                    a->mov(A::rsi, A::Mem{A::rsp, 16});
3316                }
3317
3318                a->vzeroupper();
3319                a->ret();
3320            };
3321        #elif defined(__x86_64__)
3322            const A::GP64 N = A::rdi,
3323                        GP0 = A::rax,
3324                        GP1 = A::r11,
3325                        arg[]    = { A::rsi, A::rdx, A::rcx, A::r8, A::r9, A::r10 };
3326
3327            // All 16 ymm registers are available to use.
3328            std::array<Val,16> regs = {
3329                NA,NA,NA,NA, NA,NA,NA,NA,
3330                NA,NA,NA,NA, NA,NA,NA,NA,
3331            };
3332
3333            auto enter = [&]{
3334                // Load caller-saved r10 from rsp+8 if there's a sixth arg.
3335                if (fImpl->strides.size() >= 6) {
3336                    a->mov(A::r10, A::Mem{A::rsp, 8});
3337                }
3338                if (nstack_slots) { a->sub(A::rsp, nstack_slots*K*4); }
3339            };
3340            auto exit  = [&]{
3341                if (nstack_slots) { a->add(A::rsp, nstack_slots*K*4); }
3342                a->vzeroupper();
3343                a->ret();
3344            };
3345        #endif
3346
3347        auto load_from_memory = [&](Reg r, Val v) {
3348            if (instructions[v].op == Op::splat) {
3349                if (instructions[v].immA == 0) {
3350                    a->vpxor(r,r,r);
3351                } else {
3352                    a->vmovups(r, constants.find(instructions[v].immA));
3353                }
3354            } else {
3355                SkASSERT(stack_slot[v] != NA);
3356                a->vmovups(r, A::Mem{A::rsp, stack_slot[v]*K*4});
3357            }
3358        };
3359        auto store_to_stack = [&](Reg r, Val v) {
3360            SkASSERT(next_stack_slot < nstack_slots);
3361            stack_slot[v] = next_stack_slot++;
3362            a->vmovups(A::Mem{A::rsp, stack_slot[v]*K*4}, r);
3363        };
3364    #elif defined(__aarch64__)
3365        const int K = 4;
3366        const A::X N     = A::x0,
3367                   GP0   = A::x8,
3368                   GP1   = A::x9,
3369                   arg[] = { A::x1, A::x2, A::x3, A::x4, A::x5, A::x6, A::x7 };
3370
3371        // We can use v0-v7 and v16-v31 freely; we'd need to preserve v8-v15 in enter/exit.
3372        std::array<Val,32> regs = {
3373             NA, NA, NA, NA,  NA, NA, NA, NA,
3374            RES,RES,RES,RES, RES,RES,RES,RES,
3375             NA, NA, NA, NA,  NA, NA, NA, NA,
3376             NA, NA, NA, NA,  NA, NA, NA, NA,
3377        };
3378
3379        auto enter = [&]{ if (nstack_slots) { a->sub(A::sp, A::sp, nstack_slots*K*4); } };
3380        auto exit  = [&]{ if (nstack_slots) { a->add(A::sp, A::sp, nstack_slots*K*4); }
3381                          a->ret(A::x30); };
3382
3383        auto load_from_memory = [&](Reg r, Val v) {
3384            if (instructions[v].op == Op::splat) {
3385                if (instructions[v].immA == 0) {
3386                    a->eor16b(r,r,r);
3387                } else {
3388                    a->ldrq(r, constants.find(instructions[v].immA));
3389                }
3390            } else {
3391                SkASSERT(stack_slot[v] != NA);
3392                a->ldrq(r, A::sp, stack_slot[v]);
3393            }
3394        };
3395        auto store_to_stack  = [&](Reg r, Val v) {
3396            SkASSERT(next_stack_slot < nstack_slots);
3397            stack_slot[v] = next_stack_slot++;
3398            a->strq(r, A::sp, stack_slot[v]);
3399        };
3400    #endif
3401
3402        *registers_used = 0;  // We'll update this as we go.
3403
3404        if (SK_ARRAY_COUNT(arg) < fImpl->strides.size()) {
3405            return false;
3406        }
3407
3408        auto emit = [&](Val id, bool scalar) {
3409            const int active_lanes = scalar ? 1 : K;
3410            const OptimizedInstruction& inst = instructions[id];
3411            const Op op = inst.op;
3412            const Val x = inst.x,
3413                      y = inst.y,
3414                      z = inst.z,
3415                      w = inst.w;
3416            const int immA = inst.immA,
3417                      immB = inst.immB,
3418                      immC = inst.immC;
3419
3420            // alloc_tmp() returns the first of N adjacent temporary registers,
3421            // each freed manually with free_tmp() or noted as our result with mark_tmp_as_dst().
3422            auto alloc_tmp = [&](int N=1) -> Reg {
3423                auto needs_spill = [&](Val v) -> bool {
3424                    SkASSERT(v >= 0);   // {NA,TMP,RES} need to be handled before calling this.
3425                    return stack_slot[v] == NA               // We haven't spilled it already?
3426                        && instructions[v].op != Op::splat;  // No need to spill constants.
3427                };
3428
3429                // We want to find a block of N adjacent registers requiring the fewest spills.
3430                int best_block = -1,
3431                    min_spills = 0x7fff'ffff;
3432                for (int block = 0; block+N <= (int)regs.size(); block++) {
3433                    int spills = 0;
3434                    for (int r = block; r < block+N; r++) {
3435                        Val v = regs[r];
3436                        // Registers holding NA (nothing) are ideal, nothing to spill.
3437                        if (v == NA) {
3438                            continue;
3439                        }
3440                        // We can't spill anything REServed or that we'll need this instruction.
3441                        if (v == RES ||
3442                            v == TMP || v == id || v == x || v == y || v == z || v == w) {
3443                            spills = 0x7fff'ffff;
3444                            block  = r;   // (optimization) continue outer loop at next register.
3445                            break;
3446                        }
3447                        // Usually here we've got a value v that we'd have to spill to the stack
3448                        // before reusing its register, but sometimes even now we get a freebie.
3449                        spills += needs_spill(v) ? 1 : 0;
3450                    }
3451
3452                    // TODO: non-arbitrary tie-breaking?
3453                    if (min_spills > spills) {
3454                        min_spills = spills;
3455                        best_block = block;
3456                    }
3457                    if (min_spills == 0) {
3458                        break;  // (optimization) stop early if we find an unbeatable block.
3459                    }
3460                }
3461
3462                // TODO: our search's success isn't obviously guaranteed... it depends on N
3463                // and the number and relative position in regs of any unspillable values.
3464                // I think we should be able to get away with N≤2 on x86-64 and N≤4 on arm64;
3465                // we'll need to revisit this logic should this assert fire.
3466                SkASSERT(min_spills <= N);
3467
3468                // Spill what needs spilling, and mark the block all as TMP.
3469                for (int r = best_block; r < best_block+N; r++) {
3470                    Val& v = regs[r];
3471                    *registers_used |= (1<<r);
3472
3473                    SkASSERT(v == NA || v >= 0);
3474                    if (v >= 0 && needs_spill(v)) {
3475                        store_to_stack((Reg)r, v);
3476                        SkASSERT(!needs_spill(v));
3477                        min_spills--;
3478                    }
3479
3480                    v = TMP;
3481                }
3482                SkASSERT(min_spills == 0);
3483                return (Reg)best_block;
3484            };
3485
3486            auto free_tmp = [&](Reg r) {
3487                SkASSERT(regs[r] == TMP);
3488                regs[r] = NA;
3489            };
3490
            // Which register holds dst,x,y,z,w for this instruction?  NA if none does yet.
            // These are per-instruction caches over regs[]; update_regs() (below) keeps
            // them in sync whenever a value lands in a register.
            int rd = NA,
                rx = NA,
                ry = NA,
                rz = NA,
                rw = NA;
3497
3498            auto update_regs = [&](Reg r, Val v) {
3499                if (v == id) { rd = r; }
3500                if (v ==  x) { rx = r; }
3501                if (v ==  y) { ry = r; }
3502                if (v ==  z) { rz = r; }
3503                if (v ==  w) { rw = r; }
3504                return r;
3505            };
3506
3507            auto find_existing_reg = [&](Val v) -> int {
3508                // Quick-check our working registers.
3509                if (v == id && rd != NA) { return rd; }
3510                if (v ==  x && rx != NA) { return rx; }
3511                if (v ==  y && ry != NA) { return ry; }
3512                if (v ==  z && rz != NA) { return rz; }
3513                if (v ==  w && rw != NA) { return rw; }
3514
3515                // Search inter-instruction register map.
3516                for (auto [r,val] : SkMakeEnumerate(regs)) {
3517                    if (val == v) {
3518                        return update_regs((Reg)r, v);
3519                    }
3520                }
3521                return NA;
3522            };
3523
3524            // Return a register for Val, holding that value if it already exists.
3525            // During this instruction all calls to r(v) will return the same register.
3526            auto r = [&](Val v) -> Reg {
3527                SkASSERT(v >= 0);
3528
3529                if (int found = find_existing_reg(v); found != NA) {
3530                    return (Reg)found;
3531                }
3532
3533                Reg r = alloc_tmp();
3534                SkASSERT(regs[r] == TMP);
3535
3536                SkASSERT(v <= id);
3537                if (v < id) {
3538                    // If v < id, we're loading one of this instruction's inputs.
3539                    // If v == id we're just allocating its destination register.
3540                    load_from_memory(r, v);
3541                }
3542                regs[r] = v;
3543                return update_regs(r, v);
3544            };
3545
3546            auto dies_here = [&](Val v) -> bool {
3547                SkASSERT(v >= 0);
3548                return instructions[v].death == id;
3549            };
3550
3551            // Alias dst() to r(v) if dies_here(v).
3552            auto try_alias = [&](Val v) -> bool {
3553                SkASSERT(v == x || v == y || v == z || v == w);
3554                if (dies_here(v)) {
3555                    rd = r(v);      // Vals v and id share a register for this instruction.
3556                    regs[rd] = id;  // Next instruction, Val id will be in the register, not Val v.
3557                    return true;
3558                }
3559                return false;
3560            };
3561
3562            // Generally r(id),
3563            // but with a hint, try to alias dst() to r(v) if dies_here(v).
3564            auto dst = [&](Val hint1 = NA, Val hint2 = NA) -> Reg {
3565                if (hint1 != NA && try_alias(hint1)) { return r(id); }
3566                if (hint2 != NA && try_alias(hint2)) { return r(id); }
3567                return r(id);
3568            };
3569
3570        #if defined(__aarch64__)  // Nothing sneaky, just unused on x86-64.
3571            auto mark_tmp_as_dst = [&](Reg tmp) {
3572                SkASSERT(regs[tmp] == TMP);
3573                rd = tmp;
3574                regs[rd] = id;
3575                SkASSERT(dst() == tmp);
3576            };
3577        #endif
3578
3579        #if defined(__x86_64__) || defined(_M_X64)
3580            // On x86 we can work with many values directly from the stack or program constant pool.
3581            auto any = [&](Val v) -> A::Operand {
3582                SkASSERT(v >= 0);
3583                SkASSERT(v < id);
3584
3585                if (int found = find_existing_reg(v); found != NA) {
3586                    return (Reg)found;
3587                }
3588                if (instructions[v].op == Op::splat) {
3589                    return constants.find(instructions[v].immA);
3590                }
3591                return A::Mem{A::rsp, stack_slot[v]*K*4};
3592            };
3593
3594            // This is never really worth asking except when any() might be used;
3595            // if we need this value in ARM, might as well just call r(v) to get it into a register.
3596            auto in_reg = [&](Val v) -> bool {
3597                return find_existing_reg(v) != NA;
3598            };
3599        #endif
3600
3601            switch (op) {
3602                // Make sure splat constants can be found by load_from_memory() or any().
3603                case Op::splat:
3604                    (void)constants[immA];
3605                    break;
3606
3607            #if defined(__x86_64__) || defined(_M_X64)
3608                case Op::assert_true: {
3609                    a->vptest (r(x), &constants[0xffffffff]);
3610                    A::Label all_true;
3611                    a->jc(&all_true);
3612                    a->int3();
3613                    a->label(&all_true);
3614                } break;
3615
3616                case Op::trace_line:
3617                case Op::trace_var:
3618                case Op::trace_call:
3619                    /* Only supported in the interpreter. */
3620                    break;
3621
3622                case Op::store8:
3623                    if (scalar) {
3624                        a->vpextrb(A::Mem{arg[immA]}, (A::Xmm)r(x), 0);
3625                    } else {
3626                        a->vpackusdw(dst(x), r(x), r(x));
3627                        a->vpermq   (dst(), dst(), 0xd8);
3628                        a->vpackuswb(dst(), dst(), dst());
3629                        a->vmovq    (A::Mem{arg[immA]}, (A::Xmm)dst());
3630                    } break;
3631
3632                case Op::store16:
3633                    if (scalar) {
3634                        a->vpextrw(A::Mem{arg[immA]}, (A::Xmm)r(x), 0);
3635                    } else {
3636                        a->vpackusdw(dst(x), r(x), r(x));
3637                        a->vpermq   (dst(), dst(), 0xd8);
3638                        a->vmovups  (A::Mem{arg[immA]}, (A::Xmm)dst());
3639                    } break;
3640
3641                case Op::store32: if (scalar) { a->vmovd  (A::Mem{arg[immA]}, (A::Xmm)r(x)); }
3642                                  else        { a->vmovups(A::Mem{arg[immA]},         r(x)); }
3643                                  break;
3644
3645                case Op::store64: if (scalar) {
3646                                      a->vmovd(A::Mem{arg[immA],0}, (A::Xmm)r(x));
3647                                      a->vmovd(A::Mem{arg[immA],4}, (A::Xmm)r(y));
3648                                  } else {
3649                                      // r(x) = {a,b,c,d|e,f,g,h}
3650                                      // r(y) = {i,j,k,l|m,n,o,p}
3651                                      // We want to write a,i,b,j,c,k,d,l,e,m...
3652                                      A::Ymm L = alloc_tmp(),
3653                                             H = alloc_tmp();
3654                                      a->vpunpckldq(L, r(x), any(y));  // L = {a,i,b,j|e,m,f,n}
3655                                      a->vpunpckhdq(H, r(x), any(y));  // H = {c,k,d,l|g,o,h,p}
3656                                      a->vperm2f128(dst(), L,H, 0x20); //   = {a,i,b,j|c,k,d,l}
3657                                      a->vmovups(A::Mem{arg[immA], 0}, dst());
3658                                      a->vperm2f128(dst(), L,H, 0x31); //   = {e,m,f,n|g,o,h,p}
3659                                      a->vmovups(A::Mem{arg[immA],32}, dst());
3660                                      free_tmp(L);
3661                                      free_tmp(H);
3662                                  } break;
3663
3664                case Op::store128: {
3665                    // TODO: >32-bit stores
3666                    a->vmovd  (A::Mem{arg[immA], 0*16 +  0}, (A::Xmm)r(x)   );
3667                    a->vmovd  (A::Mem{arg[immA], 0*16 +  4}, (A::Xmm)r(y)   );
3668                    a->vmovd  (A::Mem{arg[immA], 0*16 +  8}, (A::Xmm)r(z)   );
3669                    a->vmovd  (A::Mem{arg[immA], 0*16 + 12}, (A::Xmm)r(w)   );
3670                    if (scalar) { break; }
3671
3672                    a->vpextrd(A::Mem{arg[immA], 1*16 +  0}, (A::Xmm)r(x), 1);
3673                    a->vpextrd(A::Mem{arg[immA], 1*16 +  4}, (A::Xmm)r(y), 1);
3674                    a->vpextrd(A::Mem{arg[immA], 1*16 +  8}, (A::Xmm)r(z), 1);
3675                    a->vpextrd(A::Mem{arg[immA], 1*16 + 12}, (A::Xmm)r(w), 1);
3676
3677                    a->vpextrd(A::Mem{arg[immA], 2*16 +  0}, (A::Xmm)r(x), 2);
3678                    a->vpextrd(A::Mem{arg[immA], 2*16 +  4}, (A::Xmm)r(y), 2);
3679                    a->vpextrd(A::Mem{arg[immA], 2*16 +  8}, (A::Xmm)r(z), 2);
3680                    a->vpextrd(A::Mem{arg[immA], 2*16 + 12}, (A::Xmm)r(w), 2);
3681
3682                    a->vpextrd(A::Mem{arg[immA], 3*16 +  0}, (A::Xmm)r(x), 3);
3683                    a->vpextrd(A::Mem{arg[immA], 3*16 +  4}, (A::Xmm)r(y), 3);
3684                    a->vpextrd(A::Mem{arg[immA], 3*16 +  8}, (A::Xmm)r(z), 3);
3685                    a->vpextrd(A::Mem{arg[immA], 3*16 + 12}, (A::Xmm)r(w), 3);
3686                    // Now we need to store the upper 128 bits of x,y,z,w.
3687                    // Storing in this order rather than interlacing minimizes temporaries.
3688                    a->vextracti128(dst(), r(x), 1);
3689                    a->vmovd  (A::Mem{arg[immA], 4*16 +  0}, (A::Xmm)dst()   );
3690                    a->vpextrd(A::Mem{arg[immA], 5*16 +  0}, (A::Xmm)dst(), 1);
3691                    a->vpextrd(A::Mem{arg[immA], 6*16 +  0}, (A::Xmm)dst(), 2);
3692                    a->vpextrd(A::Mem{arg[immA], 7*16 +  0}, (A::Xmm)dst(), 3);
3693
3694                    a->vextracti128(dst(), r(y), 1);
3695                    a->vmovd  (A::Mem{arg[immA], 4*16 +  4}, (A::Xmm)dst()   );
3696                    a->vpextrd(A::Mem{arg[immA], 5*16 +  4}, (A::Xmm)dst(), 1);
3697                    a->vpextrd(A::Mem{arg[immA], 6*16 +  4}, (A::Xmm)dst(), 2);
3698                    a->vpextrd(A::Mem{arg[immA], 7*16 +  4}, (A::Xmm)dst(), 3);
3699
3700                    a->vextracti128(dst(), r(z), 1);
3701                    a->vmovd  (A::Mem{arg[immA], 4*16 +  8}, (A::Xmm)dst()   );
3702                    a->vpextrd(A::Mem{arg[immA], 5*16 +  8}, (A::Xmm)dst(), 1);
3703                    a->vpextrd(A::Mem{arg[immA], 6*16 +  8}, (A::Xmm)dst(), 2);
3704                    a->vpextrd(A::Mem{arg[immA], 7*16 +  8}, (A::Xmm)dst(), 3);
3705
3706                    a->vextracti128(dst(), r(w), 1);
3707                    a->vmovd  (A::Mem{arg[immA], 4*16 + 12}, (A::Xmm)dst()   );
3708                    a->vpextrd(A::Mem{arg[immA], 5*16 + 12}, (A::Xmm)dst(), 1);
3709                    a->vpextrd(A::Mem{arg[immA], 6*16 + 12}, (A::Xmm)dst(), 2);
3710                    a->vpextrd(A::Mem{arg[immA], 7*16 + 12}, (A::Xmm)dst(), 3);
3711                } break;
3712
3713                case Op::load8:  if (scalar) {
3714                                     a->vpxor  (dst(), dst(), dst());
3715                                     a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{arg[immA]}, 0);
3716                                 } else {
3717                                     a->vpmovzxbd(dst(), A::Mem{arg[immA]});
3718                                 } break;
3719
3720                case Op::load16: if (scalar) {
3721                                     a->vpxor  (dst(), dst(), dst());
3722                                     a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{arg[immA]}, 0);
3723                                 } else {
3724                                     a->vpmovzxwd(dst(), A::Mem{arg[immA]});
3725                                 } break;
3726
3727                case Op::load32: if (scalar) { a->vmovd  ((A::Xmm)dst(), A::Mem{arg[immA]}); }
3728                                 else        { a->vmovups(        dst(), A::Mem{arg[immA]}); }
3729                                 break;
3730
3731                case Op::load64: if (scalar) {
3732                                    a->vmovd((A::Xmm)dst(), A::Mem{arg[immA], 4*immB});
3733                                 } else {
3734                                    A::Ymm tmp = alloc_tmp();
3735                                    a->vmovups(tmp, &load64_index);
3736                                    a->vpermps(dst(), tmp, A::Mem{arg[immA],  0});
3737                                    a->vpermps(  tmp, tmp, A::Mem{arg[immA], 32});
3738                                    // Low 128 bits holds immB=0 lanes, high 128 bits holds immB=1.
3739                                    a->vperm2f128(dst(), dst(),tmp, immB ? 0x31 : 0x20);
3740                                    free_tmp(tmp);
3741                                 } break;
3742
3743                case Op::load128: if (scalar) {
3744                                      a->vmovd((A::Xmm)dst(), A::Mem{arg[immA], 4*immB});
3745                                  } else {
3746                                      // Load 4 low values into xmm tmp,
3747                                      A::Ymm tmp = alloc_tmp();
3748                                      A::Xmm t = (A::Xmm)tmp;
3749                                      a->vmovd  (t,   A::Mem{arg[immA], 0*16 + 4*immB}   );
3750                                      a->vpinsrd(t,t, A::Mem{arg[immA], 1*16 + 4*immB}, 1);
3751                                      a->vpinsrd(t,t, A::Mem{arg[immA], 2*16 + 4*immB}, 2);
3752                                      a->vpinsrd(t,t, A::Mem{arg[immA], 3*16 + 4*immB}, 3);
3753
3754                                      // Load 4 high values into xmm dst(),
3755                                      A::Xmm d = (A::Xmm)dst();
3756                                      a->vmovd  (d,   A::Mem{arg[immA], 4*16 + 4*immB}   );
3757                                      a->vpinsrd(d,d, A::Mem{arg[immA], 5*16 + 4*immB}, 1);
3758                                      a->vpinsrd(d,d, A::Mem{arg[immA], 6*16 + 4*immB}, 2);
3759                                      a->vpinsrd(d,d, A::Mem{arg[immA], 7*16 + 4*immB}, 3);
3760
3761                                      // Merge the two, ymm dst() = {xmm tmp|xmm dst()}
3762                                      a->vperm2f128(dst(), tmp,dst(), 0x20);
3763                                      free_tmp(tmp);
3764                                  } break;
3765
3766                case Op::gather8: {
3767                    // As usual, the gather base pointer is immB bytes off of uniform immA.
3768                    a->mov(GP0, A::Mem{arg[immA], immB});
3769
3770                    A::Ymm tmp = alloc_tmp();
3771                    a->vmovups(tmp, any(x));
3772
3773                    for (int i = 0; i < active_lanes; i++) {
3774                        if (i == 4) {
3775                            // vpextrd can only pluck indices out from an Xmm register,
3776                            // so we manually swap over to the top when we're halfway through.
3777                            a->vextracti128((A::Xmm)tmp, tmp, 1);
3778                        }
3779                        a->vpextrd(GP1, (A::Xmm)tmp, i%4);
3780                        a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{GP0,0,GP1,A::ONE}, i);
3781                    }
3782                    a->vpmovzxbd(dst(), dst());
3783                    free_tmp(tmp);
3784                } break;
3785
3786                case Op::gather16: {
3787                    // Just as gather8 except vpinsrb->vpinsrw, ONE->TWO, and vpmovzxbd->vpmovzxwd.
3788                    a->mov(GP0, A::Mem{arg[immA], immB});
3789
3790                    A::Ymm tmp = alloc_tmp();
3791                    a->vmovups(tmp, any(x));
3792
3793                    for (int i = 0; i < active_lanes; i++) {
3794                        if (i == 4) {
3795                            a->vextracti128((A::Xmm)tmp, tmp, 1);
3796                        }
3797                        a->vpextrd(GP1, (A::Xmm)tmp, i%4);
3798                        a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), A::Mem{GP0,0,GP1,A::TWO}, i);
3799                    }
3800                    a->vpmovzxwd(dst(), dst());
3801                    free_tmp(tmp);
3802                } break;
3803
3804                case Op::gather32:
3805                if (scalar) {
3806                    // Our gather base pointer is immB bytes off of uniform immA.
3807                    a->mov(GP0, A::Mem{arg[immA], immB});
3808
3809                    // Grab our index from lane 0 of the index argument.
3810                    a->vmovd(GP1, (A::Xmm)r(x));
3811
3812                    // dst = *(base + 4*index)
3813                    a->vmovd((A::Xmm)dst(x), A::Mem{GP0, 0, GP1, A::FOUR});
3814                } else {
3815                    a->mov(GP0, A::Mem{arg[immA], immB});
3816
3817                    A::Ymm mask = alloc_tmp();
3818                    a->vpcmpeqd(mask, mask, mask);   // (All lanes enabled.)
3819
3820                    a->vgatherdps(dst(), A::FOUR, r(x), GP0, mask);
3821                    free_tmp(mask);
3822                }
3823                break;
3824
3825                case Op::uniform32: a->vbroadcastss(dst(), A::Mem{arg[immA], immB});
3826                                    break;
3827
3828                case Op::array32: a->mov(GP0, A::Mem{arg[immA], immB});
3829                                  a->vbroadcastss(dst(), A::Mem{GP0, immC});
3830                                  break;
3831
3832                case Op::index: a->vmovd((A::Xmm)dst(), N);
3833                                a->vbroadcastss(dst(), dst());
3834                                a->vpsubd(dst(), dst(), &iota);
3835                                break;
3836
3837                // We can swap the arguments of symmetric instructions to make better use of any().
3838                case Op::add_f32:
3839                    if (in_reg(x)) { a->vaddps(dst(x), r(x), any(y)); }
3840                    else           { a->vaddps(dst(y), r(y), any(x)); }
3841                                     break;
3842
3843                case Op::mul_f32:
3844                    if (in_reg(x)) { a->vmulps(dst(x), r(x), any(y)); }
3845                    else           { a->vmulps(dst(y), r(y), any(x)); }
3846                                     break;
3847
3848                case Op::sub_f32: a->vsubps(dst(x), r(x), any(y)); break;
3849                case Op::div_f32: a->vdivps(dst(x), r(x), any(y)); break;
3850                case Op::min_f32: a->vminps(dst(y), r(y), any(x)); break;  // Order matters,
3851                case Op::max_f32: a->vmaxps(dst(y), r(y), any(x)); break;  // see test SkVM_min_max.
3852
3853                case Op::fma_f32:
3854                    if (try_alias(x)) { a->vfmadd132ps(dst(x), r(z), any(y)); } else
3855                    if (try_alias(y)) { a->vfmadd213ps(dst(y), r(x), any(z)); } else
3856                    if (try_alias(z)) { a->vfmadd231ps(dst(z), r(x), any(y)); } else
3857                                      { a->vmovups    (dst(), any(x));
3858                                        a->vfmadd132ps(dst(), r(z), any(y)); }
3859                                        break;
3860
3861                case Op::fms_f32:
3862                    if (try_alias(x)) { a->vfmsub132ps(dst(x), r(z), any(y)); } else
3863                    if (try_alias(y)) { a->vfmsub213ps(dst(y), r(x), any(z)); } else
3864                    if (try_alias(z)) { a->vfmsub231ps(dst(z), r(x), any(y)); } else
3865                                      { a->vmovups    (dst(), any(x));
3866                                        a->vfmsub132ps(dst(), r(z), any(y)); }
3867                                        break;
3868
3869                case Op::fnma_f32:
3870                    if (try_alias(x)) { a->vfnmadd132ps(dst(x), r(z), any(y)); } else
3871                    if (try_alias(y)) { a->vfnmadd213ps(dst(y), r(x), any(z)); } else
3872                    if (try_alias(z)) { a->vfnmadd231ps(dst(z), r(x), any(y)); } else
3873                                      { a->vmovups     (dst(), any(x));
3874                                        a->vfnmadd132ps(dst(), r(z), any(y)); }
3875                                        break;
3876
3877                // In situations like this we want to try aliasing dst(x) when x is
3878                // already in a register, but not if we'd have to load it from the stack
3879                // just to alias it.  That's done better directly into the new register.
3880                case Op::sqrt_f32:
3881                    if (in_reg(x)) { a->vsqrtps(dst(x),  r(x)); }
3882                    else           { a->vsqrtps(dst(), any(x)); }
3883                                     break;
3884
3885                case Op::add_i32:
3886                    if (in_reg(x)) { a->vpaddd(dst(x), r(x), any(y)); }
3887                    else           { a->vpaddd(dst(y), r(y), any(x)); }
3888                                     break;
3889
3890                case Op::mul_i32:
3891                    if (in_reg(x)) { a->vpmulld(dst(x), r(x), any(y)); }
3892                    else           { a->vpmulld(dst(y), r(y), any(x)); }
3893                                     break;
3894
3895                case Op::sub_i32: a->vpsubd(dst(x), r(x), any(y)); break;
3896
3897                case Op::bit_and:
3898                    if (in_reg(x)) { a->vpand(dst(x), r(x), any(y)); }
3899                    else           { a->vpand(dst(y), r(y), any(x)); }
3900                                     break;
3901                case Op::bit_or:
3902                    if (in_reg(x)) { a->vpor(dst(x), r(x), any(y)); }
3903                    else           { a->vpor(dst(y), r(y), any(x)); }
3904                                     break;
3905                case Op::bit_xor:
3906                    if (in_reg(x)) { a->vpxor(dst(x), r(x), any(y)); }
3907                    else           { a->vpxor(dst(y), r(y), any(x)); }
3908                                     break;
3909
3910                case Op::bit_clear: a->vpandn(dst(y), r(y), any(x)); break; // Notice, y then x.
3911
3912                case Op::select:
3913                    if (try_alias(z)) { a->vpblendvb(dst(z), r(z), any(y), r(x)); }
3914                    else              { a->vpblendvb(dst(x), r(z), any(y), r(x)); }
3915                                        break;
3916
3917                case Op::shl_i32: a->vpslld(dst(x), r(x), immA); break;
3918                case Op::shr_i32: a->vpsrld(dst(x), r(x), immA); break;
3919                case Op::sra_i32: a->vpsrad(dst(x), r(x), immA); break;
3920
3921                case Op::eq_i32:
3922                    if (in_reg(x)) { a->vpcmpeqd(dst(x), r(x), any(y)); }
3923                    else           { a->vpcmpeqd(dst(y), r(y), any(x)); }
3924                                     break;
3925
3926                case Op::gt_i32: a->vpcmpgtd(dst(), r(x), any(y)); break;
3927
3928                case Op::eq_f32:
3929                    if (in_reg(x)) { a->vcmpeqps(dst(x), r(x), any(y)); }
3930                    else           { a->vcmpeqps(dst(y), r(y), any(x)); }
3931                                     break;
3932                case Op::neq_f32:
3933                    if (in_reg(x)) { a->vcmpneqps(dst(x), r(x), any(y)); }
3934                    else           { a->vcmpneqps(dst(y), r(y), any(x)); }
3935                                     break;
3936
3937                case Op:: gt_f32: a->vcmpltps (dst(y), r(y), any(x)); break;
3938                case Op::gte_f32: a->vcmpleps (dst(y), r(y), any(x)); break;
3939
3940                case Op::ceil:
3941                    if (in_reg(x)) { a->vroundps(dst(x),  r(x), Assembler::CEIL); }
3942                    else           { a->vroundps(dst(), any(x), Assembler::CEIL); }
3943                                     break;
3944
3945                case Op::floor:
3946                    if (in_reg(x)) { a->vroundps(dst(x),  r(x), Assembler::FLOOR); }
3947                    else           { a->vroundps(dst(), any(x), Assembler::FLOOR); }
3948                                     break;
3949
3950                case Op::to_f32:
3951                    if (in_reg(x)) { a->vcvtdq2ps(dst(x),  r(x)); }
3952                    else           { a->vcvtdq2ps(dst(), any(x)); }
3953                                     break;
3954
3955                case Op::trunc:
3956                    if (in_reg(x)) { a->vcvttps2dq(dst(x),  r(x)); }
3957                    else           { a->vcvttps2dq(dst(), any(x)); }
3958                                     break;
3959
3960                case Op::round:
3961                    if (in_reg(x)) { a->vcvtps2dq(dst(x),  r(x)); }
3962                    else           { a->vcvtps2dq(dst(), any(x)); }
3963                                     break;
3964
3965                case Op::to_fp16:
3966                    a->vcvtps2ph(dst(x), r(x), A::CURRENT);  // f32 ymm -> f16 xmm
3967                    a->vpmovzxwd(dst(), dst());              // f16 xmm -> f16 ymm
3968                    break;
3969
3970                case Op::from_fp16:
3971                    a->vpackusdw(dst(x), r(x), r(x));  // f16 ymm -> f16 xmm
3972                    a->vpermq   (dst(), dst(), 0xd8);  // swap middle two 64-bit lanes
3973                    a->vcvtph2ps(dst(), dst());        // f16 xmm -> f32 ymm
3974                    break;
3975
3976            #elif defined(__aarch64__)
3977                case Op::assert_true: {
3978                    a->uminv4s(dst(), r(x));   // uminv acts like an all() across the vector.
3979                    a->movs(GP0, dst(), 0);
3980                    A::Label all_true;
3981                    a->cbnz(GP0, &all_true);
3982                    a->brk(0);
3983                    a->label(&all_true);
3984                } break;
3985
3986                case Op::trace_line:
3987                case Op::trace_var:
3988                case Op::trace_call:
3989                    /* Only supported in the interpreter. */
3990                    break;
3991
3992                case Op::index: {
3993                    A::V tmp = alloc_tmp();
3994                    a->ldrq (tmp, &iota);
3995                    a->dup4s(dst(), N);
3996                    a->sub4s(dst(), dst(), tmp);
3997                    free_tmp(tmp);
3998                } break;
3999
4000                case Op::store8: a->xtns2h(dst(x), r(x));
4001                                 a->xtnh2b(dst(), dst());
4002                   if (scalar) { a->strb  (dst(), arg[immA]); }
4003                   else        { a->strs  (dst(), arg[immA]); }
4004                                 break;
4005
4006                case Op::store16: a->xtns2h(dst(x), r(x));
4007                    if (scalar) { a->strh  (dst(), arg[immA]); }
4008                    else        { a->strd  (dst(), arg[immA]); }
4009                                  break;
4010
4011                case Op::store32: if (scalar) { a->strs(r(x), arg[immA]); }
4012                                  else        { a->strq(r(x), arg[immA]); }
4013                                                break;
4014
4015                case Op::store64: if (scalar) {
4016                                      a->strs(r(x), arg[immA], 0);
4017                                      a->strs(r(y), arg[immA], 1);
4018                                  } else if (r(y) == r(x)+1) {
4019                                      a->st24s(r(x), arg[immA]);
4020                                  } else {
4021                                      Reg tmp0 = alloc_tmp(2),
4022                                          tmp1 = (Reg)(tmp0+1);
4023                                      a->orr16b(tmp0, r(x), r(x));
4024                                      a->orr16b(tmp1, r(y), r(y));
4025                                      a-> st24s(tmp0, arg[immA]);
4026                                      free_tmp(tmp0);
4027                                      free_tmp(tmp1);
4028                                  } break;
4029
4030                case Op::store128:
4031                    if (scalar) {
4032                        a->strs(r(x), arg[immA], 0);
4033                        a->strs(r(y), arg[immA], 1);
4034                        a->strs(r(z), arg[immA], 2);
4035                        a->strs(r(w), arg[immA], 3);
4036                    } else if (r(y) == r(x)+1 &&
4037                               r(z) == r(x)+2 &&
4038                               r(w) == r(x)+3) {
4039                        a->st44s(r(x), arg[immA]);
4040                    } else {
4041                        Reg tmp0 = alloc_tmp(4),
4042                            tmp1 = (Reg)(tmp0+1),
4043                            tmp2 = (Reg)(tmp0+2),
4044                            tmp3 = (Reg)(tmp0+3);
4045                        a->orr16b(tmp0, r(x), r(x));
4046                        a->orr16b(tmp1, r(y), r(y));
4047                        a->orr16b(tmp2, r(z), r(z));
4048                        a->orr16b(tmp3, r(w), r(w));
4049                        a-> st44s(tmp0, arg[immA]);
4050                        free_tmp(tmp0);
4051                        free_tmp(tmp1);
4052                        free_tmp(tmp2);
4053                        free_tmp(tmp3);
4054                    } break;
4055
4056
4057                case Op::load8: if (scalar) { a->ldrb(dst(), arg[immA]); }
4058                                else        { a->ldrs(dst(), arg[immA]); }
4059                                              a->uxtlb2h(dst(), dst());
4060                                              a->uxtlh2s(dst(), dst());
4061                                              break;
4062
4063                case Op::load16: if (scalar) { a->ldrh(dst(), arg[immA]); }
4064                                 else        { a->ldrd(dst(), arg[immA]); }
4065                                               a->uxtlh2s(dst(), dst());
4066                                               break;
4067
4068                case Op::load32: if (scalar) { a->ldrs(dst(), arg[immA]); }
4069                                 else        { a->ldrq(dst(), arg[immA]); }
4070                                               break;
4071
4072                case Op::load64: if (scalar) {
4073                                    a->ldrs(dst(), arg[immA], immB);
4074                                 } else {
4075                                    Reg tmp0 = alloc_tmp(2),
4076                                        tmp1 = (Reg)(tmp0+1);
4077                                    a->ld24s(tmp0, arg[immA]);
4078                                    // TODO: return both
4079                                    switch (immB) {
4080                                        case 0: mark_tmp_as_dst(tmp0); free_tmp(tmp1); break;
4081                                        case 1: mark_tmp_as_dst(tmp1); free_tmp(tmp0); break;
4082                                    }
4083                                 } break;
4084
4085                case Op::load128: if (scalar) {
4086                                      a->ldrs(dst(), arg[immA], immB);
4087                                  } else {
4088                                      Reg tmp0 = alloc_tmp(4),
4089                                          tmp1 = (Reg)(tmp0+1),
4090                                          tmp2 = (Reg)(tmp0+2),
4091                                          tmp3 = (Reg)(tmp0+3);
4092                                      a->ld44s(tmp0, arg[immA]);
4093                                      // TODO: return all four
4094                                      switch (immB) {
4095                                          case 0: mark_tmp_as_dst(tmp0); break;
4096                                          case 1: mark_tmp_as_dst(tmp1); break;
4097                                          case 2: mark_tmp_as_dst(tmp2); break;
4098                                          case 3: mark_tmp_as_dst(tmp3); break;
4099                                      }
4100                                      if (immB != 0) { free_tmp(tmp0); }
4101                                      if (immB != 1) { free_tmp(tmp1); }
4102                                      if (immB != 2) { free_tmp(tmp2); }
4103                                      if (immB != 3) { free_tmp(tmp3); }
4104                                  } break;
4105
4106                case Op::uniform32: a->add(GP0, arg[immA], immB);
4107                                    a->ld1r4s(dst(), GP0);
4108                                    break;
4109
4110                case Op::array32: a->add(GP0, arg[immA], immB);
4111                                  a->ldrd(GP0, GP0);
4112                                  a->add(GP0, GP0, immC);
4113                                  a->ld1r4s(dst(), GP0);
4114                                  break;
4115
4116                case Op::gather8: {
4117                    // As usual, the gather base pointer is immB bytes off of uniform immA.
4118                    a->add (GP0, arg[immA], immB);  // GP0 = &(gather base pointer)
4119                    a->ldrd(GP0, GP0);              // GP0 =   gather base pointer
4120
4121                    for (int i = 0; i < active_lanes; i++) {
4122                        a->movs(GP1, r(x), i);    // Extract index lane i into GP1.
4123                        a->add (GP1, GP0, GP1);   // Add the gather base pointer.
4124                        a->ldrb(GP1, GP1);        // Load that byte.
4125                        a->inss(dst(x), GP1, i);  // Insert it into dst() lane i.
4126                    }
4127                } break;
4128
4129                // See gather8 for general idea; comments here only where gather16 differs.
4130                case Op::gather16: {
4131                    a->add (GP0, arg[immA], immB);
4132                    a->ldrd(GP0, GP0);
4133                    for (int i = 0; i < active_lanes; i++) {
4134                        a->movs(GP1, r(x), i);
4135                        a->add (GP1, GP0, GP1, A::LSL, 1);  // Scale index 2x into a byte offset.
4136                        a->ldrh(GP1, GP1);                  // 2-byte load.
4137                        a->inss(dst(x), GP1, i);
4138                    }
4139                } break;
4140
4141                // See gather8 for general idea; comments here only where gather32 differs.
4142                case Op::gather32: {
4143                    a->add (GP0, arg[immA], immB);
4144                    a->ldrd(GP0, GP0);
4145                    for (int i = 0; i < active_lanes; i++) {
4146                        a->movs(GP1, r(x), i);
4147                        a->add (GP1, GP0, GP1, A::LSL, 2);  // Scale index 4x into a byte offset.
4148                        a->ldrs(GP1, GP1);                  // 4-byte load.
4149                        a->inss(dst(x), GP1, i);
4150                    }
4151                } break;
4152
4153                case Op::add_f32: a->fadd4s(dst(x,y), r(x), r(y)); break;
4154                case Op::sub_f32: a->fsub4s(dst(x,y), r(x), r(y)); break;
4155                case Op::mul_f32: a->fmul4s(dst(x,y), r(x), r(y)); break;
4156                case Op::div_f32: a->fdiv4s(dst(x,y), r(x), r(y)); break;
4157
4158                case Op::sqrt_f32: a->fsqrt4s(dst(x), r(x)); break;
4159
4160                case Op::fma_f32: // fmla.4s is z += x*y
4161                    if (try_alias(z)) { a->fmla4s( r(z), r(x), r(y)); }
4162                    else              { a->orr16b(dst(), r(z), r(z));
4163                                        a->fmla4s(dst(), r(x), r(y)); }
4164                                        break;
4165
4166                case Op::fnma_f32:  // fmls.4s is z -= x*y
4167                    if (try_alias(z)) { a->fmls4s( r(z), r(x), r(y)); }
4168                    else              { a->orr16b(dst(), r(z), r(z));
4169                                        a->fmls4s(dst(), r(x), r(y)); }
4170                                        break;
4171
4172                case Op::fms_f32:   // calculate z - xy, then negate to xy - z
4173                    if (try_alias(z)) { a->fmls4s( r(z), r(x), r(y)); }
4174                    else              { a->orr16b(dst(), r(z), r(z));
4175                                        a->fmls4s(dst(), r(x), r(y)); }
4176                                        a->fneg4s(dst(), dst());
4177                                        break;
4178
4179                case Op:: gt_f32: a->fcmgt4s (dst(x,y), r(x), r(y)); break;
4180                case Op::gte_f32: a->fcmge4s (dst(x,y), r(x), r(y)); break;
4181                case Op:: eq_f32: a->fcmeq4s (dst(x,y), r(x), r(y)); break;
4182                case Op::neq_f32: a->fcmeq4s (dst(x,y), r(x), r(y));
4183                                  a->not16b  (dst(), dst());         break;
4184
4185
4186                case Op::add_i32: a->add4s(dst(x,y), r(x), r(y)); break;
4187                case Op::sub_i32: a->sub4s(dst(x,y), r(x), r(y)); break;
4188                case Op::mul_i32: a->mul4s(dst(x,y), r(x), r(y)); break;
4189
4190                case Op::bit_and  : a->and16b(dst(x,y), r(x), r(y)); break;
4191                case Op::bit_or   : a->orr16b(dst(x,y), r(x), r(y)); break;
4192                case Op::bit_xor  : a->eor16b(dst(x,y), r(x), r(y)); break;
4193                case Op::bit_clear: a->bic16b(dst(x,y), r(x), r(y)); break;
4194
4195                case Op::select: // bsl16b is x = x ? y : z
4196                    if (try_alias(x)) { a->bsl16b( r(x), r(y), r(z)); }
4197                    else              { a->orr16b(dst(), r(x), r(x));
4198                                        a->bsl16b(dst(), r(y), r(z)); }
4199                                        break;
4200
4201                // fmin4s and fmax4s don't work the way we want with NaN,
4202                // so we write them the long way:
4203                case Op::min_f32: // min(x,y) = y<x ? y : x
4204                                  a->fcmgt4s(dst(), r(x), r(y));
4205                                  a->bsl16b (dst(), r(y), r(x));
4206                                  break;
4207
4208                case Op::max_f32: // max(x,y) = x<y ? y : x
4209                                  a->fcmgt4s(dst(), r(y), r(x));
4210                                  a->bsl16b (dst(), r(y), r(x));
4211                                  break;
4212
4213                case Op::shl_i32: a-> shl4s(dst(x), r(x), immA); break;
4214                case Op::shr_i32: a->ushr4s(dst(x), r(x), immA); break;
4215                case Op::sra_i32: a->sshr4s(dst(x), r(x), immA); break;
4216
4217                case Op::eq_i32: a->cmeq4s(dst(x,y), r(x), r(y)); break;
4218                case Op::gt_i32: a->cmgt4s(dst(x,y), r(x), r(y)); break;
4219
4220                case Op::to_f32: a->scvtf4s (dst(x), r(x)); break;
4221                case Op::trunc:  a->fcvtzs4s(dst(x), r(x)); break;
4222                case Op::round:  a->fcvtns4s(dst(x), r(x)); break;
4223                case Op::ceil:   a->frintp4s(dst(x), r(x)); break;
4224                case Op::floor:  a->frintm4s(dst(x), r(x)); break;
4225
4226                case Op::to_fp16:
4227                    a->fcvtn  (dst(x), r(x));    // 4x f32 -> 4x f16 in bottom four lanes
4228                    a->uxtlh2s(dst(), dst());    // expand to 4x f16 in even 16-bit lanes
4229                    break;
4230
4231                case Op::from_fp16:
4232                    a->xtns2h(dst(x), r(x));     // pack even 16-bit lanes into bottom four lanes
4233                    a->fcvtl (dst(), dst());     // 4x f16 -> 4x f32
4234                    break;
4235            #endif
4236            }
4237
4238            // Proactively free the registers holding any value that dies here.
4239            if (rd != NA &&                   dies_here(regs[rd])) { regs[rd] = NA; }
4240            if (rx != NA && regs[rx] != NA && dies_here(regs[rx])) { regs[rx] = NA; }
4241            if (ry != NA && regs[ry] != NA && dies_here(regs[ry])) { regs[ry] = NA; }
4242            if (rz != NA && regs[rz] != NA && dies_here(regs[rz])) { regs[rz] = NA; }
4243            if (rw != NA && regs[rw] != NA && dies_here(regs[rw])) { regs[rw] = NA; }
4244            return true;
4245        };
4246
4247        #if defined(__x86_64__) || defined(_M_X64)
4248            auto jump_if_less = [&](A::Label* l) { a->jl (l); };
4249            auto jump         = [&](A::Label* l) { a->jmp(l); };
4250
4251            auto add = [&](A::GP64 gp, int imm) { a->add(gp, imm); };
4252            auto sub = [&](A::GP64 gp, int imm) { a->sub(gp, imm); };
4253        #elif defined(__aarch64__)
4254            auto jump_if_less = [&](A::Label* l) { a->blt(l); };
4255            auto jump         = [&](A::Label* l) { a->b  (l); };
4256
4257            auto add = [&](A::X gp, int imm) { a->add(gp, gp, imm); };
4258            auto sub = [&](A::X gp, int imm) { a->sub(gp, gp, imm); };
4259        #endif
4260
4261        A::Label body,
4262                 tail,
4263                 done;
4264
4265        enter();
4266        for (Val id = 0; id < (Val)instructions.size(); id++) {
4267            if (instructions[id].can_hoist && !emit(id, /*scalar=*/false)) {
4268                return false;
4269            }
4270        }
4271
4272        // This point marks a kind of canonical fixed point for register contents: if loop
4273        // code is generated as if these registers are holding these values, the next time
4274        // the loop comes around we'd better find those same registers holding those same values.
4275        auto restore_incoming_regs = [&,incoming=regs,saved_stack_slot=stack_slot,
4276                                      saved_next_stack_slot=next_stack_slot]{
4277            for (int r = 0; r < (int)regs.size(); r++) {
4278                if (regs[r] != incoming[r]) {
4279                    regs[r]  = incoming[r];
4280                    if (regs[r] >= 0) {
4281                        load_from_memory((Reg)r, regs[r]);
4282                    }
4283                }
4284            }
4285            *stack_hint = std::max(*stack_hint, next_stack_slot);
4286            stack_slot = saved_stack_slot;
4287            next_stack_slot = saved_next_stack_slot;
4288        };
4289
4290        a->label(&body);
4291        {
4292            a->cmp(N, K);
4293            jump_if_less(&tail);
4294            for (Val id = 0; id < (Val)instructions.size(); id++) {
4295                if (!instructions[id].can_hoist && !emit(id, /*scalar=*/false)) {
4296                    return false;
4297                }
4298            }
4299            restore_incoming_regs();
4300            for (int i = 0; i < (int)fImpl->strides.size(); i++) {
4301                if (fImpl->strides[i]) {
4302                    add(arg[i], K*fImpl->strides[i]);
4303                }
4304            }
4305            sub(N, K);
4306            jump(&body);
4307        }
4308
4309        a->label(&tail);
4310        {
4311            a->cmp(N, 1);
4312            jump_if_less(&done);
4313            for (Val id = 0; id < (Val)instructions.size(); id++) {
4314                if (!instructions[id].can_hoist && !emit(id, /*scalar=*/true)) {
4315                    return false;
4316                }
4317            }
4318            restore_incoming_regs();
4319            for (int i = 0; i < (int)fImpl->strides.size(); i++) {
4320                if (fImpl->strides[i]) {
4321                    add(arg[i], 1*fImpl->strides[i]);
4322                }
4323            }
4324            sub(N, 1);
4325            jump(&tail);
4326        }
4327
4328        a->label(&done);
4329        {
4330            exit();
4331        }
4332
4333        // Except for explicit aligned load and store instructions, AVX allows
4334        // memory operands to be unaligned.  So even though we're creating 16
4335        // byte patterns on ARM or 32-byte patterns on x86, we only need to
4336        // align to 4 bytes, the element size and alignment requirement.
4337
4338        constants.foreach([&](int imm, A::Label* label) {
4339            a->align(4);
4340            a->label(label);
4341            for (int i = 0; i < K; i++) {
4342                a->word(imm);
4343            }
4344        });
4345
4346        if (!iota.references.empty()) {
4347            a->align(4);
4348            a->label(&iota);        // 0,1,2,3,4,...
4349            for (int i = 0; i < K; i++) {
4350                a->word(i);
4351            }
4352        }
4353
4354        if (!load64_index.references.empty()) {
4355            a->align(4);
4356            a->label(&load64_index);  // {0,2,4,6|1,3,5,7}
4357            a->word(0); a->word(2); a->word(4); a->word(6);
4358            a->word(1); a->word(3); a->word(5); a->word(7);
4359        }
4360
4361        return true;
4362    }
4363
4364    void Program::setupJIT(const std::vector<OptimizedInstruction>& instructions,
4365                           const char* debug_name) {
4366        // Assemble with no buffer to determine a.size() (the number of bytes we'll assemble)
4367        // and stack_hint/registers_used to feed forward into the next jit() call.
4368        Assembler a{nullptr};
4369        int stack_hint = -1;
4370        uint32_t registers_used = 0xffff'ffff;  // Start conservatively with all.
4371        if (!this->jit(instructions, &stack_hint, &registers_used, &a)) {
4372            return;
4373        }
4374
4375        fImpl->jit_size = a.size();
4376        void* jit_entry = alloc_jit_buffer(&fImpl->jit_size);
4377        fImpl->jit_entry.store(jit_entry);
4378
4379        // Assemble the program for real with stack_hint/registers_used as feedback from first call.
4380        a = Assembler{jit_entry};
4381        SkAssertResult(this->jit(instructions, &stack_hint, &registers_used, &a));
4382        SkASSERT(a.size() <= fImpl->jit_size);
4383
4384        // Remap as executable, and flush caches on platforms that need that.
4385        remap_as_executable(jit_entry, fImpl->jit_size);
4386
4387        notify_vtune(debug_name, jit_entry, fImpl->jit_size);
4388
4389    #if !defined(SK_BUILD_FOR_WIN)
4390        // For profiling and debugging, it's helpful to have this code loaded
4391        // dynamically rather than just jumping info fImpl->jit_entry.
4392        if (gSkVMJITViaDylib) {
4393            // Dump the raw program binary.
4394            SkString path = SkStringPrintf("/tmp/%s.XXXXXX", debug_name);
4395            int fd = mkstemp(path.writable_str());
4396            ::write(fd, jit_entry, a.size());
4397            close(fd);
4398
4399            this->dropJIT();  // (unmap and null out fImpl->jit_entry.)
4400
4401            // Convert it in-place to a dynamic library with a single symbol "skvm_jit":
4402            SkString cmd = SkStringPrintf(
4403                    "echo '.global _skvm_jit\n_skvm_jit: .incbin \"%s\"'"
4404                    " | clang -x assembler -shared - -o %s",
4405                    path.c_str(), path.c_str());
4406            system(cmd.c_str());
4407
4408            // Load that dynamic library and look up skvm_jit().
4409            fImpl->dylib = dlopen(path.c_str(), RTLD_NOW|RTLD_LOCAL);
4410            void* sym = nullptr;
4411            for (const char* name : {"skvm_jit", "_skvm_jit"} ) {
4412                if (!sym) { sym = dlsym(fImpl->dylib, name); }
4413            }
4414            fImpl->jit_entry.store(sym);
4415        }
4416    #endif
4417    }
4418
4419    void Program::disassemble(SkWStream* o) const {
4420    #if !defined(SK_BUILD_FOR_WIN)
4421        SkDebugfStream debug;
4422        if (!o) { o = &debug; }
4423
4424        const void* jit_entry = fImpl->jit_entry.load();
4425        size_t jit_size = fImpl->jit_size;
4426
4427        if (!jit_entry) {
4428            o->writeText("Program not JIT'd. Did you pass --jit?\n");
4429            return;
4430        }
4431
4432        char path[] = "/tmp/skvm-jit.XXXXXX";
4433        int fd = mkstemp(path);
4434        ::write(fd, jit_entry, jit_size);
4435        close(fd);
4436
4437        // Convert it in-place to a dynamic library with a single symbol "skvm_jit":
4438        SkString cmd = SkStringPrintf(
4439                "echo '.global _skvm_jit\n_skvm_jit: .incbin \"%s\"'"
4440                " | clang -x assembler -shared - -o %s",
4441                path, path);
4442        system(cmd.c_str());
4443
4444        // Now objdump to disassemble our function:
4445        // TODO: We could trim this down to just our code using '--disassemble=<symbol name>`,
4446        // but the symbol name varies with OS, and that option may be missing from objdump on some
4447        // machines? There also apears to be quite a bit of junk after the end of the JIT'd code.
4448        // Trimming that would let us pass '--visualize-jumps' and get the loop annotated.
4449        // With the junk, we tend to end up with a bunch of stray jumps that pollute the ASCII art.
4450        cmd = SkStringPrintf("objdump -D %s", path);
4451    #if defined(SK_BUILD_FOR_UNIX)
4452        cmd.append(" --section=.text");
4453    #endif
4454        FILE* fp = popen(cmd.c_str(), "r");
4455        if (!fp) {
4456            o->writeText("objdump failed\n");
4457            return;
4458        }
4459
4460        char line[1024];
4461        while (fgets(line, sizeof(line), fp)) {
4462            o->writeText(line);
4463        }
4464
4465        pclose(fp);
4466    #endif
4467    }
4468
4469#endif
4470
4471}  // namespace skvm
4472