1cb93a386Sopenharmony_ci/* 2cb93a386Sopenharmony_ci * Copyright 2018 Google Inc. 3cb93a386Sopenharmony_ci * 4cb93a386Sopenharmony_ci * Use of this source code is governed by a BSD-style license that can be 5cb93a386Sopenharmony_ci * found in the LICENSE file. 6cb93a386Sopenharmony_ci */ 7cb93a386Sopenharmony_ci 8cb93a386Sopenharmony_ci#ifndef SkRasterPipeline_opts_DEFINED 9cb93a386Sopenharmony_ci#define SkRasterPipeline_opts_DEFINED 10cb93a386Sopenharmony_ci 11cb93a386Sopenharmony_ci#include "include/core/SkData.h" 12cb93a386Sopenharmony_ci#include "include/core/SkTypes.h" 13cb93a386Sopenharmony_ci#include "src/core/SkUtils.h" // unaligned_{load,store} 14cb93a386Sopenharmony_ci#include <cstdint> 15cb93a386Sopenharmony_ci 16cb93a386Sopenharmony_ci// Every function in this file should be marked static and inline using SI. 17cb93a386Sopenharmony_ci#if defined(__clang__) 18cb93a386Sopenharmony_ci #define SI __attribute__((always_inline)) static inline 19cb93a386Sopenharmony_ci#else 20cb93a386Sopenharmony_ci #define SI static inline 21cb93a386Sopenharmony_ci#endif 22cb93a386Sopenharmony_ci 23cb93a386Sopenharmony_citemplate <typename Dst, typename Src> 24cb93a386Sopenharmony_ciSI Dst widen_cast(const Src& src) { 25cb93a386Sopenharmony_ci static_assert(sizeof(Dst) > sizeof(Src)); 26cb93a386Sopenharmony_ci static_assert(std::is_trivially_copyable<Dst>::value); 27cb93a386Sopenharmony_ci static_assert(std::is_trivially_copyable<Src>::value); 28cb93a386Sopenharmony_ci Dst dst; 29cb93a386Sopenharmony_ci memcpy(&dst, &src, sizeof(Src)); 30cb93a386Sopenharmony_ci return dst; 31cb93a386Sopenharmony_ci} 32cb93a386Sopenharmony_ci 33cb93a386Sopenharmony_ci// Our program is an array of void*, either 34cb93a386Sopenharmony_ci// - 1 void* per stage with no context pointer, the next stage; 35cb93a386Sopenharmony_ci// - 2 void* per stage with a context pointer, first the context pointer, then the next stage. 36cb93a386Sopenharmony_ci 37cb93a386Sopenharmony_ci// load_and_inc() steps the program forward by 1 void*, returning that pointer. 38cb93a386Sopenharmony_ciSI void* load_and_inc(void**& program) { 39cb93a386Sopenharmony_ci#if defined(__GNUC__) && defined(__x86_64__) 40cb93a386Sopenharmony_ci // If program is in %rsi (we try to make this likely) then this is a single instruction. 41cb93a386Sopenharmony_ci void* rax; 42cb93a386Sopenharmony_ci asm("lodsq" : "=a"(rax), "+S"(program)); // Write-only %rax, read-write %rsi. 43cb93a386Sopenharmony_ci return rax; 44cb93a386Sopenharmony_ci#else 45cb93a386Sopenharmony_ci // On ARM *program++ compiles into pretty ideal code without any handholding. 46cb93a386Sopenharmony_ci return *program++; 47cb93a386Sopenharmony_ci#endif 48cb93a386Sopenharmony_ci} 49cb93a386Sopenharmony_ci 50cb93a386Sopenharmony_ci// Lazily resolved on first cast. Does nothing if cast to Ctx::None. 51cb93a386Sopenharmony_cistruct Ctx { 52cb93a386Sopenharmony_ci struct None {}; 53cb93a386Sopenharmony_ci 54cb93a386Sopenharmony_ci void* ptr; 55cb93a386Sopenharmony_ci void**& program; 56cb93a386Sopenharmony_ci 57cb93a386Sopenharmony_ci explicit Ctx(void**& p) : ptr(nullptr), program(p) {} 58cb93a386Sopenharmony_ci 59cb93a386Sopenharmony_ci template <typename T> 60cb93a386Sopenharmony_ci operator T*() { 61cb93a386Sopenharmony_ci if (!ptr) { ptr = load_and_inc(program); } 62cb93a386Sopenharmony_ci return (T*)ptr; 63cb93a386Sopenharmony_ci } 64cb93a386Sopenharmony_ci operator None() { return None{}; } 65cb93a386Sopenharmony_ci}; 66cb93a386Sopenharmony_ci 67cb93a386Sopenharmony_ci 68cb93a386Sopenharmony_ci#if !defined(__clang__) 69cb93a386Sopenharmony_ci #define JUMPER_IS_SCALAR 70cb93a386Sopenharmony_ci#elif defined(SK_ARM_HAS_NEON) 71cb93a386Sopenharmony_ci #define JUMPER_IS_NEON 72cb93a386Sopenharmony_ci#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX 73cb93a386Sopenharmony_ci #define JUMPER_IS_SKX 74cb93a386Sopenharmony_ci#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 75cb93a386Sopenharmony_ci #define JUMPER_IS_HSW 76cb93a386Sopenharmony_ci#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX 77cb93a386Sopenharmony_ci #define JUMPER_IS_AVX 78cb93a386Sopenharmony_ci#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 79cb93a386Sopenharmony_ci #define JUMPER_IS_SSE41 80cb93a386Sopenharmony_ci#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 81cb93a386Sopenharmony_ci #define JUMPER_IS_SSE2 82cb93a386Sopenharmony_ci#else 83cb93a386Sopenharmony_ci #define JUMPER_IS_SCALAR 84cb93a386Sopenharmony_ci#endif 85cb93a386Sopenharmony_ci 86cb93a386Sopenharmony_ci// Older Clangs seem to crash when generating non-optimized NEON code for ARMv7. 87cb93a386Sopenharmony_ci#if defined(__clang__) && !defined(__OPTIMIZE__) && defined(SK_CPU_ARM32) 88cb93a386Sopenharmony_ci // Apple Clang 9 and vanilla Clang 5 are fine, and may even be conservative. 89cb93a386Sopenharmony_ci #if defined(__apple_build_version__) && __clang_major__ < 9 90cb93a386Sopenharmony_ci #define JUMPER_IS_SCALAR 91cb93a386Sopenharmony_ci #elif __clang_major__ < 5 92cb93a386Sopenharmony_ci #define JUMPER_IS_SCALAR 93cb93a386Sopenharmony_ci #endif 94cb93a386Sopenharmony_ci 95cb93a386Sopenharmony_ci #if defined(JUMPER_IS_NEON) && defined(JUMPER_IS_SCALAR) 96cb93a386Sopenharmony_ci #undef JUMPER_IS_NEON 97cb93a386Sopenharmony_ci #endif 98cb93a386Sopenharmony_ci#endif 99cb93a386Sopenharmony_ci 100cb93a386Sopenharmony_ci#if defined(JUMPER_IS_SCALAR) 101cb93a386Sopenharmony_ci #include <math.h> 102cb93a386Sopenharmony_ci#elif defined(JUMPER_IS_NEON) 103cb93a386Sopenharmony_ci #include <arm_neon.h> 104cb93a386Sopenharmony_ci#else 105cb93a386Sopenharmony_ci #include <immintrin.h> 106cb93a386Sopenharmony_ci#endif 107cb93a386Sopenharmony_ci 108cb93a386Sopenharmony_ci// Notes: 109cb93a386Sopenharmony_ci// * rcp_fast and rcp_precise both produce a reciprocal, but rcp_fast is an estimate with at least 110cb93a386Sopenharmony_ci// 12 bits of precision while rcp_precise should be accurate for float size. For ARM rcp_precise 111cb93a386Sopenharmony_ci// requires 2 Newton-Raphson refinement steps because its estimate has 8 bit precision, and for 112cb93a386Sopenharmony_ci// Intel this requires one additional step because its estimate has 12 bit precision. 113cb93a386Sopenharmony_ci 114cb93a386Sopenharmony_cinamespace SK_OPTS_NS { 115cb93a386Sopenharmony_ci#if defined(JUMPER_IS_SCALAR) 116cb93a386Sopenharmony_ci // This path should lead to portable scalar code. 117cb93a386Sopenharmony_ci using F = float ; 118cb93a386Sopenharmony_ci using I32 = int32_t; 119cb93a386Sopenharmony_ci using U64 = uint64_t; 120cb93a386Sopenharmony_ci using U32 = uint32_t; 121cb93a386Sopenharmony_ci using U16 = uint16_t; 122cb93a386Sopenharmony_ci using U8 = uint8_t ; 123cb93a386Sopenharmony_ci 124cb93a386Sopenharmony_ci SI F mad(F f, F m, F a) { return f*m+a; } 125cb93a386Sopenharmony_ci SI F min(F a, F b) { return fminf(a,b); } 126cb93a386Sopenharmony_ci SI F max(F a, F b) { return fmaxf(a,b); } 127cb93a386Sopenharmony_ci SI F abs_ (F v) { return fabsf(v); } 128cb93a386Sopenharmony_ci SI F floor_(F v) { return floorf(v); } 129cb93a386Sopenharmony_ci SI F rcp_fast(F v) { return 1.0f / v; } 130cb93a386Sopenharmony_ci SI F rsqrt (F v) { return 1.0f / sqrtf(v); } 131cb93a386Sopenharmony_ci SI F sqrt_ (F v) { return sqrtf(v); } 132cb93a386Sopenharmony_ci SI F rcp_precise (F v) { return 1.0f / v; } 133cb93a386Sopenharmony_ci 134cb93a386Sopenharmony_ci SI U32 round (F v, F scale) { return (uint32_t)(v*scale + 0.5f); } 135cb93a386Sopenharmony_ci SI U16 pack(U32 v) { return (U16)v; } 136cb93a386Sopenharmony_ci SI U8 pack(U16 v) { return (U8)v; } 137cb93a386Sopenharmony_ci 138cb93a386Sopenharmony_ci SI F if_then_else(I32 c, F t, F e) { return c ? t : e; } 139cb93a386Sopenharmony_ci 140cb93a386Sopenharmony_ci template <typename T> 141cb93a386Sopenharmony_ci SI T gather(const T* p, U32 ix) { return p[ix]; } 142cb93a386Sopenharmony_ci 143cb93a386Sopenharmony_ci SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) { 144cb93a386Sopenharmony_ci *r = ptr[0]; 145cb93a386Sopenharmony_ci *g = ptr[1]; 146cb93a386Sopenharmony_ci } 147cb93a386Sopenharmony_ci SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) { 148cb93a386Sopenharmony_ci ptr[0] = r; 149cb93a386Sopenharmony_ci ptr[1] = g; 150cb93a386Sopenharmony_ci } 151cb93a386Sopenharmony_ci SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { 152cb93a386Sopenharmony_ci *r = ptr[0]; 153cb93a386Sopenharmony_ci *g = ptr[1]; 154cb93a386Sopenharmony_ci *b = ptr[2]; 155cb93a386Sopenharmony_ci } 156cb93a386Sopenharmony_ci SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { 157cb93a386Sopenharmony_ci *r = ptr[0]; 158cb93a386Sopenharmony_ci *g = ptr[1]; 159cb93a386Sopenharmony_ci *b = ptr[2]; 160cb93a386Sopenharmony_ci *a = ptr[3]; 161cb93a386Sopenharmony_ci } 162cb93a386Sopenharmony_ci SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { 163cb93a386Sopenharmony_ci ptr[0] = r; 164cb93a386Sopenharmony_ci ptr[1] = g; 165cb93a386Sopenharmony_ci ptr[2] = b; 166cb93a386Sopenharmony_ci ptr[3] = a; 167cb93a386Sopenharmony_ci } 168cb93a386Sopenharmony_ci 169cb93a386Sopenharmony_ci SI void load2(const float* ptr, size_t tail, F* r, F* g) { 170cb93a386Sopenharmony_ci *r = ptr[0]; 171cb93a386Sopenharmony_ci *g = ptr[1]; 172cb93a386Sopenharmony_ci } 173cb93a386Sopenharmony_ci SI void store2(float* ptr, size_t tail, F r, F g) { 174cb93a386Sopenharmony_ci ptr[0] = r; 175cb93a386Sopenharmony_ci ptr[1] = g; 176cb93a386Sopenharmony_ci } 177cb93a386Sopenharmony_ci SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) { 178cb93a386Sopenharmony_ci *r = ptr[0]; 179cb93a386Sopenharmony_ci *g = ptr[1]; 180cb93a386Sopenharmony_ci *b = ptr[2]; 181cb93a386Sopenharmony_ci *a = ptr[3]; 182cb93a386Sopenharmony_ci } 183cb93a386Sopenharmony_ci SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { 184cb93a386Sopenharmony_ci ptr[0] = r; 185cb93a386Sopenharmony_ci ptr[1] = g; 186cb93a386Sopenharmony_ci ptr[2] = b; 187cb93a386Sopenharmony_ci ptr[3] = a; 188cb93a386Sopenharmony_ci } 189cb93a386Sopenharmony_ci 190cb93a386Sopenharmony_ci#elif defined(JUMPER_IS_NEON) 191cb93a386Sopenharmony_ci // Since we know we're using Clang, we can use its vector extensions. 192cb93a386Sopenharmony_ci template <typename T> using V = T __attribute__((ext_vector_type(4))); 193cb93a386Sopenharmony_ci using F = V<float >; 194cb93a386Sopenharmony_ci using I32 = V< int32_t>; 195cb93a386Sopenharmony_ci using U64 = V<uint64_t>; 196cb93a386Sopenharmony_ci using U32 = V<uint32_t>; 197cb93a386Sopenharmony_ci using U16 = V<uint16_t>; 198cb93a386Sopenharmony_ci using U8 = V<uint8_t >; 199cb93a386Sopenharmony_ci 200cb93a386Sopenharmony_ci // We polyfill a few routines that Clang doesn't build into ext_vector_types. 201cb93a386Sopenharmony_ci SI F min(F a, F b) { return vminq_f32(a,b); } 202cb93a386Sopenharmony_ci SI F max(F a, F b) { return vmaxq_f32(a,b); } 203cb93a386Sopenharmony_ci SI F abs_ (F v) { return vabsq_f32(v); } 204cb93a386Sopenharmony_ci SI F rcp_fast(F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e ) * e; } 205cb93a386Sopenharmony_ci SI F rcp_precise (F v) { auto e = rcp_fast(v); return vrecpsq_f32 (v,e ) * e; } 206cb93a386Sopenharmony_ci SI F rsqrt (F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; } 207cb93a386Sopenharmony_ci 208cb93a386Sopenharmony_ci SI U16 pack(U32 v) { return __builtin_convertvector(v, U16); } 209cb93a386Sopenharmony_ci SI U8 pack(U16 v) { return __builtin_convertvector(v, U8); } 210cb93a386Sopenharmony_ci 211cb93a386Sopenharmony_ci SI F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); } 212cb93a386Sopenharmony_ci 213cb93a386Sopenharmony_ci #if defined(SK_CPU_ARM64) 214cb93a386Sopenharmony_ci SI F mad(F f, F m, F a) { return vfmaq_f32(a,f,m); } 215cb93a386Sopenharmony_ci SI F floor_(F v) { return vrndmq_f32(v); } 216cb93a386Sopenharmony_ci SI F sqrt_(F v) { return vsqrtq_f32(v); } 217cb93a386Sopenharmony_ci SI U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); } 218cb93a386Sopenharmony_ci #else 219cb93a386Sopenharmony_ci SI F mad(F f, F m, F a) { return vmlaq_f32(a,f,m); } 220cb93a386Sopenharmony_ci SI F floor_(F v) { 221cb93a386Sopenharmony_ci F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v)); 222cb93a386Sopenharmony_ci return roundtrip - if_then_else(roundtrip > v, 1, 0); 223cb93a386Sopenharmony_ci } 224cb93a386Sopenharmony_ci 225cb93a386Sopenharmony_ci SI F sqrt_(F v) { 226cb93a386Sopenharmony_ci auto e = vrsqrteq_f32(v); // Estimate and two refinement steps for e = rsqrt(v). 227cb93a386Sopenharmony_ci e *= vrsqrtsq_f32(v,e*e); 228cb93a386Sopenharmony_ci e *= vrsqrtsq_f32(v,e*e); 229cb93a386Sopenharmony_ci return v*e; // sqrt(v) == v*rsqrt(v). 230cb93a386Sopenharmony_ci } 231cb93a386Sopenharmony_ci 232cb93a386Sopenharmony_ci SI U32 round(F v, F scale) { 233cb93a386Sopenharmony_ci return vcvtq_u32_f32(mad(v,scale,0.5f)); 234cb93a386Sopenharmony_ci } 235cb93a386Sopenharmony_ci #endif 236cb93a386Sopenharmony_ci 237cb93a386Sopenharmony_ci 238cb93a386Sopenharmony_ci template <typename T> 239cb93a386Sopenharmony_ci SI V<T> gather(const T* p, U32 ix) { 240cb93a386Sopenharmony_ci return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; 241cb93a386Sopenharmony_ci } 242cb93a386Sopenharmony_ci SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) { 243cb93a386Sopenharmony_ci uint16x4x2_t rg; 244cb93a386Sopenharmony_ci if (__builtin_expect(tail,0)) { 245cb93a386Sopenharmony_ci if ( true ) { rg = vld2_lane_u16(ptr + 0, rg, 0); } 246cb93a386Sopenharmony_ci if (tail > 1) { rg = vld2_lane_u16(ptr + 2, rg, 1); } 247cb93a386Sopenharmony_ci if (tail > 2) { rg = vld2_lane_u16(ptr + 4, rg, 2); } 248cb93a386Sopenharmony_ci } else { 249cb93a386Sopenharmony_ci rg = vld2_u16(ptr); 250cb93a386Sopenharmony_ci } 251cb93a386Sopenharmony_ci *r = rg.val[0]; 252cb93a386Sopenharmony_ci *g = rg.val[1]; 253cb93a386Sopenharmony_ci } 254cb93a386Sopenharmony_ci SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) { 255cb93a386Sopenharmony_ci if (__builtin_expect(tail,0)) { 256cb93a386Sopenharmony_ci if ( true ) { vst2_lane_u16(ptr + 0, (uint16x4x2_t{{r,g}}), 0); } 257cb93a386Sopenharmony_ci if (tail > 1) { vst2_lane_u16(ptr + 2, (uint16x4x2_t{{r,g}}), 1); } 258cb93a386Sopenharmony_ci if (tail > 2) { vst2_lane_u16(ptr + 4, (uint16x4x2_t{{r,g}}), 2); } 259cb93a386Sopenharmony_ci } else { 260cb93a386Sopenharmony_ci vst2_u16(ptr, (uint16x4x2_t{{r,g}})); 261cb93a386Sopenharmony_ci } 262cb93a386Sopenharmony_ci } 263cb93a386Sopenharmony_ci SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { 264cb93a386Sopenharmony_ci uint16x4x3_t rgb; 265cb93a386Sopenharmony_ci if (__builtin_expect(tail,0)) { 266cb93a386Sopenharmony_ci if ( true ) { rgb = vld3_lane_u16(ptr + 0, rgb, 0); } 267cb93a386Sopenharmony_ci if (tail > 1) { rgb = vld3_lane_u16(ptr + 3, rgb, 1); } 268cb93a386Sopenharmony_ci if (tail > 2) { rgb = vld3_lane_u16(ptr + 6, rgb, 2); } 269cb93a386Sopenharmony_ci } else { 270cb93a386Sopenharmony_ci rgb = vld3_u16(ptr); 271cb93a386Sopenharmony_ci } 272cb93a386Sopenharmony_ci *r = rgb.val[0]; 273cb93a386Sopenharmony_ci *g = rgb.val[1]; 274cb93a386Sopenharmony_ci *b = rgb.val[2]; 275cb93a386Sopenharmony_ci } 276cb93a386Sopenharmony_ci SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { 277cb93a386Sopenharmony_ci uint16x4x4_t rgba; 278cb93a386Sopenharmony_ci if (__builtin_expect(tail,0)) { 279cb93a386Sopenharmony_ci if ( true ) { rgba = vld4_lane_u16(ptr + 0, rgba, 0); } 280cb93a386Sopenharmony_ci if (tail > 1) { rgba = vld4_lane_u16(ptr + 4, rgba, 1); } 281cb93a386Sopenharmony_ci if (tail > 2) { rgba = vld4_lane_u16(ptr + 8, rgba, 2); } 282cb93a386Sopenharmony_ci } else { 283cb93a386Sopenharmony_ci rgba = vld4_u16(ptr); 284cb93a386Sopenharmony_ci } 285cb93a386Sopenharmony_ci *r = rgba.val[0]; 286cb93a386Sopenharmony_ci *g = rgba.val[1]; 287cb93a386Sopenharmony_ci *b = rgba.val[2]; 288cb93a386Sopenharmony_ci *a = rgba.val[3]; 289cb93a386Sopenharmony_ci } 290cb93a386Sopenharmony_ci 291cb93a386Sopenharmony_ci SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { 292cb93a386Sopenharmony_ci if (__builtin_expect(tail,0)) { 293cb93a386Sopenharmony_ci if ( true ) { vst4_lane_u16(ptr + 0, (uint16x4x4_t{{r,g,b,a}}), 0); } 294cb93a386Sopenharmony_ci if (tail > 1) { vst4_lane_u16(ptr + 4, (uint16x4x4_t{{r,g,b,a}}), 1); } 295cb93a386Sopenharmony_ci if (tail > 2) { vst4_lane_u16(ptr + 8, (uint16x4x4_t{{r,g,b,a}}), 2); } 296cb93a386Sopenharmony_ci } else { 297cb93a386Sopenharmony_ci vst4_u16(ptr, (uint16x4x4_t{{r,g,b,a}})); 298cb93a386Sopenharmony_ci } 299cb93a386Sopenharmony_ci } 300cb93a386Sopenharmony_ci SI void load2(const float* ptr, size_t tail, F* r, F* g) { 301cb93a386Sopenharmony_ci float32x4x2_t rg; 302cb93a386Sopenharmony_ci if (__builtin_expect(tail,0)) { 303cb93a386Sopenharmony_ci if ( true ) { rg = vld2q_lane_f32(ptr + 0, rg, 0); } 304cb93a386Sopenharmony_ci if (tail > 1) { rg = vld2q_lane_f32(ptr + 2, rg, 1); } 305cb93a386Sopenharmony_ci if (tail > 2) { rg = vld2q_lane_f32(ptr + 4, rg, 2); } 306cb93a386Sopenharmony_ci } else { 307cb93a386Sopenharmony_ci rg = vld2q_f32(ptr); 308cb93a386Sopenharmony_ci } 309cb93a386Sopenharmony_ci *r = rg.val[0]; 310cb93a386Sopenharmony_ci *g = rg.val[1]; 311cb93a386Sopenharmony_ci } 312cb93a386Sopenharmony_ci SI void store2(float* ptr, size_t tail, F r, F g) { 313cb93a386Sopenharmony_ci if (__builtin_expect(tail,0)) { 314cb93a386Sopenharmony_ci if ( true ) { vst2q_lane_f32(ptr + 0, (float32x4x2_t{{r,g}}), 0); } 315cb93a386Sopenharmony_ci if (tail > 1) { vst2q_lane_f32(ptr + 2, (float32x4x2_t{{r,g}}), 1); } 316cb93a386Sopenharmony_ci if (tail > 2) { vst2q_lane_f32(ptr + 4, (float32x4x2_t{{r,g}}), 2); } 317cb93a386Sopenharmony_ci } else { 318cb93a386Sopenharmony_ci vst2q_f32(ptr, (float32x4x2_t{{r,g}})); 319cb93a386Sopenharmony_ci } 320cb93a386Sopenharmony_ci } 321cb93a386Sopenharmony_ci SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) { 322cb93a386Sopenharmony_ci float32x4x4_t rgba; 323cb93a386Sopenharmony_ci if (__builtin_expect(tail,0)) { 324cb93a386Sopenharmony_ci if ( true ) { rgba = vld4q_lane_f32(ptr + 0, rgba, 0); } 325cb93a386Sopenharmony_ci if (tail > 1) { rgba = vld4q_lane_f32(ptr + 4, rgba, 1); } 326cb93a386Sopenharmony_ci if (tail > 2) { rgba = vld4q_lane_f32(ptr + 8, rgba, 2); } 327cb93a386Sopenharmony_ci } else { 328cb93a386Sopenharmony_ci rgba = vld4q_f32(ptr); 329cb93a386Sopenharmony_ci } 330cb93a386Sopenharmony_ci *r = rgba.val[0]; 331cb93a386Sopenharmony_ci *g = rgba.val[1]; 332cb93a386Sopenharmony_ci *b = rgba.val[2]; 333cb93a386Sopenharmony_ci *a = rgba.val[3]; 334cb93a386Sopenharmony_ci } 335cb93a386Sopenharmony_ci SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { 336cb93a386Sopenharmony_ci if (__builtin_expect(tail,0)) { 337cb93a386Sopenharmony_ci if ( true ) { vst4q_lane_f32(ptr + 0, (float32x4x4_t{{r,g,b,a}}), 0); } 338cb93a386Sopenharmony_ci if (tail > 1) { vst4q_lane_f32(ptr + 4, (float32x4x4_t{{r,g,b,a}}), 1); } 339cb93a386Sopenharmony_ci if (tail > 2) { vst4q_lane_f32(ptr + 8, (float32x4x4_t{{r,g,b,a}}), 2); } 340cb93a386Sopenharmony_ci } else { 341cb93a386Sopenharmony_ci vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}})); 342cb93a386Sopenharmony_ci } 343cb93a386Sopenharmony_ci } 344cb93a386Sopenharmony_ci 345cb93a386Sopenharmony_ci#elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) 346cb93a386Sopenharmony_ci // These are __m256 and __m256i, but friendlier and strongly-typed. 347cb93a386Sopenharmony_ci template <typename T> using V = T __attribute__((ext_vector_type(8))); 348cb93a386Sopenharmony_ci using F = V<float >; 349cb93a386Sopenharmony_ci using I32 = V< int32_t>; 350cb93a386Sopenharmony_ci using U64 = V<uint64_t>; 351cb93a386Sopenharmony_ci using U32 = V<uint32_t>; 352cb93a386Sopenharmony_ci using U16 = V<uint16_t>; 353cb93a386Sopenharmony_ci using U8 = V<uint8_t >; 354cb93a386Sopenharmony_ci 355cb93a386Sopenharmony_ci SI F mad(F f, F m, F a) { 356cb93a386Sopenharmony_ci #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) 357cb93a386Sopenharmony_ci return _mm256_fmadd_ps(f,m,a); 358cb93a386Sopenharmony_ci #else 359cb93a386Sopenharmony_ci return f*m+a; 360cb93a386Sopenharmony_ci #endif 361cb93a386Sopenharmony_ci } 362cb93a386Sopenharmony_ci 363cb93a386Sopenharmony_ci SI F min(F a, F b) { return _mm256_min_ps(a,b); } 364cb93a386Sopenharmony_ci SI F max(F a, F b) { return _mm256_max_ps(a,b); } 365cb93a386Sopenharmony_ci SI F abs_ (F v) { return _mm256_and_ps(v, 0-v); } 366cb93a386Sopenharmony_ci SI F floor_(F v) { return _mm256_floor_ps(v); } 367cb93a386Sopenharmony_ci SI F rcp_fast(F v) { return _mm256_rcp_ps (v); } 368cb93a386Sopenharmony_ci SI F rsqrt (F v) { return _mm256_rsqrt_ps(v); } 369cb93a386Sopenharmony_ci SI F sqrt_ (F v) { return _mm256_sqrt_ps (v); } 370cb93a386Sopenharmony_ci SI F rcp_precise (F v) { 371cb93a386Sopenharmony_ci F e = rcp_fast(v); 372cb93a386Sopenharmony_ci #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) 373cb93a386Sopenharmony_ci return _mm256_fnmadd_ps(v, e, _mm256_set1_ps(2.0f)) * e; 374cb93a386Sopenharmony_ci #else 375cb93a386Sopenharmony_ci return e * (2.0f - v * e); 376cb93a386Sopenharmony_ci #endif 377cb93a386Sopenharmony_ci } 378cb93a386Sopenharmony_ci 379cb93a386Sopenharmony_ci 380cb93a386Sopenharmony_ci SI U32 round (F v, F scale) { return _mm256_cvtps_epi32(v*scale); } 381cb93a386Sopenharmony_ci SI U16 pack(U32 v) { 382cb93a386Sopenharmony_ci return _mm_packus_epi32(_mm256_extractf128_si256(v, 0), 383cb93a386Sopenharmony_ci _mm256_extractf128_si256(v, 1)); 384cb93a386Sopenharmony_ci } 385cb93a386Sopenharmony_ci SI U8 pack(U16 v) { 386cb93a386Sopenharmony_ci auto r = _mm_packus_epi16(v,v); 387cb93a386Sopenharmony_ci return sk_unaligned_load<U8>(&r); 388cb93a386Sopenharmony_ci } 389cb93a386Sopenharmony_ci 390cb93a386Sopenharmony_ci SI F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); } 391cb93a386Sopenharmony_ci 392cb93a386Sopenharmony_ci template <typename T> 393cb93a386Sopenharmony_ci SI V<T> gather(const T* p, U32 ix) { 394cb93a386Sopenharmony_ci return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]], 395cb93a386Sopenharmony_ci p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], }; 396cb93a386Sopenharmony_ci } 397cb93a386Sopenharmony_ci #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) 398cb93a386Sopenharmony_ci SI F gather(const float* p, U32 ix) { return _mm256_i32gather_ps (p, ix, 4); } 399cb93a386Sopenharmony_ci SI U32 gather(const uint32_t* p, U32 ix) { return _mm256_i32gather_epi32(p, ix, 4); } 400cb93a386Sopenharmony_ci SI U64 gather(const uint64_t* p, U32 ix) { 401cb93a386Sopenharmony_ci __m256i parts[] = { 402cb93a386Sopenharmony_ci _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,0), 8), 403cb93a386Sopenharmony_ci _mm256_i32gather_epi64(p, _mm256_extracti128_si256(ix,1), 8), 404cb93a386Sopenharmony_ci }; 405cb93a386Sopenharmony_ci return sk_bit_cast<U64>(parts); 406cb93a386Sopenharmony_ci } 407cb93a386Sopenharmony_ci #endif 408cb93a386Sopenharmony_ci 409cb93a386Sopenharmony_ci SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) { 410cb93a386Sopenharmony_ci U16 _0123, _4567; 411cb93a386Sopenharmony_ci if (__builtin_expect(tail,0)) { 412cb93a386Sopenharmony_ci _0123 = _4567 = _mm_setzero_si128(); 413cb93a386Sopenharmony_ci auto* d = &_0123; 414cb93a386Sopenharmony_ci if (tail > 3) { 415cb93a386Sopenharmony_ci *d = _mm_loadu_si128(((__m128i*)ptr) + 0); 416cb93a386Sopenharmony_ci tail -= 4; 417cb93a386Sopenharmony_ci ptr += 8; 418cb93a386Sopenharmony_ci d = &_4567; 419cb93a386Sopenharmony_ci } 420cb93a386Sopenharmony_ci bool high = false; 421cb93a386Sopenharmony_ci if (tail > 1) { 422cb93a386Sopenharmony_ci *d = _mm_loadu_si64(ptr); 423cb93a386Sopenharmony_ci tail -= 2; 424cb93a386Sopenharmony_ci ptr += 4; 425cb93a386Sopenharmony_ci high = true; 426cb93a386Sopenharmony_ci } 427cb93a386Sopenharmony_ci if (tail > 0) { 428cb93a386Sopenharmony_ci (*d)[high ? 4 : 0] = *(ptr + 0); 429cb93a386Sopenharmony_ci (*d)[high ? 5 : 1] = *(ptr + 1); 430cb93a386Sopenharmony_ci } 431cb93a386Sopenharmony_ci } else { 432cb93a386Sopenharmony_ci _0123 = _mm_loadu_si128(((__m128i*)ptr) + 0); 433cb93a386Sopenharmony_ci _4567 = _mm_loadu_si128(((__m128i*)ptr) + 1); 434cb93a386Sopenharmony_ci } 435cb93a386Sopenharmony_ci *r = _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(_0123, 16), 16), 436cb93a386Sopenharmony_ci _mm_srai_epi32(_mm_slli_epi32(_4567, 16), 16)); 437cb93a386Sopenharmony_ci *g = _mm_packs_epi32(_mm_srai_epi32(_0123, 16), 438cb93a386Sopenharmony_ci _mm_srai_epi32(_4567, 16)); 439cb93a386Sopenharmony_ci } 440cb93a386Sopenharmony_ci SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) { 441cb93a386Sopenharmony_ci auto _0123 = _mm_unpacklo_epi16(r, g), 442cb93a386Sopenharmony_ci _4567 = _mm_unpackhi_epi16(r, g); 443cb93a386Sopenharmony_ci if (__builtin_expect(tail,0)) { 444cb93a386Sopenharmony_ci const auto* s = &_0123; 445cb93a386Sopenharmony_ci if (tail > 3) { 446cb93a386Sopenharmony_ci _mm_storeu_si128((__m128i*)ptr, *s); 447cb93a386Sopenharmony_ci s = &_4567; 448cb93a386Sopenharmony_ci tail -= 4; 449cb93a386Sopenharmony_ci ptr += 8; 450cb93a386Sopenharmony_ci } 451cb93a386Sopenharmony_ci bool high = false; 452cb93a386Sopenharmony_ci if (tail > 1) { 453cb93a386Sopenharmony_ci _mm_storel_epi64((__m128i*)ptr, *s); 454cb93a386Sopenharmony_ci ptr += 4; 455cb93a386Sopenharmony_ci tail -= 2; 456cb93a386Sopenharmony_ci high = true; 457cb93a386Sopenharmony_ci } 458cb93a386Sopenharmony_ci if (tail > 0) { 459cb93a386Sopenharmony_ci if (high) { 460cb93a386Sopenharmony_ci *(int32_t*)ptr = _mm_extract_epi32(*s, 2); 461cb93a386Sopenharmony_ci } else { 462cb93a386Sopenharmony_ci *(int32_t*)ptr = _mm_cvtsi128_si32(*s); 463cb93a386Sopenharmony_ci } 464cb93a386Sopenharmony_ci } 465cb93a386Sopenharmony_ci } else { 466cb93a386Sopenharmony_ci _mm_storeu_si128((__m128i*)ptr + 0, _0123); 467cb93a386Sopenharmony_ci _mm_storeu_si128((__m128i*)ptr + 1, _4567); 468cb93a386Sopenharmony_ci } 469cb93a386Sopenharmony_ci } 470cb93a386Sopenharmony_ci 471cb93a386Sopenharmony_ci SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { 472cb93a386Sopenharmony_ci __m128i _0,_1,_2,_3,_4,_5,_6,_7; 473cb93a386Sopenharmony_ci if (__builtin_expect(tail,0)) { 474cb93a386Sopenharmony_ci auto load_rgb = [](const uint16_t* src) { 475cb93a386Sopenharmony_ci auto v = _mm_cvtsi32_si128(*(const uint32_t*)src); 476cb93a386Sopenharmony_ci return _mm_insert_epi16(v, src[2], 2); 477cb93a386Sopenharmony_ci }; 478cb93a386Sopenharmony_ci _1 = _2 = _3 = _4 = _5 = _6 = _7 = _mm_setzero_si128(); 479cb93a386Sopenharmony_ci if ( true ) { _0 = load_rgb(ptr + 0); } 480cb93a386Sopenharmony_ci if (tail > 1) { _1 = load_rgb(ptr + 3); } 481cb93a386Sopenharmony_ci if (tail > 2) { _2 = load_rgb(ptr + 6); } 482cb93a386Sopenharmony_ci if (tail > 3) { _3 = load_rgb(ptr + 9); } 483cb93a386Sopenharmony_ci if (tail > 4) { _4 = load_rgb(ptr + 12); } 484cb93a386Sopenharmony_ci if (tail > 5) { _5 = load_rgb(ptr + 15); } 485cb93a386Sopenharmony_ci if (tail > 6) { _6 = load_rgb(ptr + 18); } 486cb93a386Sopenharmony_ci } else { 487cb93a386Sopenharmony_ci // Load 0+1, 2+3, 4+5 normally, and 6+7 backed up 4 bytes so we don't run over. 488cb93a386Sopenharmony_ci auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) ; 489cb93a386Sopenharmony_ci auto _23 = _mm_loadu_si128((const __m128i*)(ptr + 6)) ; 490cb93a386Sopenharmony_ci auto _45 = _mm_loadu_si128((const __m128i*)(ptr + 12)) ; 491cb93a386Sopenharmony_ci auto _67 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 16)), 4); 492cb93a386Sopenharmony_ci _0 = _01; _1 = _mm_srli_si128(_01, 6); 493cb93a386Sopenharmony_ci _2 = _23; _3 = _mm_srli_si128(_23, 6); 494cb93a386Sopenharmony_ci _4 = _45; _5 = _mm_srli_si128(_45, 6); 495cb93a386Sopenharmony_ci _6 = _67; _7 = _mm_srli_si128(_67, 6); 496cb93a386Sopenharmony_ci } 497cb93a386Sopenharmony_ci 498cb93a386Sopenharmony_ci auto _02 = _mm_unpacklo_epi16(_0, _2), // r0 r2 g0 g2 b0 b2 xx xx 499cb93a386Sopenharmony_ci _13 = _mm_unpacklo_epi16(_1, _3), 500cb93a386Sopenharmony_ci _46 = _mm_unpacklo_epi16(_4, _6), 501cb93a386Sopenharmony_ci _57 = _mm_unpacklo_epi16(_5, _7); 502cb93a386Sopenharmony_ci 503cb93a386Sopenharmony_ci auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 504cb93a386Sopenharmony_ci bx0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 xx xx xx xx 505cb93a386Sopenharmony_ci rg4567 = _mm_unpacklo_epi16(_46, _57), 506cb93a386Sopenharmony_ci bx4567 = _mm_unpackhi_epi16(_46, _57); 507cb93a386Sopenharmony_ci 508cb93a386Sopenharmony_ci *r = _mm_unpacklo_epi64(rg0123, rg4567); 509cb93a386Sopenharmony_ci *g = _mm_unpackhi_epi64(rg0123, rg4567); 510cb93a386Sopenharmony_ci *b = _mm_unpacklo_epi64(bx0123, bx4567); 511cb93a386Sopenharmony_ci } 512cb93a386Sopenharmony_ci SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { 513cb93a386Sopenharmony_ci __m128i _01, _23, _45, _67; 514cb93a386Sopenharmony_ci if (__builtin_expect(tail,0)) { 515cb93a386Sopenharmony_ci auto src = (const double*)ptr; 516cb93a386Sopenharmony_ci _01 = _23 = _45 = _67 = _mm_setzero_si128(); 517cb93a386Sopenharmony_ci if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); } 518cb93a386Sopenharmony_ci if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); } 519cb93a386Sopenharmony_ci if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); } 520cb93a386Sopenharmony_ci if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); } 521cb93a386Sopenharmony_ci if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); } 522cb93a386Sopenharmony_ci if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); } 523cb93a386Sopenharmony_ci if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); } 524cb93a386Sopenharmony_ci } else { 525cb93a386Sopenharmony_ci _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); 526cb93a386Sopenharmony_ci _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); 527cb93a386Sopenharmony_ci _45 = _mm_loadu_si128(((__m128i*)ptr) + 2); 528cb93a386Sopenharmony_ci _67 = _mm_loadu_si128(((__m128i*)ptr) + 3); 529cb93a386Sopenharmony_ci } 530cb93a386Sopenharmony_ci 531cb93a386Sopenharmony_ci auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2 532cb93a386Sopenharmony_ci _13 = _mm_unpackhi_epi16(_01, _23), // r1 r3 g1 g3 b1 b3 a1 a3 533cb93a386Sopenharmony_ci _46 = _mm_unpacklo_epi16(_45, _67), 534cb93a386Sopenharmony_ci _57 = _mm_unpackhi_epi16(_45, _67); 535cb93a386Sopenharmony_ci 536cb93a386Sopenharmony_ci auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 537cb93a386Sopenharmony_ci ba0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 a0 a1 a2 a3 538cb93a386Sopenharmony_ci rg4567 = _mm_unpacklo_epi16(_46, _57), 539cb93a386Sopenharmony_ci ba4567 = _mm_unpackhi_epi16(_46, _57); 540cb93a386Sopenharmony_ci 541cb93a386Sopenharmony_ci *r = _mm_unpacklo_epi64(rg0123, rg4567); 542cb93a386Sopenharmony_ci *g = _mm_unpackhi_epi64(rg0123, rg4567); 543cb93a386Sopenharmony_ci *b = _mm_unpacklo_epi64(ba0123, ba4567); 544cb93a386Sopenharmony_ci *a = _mm_unpackhi_epi64(ba0123, ba4567); 545cb93a386Sopenharmony_ci } 546cb93a386Sopenharmony_ci SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { 547cb93a386Sopenharmony_ci auto rg0123 = _mm_unpacklo_epi16(r, g), // r0 g0 r1 g1 r2 g2 r3 g3 548cb93a386Sopenharmony_ci rg4567 = _mm_unpackhi_epi16(r, g), // r4 g4 r5 g5 r6 g6 r7 g7 549cb93a386Sopenharmony_ci ba0123 = _mm_unpacklo_epi16(b, a), 550cb93a386Sopenharmony_ci ba4567 = _mm_unpackhi_epi16(b, a); 551cb93a386Sopenharmony_ci 552cb93a386Sopenharmony_ci auto _01 = _mm_unpacklo_epi32(rg0123, ba0123), 553cb93a386Sopenharmony_ci _23 = _mm_unpackhi_epi32(rg0123, ba0123), 554cb93a386Sopenharmony_ci _45 = _mm_unpacklo_epi32(rg4567, ba4567), 555cb93a386Sopenharmony_ci _67 = _mm_unpackhi_epi32(rg4567, ba4567); 556cb93a386Sopenharmony_ci 557cb93a386Sopenharmony_ci if (__builtin_expect(tail,0)) { 558cb93a386Sopenharmony_ci auto dst = (double*)ptr; 559cb93a386Sopenharmony_ci if (tail > 0) { _mm_storel_pd(dst+0, _01); } 560cb93a386Sopenharmony_ci if (tail > 1) { _mm_storeh_pd(dst+1, _01); } 561cb93a386Sopenharmony_ci if (tail > 2) { _mm_storel_pd(dst+2, _23); } 562cb93a386Sopenharmony_ci if (tail > 3) { _mm_storeh_pd(dst+3, _23); } 563cb93a386Sopenharmony_ci if (tail > 4) { _mm_storel_pd(dst+4, _45); } 564cb93a386Sopenharmony_ci if (tail > 5) { _mm_storeh_pd(dst+5, _45); } 565cb93a386Sopenharmony_ci if (tail > 6) { _mm_storel_pd(dst+6, _67); } 566cb93a386Sopenharmony_ci } else { 567cb93a386Sopenharmony_ci _mm_storeu_si128((__m128i*)ptr + 0, _01); 568cb93a386Sopenharmony_ci _mm_storeu_si128((__m128i*)ptr + 1, _23); 569cb93a386Sopenharmony_ci _mm_storeu_si128((__m128i*)ptr + 2, _45); 570cb93a386Sopenharmony_ci _mm_storeu_si128((__m128i*)ptr + 3, _67); 571cb93a386Sopenharmony_ci } 572cb93a386Sopenharmony_ci } 573cb93a386Sopenharmony_ci 574cb93a386Sopenharmony_ci SI void load2(const float* ptr, size_t tail, F* r, F* g) { 575cb93a386Sopenharmony_ci F _0123, _4567; 576cb93a386Sopenharmony_ci if (__builtin_expect(tail, 0)) { 577cb93a386Sopenharmony_ci _0123 = _4567 = _mm256_setzero_ps(); 578cb93a386Sopenharmony_ci F* d = &_0123; 579cb93a386Sopenharmony_ci if (tail > 3) { 580cb93a386Sopenharmony_ci *d = _mm256_loadu_ps(ptr); 581cb93a386Sopenharmony_ci ptr += 8; 582cb93a386Sopenharmony_ci tail -= 4; 583cb93a386Sopenharmony_ci d = &_4567; 584cb93a386Sopenharmony_ci } 585cb93a386Sopenharmony_ci bool high = false; 586cb93a386Sopenharmony_ci if (tail > 1) { 587cb93a386Sopenharmony_ci *d = _mm256_castps128_ps256(_mm_loadu_ps(ptr)); 588cb93a386Sopenharmony_ci ptr += 4; 589cb93a386Sopenharmony_ci tail -= 2; 590cb93a386Sopenharmony_ci high = true; 591cb93a386Sopenharmony_ci } 592cb93a386Sopenharmony_ci if (tail > 0) { 593cb93a386Sopenharmony_ci *d = high ? _mm256_insertf128_ps(*d, _mm_loadu_si64(ptr), 1) 594cb93a386Sopenharmony_ci : _mm256_insertf128_ps(*d, _mm_loadu_si64(ptr), 0); 595cb93a386Sopenharmony_ci } 596cb93a386Sopenharmony_ci } else { 597cb93a386Sopenharmony_ci _0123 = _mm256_loadu_ps(ptr + 0); 598cb93a386Sopenharmony_ci _4567 = _mm256_loadu_ps(ptr + 8); 599cb93a386Sopenharmony_ci } 600cb93a386Sopenharmony_ci 601cb93a386Sopenharmony_ci F _0145 = _mm256_permute2f128_pd(_0123, _4567, 0x20), 602cb93a386Sopenharmony_ci _2367 = _mm256_permute2f128_pd(_0123, _4567, 0x31); 603cb93a386Sopenharmony_ci 604cb93a386Sopenharmony_ci *r = _mm256_shuffle_ps(_0145, _2367, 0x88); 605cb93a386Sopenharmony_ci *g = _mm256_shuffle_ps(_0145, _2367, 0xDD); 606cb93a386Sopenharmony_ci } 607cb93a386Sopenharmony_ci SI void store2(float* ptr, size_t tail, F r, F g) { 608cb93a386Sopenharmony_ci F _0145 = _mm256_unpacklo_ps(r, g), 609cb93a386Sopenharmony_ci _2367 = _mm256_unpackhi_ps(r, g); 610cb93a386Sopenharmony_ci F _0123 = _mm256_permute2f128_pd(_0145, _2367, 0x20), 611cb93a386Sopenharmony_ci _4567 = _mm256_permute2f128_pd(_0145, _2367, 0x31); 612cb93a386Sopenharmony_ci 613cb93a386Sopenharmony_ci if (__builtin_expect(tail, 0)) { 614cb93a386Sopenharmony_ci const __m256* s = &_0123; 615cb93a386Sopenharmony_ci if (tail > 3) { 616cb93a386Sopenharmony_ci _mm256_storeu_ps(ptr, *s); 617cb93a386Sopenharmony_ci s = &_4567; 618cb93a386Sopenharmony_ci tail -= 4; 619cb93a386Sopenharmony_ci ptr += 8; 620cb93a386Sopenharmony_ci } 621cb93a386Sopenharmony_ci bool high = false; 622cb93a386Sopenharmony_ci if (tail > 1) { 623cb93a386Sopenharmony_ci _mm_storeu_ps(ptr, _mm256_extractf128_ps(*s, 0)); 624cb93a386Sopenharmony_ci ptr += 4; 625cb93a386Sopenharmony_ci tail -= 2; 626cb93a386Sopenharmony_ci high = true; 627cb93a386Sopenharmony_ci } 628cb93a386Sopenharmony_ci if (tail > 0) { 629cb93a386Sopenharmony_ci *(ptr + 0) = (*s)[ high ? 4 : 0]; 630cb93a386Sopenharmony_ci *(ptr + 1) = (*s)[ high ? 5 : 1]; 631cb93a386Sopenharmony_ci } 632cb93a386Sopenharmony_ci } else { 633cb93a386Sopenharmony_ci _mm256_storeu_ps(ptr + 0, _0123); 634cb93a386Sopenharmony_ci _mm256_storeu_ps(ptr + 8, _4567); 635cb93a386Sopenharmony_ci } 636cb93a386Sopenharmony_ci } 637cb93a386Sopenharmony_ci 638cb93a386Sopenharmony_ci SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) { 639cb93a386Sopenharmony_ci F _04, _15, _26, _37; 640cb93a386Sopenharmony_ci _04 = _15 = _26 = _37 = 0; 641cb93a386Sopenharmony_ci switch (tail) { 642cb93a386Sopenharmony_ci case 0: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+28), 1); [[fallthrough]]; 643cb93a386Sopenharmony_ci case 7: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+24), 1); [[fallthrough]]; 644cb93a386Sopenharmony_ci case 6: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+20), 1); [[fallthrough]]; 645cb93a386Sopenharmony_ci case 5: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+16), 1); [[fallthrough]]; 646cb93a386Sopenharmony_ci case 4: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+12), 0); [[fallthrough]]; 647cb93a386Sopenharmony_ci case 3: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+ 8), 0); [[fallthrough]]; 648cb93a386Sopenharmony_ci case 2: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+ 4), 0); [[fallthrough]]; 649cb93a386Sopenharmony_ci case 1: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+ 0), 0); 650cb93a386Sopenharmony_ci } 651cb93a386Sopenharmony_ci 652cb93a386Sopenharmony_ci F rg0145 = _mm256_unpacklo_ps(_04,_15), // r0 r1 g0 g1 | r4 r5 g4 g5 653cb93a386Sopenharmony_ci ba0145 = _mm256_unpackhi_ps(_04,_15), 654cb93a386Sopenharmony_ci rg2367 = _mm256_unpacklo_ps(_26,_37), 655cb93a386Sopenharmony_ci ba2367 = _mm256_unpackhi_ps(_26,_37); 656cb93a386Sopenharmony_ci 657cb93a386Sopenharmony_ci *r = _mm256_unpacklo_pd(rg0145, rg2367); 658cb93a386Sopenharmony_ci *g = _mm256_unpackhi_pd(rg0145, rg2367); 659cb93a386Sopenharmony_ci *b = _mm256_unpacklo_pd(ba0145, ba2367); 660cb93a386Sopenharmony_ci *a = _mm256_unpackhi_pd(ba0145, ba2367); 661cb93a386Sopenharmony_ci } 662cb93a386Sopenharmony_ci SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { 663cb93a386Sopenharmony_ci F rg0145 = _mm256_unpacklo_ps(r, g), // r0 g0 r1 g1 | r4 g4 r5 g5 664cb93a386Sopenharmony_ci rg2367 = _mm256_unpackhi_ps(r, g), // r2 ... | r6 ... 665cb93a386Sopenharmony_ci ba0145 = _mm256_unpacklo_ps(b, a), // b0 a0 b1 a1 | b4 a4 b5 a5 666cb93a386Sopenharmony_ci ba2367 = _mm256_unpackhi_ps(b, a); // b2 ... | b6 ... 667cb93a386Sopenharmony_ci 668cb93a386Sopenharmony_ci F _04 = _mm256_unpacklo_pd(rg0145, ba0145), // r0 g0 b0 a0 | r4 g4 b4 a4 669cb93a386Sopenharmony_ci _15 = _mm256_unpackhi_pd(rg0145, ba0145), // r1 ... | r5 ... 670cb93a386Sopenharmony_ci _26 = _mm256_unpacklo_pd(rg2367, ba2367), // r2 ... | r6 ... 671cb93a386Sopenharmony_ci _37 = _mm256_unpackhi_pd(rg2367, ba2367); // r3 ... | r7 ... 672cb93a386Sopenharmony_ci 673cb93a386Sopenharmony_ci if (__builtin_expect(tail, 0)) { 674cb93a386Sopenharmony_ci if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); } 675cb93a386Sopenharmony_ci if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); } 676cb93a386Sopenharmony_ci if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); } 677cb93a386Sopenharmony_ci if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); } 678cb93a386Sopenharmony_ci if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); } 679cb93a386Sopenharmony_ci if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); } 680cb93a386Sopenharmony_ci if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); } 681cb93a386Sopenharmony_ci } else { 682cb93a386Sopenharmony_ci F _01 = _mm256_permute2f128_ps(_04, _15, 32), // 32 == 0010 0000 == lo, lo 683cb93a386Sopenharmony_ci _23 = _mm256_permute2f128_ps(_26, _37, 32), 684cb93a386Sopenharmony_ci _45 = _mm256_permute2f128_ps(_04, _15, 49), // 49 == 0011 0001 == hi, hi 685cb93a386Sopenharmony_ci _67 = _mm256_permute2f128_ps(_26, _37, 49); 686cb93a386Sopenharmony_ci _mm256_storeu_ps(ptr+ 0, _01); 687cb93a386Sopenharmony_ci _mm256_storeu_ps(ptr+ 8, _23); 688cb93a386Sopenharmony_ci _mm256_storeu_ps(ptr+16, _45); 689cb93a386Sopenharmony_ci _mm256_storeu_ps(ptr+24, _67); 690cb93a386Sopenharmony_ci } 691cb93a386Sopenharmony_ci } 692cb93a386Sopenharmony_ci 693cb93a386Sopenharmony_ci#elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX) 694cb93a386Sopenharmony_citemplate <typename T> using V = T __attribute__((ext_vector_type(4))); 695cb93a386Sopenharmony_ci using F = V<float >; 696cb93a386Sopenharmony_ci using I32 = V< int32_t>; 697cb93a386Sopenharmony_ci using U64 = V<uint64_t>; 698cb93a386Sopenharmony_ci using U32 = V<uint32_t>; 699cb93a386Sopenharmony_ci using U16 = V<uint16_t>; 700cb93a386Sopenharmony_ci using U8 = V<uint8_t >; 701cb93a386Sopenharmony_ci 702cb93a386Sopenharmony_ci SI F mad(F f, F m, F a) { return f*m+a; } 703cb93a386Sopenharmony_ci SI F min(F a, F b) { return _mm_min_ps(a,b); } 704cb93a386Sopenharmony_ci SI F max(F a, F b) { return _mm_max_ps(a,b); } 705cb93a386Sopenharmony_ci SI F abs_(F v) { return _mm_and_ps(v, 0-v); } 706cb93a386Sopenharmony_ci SI F rcp_fast(F v) { return _mm_rcp_ps (v); } 707cb93a386Sopenharmony_ci SI F rcp_precise (F v) { F e = rcp_fast(v); return e * (2.0f - v * e); } 708cb93a386Sopenharmony_ci SI F rsqrt (F v) { return _mm_rsqrt_ps(v); } 709cb93a386Sopenharmony_ci SI F sqrt_(F v) { return _mm_sqrt_ps (v); } 710cb93a386Sopenharmony_ci 711cb93a386Sopenharmony_ci SI U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); } 712cb93a386Sopenharmony_ci 713cb93a386Sopenharmony_ci SI U16 pack(U32 v) { 714cb93a386Sopenharmony_ci #if defined(JUMPER_IS_SSE41) 715cb93a386Sopenharmony_ci auto p = _mm_packus_epi32(v,v); 716cb93a386Sopenharmony_ci #else 717cb93a386Sopenharmony_ci // Sign extend so that _mm_packs_epi32() does the pack we want. 718cb93a386Sopenharmony_ci auto p = _mm_srai_epi32(_mm_slli_epi32(v, 16), 16); 719cb93a386Sopenharmony_ci p = _mm_packs_epi32(p,p); 720cb93a386Sopenharmony_ci #endif 721cb93a386Sopenharmony_ci return sk_unaligned_load<U16>(&p); // We have two copies. Return (the lower) one. 722cb93a386Sopenharmony_ci } 723cb93a386Sopenharmony_ci SI U8 pack(U16 v) { 724cb93a386Sopenharmony_ci auto r = widen_cast<__m128i>(v); 725cb93a386Sopenharmony_ci r = _mm_packus_epi16(r,r); 726cb93a386Sopenharmony_ci return sk_unaligned_load<U8>(&r); 727cb93a386Sopenharmony_ci } 728cb93a386Sopenharmony_ci 729cb93a386Sopenharmony_ci SI F if_then_else(I32 c, F t, F e) { 730cb93a386Sopenharmony_ci return _mm_or_ps(_mm_and_ps(c, t), _mm_andnot_ps(c, e)); 731cb93a386Sopenharmony_ci } 732cb93a386Sopenharmony_ci 733cb93a386Sopenharmony_ci SI F floor_(F v) { 734cb93a386Sopenharmony_ci #if defined(JUMPER_IS_SSE41) 735cb93a386Sopenharmony_ci return _mm_floor_ps(v); 736cb93a386Sopenharmony_ci #else 737cb93a386Sopenharmony_ci F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v)); 738cb93a386Sopenharmony_ci return roundtrip - if_then_else(roundtrip > v, 1, 0); 739cb93a386Sopenharmony_ci #endif 740cb93a386Sopenharmony_ci } 741cb93a386Sopenharmony_ci 742cb93a386Sopenharmony_ci template <typename T> 743cb93a386Sopenharmony_ci SI V<T> gather(const T* p, U32 ix) { 744cb93a386Sopenharmony_ci return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; 745cb93a386Sopenharmony_ci } 746cb93a386Sopenharmony_ci 747cb93a386Sopenharmony_ci SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) { 748cb93a386Sopenharmony_ci __m128i _01; 749cb93a386Sopenharmony_ci if (__builtin_expect(tail,0)) { 750cb93a386Sopenharmony_ci _01 = _mm_setzero_si128(); 751cb93a386Sopenharmony_ci if (tail > 1) { 752cb93a386Sopenharmony_ci _01 = _mm_loadl_pd(_01, (const double*)ptr); // r0 g0 r1 g1 00 00 00 00 753cb93a386Sopenharmony_ci if (tail > 2) { 754cb93a386Sopenharmony_ci _01 = _mm_insert_epi16(_01, *(ptr+4), 4); // r0 g0 r1 g1 r2 00 00 00 755cb93a386Sopenharmony_ci _01 = _mm_insert_epi16(_01, *(ptr+5), 5); // r0 g0 r1 g1 r2 g2 00 00 756cb93a386Sopenharmony_ci } 757cb93a386Sopenharmony_ci } else { 758cb93a386Sopenharmony_ci _01 = _mm_cvtsi32_si128(*(const uint32_t*)ptr); // r0 g0 00 00 00 00 00 00 759cb93a386Sopenharmony_ci } 760cb93a386Sopenharmony_ci } else { 761cb93a386Sopenharmony_ci _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); // r0 g0 r1 g1 r2 g2 r3 g3 762cb93a386Sopenharmony_ci } 763cb93a386Sopenharmony_ci auto rg01_23 = _mm_shufflelo_epi16(_01, 0xD8); // r0 r1 g0 g1 r2 g2 r3 g3 764cb93a386Sopenharmony_ci auto rg = _mm_shufflehi_epi16(rg01_23, 0xD8); // r0 r1 g0 g1 r2 r3 g2 g3 765cb93a386Sopenharmony_ci 766cb93a386Sopenharmony_ci auto R = _mm_shuffle_epi32(rg, 0x88); // r0 r1 r2 r3 r0 r1 r2 r3 767cb93a386Sopenharmony_ci auto G = _mm_shuffle_epi32(rg, 0xDD); // g0 g1 g2 g3 g0 g1 g2 g3 768cb93a386Sopenharmony_ci *r = sk_unaligned_load<U16>(&R); 769cb93a386Sopenharmony_ci *g = sk_unaligned_load<U16>(&G); 770cb93a386Sopenharmony_ci } 771cb93a386Sopenharmony_ci SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) { 772cb93a386Sopenharmony_ci U32 rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)); 773cb93a386Sopenharmony_ci if (__builtin_expect(tail, 0)) { 774cb93a386Sopenharmony_ci if (tail > 1) { 775cb93a386Sopenharmony_ci _mm_storel_epi64((__m128i*)ptr, rg); 776cb93a386Sopenharmony_ci if (tail > 2) { 777cb93a386Sopenharmony_ci int32_t rgpair = rg[2]; 778cb93a386Sopenharmony_ci memcpy(ptr + 4, &rgpair, sizeof(rgpair)); 779cb93a386Sopenharmony_ci } 780cb93a386Sopenharmony_ci } else { 781cb93a386Sopenharmony_ci int32_t rgpair = rg[0]; 782cb93a386Sopenharmony_ci memcpy(ptr, &rgpair, sizeof(rgpair)); 783cb93a386Sopenharmony_ci } 784cb93a386Sopenharmony_ci } else { 785cb93a386Sopenharmony_ci _mm_storeu_si128((__m128i*)ptr + 0, rg); 786cb93a386Sopenharmony_ci } 787cb93a386Sopenharmony_ci } 788cb93a386Sopenharmony_ci 789cb93a386Sopenharmony_ci SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { 790cb93a386Sopenharmony_ci __m128i _0, _1, _2, _3; 791cb93a386Sopenharmony_ci if (__builtin_expect(tail,0)) { 792cb93a386Sopenharmony_ci _1 = _2 = _3 = _mm_setzero_si128(); 793cb93a386Sopenharmony_ci auto load_rgb = [](const uint16_t* src) { 794cb93a386Sopenharmony_ci auto v = _mm_cvtsi32_si128(*(const uint32_t*)src); 795cb93a386Sopenharmony_ci return _mm_insert_epi16(v, src[2], 2); 796cb93a386Sopenharmony_ci }; 797cb93a386Sopenharmony_ci if ( true ) { _0 = load_rgb(ptr + 0); } 798cb93a386Sopenharmony_ci if (tail > 1) { _1 = load_rgb(ptr + 3); } 799cb93a386Sopenharmony_ci if (tail > 2) { _2 = load_rgb(ptr + 6); } 800cb93a386Sopenharmony_ci } else { 801cb93a386Sopenharmony_ci // Load slightly weirdly to make sure we don't load past the end of 4x48 bits. 802cb93a386Sopenharmony_ci auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) , 803cb93a386Sopenharmony_ci _23 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 4)), 4); 804cb93a386Sopenharmony_ci 805cb93a386Sopenharmony_ci // Each _N holds R,G,B for pixel N in its lower 3 lanes (upper 5 are ignored). 806cb93a386Sopenharmony_ci _0 = _01; 807cb93a386Sopenharmony_ci _1 = _mm_srli_si128(_01, 6); 808cb93a386Sopenharmony_ci _2 = _23; 809cb93a386Sopenharmony_ci _3 = _mm_srli_si128(_23, 6); 810cb93a386Sopenharmony_ci } 811cb93a386Sopenharmony_ci 812cb93a386Sopenharmony_ci // De-interlace to R,G,B. 813cb93a386Sopenharmony_ci auto _02 = _mm_unpacklo_epi16(_0, _2), // r0 r2 g0 g2 b0 b2 xx xx 814cb93a386Sopenharmony_ci _13 = _mm_unpacklo_epi16(_1, _3); // r1 r3 g1 g3 b1 b3 xx xx 815cb93a386Sopenharmony_ci 816cb93a386Sopenharmony_ci auto R = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 817cb93a386Sopenharmony_ci G = _mm_srli_si128(R, 8), 818cb93a386Sopenharmony_ci B = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 xx xx xx xx 819cb93a386Sopenharmony_ci 820cb93a386Sopenharmony_ci *r = sk_unaligned_load<U16>(&R); 821cb93a386Sopenharmony_ci *g = sk_unaligned_load<U16>(&G); 822cb93a386Sopenharmony_ci *b = sk_unaligned_load<U16>(&B); 823cb93a386Sopenharmony_ci } 824cb93a386Sopenharmony_ci 825cb93a386Sopenharmony_ci SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { 826cb93a386Sopenharmony_ci __m128i _01, _23; 827cb93a386Sopenharmony_ci if (__builtin_expect(tail,0)) { 828cb93a386Sopenharmony_ci _01 = _23 = _mm_setzero_si128(); 829cb93a386Sopenharmony_ci auto src = (const double*)ptr; 830cb93a386Sopenharmony_ci if ( true ) { _01 = _mm_loadl_pd(_01, src + 0); } // r0 g0 b0 a0 00 00 00 00 831cb93a386Sopenharmony_ci if (tail > 1) { _01 = _mm_loadh_pd(_01, src + 1); } // r0 g0 b0 a0 r1 g1 b1 a1 832cb93a386Sopenharmony_ci if (tail > 2) { _23 = _mm_loadl_pd(_23, src + 2); } // r2 g2 b2 a2 00 00 00 00 833cb93a386Sopenharmony_ci } else { 834cb93a386Sopenharmony_ci _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); // r0 g0 b0 a0 r1 g1 b1 a1 835cb93a386Sopenharmony_ci _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); // r2 g2 b2 a2 r3 g3 b3 a3 836cb93a386Sopenharmony_ci } 837cb93a386Sopenharmony_ci 838cb93a386Sopenharmony_ci auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2 839cb93a386Sopenharmony_ci _13 = _mm_unpackhi_epi16(_01, _23); // r1 r3 g1 g3 b1 b3 a1 a3 840cb93a386Sopenharmony_ci 841cb93a386Sopenharmony_ci auto rg = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3 842cb93a386Sopenharmony_ci ba = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 a0 a1 a2 a3 843cb93a386Sopenharmony_ci 844cb93a386Sopenharmony_ci *r = sk_unaligned_load<U16>((uint16_t*)&rg + 0); 845cb93a386Sopenharmony_ci *g = sk_unaligned_load<U16>((uint16_t*)&rg + 4); 846cb93a386Sopenharmony_ci *b = sk_unaligned_load<U16>((uint16_t*)&ba + 0); 847cb93a386Sopenharmony_ci *a = sk_unaligned_load<U16>((uint16_t*)&ba + 4); 848cb93a386Sopenharmony_ci } 849cb93a386Sopenharmony_ci 850cb93a386Sopenharmony_ci SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { 851cb93a386Sopenharmony_ci auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)), 852cb93a386Sopenharmony_ci ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a)); 853cb93a386Sopenharmony_ci 854cb93a386Sopenharmony_ci if (__builtin_expect(tail, 0)) { 855cb93a386Sopenharmony_ci auto dst = (double*)ptr; 856cb93a386Sopenharmony_ci if ( true ) { _mm_storel_pd(dst + 0, _mm_unpacklo_epi32(rg, ba)); } 857cb93a386Sopenharmony_ci if (tail > 1) { _mm_storeh_pd(dst + 1, _mm_unpacklo_epi32(rg, ba)); } 858cb93a386Sopenharmony_ci if (tail > 2) { _mm_storel_pd(dst + 2, _mm_unpackhi_epi32(rg, ba)); } 859cb93a386Sopenharmony_ci } else { 860cb93a386Sopenharmony_ci _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba)); 861cb93a386Sopenharmony_ci _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba)); 862cb93a386Sopenharmony_ci } 863cb93a386Sopenharmony_ci } 864cb93a386Sopenharmony_ci 865cb93a386Sopenharmony_ci SI void load2(const float* ptr, size_t tail, F* r, F* g) { 866cb93a386Sopenharmony_ci F _01, _23; 867cb93a386Sopenharmony_ci if (__builtin_expect(tail, 0)) { 868cb93a386Sopenharmony_ci _01 = _23 = _mm_setzero_si128(); 869cb93a386Sopenharmony_ci if ( true ) { _01 = _mm_loadl_pi(_01, (__m64 const*)(ptr + 0)); } 870cb93a386Sopenharmony_ci if (tail > 1) { _01 = _mm_loadh_pi(_01, (__m64 const*)(ptr + 2)); } 871cb93a386Sopenharmony_ci if (tail > 2) { _23 = _mm_loadl_pi(_23, (__m64 const*)(ptr + 4)); } 872cb93a386Sopenharmony_ci } else { 873cb93a386Sopenharmony_ci _01 = _mm_loadu_ps(ptr + 0); 874cb93a386Sopenharmony_ci _23 = _mm_loadu_ps(ptr + 4); 875cb93a386Sopenharmony_ci } 876cb93a386Sopenharmony_ci *r = _mm_shuffle_ps(_01, _23, 0x88); 877cb93a386Sopenharmony_ci *g = _mm_shuffle_ps(_01, _23, 0xDD); 878cb93a386Sopenharmony_ci } 879cb93a386Sopenharmony_ci SI void store2(float* ptr, size_t tail, F r, F g) { 880cb93a386Sopenharmony_ci F _01 = _mm_unpacklo_ps(r, g), 881cb93a386Sopenharmony_ci _23 = _mm_unpackhi_ps(r, g); 882cb93a386Sopenharmony_ci if (__builtin_expect(tail, 0)) { 883cb93a386Sopenharmony_ci if ( true ) { _mm_storel_pi((__m64*)(ptr + 0), _01); } 884cb93a386Sopenharmony_ci if (tail > 1) { _mm_storeh_pi((__m64*)(ptr + 2), _01); } 885cb93a386Sopenharmony_ci if (tail > 2) { _mm_storel_pi((__m64*)(ptr + 4), _23); } 886cb93a386Sopenharmony_ci } else { 887cb93a386Sopenharmony_ci _mm_storeu_ps(ptr + 0, _01); 888cb93a386Sopenharmony_ci _mm_storeu_ps(ptr + 4, _23); 889cb93a386Sopenharmony_ci } 890cb93a386Sopenharmony_ci } 891cb93a386Sopenharmony_ci 892cb93a386Sopenharmony_ci SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) { 893cb93a386Sopenharmony_ci F _0, _1, _2, _3; 894cb93a386Sopenharmony_ci if (__builtin_expect(tail, 0)) { 895cb93a386Sopenharmony_ci _1 = _2 = _3 = _mm_setzero_si128(); 896cb93a386Sopenharmony_ci if ( true ) { _0 = _mm_loadu_ps(ptr + 0); } 897cb93a386Sopenharmony_ci if (tail > 1) { _1 = _mm_loadu_ps(ptr + 4); } 898cb93a386Sopenharmony_ci if (tail > 2) { _2 = _mm_loadu_ps(ptr + 8); } 899cb93a386Sopenharmony_ci } else { 900cb93a386Sopenharmony_ci _0 = _mm_loadu_ps(ptr + 0); 901cb93a386Sopenharmony_ci _1 = _mm_loadu_ps(ptr + 4); 902cb93a386Sopenharmony_ci _2 = _mm_loadu_ps(ptr + 8); 903cb93a386Sopenharmony_ci _3 = _mm_loadu_ps(ptr +12); 904cb93a386Sopenharmony_ci } 905cb93a386Sopenharmony_ci _MM_TRANSPOSE4_PS(_0,_1,_2,_3); 906cb93a386Sopenharmony_ci *r = _0; 907cb93a386Sopenharmony_ci *g = _1; 908cb93a386Sopenharmony_ci *b = _2; 909cb93a386Sopenharmony_ci *a = _3; 910cb93a386Sopenharmony_ci } 911cb93a386Sopenharmony_ci 912cb93a386Sopenharmony_ci SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) { 913cb93a386Sopenharmony_ci _MM_TRANSPOSE4_PS(r,g,b,a); 914cb93a386Sopenharmony_ci if (__builtin_expect(tail, 0)) { 915cb93a386Sopenharmony_ci if ( true ) { _mm_storeu_ps(ptr + 0, r); } 916cb93a386Sopenharmony_ci if (tail > 1) { _mm_storeu_ps(ptr + 4, g); } 917cb93a386Sopenharmony_ci if (tail > 2) { _mm_storeu_ps(ptr + 8, b); } 918cb93a386Sopenharmony_ci } else { 919cb93a386Sopenharmony_ci _mm_storeu_ps(ptr + 0, r); 920cb93a386Sopenharmony_ci _mm_storeu_ps(ptr + 4, g); 921cb93a386Sopenharmony_ci _mm_storeu_ps(ptr + 8, b); 922cb93a386Sopenharmony_ci _mm_storeu_ps(ptr +12, a); 923cb93a386Sopenharmony_ci } 924cb93a386Sopenharmony_ci } 925cb93a386Sopenharmony_ci#endif 926cb93a386Sopenharmony_ci 927cb93a386Sopenharmony_ci// We need to be a careful with casts. 928cb93a386Sopenharmony_ci// (F)x means cast x to float in the portable path, but bit_cast x to float in the others. 929cb93a386Sopenharmony_ci// These named casts and bit_cast() are always what they seem to be. 930cb93a386Sopenharmony_ci#if defined(JUMPER_IS_SCALAR) 931cb93a386Sopenharmony_ci SI F cast (U32 v) { return (F)v; } 932cb93a386Sopenharmony_ci SI F cast64(U64 v) { return (F)v; } 933cb93a386Sopenharmony_ci SI U32 trunc_(F v) { return (U32)v; } 934cb93a386Sopenharmony_ci SI U32 expand(U16 v) { return (U32)v; } 935cb93a386Sopenharmony_ci SI U32 expand(U8 v) { return (U32)v; } 936cb93a386Sopenharmony_ci#else 937cb93a386Sopenharmony_ci SI F cast (U32 v) { return __builtin_convertvector((I32)v, F); } 938cb93a386Sopenharmony_ci SI F cast64(U64 v) { return __builtin_convertvector( v, F); } 939cb93a386Sopenharmony_ci SI U32 trunc_(F v) { return (U32)__builtin_convertvector( v, I32); } 940cb93a386Sopenharmony_ci SI U32 expand(U16 v) { return __builtin_convertvector( v, U32); } 941cb93a386Sopenharmony_ci SI U32 expand(U8 v) { return __builtin_convertvector( v, U32); } 942cb93a386Sopenharmony_ci#endif 943cb93a386Sopenharmony_ci 944cb93a386Sopenharmony_citemplate <typename V> 945cb93a386Sopenharmony_ciSI V if_then_else(I32 c, V t, V e) { 946cb93a386Sopenharmony_ci return sk_bit_cast<V>(if_then_else(c, sk_bit_cast<F>(t), sk_bit_cast<F>(e))); 947cb93a386Sopenharmony_ci} 948cb93a386Sopenharmony_ci 949cb93a386Sopenharmony_ciSI U16 bswap(U16 x) { 950cb93a386Sopenharmony_ci#if defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) 951cb93a386Sopenharmony_ci // Somewhat inexplicably Clang decides to do (x<<8) | (x>>8) in 32-bit lanes 952cb93a386Sopenharmony_ci // when generating code for SSE2 and SSE4.1. We'll do it manually... 953cb93a386Sopenharmony_ci auto v = widen_cast<__m128i>(x); 954cb93a386Sopenharmony_ci v = _mm_slli_epi16(v,8) | _mm_srli_epi16(v,8); 955cb93a386Sopenharmony_ci return sk_unaligned_load<U16>(&v); 956cb93a386Sopenharmony_ci#else 957cb93a386Sopenharmony_ci return (x<<8) | (x>>8); 958cb93a386Sopenharmony_ci#endif 959cb93a386Sopenharmony_ci} 960cb93a386Sopenharmony_ci 961cb93a386Sopenharmony_ciSI F fract(F v) { return v - floor_(v); } 962cb93a386Sopenharmony_ci 963cb93a386Sopenharmony_ci// See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html. 964cb93a386Sopenharmony_ciSI F approx_log2(F x) { 965cb93a386Sopenharmony_ci // e - 127 is a fair approximation of log2(x) in its own right... 966cb93a386Sopenharmony_ci F e = cast(sk_bit_cast<U32>(x)) * (1.0f / (1<<23)); 967cb93a386Sopenharmony_ci 968cb93a386Sopenharmony_ci // ... but using the mantissa to refine its error is _much_ better. 969cb93a386Sopenharmony_ci F m = sk_bit_cast<F>((sk_bit_cast<U32>(x) & 0x007fffff) | 0x3f000000); 970cb93a386Sopenharmony_ci return e 971cb93a386Sopenharmony_ci - 124.225514990f 972cb93a386Sopenharmony_ci - 1.498030302f * m 973cb93a386Sopenharmony_ci - 1.725879990f / (0.3520887068f + m); 974cb93a386Sopenharmony_ci} 975cb93a386Sopenharmony_ci 976cb93a386Sopenharmony_ciSI F approx_log(F x) { 977cb93a386Sopenharmony_ci const float ln2 = 0.69314718f; 978cb93a386Sopenharmony_ci return ln2 * approx_log2(x); 979cb93a386Sopenharmony_ci} 980cb93a386Sopenharmony_ci 981cb93a386Sopenharmony_ciSI F approx_pow2(F x) { 982cb93a386Sopenharmony_ci F f = fract(x); 983cb93a386Sopenharmony_ci return sk_bit_cast<F>(round(1.0f * (1<<23), 984cb93a386Sopenharmony_ci x + 121.274057500f 985cb93a386Sopenharmony_ci - 1.490129070f * f 986cb93a386Sopenharmony_ci + 27.728023300f / (4.84252568f - f))); 987cb93a386Sopenharmony_ci} 988cb93a386Sopenharmony_ci 989cb93a386Sopenharmony_ciSI F approx_exp(F x) { 990cb93a386Sopenharmony_ci const float log2_e = 1.4426950408889634074f; 991cb93a386Sopenharmony_ci return approx_pow2(log2_e * x); 992cb93a386Sopenharmony_ci} 993cb93a386Sopenharmony_ci 994cb93a386Sopenharmony_ciSI F approx_powf(F x, F y) { 995cb93a386Sopenharmony_ci return if_then_else((x == 0)|(x == 1), x 996cb93a386Sopenharmony_ci , approx_pow2(approx_log2(x) * y)); 997cb93a386Sopenharmony_ci} 998cb93a386Sopenharmony_ci 999cb93a386Sopenharmony_ciSI F from_half(U16 h) { 1000cb93a386Sopenharmony_ci#if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64) \ 1001cb93a386Sopenharmony_ci && !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds. 1002cb93a386Sopenharmony_ci return vcvt_f32_f16(h); 1003cb93a386Sopenharmony_ci 1004cb93a386Sopenharmony_ci#elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) 1005cb93a386Sopenharmony_ci return _mm256_cvtph_ps(h); 1006cb93a386Sopenharmony_ci 1007cb93a386Sopenharmony_ci#else 1008cb93a386Sopenharmony_ci // Remember, a half is 1-5-10 (sign-exponent-mantissa) with 15 exponent bias. 1009cb93a386Sopenharmony_ci U32 sem = expand(h), 1010cb93a386Sopenharmony_ci s = sem & 0x8000, 1011cb93a386Sopenharmony_ci em = sem ^ s; 1012cb93a386Sopenharmony_ci 1013cb93a386Sopenharmony_ci // Convert to 1-8-23 float with 127 bias, flushing denorm halfs (including zero) to zero. 1014cb93a386Sopenharmony_ci auto denorm = (I32)em < 0x0400; // I32 comparison is often quicker, and always safe here. 1015cb93a386Sopenharmony_ci return if_then_else(denorm, F(0) 1016cb93a386Sopenharmony_ci , sk_bit_cast<F>( (s<<16) + (em<<13) + ((127-15)<<23) )); 1017cb93a386Sopenharmony_ci#endif 1018cb93a386Sopenharmony_ci} 1019cb93a386Sopenharmony_ci 1020cb93a386Sopenharmony_ciSI U16 to_half(F f) { 1021cb93a386Sopenharmony_ci#if defined(JUMPER_IS_NEON) && defined(SK_CPU_ARM64) \ 1022cb93a386Sopenharmony_ci && !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds. 1023cb93a386Sopenharmony_ci return vcvt_f16_f32(f); 1024cb93a386Sopenharmony_ci 1025cb93a386Sopenharmony_ci#elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) 1026cb93a386Sopenharmony_ci return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION); 1027cb93a386Sopenharmony_ci 1028cb93a386Sopenharmony_ci#else 1029cb93a386Sopenharmony_ci // Remember, a float is 1-8-23 (sign-exponent-mantissa) with 127 exponent bias. 1030cb93a386Sopenharmony_ci U32 sem = sk_bit_cast<U32>(f), 1031cb93a386Sopenharmony_ci s = sem & 0x80000000, 1032cb93a386Sopenharmony_ci em = sem ^ s; 1033cb93a386Sopenharmony_ci 1034cb93a386Sopenharmony_ci // Convert to 1-5-10 half with 15 bias, flushing denorm halfs (including zero) to zero. 1035cb93a386Sopenharmony_ci auto denorm = (I32)em < 0x38800000; // I32 comparison is often quicker, and always safe here. 1036cb93a386Sopenharmony_ci return pack(if_then_else(denorm, U32(0) 1037cb93a386Sopenharmony_ci , (s>>16) + (em>>13) - ((127-15)<<10))); 1038cb93a386Sopenharmony_ci#endif 1039cb93a386Sopenharmony_ci} 1040cb93a386Sopenharmony_ci 1041cb93a386Sopenharmony_ci// Our fundamental vector depth is our pixel stride. 1042cb93a386Sopenharmony_cistatic const size_t N = sizeof(F) / sizeof(float); 1043cb93a386Sopenharmony_ci 1044cb93a386Sopenharmony_ci// We're finally going to get to what a Stage function looks like! 1045cb93a386Sopenharmony_ci// tail == 0 ~~> work on a full N pixels 1046cb93a386Sopenharmony_ci// tail != 0 ~~> work on only the first tail pixels 1047cb93a386Sopenharmony_ci// tail is always < N. 1048cb93a386Sopenharmony_ci 1049cb93a386Sopenharmony_ci// Any custom ABI to use for all (non-externally-facing) stage functions? 1050cb93a386Sopenharmony_ci// Also decide here whether to use narrow (compromise) or wide (ideal) stages. 1051cb93a386Sopenharmony_ci#if defined(SK_CPU_ARM32) && defined(JUMPER_IS_NEON) 1052cb93a386Sopenharmony_ci // This lets us pass vectors more efficiently on 32-bit ARM. 1053cb93a386Sopenharmony_ci // We can still only pass 16 floats, so best as 4x {r,g,b,a}. 1054cb93a386Sopenharmony_ci #define ABI __attribute__((pcs("aapcs-vfp"))) 1055cb93a386Sopenharmony_ci #define JUMPER_NARROW_STAGES 1 1056cb93a386Sopenharmony_ci#elif defined(_MSC_VER) 1057cb93a386Sopenharmony_ci // Even if not vectorized, this lets us pass {r,g,b,a} as registers, 1058cb93a386Sopenharmony_ci // instead of {b,a} on the stack. Narrow stages work best for __vectorcall. 1059cb93a386Sopenharmony_ci #define ABI __vectorcall 1060cb93a386Sopenharmony_ci #define JUMPER_NARROW_STAGES 1 1061cb93a386Sopenharmony_ci#elif defined(__x86_64__) || defined(SK_CPU_ARM64) 1062cb93a386Sopenharmony_ci // These platforms are ideal for wider stages, and their default ABI is ideal. 1063cb93a386Sopenharmony_ci #define ABI 1064cb93a386Sopenharmony_ci #define JUMPER_NARROW_STAGES 0 1065cb93a386Sopenharmony_ci#else 1066cb93a386Sopenharmony_ci // 32-bit or unknown... shunt them down the narrow path. 1067cb93a386Sopenharmony_ci // Odds are these have few registers and are better off there. 1068cb93a386Sopenharmony_ci #define ABI 1069cb93a386Sopenharmony_ci #define JUMPER_NARROW_STAGES 1 1070cb93a386Sopenharmony_ci#endif 1071cb93a386Sopenharmony_ci 1072cb93a386Sopenharmony_ci#if JUMPER_NARROW_STAGES 1073cb93a386Sopenharmony_ci struct Params { 1074cb93a386Sopenharmony_ci size_t dx, dy, tail; 1075cb93a386Sopenharmony_ci F dr,dg,db,da; 1076cb93a386Sopenharmony_ci }; 1077cb93a386Sopenharmony_ci using Stage = void(ABI*)(Params*, void** program, F r, F g, F b, F a); 1078cb93a386Sopenharmony_ci#else 1079cb93a386Sopenharmony_ci // We keep program the second argument, so that it's passed in rsi for load_and_inc(). 1080cb93a386Sopenharmony_ci using Stage = void(ABI*)(size_t tail, void** program, size_t dx, size_t dy, F,F,F,F, F,F,F,F); 1081cb93a386Sopenharmony_ci#endif 1082cb93a386Sopenharmony_ci 1083cb93a386Sopenharmony_ci 1084cb93a386Sopenharmony_cistatic void start_pipeline(size_t dx, size_t dy, size_t xlimit, size_t ylimit, void** program) { 1085cb93a386Sopenharmony_ci auto start = (Stage)load_and_inc(program); 1086cb93a386Sopenharmony_ci const size_t x0 = dx; 1087cb93a386Sopenharmony_ci for (; dy < ylimit; dy++) { 1088cb93a386Sopenharmony_ci #if JUMPER_NARROW_STAGES 1089cb93a386Sopenharmony_ci Params params = { x0,dy,0, 0,0,0,0 }; 1090cb93a386Sopenharmony_ci while (params.dx + N <= xlimit) { 1091cb93a386Sopenharmony_ci start(¶ms,program, 0,0,0,0); 1092cb93a386Sopenharmony_ci params.dx += N; 1093cb93a386Sopenharmony_ci } 1094cb93a386Sopenharmony_ci if (size_t tail = xlimit - params.dx) { 1095cb93a386Sopenharmony_ci params.tail = tail; 1096cb93a386Sopenharmony_ci start(¶ms,program, 0,0,0,0); 1097cb93a386Sopenharmony_ci } 1098cb93a386Sopenharmony_ci #else 1099cb93a386Sopenharmony_ci dx = x0; 1100cb93a386Sopenharmony_ci while (dx + N <= xlimit) { 1101cb93a386Sopenharmony_ci start(0,program,dx,dy, 0,0,0,0, 0,0,0,0); 1102cb93a386Sopenharmony_ci dx += N; 1103cb93a386Sopenharmony_ci } 1104cb93a386Sopenharmony_ci if (size_t tail = xlimit - dx) { 1105cb93a386Sopenharmony_ci start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0); 1106cb93a386Sopenharmony_ci } 1107cb93a386Sopenharmony_ci #endif 1108cb93a386Sopenharmony_ci } 1109cb93a386Sopenharmony_ci} 1110cb93a386Sopenharmony_ci 1111cb93a386Sopenharmony_ci#if JUMPER_NARROW_STAGES 1112cb93a386Sopenharmony_ci #define STAGE(name, ...) \ 1113cb93a386Sopenharmony_ci SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ 1114cb93a386Sopenharmony_ci F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \ 1115cb93a386Sopenharmony_ci static void ABI name(Params* params, void** program, \ 1116cb93a386Sopenharmony_ci F r, F g, F b, F a) { \ 1117cb93a386Sopenharmony_ci name##_k(Ctx{program},params->dx,params->dy,params->tail, r,g,b,a, \ 1118cb93a386Sopenharmony_ci params->dr, params->dg, params->db, params->da); \ 1119cb93a386Sopenharmony_ci auto next = (Stage)load_and_inc(program); \ 1120cb93a386Sopenharmony_ci next(params,program, r,g,b,a); \ 1121cb93a386Sopenharmony_ci } \ 1122cb93a386Sopenharmony_ci SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ 1123cb93a386Sopenharmony_ci F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da) 1124cb93a386Sopenharmony_ci#else 1125cb93a386Sopenharmony_ci #define STAGE(name, ...) \ 1126cb93a386Sopenharmony_ci SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ 1127cb93a386Sopenharmony_ci F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \ 1128cb93a386Sopenharmony_ci static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \ 1129cb93a386Sopenharmony_ci F r, F g, F b, F a, F dr, F dg, F db, F da) { \ 1130cb93a386Sopenharmony_ci name##_k(Ctx{program},dx,dy,tail, r,g,b,a, dr,dg,db,da); \ 1131cb93a386Sopenharmony_ci auto next = (Stage)load_and_inc(program); \ 1132cb93a386Sopenharmony_ci next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \ 1133cb93a386Sopenharmony_ci } \ 1134cb93a386Sopenharmony_ci SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ 1135cb93a386Sopenharmony_ci F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da) 1136cb93a386Sopenharmony_ci#endif 1137cb93a386Sopenharmony_ci 1138cb93a386Sopenharmony_ci 1139cb93a386Sopenharmony_ci// just_return() is a simple no-op stage that only exists to end the chain, 1140cb93a386Sopenharmony_ci// returning back up to start_pipeline(), and from there to the caller. 1141cb93a386Sopenharmony_ci#if JUMPER_NARROW_STAGES 1142cb93a386Sopenharmony_ci static void ABI just_return(Params*, void**, F,F,F,F) {} 1143cb93a386Sopenharmony_ci#else 1144cb93a386Sopenharmony_ci static void ABI just_return(size_t, void**, size_t,size_t, F,F,F,F, F,F,F,F) {} 1145cb93a386Sopenharmony_ci#endif 1146cb93a386Sopenharmony_ci 1147cb93a386Sopenharmony_ci 1148cb93a386Sopenharmony_ci// We could start defining normal Stages now. But first, some helper functions. 1149cb93a386Sopenharmony_ci 1150cb93a386Sopenharmony_ci// These load() and store() methods are tail-aware, 1151cb93a386Sopenharmony_ci// but focus mainly on keeping the at-stride tail==0 case fast. 1152cb93a386Sopenharmony_ci 1153cb93a386Sopenharmony_citemplate <typename V, typename T> 1154cb93a386Sopenharmony_ciSI V load(const T* src, size_t tail) { 1155cb93a386Sopenharmony_ci#if !defined(JUMPER_IS_SCALAR) 1156cb93a386Sopenharmony_ci __builtin_assume(tail < N); 1157cb93a386Sopenharmony_ci if (__builtin_expect(tail, 0)) { 1158cb93a386Sopenharmony_ci V v{}; // Any inactive lanes are zeroed. 1159cb93a386Sopenharmony_ci switch (tail) { 1160cb93a386Sopenharmony_ci case 7: v[6] = src[6]; [[fallthrough]]; 1161cb93a386Sopenharmony_ci case 6: v[5] = src[5]; [[fallthrough]]; 1162cb93a386Sopenharmony_ci case 5: v[4] = src[4]; [[fallthrough]]; 1163cb93a386Sopenharmony_ci case 4: memcpy(&v, src, 4*sizeof(T)); break; 1164cb93a386Sopenharmony_ci case 3: v[2] = src[2]; [[fallthrough]]; 1165cb93a386Sopenharmony_ci case 2: memcpy(&v, src, 2*sizeof(T)); break; 1166cb93a386Sopenharmony_ci case 1: memcpy(&v, src, 1*sizeof(T)); break; 1167cb93a386Sopenharmony_ci } 1168cb93a386Sopenharmony_ci return v; 1169cb93a386Sopenharmony_ci } 1170cb93a386Sopenharmony_ci#endif 1171cb93a386Sopenharmony_ci return sk_unaligned_load<V>(src); 1172cb93a386Sopenharmony_ci} 1173cb93a386Sopenharmony_ci 1174cb93a386Sopenharmony_citemplate <typename V, typename T> 1175cb93a386Sopenharmony_ciSI void store(T* dst, V v, size_t tail) { 1176cb93a386Sopenharmony_ci#if !defined(JUMPER_IS_SCALAR) 1177cb93a386Sopenharmony_ci __builtin_assume(tail < N); 1178cb93a386Sopenharmony_ci if (__builtin_expect(tail, 0)) { 1179cb93a386Sopenharmony_ci switch (tail) { 1180cb93a386Sopenharmony_ci case 7: dst[6] = v[6]; [[fallthrough]]; 1181cb93a386Sopenharmony_ci case 6: dst[5] = v[5]; [[fallthrough]]; 1182cb93a386Sopenharmony_ci case 5: dst[4] = v[4]; [[fallthrough]]; 1183cb93a386Sopenharmony_ci case 4: memcpy(dst, &v, 4*sizeof(T)); break; 1184cb93a386Sopenharmony_ci case 3: dst[2] = v[2]; [[fallthrough]]; 1185cb93a386Sopenharmony_ci case 2: memcpy(dst, &v, 2*sizeof(T)); break; 1186cb93a386Sopenharmony_ci case 1: memcpy(dst, &v, 1*sizeof(T)); break; 1187cb93a386Sopenharmony_ci } 1188cb93a386Sopenharmony_ci return; 1189cb93a386Sopenharmony_ci } 1190cb93a386Sopenharmony_ci#endif 1191cb93a386Sopenharmony_ci sk_unaligned_store(dst, v); 1192cb93a386Sopenharmony_ci} 1193cb93a386Sopenharmony_ci 1194cb93a386Sopenharmony_ciSI F from_byte(U8 b) { 1195cb93a386Sopenharmony_ci return cast(expand(b)) * (1/255.0f); 1196cb93a386Sopenharmony_ci} 1197cb93a386Sopenharmony_ciSI F from_short(U16 s) { 1198cb93a386Sopenharmony_ci return cast(expand(s)) * (1/65535.0f); 1199cb93a386Sopenharmony_ci} 1200cb93a386Sopenharmony_ciSI void from_565(U16 _565, F* r, F* g, F* b) { 1201cb93a386Sopenharmony_ci U32 wide = expand(_565); 1202cb93a386Sopenharmony_ci *r = cast(wide & (31<<11)) * (1.0f / (31<<11)); 1203cb93a386Sopenharmony_ci *g = cast(wide & (63<< 5)) * (1.0f / (63<< 5)); 1204cb93a386Sopenharmony_ci *b = cast(wide & (31<< 0)) * (1.0f / (31<< 0)); 1205cb93a386Sopenharmony_ci} 1206cb93a386Sopenharmony_ciSI void from_4444(U16 _4444, F* r, F* g, F* b, F* a) { 1207cb93a386Sopenharmony_ci U32 wide = expand(_4444); 1208cb93a386Sopenharmony_ci *r = cast(wide & (15<<12)) * (1.0f / (15<<12)); 1209cb93a386Sopenharmony_ci *g = cast(wide & (15<< 8)) * (1.0f / (15<< 8)); 1210cb93a386Sopenharmony_ci *b = cast(wide & (15<< 4)) * (1.0f / (15<< 4)); 1211cb93a386Sopenharmony_ci *a = cast(wide & (15<< 0)) * (1.0f / (15<< 0)); 1212cb93a386Sopenharmony_ci} 1213cb93a386Sopenharmony_ciSI void from_8888(U32 _8888, F* r, F* g, F* b, F* a) { 1214cb93a386Sopenharmony_ci *r = cast((_8888 ) & 0xff) * (1/255.0f); 1215cb93a386Sopenharmony_ci *g = cast((_8888 >> 8) & 0xff) * (1/255.0f); 1216cb93a386Sopenharmony_ci *b = cast((_8888 >> 16) & 0xff) * (1/255.0f); 1217cb93a386Sopenharmony_ci *a = cast((_8888 >> 24) ) * (1/255.0f); 1218cb93a386Sopenharmony_ci} 1219cb93a386Sopenharmony_ciSI void from_88(U16 _88, F* r, F* g) { 1220cb93a386Sopenharmony_ci U32 wide = expand(_88); 1221cb93a386Sopenharmony_ci *r = cast((wide ) & 0xff) * (1/255.0f); 1222cb93a386Sopenharmony_ci *g = cast((wide >> 8) & 0xff) * (1/255.0f); 1223cb93a386Sopenharmony_ci} 1224cb93a386Sopenharmony_ciSI void from_1010102(U32 rgba, F* r, F* g, F* b, F* a) { 1225cb93a386Sopenharmony_ci *r = cast((rgba ) & 0x3ff) * (1/1023.0f); 1226cb93a386Sopenharmony_ci *g = cast((rgba >> 10) & 0x3ff) * (1/1023.0f); 1227cb93a386Sopenharmony_ci *b = cast((rgba >> 20) & 0x3ff) * (1/1023.0f); 1228cb93a386Sopenharmony_ci *a = cast((rgba >> 30) ) * (1/ 3.0f); 1229cb93a386Sopenharmony_ci} 1230cb93a386Sopenharmony_ciSI void from_1616(U32 _1616, F* r, F* g) { 1231cb93a386Sopenharmony_ci *r = cast((_1616 ) & 0xffff) * (1/65535.0f); 1232cb93a386Sopenharmony_ci *g = cast((_1616 >> 16) & 0xffff) * (1/65535.0f); 1233cb93a386Sopenharmony_ci} 1234cb93a386Sopenharmony_ciSI void from_16161616(U64 _16161616, F* r, F* g, F* b, F* a) { 1235cb93a386Sopenharmony_ci *r = cast64((_16161616 ) & 0xffff) * (1/65535.0f); 1236cb93a386Sopenharmony_ci *g = cast64((_16161616 >> 16) & 0xffff) * (1/65535.0f); 1237cb93a386Sopenharmony_ci *b = cast64((_16161616 >> 32) & 0xffff) * (1/65535.0f); 1238cb93a386Sopenharmony_ci *a = cast64((_16161616 >> 48) & 0xffff) * (1/65535.0f); 1239cb93a386Sopenharmony_ci} 1240cb93a386Sopenharmony_ci 1241cb93a386Sopenharmony_ci// Used by load_ and store_ stages to get to the right (dx,dy) starting point of contiguous memory. 1242cb93a386Sopenharmony_citemplate <typename T> 1243cb93a386Sopenharmony_ciSI T* ptr_at_xy(const SkRasterPipeline_MemoryCtx* ctx, size_t dx, size_t dy) { 1244cb93a386Sopenharmony_ci return (T*)ctx->pixels + dy*ctx->stride + dx; 1245cb93a386Sopenharmony_ci} 1246cb93a386Sopenharmony_ci 1247cb93a386Sopenharmony_ci// clamp v to [0,limit). 1248cb93a386Sopenharmony_ciSI F clamp(F v, F limit) { 1249cb93a386Sopenharmony_ci F inclusive = sk_bit_cast<F>( sk_bit_cast<U32>(limit) - 1 ); // Exclusive -> inclusive. 1250cb93a386Sopenharmony_ci return min(max(0, v), inclusive); 1251cb93a386Sopenharmony_ci} 1252cb93a386Sopenharmony_ci 1253cb93a386Sopenharmony_ci// Used by gather_ stages to calculate the base pointer and a vector of indices to load. 1254cb93a386Sopenharmony_citemplate <typename T> 1255cb93a386Sopenharmony_ciSI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) { 1256cb93a386Sopenharmony_ci x = clamp(x, ctx->width); 1257cb93a386Sopenharmony_ci y = clamp(y, ctx->height); 1258cb93a386Sopenharmony_ci 1259cb93a386Sopenharmony_ci *ptr = (const T*)ctx->pixels; 1260cb93a386Sopenharmony_ci return trunc_(y)*ctx->stride + trunc_(x); 1261cb93a386Sopenharmony_ci} 1262cb93a386Sopenharmony_ci 1263cb93a386Sopenharmony_ci// We often have a nominally [0,1] float value we need to scale and convert to an integer, 1264cb93a386Sopenharmony_ci// whether for a table lookup or to pack back down into bytes for storage. 1265cb93a386Sopenharmony_ci// 1266cb93a386Sopenharmony_ci// In practice, especially when dealing with interesting color spaces, that notionally 1267cb93a386Sopenharmony_ci// [0,1] float may be out of [0,1] range. Unorms cannot represent that, so we must clamp. 1268cb93a386Sopenharmony_ci// 1269cb93a386Sopenharmony_ci// You can adjust the expected input to [0,bias] by tweaking that parameter. 1270cb93a386Sopenharmony_ciSI U32 to_unorm(F v, F scale, F bias = 1.0f) { 1271cb93a386Sopenharmony_ci // Any time we use round() we probably want to use to_unorm(). 1272cb93a386Sopenharmony_ci return round(min(max(0, v), bias), scale); 1273cb93a386Sopenharmony_ci} 1274cb93a386Sopenharmony_ci 1275cb93a386Sopenharmony_ciSI I32 cond_to_mask(I32 cond) { return if_then_else(cond, I32(~0), I32(0)); } 1276cb93a386Sopenharmony_ci 1277cb93a386Sopenharmony_ci// Now finally, normal Stages! 1278cb93a386Sopenharmony_ci 1279cb93a386Sopenharmony_ciSTAGE(seed_shader, Ctx::None) { 1280cb93a386Sopenharmony_ci static const float iota[] = { 1281cb93a386Sopenharmony_ci 0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f, 1282cb93a386Sopenharmony_ci 8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f, 1283cb93a386Sopenharmony_ci }; 1284cb93a386Sopenharmony_ci // It's important for speed to explicitly cast(dx) and cast(dy), 1285cb93a386Sopenharmony_ci // which has the effect of splatting them to vectors before converting to floats. 1286cb93a386Sopenharmony_ci // On Intel this breaks a data dependency on previous loop iterations' registers. 1287cb93a386Sopenharmony_ci r = cast(dx) + sk_unaligned_load<F>(iota); 1288cb93a386Sopenharmony_ci g = cast(dy) + 0.5f; 1289cb93a386Sopenharmony_ci b = 1.0f; 1290cb93a386Sopenharmony_ci a = 0; 1291cb93a386Sopenharmony_ci dr = dg = db = da = 0; 1292cb93a386Sopenharmony_ci} 1293cb93a386Sopenharmony_ci 1294cb93a386Sopenharmony_ciSTAGE(dither, const float* rate) { 1295cb93a386Sopenharmony_ci // Get [(dx,dy), (dx+1,dy), (dx+2,dy), ...] loaded up in integer vectors. 1296cb93a386Sopenharmony_ci uint32_t iota[] = {0,1,2,3,4,5,6,7}; 1297cb93a386Sopenharmony_ci U32 X = dx + sk_unaligned_load<U32>(iota), 1298cb93a386Sopenharmony_ci Y = dy; 1299cb93a386Sopenharmony_ci 1300cb93a386Sopenharmony_ci // We're doing 8x8 ordered dithering, see https://en.wikipedia.org/wiki/Ordered_dithering. 1301cb93a386Sopenharmony_ci // In this case n=8 and we're using the matrix that looks like 1/64 x [ 0 48 12 60 ... ]. 1302cb93a386Sopenharmony_ci 1303cb93a386Sopenharmony_ci // We only need X and X^Y from here on, so it's easier to just think of that as "Y". 1304cb93a386Sopenharmony_ci Y ^= X; 1305cb93a386Sopenharmony_ci 1306cb93a386Sopenharmony_ci // We'll mix the bottom 3 bits of each of X and Y to make 6 bits, 1307cb93a386Sopenharmony_ci // for 2^6 == 64 == 8x8 matrix values. If X=abc and Y=def, we make fcebda. 1308cb93a386Sopenharmony_ci U32 M = (Y & 1) << 5 | (X & 1) << 4 1309cb93a386Sopenharmony_ci | (Y & 2) << 2 | (X & 2) << 1 1310cb93a386Sopenharmony_ci | (Y & 4) >> 1 | (X & 4) >> 2; 1311cb93a386Sopenharmony_ci 1312cb93a386Sopenharmony_ci // Scale that dither to [0,1), then (-0.5,+0.5), here using 63/128 = 0.4921875 as 0.5-epsilon. 1313cb93a386Sopenharmony_ci // We want to make sure our dither is less than 0.5 in either direction to keep exact values 1314cb93a386Sopenharmony_ci // like 0 and 1 unchanged after rounding. 1315cb93a386Sopenharmony_ci F dither = cast(M) * (2/128.0f) - (63/128.0f); 1316cb93a386Sopenharmony_ci 1317cb93a386Sopenharmony_ci r += *rate*dither; 1318cb93a386Sopenharmony_ci g += *rate*dither; 1319cb93a386Sopenharmony_ci b += *rate*dither; 1320cb93a386Sopenharmony_ci 1321cb93a386Sopenharmony_ci r = max(0, min(r, a)); 1322cb93a386Sopenharmony_ci g = max(0, min(g, a)); 1323cb93a386Sopenharmony_ci b = max(0, min(b, a)); 1324cb93a386Sopenharmony_ci} 1325cb93a386Sopenharmony_ci 1326cb93a386Sopenharmony_ci// load 4 floats from memory, and splat them into r,g,b,a 1327cb93a386Sopenharmony_ciSTAGE(uniform_color, const SkRasterPipeline_UniformColorCtx* c) { 1328cb93a386Sopenharmony_ci r = c->r; 1329cb93a386Sopenharmony_ci g = c->g; 1330cb93a386Sopenharmony_ci b = c->b; 1331cb93a386Sopenharmony_ci a = c->a; 1332cb93a386Sopenharmony_ci} 1333cb93a386Sopenharmony_ciSTAGE(unbounded_uniform_color, const SkRasterPipeline_UniformColorCtx* c) { 1334cb93a386Sopenharmony_ci r = c->r; 1335cb93a386Sopenharmony_ci g = c->g; 1336cb93a386Sopenharmony_ci b = c->b; 1337cb93a386Sopenharmony_ci a = c->a; 1338cb93a386Sopenharmony_ci} 1339cb93a386Sopenharmony_ci// load 4 floats from memory, and splat them into dr,dg,db,da 1340cb93a386Sopenharmony_ciSTAGE(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) { 1341cb93a386Sopenharmony_ci dr = c->r; 1342cb93a386Sopenharmony_ci dg = c->g; 1343cb93a386Sopenharmony_ci db = c->b; 1344cb93a386Sopenharmony_ci da = c->a; 1345cb93a386Sopenharmony_ci} 1346cb93a386Sopenharmony_ci 1347cb93a386Sopenharmony_ci// splats opaque-black into r,g,b,a 1348cb93a386Sopenharmony_ciSTAGE(black_color, Ctx::None) { 1349cb93a386Sopenharmony_ci r = g = b = 0.0f; 1350cb93a386Sopenharmony_ci a = 1.0f; 1351cb93a386Sopenharmony_ci} 1352cb93a386Sopenharmony_ci 1353cb93a386Sopenharmony_ciSTAGE(white_color, Ctx::None) { 1354cb93a386Sopenharmony_ci r = g = b = a = 1.0f; 1355cb93a386Sopenharmony_ci} 1356cb93a386Sopenharmony_ci 1357cb93a386Sopenharmony_ci// load registers r,g,b,a from context (mirrors store_rgba) 1358cb93a386Sopenharmony_ciSTAGE(load_src, const float* ptr) { 1359cb93a386Sopenharmony_ci r = sk_unaligned_load<F>(ptr + 0*N); 1360cb93a386Sopenharmony_ci g = sk_unaligned_load<F>(ptr + 1*N); 1361cb93a386Sopenharmony_ci b = sk_unaligned_load<F>(ptr + 2*N); 1362cb93a386Sopenharmony_ci a = sk_unaligned_load<F>(ptr + 3*N); 1363cb93a386Sopenharmony_ci} 1364cb93a386Sopenharmony_ci 1365cb93a386Sopenharmony_ci// store registers r,g,b,a into context (mirrors load_rgba) 1366cb93a386Sopenharmony_ciSTAGE(store_src, float* ptr) { 1367cb93a386Sopenharmony_ci sk_unaligned_store(ptr + 0*N, r); 1368cb93a386Sopenharmony_ci sk_unaligned_store(ptr + 1*N, g); 1369cb93a386Sopenharmony_ci sk_unaligned_store(ptr + 2*N, b); 1370cb93a386Sopenharmony_ci sk_unaligned_store(ptr + 3*N, a); 1371cb93a386Sopenharmony_ci} 1372cb93a386Sopenharmony_ciSTAGE(store_src_a, float* ptr) { 1373cb93a386Sopenharmony_ci sk_unaligned_store(ptr, a); 1374cb93a386Sopenharmony_ci} 1375cb93a386Sopenharmony_ci 1376cb93a386Sopenharmony_ci// load registers dr,dg,db,da from context (mirrors store_dst) 1377cb93a386Sopenharmony_ciSTAGE(load_dst, const float* ptr) { 1378cb93a386Sopenharmony_ci dr = sk_unaligned_load<F>(ptr + 0*N); 1379cb93a386Sopenharmony_ci dg = sk_unaligned_load<F>(ptr + 1*N); 1380cb93a386Sopenharmony_ci db = sk_unaligned_load<F>(ptr + 2*N); 1381cb93a386Sopenharmony_ci da = sk_unaligned_load<F>(ptr + 3*N); 1382cb93a386Sopenharmony_ci} 1383cb93a386Sopenharmony_ci 1384cb93a386Sopenharmony_ci// store registers dr,dg,db,da into context (mirrors load_dst) 1385cb93a386Sopenharmony_ciSTAGE(store_dst, float* ptr) { 1386cb93a386Sopenharmony_ci sk_unaligned_store(ptr + 0*N, dr); 1387cb93a386Sopenharmony_ci sk_unaligned_store(ptr + 1*N, dg); 1388cb93a386Sopenharmony_ci sk_unaligned_store(ptr + 2*N, db); 1389cb93a386Sopenharmony_ci sk_unaligned_store(ptr + 3*N, da); 1390cb93a386Sopenharmony_ci} 1391cb93a386Sopenharmony_ci 1392cb93a386Sopenharmony_ci// Most blend modes apply the same logic to each channel. 1393cb93a386Sopenharmony_ci#define BLEND_MODE(name) \ 1394cb93a386Sopenharmony_ci SI F name##_channel(F s, F d, F sa, F da); \ 1395cb93a386Sopenharmony_ci STAGE(name, Ctx::None) { \ 1396cb93a386Sopenharmony_ci r = name##_channel(r,dr,a,da); \ 1397cb93a386Sopenharmony_ci g = name##_channel(g,dg,a,da); \ 1398cb93a386Sopenharmony_ci b = name##_channel(b,db,a,da); \ 1399cb93a386Sopenharmony_ci a = name##_channel(a,da,a,da); \ 1400cb93a386Sopenharmony_ci } \ 1401cb93a386Sopenharmony_ci SI F name##_channel(F s, F d, F sa, F da) 1402cb93a386Sopenharmony_ci 1403cb93a386Sopenharmony_ciSI F inv(F x) { return 1.0f - x; } 1404cb93a386Sopenharmony_ciSI F two(F x) { return x + x; } 1405cb93a386Sopenharmony_ci 1406cb93a386Sopenharmony_ci 1407cb93a386Sopenharmony_ciBLEND_MODE(clear) { return 0; } 1408cb93a386Sopenharmony_ciBLEND_MODE(srcatop) { return s*da + d*inv(sa); } 1409cb93a386Sopenharmony_ciBLEND_MODE(dstatop) { return d*sa + s*inv(da); } 1410cb93a386Sopenharmony_ciBLEND_MODE(srcin) { return s * da; } 1411cb93a386Sopenharmony_ciBLEND_MODE(dstin) { return d * sa; } 1412cb93a386Sopenharmony_ciBLEND_MODE(srcout) { return s * inv(da); } 1413cb93a386Sopenharmony_ciBLEND_MODE(dstout) { return d * inv(sa); } 1414cb93a386Sopenharmony_ciBLEND_MODE(srcover) { return mad(d, inv(sa), s); } 1415cb93a386Sopenharmony_ciBLEND_MODE(dstover) { return mad(s, inv(da), d); } 1416cb93a386Sopenharmony_ci 1417cb93a386Sopenharmony_ciBLEND_MODE(modulate) { return s*d; } 1418cb93a386Sopenharmony_ciBLEND_MODE(multiply) { return s*inv(da) + d*inv(sa) + s*d; } 1419cb93a386Sopenharmony_ciBLEND_MODE(plus_) { return min(s + d, 1.0f); } // We can clamp to either 1 or sa. 1420cb93a386Sopenharmony_ciBLEND_MODE(screen) { return s + d - s*d; } 1421cb93a386Sopenharmony_ciBLEND_MODE(xor_) { return s*inv(da) + d*inv(sa); } 1422cb93a386Sopenharmony_ci#undef BLEND_MODE 1423cb93a386Sopenharmony_ci 1424cb93a386Sopenharmony_ci// Most other blend modes apply the same logic to colors, and srcover to alpha. 1425cb93a386Sopenharmony_ci#define BLEND_MODE(name) \ 1426cb93a386Sopenharmony_ci SI F name##_channel(F s, F d, F sa, F da); \ 1427cb93a386Sopenharmony_ci STAGE(name, Ctx::None) { \ 1428cb93a386Sopenharmony_ci r = name##_channel(r,dr,a,da); \ 1429cb93a386Sopenharmony_ci g = name##_channel(g,dg,a,da); \ 1430cb93a386Sopenharmony_ci b = name##_channel(b,db,a,da); \ 1431cb93a386Sopenharmony_ci a = mad(da, inv(a), a); \ 1432cb93a386Sopenharmony_ci } \ 1433cb93a386Sopenharmony_ci SI F name##_channel(F s, F d, F sa, F da) 1434cb93a386Sopenharmony_ci 1435cb93a386Sopenharmony_ciBLEND_MODE(darken) { return s + d - max(s*da, d*sa) ; } 1436cb93a386Sopenharmony_ciBLEND_MODE(lighten) { return s + d - min(s*da, d*sa) ; } 1437cb93a386Sopenharmony_ciBLEND_MODE(difference) { return s + d - two(min(s*da, d*sa)); } 1438cb93a386Sopenharmony_ciBLEND_MODE(exclusion) { return s + d - two(s*d); } 1439cb93a386Sopenharmony_ci 1440cb93a386Sopenharmony_ciBLEND_MODE(colorburn) { 1441cb93a386Sopenharmony_ci return if_then_else(d == da, d + s*inv(da), 1442cb93a386Sopenharmony_ci if_then_else(s == 0, /* s + */ d*inv(sa), 1443cb93a386Sopenharmony_ci sa*(da - min(da, (da-d)*sa*rcp_fast(s))) + s*inv(da) + d*inv(sa))); 1444cb93a386Sopenharmony_ci} 1445cb93a386Sopenharmony_ciBLEND_MODE(colordodge) { 1446cb93a386Sopenharmony_ci return if_then_else(d == 0, /* d + */ s*inv(da), 1447cb93a386Sopenharmony_ci if_then_else(s == sa, s + d*inv(sa), 1448cb93a386Sopenharmony_ci sa*min(da, (d*sa)*rcp_fast(sa - s)) + s*inv(da) + d*inv(sa))); 1449cb93a386Sopenharmony_ci} 1450cb93a386Sopenharmony_ciBLEND_MODE(hardlight) { 1451cb93a386Sopenharmony_ci return s*inv(da) + d*inv(sa) 1452cb93a386Sopenharmony_ci + if_then_else(two(s) <= sa, two(s*d), sa*da - two((da-d)*(sa-s))); 1453cb93a386Sopenharmony_ci} 1454cb93a386Sopenharmony_ciBLEND_MODE(overlay) { 1455cb93a386Sopenharmony_ci return s*inv(da) + d*inv(sa) 1456cb93a386Sopenharmony_ci + if_then_else(two(d) <= da, two(s*d), sa*da - two((da-d)*(sa-s))); 1457cb93a386Sopenharmony_ci} 1458cb93a386Sopenharmony_ci 1459cb93a386Sopenharmony_ciBLEND_MODE(softlight) { 1460cb93a386Sopenharmony_ci F m = if_then_else(da > 0, d / da, 0), 1461cb93a386Sopenharmony_ci s2 = two(s), 1462cb93a386Sopenharmony_ci m4 = two(two(m)); 1463cb93a386Sopenharmony_ci 1464cb93a386Sopenharmony_ci // The logic forks three ways: 1465cb93a386Sopenharmony_ci // 1. dark src? 1466cb93a386Sopenharmony_ci // 2. light src, dark dst? 1467cb93a386Sopenharmony_ci // 3. light src, light dst? 1468cb93a386Sopenharmony_ci F darkSrc = d*(sa + (s2 - sa)*(1.0f - m)), // Used in case 1. 1469cb93a386Sopenharmony_ci darkDst = (m4*m4 + m4)*(m - 1.0f) + 7.0f*m, // Used in case 2. 1470cb93a386Sopenharmony_ci #if defined(SK_RASTER_PIPELINE_LEGACY_RCP_RSQRT) 1471cb93a386Sopenharmony_ci liteDst = rcp_fast(rsqrt(m)) - m, // Used in case 3. 1472cb93a386Sopenharmony_ci #else 1473cb93a386Sopenharmony_ci liteDst = sqrt_(m) - m, 1474cb93a386Sopenharmony_ci #endif 1475cb93a386Sopenharmony_ci liteSrc = d*sa + da*(s2 - sa) * if_then_else(two(two(d)) <= da, darkDst, liteDst); // 2 or 3? 1476cb93a386Sopenharmony_ci return s*inv(da) + d*inv(sa) + if_then_else(s2 <= sa, darkSrc, liteSrc); // 1 or (2 or 3)? 1477cb93a386Sopenharmony_ci} 1478cb93a386Sopenharmony_ci#undef BLEND_MODE 1479cb93a386Sopenharmony_ci 1480cb93a386Sopenharmony_ci// We're basing our implemenation of non-separable blend modes on 1481cb93a386Sopenharmony_ci// https://www.w3.org/TR/compositing-1/#blendingnonseparable. 1482cb93a386Sopenharmony_ci// and 1483cb93a386Sopenharmony_ci// https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf 1484cb93a386Sopenharmony_ci// They're equivalent, but ES' math has been better simplified. 1485cb93a386Sopenharmony_ci// 1486cb93a386Sopenharmony_ci// Anything extra we add beyond that is to make the math work with premul inputs. 1487cb93a386Sopenharmony_ci 1488cb93a386Sopenharmony_ciSI F sat(F r, F g, F b) { return max(r, max(g,b)) - min(r, min(g,b)); } 1489cb93a386Sopenharmony_ciSI F lum(F r, F g, F b) { return r*0.30f + g*0.59f + b*0.11f; } 1490cb93a386Sopenharmony_ci 1491cb93a386Sopenharmony_ciSI void set_sat(F* r, F* g, F* b, F s) { 1492cb93a386Sopenharmony_ci F mn = min(*r, min(*g,*b)), 1493cb93a386Sopenharmony_ci mx = max(*r, max(*g,*b)), 1494cb93a386Sopenharmony_ci sat = mx - mn; 1495cb93a386Sopenharmony_ci 1496cb93a386Sopenharmony_ci // Map min channel to 0, max channel to s, and scale the middle proportionally. 1497cb93a386Sopenharmony_ci auto scale = [=](F c) { 1498cb93a386Sopenharmony_ci return if_then_else(sat == 0, 0, (c - mn) * s / sat); 1499cb93a386Sopenharmony_ci }; 1500cb93a386Sopenharmony_ci *r = scale(*r); 1501cb93a386Sopenharmony_ci *g = scale(*g); 1502cb93a386Sopenharmony_ci *b = scale(*b); 1503cb93a386Sopenharmony_ci} 1504cb93a386Sopenharmony_ciSI void set_lum(F* r, F* g, F* b, F l) { 1505cb93a386Sopenharmony_ci F diff = l - lum(*r, *g, *b); 1506cb93a386Sopenharmony_ci *r += diff; 1507cb93a386Sopenharmony_ci *g += diff; 1508cb93a386Sopenharmony_ci *b += diff; 1509cb93a386Sopenharmony_ci} 1510cb93a386Sopenharmony_ciSI void clip_color(F* r, F* g, F* b, F a) { 1511cb93a386Sopenharmony_ci F mn = min(*r, min(*g, *b)), 1512cb93a386Sopenharmony_ci mx = max(*r, max(*g, *b)), 1513cb93a386Sopenharmony_ci l = lum(*r, *g, *b); 1514cb93a386Sopenharmony_ci 1515cb93a386Sopenharmony_ci auto clip = [=](F c) { 1516cb93a386Sopenharmony_ci c = if_then_else(mn >= 0, c, l + (c - l) * ( l) / (l - mn) ); 1517cb93a386Sopenharmony_ci c = if_then_else(mx > a, l + (c - l) * (a - l) / (mx - l), c); 1518cb93a386Sopenharmony_ci c = max(c, 0); // Sometimes without this we may dip just a little negative. 1519cb93a386Sopenharmony_ci return c; 1520cb93a386Sopenharmony_ci }; 1521cb93a386Sopenharmony_ci *r = clip(*r); 1522cb93a386Sopenharmony_ci *g = clip(*g); 1523cb93a386Sopenharmony_ci *b = clip(*b); 1524cb93a386Sopenharmony_ci} 1525cb93a386Sopenharmony_ci 1526cb93a386Sopenharmony_ciSTAGE(hue, Ctx::None) { 1527cb93a386Sopenharmony_ci F R = r*a, 1528cb93a386Sopenharmony_ci G = g*a, 1529cb93a386Sopenharmony_ci B = b*a; 1530cb93a386Sopenharmony_ci 1531cb93a386Sopenharmony_ci set_sat(&R, &G, &B, sat(dr,dg,db)*a); 1532cb93a386Sopenharmony_ci set_lum(&R, &G, &B, lum(dr,dg,db)*a); 1533cb93a386Sopenharmony_ci clip_color(&R,&G,&B, a*da); 1534cb93a386Sopenharmony_ci 1535cb93a386Sopenharmony_ci r = r*inv(da) + dr*inv(a) + R; 1536cb93a386Sopenharmony_ci g = g*inv(da) + dg*inv(a) + G; 1537cb93a386Sopenharmony_ci b = b*inv(da) + db*inv(a) + B; 1538cb93a386Sopenharmony_ci a = a + da - a*da; 1539cb93a386Sopenharmony_ci} 1540cb93a386Sopenharmony_ciSTAGE(saturation, Ctx::None) { 1541cb93a386Sopenharmony_ci F R = dr*a, 1542cb93a386Sopenharmony_ci G = dg*a, 1543cb93a386Sopenharmony_ci B = db*a; 1544cb93a386Sopenharmony_ci 1545cb93a386Sopenharmony_ci set_sat(&R, &G, &B, sat( r, g, b)*da); 1546cb93a386Sopenharmony_ci set_lum(&R, &G, &B, lum(dr,dg,db)* a); // (This is not redundant.) 1547cb93a386Sopenharmony_ci clip_color(&R,&G,&B, a*da); 1548cb93a386Sopenharmony_ci 1549cb93a386Sopenharmony_ci r = r*inv(da) + dr*inv(a) + R; 1550cb93a386Sopenharmony_ci g = g*inv(da) + dg*inv(a) + G; 1551cb93a386Sopenharmony_ci b = b*inv(da) + db*inv(a) + B; 1552cb93a386Sopenharmony_ci a = a + da - a*da; 1553cb93a386Sopenharmony_ci} 1554cb93a386Sopenharmony_ciSTAGE(color, Ctx::None) { 1555cb93a386Sopenharmony_ci F R = r*da, 1556cb93a386Sopenharmony_ci G = g*da, 1557cb93a386Sopenharmony_ci B = b*da; 1558cb93a386Sopenharmony_ci 1559cb93a386Sopenharmony_ci set_lum(&R, &G, &B, lum(dr,dg,db)*a); 1560cb93a386Sopenharmony_ci clip_color(&R,&G,&B, a*da); 1561cb93a386Sopenharmony_ci 1562cb93a386Sopenharmony_ci r = r*inv(da) + dr*inv(a) + R; 1563cb93a386Sopenharmony_ci g = g*inv(da) + dg*inv(a) + G; 1564cb93a386Sopenharmony_ci b = b*inv(da) + db*inv(a) + B; 1565cb93a386Sopenharmony_ci a = a + da - a*da; 1566cb93a386Sopenharmony_ci} 1567cb93a386Sopenharmony_ciSTAGE(luminosity, Ctx::None) { 1568cb93a386Sopenharmony_ci F R = dr*a, 1569cb93a386Sopenharmony_ci G = dg*a, 1570cb93a386Sopenharmony_ci B = db*a; 1571cb93a386Sopenharmony_ci 1572cb93a386Sopenharmony_ci set_lum(&R, &G, &B, lum(r,g,b)*da); 1573cb93a386Sopenharmony_ci clip_color(&R,&G,&B, a*da); 1574cb93a386Sopenharmony_ci 1575cb93a386Sopenharmony_ci r = r*inv(da) + dr*inv(a) + R; 1576cb93a386Sopenharmony_ci g = g*inv(da) + dg*inv(a) + G; 1577cb93a386Sopenharmony_ci b = b*inv(da) + db*inv(a) + B; 1578cb93a386Sopenharmony_ci a = a + da - a*da; 1579cb93a386Sopenharmony_ci} 1580cb93a386Sopenharmony_ci 1581cb93a386Sopenharmony_ciSTAGE(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) { 1582cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); 1583cb93a386Sopenharmony_ci 1584cb93a386Sopenharmony_ci U32 dst = load<U32>(ptr, tail); 1585cb93a386Sopenharmony_ci dr = cast((dst ) & 0xff); 1586cb93a386Sopenharmony_ci dg = cast((dst >> 8) & 0xff); 1587cb93a386Sopenharmony_ci db = cast((dst >> 16) & 0xff); 1588cb93a386Sopenharmony_ci da = cast((dst >> 24) ); 1589cb93a386Sopenharmony_ci // {dr,dg,db,da} are in [0,255] 1590cb93a386Sopenharmony_ci // { r, g, b, a} are in [0, 1] (but may be out of gamut) 1591cb93a386Sopenharmony_ci 1592cb93a386Sopenharmony_ci r = mad(dr, inv(a), r*255.0f); 1593cb93a386Sopenharmony_ci g = mad(dg, inv(a), g*255.0f); 1594cb93a386Sopenharmony_ci b = mad(db, inv(a), b*255.0f); 1595cb93a386Sopenharmony_ci a = mad(da, inv(a), a*255.0f); 1596cb93a386Sopenharmony_ci // { r, g, b, a} are now in [0,255] (but may be out of gamut) 1597cb93a386Sopenharmony_ci 1598cb93a386Sopenharmony_ci // to_unorm() clamps back to gamut. Scaling by 1 since we're already 255-biased. 1599cb93a386Sopenharmony_ci dst = to_unorm(r, 1, 255) 1600cb93a386Sopenharmony_ci | to_unorm(g, 1, 255) << 8 1601cb93a386Sopenharmony_ci | to_unorm(b, 1, 255) << 16 1602cb93a386Sopenharmony_ci | to_unorm(a, 1, 255) << 24; 1603cb93a386Sopenharmony_ci store(ptr, dst, tail); 1604cb93a386Sopenharmony_ci} 1605cb93a386Sopenharmony_ci 1606cb93a386Sopenharmony_ciSTAGE(clamp_0, Ctx::None) { 1607cb93a386Sopenharmony_ci r = max(r, 0); 1608cb93a386Sopenharmony_ci g = max(g, 0); 1609cb93a386Sopenharmony_ci b = max(b, 0); 1610cb93a386Sopenharmony_ci a = max(a, 0); 1611cb93a386Sopenharmony_ci} 1612cb93a386Sopenharmony_ci 1613cb93a386Sopenharmony_ciSTAGE(clamp_1, Ctx::None) { 1614cb93a386Sopenharmony_ci r = min(r, 1.0f); 1615cb93a386Sopenharmony_ci g = min(g, 1.0f); 1616cb93a386Sopenharmony_ci b = min(b, 1.0f); 1617cb93a386Sopenharmony_ci a = min(a, 1.0f); 1618cb93a386Sopenharmony_ci} 1619cb93a386Sopenharmony_ci 1620cb93a386Sopenharmony_ciSTAGE(clamp_a, Ctx::None) { 1621cb93a386Sopenharmony_ci a = min(a, 1.0f); 1622cb93a386Sopenharmony_ci r = min(r, a); 1623cb93a386Sopenharmony_ci g = min(g, a); 1624cb93a386Sopenharmony_ci b = min(b, a); 1625cb93a386Sopenharmony_ci} 1626cb93a386Sopenharmony_ci 1627cb93a386Sopenharmony_ciSTAGE(clamp_gamut, Ctx::None) { 1628cb93a386Sopenharmony_ci a = min(max(a, 0), 1.0f); 1629cb93a386Sopenharmony_ci r = min(max(r, 0), a); 1630cb93a386Sopenharmony_ci g = min(max(g, 0), a); 1631cb93a386Sopenharmony_ci b = min(max(b, 0), a); 1632cb93a386Sopenharmony_ci} 1633cb93a386Sopenharmony_ci 1634cb93a386Sopenharmony_ciSTAGE(set_rgb, const float* rgb) { 1635cb93a386Sopenharmony_ci r = rgb[0]; 1636cb93a386Sopenharmony_ci g = rgb[1]; 1637cb93a386Sopenharmony_ci b = rgb[2]; 1638cb93a386Sopenharmony_ci} 1639cb93a386Sopenharmony_ciSTAGE(unbounded_set_rgb, const float* rgb) { 1640cb93a386Sopenharmony_ci r = rgb[0]; 1641cb93a386Sopenharmony_ci g = rgb[1]; 1642cb93a386Sopenharmony_ci b = rgb[2]; 1643cb93a386Sopenharmony_ci} 1644cb93a386Sopenharmony_ci 1645cb93a386Sopenharmony_ciSTAGE(swap_rb, Ctx::None) { 1646cb93a386Sopenharmony_ci auto tmp = r; 1647cb93a386Sopenharmony_ci r = b; 1648cb93a386Sopenharmony_ci b = tmp; 1649cb93a386Sopenharmony_ci} 1650cb93a386Sopenharmony_ciSTAGE(swap_rb_dst, Ctx::None) { 1651cb93a386Sopenharmony_ci auto tmp = dr; 1652cb93a386Sopenharmony_ci dr = db; 1653cb93a386Sopenharmony_ci db = tmp; 1654cb93a386Sopenharmony_ci} 1655cb93a386Sopenharmony_ci 1656cb93a386Sopenharmony_ciSTAGE(move_src_dst, Ctx::None) { 1657cb93a386Sopenharmony_ci dr = r; 1658cb93a386Sopenharmony_ci dg = g; 1659cb93a386Sopenharmony_ci db = b; 1660cb93a386Sopenharmony_ci da = a; 1661cb93a386Sopenharmony_ci} 1662cb93a386Sopenharmony_ciSTAGE(move_dst_src, Ctx::None) { 1663cb93a386Sopenharmony_ci r = dr; 1664cb93a386Sopenharmony_ci g = dg; 1665cb93a386Sopenharmony_ci b = db; 1666cb93a386Sopenharmony_ci a = da; 1667cb93a386Sopenharmony_ci} 1668cb93a386Sopenharmony_ciSTAGE(swap_src_dst, Ctx::None) { 1669cb93a386Sopenharmony_ci std::swap(r, dr); 1670cb93a386Sopenharmony_ci std::swap(g, dg); 1671cb93a386Sopenharmony_ci std::swap(b, db); 1672cb93a386Sopenharmony_ci std::swap(a, da); 1673cb93a386Sopenharmony_ci} 1674cb93a386Sopenharmony_ci 1675cb93a386Sopenharmony_ciSTAGE(premul, Ctx::None) { 1676cb93a386Sopenharmony_ci r = r * a; 1677cb93a386Sopenharmony_ci g = g * a; 1678cb93a386Sopenharmony_ci b = b * a; 1679cb93a386Sopenharmony_ci} 1680cb93a386Sopenharmony_ciSTAGE(premul_dst, Ctx::None) { 1681cb93a386Sopenharmony_ci dr = dr * da; 1682cb93a386Sopenharmony_ci dg = dg * da; 1683cb93a386Sopenharmony_ci db = db * da; 1684cb93a386Sopenharmony_ci} 1685cb93a386Sopenharmony_ciSTAGE(unpremul, Ctx::None) { 1686cb93a386Sopenharmony_ci float inf = sk_bit_cast<float>(0x7f800000); 1687cb93a386Sopenharmony_ci auto scale = if_then_else(1.0f/a < inf, 1.0f/a, 0); 1688cb93a386Sopenharmony_ci r *= scale; 1689cb93a386Sopenharmony_ci g *= scale; 1690cb93a386Sopenharmony_ci b *= scale; 1691cb93a386Sopenharmony_ci} 1692cb93a386Sopenharmony_ci 1693cb93a386Sopenharmony_ciSTAGE(force_opaque , Ctx::None) { a = 1; } 1694cb93a386Sopenharmony_ciSTAGE(force_opaque_dst, Ctx::None) { da = 1; } 1695cb93a386Sopenharmony_ci 1696cb93a386Sopenharmony_ci// Clamp x to [0,1], both sides inclusive (think, gradients). 1697cb93a386Sopenharmony_ci// Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN. 1698cb93a386Sopenharmony_ciSI F clamp_01(F v) { return min(max(0, v), 1); } 1699cb93a386Sopenharmony_ci 1700cb93a386Sopenharmony_ciSTAGE(rgb_to_hsl, Ctx::None) { 1701cb93a386Sopenharmony_ci F mx = max(r, max(g,b)), 1702cb93a386Sopenharmony_ci mn = min(r, min(g,b)), 1703cb93a386Sopenharmony_ci d = mx - mn, 1704cb93a386Sopenharmony_ci d_rcp = 1.0f / d; 1705cb93a386Sopenharmony_ci 1706cb93a386Sopenharmony_ci F h = (1/6.0f) * 1707cb93a386Sopenharmony_ci if_then_else(mx == mn, 0, 1708cb93a386Sopenharmony_ci if_then_else(mx == r, (g-b)*d_rcp + if_then_else(g < b, 6.0f, 0), 1709cb93a386Sopenharmony_ci if_then_else(mx == g, (b-r)*d_rcp + 2.0f, 1710cb93a386Sopenharmony_ci (r-g)*d_rcp + 4.0f))); 1711cb93a386Sopenharmony_ci 1712cb93a386Sopenharmony_ci F l = (mx + mn) * 0.5f; 1713cb93a386Sopenharmony_ci F s = if_then_else(mx == mn, 0, 1714cb93a386Sopenharmony_ci d / if_then_else(l > 0.5f, 2.0f-mx-mn, mx+mn)); 1715cb93a386Sopenharmony_ci 1716cb93a386Sopenharmony_ci r = h; 1717cb93a386Sopenharmony_ci g = s; 1718cb93a386Sopenharmony_ci b = l; 1719cb93a386Sopenharmony_ci} 1720cb93a386Sopenharmony_ciSTAGE(hsl_to_rgb, Ctx::None) { 1721cb93a386Sopenharmony_ci // See GrRGBToHSLFilterEffect.fp 1722cb93a386Sopenharmony_ci 1723cb93a386Sopenharmony_ci F h = r, 1724cb93a386Sopenharmony_ci s = g, 1725cb93a386Sopenharmony_ci l = b, 1726cb93a386Sopenharmony_ci c = (1.0f - abs_(2.0f * l - 1)) * s; 1727cb93a386Sopenharmony_ci 1728cb93a386Sopenharmony_ci auto hue_to_rgb = [&](F hue) { 1729cb93a386Sopenharmony_ci F q = clamp_01(abs_(fract(hue) * 6.0f - 3.0f) - 1.0f); 1730cb93a386Sopenharmony_ci return (q - 0.5f) * c + l; 1731cb93a386Sopenharmony_ci }; 1732cb93a386Sopenharmony_ci 1733cb93a386Sopenharmony_ci r = hue_to_rgb(h + 0.0f/3.0f); 1734cb93a386Sopenharmony_ci g = hue_to_rgb(h + 2.0f/3.0f); 1735cb93a386Sopenharmony_ci b = hue_to_rgb(h + 1.0f/3.0f); 1736cb93a386Sopenharmony_ci} 1737cb93a386Sopenharmony_ci 1738cb93a386Sopenharmony_ci// Derive alpha's coverage from rgb coverage and the values of src and dst alpha. 1739cb93a386Sopenharmony_ciSI F alpha_coverage_from_rgb_coverage(F a, F da, F cr, F cg, F cb) { 1740cb93a386Sopenharmony_ci return if_then_else(a < da, min(cr, min(cg,cb)) 1741cb93a386Sopenharmony_ci , max(cr, max(cg,cb))); 1742cb93a386Sopenharmony_ci} 1743cb93a386Sopenharmony_ci 1744cb93a386Sopenharmony_ciSTAGE(scale_1_float, const float* c) { 1745cb93a386Sopenharmony_ci r = r * *c; 1746cb93a386Sopenharmony_ci g = g * *c; 1747cb93a386Sopenharmony_ci b = b * *c; 1748cb93a386Sopenharmony_ci a = a * *c; 1749cb93a386Sopenharmony_ci} 1750cb93a386Sopenharmony_ciSTAGE(scale_u8, const SkRasterPipeline_MemoryCtx* ctx) { 1751cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); 1752cb93a386Sopenharmony_ci 1753cb93a386Sopenharmony_ci auto scales = load<U8>(ptr, tail); 1754cb93a386Sopenharmony_ci auto c = from_byte(scales); 1755cb93a386Sopenharmony_ci 1756cb93a386Sopenharmony_ci r = r * c; 1757cb93a386Sopenharmony_ci g = g * c; 1758cb93a386Sopenharmony_ci b = b * c; 1759cb93a386Sopenharmony_ci a = a * c; 1760cb93a386Sopenharmony_ci} 1761cb93a386Sopenharmony_ciSTAGE(scale_565, const SkRasterPipeline_MemoryCtx* ctx) { 1762cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); 1763cb93a386Sopenharmony_ci 1764cb93a386Sopenharmony_ci F cr,cg,cb; 1765cb93a386Sopenharmony_ci from_565(load<U16>(ptr, tail), &cr, &cg, &cb); 1766cb93a386Sopenharmony_ci 1767cb93a386Sopenharmony_ci F ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb); 1768cb93a386Sopenharmony_ci 1769cb93a386Sopenharmony_ci r = r * cr; 1770cb93a386Sopenharmony_ci g = g * cg; 1771cb93a386Sopenharmony_ci b = b * cb; 1772cb93a386Sopenharmony_ci a = a * ca; 1773cb93a386Sopenharmony_ci} 1774cb93a386Sopenharmony_ci 1775cb93a386Sopenharmony_ciSI F lerp(F from, F to, F t) { 1776cb93a386Sopenharmony_ci return mad(to-from, t, from); 1777cb93a386Sopenharmony_ci} 1778cb93a386Sopenharmony_ci 1779cb93a386Sopenharmony_ciSTAGE(lerp_1_float, const float* c) { 1780cb93a386Sopenharmony_ci r = lerp(dr, r, *c); 1781cb93a386Sopenharmony_ci g = lerp(dg, g, *c); 1782cb93a386Sopenharmony_ci b = lerp(db, b, *c); 1783cb93a386Sopenharmony_ci a = lerp(da, a, *c); 1784cb93a386Sopenharmony_ci} 1785cb93a386Sopenharmony_ciSTAGE(scale_native, const float scales[]) { 1786cb93a386Sopenharmony_ci auto c = sk_unaligned_load<F>(scales); 1787cb93a386Sopenharmony_ci r = r * c; 1788cb93a386Sopenharmony_ci g = g * c; 1789cb93a386Sopenharmony_ci b = b * c; 1790cb93a386Sopenharmony_ci a = a * c; 1791cb93a386Sopenharmony_ci} 1792cb93a386Sopenharmony_ciSTAGE(lerp_native, const float scales[]) { 1793cb93a386Sopenharmony_ci auto c = sk_unaligned_load<F>(scales); 1794cb93a386Sopenharmony_ci r = lerp(dr, r, c); 1795cb93a386Sopenharmony_ci g = lerp(dg, g, c); 1796cb93a386Sopenharmony_ci b = lerp(db, b, c); 1797cb93a386Sopenharmony_ci a = lerp(da, a, c); 1798cb93a386Sopenharmony_ci} 1799cb93a386Sopenharmony_ciSTAGE(lerp_u8, const SkRasterPipeline_MemoryCtx* ctx) { 1800cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); 1801cb93a386Sopenharmony_ci 1802cb93a386Sopenharmony_ci auto scales = load<U8>(ptr, tail); 1803cb93a386Sopenharmony_ci auto c = from_byte(scales); 1804cb93a386Sopenharmony_ci 1805cb93a386Sopenharmony_ci r = lerp(dr, r, c); 1806cb93a386Sopenharmony_ci g = lerp(dg, g, c); 1807cb93a386Sopenharmony_ci b = lerp(db, b, c); 1808cb93a386Sopenharmony_ci a = lerp(da, a, c); 1809cb93a386Sopenharmony_ci} 1810cb93a386Sopenharmony_ciSTAGE(lerp_565, const SkRasterPipeline_MemoryCtx* ctx) { 1811cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); 1812cb93a386Sopenharmony_ci 1813cb93a386Sopenharmony_ci F cr,cg,cb; 1814cb93a386Sopenharmony_ci from_565(load<U16>(ptr, tail), &cr, &cg, &cb); 1815cb93a386Sopenharmony_ci 1816cb93a386Sopenharmony_ci F ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb); 1817cb93a386Sopenharmony_ci 1818cb93a386Sopenharmony_ci r = lerp(dr, r, cr); 1819cb93a386Sopenharmony_ci g = lerp(dg, g, cg); 1820cb93a386Sopenharmony_ci b = lerp(db, b, cb); 1821cb93a386Sopenharmony_ci a = lerp(da, a, ca); 1822cb93a386Sopenharmony_ci} 1823cb93a386Sopenharmony_ci 1824cb93a386Sopenharmony_ciSTAGE(emboss, const SkRasterPipeline_EmbossCtx* ctx) { 1825cb93a386Sopenharmony_ci auto mptr = ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy), 1826cb93a386Sopenharmony_ci aptr = ptr_at_xy<const uint8_t>(&ctx->add, dx,dy); 1827cb93a386Sopenharmony_ci 1828cb93a386Sopenharmony_ci F mul = from_byte(load<U8>(mptr, tail)), 1829cb93a386Sopenharmony_ci add = from_byte(load<U8>(aptr, tail)); 1830cb93a386Sopenharmony_ci 1831cb93a386Sopenharmony_ci r = mad(r, mul, add); 1832cb93a386Sopenharmony_ci g = mad(g, mul, add); 1833cb93a386Sopenharmony_ci b = mad(b, mul, add); 1834cb93a386Sopenharmony_ci} 1835cb93a386Sopenharmony_ci 1836cb93a386Sopenharmony_ciSTAGE(byte_tables, const void* ctx) { 1837cb93a386Sopenharmony_ci struct Tables { const uint8_t *r, *g, *b, *a; }; 1838cb93a386Sopenharmony_ci auto tables = (const Tables*)ctx; 1839cb93a386Sopenharmony_ci 1840cb93a386Sopenharmony_ci r = from_byte(gather(tables->r, to_unorm(r, 255))); 1841cb93a386Sopenharmony_ci g = from_byte(gather(tables->g, to_unorm(g, 255))); 1842cb93a386Sopenharmony_ci b = from_byte(gather(tables->b, to_unorm(b, 255))); 1843cb93a386Sopenharmony_ci a = from_byte(gather(tables->a, to_unorm(a, 255))); 1844cb93a386Sopenharmony_ci} 1845cb93a386Sopenharmony_ci 1846cb93a386Sopenharmony_ciSI F strip_sign(F x, U32* sign) { 1847cb93a386Sopenharmony_ci U32 bits = sk_bit_cast<U32>(x); 1848cb93a386Sopenharmony_ci *sign = bits & 0x80000000; 1849cb93a386Sopenharmony_ci return sk_bit_cast<F>(bits ^ *sign); 1850cb93a386Sopenharmony_ci} 1851cb93a386Sopenharmony_ci 1852cb93a386Sopenharmony_ciSI F apply_sign(F x, U32 sign) { 1853cb93a386Sopenharmony_ci return sk_bit_cast<F>(sign | sk_bit_cast<U32>(x)); 1854cb93a386Sopenharmony_ci} 1855cb93a386Sopenharmony_ci 1856cb93a386Sopenharmony_ciSTAGE(parametric, const skcms_TransferFunction* ctx) { 1857cb93a386Sopenharmony_ci auto fn = [&](F v) { 1858cb93a386Sopenharmony_ci U32 sign; 1859cb93a386Sopenharmony_ci v = strip_sign(v, &sign); 1860cb93a386Sopenharmony_ci 1861cb93a386Sopenharmony_ci F r = if_then_else(v <= ctx->d, mad(ctx->c, v, ctx->f) 1862cb93a386Sopenharmony_ci , approx_powf(mad(ctx->a, v, ctx->b), ctx->g) + ctx->e); 1863cb93a386Sopenharmony_ci return apply_sign(r, sign); 1864cb93a386Sopenharmony_ci }; 1865cb93a386Sopenharmony_ci r = fn(r); 1866cb93a386Sopenharmony_ci g = fn(g); 1867cb93a386Sopenharmony_ci b = fn(b); 1868cb93a386Sopenharmony_ci} 1869cb93a386Sopenharmony_ci 1870cb93a386Sopenharmony_ciSTAGE(gamma_, const float* G) { 1871cb93a386Sopenharmony_ci auto fn = [&](F v) { 1872cb93a386Sopenharmony_ci U32 sign; 1873cb93a386Sopenharmony_ci v = strip_sign(v, &sign); 1874cb93a386Sopenharmony_ci return apply_sign(approx_powf(v, *G), sign); 1875cb93a386Sopenharmony_ci }; 1876cb93a386Sopenharmony_ci r = fn(r); 1877cb93a386Sopenharmony_ci g = fn(g); 1878cb93a386Sopenharmony_ci b = fn(b); 1879cb93a386Sopenharmony_ci} 1880cb93a386Sopenharmony_ci 1881cb93a386Sopenharmony_ciSTAGE(PQish, const skcms_TransferFunction* ctx) { 1882cb93a386Sopenharmony_ci auto fn = [&](F v) { 1883cb93a386Sopenharmony_ci U32 sign; 1884cb93a386Sopenharmony_ci v = strip_sign(v, &sign); 1885cb93a386Sopenharmony_ci 1886cb93a386Sopenharmony_ci F r = approx_powf(max(mad(ctx->b, approx_powf(v, ctx->c), ctx->a), 0) 1887cb93a386Sopenharmony_ci / (mad(ctx->e, approx_powf(v, ctx->c), ctx->d)), 1888cb93a386Sopenharmony_ci ctx->f); 1889cb93a386Sopenharmony_ci 1890cb93a386Sopenharmony_ci return apply_sign(r, sign); 1891cb93a386Sopenharmony_ci }; 1892cb93a386Sopenharmony_ci r = fn(r); 1893cb93a386Sopenharmony_ci g = fn(g); 1894cb93a386Sopenharmony_ci b = fn(b); 1895cb93a386Sopenharmony_ci} 1896cb93a386Sopenharmony_ci 1897cb93a386Sopenharmony_ciSTAGE(HLGish, const skcms_TransferFunction* ctx) { 1898cb93a386Sopenharmony_ci auto fn = [&](F v) { 1899cb93a386Sopenharmony_ci U32 sign; 1900cb93a386Sopenharmony_ci v = strip_sign(v, &sign); 1901cb93a386Sopenharmony_ci 1902cb93a386Sopenharmony_ci const float R = ctx->a, G = ctx->b, 1903cb93a386Sopenharmony_ci a = ctx->c, b = ctx->d, c = ctx->e, 1904cb93a386Sopenharmony_ci K = ctx->f + 1.0f; 1905cb93a386Sopenharmony_ci 1906cb93a386Sopenharmony_ci F r = if_then_else(v*R <= 1, approx_powf(v*R, G) 1907cb93a386Sopenharmony_ci , approx_exp((v-c)*a) + b); 1908cb93a386Sopenharmony_ci 1909cb93a386Sopenharmony_ci return K * apply_sign(r, sign); 1910cb93a386Sopenharmony_ci }; 1911cb93a386Sopenharmony_ci r = fn(r); 1912cb93a386Sopenharmony_ci g = fn(g); 1913cb93a386Sopenharmony_ci b = fn(b); 1914cb93a386Sopenharmony_ci} 1915cb93a386Sopenharmony_ci 1916cb93a386Sopenharmony_ciSTAGE(HLGinvish, const skcms_TransferFunction* ctx) { 1917cb93a386Sopenharmony_ci auto fn = [&](F v) { 1918cb93a386Sopenharmony_ci U32 sign; 1919cb93a386Sopenharmony_ci v = strip_sign(v, &sign); 1920cb93a386Sopenharmony_ci 1921cb93a386Sopenharmony_ci const float R = ctx->a, G = ctx->b, 1922cb93a386Sopenharmony_ci a = ctx->c, b = ctx->d, c = ctx->e, 1923cb93a386Sopenharmony_ci K = ctx->f + 1.0f; 1924cb93a386Sopenharmony_ci 1925cb93a386Sopenharmony_ci v /= K; 1926cb93a386Sopenharmony_ci F r = if_then_else(v <= 1, R * approx_powf(v, G) 1927cb93a386Sopenharmony_ci , a * approx_log(v - b) + c); 1928cb93a386Sopenharmony_ci 1929cb93a386Sopenharmony_ci return apply_sign(r, sign); 1930cb93a386Sopenharmony_ci }; 1931cb93a386Sopenharmony_ci r = fn(r); 1932cb93a386Sopenharmony_ci g = fn(g); 1933cb93a386Sopenharmony_ci b = fn(b); 1934cb93a386Sopenharmony_ci} 1935cb93a386Sopenharmony_ci 1936cb93a386Sopenharmony_ciSTAGE(load_a8, const SkRasterPipeline_MemoryCtx* ctx) { 1937cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); 1938cb93a386Sopenharmony_ci 1939cb93a386Sopenharmony_ci r = g = b = 0.0f; 1940cb93a386Sopenharmony_ci a = from_byte(load<U8>(ptr, tail)); 1941cb93a386Sopenharmony_ci} 1942cb93a386Sopenharmony_ciSTAGE(load_a8_dst, const SkRasterPipeline_MemoryCtx* ctx) { 1943cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy); 1944cb93a386Sopenharmony_ci 1945cb93a386Sopenharmony_ci dr = dg = db = 0.0f; 1946cb93a386Sopenharmony_ci da = from_byte(load<U8>(ptr, tail)); 1947cb93a386Sopenharmony_ci} 1948cb93a386Sopenharmony_ciSTAGE(gather_a8, const SkRasterPipeline_GatherCtx* ctx) { 1949cb93a386Sopenharmony_ci const uint8_t* ptr; 1950cb93a386Sopenharmony_ci U32 ix = ix_and_ptr(&ptr, ctx, r,g); 1951cb93a386Sopenharmony_ci r = g = b = 0.0f; 1952cb93a386Sopenharmony_ci a = from_byte(gather(ptr, ix)); 1953cb93a386Sopenharmony_ci} 1954cb93a386Sopenharmony_ciSTAGE(store_a8, const SkRasterPipeline_MemoryCtx* ctx) { 1955cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<uint8_t>(ctx, dx,dy); 1956cb93a386Sopenharmony_ci 1957cb93a386Sopenharmony_ci U8 packed = pack(pack(to_unorm(a, 255))); 1958cb93a386Sopenharmony_ci store(ptr, packed, tail); 1959cb93a386Sopenharmony_ci} 1960cb93a386Sopenharmony_ci 1961cb93a386Sopenharmony_ciSTAGE(load_565, const SkRasterPipeline_MemoryCtx* ctx) { 1962cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); 1963cb93a386Sopenharmony_ci 1964cb93a386Sopenharmony_ci from_565(load<U16>(ptr, tail), &r,&g,&b); 1965cb93a386Sopenharmony_ci a = 1.0f; 1966cb93a386Sopenharmony_ci} 1967cb93a386Sopenharmony_ciSTAGE(load_565_dst, const SkRasterPipeline_MemoryCtx* ctx) { 1968cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); 1969cb93a386Sopenharmony_ci 1970cb93a386Sopenharmony_ci from_565(load<U16>(ptr, tail), &dr,&dg,&db); 1971cb93a386Sopenharmony_ci da = 1.0f; 1972cb93a386Sopenharmony_ci} 1973cb93a386Sopenharmony_ciSTAGE(gather_565, const SkRasterPipeline_GatherCtx* ctx) { 1974cb93a386Sopenharmony_ci const uint16_t* ptr; 1975cb93a386Sopenharmony_ci U32 ix = ix_and_ptr(&ptr, ctx, r,g); 1976cb93a386Sopenharmony_ci from_565(gather(ptr, ix), &r,&g,&b); 1977cb93a386Sopenharmony_ci a = 1.0f; 1978cb93a386Sopenharmony_ci} 1979cb93a386Sopenharmony_ciSTAGE(store_565, const SkRasterPipeline_MemoryCtx* ctx) { 1980cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy); 1981cb93a386Sopenharmony_ci 1982cb93a386Sopenharmony_ci U16 px = pack( to_unorm(r, 31) << 11 1983cb93a386Sopenharmony_ci | to_unorm(g, 63) << 5 1984cb93a386Sopenharmony_ci | to_unorm(b, 31) ); 1985cb93a386Sopenharmony_ci store(ptr, px, tail); 1986cb93a386Sopenharmony_ci} 1987cb93a386Sopenharmony_ci 1988cb93a386Sopenharmony_ciSTAGE(load_4444, const SkRasterPipeline_MemoryCtx* ctx) { 1989cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); 1990cb93a386Sopenharmony_ci from_4444(load<U16>(ptr, tail), &r,&g,&b,&a); 1991cb93a386Sopenharmony_ci} 1992cb93a386Sopenharmony_ciSTAGE(load_4444_dst, const SkRasterPipeline_MemoryCtx* ctx) { 1993cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); 1994cb93a386Sopenharmony_ci from_4444(load<U16>(ptr, tail), &dr,&dg,&db,&da); 1995cb93a386Sopenharmony_ci} 1996cb93a386Sopenharmony_ciSTAGE(gather_4444, const SkRasterPipeline_GatherCtx* ctx) { 1997cb93a386Sopenharmony_ci const uint16_t* ptr; 1998cb93a386Sopenharmony_ci U32 ix = ix_and_ptr(&ptr, ctx, r,g); 1999cb93a386Sopenharmony_ci from_4444(gather(ptr, ix), &r,&g,&b,&a); 2000cb93a386Sopenharmony_ci} 2001cb93a386Sopenharmony_ciSTAGE(store_4444, const SkRasterPipeline_MemoryCtx* ctx) { 2002cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy); 2003cb93a386Sopenharmony_ci U16 px = pack( to_unorm(r, 15) << 12 2004cb93a386Sopenharmony_ci | to_unorm(g, 15) << 8 2005cb93a386Sopenharmony_ci | to_unorm(b, 15) << 4 2006cb93a386Sopenharmony_ci | to_unorm(a, 15) ); 2007cb93a386Sopenharmony_ci store(ptr, px, tail); 2008cb93a386Sopenharmony_ci} 2009cb93a386Sopenharmony_ci 2010cb93a386Sopenharmony_ciSTAGE(load_8888, const SkRasterPipeline_MemoryCtx* ctx) { 2011cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); 2012cb93a386Sopenharmony_ci from_8888(load<U32>(ptr, tail), &r,&g,&b,&a); 2013cb93a386Sopenharmony_ci} 2014cb93a386Sopenharmony_ciSTAGE(load_8888_dst, const SkRasterPipeline_MemoryCtx* ctx) { 2015cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); 2016cb93a386Sopenharmony_ci from_8888(load<U32>(ptr, tail), &dr,&dg,&db,&da); 2017cb93a386Sopenharmony_ci} 2018cb93a386Sopenharmony_ciSTAGE(gather_8888, const SkRasterPipeline_GatherCtx* ctx) { 2019cb93a386Sopenharmony_ci const uint32_t* ptr; 2020cb93a386Sopenharmony_ci U32 ix = ix_and_ptr(&ptr, ctx, r,g); 2021cb93a386Sopenharmony_ci from_8888(gather(ptr, ix), &r,&g,&b,&a); 2022cb93a386Sopenharmony_ci} 2023cb93a386Sopenharmony_ciSTAGE(store_8888, const SkRasterPipeline_MemoryCtx* ctx) { 2024cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); 2025cb93a386Sopenharmony_ci 2026cb93a386Sopenharmony_ci U32 px = to_unorm(r, 255) 2027cb93a386Sopenharmony_ci | to_unorm(g, 255) << 8 2028cb93a386Sopenharmony_ci | to_unorm(b, 255) << 16 2029cb93a386Sopenharmony_ci | to_unorm(a, 255) << 24; 2030cb93a386Sopenharmony_ci store(ptr, px, tail); 2031cb93a386Sopenharmony_ci} 2032cb93a386Sopenharmony_ci 2033cb93a386Sopenharmony_ciSTAGE(load_rg88, const SkRasterPipeline_MemoryCtx* ctx) { 2034cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy); 2035cb93a386Sopenharmony_ci from_88(load<U16>(ptr, tail), &r, &g); 2036cb93a386Sopenharmony_ci b = 0; 2037cb93a386Sopenharmony_ci a = 1; 2038cb93a386Sopenharmony_ci} 2039cb93a386Sopenharmony_ciSTAGE(load_rg88_dst, const SkRasterPipeline_MemoryCtx* ctx) { 2040cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy); 2041cb93a386Sopenharmony_ci from_88(load<U16>(ptr, tail), &dr, &dg); 2042cb93a386Sopenharmony_ci db = 0; 2043cb93a386Sopenharmony_ci da = 1; 2044cb93a386Sopenharmony_ci} 2045cb93a386Sopenharmony_ciSTAGE(gather_rg88, const SkRasterPipeline_GatherCtx* ctx) { 2046cb93a386Sopenharmony_ci const uint16_t* ptr; 2047cb93a386Sopenharmony_ci U32 ix = ix_and_ptr(&ptr, ctx, r, g); 2048cb93a386Sopenharmony_ci from_88(gather(ptr, ix), &r, &g); 2049cb93a386Sopenharmony_ci b = 0; 2050cb93a386Sopenharmony_ci a = 1; 2051cb93a386Sopenharmony_ci} 2052cb93a386Sopenharmony_ciSTAGE(store_rg88, const SkRasterPipeline_MemoryCtx* ctx) { 2053cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<uint16_t>(ctx, dx, dy); 2054cb93a386Sopenharmony_ci U16 px = pack( to_unorm(r, 255) | to_unorm(g, 255) << 8 ); 2055cb93a386Sopenharmony_ci store(ptr, px, tail); 2056cb93a386Sopenharmony_ci} 2057cb93a386Sopenharmony_ci 2058cb93a386Sopenharmony_ciSTAGE(load_a16, const SkRasterPipeline_MemoryCtx* ctx) { 2059cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); 2060cb93a386Sopenharmony_ci r = g = b = 0; 2061cb93a386Sopenharmony_ci a = from_short(load<U16>(ptr, tail)); 2062cb93a386Sopenharmony_ci} 2063cb93a386Sopenharmony_ciSTAGE(load_a16_dst, const SkRasterPipeline_MemoryCtx* ctx) { 2064cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy); 2065cb93a386Sopenharmony_ci dr = dg = db = 0.0f; 2066cb93a386Sopenharmony_ci da = from_short(load<U16>(ptr, tail)); 2067cb93a386Sopenharmony_ci} 2068cb93a386Sopenharmony_ciSTAGE(gather_a16, const SkRasterPipeline_GatherCtx* ctx) { 2069cb93a386Sopenharmony_ci const uint16_t* ptr; 2070cb93a386Sopenharmony_ci U32 ix = ix_and_ptr(&ptr, ctx, r, g); 2071cb93a386Sopenharmony_ci r = g = b = 0.0f; 2072cb93a386Sopenharmony_ci a = from_short(gather(ptr, ix)); 2073cb93a386Sopenharmony_ci} 2074cb93a386Sopenharmony_ciSTAGE(store_a16, const SkRasterPipeline_MemoryCtx* ctx) { 2075cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy); 2076cb93a386Sopenharmony_ci 2077cb93a386Sopenharmony_ci U16 px = pack(to_unorm(a, 65535)); 2078cb93a386Sopenharmony_ci store(ptr, px, tail); 2079cb93a386Sopenharmony_ci} 2080cb93a386Sopenharmony_ci 2081cb93a386Sopenharmony_ciSTAGE(load_rg1616, const SkRasterPipeline_MemoryCtx* ctx) { 2082cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy); 2083cb93a386Sopenharmony_ci b = 0; a = 1; 2084cb93a386Sopenharmony_ci from_1616(load<U32>(ptr, tail), &r,&g); 2085cb93a386Sopenharmony_ci} 2086cb93a386Sopenharmony_ciSTAGE(load_rg1616_dst, const SkRasterPipeline_MemoryCtx* ctx) { 2087cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy); 2088cb93a386Sopenharmony_ci from_1616(load<U32>(ptr, tail), &dr, &dg); 2089cb93a386Sopenharmony_ci db = 0; 2090cb93a386Sopenharmony_ci da = 1; 2091cb93a386Sopenharmony_ci} 2092cb93a386Sopenharmony_ciSTAGE(gather_rg1616, const SkRasterPipeline_GatherCtx* ctx) { 2093cb93a386Sopenharmony_ci const uint32_t* ptr; 2094cb93a386Sopenharmony_ci U32 ix = ix_and_ptr(&ptr, ctx, r, g); 2095cb93a386Sopenharmony_ci from_1616(gather(ptr, ix), &r, &g); 2096cb93a386Sopenharmony_ci b = 0; 2097cb93a386Sopenharmony_ci a = 1; 2098cb93a386Sopenharmony_ci} 2099cb93a386Sopenharmony_ciSTAGE(store_rg1616, const SkRasterPipeline_MemoryCtx* ctx) { 2100cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); 2101cb93a386Sopenharmony_ci 2102cb93a386Sopenharmony_ci U32 px = to_unorm(r, 65535) 2103cb93a386Sopenharmony_ci | to_unorm(g, 65535) << 16; 2104cb93a386Sopenharmony_ci store(ptr, px, tail); 2105cb93a386Sopenharmony_ci} 2106cb93a386Sopenharmony_ci 2107cb93a386Sopenharmony_ciSTAGE(load_16161616, const SkRasterPipeline_MemoryCtx* ctx) { 2108cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy); 2109cb93a386Sopenharmony_ci from_16161616(load<U64>(ptr, tail), &r,&g, &b, &a); 2110cb93a386Sopenharmony_ci} 2111cb93a386Sopenharmony_ciSTAGE(load_16161616_dst, const SkRasterPipeline_MemoryCtx* ctx) { 2112cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy); 2113cb93a386Sopenharmony_ci from_16161616(load<U64>(ptr, tail), &dr, &dg, &db, &da); 2114cb93a386Sopenharmony_ci} 2115cb93a386Sopenharmony_ciSTAGE(gather_16161616, const SkRasterPipeline_GatherCtx* ctx) { 2116cb93a386Sopenharmony_ci const uint64_t* ptr; 2117cb93a386Sopenharmony_ci U32 ix = ix_and_ptr(&ptr, ctx, r, g); 2118cb93a386Sopenharmony_ci from_16161616(gather(ptr, ix), &r, &g, &b, &a); 2119cb93a386Sopenharmony_ci} 2120cb93a386Sopenharmony_ciSTAGE(store_16161616, const SkRasterPipeline_MemoryCtx* ctx) { 2121cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,4*dy); 2122cb93a386Sopenharmony_ci 2123cb93a386Sopenharmony_ci U16 R = pack(to_unorm(r, 65535)), 2124cb93a386Sopenharmony_ci G = pack(to_unorm(g, 65535)), 2125cb93a386Sopenharmony_ci B = pack(to_unorm(b, 65535)), 2126cb93a386Sopenharmony_ci A = pack(to_unorm(a, 65535)); 2127cb93a386Sopenharmony_ci 2128cb93a386Sopenharmony_ci store4(ptr,tail, R,G,B,A); 2129cb93a386Sopenharmony_ci} 2130cb93a386Sopenharmony_ci 2131cb93a386Sopenharmony_ci 2132cb93a386Sopenharmony_ciSTAGE(load_1010102, const SkRasterPipeline_MemoryCtx* ctx) { 2133cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); 2134cb93a386Sopenharmony_ci from_1010102(load<U32>(ptr, tail), &r,&g,&b,&a); 2135cb93a386Sopenharmony_ci} 2136cb93a386Sopenharmony_ciSTAGE(load_1010102_dst, const SkRasterPipeline_MemoryCtx* ctx) { 2137cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy); 2138cb93a386Sopenharmony_ci from_1010102(load<U32>(ptr, tail), &dr,&dg,&db,&da); 2139cb93a386Sopenharmony_ci} 2140cb93a386Sopenharmony_ciSTAGE(gather_1010102, const SkRasterPipeline_GatherCtx* ctx) { 2141cb93a386Sopenharmony_ci const uint32_t* ptr; 2142cb93a386Sopenharmony_ci U32 ix = ix_and_ptr(&ptr, ctx, r,g); 2143cb93a386Sopenharmony_ci from_1010102(gather(ptr, ix), &r,&g,&b,&a); 2144cb93a386Sopenharmony_ci} 2145cb93a386Sopenharmony_ciSTAGE(store_1010102, const SkRasterPipeline_MemoryCtx* ctx) { 2146cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); 2147cb93a386Sopenharmony_ci 2148cb93a386Sopenharmony_ci U32 px = to_unorm(r, 1023) 2149cb93a386Sopenharmony_ci | to_unorm(g, 1023) << 10 2150cb93a386Sopenharmony_ci | to_unorm(b, 1023) << 20 2151cb93a386Sopenharmony_ci | to_unorm(a, 3) << 30; 2152cb93a386Sopenharmony_ci store(ptr, px, tail); 2153cb93a386Sopenharmony_ci} 2154cb93a386Sopenharmony_ci 2155cb93a386Sopenharmony_ciSTAGE(load_f16, const SkRasterPipeline_MemoryCtx* ctx) { 2156cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy); 2157cb93a386Sopenharmony_ci 2158cb93a386Sopenharmony_ci U16 R,G,B,A; 2159cb93a386Sopenharmony_ci load4((const uint16_t*)ptr,tail, &R,&G,&B,&A); 2160cb93a386Sopenharmony_ci r = from_half(R); 2161cb93a386Sopenharmony_ci g = from_half(G); 2162cb93a386Sopenharmony_ci b = from_half(B); 2163cb93a386Sopenharmony_ci a = from_half(A); 2164cb93a386Sopenharmony_ci} 2165cb93a386Sopenharmony_ciSTAGE(load_f16_dst, const SkRasterPipeline_MemoryCtx* ctx) { 2166cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy); 2167cb93a386Sopenharmony_ci 2168cb93a386Sopenharmony_ci U16 R,G,B,A; 2169cb93a386Sopenharmony_ci load4((const uint16_t*)ptr,tail, &R,&G,&B,&A); 2170cb93a386Sopenharmony_ci dr = from_half(R); 2171cb93a386Sopenharmony_ci dg = from_half(G); 2172cb93a386Sopenharmony_ci db = from_half(B); 2173cb93a386Sopenharmony_ci da = from_half(A); 2174cb93a386Sopenharmony_ci} 2175cb93a386Sopenharmony_ciSTAGE(gather_f16, const SkRasterPipeline_GatherCtx* ctx) { 2176cb93a386Sopenharmony_ci const uint64_t* ptr; 2177cb93a386Sopenharmony_ci U32 ix = ix_and_ptr(&ptr, ctx, r,g); 2178cb93a386Sopenharmony_ci auto px = gather(ptr, ix); 2179cb93a386Sopenharmony_ci 2180cb93a386Sopenharmony_ci U16 R,G,B,A; 2181cb93a386Sopenharmony_ci load4((const uint16_t*)&px,0, &R,&G,&B,&A); 2182cb93a386Sopenharmony_ci r = from_half(R); 2183cb93a386Sopenharmony_ci g = from_half(G); 2184cb93a386Sopenharmony_ci b = from_half(B); 2185cb93a386Sopenharmony_ci a = from_half(A); 2186cb93a386Sopenharmony_ci} 2187cb93a386Sopenharmony_ciSTAGE(store_f16, const SkRasterPipeline_MemoryCtx* ctx) { 2188cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<uint64_t>(ctx, dx,dy); 2189cb93a386Sopenharmony_ci store4((uint16_t*)ptr,tail, to_half(r) 2190cb93a386Sopenharmony_ci , to_half(g) 2191cb93a386Sopenharmony_ci , to_half(b) 2192cb93a386Sopenharmony_ci , to_half(a)); 2193cb93a386Sopenharmony_ci} 2194cb93a386Sopenharmony_ci 2195cb93a386Sopenharmony_ciSTAGE(store_u16_be, const SkRasterPipeline_MemoryCtx* ctx) { 2196cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,dy); 2197cb93a386Sopenharmony_ci 2198cb93a386Sopenharmony_ci U16 R = bswap(pack(to_unorm(r, 65535))), 2199cb93a386Sopenharmony_ci G = bswap(pack(to_unorm(g, 65535))), 2200cb93a386Sopenharmony_ci B = bswap(pack(to_unorm(b, 65535))), 2201cb93a386Sopenharmony_ci A = bswap(pack(to_unorm(a, 65535))); 2202cb93a386Sopenharmony_ci 2203cb93a386Sopenharmony_ci store4(ptr,tail, R,G,B,A); 2204cb93a386Sopenharmony_ci} 2205cb93a386Sopenharmony_ci 2206cb93a386Sopenharmony_ciSTAGE(load_af16, const SkRasterPipeline_MemoryCtx* ctx) { 2207cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy); 2208cb93a386Sopenharmony_ci 2209cb93a386Sopenharmony_ci U16 A = load<U16>((const uint16_t*)ptr, tail); 2210cb93a386Sopenharmony_ci r = 0; 2211cb93a386Sopenharmony_ci g = 0; 2212cb93a386Sopenharmony_ci b = 0; 2213cb93a386Sopenharmony_ci a = from_half(A); 2214cb93a386Sopenharmony_ci} 2215cb93a386Sopenharmony_ciSTAGE(load_af16_dst, const SkRasterPipeline_MemoryCtx* ctx) { 2216cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy); 2217cb93a386Sopenharmony_ci 2218cb93a386Sopenharmony_ci U16 A = load<U16>((const uint16_t*)ptr, tail); 2219cb93a386Sopenharmony_ci dr = dg = db = 0.0f; 2220cb93a386Sopenharmony_ci da = from_half(A); 2221cb93a386Sopenharmony_ci} 2222cb93a386Sopenharmony_ciSTAGE(gather_af16, const SkRasterPipeline_GatherCtx* ctx) { 2223cb93a386Sopenharmony_ci const uint16_t* ptr; 2224cb93a386Sopenharmony_ci U32 ix = ix_and_ptr(&ptr, ctx, r, g); 2225cb93a386Sopenharmony_ci r = g = b = 0.0f; 2226cb93a386Sopenharmony_ci a = from_half(gather(ptr, ix)); 2227cb93a386Sopenharmony_ci} 2228cb93a386Sopenharmony_ciSTAGE(store_af16, const SkRasterPipeline_MemoryCtx* ctx) { 2229cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy); 2230cb93a386Sopenharmony_ci store(ptr, to_half(a), tail); 2231cb93a386Sopenharmony_ci} 2232cb93a386Sopenharmony_ci 2233cb93a386Sopenharmony_ciSTAGE(load_rgf16, const SkRasterPipeline_MemoryCtx* ctx) { 2234cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy); 2235cb93a386Sopenharmony_ci 2236cb93a386Sopenharmony_ci U16 R,G; 2237cb93a386Sopenharmony_ci load2((const uint16_t*)ptr, tail, &R, &G); 2238cb93a386Sopenharmony_ci r = from_half(R); 2239cb93a386Sopenharmony_ci g = from_half(G); 2240cb93a386Sopenharmony_ci b = 0; 2241cb93a386Sopenharmony_ci a = 1; 2242cb93a386Sopenharmony_ci} 2243cb93a386Sopenharmony_ciSTAGE(load_rgf16_dst, const SkRasterPipeline_MemoryCtx* ctx) { 2244cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy); 2245cb93a386Sopenharmony_ci 2246cb93a386Sopenharmony_ci U16 R,G; 2247cb93a386Sopenharmony_ci load2((const uint16_t*)ptr, tail, &R, &G); 2248cb93a386Sopenharmony_ci dr = from_half(R); 2249cb93a386Sopenharmony_ci dg = from_half(G); 2250cb93a386Sopenharmony_ci db = 0; 2251cb93a386Sopenharmony_ci da = 1; 2252cb93a386Sopenharmony_ci} 2253cb93a386Sopenharmony_ciSTAGE(gather_rgf16, const SkRasterPipeline_GatherCtx* ctx) { 2254cb93a386Sopenharmony_ci const uint32_t* ptr; 2255cb93a386Sopenharmony_ci U32 ix = ix_and_ptr(&ptr, ctx, r, g); 2256cb93a386Sopenharmony_ci auto px = gather(ptr, ix); 2257cb93a386Sopenharmony_ci 2258cb93a386Sopenharmony_ci U16 R,G; 2259cb93a386Sopenharmony_ci load2((const uint16_t*)&px, 0, &R, &G); 2260cb93a386Sopenharmony_ci r = from_half(R); 2261cb93a386Sopenharmony_ci g = from_half(G); 2262cb93a386Sopenharmony_ci b = 0; 2263cb93a386Sopenharmony_ci a = 1; 2264cb93a386Sopenharmony_ci} 2265cb93a386Sopenharmony_ciSTAGE(store_rgf16, const SkRasterPipeline_MemoryCtx* ctx) { 2266cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<uint32_t>(ctx, dx, dy); 2267cb93a386Sopenharmony_ci store2((uint16_t*)ptr, tail, to_half(r) 2268cb93a386Sopenharmony_ci , to_half(g)); 2269cb93a386Sopenharmony_ci} 2270cb93a386Sopenharmony_ci 2271cb93a386Sopenharmony_ciSTAGE(load_f32, const SkRasterPipeline_MemoryCtx* ctx) { 2272cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const float>(ctx, 4*dx,4*dy); 2273cb93a386Sopenharmony_ci load4(ptr,tail, &r,&g,&b,&a); 2274cb93a386Sopenharmony_ci} 2275cb93a386Sopenharmony_ciSTAGE(load_f32_dst, const SkRasterPipeline_MemoryCtx* ctx) { 2276cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const float>(ctx, 4*dx,4*dy); 2277cb93a386Sopenharmony_ci load4(ptr,tail, &dr,&dg,&db,&da); 2278cb93a386Sopenharmony_ci} 2279cb93a386Sopenharmony_ciSTAGE(gather_f32, const SkRasterPipeline_GatherCtx* ctx) { 2280cb93a386Sopenharmony_ci const float* ptr; 2281cb93a386Sopenharmony_ci U32 ix = ix_and_ptr(&ptr, ctx, r,g); 2282cb93a386Sopenharmony_ci r = gather(ptr, 4*ix + 0); 2283cb93a386Sopenharmony_ci g = gather(ptr, 4*ix + 1); 2284cb93a386Sopenharmony_ci b = gather(ptr, 4*ix + 2); 2285cb93a386Sopenharmony_ci a = gather(ptr, 4*ix + 3); 2286cb93a386Sopenharmony_ci} 2287cb93a386Sopenharmony_ciSTAGE(store_f32, const SkRasterPipeline_MemoryCtx* ctx) { 2288cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<float>(ctx, 4*dx,4*dy); 2289cb93a386Sopenharmony_ci store4(ptr,tail, r,g,b,a); 2290cb93a386Sopenharmony_ci} 2291cb93a386Sopenharmony_ci 2292cb93a386Sopenharmony_ciSTAGE(load_rgf32, const SkRasterPipeline_MemoryCtx* ctx) { 2293cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<const float>(ctx, 2*dx,2*dy); 2294cb93a386Sopenharmony_ci load2(ptr, tail, &r, &g); 2295cb93a386Sopenharmony_ci b = 0; 2296cb93a386Sopenharmony_ci a = 1; 2297cb93a386Sopenharmony_ci} 2298cb93a386Sopenharmony_ciSTAGE(store_rgf32, const SkRasterPipeline_MemoryCtx* ctx) { 2299cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<float>(ctx, 2*dx,2*dy); 2300cb93a386Sopenharmony_ci store2(ptr, tail, r, g); 2301cb93a386Sopenharmony_ci} 2302cb93a386Sopenharmony_ci 2303cb93a386Sopenharmony_ciSI F exclusive_repeat(F v, const SkRasterPipeline_TileCtx* ctx) { 2304cb93a386Sopenharmony_ci return v - floor_(v*ctx->invScale)*ctx->scale; 2305cb93a386Sopenharmony_ci} 2306cb93a386Sopenharmony_ciSI F exclusive_mirror(F v, const SkRasterPipeline_TileCtx* ctx) { 2307cb93a386Sopenharmony_ci auto limit = ctx->scale; 2308cb93a386Sopenharmony_ci auto invLimit = ctx->invScale; 2309cb93a386Sopenharmony_ci return abs_( (v-limit) - (limit+limit)*floor_((v-limit)*(invLimit*0.5f)) - limit ); 2310cb93a386Sopenharmony_ci} 2311cb93a386Sopenharmony_ci// Tile x or y to [0,limit) == [0,limit - 1 ulp] (think, sampling from images). 2312cb93a386Sopenharmony_ci// The gather stages will hard clamp the output of these stages to [0,limit)... 2313cb93a386Sopenharmony_ci// we just need to do the basic repeat or mirroring. 2314cb93a386Sopenharmony_ciSTAGE(repeat_x, const SkRasterPipeline_TileCtx* ctx) { r = exclusive_repeat(r, ctx); } 2315cb93a386Sopenharmony_ciSTAGE(repeat_y, const SkRasterPipeline_TileCtx* ctx) { g = exclusive_repeat(g, ctx); } 2316cb93a386Sopenharmony_ciSTAGE(mirror_x, const SkRasterPipeline_TileCtx* ctx) { r = exclusive_mirror(r, ctx); } 2317cb93a386Sopenharmony_ciSTAGE(mirror_y, const SkRasterPipeline_TileCtx* ctx) { g = exclusive_mirror(g, ctx); } 2318cb93a386Sopenharmony_ci 2319cb93a386Sopenharmony_ciSTAGE( clamp_x_1, Ctx::None) { r = clamp_01(r); } 2320cb93a386Sopenharmony_ciSTAGE(repeat_x_1, Ctx::None) { r = clamp_01(r - floor_(r)); } 2321cb93a386Sopenharmony_ciSTAGE(mirror_x_1, Ctx::None) { r = clamp_01(abs_( (r-1.0f) - two(floor_((r-1.0f)*0.5f)) - 1.0f )); } 2322cb93a386Sopenharmony_ci 2323cb93a386Sopenharmony_ci// Decal stores a 32bit mask after checking the coordinate (x and/or y) against its domain: 2324cb93a386Sopenharmony_ci// mask == 0x00000000 if the coordinate(s) are out of bounds 2325cb93a386Sopenharmony_ci// mask == 0xFFFFFFFF if the coordinate(s) are in bounds 2326cb93a386Sopenharmony_ci// After the gather stage, the r,g,b,a values are AND'd with this mask, setting them to 0 2327cb93a386Sopenharmony_ci// if either of the coordinates were out of bounds. 2328cb93a386Sopenharmony_ci 2329cb93a386Sopenharmony_ciSTAGE(decal_x, SkRasterPipeline_DecalTileCtx* ctx) { 2330cb93a386Sopenharmony_ci auto w = ctx->limit_x; 2331cb93a386Sopenharmony_ci sk_unaligned_store(ctx->mask, cond_to_mask((0 <= r) & (r < w))); 2332cb93a386Sopenharmony_ci} 2333cb93a386Sopenharmony_ciSTAGE(decal_y, SkRasterPipeline_DecalTileCtx* ctx) { 2334cb93a386Sopenharmony_ci auto h = ctx->limit_y; 2335cb93a386Sopenharmony_ci sk_unaligned_store(ctx->mask, cond_to_mask((0 <= g) & (g < h))); 2336cb93a386Sopenharmony_ci} 2337cb93a386Sopenharmony_ciSTAGE(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) { 2338cb93a386Sopenharmony_ci auto w = ctx->limit_x; 2339cb93a386Sopenharmony_ci auto h = ctx->limit_y; 2340cb93a386Sopenharmony_ci sk_unaligned_store(ctx->mask, 2341cb93a386Sopenharmony_ci cond_to_mask((0 <= r) & (r < w) & (0 <= g) & (g < h))); 2342cb93a386Sopenharmony_ci} 2343cb93a386Sopenharmony_ciSTAGE(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) { 2344cb93a386Sopenharmony_ci auto mask = sk_unaligned_load<U32>(ctx->mask); 2345cb93a386Sopenharmony_ci r = sk_bit_cast<F>(sk_bit_cast<U32>(r) & mask); 2346cb93a386Sopenharmony_ci g = sk_bit_cast<F>(sk_bit_cast<U32>(g) & mask); 2347cb93a386Sopenharmony_ci b = sk_bit_cast<F>(sk_bit_cast<U32>(b) & mask); 2348cb93a386Sopenharmony_ci a = sk_bit_cast<F>(sk_bit_cast<U32>(a) & mask); 2349cb93a386Sopenharmony_ci} 2350cb93a386Sopenharmony_ci 2351cb93a386Sopenharmony_ciSTAGE(alpha_to_gray, Ctx::None) { 2352cb93a386Sopenharmony_ci r = g = b = a; 2353cb93a386Sopenharmony_ci a = 1; 2354cb93a386Sopenharmony_ci} 2355cb93a386Sopenharmony_ciSTAGE(alpha_to_gray_dst, Ctx::None) { 2356cb93a386Sopenharmony_ci dr = dg = db = da; 2357cb93a386Sopenharmony_ci da = 1; 2358cb93a386Sopenharmony_ci} 2359cb93a386Sopenharmony_ciSTAGE(bt709_luminance_or_luma_to_alpha, Ctx::None) { 2360cb93a386Sopenharmony_ci a = r*0.2126f + g*0.7152f + b*0.0722f; 2361cb93a386Sopenharmony_ci r = g = b = 0; 2362cb93a386Sopenharmony_ci} 2363cb93a386Sopenharmony_ciSTAGE(bt709_luminance_or_luma_to_rgb, Ctx::None) { 2364cb93a386Sopenharmony_ci r = g = b = r*0.2126f + g*0.7152f + b*0.0722f; 2365cb93a386Sopenharmony_ci} 2366cb93a386Sopenharmony_ci 2367cb93a386Sopenharmony_ciSTAGE(matrix_translate, const float* m) { 2368cb93a386Sopenharmony_ci r += m[0]; 2369cb93a386Sopenharmony_ci g += m[1]; 2370cb93a386Sopenharmony_ci} 2371cb93a386Sopenharmony_ciSTAGE(matrix_scale_translate, const float* m) { 2372cb93a386Sopenharmony_ci r = mad(r,m[0], m[2]); 2373cb93a386Sopenharmony_ci g = mad(g,m[1], m[3]); 2374cb93a386Sopenharmony_ci} 2375cb93a386Sopenharmony_ciSTAGE(matrix_2x3, const float* m) { 2376cb93a386Sopenharmony_ci auto R = mad(r,m[0], mad(g,m[1], m[2])), 2377cb93a386Sopenharmony_ci G = mad(r,m[3], mad(g,m[4], m[5])); 2378cb93a386Sopenharmony_ci r = R; 2379cb93a386Sopenharmony_ci g = G; 2380cb93a386Sopenharmony_ci} 2381cb93a386Sopenharmony_ciSTAGE(matrix_3x3, const float* m) { 2382cb93a386Sopenharmony_ci auto R = mad(r,m[0], mad(g,m[3], b*m[6])), 2383cb93a386Sopenharmony_ci G = mad(r,m[1], mad(g,m[4], b*m[7])), 2384cb93a386Sopenharmony_ci B = mad(r,m[2], mad(g,m[5], b*m[8])); 2385cb93a386Sopenharmony_ci r = R; 2386cb93a386Sopenharmony_ci g = G; 2387cb93a386Sopenharmony_ci b = B; 2388cb93a386Sopenharmony_ci} 2389cb93a386Sopenharmony_ciSTAGE(matrix_3x4, const float* m) { 2390cb93a386Sopenharmony_ci auto R = mad(r,m[0], mad(g,m[3], mad(b,m[6], m[ 9]))), 2391cb93a386Sopenharmony_ci G = mad(r,m[1], mad(g,m[4], mad(b,m[7], m[10]))), 2392cb93a386Sopenharmony_ci B = mad(r,m[2], mad(g,m[5], mad(b,m[8], m[11]))); 2393cb93a386Sopenharmony_ci r = R; 2394cb93a386Sopenharmony_ci g = G; 2395cb93a386Sopenharmony_ci b = B; 2396cb93a386Sopenharmony_ci} 2397cb93a386Sopenharmony_ciSTAGE(matrix_4x5, const float* m) { 2398cb93a386Sopenharmony_ci auto R = mad(r,m[ 0], mad(g,m[ 1], mad(b,m[ 2], mad(a,m[ 3], m[ 4])))), 2399cb93a386Sopenharmony_ci G = mad(r,m[ 5], mad(g,m[ 6], mad(b,m[ 7], mad(a,m[ 8], m[ 9])))), 2400cb93a386Sopenharmony_ci B = mad(r,m[10], mad(g,m[11], mad(b,m[12], mad(a,m[13], m[14])))), 2401cb93a386Sopenharmony_ci A = mad(r,m[15], mad(g,m[16], mad(b,m[17], mad(a,m[18], m[19])))); 2402cb93a386Sopenharmony_ci r = R; 2403cb93a386Sopenharmony_ci g = G; 2404cb93a386Sopenharmony_ci b = B; 2405cb93a386Sopenharmony_ci a = A; 2406cb93a386Sopenharmony_ci} 2407cb93a386Sopenharmony_ciSTAGE(matrix_4x3, const float* m) { 2408cb93a386Sopenharmony_ci auto X = r, 2409cb93a386Sopenharmony_ci Y = g; 2410cb93a386Sopenharmony_ci 2411cb93a386Sopenharmony_ci r = mad(X, m[0], mad(Y, m[4], m[ 8])); 2412cb93a386Sopenharmony_ci g = mad(X, m[1], mad(Y, m[5], m[ 9])); 2413cb93a386Sopenharmony_ci b = mad(X, m[2], mad(Y, m[6], m[10])); 2414cb93a386Sopenharmony_ci a = mad(X, m[3], mad(Y, m[7], m[11])); 2415cb93a386Sopenharmony_ci} 2416cb93a386Sopenharmony_ciSTAGE(matrix_perspective, const float* m) { 2417cb93a386Sopenharmony_ci // N.B. Unlike the other matrix_ stages, this matrix is row-major. 2418cb93a386Sopenharmony_ci auto R = mad(r,m[0], mad(g,m[1], m[2])), 2419cb93a386Sopenharmony_ci G = mad(r,m[3], mad(g,m[4], m[5])), 2420cb93a386Sopenharmony_ci Z = mad(r,m[6], mad(g,m[7], m[8])); 2421cb93a386Sopenharmony_ci r = R * rcp_precise(Z); 2422cb93a386Sopenharmony_ci g = G * rcp_precise(Z); 2423cb93a386Sopenharmony_ci} 2424cb93a386Sopenharmony_ci 2425cb93a386Sopenharmony_ciSI void gradient_lookup(const SkRasterPipeline_GradientCtx* c, U32 idx, F t, 2426cb93a386Sopenharmony_ci F* r, F* g, F* b, F* a) { 2427cb93a386Sopenharmony_ci F fr, br, fg, bg, fb, bb, fa, ba; 2428cb93a386Sopenharmony_ci#if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) 2429cb93a386Sopenharmony_ci if (c->stopCount <=8) { 2430cb93a386Sopenharmony_ci fr = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), idx); 2431cb93a386Sopenharmony_ci br = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), idx); 2432cb93a386Sopenharmony_ci fg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), idx); 2433cb93a386Sopenharmony_ci bg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), idx); 2434cb93a386Sopenharmony_ci fb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), idx); 2435cb93a386Sopenharmony_ci bb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), idx); 2436cb93a386Sopenharmony_ci fa = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), idx); 2437cb93a386Sopenharmony_ci ba = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), idx); 2438cb93a386Sopenharmony_ci } else 2439cb93a386Sopenharmony_ci#endif 2440cb93a386Sopenharmony_ci { 2441cb93a386Sopenharmony_ci fr = gather(c->fs[0], idx); 2442cb93a386Sopenharmony_ci br = gather(c->bs[0], idx); 2443cb93a386Sopenharmony_ci fg = gather(c->fs[1], idx); 2444cb93a386Sopenharmony_ci bg = gather(c->bs[1], idx); 2445cb93a386Sopenharmony_ci fb = gather(c->fs[2], idx); 2446cb93a386Sopenharmony_ci bb = gather(c->bs[2], idx); 2447cb93a386Sopenharmony_ci fa = gather(c->fs[3], idx); 2448cb93a386Sopenharmony_ci ba = gather(c->bs[3], idx); 2449cb93a386Sopenharmony_ci } 2450cb93a386Sopenharmony_ci 2451cb93a386Sopenharmony_ci *r = mad(t, fr, br); 2452cb93a386Sopenharmony_ci *g = mad(t, fg, bg); 2453cb93a386Sopenharmony_ci *b = mad(t, fb, bb); 2454cb93a386Sopenharmony_ci *a = mad(t, fa, ba); 2455cb93a386Sopenharmony_ci} 2456cb93a386Sopenharmony_ci 2457cb93a386Sopenharmony_ciSTAGE(evenly_spaced_gradient, const SkRasterPipeline_GradientCtx* c) { 2458cb93a386Sopenharmony_ci auto t = r; 2459cb93a386Sopenharmony_ci auto idx = trunc_(t * (c->stopCount-1)); 2460cb93a386Sopenharmony_ci gradient_lookup(c, idx, t, &r, &g, &b, &a); 2461cb93a386Sopenharmony_ci} 2462cb93a386Sopenharmony_ci 2463cb93a386Sopenharmony_ciSTAGE(gradient, const SkRasterPipeline_GradientCtx* c) { 2464cb93a386Sopenharmony_ci auto t = r; 2465cb93a386Sopenharmony_ci U32 idx = 0; 2466cb93a386Sopenharmony_ci 2467cb93a386Sopenharmony_ci // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop. 2468cb93a386Sopenharmony_ci for (size_t i = 1; i < c->stopCount; i++) { 2469cb93a386Sopenharmony_ci idx += if_then_else(t >= c->ts[i], U32(1), U32(0)); 2470cb93a386Sopenharmony_ci } 2471cb93a386Sopenharmony_ci 2472cb93a386Sopenharmony_ci gradient_lookup(c, idx, t, &r, &g, &b, &a); 2473cb93a386Sopenharmony_ci} 2474cb93a386Sopenharmony_ci 2475cb93a386Sopenharmony_ciSTAGE(evenly_spaced_2_stop_gradient, const void* ctx) { 2476cb93a386Sopenharmony_ci struct Ctx { float f[4], b[4]; }; 2477cb93a386Sopenharmony_ci auto c = (const Ctx*)ctx; 2478cb93a386Sopenharmony_ci 2479cb93a386Sopenharmony_ci auto t = r; 2480cb93a386Sopenharmony_ci r = mad(t, c->f[0], c->b[0]); 2481cb93a386Sopenharmony_ci g = mad(t, c->f[1], c->b[1]); 2482cb93a386Sopenharmony_ci b = mad(t, c->f[2], c->b[2]); 2483cb93a386Sopenharmony_ci a = mad(t, c->f[3], c->b[3]); 2484cb93a386Sopenharmony_ci} 2485cb93a386Sopenharmony_ci 2486cb93a386Sopenharmony_ciSTAGE(xy_to_unit_angle, Ctx::None) { 2487cb93a386Sopenharmony_ci F X = r, 2488cb93a386Sopenharmony_ci Y = g; 2489cb93a386Sopenharmony_ci F xabs = abs_(X), 2490cb93a386Sopenharmony_ci yabs = abs_(Y); 2491cb93a386Sopenharmony_ci 2492cb93a386Sopenharmony_ci F slope = min(xabs, yabs)/max(xabs, yabs); 2493cb93a386Sopenharmony_ci F s = slope * slope; 2494cb93a386Sopenharmony_ci 2495cb93a386Sopenharmony_ci // Use a 7th degree polynomial to approximate atan. 2496cb93a386Sopenharmony_ci // This was generated using sollya.gforge.inria.fr. 2497cb93a386Sopenharmony_ci // A float optimized polynomial was generated using the following command. 2498cb93a386Sopenharmony_ci // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative); 2499cb93a386Sopenharmony_ci F phi = slope 2500cb93a386Sopenharmony_ci * (0.15912117063999176025390625f + s 2501cb93a386Sopenharmony_ci * (-5.185396969318389892578125e-2f + s 2502cb93a386Sopenharmony_ci * (2.476101927459239959716796875e-2f + s 2503cb93a386Sopenharmony_ci * (-7.0547382347285747528076171875e-3f)))); 2504cb93a386Sopenharmony_ci 2505cb93a386Sopenharmony_ci phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi); 2506cb93a386Sopenharmony_ci phi = if_then_else(X < 0.0f , 1.0f/2.0f - phi, phi); 2507cb93a386Sopenharmony_ci phi = if_then_else(Y < 0.0f , 1.0f - phi , phi); 2508cb93a386Sopenharmony_ci phi = if_then_else(phi != phi , 0 , phi); // Check for NaN. 2509cb93a386Sopenharmony_ci r = phi; 2510cb93a386Sopenharmony_ci} 2511cb93a386Sopenharmony_ci 2512cb93a386Sopenharmony_ciSTAGE(xy_to_radius, Ctx::None) { 2513cb93a386Sopenharmony_ci F X2 = r * r, 2514cb93a386Sopenharmony_ci Y2 = g * g; 2515cb93a386Sopenharmony_ci r = sqrt_(X2 + Y2); 2516cb93a386Sopenharmony_ci} 2517cb93a386Sopenharmony_ci 2518cb93a386Sopenharmony_ci// Please see https://skia.org/dev/design/conical for how our 2pt conical shader works. 2519cb93a386Sopenharmony_ci 2520cb93a386Sopenharmony_ciSTAGE(negate_x, Ctx::None) { r = -r; } 2521cb93a386Sopenharmony_ci 2522cb93a386Sopenharmony_ciSTAGE(xy_to_2pt_conical_strip, const SkRasterPipeline_2PtConicalCtx* ctx) { 2523cb93a386Sopenharmony_ci F x = r, y = g, &t = r; 2524cb93a386Sopenharmony_ci t = x + sqrt_(ctx->fP0 - y*y); // ctx->fP0 = r0 * r0 2525cb93a386Sopenharmony_ci} 2526cb93a386Sopenharmony_ci 2527cb93a386Sopenharmony_ciSTAGE(xy_to_2pt_conical_focal_on_circle, Ctx::None) { 2528cb93a386Sopenharmony_ci F x = r, y = g, &t = r; 2529cb93a386Sopenharmony_ci t = x + y*y / x; // (x^2 + y^2) / x 2530cb93a386Sopenharmony_ci} 2531cb93a386Sopenharmony_ci 2532cb93a386Sopenharmony_ciSTAGE(xy_to_2pt_conical_well_behaved, const SkRasterPipeline_2PtConicalCtx* ctx) { 2533cb93a386Sopenharmony_ci F x = r, y = g, &t = r; 2534cb93a386Sopenharmony_ci t = sqrt_(x*x + y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1 2535cb93a386Sopenharmony_ci} 2536cb93a386Sopenharmony_ci 2537cb93a386Sopenharmony_ciSTAGE(xy_to_2pt_conical_greater, const SkRasterPipeline_2PtConicalCtx* ctx) { 2538cb93a386Sopenharmony_ci F x = r, y = g, &t = r; 2539cb93a386Sopenharmony_ci t = sqrt_(x*x - y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1 2540cb93a386Sopenharmony_ci} 2541cb93a386Sopenharmony_ci 2542cb93a386Sopenharmony_ciSTAGE(xy_to_2pt_conical_smaller, const SkRasterPipeline_2PtConicalCtx* ctx) { 2543cb93a386Sopenharmony_ci F x = r, y = g, &t = r; 2544cb93a386Sopenharmony_ci t = -sqrt_(x*x - y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1 2545cb93a386Sopenharmony_ci} 2546cb93a386Sopenharmony_ci 2547cb93a386Sopenharmony_ciSTAGE(alter_2pt_conical_compensate_focal, const SkRasterPipeline_2PtConicalCtx* ctx) { 2548cb93a386Sopenharmony_ci F& t = r; 2549cb93a386Sopenharmony_ci t = t + ctx->fP1; // ctx->fP1 = f 2550cb93a386Sopenharmony_ci} 2551cb93a386Sopenharmony_ci 2552cb93a386Sopenharmony_ciSTAGE(alter_2pt_conical_unswap, Ctx::None) { 2553cb93a386Sopenharmony_ci F& t = r; 2554cb93a386Sopenharmony_ci t = 1 - t; 2555cb93a386Sopenharmony_ci} 2556cb93a386Sopenharmony_ci 2557cb93a386Sopenharmony_ciSTAGE(mask_2pt_conical_nan, SkRasterPipeline_2PtConicalCtx* c) { 2558cb93a386Sopenharmony_ci F& t = r; 2559cb93a386Sopenharmony_ci auto is_degenerate = (t != t); // NaN 2560cb93a386Sopenharmony_ci t = if_then_else(is_degenerate, F(0), t); 2561cb93a386Sopenharmony_ci sk_unaligned_store(&c->fMask, cond_to_mask(!is_degenerate)); 2562cb93a386Sopenharmony_ci} 2563cb93a386Sopenharmony_ci 2564cb93a386Sopenharmony_ciSTAGE(mask_2pt_conical_degenerates, SkRasterPipeline_2PtConicalCtx* c) { 2565cb93a386Sopenharmony_ci F& t = r; 2566cb93a386Sopenharmony_ci auto is_degenerate = (t <= 0) | (t != t); 2567cb93a386Sopenharmony_ci t = if_then_else(is_degenerate, F(0), t); 2568cb93a386Sopenharmony_ci sk_unaligned_store(&c->fMask, cond_to_mask(!is_degenerate)); 2569cb93a386Sopenharmony_ci} 2570cb93a386Sopenharmony_ci 2571cb93a386Sopenharmony_ciSTAGE(apply_vector_mask, const uint32_t* ctx) { 2572cb93a386Sopenharmony_ci const U32 mask = sk_unaligned_load<U32>(ctx); 2573cb93a386Sopenharmony_ci r = sk_bit_cast<F>(sk_bit_cast<U32>(r) & mask); 2574cb93a386Sopenharmony_ci g = sk_bit_cast<F>(sk_bit_cast<U32>(g) & mask); 2575cb93a386Sopenharmony_ci b = sk_bit_cast<F>(sk_bit_cast<U32>(b) & mask); 2576cb93a386Sopenharmony_ci a = sk_bit_cast<F>(sk_bit_cast<U32>(a) & mask); 2577cb93a386Sopenharmony_ci} 2578cb93a386Sopenharmony_ci 2579cb93a386Sopenharmony_ciSTAGE(save_xy, SkRasterPipeline_SamplerCtx* c) { 2580cb93a386Sopenharmony_ci // Whether bilinear or bicubic, all sample points are at the same fractional offset (fx,fy). 2581cb93a386Sopenharmony_ci // They're either the 4 corners of a logical 1x1 pixel or the 16 corners of a 3x3 grid 2582cb93a386Sopenharmony_ci // surrounding (x,y) at (0.5,0.5) off-center. 2583cb93a386Sopenharmony_ci F fx = fract(r + 0.5f), 2584cb93a386Sopenharmony_ci fy = fract(g + 0.5f); 2585cb93a386Sopenharmony_ci 2586cb93a386Sopenharmony_ci // Samplers will need to load x and fx, or y and fy. 2587cb93a386Sopenharmony_ci sk_unaligned_store(c->x, r); 2588cb93a386Sopenharmony_ci sk_unaligned_store(c->y, g); 2589cb93a386Sopenharmony_ci sk_unaligned_store(c->fx, fx); 2590cb93a386Sopenharmony_ci sk_unaligned_store(c->fy, fy); 2591cb93a386Sopenharmony_ci} 2592cb93a386Sopenharmony_ci 2593cb93a386Sopenharmony_ciSTAGE(accumulate, const SkRasterPipeline_SamplerCtx* c) { 2594cb93a386Sopenharmony_ci // Bilinear and bicubic filters are both separable, so we produce independent contributions 2595cb93a386Sopenharmony_ci // from x and y, multiplying them together here to get each pixel's total scale factor. 2596cb93a386Sopenharmony_ci auto scale = sk_unaligned_load<F>(c->scalex) 2597cb93a386Sopenharmony_ci * sk_unaligned_load<F>(c->scaley); 2598cb93a386Sopenharmony_ci dr = mad(scale, r, dr); 2599cb93a386Sopenharmony_ci dg = mad(scale, g, dg); 2600cb93a386Sopenharmony_ci db = mad(scale, b, db); 2601cb93a386Sopenharmony_ci da = mad(scale, a, da); 2602cb93a386Sopenharmony_ci} 2603cb93a386Sopenharmony_ci 2604cb93a386Sopenharmony_ci// In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center 2605cb93a386Sopenharmony_ci// are combined in direct proportion to their area overlapping that logical query pixel. 2606cb93a386Sopenharmony_ci// At positive offsets, the x-axis contribution to that rectangle is fx, or (1-fx) at negative x. 2607cb93a386Sopenharmony_ci// The y-axis is symmetric. 2608cb93a386Sopenharmony_ci 2609cb93a386Sopenharmony_citemplate <int kScale> 2610cb93a386Sopenharmony_ciSI void bilinear_x(SkRasterPipeline_SamplerCtx* ctx, F* x) { 2611cb93a386Sopenharmony_ci *x = sk_unaligned_load<F>(ctx->x) + (kScale * 0.5f); 2612cb93a386Sopenharmony_ci F fx = sk_unaligned_load<F>(ctx->fx); 2613cb93a386Sopenharmony_ci 2614cb93a386Sopenharmony_ci F scalex; 2615cb93a386Sopenharmony_ci if (kScale == -1) { scalex = 1.0f - fx; } 2616cb93a386Sopenharmony_ci if (kScale == +1) { scalex = fx; } 2617cb93a386Sopenharmony_ci sk_unaligned_store(ctx->scalex, scalex); 2618cb93a386Sopenharmony_ci} 2619cb93a386Sopenharmony_citemplate <int kScale> 2620cb93a386Sopenharmony_ciSI void bilinear_y(SkRasterPipeline_SamplerCtx* ctx, F* y) { 2621cb93a386Sopenharmony_ci *y = sk_unaligned_load<F>(ctx->y) + (kScale * 0.5f); 2622cb93a386Sopenharmony_ci F fy = sk_unaligned_load<F>(ctx->fy); 2623cb93a386Sopenharmony_ci 2624cb93a386Sopenharmony_ci F scaley; 2625cb93a386Sopenharmony_ci if (kScale == -1) { scaley = 1.0f - fy; } 2626cb93a386Sopenharmony_ci if (kScale == +1) { scaley = fy; } 2627cb93a386Sopenharmony_ci sk_unaligned_store(ctx->scaley, scaley); 2628cb93a386Sopenharmony_ci} 2629cb93a386Sopenharmony_ci 2630cb93a386Sopenharmony_ciSTAGE(bilinear_nx, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<-1>(ctx, &r); } 2631cb93a386Sopenharmony_ciSTAGE(bilinear_px, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<+1>(ctx, &r); } 2632cb93a386Sopenharmony_ciSTAGE(bilinear_ny, SkRasterPipeline_SamplerCtx* ctx) { bilinear_y<-1>(ctx, &g); } 2633cb93a386Sopenharmony_ciSTAGE(bilinear_py, SkRasterPipeline_SamplerCtx* ctx) { bilinear_y<+1>(ctx, &g); } 2634cb93a386Sopenharmony_ci 2635cb93a386Sopenharmony_ci 2636cb93a386Sopenharmony_ci// In bicubic interpolation, the 16 pixels and +/- 0.5 and +/- 1.5 offsets from the sample 2637cb93a386Sopenharmony_ci// pixel center are combined with a non-uniform cubic filter, with higher values near the center. 2638cb93a386Sopenharmony_ci// 2639cb93a386Sopenharmony_ci// We break this function into two parts, one for near 0.5 offsets and one for far 1.5 offsets. 2640cb93a386Sopenharmony_ci// See GrCubicEffect for details of this particular filter. 2641cb93a386Sopenharmony_ci 2642cb93a386Sopenharmony_ciSI F bicubic_near(F t) { 2643cb93a386Sopenharmony_ci // 1/18 + 9/18t + 27/18t^2 - 21/18t^3 == t ( t ( -21/18t + 27/18) + 9/18) + 1/18 2644cb93a386Sopenharmony_ci return mad(t, mad(t, mad((-21/18.0f), t, (27/18.0f)), (9/18.0f)), (1/18.0f)); 2645cb93a386Sopenharmony_ci} 2646cb93a386Sopenharmony_ciSI F bicubic_far(F t) { 2647cb93a386Sopenharmony_ci // 0/18 + 0/18*t - 6/18t^2 + 7/18t^3 == t^2 (7/18t - 6/18) 2648cb93a386Sopenharmony_ci return (t*t)*mad((7/18.0f), t, (-6/18.0f)); 2649cb93a386Sopenharmony_ci} 2650cb93a386Sopenharmony_ci 2651cb93a386Sopenharmony_citemplate <int kScale> 2652cb93a386Sopenharmony_ciSI void bicubic_x(SkRasterPipeline_SamplerCtx* ctx, F* x) { 2653cb93a386Sopenharmony_ci *x = sk_unaligned_load<F>(ctx->x) + (kScale * 0.5f); 2654cb93a386Sopenharmony_ci F fx = sk_unaligned_load<F>(ctx->fx); 2655cb93a386Sopenharmony_ci 2656cb93a386Sopenharmony_ci F scalex; 2657cb93a386Sopenharmony_ci if (kScale == -3) { scalex = bicubic_far (1.0f - fx); } 2658cb93a386Sopenharmony_ci if (kScale == -1) { scalex = bicubic_near(1.0f - fx); } 2659cb93a386Sopenharmony_ci if (kScale == +1) { scalex = bicubic_near( fx); } 2660cb93a386Sopenharmony_ci if (kScale == +3) { scalex = bicubic_far ( fx); } 2661cb93a386Sopenharmony_ci sk_unaligned_store(ctx->scalex, scalex); 2662cb93a386Sopenharmony_ci} 2663cb93a386Sopenharmony_citemplate <int kScale> 2664cb93a386Sopenharmony_ciSI void bicubic_y(SkRasterPipeline_SamplerCtx* ctx, F* y) { 2665cb93a386Sopenharmony_ci *y = sk_unaligned_load<F>(ctx->y) + (kScale * 0.5f); 2666cb93a386Sopenharmony_ci F fy = sk_unaligned_load<F>(ctx->fy); 2667cb93a386Sopenharmony_ci 2668cb93a386Sopenharmony_ci F scaley; 2669cb93a386Sopenharmony_ci if (kScale == -3) { scaley = bicubic_far (1.0f - fy); } 2670cb93a386Sopenharmony_ci if (kScale == -1) { scaley = bicubic_near(1.0f - fy); } 2671cb93a386Sopenharmony_ci if (kScale == +1) { scaley = bicubic_near( fy); } 2672cb93a386Sopenharmony_ci if (kScale == +3) { scaley = bicubic_far ( fy); } 2673cb93a386Sopenharmony_ci sk_unaligned_store(ctx->scaley, scaley); 2674cb93a386Sopenharmony_ci} 2675cb93a386Sopenharmony_ci 2676cb93a386Sopenharmony_ciSTAGE(bicubic_n3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-3>(ctx, &r); } 2677cb93a386Sopenharmony_ciSTAGE(bicubic_n1x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-1>(ctx, &r); } 2678cb93a386Sopenharmony_ciSTAGE(bicubic_p1x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<+1>(ctx, &r); } 2679cb93a386Sopenharmony_ciSTAGE(bicubic_p3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<+3>(ctx, &r); } 2680cb93a386Sopenharmony_ci 2681cb93a386Sopenharmony_ciSTAGE(bicubic_n3y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<-3>(ctx, &g); } 2682cb93a386Sopenharmony_ciSTAGE(bicubic_n1y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<-1>(ctx, &g); } 2683cb93a386Sopenharmony_ciSTAGE(bicubic_p1y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<+1>(ctx, &g); } 2684cb93a386Sopenharmony_ciSTAGE(bicubic_p3y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<+3>(ctx, &g); } 2685cb93a386Sopenharmony_ci 2686cb93a386Sopenharmony_ciSTAGE(callback, SkRasterPipeline_CallbackCtx* c) { 2687cb93a386Sopenharmony_ci store4(c->rgba,0, r,g,b,a); 2688cb93a386Sopenharmony_ci c->fn(c, tail ? tail : N); 2689cb93a386Sopenharmony_ci load4(c->read_from,0, &r,&g,&b,&a); 2690cb93a386Sopenharmony_ci} 2691cb93a386Sopenharmony_ci 2692cb93a386Sopenharmony_ciSTAGE(gauss_a_to_rgba, Ctx::None) { 2693cb93a386Sopenharmony_ci // x = 1 - x; 2694cb93a386Sopenharmony_ci // exp(-x * x * 4) - 0.018f; 2695cb93a386Sopenharmony_ci // ... now approximate with quartic 2696cb93a386Sopenharmony_ci // 2697cb93a386Sopenharmony_ci const float c4 = -2.26661229133605957031f; 2698cb93a386Sopenharmony_ci const float c3 = 2.89795351028442382812f; 2699cb93a386Sopenharmony_ci const float c2 = 0.21345567703247070312f; 2700cb93a386Sopenharmony_ci const float c1 = 0.15489584207534790039f; 2701cb93a386Sopenharmony_ci const float c0 = 0.00030726194381713867f; 2702cb93a386Sopenharmony_ci a = mad(a, mad(a, mad(a, mad(a, c4, c3), c2), c1), c0); 2703cb93a386Sopenharmony_ci r = a; 2704cb93a386Sopenharmony_ci g = a; 2705cb93a386Sopenharmony_ci b = a; 2706cb93a386Sopenharmony_ci} 2707cb93a386Sopenharmony_ci 2708cb93a386Sopenharmony_ciSI F tile(F v, SkTileMode mode, float limit, float invLimit) { 2709cb93a386Sopenharmony_ci // The ix_and_ptr() calls in sample() will clamp tile()'s output, so no need to clamp here. 2710cb93a386Sopenharmony_ci switch (mode) { 2711cb93a386Sopenharmony_ci case SkTileMode::kDecal: 2712cb93a386Sopenharmony_ci case SkTileMode::kClamp: return v; 2713cb93a386Sopenharmony_ci case SkTileMode::kRepeat: return v - floor_(v*invLimit)*limit; 2714cb93a386Sopenharmony_ci case SkTileMode::kMirror: 2715cb93a386Sopenharmony_ci return abs_( (v-limit) - (limit+limit)*floor_((v-limit)*(invLimit*0.5f)) - limit ); 2716cb93a386Sopenharmony_ci } 2717cb93a386Sopenharmony_ci SkUNREACHABLE; 2718cb93a386Sopenharmony_ci} 2719cb93a386Sopenharmony_ci 2720cb93a386Sopenharmony_ciSI void sample(const SkRasterPipeline_SamplerCtx2* ctx, F x, F y, 2721cb93a386Sopenharmony_ci F* r, F* g, F* b, F* a) { 2722cb93a386Sopenharmony_ci x = tile(x, ctx->tileX, ctx->width , ctx->invWidth ); 2723cb93a386Sopenharmony_ci y = tile(y, ctx->tileY, ctx->height, ctx->invHeight); 2724cb93a386Sopenharmony_ci 2725cb93a386Sopenharmony_ci switch (ctx->ct) { 2726cb93a386Sopenharmony_ci default: *r = *g = *b = *a = 0; 2727cb93a386Sopenharmony_ci break; 2728cb93a386Sopenharmony_ci 2729cb93a386Sopenharmony_ci case kRGBA_8888_SkColorType: 2730cb93a386Sopenharmony_ci case kBGRA_8888_SkColorType: { 2731cb93a386Sopenharmony_ci const uint32_t* ptr; 2732cb93a386Sopenharmony_ci U32 ix = ix_and_ptr(&ptr, ctx, x,y); 2733cb93a386Sopenharmony_ci from_8888(gather(ptr, ix), r,g,b,a); 2734cb93a386Sopenharmony_ci if (ctx->ct == kBGRA_8888_SkColorType) { 2735cb93a386Sopenharmony_ci std::swap(*r,*b); 2736cb93a386Sopenharmony_ci } 2737cb93a386Sopenharmony_ci } break; 2738cb93a386Sopenharmony_ci } 2739cb93a386Sopenharmony_ci} 2740cb93a386Sopenharmony_ci 2741cb93a386Sopenharmony_citemplate <int D> 2742cb93a386Sopenharmony_ciSI void sampler(const SkRasterPipeline_SamplerCtx2* ctx, 2743cb93a386Sopenharmony_ci F cx, F cy, const F (&wx)[D], const F (&wy)[D], 2744cb93a386Sopenharmony_ci F* r, F* g, F* b, F* a) { 2745cb93a386Sopenharmony_ci 2746cb93a386Sopenharmony_ci float start = -0.5f*(D-1); 2747cb93a386Sopenharmony_ci 2748cb93a386Sopenharmony_ci *r = *g = *b = *a = 0; 2749cb93a386Sopenharmony_ci F y = cy + start; 2750cb93a386Sopenharmony_ci for (int j = 0; j < D; j++, y += 1.0f) { 2751cb93a386Sopenharmony_ci F x = cx + start; 2752cb93a386Sopenharmony_ci for (int i = 0; i < D; i++, x += 1.0f) { 2753cb93a386Sopenharmony_ci F R,G,B,A; 2754cb93a386Sopenharmony_ci sample(ctx, x,y, &R,&G,&B,&A); 2755cb93a386Sopenharmony_ci 2756cb93a386Sopenharmony_ci F w = wx[i] * wy[j]; 2757cb93a386Sopenharmony_ci *r = mad(w,R,*r); 2758cb93a386Sopenharmony_ci *g = mad(w,G,*g); 2759cb93a386Sopenharmony_ci *b = mad(w,B,*b); 2760cb93a386Sopenharmony_ci *a = mad(w,A,*a); 2761cb93a386Sopenharmony_ci } 2762cb93a386Sopenharmony_ci } 2763cb93a386Sopenharmony_ci} 2764cb93a386Sopenharmony_ci 2765cb93a386Sopenharmony_ciSTAGE(bilinear, const SkRasterPipeline_SamplerCtx2* ctx) { 2766cb93a386Sopenharmony_ci F x = r, fx = fract(x + 0.5f), 2767cb93a386Sopenharmony_ci y = g, fy = fract(y + 0.5f); 2768cb93a386Sopenharmony_ci const F wx[] = {1.0f - fx, fx}; 2769cb93a386Sopenharmony_ci const F wy[] = {1.0f - fy, fy}; 2770cb93a386Sopenharmony_ci 2771cb93a386Sopenharmony_ci sampler(ctx, x,y, wx,wy, &r,&g,&b,&a); 2772cb93a386Sopenharmony_ci} 2773cb93a386Sopenharmony_ciSTAGE(bicubic, SkRasterPipeline_SamplerCtx2* ctx) { 2774cb93a386Sopenharmony_ci F x = r, fx = fract(x + 0.5f), 2775cb93a386Sopenharmony_ci y = g, fy = fract(y + 0.5f); 2776cb93a386Sopenharmony_ci const F wx[] = { bicubic_far(1-fx), bicubic_near(1-fx), bicubic_near(fx), bicubic_far(fx) }; 2777cb93a386Sopenharmony_ci const F wy[] = { bicubic_far(1-fy), bicubic_near(1-fy), bicubic_near(fy), bicubic_far(fy) }; 2778cb93a386Sopenharmony_ci 2779cb93a386Sopenharmony_ci sampler(ctx, x,y, wx,wy, &r,&g,&b,&a); 2780cb93a386Sopenharmony_ci} 2781cb93a386Sopenharmony_ci 2782cb93a386Sopenharmony_ci// A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling. 2783cb93a386Sopenharmony_ciSTAGE(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) { 2784cb93a386Sopenharmony_ci // (cx,cy) are the center of our sample. 2785cb93a386Sopenharmony_ci F cx = r, 2786cb93a386Sopenharmony_ci cy = g; 2787cb93a386Sopenharmony_ci 2788cb93a386Sopenharmony_ci // All sample points are at the same fractional offset (fx,fy). 2789cb93a386Sopenharmony_ci // They're the 4 corners of a logical 1x1 pixel surrounding (x,y) at (0.5,0.5) offsets. 2790cb93a386Sopenharmony_ci F fx = fract(cx + 0.5f), 2791cb93a386Sopenharmony_ci fy = fract(cy + 0.5f); 2792cb93a386Sopenharmony_ci 2793cb93a386Sopenharmony_ci // We'll accumulate the color of all four samples into {r,g,b,a} directly. 2794cb93a386Sopenharmony_ci r = g = b = a = 0; 2795cb93a386Sopenharmony_ci 2796cb93a386Sopenharmony_ci for (float py = -0.5f; py <= +0.5f; py += 1.0f) 2797cb93a386Sopenharmony_ci for (float px = -0.5f; px <= +0.5f; px += 1.0f) { 2798cb93a386Sopenharmony_ci // (x,y) are the coordinates of this sample point. 2799cb93a386Sopenharmony_ci F x = cx + px, 2800cb93a386Sopenharmony_ci y = cy + py; 2801cb93a386Sopenharmony_ci 2802cb93a386Sopenharmony_ci // ix_and_ptr() will clamp to the image's bounds for us. 2803cb93a386Sopenharmony_ci const uint32_t* ptr; 2804cb93a386Sopenharmony_ci U32 ix = ix_and_ptr(&ptr, ctx, x,y); 2805cb93a386Sopenharmony_ci 2806cb93a386Sopenharmony_ci F sr,sg,sb,sa; 2807cb93a386Sopenharmony_ci from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa); 2808cb93a386Sopenharmony_ci 2809cb93a386Sopenharmony_ci // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center 2810cb93a386Sopenharmony_ci // are combined in direct proportion to their area overlapping that logical query pixel. 2811cb93a386Sopenharmony_ci // At positive offsets, the x-axis contribution to that rectangle is fx, 2812cb93a386Sopenharmony_ci // or (1-fx) at negative x. Same deal for y. 2813cb93a386Sopenharmony_ci F sx = (px > 0) ? fx : 1.0f - fx, 2814cb93a386Sopenharmony_ci sy = (py > 0) ? fy : 1.0f - fy, 2815cb93a386Sopenharmony_ci area = sx * sy; 2816cb93a386Sopenharmony_ci 2817cb93a386Sopenharmony_ci r += sr * area; 2818cb93a386Sopenharmony_ci g += sg * area; 2819cb93a386Sopenharmony_ci b += sb * area; 2820cb93a386Sopenharmony_ci a += sa * area; 2821cb93a386Sopenharmony_ci } 2822cb93a386Sopenharmony_ci} 2823cb93a386Sopenharmony_ci 2824cb93a386Sopenharmony_ci// A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling. 2825cb93a386Sopenharmony_ciSTAGE(bicubic_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) { 2826cb93a386Sopenharmony_ci // (cx,cy) are the center of our sample. 2827cb93a386Sopenharmony_ci F cx = r, 2828cb93a386Sopenharmony_ci cy = g; 2829cb93a386Sopenharmony_ci 2830cb93a386Sopenharmony_ci // All sample points are at the same fractional offset (fx,fy). 2831cb93a386Sopenharmony_ci // They're the 4 corners of a logical 1x1 pixel surrounding (x,y) at (0.5,0.5) offsets. 2832cb93a386Sopenharmony_ci F fx = fract(cx + 0.5f), 2833cb93a386Sopenharmony_ci fy = fract(cy + 0.5f); 2834cb93a386Sopenharmony_ci 2835cb93a386Sopenharmony_ci // We'll accumulate the color of all four samples into {r,g,b,a} directly. 2836cb93a386Sopenharmony_ci r = g = b = a = 0; 2837cb93a386Sopenharmony_ci 2838cb93a386Sopenharmony_ci const F scaley[4] = { 2839cb93a386Sopenharmony_ci bicubic_far (1.0f - fy), bicubic_near(1.0f - fy), 2840cb93a386Sopenharmony_ci bicubic_near( fy), bicubic_far ( fy), 2841cb93a386Sopenharmony_ci }; 2842cb93a386Sopenharmony_ci const F scalex[4] = { 2843cb93a386Sopenharmony_ci bicubic_far (1.0f - fx), bicubic_near(1.0f - fx), 2844cb93a386Sopenharmony_ci bicubic_near( fx), bicubic_far ( fx), 2845cb93a386Sopenharmony_ci }; 2846cb93a386Sopenharmony_ci 2847cb93a386Sopenharmony_ci F sample_y = cy - 1.5f; 2848cb93a386Sopenharmony_ci for (int yy = 0; yy <= 3; ++yy) { 2849cb93a386Sopenharmony_ci F sample_x = cx - 1.5f; 2850cb93a386Sopenharmony_ci for (int xx = 0; xx <= 3; ++xx) { 2851cb93a386Sopenharmony_ci F scale = scalex[xx] * scaley[yy]; 2852cb93a386Sopenharmony_ci 2853cb93a386Sopenharmony_ci // ix_and_ptr() will clamp to the image's bounds for us. 2854cb93a386Sopenharmony_ci const uint32_t* ptr; 2855cb93a386Sopenharmony_ci U32 ix = ix_and_ptr(&ptr, ctx, sample_x, sample_y); 2856cb93a386Sopenharmony_ci 2857cb93a386Sopenharmony_ci F sr,sg,sb,sa; 2858cb93a386Sopenharmony_ci from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa); 2859cb93a386Sopenharmony_ci 2860cb93a386Sopenharmony_ci r = mad(scale, sr, r); 2861cb93a386Sopenharmony_ci g = mad(scale, sg, g); 2862cb93a386Sopenharmony_ci b = mad(scale, sb, b); 2863cb93a386Sopenharmony_ci a = mad(scale, sa, a); 2864cb93a386Sopenharmony_ci 2865cb93a386Sopenharmony_ci sample_x += 1; 2866cb93a386Sopenharmony_ci } 2867cb93a386Sopenharmony_ci sample_y += 1; 2868cb93a386Sopenharmony_ci } 2869cb93a386Sopenharmony_ci} 2870cb93a386Sopenharmony_ci 2871cb93a386Sopenharmony_ci// ~~~~~~ GrSwizzle stage ~~~~~~ // 2872cb93a386Sopenharmony_ci 2873cb93a386Sopenharmony_ciSTAGE(swizzle, void* ctx) { 2874cb93a386Sopenharmony_ci auto ir = r, ig = g, ib = b, ia = a; 2875cb93a386Sopenharmony_ci F* o[] = {&r, &g, &b, &a}; 2876cb93a386Sopenharmony_ci char swiz[4]; 2877cb93a386Sopenharmony_ci memcpy(swiz, &ctx, sizeof(swiz)); 2878cb93a386Sopenharmony_ci 2879cb93a386Sopenharmony_ci for (int i = 0; i < 4; ++i) { 2880cb93a386Sopenharmony_ci switch (swiz[i]) { 2881cb93a386Sopenharmony_ci case 'r': *o[i] = ir; break; 2882cb93a386Sopenharmony_ci case 'g': *o[i] = ig; break; 2883cb93a386Sopenharmony_ci case 'b': *o[i] = ib; break; 2884cb93a386Sopenharmony_ci case 'a': *o[i] = ia; break; 2885cb93a386Sopenharmony_ci case '0': *o[i] = F(0); break; 2886cb93a386Sopenharmony_ci case '1': *o[i] = F(1); break; 2887cb93a386Sopenharmony_ci default: break; 2888cb93a386Sopenharmony_ci } 2889cb93a386Sopenharmony_ci } 2890cb93a386Sopenharmony_ci} 2891cb93a386Sopenharmony_ci 2892cb93a386Sopenharmony_cinamespace lowp { 2893cb93a386Sopenharmony_ci#if defined(JUMPER_IS_SCALAR) || defined(SK_DISABLE_LOWP_RASTER_PIPELINE) 2894cb93a386Sopenharmony_ci // If we're not compiled by Clang, or otherwise switched into scalar mode (old Clang, manually), 2895cb93a386Sopenharmony_ci // we don't generate lowp stages. All these nullptrs will tell SkJumper.cpp to always use the 2896cb93a386Sopenharmony_ci // highp float pipeline. 2897cb93a386Sopenharmony_ci #define M(st) static void (*st)(void) = nullptr; 2898cb93a386Sopenharmony_ci SK_RASTER_PIPELINE_STAGES(M) 2899cb93a386Sopenharmony_ci #undef M 2900cb93a386Sopenharmony_ci static void (*just_return)(void) = nullptr; 2901cb93a386Sopenharmony_ci 2902cb93a386Sopenharmony_ci static void start_pipeline(size_t,size_t,size_t,size_t, void**) {} 2903cb93a386Sopenharmony_ci 2904cb93a386Sopenharmony_ci#else // We are compiling vector code with Clang... let's make some lowp stages! 2905cb93a386Sopenharmony_ci 2906cb93a386Sopenharmony_ci#if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) 2907cb93a386Sopenharmony_ci using U8 = uint8_t __attribute__((ext_vector_type(16))); 2908cb93a386Sopenharmony_ci using U16 = uint16_t __attribute__((ext_vector_type(16))); 2909cb93a386Sopenharmony_ci using I16 = int16_t __attribute__((ext_vector_type(16))); 2910cb93a386Sopenharmony_ci using I32 = int32_t __attribute__((ext_vector_type(16))); 2911cb93a386Sopenharmony_ci using U32 = uint32_t __attribute__((ext_vector_type(16))); 2912cb93a386Sopenharmony_ci using I64 = int64_t __attribute__((ext_vector_type(16))); 2913cb93a386Sopenharmony_ci using U64 = uint64_t __attribute__((ext_vector_type(16))); 2914cb93a386Sopenharmony_ci using F = float __attribute__((ext_vector_type(16))); 2915cb93a386Sopenharmony_ci#else 2916cb93a386Sopenharmony_ci using U8 = uint8_t __attribute__((ext_vector_type(8))); 2917cb93a386Sopenharmony_ci using U16 = uint16_t __attribute__((ext_vector_type(8))); 2918cb93a386Sopenharmony_ci using I16 = int16_t __attribute__((ext_vector_type(8))); 2919cb93a386Sopenharmony_ci using I32 = int32_t __attribute__((ext_vector_type(8))); 2920cb93a386Sopenharmony_ci using U32 = uint32_t __attribute__((ext_vector_type(8))); 2921cb93a386Sopenharmony_ci using I64 = int64_t __attribute__((ext_vector_type(8))); 2922cb93a386Sopenharmony_ci using U64 = uint64_t __attribute__((ext_vector_type(8))); 2923cb93a386Sopenharmony_ci using F = float __attribute__((ext_vector_type(8))); 2924cb93a386Sopenharmony_ci#endif 2925cb93a386Sopenharmony_ci 2926cb93a386Sopenharmony_cistatic const size_t N = sizeof(U16) / sizeof(uint16_t); 2927cb93a386Sopenharmony_ci 2928cb93a386Sopenharmony_ci// Once again, some platforms benefit from a restricted Stage calling convention, 2929cb93a386Sopenharmony_ci// but others can pass tons and tons of registers and we're happy to exploit that. 2930cb93a386Sopenharmony_ci// It's exactly the same decision and implementation strategy as the F stages above. 2931cb93a386Sopenharmony_ci#if JUMPER_NARROW_STAGES 2932cb93a386Sopenharmony_ci struct Params { 2933cb93a386Sopenharmony_ci size_t dx, dy, tail; 2934cb93a386Sopenharmony_ci U16 dr,dg,db,da; 2935cb93a386Sopenharmony_ci }; 2936cb93a386Sopenharmony_ci using Stage = void(ABI*)(Params*, void** program, U16 r, U16 g, U16 b, U16 a); 2937cb93a386Sopenharmony_ci#else 2938cb93a386Sopenharmony_ci // We pass program as the second argument so that load_and_inc() will find it in %rsi on x86-64. 2939cb93a386Sopenharmony_ci using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy, 2940cb93a386Sopenharmony_ci U16 r, U16 g, U16 b, U16 a, 2941cb93a386Sopenharmony_ci U16 dr, U16 dg, U16 db, U16 da); 2942cb93a386Sopenharmony_ci#endif 2943cb93a386Sopenharmony_ci 2944cb93a386Sopenharmony_cistatic void start_pipeline(const size_t x0, const size_t y0, 2945cb93a386Sopenharmony_ci const size_t xlimit, const size_t ylimit, void** program) { 2946cb93a386Sopenharmony_ci auto start = (Stage)load_and_inc(program); 2947cb93a386Sopenharmony_ci for (size_t dy = y0; dy < ylimit; dy++) { 2948cb93a386Sopenharmony_ci #if JUMPER_NARROW_STAGES 2949cb93a386Sopenharmony_ci Params params = { x0,dy,0, 0,0,0,0 }; 2950cb93a386Sopenharmony_ci for (; params.dx + N <= xlimit; params.dx += N) { 2951cb93a386Sopenharmony_ci start(¶ms,program, 0,0,0,0); 2952cb93a386Sopenharmony_ci } 2953cb93a386Sopenharmony_ci if (size_t tail = xlimit - params.dx) { 2954cb93a386Sopenharmony_ci params.tail = tail; 2955cb93a386Sopenharmony_ci start(¶ms,program, 0,0,0,0); 2956cb93a386Sopenharmony_ci } 2957cb93a386Sopenharmony_ci #else 2958cb93a386Sopenharmony_ci size_t dx = x0; 2959cb93a386Sopenharmony_ci for (; dx + N <= xlimit; dx += N) { 2960cb93a386Sopenharmony_ci start( 0,program,dx,dy, 0,0,0,0, 0,0,0,0); 2961cb93a386Sopenharmony_ci } 2962cb93a386Sopenharmony_ci if (size_t tail = xlimit - dx) { 2963cb93a386Sopenharmony_ci start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0); 2964cb93a386Sopenharmony_ci } 2965cb93a386Sopenharmony_ci #endif 2966cb93a386Sopenharmony_ci } 2967cb93a386Sopenharmony_ci} 2968cb93a386Sopenharmony_ci 2969cb93a386Sopenharmony_ci#if JUMPER_NARROW_STAGES 2970cb93a386Sopenharmony_ci static void ABI just_return(Params*, void**, U16,U16,U16,U16) {} 2971cb93a386Sopenharmony_ci#else 2972cb93a386Sopenharmony_ci static void ABI just_return(size_t,void**,size_t,size_t, U16,U16,U16,U16, U16,U16,U16,U16) {} 2973cb93a386Sopenharmony_ci#endif 2974cb93a386Sopenharmony_ci 2975cb93a386Sopenharmony_ci// All stages use the same function call ABI to chain into each other, but there are three types: 2976cb93a386Sopenharmony_ci// GG: geometry in, geometry out -- think, a matrix 2977cb93a386Sopenharmony_ci// GP: geometry in, pixels out. -- think, a memory gather 2978cb93a386Sopenharmony_ci// PP: pixels in, pixels out. -- think, a blend mode 2979cb93a386Sopenharmony_ci// 2980cb93a386Sopenharmony_ci// (Some stages ignore their inputs or produce no logical output. That's perfectly fine.) 2981cb93a386Sopenharmony_ci// 2982cb93a386Sopenharmony_ci// These three STAGE_ macros let you define each type of stage, 2983cb93a386Sopenharmony_ci// and will have (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate. 2984cb93a386Sopenharmony_ci 2985cb93a386Sopenharmony_ci#if JUMPER_NARROW_STAGES 2986cb93a386Sopenharmony_ci #define STAGE_GG(name, ...) \ 2987cb93a386Sopenharmony_ci SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \ 2988cb93a386Sopenharmony_ci static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \ 2989cb93a386Sopenharmony_ci auto x = join<F>(r,g), \ 2990cb93a386Sopenharmony_ci y = join<F>(b,a); \ 2991cb93a386Sopenharmony_ci name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y); \ 2992cb93a386Sopenharmony_ci split(x, &r,&g); \ 2993cb93a386Sopenharmony_ci split(y, &b,&a); \ 2994cb93a386Sopenharmony_ci auto next = (Stage)load_and_inc(program); \ 2995cb93a386Sopenharmony_ci next(params,program, r,g,b,a); \ 2996cb93a386Sopenharmony_ci } \ 2997cb93a386Sopenharmony_ci SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y) 2998cb93a386Sopenharmony_ci 2999cb93a386Sopenharmony_ci #define STAGE_GP(name, ...) \ 3000cb93a386Sopenharmony_ci SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \ 3001cb93a386Sopenharmony_ci U16& r, U16& g, U16& b, U16& a, \ 3002cb93a386Sopenharmony_ci U16& dr, U16& dg, U16& db, U16& da); \ 3003cb93a386Sopenharmony_ci static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \ 3004cb93a386Sopenharmony_ci auto x = join<F>(r,g), \ 3005cb93a386Sopenharmony_ci y = join<F>(b,a); \ 3006cb93a386Sopenharmony_ci name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y, r,g,b,a, \ 3007cb93a386Sopenharmony_ci params->dr,params->dg,params->db,params->da); \ 3008cb93a386Sopenharmony_ci auto next = (Stage)load_and_inc(program); \ 3009cb93a386Sopenharmony_ci next(params,program, r,g,b,a); \ 3010cb93a386Sopenharmony_ci } \ 3011cb93a386Sopenharmony_ci SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \ 3012cb93a386Sopenharmony_ci U16& r, U16& g, U16& b, U16& a, \ 3013cb93a386Sopenharmony_ci U16& dr, U16& dg, U16& db, U16& da) 3014cb93a386Sopenharmony_ci 3015cb93a386Sopenharmony_ci #define STAGE_PP(name, ...) \ 3016cb93a386Sopenharmony_ci SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ 3017cb93a386Sopenharmony_ci U16& r, U16& g, U16& b, U16& a, \ 3018cb93a386Sopenharmony_ci U16& dr, U16& dg, U16& db, U16& da); \ 3019cb93a386Sopenharmony_ci static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \ 3020cb93a386Sopenharmony_ci name##_k(Ctx{program}, params->dx,params->dy,params->tail, r,g,b,a, \ 3021cb93a386Sopenharmony_ci params->dr,params->dg,params->db,params->da); \ 3022cb93a386Sopenharmony_ci auto next = (Stage)load_and_inc(program); \ 3023cb93a386Sopenharmony_ci next(params,program, r,g,b,a); \ 3024cb93a386Sopenharmony_ci } \ 3025cb93a386Sopenharmony_ci SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ 3026cb93a386Sopenharmony_ci U16& r, U16& g, U16& b, U16& a, \ 3027cb93a386Sopenharmony_ci U16& dr, U16& dg, U16& db, U16& da) 3028cb93a386Sopenharmony_ci#else 3029cb93a386Sopenharmony_ci #define STAGE_GG(name, ...) \ 3030cb93a386Sopenharmony_ci SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \ 3031cb93a386Sopenharmony_ci static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \ 3032cb93a386Sopenharmony_ci U16 r, U16 g, U16 b, U16 a, \ 3033cb93a386Sopenharmony_ci U16 dr, U16 dg, U16 db, U16 da) { \ 3034cb93a386Sopenharmony_ci auto x = join<F>(r,g), \ 3035cb93a386Sopenharmony_ci y = join<F>(b,a); \ 3036cb93a386Sopenharmony_ci name##_k(Ctx{program}, dx,dy,tail, x,y); \ 3037cb93a386Sopenharmony_ci split(x, &r,&g); \ 3038cb93a386Sopenharmony_ci split(y, &b,&a); \ 3039cb93a386Sopenharmony_ci auto next = (Stage)load_and_inc(program); \ 3040cb93a386Sopenharmony_ci next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \ 3041cb93a386Sopenharmony_ci } \ 3042cb93a386Sopenharmony_ci SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y) 3043cb93a386Sopenharmony_ci 3044cb93a386Sopenharmony_ci #define STAGE_GP(name, ...) \ 3045cb93a386Sopenharmony_ci SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \ 3046cb93a386Sopenharmony_ci U16& r, U16& g, U16& b, U16& a, \ 3047cb93a386Sopenharmony_ci U16& dr, U16& dg, U16& db, U16& da); \ 3048cb93a386Sopenharmony_ci static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \ 3049cb93a386Sopenharmony_ci U16 r, U16 g, U16 b, U16 a, \ 3050cb93a386Sopenharmony_ci U16 dr, U16 dg, U16 db, U16 da) { \ 3051cb93a386Sopenharmony_ci auto x = join<F>(r,g), \ 3052cb93a386Sopenharmony_ci y = join<F>(b,a); \ 3053cb93a386Sopenharmony_ci name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da); \ 3054cb93a386Sopenharmony_ci auto next = (Stage)load_and_inc(program); \ 3055cb93a386Sopenharmony_ci next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \ 3056cb93a386Sopenharmony_ci } \ 3057cb93a386Sopenharmony_ci SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \ 3058cb93a386Sopenharmony_ci U16& r, U16& g, U16& b, U16& a, \ 3059cb93a386Sopenharmony_ci U16& dr, U16& dg, U16& db, U16& da) 3060cb93a386Sopenharmony_ci 3061cb93a386Sopenharmony_ci #define STAGE_PP(name, ...) \ 3062cb93a386Sopenharmony_ci SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ 3063cb93a386Sopenharmony_ci U16& r, U16& g, U16& b, U16& a, \ 3064cb93a386Sopenharmony_ci U16& dr, U16& dg, U16& db, U16& da); \ 3065cb93a386Sopenharmony_ci static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \ 3066cb93a386Sopenharmony_ci U16 r, U16 g, U16 b, U16 a, \ 3067cb93a386Sopenharmony_ci U16 dr, U16 dg, U16 db, U16 da) { \ 3068cb93a386Sopenharmony_ci name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da); \ 3069cb93a386Sopenharmony_ci auto next = (Stage)load_and_inc(program); \ 3070cb93a386Sopenharmony_ci next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \ 3071cb93a386Sopenharmony_ci } \ 3072cb93a386Sopenharmony_ci SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \ 3073cb93a386Sopenharmony_ci U16& r, U16& g, U16& b, U16& a, \ 3074cb93a386Sopenharmony_ci U16& dr, U16& dg, U16& db, U16& da) 3075cb93a386Sopenharmony_ci#endif 3076cb93a386Sopenharmony_ci 3077cb93a386Sopenharmony_ci// ~~~~~~ Commonly used helper functions ~~~~~~ // 3078cb93a386Sopenharmony_ci 3079cb93a386Sopenharmony_ci/** 3080cb93a386Sopenharmony_ci * Helpers to to properly rounded division (by 255). The ideal answer we want to compute is slow, 3081cb93a386Sopenharmony_ci * thanks to a division by a non-power of two: 3082cb93a386Sopenharmony_ci * [1] (v + 127) / 255 3083cb93a386Sopenharmony_ci * 3084cb93a386Sopenharmony_ci * There is a two-step process that computes the correct answer for all inputs: 3085cb93a386Sopenharmony_ci * [2] (v + 128 + ((v + 128) >> 8)) >> 8 3086cb93a386Sopenharmony_ci * 3087cb93a386Sopenharmony_ci * There is also a single iteration approximation, but it's wrong (+-1) ~25% of the time: 3088cb93a386Sopenharmony_ci * [3] (v + 255) >> 8; 3089cb93a386Sopenharmony_ci * 3090cb93a386Sopenharmony_ci * We offer two different implementations here, depending on the requirements of the calling stage. 3091cb93a386Sopenharmony_ci */ 3092cb93a386Sopenharmony_ci 3093cb93a386Sopenharmony_ci/** 3094cb93a386Sopenharmony_ci * div255 favors speed over accuracy. It uses formula [2] on NEON (where we can compute it as fast 3095cb93a386Sopenharmony_ci * as [3]), and uses [3] elsewhere. 3096cb93a386Sopenharmony_ci */ 3097cb93a386Sopenharmony_ciSI U16 div255(U16 v) { 3098cb93a386Sopenharmony_ci#if defined(JUMPER_IS_NEON) 3099cb93a386Sopenharmony_ci // With NEON we can compute [2] just as fast as [3], so let's be correct. 3100cb93a386Sopenharmony_ci // First we compute v + ((v+128)>>8), then one more round of (...+128)>>8 to finish up: 3101cb93a386Sopenharmony_ci return vrshrq_n_u16(vrsraq_n_u16(v, v, 8), 8); 3102cb93a386Sopenharmony_ci#else 3103cb93a386Sopenharmony_ci // Otherwise, use [3], which is never wrong by more than 1: 3104cb93a386Sopenharmony_ci return (v+255)/256; 3105cb93a386Sopenharmony_ci#endif 3106cb93a386Sopenharmony_ci} 3107cb93a386Sopenharmony_ci 3108cb93a386Sopenharmony_ci/** 3109cb93a386Sopenharmony_ci * div255_accurate guarantees the right answer on all platforms, at the expense of performance. 3110cb93a386Sopenharmony_ci */ 3111cb93a386Sopenharmony_ciSI U16 div255_accurate(U16 v) { 3112cb93a386Sopenharmony_ci#if defined(JUMPER_IS_NEON) 3113cb93a386Sopenharmony_ci // Our NEON implementation of div255 is already correct for all inputs: 3114cb93a386Sopenharmony_ci return div255(v); 3115cb93a386Sopenharmony_ci#else 3116cb93a386Sopenharmony_ci // This is [2] (the same formulation as NEON), but written without the benefit of intrinsics: 3117cb93a386Sopenharmony_ci v += 128; 3118cb93a386Sopenharmony_ci return (v+(v/256))/256; 3119cb93a386Sopenharmony_ci#endif 3120cb93a386Sopenharmony_ci} 3121cb93a386Sopenharmony_ci 3122cb93a386Sopenharmony_ciSI U16 inv(U16 v) { return 255-v; } 3123cb93a386Sopenharmony_ci 3124cb93a386Sopenharmony_ciSI U16 if_then_else(I16 c, U16 t, U16 e) { return (t & c) | (e & ~c); } 3125cb93a386Sopenharmony_ciSI U32 if_then_else(I32 c, U32 t, U32 e) { return (t & c) | (e & ~c); } 3126cb93a386Sopenharmony_ci 3127cb93a386Sopenharmony_ciSI U16 max(U16 x, U16 y) { return if_then_else(x < y, y, x); } 3128cb93a386Sopenharmony_ciSI U16 min(U16 x, U16 y) { return if_then_else(x < y, x, y); } 3129cb93a386Sopenharmony_ci 3130cb93a386Sopenharmony_ciSI U16 from_float(float f) { return f * 255.0f + 0.5f; } 3131cb93a386Sopenharmony_ci 3132cb93a386Sopenharmony_ciSI U16 lerp(U16 from, U16 to, U16 t) { return div255( from*inv(t) + to*t ); } 3133cb93a386Sopenharmony_ci 3134cb93a386Sopenharmony_citemplate <typename D, typename S> 3135cb93a386Sopenharmony_ciSI D cast(S src) { 3136cb93a386Sopenharmony_ci return __builtin_convertvector(src, D); 3137cb93a386Sopenharmony_ci} 3138cb93a386Sopenharmony_ci 3139cb93a386Sopenharmony_citemplate <typename D, typename S> 3140cb93a386Sopenharmony_ciSI void split(S v, D* lo, D* hi) { 3141cb93a386Sopenharmony_ci static_assert(2*sizeof(D) == sizeof(S), ""); 3142cb93a386Sopenharmony_ci memcpy(lo, (const char*)&v + 0*sizeof(D), sizeof(D)); 3143cb93a386Sopenharmony_ci memcpy(hi, (const char*)&v + 1*sizeof(D), sizeof(D)); 3144cb93a386Sopenharmony_ci} 3145cb93a386Sopenharmony_citemplate <typename D, typename S> 3146cb93a386Sopenharmony_ciSI D join(S lo, S hi) { 3147cb93a386Sopenharmony_ci static_assert(sizeof(D) == 2*sizeof(S), ""); 3148cb93a386Sopenharmony_ci D v; 3149cb93a386Sopenharmony_ci memcpy((char*)&v + 0*sizeof(S), &lo, sizeof(S)); 3150cb93a386Sopenharmony_ci memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S)); 3151cb93a386Sopenharmony_ci return v; 3152cb93a386Sopenharmony_ci} 3153cb93a386Sopenharmony_ci 3154cb93a386Sopenharmony_ciSI F if_then_else(I32 c, F t, F e) { 3155cb93a386Sopenharmony_ci return sk_bit_cast<F>( (sk_bit_cast<I32>(t) & c) | (sk_bit_cast<I32>(e) & ~c) ); 3156cb93a386Sopenharmony_ci} 3157cb93a386Sopenharmony_ciSI F max(F x, F y) { return if_then_else(x < y, y, x); } 3158cb93a386Sopenharmony_ciSI F min(F x, F y) { return if_then_else(x < y, x, y); } 3159cb93a386Sopenharmony_ci 3160cb93a386Sopenharmony_ciSI I32 if_then_else(I32 c, I32 t, I32 e) { 3161cb93a386Sopenharmony_ci return (t & c) | (e & ~c); 3162cb93a386Sopenharmony_ci} 3163cb93a386Sopenharmony_ciSI I32 max(I32 x, I32 y) { return if_then_else(x < y, y, x); } 3164cb93a386Sopenharmony_ciSI I32 min(I32 x, I32 y) { return if_then_else(x < y, x, y); } 3165cb93a386Sopenharmony_ci 3166cb93a386Sopenharmony_ciSI F mad(F f, F m, F a) { return f*m+a; } 3167cb93a386Sopenharmony_ciSI U32 trunc_(F x) { return (U32)cast<I32>(x); } 3168cb93a386Sopenharmony_ci 3169cb93a386Sopenharmony_ci// Use approximate instructions and one Newton-Raphson step to calculate 1/x. 3170cb93a386Sopenharmony_ciSI F rcp_precise(F x) { 3171cb93a386Sopenharmony_ci#if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) 3172cb93a386Sopenharmony_ci __m256 lo,hi; 3173cb93a386Sopenharmony_ci split(x, &lo,&hi); 3174cb93a386Sopenharmony_ci return join<F>(SK_OPTS_NS::rcp_precise(lo), SK_OPTS_NS::rcp_precise(hi)); 3175cb93a386Sopenharmony_ci#elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX) 3176cb93a386Sopenharmony_ci __m128 lo,hi; 3177cb93a386Sopenharmony_ci split(x, &lo,&hi); 3178cb93a386Sopenharmony_ci return join<F>(SK_OPTS_NS::rcp_precise(lo), SK_OPTS_NS::rcp_precise(hi)); 3179cb93a386Sopenharmony_ci#elif defined(JUMPER_IS_NEON) 3180cb93a386Sopenharmony_ci float32x4_t lo,hi; 3181cb93a386Sopenharmony_ci split(x, &lo,&hi); 3182cb93a386Sopenharmony_ci return join<F>(SK_OPTS_NS::rcp_precise(lo), SK_OPTS_NS::rcp_precise(hi)); 3183cb93a386Sopenharmony_ci#else 3184cb93a386Sopenharmony_ci return 1.0f / x; 3185cb93a386Sopenharmony_ci#endif 3186cb93a386Sopenharmony_ci} 3187cb93a386Sopenharmony_ciSI F sqrt_(F x) { 3188cb93a386Sopenharmony_ci#if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) 3189cb93a386Sopenharmony_ci __m256 lo,hi; 3190cb93a386Sopenharmony_ci split(x, &lo,&hi); 3191cb93a386Sopenharmony_ci return join<F>(_mm256_sqrt_ps(lo), _mm256_sqrt_ps(hi)); 3192cb93a386Sopenharmony_ci#elif defined(JUMPER_IS_SSE2) || defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX) 3193cb93a386Sopenharmony_ci __m128 lo,hi; 3194cb93a386Sopenharmony_ci split(x, &lo,&hi); 3195cb93a386Sopenharmony_ci return join<F>(_mm_sqrt_ps(lo), _mm_sqrt_ps(hi)); 3196cb93a386Sopenharmony_ci#elif defined(SK_CPU_ARM64) 3197cb93a386Sopenharmony_ci float32x4_t lo,hi; 3198cb93a386Sopenharmony_ci split(x, &lo,&hi); 3199cb93a386Sopenharmony_ci return join<F>(vsqrtq_f32(lo), vsqrtq_f32(hi)); 3200cb93a386Sopenharmony_ci#elif defined(JUMPER_IS_NEON) 3201cb93a386Sopenharmony_ci auto sqrt = [](float32x4_t v) { 3202cb93a386Sopenharmony_ci auto est = vrsqrteq_f32(v); // Estimate and two refinement steps for est = rsqrt(v). 3203cb93a386Sopenharmony_ci est *= vrsqrtsq_f32(v,est*est); 3204cb93a386Sopenharmony_ci est *= vrsqrtsq_f32(v,est*est); 3205cb93a386Sopenharmony_ci return v*est; // sqrt(v) == v*rsqrt(v). 3206cb93a386Sopenharmony_ci }; 3207cb93a386Sopenharmony_ci float32x4_t lo,hi; 3208cb93a386Sopenharmony_ci split(x, &lo,&hi); 3209cb93a386Sopenharmony_ci return join<F>(sqrt(lo), sqrt(hi)); 3210cb93a386Sopenharmony_ci#else 3211cb93a386Sopenharmony_ci return F{ 3212cb93a386Sopenharmony_ci sqrtf(x[0]), sqrtf(x[1]), sqrtf(x[2]), sqrtf(x[3]), 3213cb93a386Sopenharmony_ci sqrtf(x[4]), sqrtf(x[5]), sqrtf(x[6]), sqrtf(x[7]), 3214cb93a386Sopenharmony_ci }; 3215cb93a386Sopenharmony_ci#endif 3216cb93a386Sopenharmony_ci} 3217cb93a386Sopenharmony_ci 3218cb93a386Sopenharmony_ciSI F floor_(F x) { 3219cb93a386Sopenharmony_ci#if defined(SK_CPU_ARM64) 3220cb93a386Sopenharmony_ci float32x4_t lo,hi; 3221cb93a386Sopenharmony_ci split(x, &lo,&hi); 3222cb93a386Sopenharmony_ci return join<F>(vrndmq_f32(lo), vrndmq_f32(hi)); 3223cb93a386Sopenharmony_ci#elif defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) 3224cb93a386Sopenharmony_ci __m256 lo,hi; 3225cb93a386Sopenharmony_ci split(x, &lo,&hi); 3226cb93a386Sopenharmony_ci return join<F>(_mm256_floor_ps(lo), _mm256_floor_ps(hi)); 3227cb93a386Sopenharmony_ci#elif defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX) 3228cb93a386Sopenharmony_ci __m128 lo,hi; 3229cb93a386Sopenharmony_ci split(x, &lo,&hi); 3230cb93a386Sopenharmony_ci return join<F>(_mm_floor_ps(lo), _mm_floor_ps(hi)); 3231cb93a386Sopenharmony_ci#else 3232cb93a386Sopenharmony_ci F roundtrip = cast<F>(cast<I32>(x)); 3233cb93a386Sopenharmony_ci return roundtrip - if_then_else(roundtrip > x, F(1), F(0)); 3234cb93a386Sopenharmony_ci#endif 3235cb93a386Sopenharmony_ci} 3236cb93a386Sopenharmony_ci 3237cb93a386Sopenharmony_ci// scaled_mult interprets a and b as number on [-1, 1) which are numbers in Q15 format. Functionally 3238cb93a386Sopenharmony_ci// this multiply is: 3239cb93a386Sopenharmony_ci// (2 * a * b + (1 << 15)) >> 16 3240cb93a386Sopenharmony_ci// The result is a number on [-1, 1). 3241cb93a386Sopenharmony_ci// Note: on neon this is a saturating multiply while the others are not. 3242cb93a386Sopenharmony_ciSI I16 scaled_mult(I16 a, I16 b) { 3243cb93a386Sopenharmony_ci#if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) 3244cb93a386Sopenharmony_ci return _mm256_mulhrs_epi16(a, b); 3245cb93a386Sopenharmony_ci#elif defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX) 3246cb93a386Sopenharmony_ci return _mm_mulhrs_epi16(a, b); 3247cb93a386Sopenharmony_ci#elif defined(SK_CPU_ARM64) 3248cb93a386Sopenharmony_ci return vqrdmulhq_s16(a, b); 3249cb93a386Sopenharmony_ci#elif defined(JUMPER_IS_NEON) 3250cb93a386Sopenharmony_ci return vqrdmulhq_s16(a, b); 3251cb93a386Sopenharmony_ci#else 3252cb93a386Sopenharmony_ci const I32 roundingTerm = 1 << 14; 3253cb93a386Sopenharmony_ci return cast<I16>((cast<I32>(a) * cast<I32>(b) + roundingTerm) >> 15); 3254cb93a386Sopenharmony_ci#endif 3255cb93a386Sopenharmony_ci} 3256cb93a386Sopenharmony_ci 3257cb93a386Sopenharmony_ci// This sum is to support lerp where the result will always be a positive number. In general, 3258cb93a386Sopenharmony_ci// a sum like this would require an additional bit, but because we know the range of the result 3259cb93a386Sopenharmony_ci// we know that the extra bit will always be zero. 3260cb93a386Sopenharmony_ciSI U16 constrained_add(I16 a, U16 b) { 3261cb93a386Sopenharmony_ci #if defined(SK_DEBUG) 3262cb93a386Sopenharmony_ci for (size_t i = 0; i < N; i++) { 3263cb93a386Sopenharmony_ci // Ensure that a + b is on the interval [0, UINT16_MAX] 3264cb93a386Sopenharmony_ci int ia = a[i], 3265cb93a386Sopenharmony_ci ib = b[i]; 3266cb93a386Sopenharmony_ci // Use 65535 here because fuchsia's compiler evaluates UINT16_MAX - ib, which is 3267cb93a386Sopenharmony_ci // 65536U - ib, as an uint32_t instead of an int32_t. This was forcing ia to be 3268cb93a386Sopenharmony_ci // interpreted as an uint32_t. 3269cb93a386Sopenharmony_ci SkASSERT(-ib <= ia && ia <= 65535 - ib); 3270cb93a386Sopenharmony_ci } 3271cb93a386Sopenharmony_ci #endif 3272cb93a386Sopenharmony_ci return b + a; 3273cb93a386Sopenharmony_ci} 3274cb93a386Sopenharmony_ci 3275cb93a386Sopenharmony_ciSI F fract(F x) { return x - floor_(x); } 3276cb93a386Sopenharmony_ciSI F abs_(F x) { return sk_bit_cast<F>( sk_bit_cast<I32>(x) & 0x7fffffff ); } 3277cb93a386Sopenharmony_ci 3278cb93a386Sopenharmony_ci// ~~~~~~ Basic / misc. stages ~~~~~~ // 3279cb93a386Sopenharmony_ci 3280cb93a386Sopenharmony_ciSTAGE_GG(seed_shader, Ctx::None) { 3281cb93a386Sopenharmony_ci static const float iota[] = { 3282cb93a386Sopenharmony_ci 0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f, 3283cb93a386Sopenharmony_ci 8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f, 3284cb93a386Sopenharmony_ci }; 3285cb93a386Sopenharmony_ci x = cast<F>(I32(dx)) + sk_unaligned_load<F>(iota); 3286cb93a386Sopenharmony_ci y = cast<F>(I32(dy)) + 0.5f; 3287cb93a386Sopenharmony_ci} 3288cb93a386Sopenharmony_ci 3289cb93a386Sopenharmony_ciSTAGE_GG(matrix_translate, const float* m) { 3290cb93a386Sopenharmony_ci x += m[0]; 3291cb93a386Sopenharmony_ci y += m[1]; 3292cb93a386Sopenharmony_ci} 3293cb93a386Sopenharmony_ciSTAGE_GG(matrix_scale_translate, const float* m) { 3294cb93a386Sopenharmony_ci x = mad(x,m[0], m[2]); 3295cb93a386Sopenharmony_ci y = mad(y,m[1], m[3]); 3296cb93a386Sopenharmony_ci} 3297cb93a386Sopenharmony_ciSTAGE_GG(matrix_2x3, const float* m) { 3298cb93a386Sopenharmony_ci auto X = mad(x,m[0], mad(y,m[1], m[2])), 3299cb93a386Sopenharmony_ci Y = mad(x,m[3], mad(y,m[4], m[5])); 3300cb93a386Sopenharmony_ci x = X; 3301cb93a386Sopenharmony_ci y = Y; 3302cb93a386Sopenharmony_ci} 3303cb93a386Sopenharmony_ciSTAGE_GG(matrix_perspective, const float* m) { 3304cb93a386Sopenharmony_ci // N.B. Unlike the other matrix_ stages, this matrix is row-major. 3305cb93a386Sopenharmony_ci auto X = mad(x,m[0], mad(y,m[1], m[2])), 3306cb93a386Sopenharmony_ci Y = mad(x,m[3], mad(y,m[4], m[5])), 3307cb93a386Sopenharmony_ci Z = mad(x,m[6], mad(y,m[7], m[8])); 3308cb93a386Sopenharmony_ci x = X * rcp_precise(Z); 3309cb93a386Sopenharmony_ci y = Y * rcp_precise(Z); 3310cb93a386Sopenharmony_ci} 3311cb93a386Sopenharmony_ci 3312cb93a386Sopenharmony_ciSTAGE_PP(uniform_color, const SkRasterPipeline_UniformColorCtx* c) { 3313cb93a386Sopenharmony_ci r = c->rgba[0]; 3314cb93a386Sopenharmony_ci g = c->rgba[1]; 3315cb93a386Sopenharmony_ci b = c->rgba[2]; 3316cb93a386Sopenharmony_ci a = c->rgba[3]; 3317cb93a386Sopenharmony_ci} 3318cb93a386Sopenharmony_ciSTAGE_PP(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) { 3319cb93a386Sopenharmony_ci dr = c->rgba[0]; 3320cb93a386Sopenharmony_ci dg = c->rgba[1]; 3321cb93a386Sopenharmony_ci db = c->rgba[2]; 3322cb93a386Sopenharmony_ci da = c->rgba[3]; 3323cb93a386Sopenharmony_ci} 3324cb93a386Sopenharmony_ciSTAGE_PP(black_color, Ctx::None) { r = g = b = 0; a = 255; } 3325cb93a386Sopenharmony_ciSTAGE_PP(white_color, Ctx::None) { r = g = b = 255; a = 255; } 3326cb93a386Sopenharmony_ci 3327cb93a386Sopenharmony_ciSTAGE_PP(set_rgb, const float rgb[3]) { 3328cb93a386Sopenharmony_ci r = from_float(rgb[0]); 3329cb93a386Sopenharmony_ci g = from_float(rgb[1]); 3330cb93a386Sopenharmony_ci b = from_float(rgb[2]); 3331cb93a386Sopenharmony_ci} 3332cb93a386Sopenharmony_ci 3333cb93a386Sopenharmony_ciSTAGE_PP(clamp_0, Ctx::None) { /*definitely a noop*/ } 3334cb93a386Sopenharmony_ciSTAGE_PP(clamp_1, Ctx::None) { /*_should_ be a noop*/ } 3335cb93a386Sopenharmony_ci 3336cb93a386Sopenharmony_ciSTAGE_PP(clamp_a, Ctx::None) { 3337cb93a386Sopenharmony_ci r = min(r, a); 3338cb93a386Sopenharmony_ci g = min(g, a); 3339cb93a386Sopenharmony_ci b = min(b, a); 3340cb93a386Sopenharmony_ci} 3341cb93a386Sopenharmony_ci 3342cb93a386Sopenharmony_ciSTAGE_PP(clamp_gamut, Ctx::None) { 3343cb93a386Sopenharmony_ci // It shouldn't be possible to get out-of-gamut 3344cb93a386Sopenharmony_ci // colors when working in lowp. 3345cb93a386Sopenharmony_ci} 3346cb93a386Sopenharmony_ci 3347cb93a386Sopenharmony_ciSTAGE_PP(premul, Ctx::None) { 3348cb93a386Sopenharmony_ci r = div255_accurate(r * a); 3349cb93a386Sopenharmony_ci g = div255_accurate(g * a); 3350cb93a386Sopenharmony_ci b = div255_accurate(b * a); 3351cb93a386Sopenharmony_ci} 3352cb93a386Sopenharmony_ciSTAGE_PP(premul_dst, Ctx::None) { 3353cb93a386Sopenharmony_ci dr = div255_accurate(dr * da); 3354cb93a386Sopenharmony_ci dg = div255_accurate(dg * da); 3355cb93a386Sopenharmony_ci db = div255_accurate(db * da); 3356cb93a386Sopenharmony_ci} 3357cb93a386Sopenharmony_ci 3358cb93a386Sopenharmony_ciSTAGE_PP(force_opaque , Ctx::None) { a = 255; } 3359cb93a386Sopenharmony_ciSTAGE_PP(force_opaque_dst, Ctx::None) { da = 255; } 3360cb93a386Sopenharmony_ci 3361cb93a386Sopenharmony_ciSTAGE_PP(swap_rb, Ctx::None) { 3362cb93a386Sopenharmony_ci auto tmp = r; 3363cb93a386Sopenharmony_ci r = b; 3364cb93a386Sopenharmony_ci b = tmp; 3365cb93a386Sopenharmony_ci} 3366cb93a386Sopenharmony_ciSTAGE_PP(swap_rb_dst, Ctx::None) { 3367cb93a386Sopenharmony_ci auto tmp = dr; 3368cb93a386Sopenharmony_ci dr = db; 3369cb93a386Sopenharmony_ci db = tmp; 3370cb93a386Sopenharmony_ci} 3371cb93a386Sopenharmony_ci 3372cb93a386Sopenharmony_ciSTAGE_PP(move_src_dst, Ctx::None) { 3373cb93a386Sopenharmony_ci dr = r; 3374cb93a386Sopenharmony_ci dg = g; 3375cb93a386Sopenharmony_ci db = b; 3376cb93a386Sopenharmony_ci da = a; 3377cb93a386Sopenharmony_ci} 3378cb93a386Sopenharmony_ci 3379cb93a386Sopenharmony_ciSTAGE_PP(move_dst_src, Ctx::None) { 3380cb93a386Sopenharmony_ci r = dr; 3381cb93a386Sopenharmony_ci g = dg; 3382cb93a386Sopenharmony_ci b = db; 3383cb93a386Sopenharmony_ci a = da; 3384cb93a386Sopenharmony_ci} 3385cb93a386Sopenharmony_ci 3386cb93a386Sopenharmony_ciSTAGE_PP(swap_src_dst, Ctx::None) { 3387cb93a386Sopenharmony_ci std::swap(r, dr); 3388cb93a386Sopenharmony_ci std::swap(g, dg); 3389cb93a386Sopenharmony_ci std::swap(b, db); 3390cb93a386Sopenharmony_ci std::swap(a, da); 3391cb93a386Sopenharmony_ci} 3392cb93a386Sopenharmony_ci 3393cb93a386Sopenharmony_ci// ~~~~~~ Blend modes ~~~~~~ // 3394cb93a386Sopenharmony_ci 3395cb93a386Sopenharmony_ci// The same logic applied to all 4 channels. 3396cb93a386Sopenharmony_ci#define BLEND_MODE(name) \ 3397cb93a386Sopenharmony_ci SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \ 3398cb93a386Sopenharmony_ci STAGE_PP(name, Ctx::None) { \ 3399cb93a386Sopenharmony_ci r = name##_channel(r,dr,a,da); \ 3400cb93a386Sopenharmony_ci g = name##_channel(g,dg,a,da); \ 3401cb93a386Sopenharmony_ci b = name##_channel(b,db,a,da); \ 3402cb93a386Sopenharmony_ci a = name##_channel(a,da,a,da); \ 3403cb93a386Sopenharmony_ci } \ 3404cb93a386Sopenharmony_ci SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da) 3405cb93a386Sopenharmony_ci 3406cb93a386Sopenharmony_ci BLEND_MODE(clear) { return 0; } 3407cb93a386Sopenharmony_ci BLEND_MODE(srcatop) { return div255( s*da + d*inv(sa) ); } 3408cb93a386Sopenharmony_ci BLEND_MODE(dstatop) { return div255( d*sa + s*inv(da) ); } 3409cb93a386Sopenharmony_ci BLEND_MODE(srcin) { return div255( s*da ); } 3410cb93a386Sopenharmony_ci BLEND_MODE(dstin) { return div255( d*sa ); } 3411cb93a386Sopenharmony_ci BLEND_MODE(srcout) { return div255( s*inv(da) ); } 3412cb93a386Sopenharmony_ci BLEND_MODE(dstout) { return div255( d*inv(sa) ); } 3413cb93a386Sopenharmony_ci BLEND_MODE(srcover) { return s + div255( d*inv(sa) ); } 3414cb93a386Sopenharmony_ci BLEND_MODE(dstover) { return d + div255( s*inv(da) ); } 3415cb93a386Sopenharmony_ci BLEND_MODE(modulate) { return div255( s*d ); } 3416cb93a386Sopenharmony_ci BLEND_MODE(multiply) { return div255( s*inv(da) + d*inv(sa) + s*d ); } 3417cb93a386Sopenharmony_ci BLEND_MODE(plus_) { return min(s+d, 255); } 3418cb93a386Sopenharmony_ci BLEND_MODE(screen) { return s + d - div255( s*d ); } 3419cb93a386Sopenharmony_ci BLEND_MODE(xor_) { return div255( s*inv(da) + d*inv(sa) ); } 3420cb93a386Sopenharmony_ci#undef BLEND_MODE 3421cb93a386Sopenharmony_ci 3422cb93a386Sopenharmony_ci// The same logic applied to color, and srcover for alpha. 3423cb93a386Sopenharmony_ci#define BLEND_MODE(name) \ 3424cb93a386Sopenharmony_ci SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \ 3425cb93a386Sopenharmony_ci STAGE_PP(name, Ctx::None) { \ 3426cb93a386Sopenharmony_ci r = name##_channel(r,dr,a,da); \ 3427cb93a386Sopenharmony_ci g = name##_channel(g,dg,a,da); \ 3428cb93a386Sopenharmony_ci b = name##_channel(b,db,a,da); \ 3429cb93a386Sopenharmony_ci a = a + div255( da*inv(a) ); \ 3430cb93a386Sopenharmony_ci } \ 3431cb93a386Sopenharmony_ci SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da) 3432cb93a386Sopenharmony_ci 3433cb93a386Sopenharmony_ci BLEND_MODE(darken) { return s + d - div255( max(s*da, d*sa) ); } 3434cb93a386Sopenharmony_ci BLEND_MODE(lighten) { return s + d - div255( min(s*da, d*sa) ); } 3435cb93a386Sopenharmony_ci BLEND_MODE(difference) { return s + d - 2*div255( min(s*da, d*sa) ); } 3436cb93a386Sopenharmony_ci BLEND_MODE(exclusion) { return s + d - 2*div255( s*d ); } 3437cb93a386Sopenharmony_ci 3438cb93a386Sopenharmony_ci BLEND_MODE(hardlight) { 3439cb93a386Sopenharmony_ci return div255( s*inv(da) + d*inv(sa) + 3440cb93a386Sopenharmony_ci if_then_else(2*s <= sa, 2*s*d, sa*da - 2*(sa-s)*(da-d)) ); 3441cb93a386Sopenharmony_ci } 3442cb93a386Sopenharmony_ci BLEND_MODE(overlay) { 3443cb93a386Sopenharmony_ci return div255( s*inv(da) + d*inv(sa) + 3444cb93a386Sopenharmony_ci if_then_else(2*d <= da, 2*s*d, sa*da - 2*(sa-s)*(da-d)) ); 3445cb93a386Sopenharmony_ci } 3446cb93a386Sopenharmony_ci#undef BLEND_MODE 3447cb93a386Sopenharmony_ci 3448cb93a386Sopenharmony_ci// ~~~~~~ Helpers for interacting with memory ~~~~~~ // 3449cb93a386Sopenharmony_ci 3450cb93a386Sopenharmony_citemplate <typename T> 3451cb93a386Sopenharmony_ciSI T* ptr_at_xy(const SkRasterPipeline_MemoryCtx* ctx, size_t dx, size_t dy) { 3452cb93a386Sopenharmony_ci return (T*)ctx->pixels + dy*ctx->stride + dx; 3453cb93a386Sopenharmony_ci} 3454cb93a386Sopenharmony_ci 3455cb93a386Sopenharmony_citemplate <typename T> 3456cb93a386Sopenharmony_ciSI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) { 3457cb93a386Sopenharmony_ci // Exclusive -> inclusive. 3458cb93a386Sopenharmony_ci const F w = sk_bit_cast<float>( sk_bit_cast<uint32_t>(ctx->width ) - 1), 3459cb93a386Sopenharmony_ci h = sk_bit_cast<float>( sk_bit_cast<uint32_t>(ctx->height) - 1); 3460cb93a386Sopenharmony_ci 3461cb93a386Sopenharmony_ci x = min(max(0, x), w); 3462cb93a386Sopenharmony_ci y = min(max(0, y), h); 3463cb93a386Sopenharmony_ci 3464cb93a386Sopenharmony_ci *ptr = (const T*)ctx->pixels; 3465cb93a386Sopenharmony_ci return trunc_(y)*ctx->stride + trunc_(x); 3466cb93a386Sopenharmony_ci} 3467cb93a386Sopenharmony_ci 3468cb93a386Sopenharmony_citemplate <typename T> 3469cb93a386Sopenharmony_ciSI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, I32 x, I32 y) { 3470cb93a386Sopenharmony_ci // Exclusive -> inclusive. 3471cb93a386Sopenharmony_ci const I32 w = ctx->width - 1, 3472cb93a386Sopenharmony_ci h = ctx->height - 1; 3473cb93a386Sopenharmony_ci 3474cb93a386Sopenharmony_ci U32 ax = cast<U32>(min(max(0, x), w)), 3475cb93a386Sopenharmony_ci ay = cast<U32>(min(max(0, y), h)); 3476cb93a386Sopenharmony_ci 3477cb93a386Sopenharmony_ci *ptr = (const T*)ctx->pixels; 3478cb93a386Sopenharmony_ci return ay * ctx->stride + ax; 3479cb93a386Sopenharmony_ci} 3480cb93a386Sopenharmony_ci 3481cb93a386Sopenharmony_citemplate <typename V, typename T> 3482cb93a386Sopenharmony_ciSI V load(const T* ptr, size_t tail) { 3483cb93a386Sopenharmony_ci V v = 0; 3484cb93a386Sopenharmony_ci switch (tail & (N-1)) { 3485cb93a386Sopenharmony_ci case 0: memcpy(&v, ptr, sizeof(v)); break; 3486cb93a386Sopenharmony_ci #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) 3487cb93a386Sopenharmony_ci case 15: v[14] = ptr[14]; [[fallthrough]]; 3488cb93a386Sopenharmony_ci case 14: v[13] = ptr[13]; [[fallthrough]]; 3489cb93a386Sopenharmony_ci case 13: v[12] = ptr[12]; [[fallthrough]]; 3490cb93a386Sopenharmony_ci case 12: memcpy(&v, ptr, 12*sizeof(T)); break; 3491cb93a386Sopenharmony_ci case 11: v[10] = ptr[10]; [[fallthrough]]; 3492cb93a386Sopenharmony_ci case 10: v[ 9] = ptr[ 9]; [[fallthrough]]; 3493cb93a386Sopenharmony_ci case 9: v[ 8] = ptr[ 8]; [[fallthrough]]; 3494cb93a386Sopenharmony_ci case 8: memcpy(&v, ptr, 8*sizeof(T)); break; 3495cb93a386Sopenharmony_ci #endif 3496cb93a386Sopenharmony_ci case 7: v[ 6] = ptr[ 6]; [[fallthrough]]; 3497cb93a386Sopenharmony_ci case 6: v[ 5] = ptr[ 5]; [[fallthrough]]; 3498cb93a386Sopenharmony_ci case 5: v[ 4] = ptr[ 4]; [[fallthrough]]; 3499cb93a386Sopenharmony_ci case 4: memcpy(&v, ptr, 4*sizeof(T)); break; 3500cb93a386Sopenharmony_ci case 3: v[ 2] = ptr[ 2]; [[fallthrough]]; 3501cb93a386Sopenharmony_ci case 2: memcpy(&v, ptr, 2*sizeof(T)); break; 3502cb93a386Sopenharmony_ci case 1: v[ 0] = ptr[ 0]; 3503cb93a386Sopenharmony_ci } 3504cb93a386Sopenharmony_ci return v; 3505cb93a386Sopenharmony_ci} 3506cb93a386Sopenharmony_citemplate <typename V, typename T> 3507cb93a386Sopenharmony_ciSI void store(T* ptr, size_t tail, V v) { 3508cb93a386Sopenharmony_ci switch (tail & (N-1)) { 3509cb93a386Sopenharmony_ci case 0: memcpy(ptr, &v, sizeof(v)); break; 3510cb93a386Sopenharmony_ci #if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) 3511cb93a386Sopenharmony_ci case 15: ptr[14] = v[14]; [[fallthrough]]; 3512cb93a386Sopenharmony_ci case 14: ptr[13] = v[13]; [[fallthrough]]; 3513cb93a386Sopenharmony_ci case 13: ptr[12] = v[12]; [[fallthrough]]; 3514cb93a386Sopenharmony_ci case 12: memcpy(ptr, &v, 12*sizeof(T)); break; 3515cb93a386Sopenharmony_ci case 11: ptr[10] = v[10]; [[fallthrough]]; 3516cb93a386Sopenharmony_ci case 10: ptr[ 9] = v[ 9]; [[fallthrough]]; 3517cb93a386Sopenharmony_ci case 9: ptr[ 8] = v[ 8]; [[fallthrough]]; 3518cb93a386Sopenharmony_ci case 8: memcpy(ptr, &v, 8*sizeof(T)); break; 3519cb93a386Sopenharmony_ci #endif 3520cb93a386Sopenharmony_ci case 7: ptr[ 6] = v[ 6]; [[fallthrough]]; 3521cb93a386Sopenharmony_ci case 6: ptr[ 5] = v[ 5]; [[fallthrough]]; 3522cb93a386Sopenharmony_ci case 5: ptr[ 4] = v[ 4]; [[fallthrough]]; 3523cb93a386Sopenharmony_ci case 4: memcpy(ptr, &v, 4*sizeof(T)); break; 3524cb93a386Sopenharmony_ci case 3: ptr[ 2] = v[ 2]; [[fallthrough]]; 3525cb93a386Sopenharmony_ci case 2: memcpy(ptr, &v, 2*sizeof(T)); break; 3526cb93a386Sopenharmony_ci case 1: ptr[ 0] = v[ 0]; 3527cb93a386Sopenharmony_ci } 3528cb93a386Sopenharmony_ci} 3529cb93a386Sopenharmony_ci 3530cb93a386Sopenharmony_ci#if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) 3531cb93a386Sopenharmony_ci template <typename V, typename T> 3532cb93a386Sopenharmony_ci SI V gather(const T* ptr, U32 ix) { 3533cb93a386Sopenharmony_ci return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]], 3534cb93a386Sopenharmony_ci ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], 3535cb93a386Sopenharmony_ci ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]], 3536cb93a386Sopenharmony_ci ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], }; 3537cb93a386Sopenharmony_ci } 3538cb93a386Sopenharmony_ci 3539cb93a386Sopenharmony_ci template<> 3540cb93a386Sopenharmony_ci F gather(const float* ptr, U32 ix) { 3541cb93a386Sopenharmony_ci __m256i lo, hi; 3542cb93a386Sopenharmony_ci split(ix, &lo, &hi); 3543cb93a386Sopenharmony_ci 3544cb93a386Sopenharmony_ci return join<F>(_mm256_i32gather_ps(ptr, lo, 4), 3545cb93a386Sopenharmony_ci _mm256_i32gather_ps(ptr, hi, 4)); 3546cb93a386Sopenharmony_ci } 3547cb93a386Sopenharmony_ci 3548cb93a386Sopenharmony_ci template<> 3549cb93a386Sopenharmony_ci U32 gather(const uint32_t* ptr, U32 ix) { 3550cb93a386Sopenharmony_ci __m256i lo, hi; 3551cb93a386Sopenharmony_ci split(ix, &lo, &hi); 3552cb93a386Sopenharmony_ci 3553cb93a386Sopenharmony_ci return join<U32>(_mm256_i32gather_epi32(ptr, lo, 4), 3554cb93a386Sopenharmony_ci _mm256_i32gather_epi32(ptr, hi, 4)); 3555cb93a386Sopenharmony_ci } 3556cb93a386Sopenharmony_ci#else 3557cb93a386Sopenharmony_ci template <typename V, typename T> 3558cb93a386Sopenharmony_ci SI V gather(const T* ptr, U32 ix) { 3559cb93a386Sopenharmony_ci return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]], 3560cb93a386Sopenharmony_ci ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], }; 3561cb93a386Sopenharmony_ci } 3562cb93a386Sopenharmony_ci#endif 3563cb93a386Sopenharmony_ci 3564cb93a386Sopenharmony_ci 3565cb93a386Sopenharmony_ci// ~~~~~~ 32-bit memory loads and stores ~~~~~~ // 3566cb93a386Sopenharmony_ci 3567cb93a386Sopenharmony_ciSI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) { 3568cb93a386Sopenharmony_ci#if 1 && defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) 3569cb93a386Sopenharmony_ci // Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely. 3570cb93a386Sopenharmony_ci __m256i _01,_23; 3571cb93a386Sopenharmony_ci split(rgba, &_01, &_23); 3572cb93a386Sopenharmony_ci __m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20), 3573cb93a386Sopenharmony_ci _13 = _mm256_permute2x128_si256(_01,_23, 0x31); 3574cb93a386Sopenharmony_ci rgba = join<U32>(_02, _13); 3575cb93a386Sopenharmony_ci 3576cb93a386Sopenharmony_ci auto cast_U16 = [](U32 v) -> U16 { 3577cb93a386Sopenharmony_ci __m256i _02,_13; 3578cb93a386Sopenharmony_ci split(v, &_02,&_13); 3579cb93a386Sopenharmony_ci return _mm256_packus_epi32(_02,_13); 3580cb93a386Sopenharmony_ci }; 3581cb93a386Sopenharmony_ci#else 3582cb93a386Sopenharmony_ci auto cast_U16 = [](U32 v) -> U16 { 3583cb93a386Sopenharmony_ci return cast<U16>(v); 3584cb93a386Sopenharmony_ci }; 3585cb93a386Sopenharmony_ci#endif 3586cb93a386Sopenharmony_ci *r = cast_U16(rgba & 65535) & 255; 3587cb93a386Sopenharmony_ci *g = cast_U16(rgba & 65535) >> 8; 3588cb93a386Sopenharmony_ci *b = cast_U16(rgba >> 16) & 255; 3589cb93a386Sopenharmony_ci *a = cast_U16(rgba >> 16) >> 8; 3590cb93a386Sopenharmony_ci} 3591cb93a386Sopenharmony_ci 3592cb93a386Sopenharmony_ciSI void load_8888_(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { 3593cb93a386Sopenharmony_ci#if 1 && defined(JUMPER_IS_NEON) 3594cb93a386Sopenharmony_ci uint8x8x4_t rgba; 3595cb93a386Sopenharmony_ci switch (tail & (N-1)) { 3596cb93a386Sopenharmony_ci case 0: rgba = vld4_u8 ((const uint8_t*)(ptr+0) ); break; 3597cb93a386Sopenharmony_ci case 7: rgba = vld4_lane_u8((const uint8_t*)(ptr+6), rgba, 6); [[fallthrough]]; 3598cb93a386Sopenharmony_ci case 6: rgba = vld4_lane_u8((const uint8_t*)(ptr+5), rgba, 5); [[fallthrough]]; 3599cb93a386Sopenharmony_ci case 5: rgba = vld4_lane_u8((const uint8_t*)(ptr+4), rgba, 4); [[fallthrough]]; 3600cb93a386Sopenharmony_ci case 4: rgba = vld4_lane_u8((const uint8_t*)(ptr+3), rgba, 3); [[fallthrough]]; 3601cb93a386Sopenharmony_ci case 3: rgba = vld4_lane_u8((const uint8_t*)(ptr+2), rgba, 2); [[fallthrough]]; 3602cb93a386Sopenharmony_ci case 2: rgba = vld4_lane_u8((const uint8_t*)(ptr+1), rgba, 1); [[fallthrough]]; 3603cb93a386Sopenharmony_ci case 1: rgba = vld4_lane_u8((const uint8_t*)(ptr+0), rgba, 0); 3604cb93a386Sopenharmony_ci } 3605cb93a386Sopenharmony_ci *r = cast<U16>(rgba.val[0]); 3606cb93a386Sopenharmony_ci *g = cast<U16>(rgba.val[1]); 3607cb93a386Sopenharmony_ci *b = cast<U16>(rgba.val[2]); 3608cb93a386Sopenharmony_ci *a = cast<U16>(rgba.val[3]); 3609cb93a386Sopenharmony_ci#else 3610cb93a386Sopenharmony_ci from_8888(load<U32>(ptr, tail), r,g,b,a); 3611cb93a386Sopenharmony_ci#endif 3612cb93a386Sopenharmony_ci} 3613cb93a386Sopenharmony_ciSI void store_8888_(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { 3614cb93a386Sopenharmony_ci#if 1 && defined(JUMPER_IS_NEON) 3615cb93a386Sopenharmony_ci uint8x8x4_t rgba = {{ 3616cb93a386Sopenharmony_ci cast<U8>(r), 3617cb93a386Sopenharmony_ci cast<U8>(g), 3618cb93a386Sopenharmony_ci cast<U8>(b), 3619cb93a386Sopenharmony_ci cast<U8>(a), 3620cb93a386Sopenharmony_ci }}; 3621cb93a386Sopenharmony_ci switch (tail & (N-1)) { 3622cb93a386Sopenharmony_ci case 0: vst4_u8 ((uint8_t*)(ptr+0), rgba ); break; 3623cb93a386Sopenharmony_ci case 7: vst4_lane_u8((uint8_t*)(ptr+6), rgba, 6); [[fallthrough]]; 3624cb93a386Sopenharmony_ci case 6: vst4_lane_u8((uint8_t*)(ptr+5), rgba, 5); [[fallthrough]]; 3625cb93a386Sopenharmony_ci case 5: vst4_lane_u8((uint8_t*)(ptr+4), rgba, 4); [[fallthrough]]; 3626cb93a386Sopenharmony_ci case 4: vst4_lane_u8((uint8_t*)(ptr+3), rgba, 3); [[fallthrough]]; 3627cb93a386Sopenharmony_ci case 3: vst4_lane_u8((uint8_t*)(ptr+2), rgba, 2); [[fallthrough]]; 3628cb93a386Sopenharmony_ci case 2: vst4_lane_u8((uint8_t*)(ptr+1), rgba, 1); [[fallthrough]]; 3629cb93a386Sopenharmony_ci case 1: vst4_lane_u8((uint8_t*)(ptr+0), rgba, 0); 3630cb93a386Sopenharmony_ci } 3631cb93a386Sopenharmony_ci#else 3632cb93a386Sopenharmony_ci store(ptr, tail, cast<U32>(r | (g<<8)) << 0 3633cb93a386Sopenharmony_ci | cast<U32>(b | (a<<8)) << 16); 3634cb93a386Sopenharmony_ci#endif 3635cb93a386Sopenharmony_ci} 3636cb93a386Sopenharmony_ci 3637cb93a386Sopenharmony_ciSTAGE_PP(load_8888, const SkRasterPipeline_MemoryCtx* ctx) { 3638cb93a386Sopenharmony_ci load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &r,&g,&b,&a); 3639cb93a386Sopenharmony_ci} 3640cb93a386Sopenharmony_ciSTAGE_PP(load_8888_dst, const SkRasterPipeline_MemoryCtx* ctx) { 3641cb93a386Sopenharmony_ci load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da); 3642cb93a386Sopenharmony_ci} 3643cb93a386Sopenharmony_ciSTAGE_PP(store_8888, const SkRasterPipeline_MemoryCtx* ctx) { 3644cb93a386Sopenharmony_ci store_8888_(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, r,g,b,a); 3645cb93a386Sopenharmony_ci} 3646cb93a386Sopenharmony_ciSTAGE_GP(gather_8888, const SkRasterPipeline_GatherCtx* ctx) { 3647cb93a386Sopenharmony_ci const uint32_t* ptr; 3648cb93a386Sopenharmony_ci U32 ix = ix_and_ptr(&ptr, ctx, x,y); 3649cb93a386Sopenharmony_ci from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a); 3650cb93a386Sopenharmony_ci} 3651cb93a386Sopenharmony_ci 3652cb93a386Sopenharmony_ci// ~~~~~~ 16-bit memory loads and stores ~~~~~~ // 3653cb93a386Sopenharmony_ci 3654cb93a386Sopenharmony_ciSI void from_565(U16 rgb, U16* r, U16* g, U16* b) { 3655cb93a386Sopenharmony_ci // Format for 565 buffers: 15|rrrrr gggggg bbbbb|0 3656cb93a386Sopenharmony_ci U16 R = (rgb >> 11) & 31, 3657cb93a386Sopenharmony_ci G = (rgb >> 5) & 63, 3658cb93a386Sopenharmony_ci B = (rgb >> 0) & 31; 3659cb93a386Sopenharmony_ci 3660cb93a386Sopenharmony_ci // These bit replications are the same as multiplying by 255/31 or 255/63 to scale to 8-bit. 3661cb93a386Sopenharmony_ci *r = (R << 3) | (R >> 2); 3662cb93a386Sopenharmony_ci *g = (G << 2) | (G >> 4); 3663cb93a386Sopenharmony_ci *b = (B << 3) | (B >> 2); 3664cb93a386Sopenharmony_ci} 3665cb93a386Sopenharmony_ciSI void load_565_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) { 3666cb93a386Sopenharmony_ci from_565(load<U16>(ptr, tail), r,g,b); 3667cb93a386Sopenharmony_ci} 3668cb93a386Sopenharmony_ciSI void store_565_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) { 3669cb93a386Sopenharmony_ci // Round from [0,255] to [0,31] or [0,63], as if x * (31/255.0f) + 0.5f. 3670cb93a386Sopenharmony_ci // (Don't feel like you need to find some fundamental truth in these... 3671cb93a386Sopenharmony_ci // they were brute-force searched.) 3672cb93a386Sopenharmony_ci U16 R = (r * 9 + 36) / 74, // 9/74 ≈ 31/255, plus 36/74, about half. 3673cb93a386Sopenharmony_ci G = (g * 21 + 42) / 85, // 21/85 = 63/255 exactly. 3674cb93a386Sopenharmony_ci B = (b * 9 + 36) / 74; 3675cb93a386Sopenharmony_ci // Pack them back into 15|rrrrr gggggg bbbbb|0. 3676cb93a386Sopenharmony_ci store(ptr, tail, R << 11 3677cb93a386Sopenharmony_ci | G << 5 3678cb93a386Sopenharmony_ci | B << 0); 3679cb93a386Sopenharmony_ci} 3680cb93a386Sopenharmony_ci 3681cb93a386Sopenharmony_ciSTAGE_PP(load_565, const SkRasterPipeline_MemoryCtx* ctx) { 3682cb93a386Sopenharmony_ci load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b); 3683cb93a386Sopenharmony_ci a = 255; 3684cb93a386Sopenharmony_ci} 3685cb93a386Sopenharmony_ciSTAGE_PP(load_565_dst, const SkRasterPipeline_MemoryCtx* ctx) { 3686cb93a386Sopenharmony_ci load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db); 3687cb93a386Sopenharmony_ci da = 255; 3688cb93a386Sopenharmony_ci} 3689cb93a386Sopenharmony_ciSTAGE_PP(store_565, const SkRasterPipeline_MemoryCtx* ctx) { 3690cb93a386Sopenharmony_ci store_565_(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b); 3691cb93a386Sopenharmony_ci} 3692cb93a386Sopenharmony_ciSTAGE_GP(gather_565, const SkRasterPipeline_GatherCtx* ctx) { 3693cb93a386Sopenharmony_ci const uint16_t* ptr; 3694cb93a386Sopenharmony_ci U32 ix = ix_and_ptr(&ptr, ctx, x,y); 3695cb93a386Sopenharmony_ci from_565(gather<U16>(ptr, ix), &r, &g, &b); 3696cb93a386Sopenharmony_ci a = 255; 3697cb93a386Sopenharmony_ci} 3698cb93a386Sopenharmony_ci 3699cb93a386Sopenharmony_ciSI void from_4444(U16 rgba, U16* r, U16* g, U16* b, U16* a) { 3700cb93a386Sopenharmony_ci // Format for 4444 buffers: 15|rrrr gggg bbbb aaaa|0. 3701cb93a386Sopenharmony_ci U16 R = (rgba >> 12) & 15, 3702cb93a386Sopenharmony_ci G = (rgba >> 8) & 15, 3703cb93a386Sopenharmony_ci B = (rgba >> 4) & 15, 3704cb93a386Sopenharmony_ci A = (rgba >> 0) & 15; 3705cb93a386Sopenharmony_ci 3706cb93a386Sopenharmony_ci // Scale [0,15] to [0,255]. 3707cb93a386Sopenharmony_ci *r = (R << 4) | R; 3708cb93a386Sopenharmony_ci *g = (G << 4) | G; 3709cb93a386Sopenharmony_ci *b = (B << 4) | B; 3710cb93a386Sopenharmony_ci *a = (A << 4) | A; 3711cb93a386Sopenharmony_ci} 3712cb93a386Sopenharmony_ciSI void load_4444_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) { 3713cb93a386Sopenharmony_ci from_4444(load<U16>(ptr, tail), r,g,b,a); 3714cb93a386Sopenharmony_ci} 3715cb93a386Sopenharmony_ciSI void store_4444_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) { 3716cb93a386Sopenharmony_ci // Round from [0,255] to [0,15], producing the same value as (x*(15/255.0f) + 0.5f). 3717cb93a386Sopenharmony_ci U16 R = (r + 8) / 17, 3718cb93a386Sopenharmony_ci G = (g + 8) / 17, 3719cb93a386Sopenharmony_ci B = (b + 8) / 17, 3720cb93a386Sopenharmony_ci A = (a + 8) / 17; 3721cb93a386Sopenharmony_ci // Pack them back into 15|rrrr gggg bbbb aaaa|0. 3722cb93a386Sopenharmony_ci store(ptr, tail, R << 12 3723cb93a386Sopenharmony_ci | G << 8 3724cb93a386Sopenharmony_ci | B << 4 3725cb93a386Sopenharmony_ci | A << 0); 3726cb93a386Sopenharmony_ci} 3727cb93a386Sopenharmony_ci 3728cb93a386Sopenharmony_ciSTAGE_PP(load_4444, const SkRasterPipeline_MemoryCtx* ctx) { 3729cb93a386Sopenharmony_ci load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b,&a); 3730cb93a386Sopenharmony_ci} 3731cb93a386Sopenharmony_ciSTAGE_PP(load_4444_dst, const SkRasterPipeline_MemoryCtx* ctx) { 3732cb93a386Sopenharmony_ci load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da); 3733cb93a386Sopenharmony_ci} 3734cb93a386Sopenharmony_ciSTAGE_PP(store_4444, const SkRasterPipeline_MemoryCtx* ctx) { 3735cb93a386Sopenharmony_ci store_4444_(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b,a); 3736cb93a386Sopenharmony_ci} 3737cb93a386Sopenharmony_ciSTAGE_GP(gather_4444, const SkRasterPipeline_GatherCtx* ctx) { 3738cb93a386Sopenharmony_ci const uint16_t* ptr; 3739cb93a386Sopenharmony_ci U32 ix = ix_and_ptr(&ptr, ctx, x,y); 3740cb93a386Sopenharmony_ci from_4444(gather<U16>(ptr, ix), &r,&g,&b,&a); 3741cb93a386Sopenharmony_ci} 3742cb93a386Sopenharmony_ci 3743cb93a386Sopenharmony_ciSI void from_88(U16 rg, U16* r, U16* g) { 3744cb93a386Sopenharmony_ci *r = (rg & 0xFF); 3745cb93a386Sopenharmony_ci *g = (rg >> 8); 3746cb93a386Sopenharmony_ci} 3747cb93a386Sopenharmony_ci 3748cb93a386Sopenharmony_ciSI void load_88_(const uint16_t* ptr, size_t tail, U16* r, U16* g) { 3749cb93a386Sopenharmony_ci#if 1 && defined(JUMPER_IS_NEON) 3750cb93a386Sopenharmony_ci uint8x8x2_t rg; 3751cb93a386Sopenharmony_ci switch (tail & (N-1)) { 3752cb93a386Sopenharmony_ci case 0: rg = vld2_u8 ((const uint8_t*)(ptr+0) ); break; 3753cb93a386Sopenharmony_ci case 7: rg = vld2_lane_u8((const uint8_t*)(ptr+6), rg, 6); [[fallthrough]]; 3754cb93a386Sopenharmony_ci case 6: rg = vld2_lane_u8((const uint8_t*)(ptr+5), rg, 5); [[fallthrough]]; 3755cb93a386Sopenharmony_ci case 5: rg = vld2_lane_u8((const uint8_t*)(ptr+4), rg, 4); [[fallthrough]]; 3756cb93a386Sopenharmony_ci case 4: rg = vld2_lane_u8((const uint8_t*)(ptr+3), rg, 3); [[fallthrough]]; 3757cb93a386Sopenharmony_ci case 3: rg = vld2_lane_u8((const uint8_t*)(ptr+2), rg, 2); [[fallthrough]]; 3758cb93a386Sopenharmony_ci case 2: rg = vld2_lane_u8((const uint8_t*)(ptr+1), rg, 1); [[fallthrough]]; 3759cb93a386Sopenharmony_ci case 1: rg = vld2_lane_u8((const uint8_t*)(ptr+0), rg, 0); 3760cb93a386Sopenharmony_ci } 3761cb93a386Sopenharmony_ci *r = cast<U16>(rg.val[0]); 3762cb93a386Sopenharmony_ci *g = cast<U16>(rg.val[1]); 3763cb93a386Sopenharmony_ci#else 3764cb93a386Sopenharmony_ci from_88(load<U16>(ptr, tail), r,g); 3765cb93a386Sopenharmony_ci#endif 3766cb93a386Sopenharmony_ci} 3767cb93a386Sopenharmony_ci 3768cb93a386Sopenharmony_ciSI void store_88_(uint16_t* ptr, size_t tail, U16 r, U16 g) { 3769cb93a386Sopenharmony_ci#if 1 && defined(JUMPER_IS_NEON) 3770cb93a386Sopenharmony_ci uint8x8x2_t rg = {{ 3771cb93a386Sopenharmony_ci cast<U8>(r), 3772cb93a386Sopenharmony_ci cast<U8>(g), 3773cb93a386Sopenharmony_ci }}; 3774cb93a386Sopenharmony_ci switch (tail & (N-1)) { 3775cb93a386Sopenharmony_ci case 0: vst2_u8 ((uint8_t*)(ptr+0), rg ); break; 3776cb93a386Sopenharmony_ci case 7: vst2_lane_u8((uint8_t*)(ptr+6), rg, 6); [[fallthrough]]; 3777cb93a386Sopenharmony_ci case 6: vst2_lane_u8((uint8_t*)(ptr+5), rg, 5); [[fallthrough]]; 3778cb93a386Sopenharmony_ci case 5: vst2_lane_u8((uint8_t*)(ptr+4), rg, 4); [[fallthrough]]; 3779cb93a386Sopenharmony_ci case 4: vst2_lane_u8((uint8_t*)(ptr+3), rg, 3); [[fallthrough]]; 3780cb93a386Sopenharmony_ci case 3: vst2_lane_u8((uint8_t*)(ptr+2), rg, 2); [[fallthrough]]; 3781cb93a386Sopenharmony_ci case 2: vst2_lane_u8((uint8_t*)(ptr+1), rg, 1); [[fallthrough]]; 3782cb93a386Sopenharmony_ci case 1: vst2_lane_u8((uint8_t*)(ptr+0), rg, 0); 3783cb93a386Sopenharmony_ci } 3784cb93a386Sopenharmony_ci#else 3785cb93a386Sopenharmony_ci store(ptr, tail, cast<U16>(r | (g<<8)) << 0); 3786cb93a386Sopenharmony_ci#endif 3787cb93a386Sopenharmony_ci} 3788cb93a386Sopenharmony_ci 3789cb93a386Sopenharmony_ciSTAGE_PP(load_rg88, const SkRasterPipeline_MemoryCtx* ctx) { 3790cb93a386Sopenharmony_ci load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), tail, &r, &g); 3791cb93a386Sopenharmony_ci b = 0; 3792cb93a386Sopenharmony_ci a = 255; 3793cb93a386Sopenharmony_ci} 3794cb93a386Sopenharmony_ciSTAGE_PP(load_rg88_dst, const SkRasterPipeline_MemoryCtx* ctx) { 3795cb93a386Sopenharmony_ci load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), tail, &dr, &dg); 3796cb93a386Sopenharmony_ci db = 0; 3797cb93a386Sopenharmony_ci da = 255; 3798cb93a386Sopenharmony_ci} 3799cb93a386Sopenharmony_ciSTAGE_PP(store_rg88, const SkRasterPipeline_MemoryCtx* ctx) { 3800cb93a386Sopenharmony_ci store_88_(ptr_at_xy<uint16_t>(ctx, dx, dy), tail, r, g); 3801cb93a386Sopenharmony_ci} 3802cb93a386Sopenharmony_ciSTAGE_GP(gather_rg88, const SkRasterPipeline_GatherCtx* ctx) { 3803cb93a386Sopenharmony_ci const uint16_t* ptr; 3804cb93a386Sopenharmony_ci U32 ix = ix_and_ptr(&ptr, ctx, x, y); 3805cb93a386Sopenharmony_ci from_88(gather<U16>(ptr, ix), &r, &g); 3806cb93a386Sopenharmony_ci b = 0; 3807cb93a386Sopenharmony_ci a = 255; 3808cb93a386Sopenharmony_ci} 3809cb93a386Sopenharmony_ci 3810cb93a386Sopenharmony_ci// ~~~~~~ 8-bit memory loads and stores ~~~~~~ // 3811cb93a386Sopenharmony_ci 3812cb93a386Sopenharmony_ciSI U16 load_8(const uint8_t* ptr, size_t tail) { 3813cb93a386Sopenharmony_ci return cast<U16>(load<U8>(ptr, tail)); 3814cb93a386Sopenharmony_ci} 3815cb93a386Sopenharmony_ciSI void store_8(uint8_t* ptr, size_t tail, U16 v) { 3816cb93a386Sopenharmony_ci store(ptr, tail, cast<U8>(v)); 3817cb93a386Sopenharmony_ci} 3818cb93a386Sopenharmony_ci 3819cb93a386Sopenharmony_ciSTAGE_PP(load_a8, const SkRasterPipeline_MemoryCtx* ctx) { 3820cb93a386Sopenharmony_ci r = g = b = 0; 3821cb93a386Sopenharmony_ci a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail); 3822cb93a386Sopenharmony_ci} 3823cb93a386Sopenharmony_ciSTAGE_PP(load_a8_dst, const SkRasterPipeline_MemoryCtx* ctx) { 3824cb93a386Sopenharmony_ci dr = dg = db = 0; 3825cb93a386Sopenharmony_ci da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail); 3826cb93a386Sopenharmony_ci} 3827cb93a386Sopenharmony_ciSTAGE_PP(store_a8, const SkRasterPipeline_MemoryCtx* ctx) { 3828cb93a386Sopenharmony_ci store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, a); 3829cb93a386Sopenharmony_ci} 3830cb93a386Sopenharmony_ciSTAGE_GP(gather_a8, const SkRasterPipeline_GatherCtx* ctx) { 3831cb93a386Sopenharmony_ci const uint8_t* ptr; 3832cb93a386Sopenharmony_ci U32 ix = ix_and_ptr(&ptr, ctx, x,y); 3833cb93a386Sopenharmony_ci r = g = b = 0; 3834cb93a386Sopenharmony_ci a = cast<U16>(gather<U8>(ptr, ix)); 3835cb93a386Sopenharmony_ci} 3836cb93a386Sopenharmony_ci 3837cb93a386Sopenharmony_ciSTAGE_PP(alpha_to_gray, Ctx::None) { 3838cb93a386Sopenharmony_ci r = g = b = a; 3839cb93a386Sopenharmony_ci a = 255; 3840cb93a386Sopenharmony_ci} 3841cb93a386Sopenharmony_ciSTAGE_PP(alpha_to_gray_dst, Ctx::None) { 3842cb93a386Sopenharmony_ci dr = dg = db = da; 3843cb93a386Sopenharmony_ci da = 255; 3844cb93a386Sopenharmony_ci} 3845cb93a386Sopenharmony_ciSTAGE_PP(bt709_luminance_or_luma_to_alpha, Ctx::None) { 3846cb93a386Sopenharmony_ci a = (r*54 + g*183 + b*19)/256; // 0.2126, 0.7152, 0.0722 with 256 denominator. 3847cb93a386Sopenharmony_ci r = g = b = 0; 3848cb93a386Sopenharmony_ci} 3849cb93a386Sopenharmony_ciSTAGE_PP(bt709_luminance_or_luma_to_rgb, Ctx::None) { 3850cb93a386Sopenharmony_ci r = g = b =(r*54 + g*183 + b*19)/256; // 0.2126, 0.7152, 0.0722 with 256 denominator. 3851cb93a386Sopenharmony_ci} 3852cb93a386Sopenharmony_ci 3853cb93a386Sopenharmony_ci// ~~~~~~ Coverage scales / lerps ~~~~~~ // 3854cb93a386Sopenharmony_ci 3855cb93a386Sopenharmony_ciSTAGE_PP(load_src, const uint16_t* ptr) { 3856cb93a386Sopenharmony_ci r = sk_unaligned_load<U16>(ptr + 0*N); 3857cb93a386Sopenharmony_ci g = sk_unaligned_load<U16>(ptr + 1*N); 3858cb93a386Sopenharmony_ci b = sk_unaligned_load<U16>(ptr + 2*N); 3859cb93a386Sopenharmony_ci a = sk_unaligned_load<U16>(ptr + 3*N); 3860cb93a386Sopenharmony_ci} 3861cb93a386Sopenharmony_ciSTAGE_PP(store_src, uint16_t* ptr) { 3862cb93a386Sopenharmony_ci sk_unaligned_store(ptr + 0*N, r); 3863cb93a386Sopenharmony_ci sk_unaligned_store(ptr + 1*N, g); 3864cb93a386Sopenharmony_ci sk_unaligned_store(ptr + 2*N, b); 3865cb93a386Sopenharmony_ci sk_unaligned_store(ptr + 3*N, a); 3866cb93a386Sopenharmony_ci} 3867cb93a386Sopenharmony_ciSTAGE_PP(store_src_a, uint16_t* ptr) { 3868cb93a386Sopenharmony_ci sk_unaligned_store(ptr, a); 3869cb93a386Sopenharmony_ci} 3870cb93a386Sopenharmony_ciSTAGE_PP(load_dst, const uint16_t* ptr) { 3871cb93a386Sopenharmony_ci dr = sk_unaligned_load<U16>(ptr + 0*N); 3872cb93a386Sopenharmony_ci dg = sk_unaligned_load<U16>(ptr + 1*N); 3873cb93a386Sopenharmony_ci db = sk_unaligned_load<U16>(ptr + 2*N); 3874cb93a386Sopenharmony_ci da = sk_unaligned_load<U16>(ptr + 3*N); 3875cb93a386Sopenharmony_ci} 3876cb93a386Sopenharmony_ciSTAGE_PP(store_dst, uint16_t* ptr) { 3877cb93a386Sopenharmony_ci sk_unaligned_store(ptr + 0*N, dr); 3878cb93a386Sopenharmony_ci sk_unaligned_store(ptr + 1*N, dg); 3879cb93a386Sopenharmony_ci sk_unaligned_store(ptr + 2*N, db); 3880cb93a386Sopenharmony_ci sk_unaligned_store(ptr + 3*N, da); 3881cb93a386Sopenharmony_ci} 3882cb93a386Sopenharmony_ci 3883cb93a386Sopenharmony_ci// ~~~~~~ Coverage scales / lerps ~~~~~~ // 3884cb93a386Sopenharmony_ci 3885cb93a386Sopenharmony_ciSTAGE_PP(scale_1_float, const float* f) { 3886cb93a386Sopenharmony_ci U16 c = from_float(*f); 3887cb93a386Sopenharmony_ci r = div255( r * c ); 3888cb93a386Sopenharmony_ci g = div255( g * c ); 3889cb93a386Sopenharmony_ci b = div255( b * c ); 3890cb93a386Sopenharmony_ci a = div255( a * c ); 3891cb93a386Sopenharmony_ci} 3892cb93a386Sopenharmony_ciSTAGE_PP(lerp_1_float, const float* f) { 3893cb93a386Sopenharmony_ci U16 c = from_float(*f); 3894cb93a386Sopenharmony_ci r = lerp(dr, r, c); 3895cb93a386Sopenharmony_ci g = lerp(dg, g, c); 3896cb93a386Sopenharmony_ci b = lerp(db, b, c); 3897cb93a386Sopenharmony_ci a = lerp(da, a, c); 3898cb93a386Sopenharmony_ci} 3899cb93a386Sopenharmony_ciSTAGE_PP(scale_native, const uint16_t scales[]) { 3900cb93a386Sopenharmony_ci auto c = sk_unaligned_load<U16>(scales); 3901cb93a386Sopenharmony_ci r = div255( r * c ); 3902cb93a386Sopenharmony_ci g = div255( g * c ); 3903cb93a386Sopenharmony_ci b = div255( b * c ); 3904cb93a386Sopenharmony_ci a = div255( a * c ); 3905cb93a386Sopenharmony_ci} 3906cb93a386Sopenharmony_ci 3907cb93a386Sopenharmony_ciSTAGE_PP(lerp_native, const uint16_t scales[]) { 3908cb93a386Sopenharmony_ci auto c = sk_unaligned_load<U16>(scales); 3909cb93a386Sopenharmony_ci r = lerp(dr, r, c); 3910cb93a386Sopenharmony_ci g = lerp(dg, g, c); 3911cb93a386Sopenharmony_ci b = lerp(db, b, c); 3912cb93a386Sopenharmony_ci a = lerp(da, a, c); 3913cb93a386Sopenharmony_ci} 3914cb93a386Sopenharmony_ci 3915cb93a386Sopenharmony_ciSTAGE_PP(scale_u8, const SkRasterPipeline_MemoryCtx* ctx) { 3916cb93a386Sopenharmony_ci U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail); 3917cb93a386Sopenharmony_ci r = div255( r * c ); 3918cb93a386Sopenharmony_ci g = div255( g * c ); 3919cb93a386Sopenharmony_ci b = div255( b * c ); 3920cb93a386Sopenharmony_ci a = div255( a * c ); 3921cb93a386Sopenharmony_ci} 3922cb93a386Sopenharmony_ciSTAGE_PP(lerp_u8, const SkRasterPipeline_MemoryCtx* ctx) { 3923cb93a386Sopenharmony_ci U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail); 3924cb93a386Sopenharmony_ci r = lerp(dr, r, c); 3925cb93a386Sopenharmony_ci g = lerp(dg, g, c); 3926cb93a386Sopenharmony_ci b = lerp(db, b, c); 3927cb93a386Sopenharmony_ci a = lerp(da, a, c); 3928cb93a386Sopenharmony_ci} 3929cb93a386Sopenharmony_ci 3930cb93a386Sopenharmony_ci// Derive alpha's coverage from rgb coverage and the values of src and dst alpha. 3931cb93a386Sopenharmony_ciSI U16 alpha_coverage_from_rgb_coverage(U16 a, U16 da, U16 cr, U16 cg, U16 cb) { 3932cb93a386Sopenharmony_ci return if_then_else(a < da, min(cr, min(cg,cb)) 3933cb93a386Sopenharmony_ci , max(cr, max(cg,cb))); 3934cb93a386Sopenharmony_ci} 3935cb93a386Sopenharmony_ciSTAGE_PP(scale_565, const SkRasterPipeline_MemoryCtx* ctx) { 3936cb93a386Sopenharmony_ci U16 cr,cg,cb; 3937cb93a386Sopenharmony_ci load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb); 3938cb93a386Sopenharmony_ci U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb); 3939cb93a386Sopenharmony_ci 3940cb93a386Sopenharmony_ci r = div255( r * cr ); 3941cb93a386Sopenharmony_ci g = div255( g * cg ); 3942cb93a386Sopenharmony_ci b = div255( b * cb ); 3943cb93a386Sopenharmony_ci a = div255( a * ca ); 3944cb93a386Sopenharmony_ci} 3945cb93a386Sopenharmony_ciSTAGE_PP(lerp_565, const SkRasterPipeline_MemoryCtx* ctx) { 3946cb93a386Sopenharmony_ci U16 cr,cg,cb; 3947cb93a386Sopenharmony_ci load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb); 3948cb93a386Sopenharmony_ci U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb); 3949cb93a386Sopenharmony_ci 3950cb93a386Sopenharmony_ci r = lerp(dr, r, cr); 3951cb93a386Sopenharmony_ci g = lerp(dg, g, cg); 3952cb93a386Sopenharmony_ci b = lerp(db, b, cb); 3953cb93a386Sopenharmony_ci a = lerp(da, a, ca); 3954cb93a386Sopenharmony_ci} 3955cb93a386Sopenharmony_ci 3956cb93a386Sopenharmony_ciSTAGE_PP(emboss, const SkRasterPipeline_EmbossCtx* ctx) { 3957cb93a386Sopenharmony_ci U16 mul = load_8(ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy), tail), 3958cb93a386Sopenharmony_ci add = load_8(ptr_at_xy<const uint8_t>(&ctx->add, dx,dy), tail); 3959cb93a386Sopenharmony_ci 3960cb93a386Sopenharmony_ci r = min(div255(r*mul) + add, a); 3961cb93a386Sopenharmony_ci g = min(div255(g*mul) + add, a); 3962cb93a386Sopenharmony_ci b = min(div255(b*mul) + add, a); 3963cb93a386Sopenharmony_ci} 3964cb93a386Sopenharmony_ci 3965cb93a386Sopenharmony_ci 3966cb93a386Sopenharmony_ci// ~~~~~~ Gradient stages ~~~~~~ // 3967cb93a386Sopenharmony_ci 3968cb93a386Sopenharmony_ci// Clamp x to [0,1], both sides inclusive (think, gradients). 3969cb93a386Sopenharmony_ci// Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN. 3970cb93a386Sopenharmony_ciSI F clamp_01(F v) { return min(max(0, v), 1); } 3971cb93a386Sopenharmony_ci 3972cb93a386Sopenharmony_ciSTAGE_GG(clamp_x_1 , Ctx::None) { x = clamp_01(x); } 3973cb93a386Sopenharmony_ciSTAGE_GG(repeat_x_1, Ctx::None) { x = clamp_01(x - floor_(x)); } 3974cb93a386Sopenharmony_ciSTAGE_GG(mirror_x_1, Ctx::None) { 3975cb93a386Sopenharmony_ci auto two = [](F x){ return x+x; }; 3976cb93a386Sopenharmony_ci x = clamp_01(abs_( (x-1.0f) - two(floor_((x-1.0f)*0.5f)) - 1.0f )); 3977cb93a386Sopenharmony_ci} 3978cb93a386Sopenharmony_ci 3979cb93a386Sopenharmony_ciSI I16 cond_to_mask_16(I32 cond) { return cast<I16>(cond); } 3980cb93a386Sopenharmony_ci 3981cb93a386Sopenharmony_ciSTAGE_GG(decal_x, SkRasterPipeline_DecalTileCtx* ctx) { 3982cb93a386Sopenharmony_ci auto w = ctx->limit_x; 3983cb93a386Sopenharmony_ci sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w))); 3984cb93a386Sopenharmony_ci} 3985cb93a386Sopenharmony_ciSTAGE_GG(decal_y, SkRasterPipeline_DecalTileCtx* ctx) { 3986cb93a386Sopenharmony_ci auto h = ctx->limit_y; 3987cb93a386Sopenharmony_ci sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= y) & (y < h))); 3988cb93a386Sopenharmony_ci} 3989cb93a386Sopenharmony_ciSTAGE_GG(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) { 3990cb93a386Sopenharmony_ci auto w = ctx->limit_x; 3991cb93a386Sopenharmony_ci auto h = ctx->limit_y; 3992cb93a386Sopenharmony_ci sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w) & (0 <= y) & (y < h))); 3993cb93a386Sopenharmony_ci} 3994cb93a386Sopenharmony_ciSTAGE_PP(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) { 3995cb93a386Sopenharmony_ci auto mask = sk_unaligned_load<U16>(ctx->mask); 3996cb93a386Sopenharmony_ci r = r & mask; 3997cb93a386Sopenharmony_ci g = g & mask; 3998cb93a386Sopenharmony_ci b = b & mask; 3999cb93a386Sopenharmony_ci a = a & mask; 4000cb93a386Sopenharmony_ci} 4001cb93a386Sopenharmony_ci 4002cb93a386Sopenharmony_ciSI void round_F_to_U16(F R, F G, F B, F A, bool interpolatedInPremul, 4003cb93a386Sopenharmony_ci U16* r, U16* g, U16* b, U16* a) { 4004cb93a386Sopenharmony_ci auto round = [](F x) { return cast<U16>(x * 255.0f + 0.5f); }; 4005cb93a386Sopenharmony_ci 4006cb93a386Sopenharmony_ci F limit = interpolatedInPremul ? A 4007cb93a386Sopenharmony_ci : 1; 4008cb93a386Sopenharmony_ci *r = round(min(max(0,R), limit)); 4009cb93a386Sopenharmony_ci *g = round(min(max(0,G), limit)); 4010cb93a386Sopenharmony_ci *b = round(min(max(0,B), limit)); 4011cb93a386Sopenharmony_ci *a = round(A); // we assume alpha is already in [0,1]. 4012cb93a386Sopenharmony_ci} 4013cb93a386Sopenharmony_ci 4014cb93a386Sopenharmony_ciSI void gradient_lookup(const SkRasterPipeline_GradientCtx* c, U32 idx, F t, 4015cb93a386Sopenharmony_ci U16* r, U16* g, U16* b, U16* a) { 4016cb93a386Sopenharmony_ci 4017cb93a386Sopenharmony_ci F fr, fg, fb, fa, br, bg, bb, ba; 4018cb93a386Sopenharmony_ci#if defined(JUMPER_IS_HSW) || defined(JUMPER_IS_SKX) 4019cb93a386Sopenharmony_ci if (c->stopCount <=8) { 4020cb93a386Sopenharmony_ci __m256i lo, hi; 4021cb93a386Sopenharmony_ci split(idx, &lo, &hi); 4022cb93a386Sopenharmony_ci 4023cb93a386Sopenharmony_ci fr = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), lo), 4024cb93a386Sopenharmony_ci _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), hi)); 4025cb93a386Sopenharmony_ci br = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), lo), 4026cb93a386Sopenharmony_ci _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), hi)); 4027cb93a386Sopenharmony_ci fg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), lo), 4028cb93a386Sopenharmony_ci _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), hi)); 4029cb93a386Sopenharmony_ci bg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), lo), 4030cb93a386Sopenharmony_ci _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), hi)); 4031cb93a386Sopenharmony_ci fb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), lo), 4032cb93a386Sopenharmony_ci _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), hi)); 4033cb93a386Sopenharmony_ci bb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), lo), 4034cb93a386Sopenharmony_ci _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), hi)); 4035cb93a386Sopenharmony_ci fa = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), lo), 4036cb93a386Sopenharmony_ci _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), hi)); 4037cb93a386Sopenharmony_ci ba = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), lo), 4038cb93a386Sopenharmony_ci _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), hi)); 4039cb93a386Sopenharmony_ci } else 4040cb93a386Sopenharmony_ci#endif 4041cb93a386Sopenharmony_ci { 4042cb93a386Sopenharmony_ci fr = gather<F>(c->fs[0], idx); 4043cb93a386Sopenharmony_ci fg = gather<F>(c->fs[1], idx); 4044cb93a386Sopenharmony_ci fb = gather<F>(c->fs[2], idx); 4045cb93a386Sopenharmony_ci fa = gather<F>(c->fs[3], idx); 4046cb93a386Sopenharmony_ci br = gather<F>(c->bs[0], idx); 4047cb93a386Sopenharmony_ci bg = gather<F>(c->bs[1], idx); 4048cb93a386Sopenharmony_ci bb = gather<F>(c->bs[2], idx); 4049cb93a386Sopenharmony_ci ba = gather<F>(c->bs[3], idx); 4050cb93a386Sopenharmony_ci } 4051cb93a386Sopenharmony_ci round_F_to_U16(mad(t, fr, br), 4052cb93a386Sopenharmony_ci mad(t, fg, bg), 4053cb93a386Sopenharmony_ci mad(t, fb, bb), 4054cb93a386Sopenharmony_ci mad(t, fa, ba), 4055cb93a386Sopenharmony_ci c->interpolatedInPremul, 4056cb93a386Sopenharmony_ci r,g,b,a); 4057cb93a386Sopenharmony_ci} 4058cb93a386Sopenharmony_ci 4059cb93a386Sopenharmony_ciSTAGE_GP(gradient, const SkRasterPipeline_GradientCtx* c) { 4060cb93a386Sopenharmony_ci auto t = x; 4061cb93a386Sopenharmony_ci U32 idx = 0; 4062cb93a386Sopenharmony_ci 4063cb93a386Sopenharmony_ci // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop. 4064cb93a386Sopenharmony_ci for (size_t i = 1; i < c->stopCount; i++) { 4065cb93a386Sopenharmony_ci idx += if_then_else(t >= c->ts[i], U32(1), U32(0)); 4066cb93a386Sopenharmony_ci } 4067cb93a386Sopenharmony_ci 4068cb93a386Sopenharmony_ci gradient_lookup(c, idx, t, &r, &g, &b, &a); 4069cb93a386Sopenharmony_ci} 4070cb93a386Sopenharmony_ci 4071cb93a386Sopenharmony_ciSTAGE_GP(evenly_spaced_gradient, const SkRasterPipeline_GradientCtx* c) { 4072cb93a386Sopenharmony_ci auto t = x; 4073cb93a386Sopenharmony_ci auto idx = trunc_(t * (c->stopCount-1)); 4074cb93a386Sopenharmony_ci gradient_lookup(c, idx, t, &r, &g, &b, &a); 4075cb93a386Sopenharmony_ci} 4076cb93a386Sopenharmony_ci 4077cb93a386Sopenharmony_ciSTAGE_GP(evenly_spaced_2_stop_gradient, const SkRasterPipeline_EvenlySpaced2StopGradientCtx* c) { 4078cb93a386Sopenharmony_ci auto t = x; 4079cb93a386Sopenharmony_ci round_F_to_U16(mad(t, c->f[0], c->b[0]), 4080cb93a386Sopenharmony_ci mad(t, c->f[1], c->b[1]), 4081cb93a386Sopenharmony_ci mad(t, c->f[2], c->b[2]), 4082cb93a386Sopenharmony_ci mad(t, c->f[3], c->b[3]), 4083cb93a386Sopenharmony_ci c->interpolatedInPremul, 4084cb93a386Sopenharmony_ci &r,&g,&b,&a); 4085cb93a386Sopenharmony_ci} 4086cb93a386Sopenharmony_ci 4087cb93a386Sopenharmony_ciSI F cast (U32 v) { return __builtin_convertvector((I32)v, F); } 4088cb93a386Sopenharmony_ci#if !defined(SK_SUPPORT_LEGACY_BILERP_HIGHP) 4089cb93a386Sopenharmony_ciSTAGE_GP(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) { 4090cb93a386Sopenharmony_ci // Quantize sample point and transform into lerp coordinates converting them to 16.16 fixed 4091cb93a386Sopenharmony_ci // point number. 4092cb93a386Sopenharmony_ci I32 qx = cast<I32>(floor_(65536.0f * x + 0.5f)) - 32768, 4093cb93a386Sopenharmony_ci qy = cast<I32>(floor_(65536.0f * y + 0.5f)) - 32768; 4094cb93a386Sopenharmony_ci 4095cb93a386Sopenharmony_ci // Calculate screen coordinates sx & sy by flooring qx and qy. 4096cb93a386Sopenharmony_ci I32 sx = qx >> 16, 4097cb93a386Sopenharmony_ci sy = qy >> 16; 4098cb93a386Sopenharmony_ci 4099cb93a386Sopenharmony_ci // We are going to perform a change of parameters for qx on [0, 1) to tx on [-1, 1). 4100cb93a386Sopenharmony_ci // This will put tx in Q15 format for use with q_mult. 4101cb93a386Sopenharmony_ci // Calculate tx and ty on the interval of [-1, 1). Give {qx} and {qy} are on the interval 4102cb93a386Sopenharmony_ci // [0, 1), where {v} is fract(v), we can transform to tx in the following manner ty follows 4103cb93a386Sopenharmony_ci // the same math: 4104cb93a386Sopenharmony_ci // tx = 2 * {qx} - 1, so 4105cb93a386Sopenharmony_ci // {qx} = (tx + 1) / 2. 4106cb93a386Sopenharmony_ci // Calculate {qx} - 1 and {qy} - 1 where the {} operation is handled by the cast, and the - 1 4107cb93a386Sopenharmony_ci // is handled by the ^ 0x8000, dividing by 2 is deferred and handled in lerpX and lerpY in 4108cb93a386Sopenharmony_ci // order to use the full 16-bit resolution. 4109cb93a386Sopenharmony_ci I16 tx = cast<I16>(qx ^ 0x8000), 4110cb93a386Sopenharmony_ci ty = cast<I16>(qy ^ 0x8000); 4111cb93a386Sopenharmony_ci 4112cb93a386Sopenharmony_ci // Substituting the {qx} by the equation for tx from above into the lerp equation where v is 4113cb93a386Sopenharmony_ci // the lerped value: 4114cb93a386Sopenharmony_ci // v = {qx}*(R - L) + L, 4115cb93a386Sopenharmony_ci // v = 1/2*(tx + 1)*(R - L) + L 4116cb93a386Sopenharmony_ci // 2 * v = (tx + 1)*(R - L) + 2*L 4117cb93a386Sopenharmony_ci // = tx*R - tx*L + R - L + 2*L 4118cb93a386Sopenharmony_ci // = tx*(R - L) + (R + L). 4119cb93a386Sopenharmony_ci // Since R and L are on [0, 255] we need them on the interval [0, 1/2] to get them into form 4120cb93a386Sopenharmony_ci // for Q15_mult. If L and R where in 16.16 format, this would be done by dividing by 2^9. In 4121cb93a386Sopenharmony_ci // code, we can multiply by 2^7 to get the value directly. 4122cb93a386Sopenharmony_ci // 2 * v = tx*(R - L) + (R + L) 4123cb93a386Sopenharmony_ci // 2^-9 * 2 * v = tx*(R - L)*2^-9 + (R + L)*2^-9 4124cb93a386Sopenharmony_ci // 2^-8 * v = 2^-9 * (tx*(R - L) + (R + L)) 4125cb93a386Sopenharmony_ci // v = 1/2 * (tx*(R - L) + (R + L)) 4126cb93a386Sopenharmony_ci auto lerpX = [&](U16 left, U16 right) -> U16 { 4127cb93a386Sopenharmony_ci I16 width = (I16)(right - left) << 7; 4128cb93a386Sopenharmony_ci U16 middle = (right + left) << 7; 4129cb93a386Sopenharmony_ci // The constrained_add is the most subtle part of lerp. The first term is on the interval 4130cb93a386Sopenharmony_ci // [-1, 1), and the second term is on the interval is on the interval [0, 1) because 4131cb93a386Sopenharmony_ci // both terms are too high by a factor of 2 which will be handled below. (Both R and L are 4132cb93a386Sopenharmony_ci // on [0, 1/2), but the sum R + L is on the interval [0, 1).) Generally, the sum below 4133cb93a386Sopenharmony_ci // should overflow, but because we know that sum produces an output on the 4134cb93a386Sopenharmony_ci // interval [0, 1) we know that the extra bit that would be needed will always be 0. So 4135cb93a386Sopenharmony_ci // we need to be careful to treat this sum as an unsigned positive number in the divide 4136cb93a386Sopenharmony_ci // by 2 below. Add +1 for rounding. 4137cb93a386Sopenharmony_ci U16 v2 = constrained_add(scaled_mult(tx, width), middle) + 1; 4138cb93a386Sopenharmony_ci // Divide by 2 to calculate v and at the same time bring the intermediate value onto the 4139cb93a386Sopenharmony_ci // interval [0, 1/2] to set up for the lerpY. 4140cb93a386Sopenharmony_ci return v2 >> 1; 4141cb93a386Sopenharmony_ci }; 4142cb93a386Sopenharmony_ci 4143cb93a386Sopenharmony_ci const uint32_t* ptr; 4144cb93a386Sopenharmony_ci U32 ix = ix_and_ptr(&ptr, ctx, sx, sy); 4145cb93a386Sopenharmony_ci U16 leftR, leftG, leftB, leftA; 4146cb93a386Sopenharmony_ci from_8888(gather<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA); 4147cb93a386Sopenharmony_ci 4148cb93a386Sopenharmony_ci ix = ix_and_ptr(&ptr, ctx, sx+1, sy); 4149cb93a386Sopenharmony_ci U16 rightR, rightG, rightB, rightA; 4150cb93a386Sopenharmony_ci from_8888(gather<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA); 4151cb93a386Sopenharmony_ci 4152cb93a386Sopenharmony_ci U16 topR = lerpX(leftR, rightR), 4153cb93a386Sopenharmony_ci topG = lerpX(leftG, rightG), 4154cb93a386Sopenharmony_ci topB = lerpX(leftB, rightB), 4155cb93a386Sopenharmony_ci topA = lerpX(leftA, rightA); 4156cb93a386Sopenharmony_ci 4157cb93a386Sopenharmony_ci ix = ix_and_ptr(&ptr, ctx, sx, sy+1); 4158cb93a386Sopenharmony_ci from_8888(gather<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA); 4159cb93a386Sopenharmony_ci 4160cb93a386Sopenharmony_ci ix = ix_and_ptr(&ptr, ctx, sx+1, sy+1); 4161cb93a386Sopenharmony_ci from_8888(gather<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA); 4162cb93a386Sopenharmony_ci 4163cb93a386Sopenharmony_ci U16 bottomR = lerpX(leftR, rightR), 4164cb93a386Sopenharmony_ci bottomG = lerpX(leftG, rightG), 4165cb93a386Sopenharmony_ci bottomB = lerpX(leftB, rightB), 4166cb93a386Sopenharmony_ci bottomA = lerpX(leftA, rightA); 4167cb93a386Sopenharmony_ci 4168cb93a386Sopenharmony_ci // lerpY plays the same mathematical tricks as lerpX, but the final divide is by 256 resulting 4169cb93a386Sopenharmony_ci // in a value on [0, 255]. 4170cb93a386Sopenharmony_ci auto lerpY = [&](U16 top, U16 bottom) -> U16 { 4171cb93a386Sopenharmony_ci I16 width = (I16)bottom - top; 4172cb93a386Sopenharmony_ci U16 middle = bottom + top; 4173cb93a386Sopenharmony_ci // Add + 0x80 for rounding. 4174cb93a386Sopenharmony_ci U16 blend = constrained_add(scaled_mult(ty, width), middle) + 0x80; 4175cb93a386Sopenharmony_ci 4176cb93a386Sopenharmony_ci return blend >> 8; 4177cb93a386Sopenharmony_ci }; 4178cb93a386Sopenharmony_ci 4179cb93a386Sopenharmony_ci r = lerpY(topR, bottomR); 4180cb93a386Sopenharmony_ci g = lerpY(topG, bottomG); 4181cb93a386Sopenharmony_ci b = lerpY(topB, bottomB); 4182cb93a386Sopenharmony_ci a = lerpY(topA, bottomA); 4183cb93a386Sopenharmony_ci} 4184cb93a386Sopenharmony_ci#endif // SK_SUPPORT_LEGACY_BILERP_HIGHP 4185cb93a386Sopenharmony_ci 4186cb93a386Sopenharmony_ciSTAGE_GG(xy_to_unit_angle, Ctx::None) { 4187cb93a386Sopenharmony_ci F xabs = abs_(x), 4188cb93a386Sopenharmony_ci yabs = abs_(y); 4189cb93a386Sopenharmony_ci 4190cb93a386Sopenharmony_ci F slope = min(xabs, yabs)/max(xabs, yabs); 4191cb93a386Sopenharmony_ci F s = slope * slope; 4192cb93a386Sopenharmony_ci 4193cb93a386Sopenharmony_ci // Use a 7th degree polynomial to approximate atan. 4194cb93a386Sopenharmony_ci // This was generated using sollya.gforge.inria.fr. 4195cb93a386Sopenharmony_ci // A float optimized polynomial was generated using the following command. 4196cb93a386Sopenharmony_ci // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative); 4197cb93a386Sopenharmony_ci F phi = slope 4198cb93a386Sopenharmony_ci * (0.15912117063999176025390625f + s 4199cb93a386Sopenharmony_ci * (-5.185396969318389892578125e-2f + s 4200cb93a386Sopenharmony_ci * (2.476101927459239959716796875e-2f + s 4201cb93a386Sopenharmony_ci * (-7.0547382347285747528076171875e-3f)))); 4202cb93a386Sopenharmony_ci 4203cb93a386Sopenharmony_ci phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi); 4204cb93a386Sopenharmony_ci phi = if_then_else(x < 0.0f , 1.0f/2.0f - phi, phi); 4205cb93a386Sopenharmony_ci phi = if_then_else(y < 0.0f , 1.0f - phi , phi); 4206cb93a386Sopenharmony_ci phi = if_then_else(phi != phi , 0 , phi); // Check for NaN. 4207cb93a386Sopenharmony_ci x = phi; 4208cb93a386Sopenharmony_ci} 4209cb93a386Sopenharmony_ciSTAGE_GG(xy_to_radius, Ctx::None) { 4210cb93a386Sopenharmony_ci x = sqrt_(x*x + y*y); 4211cb93a386Sopenharmony_ci} 4212cb93a386Sopenharmony_ci 4213cb93a386Sopenharmony_ci// ~~~~~~ Compound stages ~~~~~~ // 4214cb93a386Sopenharmony_ci 4215cb93a386Sopenharmony_ciSTAGE_PP(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) { 4216cb93a386Sopenharmony_ci auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy); 4217cb93a386Sopenharmony_ci 4218cb93a386Sopenharmony_ci load_8888_(ptr, tail, &dr,&dg,&db,&da); 4219cb93a386Sopenharmony_ci r = r + div255( dr*inv(a) ); 4220cb93a386Sopenharmony_ci g = g + div255( dg*inv(a) ); 4221cb93a386Sopenharmony_ci b = b + div255( db*inv(a) ); 4222cb93a386Sopenharmony_ci a = a + div255( da*inv(a) ); 4223cb93a386Sopenharmony_ci store_8888_(ptr, tail, r,g,b,a); 4224cb93a386Sopenharmony_ci} 4225cb93a386Sopenharmony_ci 4226cb93a386Sopenharmony_ci// ~~~~~~ GrSwizzle stage ~~~~~~ // 4227cb93a386Sopenharmony_ci 4228cb93a386Sopenharmony_ciSTAGE_PP(swizzle, void* ctx) { 4229cb93a386Sopenharmony_ci auto ir = r, ig = g, ib = b, ia = a; 4230cb93a386Sopenharmony_ci U16* o[] = {&r, &g, &b, &a}; 4231cb93a386Sopenharmony_ci char swiz[4]; 4232cb93a386Sopenharmony_ci memcpy(swiz, &ctx, sizeof(swiz)); 4233cb93a386Sopenharmony_ci 4234cb93a386Sopenharmony_ci for (int i = 0; i < 4; ++i) { 4235cb93a386Sopenharmony_ci switch (swiz[i]) { 4236cb93a386Sopenharmony_ci case 'r': *o[i] = ir; break; 4237cb93a386Sopenharmony_ci case 'g': *o[i] = ig; break; 4238cb93a386Sopenharmony_ci case 'b': *o[i] = ib; break; 4239cb93a386Sopenharmony_ci case 'a': *o[i] = ia; break; 4240cb93a386Sopenharmony_ci case '0': *o[i] = U16(0); break; 4241cb93a386Sopenharmony_ci case '1': *o[i] = U16(255); break; 4242cb93a386Sopenharmony_ci default: break; 4243cb93a386Sopenharmony_ci } 4244cb93a386Sopenharmony_ci } 4245cb93a386Sopenharmony_ci} 4246cb93a386Sopenharmony_ci 4247cb93a386Sopenharmony_ci// Now we'll add null stand-ins for stages we haven't implemented in lowp. 4248cb93a386Sopenharmony_ci// If a pipeline uses these stages, it'll boot it out of lowp into highp. 4249cb93a386Sopenharmony_ci#define NOT_IMPLEMENTED(st) static void (*st)(void) = nullptr; 4250cb93a386Sopenharmony_ci NOT_IMPLEMENTED(callback) 4251cb93a386Sopenharmony_ci NOT_IMPLEMENTED(unbounded_set_rgb) 4252cb93a386Sopenharmony_ci NOT_IMPLEMENTED(unbounded_uniform_color) 4253cb93a386Sopenharmony_ci NOT_IMPLEMENTED(unpremul) 4254cb93a386Sopenharmony_ci NOT_IMPLEMENTED(dither) 4255cb93a386Sopenharmony_ci NOT_IMPLEMENTED(load_16161616) 4256cb93a386Sopenharmony_ci NOT_IMPLEMENTED(load_16161616_dst) 4257cb93a386Sopenharmony_ci NOT_IMPLEMENTED(store_16161616) 4258cb93a386Sopenharmony_ci NOT_IMPLEMENTED(gather_16161616) 4259cb93a386Sopenharmony_ci NOT_IMPLEMENTED(load_a16) 4260cb93a386Sopenharmony_ci NOT_IMPLEMENTED(load_a16_dst) 4261cb93a386Sopenharmony_ci NOT_IMPLEMENTED(store_a16) 4262cb93a386Sopenharmony_ci NOT_IMPLEMENTED(gather_a16) 4263cb93a386Sopenharmony_ci NOT_IMPLEMENTED(load_rg1616) 4264cb93a386Sopenharmony_ci NOT_IMPLEMENTED(load_rg1616_dst) 4265cb93a386Sopenharmony_ci NOT_IMPLEMENTED(store_rg1616) 4266cb93a386Sopenharmony_ci NOT_IMPLEMENTED(gather_rg1616) 4267cb93a386Sopenharmony_ci NOT_IMPLEMENTED(load_f16) 4268cb93a386Sopenharmony_ci NOT_IMPLEMENTED(load_f16_dst) 4269cb93a386Sopenharmony_ci NOT_IMPLEMENTED(store_f16) 4270cb93a386Sopenharmony_ci NOT_IMPLEMENTED(gather_f16) 4271cb93a386Sopenharmony_ci NOT_IMPLEMENTED(load_af16) 4272cb93a386Sopenharmony_ci NOT_IMPLEMENTED(load_af16_dst) 4273cb93a386Sopenharmony_ci NOT_IMPLEMENTED(store_af16) 4274cb93a386Sopenharmony_ci NOT_IMPLEMENTED(gather_af16) 4275cb93a386Sopenharmony_ci NOT_IMPLEMENTED(load_rgf16) 4276cb93a386Sopenharmony_ci NOT_IMPLEMENTED(load_rgf16_dst) 4277cb93a386Sopenharmony_ci NOT_IMPLEMENTED(store_rgf16) 4278cb93a386Sopenharmony_ci NOT_IMPLEMENTED(gather_rgf16) 4279cb93a386Sopenharmony_ci NOT_IMPLEMENTED(load_f32) 4280cb93a386Sopenharmony_ci NOT_IMPLEMENTED(load_f32_dst) 4281cb93a386Sopenharmony_ci NOT_IMPLEMENTED(store_f32) 4282cb93a386Sopenharmony_ci NOT_IMPLEMENTED(gather_f32) 4283cb93a386Sopenharmony_ci NOT_IMPLEMENTED(load_rgf32) 4284cb93a386Sopenharmony_ci NOT_IMPLEMENTED(store_rgf32) 4285cb93a386Sopenharmony_ci NOT_IMPLEMENTED(load_1010102) 4286cb93a386Sopenharmony_ci NOT_IMPLEMENTED(load_1010102_dst) 4287cb93a386Sopenharmony_ci NOT_IMPLEMENTED(store_1010102) 4288cb93a386Sopenharmony_ci NOT_IMPLEMENTED(gather_1010102) 4289cb93a386Sopenharmony_ci NOT_IMPLEMENTED(store_u16_be) 4290cb93a386Sopenharmony_ci NOT_IMPLEMENTED(byte_tables) 4291cb93a386Sopenharmony_ci NOT_IMPLEMENTED(colorburn) 4292cb93a386Sopenharmony_ci NOT_IMPLEMENTED(colordodge) 4293cb93a386Sopenharmony_ci NOT_IMPLEMENTED(softlight) 4294cb93a386Sopenharmony_ci NOT_IMPLEMENTED(hue) 4295cb93a386Sopenharmony_ci NOT_IMPLEMENTED(saturation) 4296cb93a386Sopenharmony_ci NOT_IMPLEMENTED(color) 4297cb93a386Sopenharmony_ci NOT_IMPLEMENTED(luminosity) 4298cb93a386Sopenharmony_ci NOT_IMPLEMENTED(matrix_3x3) 4299cb93a386Sopenharmony_ci NOT_IMPLEMENTED(matrix_3x4) 4300cb93a386Sopenharmony_ci NOT_IMPLEMENTED(matrix_4x5) 4301cb93a386Sopenharmony_ci NOT_IMPLEMENTED(matrix_4x3) 4302cb93a386Sopenharmony_ci NOT_IMPLEMENTED(parametric) 4303cb93a386Sopenharmony_ci NOT_IMPLEMENTED(gamma_) 4304cb93a386Sopenharmony_ci NOT_IMPLEMENTED(PQish) 4305cb93a386Sopenharmony_ci NOT_IMPLEMENTED(HLGish) 4306cb93a386Sopenharmony_ci NOT_IMPLEMENTED(HLGinvish) 4307cb93a386Sopenharmony_ci NOT_IMPLEMENTED(rgb_to_hsl) 4308cb93a386Sopenharmony_ci NOT_IMPLEMENTED(hsl_to_rgb) 4309cb93a386Sopenharmony_ci NOT_IMPLEMENTED(gauss_a_to_rgba) 4310cb93a386Sopenharmony_ci NOT_IMPLEMENTED(mirror_x) 4311cb93a386Sopenharmony_ci NOT_IMPLEMENTED(repeat_x) 4312cb93a386Sopenharmony_ci NOT_IMPLEMENTED(mirror_y) 4313cb93a386Sopenharmony_ci NOT_IMPLEMENTED(repeat_y) 4314cb93a386Sopenharmony_ci NOT_IMPLEMENTED(negate_x) 4315cb93a386Sopenharmony_ci NOT_IMPLEMENTED(bilinear) 4316cb93a386Sopenharmony_ci#if defined(SK_SUPPORT_LEGACY_BILERP_HIGHP) 4317cb93a386Sopenharmony_ci NOT_IMPLEMENTED(bilerp_clamp_8888) 4318cb93a386Sopenharmony_ci#endif 4319cb93a386Sopenharmony_ci NOT_IMPLEMENTED(bicubic) 4320cb93a386Sopenharmony_ci NOT_IMPLEMENTED(bicubic_clamp_8888) 4321cb93a386Sopenharmony_ci NOT_IMPLEMENTED(bilinear_nx) 4322cb93a386Sopenharmony_ci NOT_IMPLEMENTED(bilinear_ny) 4323cb93a386Sopenharmony_ci NOT_IMPLEMENTED(bilinear_px) 4324cb93a386Sopenharmony_ci NOT_IMPLEMENTED(bilinear_py) 4325cb93a386Sopenharmony_ci NOT_IMPLEMENTED(bicubic_n3x) 4326cb93a386Sopenharmony_ci NOT_IMPLEMENTED(bicubic_n1x) 4327cb93a386Sopenharmony_ci NOT_IMPLEMENTED(bicubic_p1x) 4328cb93a386Sopenharmony_ci NOT_IMPLEMENTED(bicubic_p3x) 4329cb93a386Sopenharmony_ci NOT_IMPLEMENTED(bicubic_n3y) 4330cb93a386Sopenharmony_ci NOT_IMPLEMENTED(bicubic_n1y) 4331cb93a386Sopenharmony_ci NOT_IMPLEMENTED(bicubic_p1y) 4332cb93a386Sopenharmony_ci NOT_IMPLEMENTED(bicubic_p3y) 4333cb93a386Sopenharmony_ci NOT_IMPLEMENTED(save_xy) 4334cb93a386Sopenharmony_ci NOT_IMPLEMENTED(accumulate) 4335cb93a386Sopenharmony_ci NOT_IMPLEMENTED(xy_to_2pt_conical_well_behaved) 4336cb93a386Sopenharmony_ci NOT_IMPLEMENTED(xy_to_2pt_conical_strip) 4337cb93a386Sopenharmony_ci NOT_IMPLEMENTED(xy_to_2pt_conical_focal_on_circle) 4338cb93a386Sopenharmony_ci NOT_IMPLEMENTED(xy_to_2pt_conical_smaller) 4339cb93a386Sopenharmony_ci NOT_IMPLEMENTED(xy_to_2pt_conical_greater) 4340cb93a386Sopenharmony_ci NOT_IMPLEMENTED(alter_2pt_conical_compensate_focal) 4341cb93a386Sopenharmony_ci NOT_IMPLEMENTED(alter_2pt_conical_unswap) 4342cb93a386Sopenharmony_ci NOT_IMPLEMENTED(mask_2pt_conical_nan) 4343cb93a386Sopenharmony_ci NOT_IMPLEMENTED(mask_2pt_conical_degenerates) 4344cb93a386Sopenharmony_ci NOT_IMPLEMENTED(apply_vector_mask) 4345cb93a386Sopenharmony_ci#undef NOT_IMPLEMENTED 4346cb93a386Sopenharmony_ci 4347cb93a386Sopenharmony_ci#endif//defined(JUMPER_IS_SCALAR) controlling whether we build lowp stages 4348cb93a386Sopenharmony_ci} // namespace lowp 4349cb93a386Sopenharmony_ci 4350cb93a386Sopenharmony_ci} // namespace SK_OPTS_NS 4351cb93a386Sopenharmony_ci 4352cb93a386Sopenharmony_ci#undef SI 4353cb93a386Sopenharmony_ci 4354cb93a386Sopenharmony_ci#endif//SkRasterPipeline_opts_DEFINED 4355