Lines matching refs:ptr
54 void* ptr;
57 explicit Ctx(void**& p) : ptr(nullptr), program(p) {}
61 if (!ptr) { ptr = load_and_inc(program); }
62 return (T*)ptr;
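
The fragments from lines 54-62 read like a lazily-resolved context wrapper: the pointer starts out null and is popped off the program stream with load_and_inc only on first use. A minimal sketch of the surrounding struct, assuming the templated conversion-operator framing; only ptr, program, and load_and_inc appear in the listing itself.

    struct Ctx {
        void*   ptr;      // cached context pointer, filled in lazily
        void**& program;  // cursor into the program's interleaved fn/ctx stream

        explicit Ctx(void**& p) : ptr(nullptr), program(p) {}

        template <typename T>
        operator T*() {
            if (!ptr) { ptr = load_and_inc(program); }  // advance the stream exactly once
            return (T*)ptr;
        }
    };
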
143 SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
144 *r = ptr[0];
145 *g = ptr[1];
147 SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
148 ptr[0] = r;
149 ptr[1] = g;
151 SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
152 *r = ptr[0];
153 *g = ptr[1];
154 *b = ptr[2];
156 SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
157 *r = ptr[0];
158 *g = ptr[1];
159 *b = ptr[2];
160 *a = ptr[3];
162 SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
163 ptr[0] = r;
164 ptr[1] = g;
165 ptr[2] = b;
166 ptr[3] = a;
169 SI void load2(const float* ptr, size_t tail, F* r, F* g) {
170 *r = ptr[0];
171 *g = ptr[1];
173 SI void store2(float* ptr, size_t tail, F r, F g) {
174 ptr[0] = r;
175 ptr[1] = g;
177 SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
178 *r = ptr[0];
179 *g = ptr[1];
180 *b = ptr[2];
181 *a = ptr[3];
183 SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
184 ptr[0] = r;
185 ptr[1] = g;
186 ptr[2] = b;
187 ptr[3] = a;
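
Lines 143-187 are the portable one-pixel-per-lane variants: each loadN/storeN simply copies elements 0..N-1 and ignores tail, since a "vector" holds a single pixel there. Elsewhere in the listing the convention appears to be that tail == 0 means a full N-pixel vector and tail in 1..N-1 means a partial run at the row's right edge. A condensed view of how the call sites drive these helpers, mirroring the load2/store2 fragments at 2293-2300 (r and g are the pipeline's color registers; the stage body between the two calls is not shown in the listing):

    // Two interleaved floats per pixel, so the x offset is scaled by 2.
    auto ptr = ptr_at_xy<const float>(ctx, 2*dx, 2*dy);
    load2(ptr, tail, &r, &g);    // deinterleave r,g for up to N pixels
    // ... stage math on r,g ...
    store2(ptr, tail, r, g);     // reinterleave and write the same span back
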
242 SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
245 if ( true ) { rg = vld2_lane_u16(ptr + 0, rg, 0); }
246 if (tail > 1) { rg = vld2_lane_u16(ptr + 2, rg, 1); }
247 if (tail > 2) { rg = vld2_lane_u16(ptr + 4, rg, 2); }
249 rg = vld2_u16(ptr);
254 SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
256 if ( true ) { vst2_lane_u16(ptr + 0, (uint16x4x2_t{{r,g}}), 0); }
257 if (tail > 1) { vst2_lane_u16(ptr + 2, (uint16x4x2_t{{r,g}}), 1); }
258 if (tail > 2) { vst2_lane_u16(ptr + 4, (uint16x4x2_t{{r,g}}), 2); }
260 vst2_u16(ptr, (uint16x4x2_t{{r,g}}));
263 SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
266 if ( true ) { rgb = vld3_lane_u16(ptr + 0, rgb, 0); }
267 if (tail > 1) { rgb = vld3_lane_u16(ptr + 3, rgb, 1); }
268 if (tail > 2) { rgb = vld3_lane_u16(ptr + 6, rgb, 2); }
270 rgb = vld3_u16(ptr);
276 SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
279 if ( true ) { rgba = vld4_lane_u16(ptr + 0, rgba, 0); }
280 if (tail > 1) { rgba = vld4_lane_u16(ptr + 4, rgba, 1); }
281 if (tail > 2) { rgba = vld4_lane_u16(ptr + 8, rgba, 2); }
283 rgba = vld4_u16(ptr);
291 SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
293 if ( true ) { vst4_lane_u16(ptr + 0, (uint16x4x4_t{{r,g,b,a}}), 0); }
294 if (tail > 1) { vst4_lane_u16(ptr + 4, (uint16x4x4_t{{r,g,b,a}}), 1); }
295 if (tail > 2) { vst4_lane_u16(ptr + 8, (uint16x4x4_t{{r,g,b,a}}), 2); }
297 vst4_u16(ptr, (uint16x4x4_t{{r,g,b,a}}));
300 SI void load2(const float* ptr, size_t tail, F* r, F* g) {
303 if ( true ) { rg = vld2q_lane_f32(ptr + 0, rg, 0); }
304 if (tail > 1) { rg = vld2q_lane_f32(ptr + 2, rg, 1); }
305 if (tail > 2) { rg = vld2q_lane_f32(ptr + 4, rg, 2); }
307 rg = vld2q_f32(ptr);
312 SI void store2(float* ptr, size_t tail, F r, F g) {
314 if ( true ) { vst2q_lane_f32(ptr + 0, (float32x4x2_t{{r,g}}), 0); }
315 if (tail > 1) { vst2q_lane_f32(ptr + 2, (float32x4x2_t{{r,g}}), 1); }
316 if (tail > 2) { vst2q_lane_f32(ptr + 4, (float32x4x2_t{{r,g}}), 2); }
318 vst2q_f32(ptr, (float32x4x2_t{{r,g}}));
321 SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
324 if ( true ) { rgba = vld4q_lane_f32(ptr + 0, rgba, 0); }
325 if (tail > 1) { rgba = vld4q_lane_f32(ptr + 4, rgba, 1); }
326 if (tail > 2) { rgba = vld4q_lane_f32(ptr + 8, rgba, 2); }
328 rgba = vld4q_f32(ptr);
335 SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
337 if ( true ) { vst4q_lane_f32(ptr + 0, (float32x4x4_t{{r,g,b,a}}), 0); }
338 if (tail > 1) { vst4q_lane_f32(ptr + 4, (float32x4x4_t{{r,g,b,a}}), 1); }
339 if (tail > 2) { vst4q_lane_f32(ptr + 8, (float32x4x4_t{{r,g,b,a}}), 2); }
341 vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
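
In the 4-wide NEON variants (242-341), a partial tail is handled one pixel at a time with vldN_lane/vstN_lane, guarded by "if (true)" for the first pixel and tail > 1 / tail > 2 for the rest (tail can only be 1..3 when nonzero), while the full-vector path uses the plain vldN/vstN deinterleaving forms. A standalone sketch of the load2/store2 pair reconstructed around those lines; the tail guard, the zero-initialization of rg, and the type alias are assumptions, the intrinsic calls are taken from the listing.

    #include <arm_neon.h>
    #include <cstddef>

    using U16 = uint16x4_t;       // assumed lane type for a 4-wide build
    #define SI static inline      // assumed shorthand used throughout the file

    SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
        uint16x4x2_t rg = {};                         // zero so partial loads are well defined
        if (tail) {
            if ( true   ) { rg = vld2_lane_u16(ptr + 0, rg, 0); }
            if (tail > 1) { rg = vld2_lane_u16(ptr + 2, rg, 1); }
            if (tail > 2) { rg = vld2_lane_u16(ptr + 4, rg, 2); }
        } else {
            rg = vld2_u16(ptr);                       // full 4-pixel deinterleaving load
        }
        *r = rg.val[0];
        *g = rg.val[1];
    }

    SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
        if (tail) {
            if ( true   ) { vst2_lane_u16(ptr + 0, (uint16x4x2_t{{r,g}}), 0); }
            if (tail > 1) { vst2_lane_u16(ptr + 2, (uint16x4x2_t{{r,g}}), 1); }
            if (tail > 2) { vst2_lane_u16(ptr + 4, (uint16x4x2_t{{r,g}}), 2); }
        } else {
            vst2_u16(ptr, (uint16x4x2_t{{r,g}}));     // full interleaving store
        }
    }
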
409 SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
415 *d = _mm_loadu_si128(((__m128i*)ptr) + 0);
417 ptr += 8;
422 *d = _mm_loadu_si64(ptr);
424 ptr += 4;
428 (*d)[high ? 4 : 0] = *(ptr + 0);
429 (*d)[high ? 5 : 1] = *(ptr + 1);
432 _0123 = _mm_loadu_si128(((__m128i*)ptr) + 0);
433 _4567 = _mm_loadu_si128(((__m128i*)ptr) + 1);
440 SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
446 _mm_storeu_si128((__m128i*)ptr, *s);
449 ptr += 8;
453 _mm_storel_epi64((__m128i*)ptr, *s);
454 ptr += 4;
460 *(int32_t*)ptr = _mm_extract_epi32(*s, 2);
462 *(int32_t*)ptr = _mm_cvtsi128_si32(*s);
466 _mm_storeu_si128((__m128i*)ptr + 0, _0123);
467 _mm_storeu_si128((__m128i*)ptr + 1, _4567);
471 SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
479 if ( true ) { _0 = load_rgb(ptr + 0); }
480 if (tail > 1) { _1 = load_rgb(ptr + 3); }
481 if (tail > 2) { _2 = load_rgb(ptr + 6); }
482 if (tail > 3) { _3 = load_rgb(ptr + 9); }
483 if (tail > 4) { _4 = load_rgb(ptr + 12); }
484 if (tail > 5) { _5 = load_rgb(ptr + 15); }
485 if (tail > 6) { _6 = load_rgb(ptr + 18); }
488 auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) ;
489 auto _23 = _mm_loadu_si128((const __m128i*)(ptr + 6)) ;
490 auto _45 = _mm_loadu_si128((const __m128i*)(ptr + 12)) ;
491 auto _67 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 16)), 4);
512 SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
515 auto src = (const double*)ptr;
525 _01 = _mm_loadu_si128(((__m128i*)ptr) + 0);
526 _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
527 _45 = _mm_loadu_si128(((__m128i*)ptr) + 2);
528 _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
546 SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
558 auto dst = (double*)ptr;
567 _mm_storeu_si128((__m128i*)ptr + 0, _01);
568 _mm_storeu_si128((__m128i*)ptr + 1, _23);
569 _mm_storeu_si128((__m128i*)ptr + 2, _45);
570 _mm_storeu_si128((__m128i*)ptr + 3, _67);
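
The (const double*)ptr / (double*)ptr casts at 515, 558, 829 and 855 suggest the uint16 load4/store4 tail paths move each 4x16-bit pixel as a single 64-bit value through scalar double loads and stores. A small standalone sketch of that idea, assuming SSE2 loadl/loadh moves; the function name and variable names are not in the listing.

    #include <emmintrin.h>
    #include <cstddef>

    // Pack the first one or two partial pixels (8 bytes each) into one XMM register.
    static inline __m128d load_up_to_2_rgba16_pixels(const double* src, size_t tail) {
        __m128d pix01 = _mm_setzero_pd();
        if ( true   ) { pix01 = _mm_loadl_pd(pix01, src + 0); }  // pixel 0 in the low half
        if (tail > 1) { pix01 = _mm_loadh_pd(pix01, src + 1); }  // pixel 1 in the high half
        return pix01;
    }
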
574 SI void load2(const float* ptr, size_t tail, F* r, F* g) {
580 *d = _mm256_loadu_ps(ptr);
581 ptr += 8;
587 *d = _mm256_castps128_ps256(_mm_loadu_ps(ptr));
588 ptr += 4;
593 *d = high ? _mm256_insertf128_ps(*d, _mm_loadu_si64(ptr), 1)
594 : _mm256_insertf128_ps(*d, _mm_loadu_si64(ptr), 0);
597 _0123 = _mm256_loadu_ps(ptr + 0);
598 _4567 = _mm256_loadu_ps(ptr + 8);
607 SI void store2(float* ptr, size_t tail, F r, F g) {
616 _mm256_storeu_ps(ptr, *s);
619 ptr += 8;
623 _mm_storeu_ps(ptr, _mm256_extractf128_ps(*s, 0));
624 ptr += 4;
629 *(ptr + 0) = (*s)[ high ? 4 : 0];
630 *(ptr + 1) = (*s)[ high ? 5 : 1];
633 _mm256_storeu_ps(ptr + 0, _0123);
634 _mm256_storeu_ps(ptr + 8, _4567);
638 SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
642 case 0: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+28), 1); [[fallthrough]];
643 case 7: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+24), 1); [[fallthrough]];
644 case 6: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+20), 1); [[fallthrough]];
645 case 5: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+16), 1); [[fallthrough]];
646 case 4: _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+12), 0); [[fallthrough]];
647 case 3: _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+ 8), 0); [[fallthrough]];
648 case 2: _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+ 4), 0); [[fallthrough]];
649 case 1: _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+ 0), 0);
662 SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
674 if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); }
675 if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); }
676 if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); }
677 if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); }
678 if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); }
679 if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); }
680 if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); }
686 _mm256_storeu_ps(ptr+ 0, _01);
687 _mm256_storeu_ps(ptr+ 8, _23);
688 _mm256_storeu_ps(ptr+16, _45);
689 _mm256_storeu_ps(ptr+24, _67);
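
The 8-wide AVX float load4 at 638-649 handles a short tail with a switch whose cases fall through from case 0 (a full 8 pixels) down to case 1, so each case contributes exactly one more 4-float pixel; the matching store4 at 674-689 instead uses tail-guarded 128-bit stores. A standalone illustration of the fallthrough idea; the function name and the px array are assumptions, while the real code loads into the _04/_15/_26/_37 halves shown in the listing.

    #include <immintrin.h>
    #include <cstddef>

    static inline void load_up_to_8_pixels(const float* ptr, size_t tail, __m128 px[8]) {
        // tail == 0 means "all 8 pixels"; lanes past a nonzero tail are left untouched here.
        switch (tail) {
            case 0: px[7] = _mm_loadu_ps(ptr + 28); [[fallthrough]];
            case 7: px[6] = _mm_loadu_ps(ptr + 24); [[fallthrough]];
            case 6: px[5] = _mm_loadu_ps(ptr + 20); [[fallthrough]];
            case 5: px[4] = _mm_loadu_ps(ptr + 16); [[fallthrough]];
            case 4: px[3] = _mm_loadu_ps(ptr + 12); [[fallthrough]];
            case 3: px[2] = _mm_loadu_ps(ptr +  8); [[fallthrough]];
            case 2: px[1] = _mm_loadu_ps(ptr +  4); [[fallthrough]];
            case 1: px[0] = _mm_loadu_ps(ptr +  0);
        }
    }
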
747 SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
752 _01 = _mm_loadl_pd(_01, (const double*)ptr); // r0 g0 r1 g1 00 00 00 00
754 _01 = _mm_insert_epi16(_01, *(ptr+4), 4); // r0 g0 r1 g1 r2 00 00 00
755 _01 = _mm_insert_epi16(_01, *(ptr+5), 5); // r0 g0 r1 g1 r2 g2 00 00
758 _01 = _mm_cvtsi32_si128(*(const uint32_t*)ptr); // r0 g0 00 00 00 00 00 00
761 _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); // r0 g0 r1 g1 r2 g2 r3 g3
771 SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
775 _mm_storel_epi64((__m128i*)ptr, rg);
778 memcpy(ptr + 4, &rgpair, sizeof(rgpair));
782 memcpy(ptr, &rgpair, sizeof(rgpair));
785 _mm_storeu_si128((__m128i*)ptr + 0, rg);
789 SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
797 if ( true ) { _0 = load_rgb(ptr + 0); }
798 if (tail > 1) { _1 = load_rgb(ptr + 3); }
799 if (tail > 2) { _2 = load_rgb(ptr + 6); }
802 auto _01 = _mm_loadu_si128((const __m128i*)(ptr + 0)) ,
803 _23 = _mm_srli_si128(_mm_loadu_si128((const __m128i*)(ptr + 4)), 4);
825 SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
829 auto src = (const double*)ptr;
834 _01 = _mm_loadu_si128(((__m128i*)ptr) + 0); // r0 g0 b0 a0 r1 g1 b1 a1
835 _23 = _mm_loadu_si128(((__m128i*)ptr) + 1); // r2 g2 b2 a2 r3 g3 b3 a3
850 SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
855 auto dst = (double*)ptr;
860 _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
861 _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
865 SI void load2(const float* ptr, size_t tail, F* r, F* g) {
869 if ( true ) { _01 = _mm_loadl_pi(_01, (__m64 const*)(ptr + 0)); }
870 if (tail > 1) { _01 = _mm_loadh_pi(_01, (__m64 const*)(ptr + 2)); }
871 if (tail > 2) { _23 = _mm_loadl_pi(_23, (__m64 const*)(ptr + 4)); }
873 _01 = _mm_loadu_ps(ptr + 0);
874 _23 = _mm_loadu_ps(ptr + 4);
879 SI void store2(float* ptr, size_t tail, F r, F g) {
883 if ( true ) { _mm_storel_pi((__m64*)(ptr + 0), _01); }
884 if (tail > 1) { _mm_storeh_pi((__m64*)(ptr + 2), _01); }
885 if (tail > 2) { _mm_storel_pi((__m64*)(ptr + 4), _23); }
887 _mm_storeu_ps(ptr + 0, _01);
888 _mm_storeu_ps(ptr + 4, _23);
892 SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
896 if ( true ) { _0 = _mm_loadu_ps(ptr + 0); }
897 if (tail > 1) { _1 = _mm_loadu_ps(ptr + 4); }
898 if (tail > 2) { _2 = _mm_loadu_ps(ptr + 8); }
900 _0 = _mm_loadu_ps(ptr + 0);
901 _1 = _mm_loadu_ps(ptr + 4);
902 _2 = _mm_loadu_ps(ptr + 8);
903 _3 = _mm_loadu_ps(ptr +12);
912 SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
915 if ( true ) { _mm_storeu_ps(ptr + 0, r); }
916 if (tail > 1) { _mm_storeu_ps(ptr + 4, g); }
917 if (tail > 2) { _mm_storeu_ps(ptr + 8, b); }
919 _mm_storeu_ps(ptr + 0, r);
920 _mm_storeu_ps(ptr + 4, g);
921 _mm_storeu_ps(ptr + 8, b);
922 _mm_storeu_ps(ptr +12, a);
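
The SSE store4 at 912-922 writes whole 4-float runs per guard, which only makes sense if the four registers have already been transposed so that each one holds a single pixel's r,g,b,a. A standalone sketch under that assumption; the _MM_TRANSPOSE4_PS step is not among the matched lines and is inferred.

    #include <xmmintrin.h>
    #include <cstddef>

    static inline void store4(float* ptr, size_t tail, __m128 r, __m128 g, __m128 b, __m128 a) {
        _MM_TRANSPOSE4_PS(r, g, b, a);           // assumed: now r = pixel 0, g = pixel 1, ...
        if (tail) {
            if ( true   ) { _mm_storeu_ps(ptr + 0, r); }   // one whole pixel per guard
            if (tail > 1) { _mm_storeu_ps(ptr + 4, g); }
            if (tail > 2) { _mm_storeu_ps(ptr + 8, b); }
        } else {
            _mm_storeu_ps(ptr + 0, r);
            _mm_storeu_ps(ptr + 4, g);
            _mm_storeu_ps(ptr + 8, b);
            _mm_storeu_ps(ptr +12, a);
        }
    }
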
1255 SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) {
1259 *ptr = (const T*)ctx->pixels;
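
ix_and_ptr (1255-1259, with lowp copies at 3456 and 3469) appears to hand the image's base pixel pointer back through *ptr and return a vector of linearized gather indices derived from x,y; the clamping implied by the GatherCtx name is an inference, since only the pixels assignment is among the matched lines. Every gather call site in the listing then follows the same three-line shape, condensed here from the fragments at 2019-2021:

    const uint32_t* ptr;
    U32 ix = ix_and_ptr(&ptr, ctx, r, g);          // convert r,g into per-lane texel indices
    from_8888(gather(ptr, ix), &r, &g, &b, &a);    // fetch and unpack one pixel per lane
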
1358 STAGE(load_src, const float* ptr) {
1359 r = sk_unaligned_load<F>(ptr + 0*N);
1360 g = sk_unaligned_load<F>(ptr + 1*N);
1361 b = sk_unaligned_load<F>(ptr + 2*N);
1362 a = sk_unaligned_load<F>(ptr + 3*N);
1366 STAGE(store_src, float* ptr) {
1367 sk_unaligned_store(ptr + 0*N, r);
1368 sk_unaligned_store(ptr + 1*N, g);
1369 sk_unaligned_store(ptr + 2*N, b);
1370 sk_unaligned_store(ptr + 3*N, a);
1372 STAGE(store_src_a, float* ptr) {
1373 sk_unaligned_store(ptr, a);
1377 STAGE(load_dst, const float* ptr) {
1378 dr = sk_unaligned_load<F>(ptr + 0*N);
1379 dg = sk_unaligned_load<F>(ptr + 1*N);
1380 db = sk_unaligned_load<F>(ptr + 2*N);
1381 da = sk_unaligned_load<F>(ptr + 3*N);
1385 STAGE(store_dst, float* ptr) {
1386 sk_unaligned_store(ptr + 0*N, dr);
1387 sk_unaligned_store(ptr + 1*N, dg);
1388 sk_unaligned_store(ptr + 2*N, db);
1389 sk_unaligned_store(ptr + 3*N, da);
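
The load_src/store_src/load_dst/store_dst stages at 1358-1389 (and their lowp uint16_t twins at 3855-3880) spill and reload the four color registers through a raw pointer laid out as four consecutive planes of N lanes each. The implied buffer layout, as a sketch; N is the pipeline width of the active build and the scratch name is an assumption:

    float scratch[4 * N];
    // store_src writes: r -> scratch + 0*N,  g -> scratch + 1*N,
    //                   b -> scratch + 2*N,  a -> scratch + 3*N
    // load_src reads the same four planes back in the same order;
    // store_src_a spills only the a plane (a single run of N floats).
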
1582 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
1584 U32 dst = load<U32>(ptr, tail);
1603 store(ptr, dst, tail);
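
Lines 1582-1603 show the read-modify-write shape used when blending into an 8888 destination: load the existing pixels, blend, then store the same span. The middle of that stage is not among the matched lines, so the comment below is only a placeholder:

    auto ptr = ptr_at_xy<uint32_t>(ctx, dx, dy);
    U32 dst = load<U32>(ptr, tail);   // fetch the destination pixels currently in memory
    /* ... blend r,g,b,a against dst and repack into dst ... */
    store(ptr, dst, tail);            // write the blended pixels back over the same span
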
1751 auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
1753 auto scales = load<U8>(ptr, tail);
1762 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
1765 from_565(load<U16>(ptr, tail), &cr, &cg, &cb);
1800 auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
1802 auto scales = load<U8>(ptr, tail);
1811 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
1814 from_565(load<U16>(ptr, tail), &cr, &cg, &cb);
1937 auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
1940 a = from_byte(load<U8>(ptr, tail));
1943 auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
1946 da = from_byte(load<U8>(ptr, tail));
1949 const uint8_t* ptr;
1950 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
1952 a = from_byte(gather(ptr, ix));
1955 auto ptr = ptr_at_xy<uint8_t>(ctx, dx,dy);
1958 store(ptr, packed, tail);
1962 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
1964 from_565(load<U16>(ptr, tail), &r,&g,&b);
1968 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
1970 from_565(load<U16>(ptr, tail), &dr,&dg,&db);
1974 const uint16_t* ptr;
1975 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
1976 from_565(gather(ptr, ix), &r,&g,&b);
1980 auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
1985 store(ptr, px, tail);
1989 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
1990 from_4444(load<U16>(ptr, tail), &r,&g,&b,&a);
1993 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
1994 from_4444(load<U16>(ptr, tail), &dr,&dg,&db,&da);
1997 const uint16_t* ptr;
1998 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
1999 from_4444(gather(ptr, ix), &r,&g,&b,&a);
2002 auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
2007 store(ptr, px, tail);
2011 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2012 from_8888(load<U32>(ptr, tail), &r,&g,&b,&a);
2015 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2016 from_8888(load<U32>(ptr, tail), &dr,&dg,&db,&da);
2019 const uint32_t* ptr;
2020 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
2021 from_8888(gather(ptr, ix), &r,&g,&b,&a);
2024 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
2030 store(ptr, px, tail);
2034 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
2035 from_88(load<U16>(ptr, tail), &r, &g);
2040 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
2041 from_88(load<U16>(ptr, tail), &dr, &dg);
2046 const uint16_t* ptr;
2047 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
2048 from_88(gather(ptr, ix), &r, &g);
2053 auto ptr = ptr_at_xy<uint16_t>(ctx, dx, dy);
2055 store(ptr, px, tail);
2059 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2061 a = from_short(load<U16>(ptr, tail));
2064 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
2066 da = from_short(load<U16>(ptr, tail));
2069 const uint16_t* ptr;
2070 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
2072 a = from_short(gather(ptr, ix));
2075 auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
2078 store(ptr, px, tail);
2082 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);
2084 from_1616(load<U32>(ptr, tail), &r,&g);
2087 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);
2088 from_1616(load<U32>(ptr, tail), &dr, &dg);
2093 const uint32_t* ptr;
2094 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
2095 from_1616(gather(ptr, ix), &r, &g);
2100 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
2104 store(ptr, px, tail);
2108 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy);
2109 from_16161616(load<U64>(ptr, tail), &r,&g, &b, &a);
2112 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy);
2113 from_16161616(load<U64>(ptr, tail), &dr, &dg, &db, &da);
2116 const uint64_t* ptr;
2117 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
2118 from_16161616(gather(ptr, ix), &r, &g, &b, &a);
2121 auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,4*dy);
2128 store4(ptr,tail, R,G,B,A);
2133 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2134 from_1010102(load<U32>(ptr, tail), &r,&g,&b,&a);
2137 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2138 from_1010102(load<U32>(ptr, tail), &dr,&dg,&db,&da);
2141 const uint32_t* ptr;
2142 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
2143 from_1010102(gather(ptr, ix), &r,&g,&b,&a);
2146 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
2152 store(ptr, px, tail);
2156 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy);
2159 load4((const uint16_t*)ptr,tail, &R,&G,&B,&A);
2166 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy);
2169 load4((const uint16_t*)ptr,tail, &R,&G,&B,&A);
2176 const uint64_t* ptr;
2177 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
2178 auto px = gather(ptr, ix);
2188 auto ptr = ptr_at_xy<uint64_t>(ctx, dx,dy);
2189 store4((uint16_t*)ptr,tail, to_half(r)
2196 auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,dy);
2203 store4(ptr,tail, R,G,B,A);
2207 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2209 U16 A = load<U16>((const uint16_t*)ptr, tail);
2216 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
2218 U16 A = load<U16>((const uint16_t*)ptr, tail);
2223 const uint16_t* ptr;
2224 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
2226 a = from_half(gather(ptr, ix));
2229 auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
2230 store(ptr, to_half(a), tail);
2234 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);
2237 load2((const uint16_t*)ptr, tail, &R, &G);
2244 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);
2247 load2((const uint16_t*)ptr, tail, &R, &G);
2254 const uint32_t* ptr;
2255 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
2256 auto px = gather(ptr, ix);
2266 auto ptr = ptr_at_xy<uint32_t>(ctx, dx, dy);
2267 store2((uint16_t*)ptr, tail, to_half(r)
2272 auto ptr = ptr_at_xy<const float>(ctx, 4*dx,4*dy);
2273 load4(ptr,tail, &r,&g,&b,&a);
2276 auto ptr = ptr_at_xy<const float>(ctx, 4*dx,4*dy);
2277 load4(ptr,tail, &dr,&dg,&db,&da);
2280 const float* ptr;
2281 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
2282 r = gather(ptr, 4*ix + 0);
2283 g = gather(ptr, 4*ix + 1);
2284 b = gather(ptr, 4*ix + 2);
2285 a = gather(ptr, 4*ix + 3);
2288 auto ptr = ptr_at_xy<float>(ctx, 4*dx,4*dy);
2289 store4(ptr,tail, r,g,b,a);
2293 auto ptr = ptr_at_xy<const float>(ctx, 2*dx,2*dy);
2294 load2(ptr, tail, &r, &g);
2299 auto ptr = ptr_at_xy<float>(ctx, 2*dx,2*dy);
2300 store2(ptr, tail, r, g);
2731 const uint32_t* ptr;
2732 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
2733 from_8888(gather(ptr, ix), r,g,b,a);
2803 const uint32_t* ptr;
2804 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
2807 from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa);
2854 const uint32_t* ptr;
2855 U32 ix = ix_and_ptr(&ptr, ctx, sample_x, sample_y);
2858 from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa);
3456 SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) {
3464 *ptr = (const T*)ctx->pixels;
3469 SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, I32 x, I32 y) {
3477 *ptr = (const T*)ctx->pixels;
3482 SI V load(const T* ptr, size_t tail) {
3485 case 0: memcpy(&v, ptr, sizeof(v)); break;
3487 case 15: v[14] = ptr[14]; [[fallthrough]];
3488 case 14: v[13] = ptr[13]; [[fallthrough]];
3489 case 13: v[12] = ptr[12]; [[fallthrough]];
3490 case 12: memcpy(&v, ptr, 12*sizeof(T)); break;
3491 case 11: v[10] = ptr[10]; [[fallthrough]];
3492 case 10: v[ 9] = ptr[ 9]; [[fallthrough]];
3493 case 9: v[ 8] = ptr[ 8]; [[fallthrough]];
3494 case 8: memcpy(&v, ptr, 8*sizeof(T)); break;
3496 case 7: v[ 6] = ptr[ 6]; [[fallthrough]];
3497 case 6: v[ 5] = ptr[ 5]; [[fallthrough]];
3498 case 5: v[ 4] = ptr[ 4]; [[fallthrough]];
3499 case 4: memcpy(&v, ptr, 4*sizeof(T)); break;
3500 case 3: v[ 2] = ptr[ 2]; [[fallthrough]];
3501 case 2: memcpy(&v, ptr, 2*sizeof(T)); break;
3502 case 1: v[ 0] = ptr[ 0];
3507 SI void store(T* ptr, size_t tail, V v) {
3509 case 0: memcpy(ptr, &v, sizeof(v)); break;
3511 case 15: ptr[14] = v[14]; [[fallthrough]];
3512 case 14: ptr[13] = v[13]; [[fallthrough]];
3513 case 13: ptr[12] = v[12]; [[fallthrough]];
3514 case 12: memcpy(ptr, &v, 12*sizeof(T)); break;
3515 case 11: ptr[10] = v[10]; [[fallthrough]];
3516 case 10: ptr[ 9] = v[ 9]; [[fallthrough]];
3517 case 9: ptr[ 8] = v[ 8]; [[fallthrough]];
3518 case 8: memcpy(ptr, &v, 8*sizeof(T)); break;
3520 case 7: ptr[ 6] = v[ 6]; [[fallthrough]];
3521 case 6: ptr[ 5] = v[ 5]; [[fallthrough]];
3522 case 5: ptr[ 4] = v[ 4]; [[fallthrough]];
3523 case 4: memcpy(ptr, &v, 4*sizeof(T)); break;
3524 case 3: ptr[ 2] = v[ 2]; [[fallthrough]];
3525 case 2: memcpy(ptr, &v, 2*sizeof(T)); break;
3526 case 1: ptr[ 0] = v[ 0];
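
The generic 16-wide load/store at 3482-3528 mix per-element copies with a memcpy of the largest power-of-two prefix, falling through the switch until a break is reached. A worked trace of the load side for tail == 11:

    // case 11: v[10] = ptr[10];              (fallthrough)
    // case 10: v[ 9] = ptr[ 9];              (fallthrough)
    // case  9: v[ 8] = ptr[ 8];              (fallthrough)
    // case  8: memcpy(&v, ptr, 8*sizeof(T)); break;
    //
    // Net effect: elements 0..10 are loaded (three stragglers individually,
    // then the first eight in one memcpy); lanes 11..15 keep whatever V v held.
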
3532 SI V gather(const T* ptr, U32 ix) {
3533 return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
3534 ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
3535 ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
3536 ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
3540 F gather(const float* ptr, U32 ix) {
3544 return join<F>(_mm256_i32gather_ps(ptr, lo, 4),
3545 _mm256_i32gather_ps(ptr, hi, 4));
3549 U32 gather(const uint32_t* ptr, U32 ix) {
3553 return join<U32>(_mm256_i32gather_epi32(ptr, lo, 4),
3554 _mm256_i32gather_epi32(ptr, hi, 4));
3558 SI V gather(const T* ptr, U32 ix) {
3559 return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
3560 ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], };
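
The gather overloads at 3532-3560 build the result lane by lane in the generic case, but with AVX2 the float and uint32_t versions split the 16 indices into two 8-wide halves and use hardware gathers. A standalone illustration of one such half; the function name is an assumption, while the intrinsic and its 4-byte scale come straight from the listing.

    #include <immintrin.h>

    static inline __m256 gather8(const float* ptr, __m256i ix) {
        // One hardware gather: 8 floats addressed as ptr[ix[k]] for k = 0..7.
        return _mm256_i32gather_ps(ptr, ix, /*scale=*/4);
    }
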
3592 SI void load_8888_(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
3596 case 0: rgba = vld4_u8 ((const uint8_t*)(ptr+0) ); break;
3597 case 7: rgba = vld4_lane_u8((const uint8_t*)(ptr+6), rgba, 6); [[fallthrough]];
3598 case 6: rgba = vld4_lane_u8((const uint8_t*)(ptr+5), rgba, 5); [[fallthrough]];
3599 case 5: rgba = vld4_lane_u8((const uint8_t*)(ptr+4), rgba, 4); [[fallthrough]];
3600 case 4: rgba = vld4_lane_u8((const uint8_t*)(ptr+3), rgba, 3); [[fallthrough]];
3601 case 3: rgba = vld4_lane_u8((const uint8_t*)(ptr+2), rgba, 2); [[fallthrough]];
3602 case 2: rgba = vld4_lane_u8((const uint8_t*)(ptr+1), rgba, 1); [[fallthrough]];
3603 case 1: rgba = vld4_lane_u8((const uint8_t*)(ptr+0), rgba, 0);
3610 from_8888(load<U32>(ptr, tail), r,g,b,a);
3613 SI void store_8888_(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
3622 case 0: vst4_u8 ((uint8_t*)(ptr+0), rgba ); break;
3623 case 7: vst4_lane_u8((uint8_t*)(ptr+6), rgba, 6); [[fallthrough]];
3624 case 6: vst4_lane_u8((uint8_t*)(ptr+5), rgba, 5); [[fallthrough]];
3625 case 5: vst4_lane_u8((uint8_t*)(ptr+4), rgba, 4); [[fallthrough]];
3626 case 4: vst4_lane_u8((uint8_t*)(ptr+3), rgba, 3); [[fallthrough]];
3627 case 3: vst4_lane_u8((uint8_t*)(ptr+2), rgba, 2); [[fallthrough]];
3628 case 2: vst4_lane_u8((uint8_t*)(ptr+1), rgba, 1); [[fallthrough]];
3629 case 1: vst4_lane_u8((uint8_t*)(ptr+0), rgba, 0);
3632 store(ptr, tail, cast<U32>(r | (g<<8)) << 0
3647 const uint32_t* ptr;
3648 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
3649 from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a);
3665 SI void load_565_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
3666 from_565(load<U16>(ptr, tail), r,g,b);
3668 SI void store_565_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) {
3676 store(ptr, tail, R << 11
3693 const uint16_t* ptr;
3694 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
3695 from_565(gather<U16>(ptr, ix), &r, &g, &b);
3712 SI void load_4444_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
3713 from_4444(load<U16>(ptr, tail), r,g,b,a);
3715 SI void store_4444_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
3722 store(ptr, tail, R << 12
3738 const uint16_t* ptr;
3739 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
3740 from_4444(gather<U16>(ptr, ix), &r,&g,&b,&a);
3748 SI void load_88_(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
3752 case 0: rg = vld2_u8 ((const uint8_t*)(ptr+0) ); break;
3753 case 7: rg = vld2_lane_u8((const uint8_t*)(ptr+6), rg, 6); [[fallthrough]];
3754 case 6: rg = vld2_lane_u8((const uint8_t*)(ptr+5), rg, 5); [[fallthrough]];
3755 case 5: rg = vld2_lane_u8((const uint8_t*)(ptr+4), rg, 4); [[fallthrough]];
3756 case 4: rg = vld2_lane_u8((const uint8_t*)(ptr+3), rg, 3); [[fallthrough]];
3757 case 3: rg = vld2_lane_u8((const uint8_t*)(ptr+2), rg, 2); [[fallthrough]];
3758 case 2: rg = vld2_lane_u8((const uint8_t*)(ptr+1), rg, 1); [[fallthrough]];
3759 case 1: rg = vld2_lane_u8((const uint8_t*)(ptr+0), rg, 0);
3764 from_88(load<U16>(ptr, tail), r,g);
3768 SI void store_88_(uint16_t* ptr, size_t tail, U16 r, U16 g) {
3775 case 0: vst2_u8 ((uint8_t*)(ptr+0), rg ); break;
3776 case 7: vst2_lane_u8((uint8_t*)(ptr+6), rg, 6); [[fallthrough]];
3777 case 6: vst2_lane_u8((uint8_t*)(ptr+5), rg, 5); [[fallthrough]];
3778 case 5: vst2_lane_u8((uint8_t*)(ptr+4), rg, 4); [[fallthrough]];
3779 case 4: vst2_lane_u8((uint8_t*)(ptr+3), rg, 3); [[fallthrough]];
3780 case 3: vst2_lane_u8((uint8_t*)(ptr+2), rg, 2); [[fallthrough]];
3781 case 2: vst2_lane_u8((uint8_t*)(ptr+1), rg, 1); [[fallthrough]];
3782 case 1: vst2_lane_u8((uint8_t*)(ptr+0), rg, 0);
3785 store(ptr, tail, cast<U16>(r | (g<<8)) << 0);
3803 const uint16_t* ptr;
3804 U32 ix = ix_and_ptr(&ptr, ctx, x, y);
3805 from_88(gather<U16>(ptr, ix), &r, &g);
3812 SI U16 load_8(const uint8_t* ptr, size_t tail) {
3813 return cast<U16>(load<U8>(ptr, tail));
3815 SI void store_8(uint8_t* ptr, size_t tail, U16 v) {
3816 store(ptr, tail, cast<U8>(v));
3831 const uint8_t* ptr;
3832 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
3834 a = cast<U16>(gather<U8>(ptr, ix));
3855 STAGE_PP(load_src, const uint16_t* ptr) {
3856 r = sk_unaligned_load<U16>(ptr + 0*N);
3857 g = sk_unaligned_load<U16>(ptr + 1*N);
3858 b = sk_unaligned_load<U16>(ptr + 2*N);
3859 a = sk_unaligned_load<U16>(ptr + 3*N);
3861 STAGE_PP(store_src, uint16_t* ptr) {
3862 sk_unaligned_store(ptr + 0*N, r);
3863 sk_unaligned_store(ptr + 1*N, g);
3864 sk_unaligned_store(ptr + 2*N, b);
3865 sk_unaligned_store(ptr + 3*N, a);
3867 STAGE_PP(store_src_a, uint16_t* ptr) {
3868 sk_unaligned_store(ptr, a);
3870 STAGE_PP(load_dst, const uint16_t* ptr) {
3871 dr = sk_unaligned_load<U16>(ptr + 0*N);
3872 dg = sk_unaligned_load<U16>(ptr + 1*N);
3873 db = sk_unaligned_load<U16>(ptr + 2*N);
3874 da = sk_unaligned_load<U16>(ptr + 3*N);
3876 STAGE_PP(store_dst, uint16_t* ptr) {
3877 sk_unaligned_store(ptr + 0*N, dr);
3878 sk_unaligned_store(ptr + 1*N, dg);
3879 sk_unaligned_store(ptr + 2*N, db);
3880 sk_unaligned_store(ptr + 3*N, da);
4143 const uint32_t* ptr;
4144 U32 ix = ix_and_ptr(&ptr, ctx, sx, sy);
4146 from_8888(gather<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA);
4148 ix = ix_and_ptr(&ptr, ctx, sx+1, sy);
4150 from_8888(gather<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA);
4157 ix = ix_and_ptr(&ptr, ctx, sx, sy+1);
4158 from_8888(gather<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA);
4160 ix = ix_and_ptr(&ptr, ctx, sx+1, sy+1);
4161 from_8888(gather<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA);
4216 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
4218 load_8888_(ptr, tail, &dr,&dg,&db,&da);
4223 store_8888_(ptr, tail, r,g,b,a);