Lines Matching refs:tail
143 SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
147 SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
151 SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
156 SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
162 SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
169 SI void load2(const float* ptr, size_t tail, F* r, F* g) {
173 SI void store2(float* ptr, size_t tail, F r, F g) {
177 SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
183 SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
242 SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
244 if (__builtin_expect(tail,0)) {
246 if (tail > 1) { rg = vld2_lane_u16(ptr + 2, rg, 1); }
247 if (tail > 2) { rg = vld2_lane_u16(ptr + 4, rg, 2); }
254 SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
255 if (__builtin_expect(tail,0)) {
257 if (tail > 1) { vst2_lane_u16(ptr + 2, (uint16x4x2_t{{r,g}}), 1); }
258 if (tail > 2) { vst2_lane_u16(ptr + 4, (uint16x4x2_t{{r,g}}), 2); }
263 SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
265 if (__builtin_expect(tail,0)) {
267 if (tail > 1) { rgb = vld3_lane_u16(ptr + 3, rgb, 1); }
268 if (tail > 2) { rgb = vld3_lane_u16(ptr + 6, rgb, 2); }
276 SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
278 if (__builtin_expect(tail,0)) {
280 if (tail > 1) { rgba = vld4_lane_u16(ptr + 4, rgba, 1); }
281 if (tail > 2) { rgba = vld4_lane_u16(ptr + 8, rgba, 2); }
291 SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
292 if (__builtin_expect(tail,0)) {
294 if (tail > 1) { vst4_lane_u16(ptr + 4, (uint16x4x4_t{{r,g,b,a}}), 1); }
295 if (tail > 2) { vst4_lane_u16(ptr + 8, (uint16x4x4_t{{r,g,b,a}}), 2); }
300 SI void load2(const float* ptr, size_t tail, F* r, F* g) {
302 if (__builtin_expect(tail,0)) {
304 if (tail > 1) { rg = vld2q_lane_f32(ptr + 2, rg, 1); }
305 if (tail > 2) { rg = vld2q_lane_f32(ptr + 4, rg, 2); }
312 SI void store2(float* ptr, size_t tail, F r, F g) {
313 if (__builtin_expect(tail,0)) {
315 if (tail > 1) { vst2q_lane_f32(ptr + 2, (float32x4x2_t{{r,g}}), 1); }
316 if (tail > 2) { vst2q_lane_f32(ptr + 4, (float32x4x2_t{{r,g}}), 2); }
321 SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
323 if (__builtin_expect(tail,0)) {
325 if (tail > 1) { rgba = vld4q_lane_f32(ptr + 4, rgba, 1); }
326 if (tail > 2) { rgba = vld4q_lane_f32(ptr + 8, rgba, 2); }
335 SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
336 if (__builtin_expect(tail,0)) {
338 if (tail > 1) { vst4q_lane_f32(ptr + 4, (float32x4x4_t{{r,g,b,a}}), 1); }
339 if (tail > 2) { vst4q_lane_f32(ptr + 8, (float32x4x4_t{{r,g,b,a}}), 2); }
409 SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
411 if (__builtin_expect(tail,0)) {
414 if (tail > 3) {
416 tail -= 4;
421 if (tail > 1) {
423 tail -= 2;
427 if (tail > 0) {
440 SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
443 if (__builtin_expect(tail,0)) {
445 if (tail > 3) {
448 tail -= 4;
452 if (tail > 1) {
455 tail -= 2;
458 if (tail > 0) {
471 SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
473 if (__builtin_expect(tail,0)) {
480 if (tail > 1) { _1 = load_rgb(ptr + 3); }
481 if (tail > 2) { _2 = load_rgb(ptr + 6); }
482 if (tail > 3) { _3 = load_rgb(ptr + 9); }
483 if (tail > 4) { _4 = load_rgb(ptr + 12); }
484 if (tail > 5) { _5 = load_rgb(ptr + 15); }
485 if (tail > 6) { _6 = load_rgb(ptr + 18); }
512 SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
514 if (__builtin_expect(tail,0)) {
517 if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); }
518 if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); }
519 if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); }
520 if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); }
521 if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); }
522 if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); }
523 if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); }
546 SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
557 if (__builtin_expect(tail,0)) {
559 if (tail > 0) { _mm_storel_pd(dst+0, _01); }
560 if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
561 if (tail > 2) { _mm_storel_pd(dst+2, _23); }
562 if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
563 if (tail > 4) { _mm_storel_pd(dst+4, _45); }
564 if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
565 if (tail > 6) { _mm_storel_pd(dst+6, _67); }
574 SI void load2(const float* ptr, size_t tail, F* r, F* g) {
576 if (__builtin_expect(tail, 0)) {
579 if (tail > 3) {
582 tail -= 4;
586 if (tail > 1) {
589 tail -= 2;
592 if (tail > 0) {
607 SI void store2(float* ptr, size_t tail, F r, F g) {
613 if (__builtin_expect(tail, 0)) {
615 if (tail > 3) {
618 tail -= 4;
622 if (tail > 1) {
625 tail -= 2;
628 if (tail > 0) {
638 SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
641 switch (tail) {
662 SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
673 if (__builtin_expect(tail, 0)) {
674 if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); }
675 if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); }
676 if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); }
677 if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); }
678 if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); }
679 if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); }
680 if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); }
747 SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
749 if (__builtin_expect(tail,0)) {
751 if (tail > 1) {
753 if (tail > 2) {
771 SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
773 if (__builtin_expect(tail, 0)) {
774 if (tail > 1) {
776 if (tail > 2) {
789 SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
791 if (__builtin_expect(tail,0)) {
798 if (tail > 1) { _1 = load_rgb(ptr + 3); }
799 if (tail > 2) { _2 = load_rgb(ptr + 6); }
825 SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
827 if (__builtin_expect(tail,0)) {
831 if (tail > 1) { _01 = _mm_loadh_pd(_01, src + 1); } // r0 g0 b0 a0 r1 g1 b1 a1
832 if (tail > 2) { _23 = _mm_loadl_pd(_23, src + 2); } // r2 g2 b2 a2 00 00 00 00
850 SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
854 if (__builtin_expect(tail, 0)) {
857 if (tail > 1) { _mm_storeh_pd(dst + 1, _mm_unpacklo_epi32(rg, ba)); }
858 if (tail > 2) { _mm_storel_pd(dst + 2, _mm_unpackhi_epi32(rg, ba)); }
865 SI void load2(const float* ptr, size_t tail, F* r, F* g) {
867 if (__builtin_expect(tail, 0)) {
870 if (tail > 1) { _01 = _mm_loadh_pi(_01, (__m64 const*)(ptr + 2)); }
871 if (tail > 2) { _23 = _mm_loadl_pi(_23, (__m64 const*)(ptr + 4)); }
879 SI void store2(float* ptr, size_t tail, F r, F g) {
882 if (__builtin_expect(tail, 0)) {
884 if (tail > 1) { _mm_storeh_pi((__m64*)(ptr + 2), _01); }
885 if (tail > 2) { _mm_storel_pi((__m64*)(ptr + 4), _23); }
892 SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
894 if (__builtin_expect(tail, 0)) {
897 if (tail > 1) { _1 = _mm_loadu_ps(ptr + 4); }
898 if (tail > 2) { _2 = _mm_loadu_ps(ptr + 8); }
912 SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
914 if (__builtin_expect(tail, 0)) {
916 if (tail > 1) { _mm_storeu_ps(ptr + 4, g); }
917 if (tail > 2) { _mm_storeu_ps(ptr + 8, b); }
1045 // tail == 0 ~~> work on a full N pixels
1046 // tail != 0 ~~> work on only the first tail pixels
1047 // tail is always < N.
1074 size_t dx, dy, tail;
1080 using Stage = void(ABI*)(size_t tail, void** program, size_t dx, size_t dy, F,F,F,F, F,F,F,F);
1094 if (size_t tail = xlimit - params.dx) {
1095 params.tail = tail;
1104 if (size_t tail = xlimit - dx) {
1105 start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0);
1113 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
1117 name##_k(Ctx{program},params->dx,params->dy,params->tail, r,g,b,a, \
1122 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
1126 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
1128 static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \
1130 name##_k(Ctx{program},dx,dy,tail, r,g,b,a, dr,dg,db,da); \
1132 next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
1134 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
1150 // These load() and store() methods are tail-aware,
1151 // but focus mainly on keeping the at-stride tail==0 case fast.
1154 SI V load(const T* src, size_t tail) {
1156 __builtin_assume(tail < N);
1157 if (__builtin_expect(tail, 0)) {
1159 switch (tail) {
1175 SI void store(T* dst, V v, size_t tail) {
1177 __builtin_assume(tail < N);
1178 if (__builtin_expect(tail, 0)) {
1179 switch (tail) {
1584 U32 dst = load<U32>(ptr, tail);
1603 store(ptr, dst, tail);
1753 auto scales = load<U8>(ptr, tail);
1765 from_565(load<U16>(ptr, tail), &cr, &cg, &cb);
1802 auto scales = load<U8>(ptr, tail);
1814 from_565(load<U16>(ptr, tail), &cr, &cg, &cb);
1828 F mul = from_byte(load<U8>(mptr, tail)),
1829 add = from_byte(load<U8>(aptr, tail));
1940 a = from_byte(load<U8>(ptr, tail));
1946 da = from_byte(load<U8>(ptr, tail));
1958 store(ptr, packed, tail);
1964 from_565(load<U16>(ptr, tail), &r,&g,&b);
1970 from_565(load<U16>(ptr, tail), &dr,&dg,&db);
1985 store(ptr, px, tail);
1990 from_4444(load<U16>(ptr, tail), &r,&g,&b,&a);
1994 from_4444(load<U16>(ptr, tail), &dr,&dg,&db,&da);
2007 store(ptr, px, tail);
2012 from_8888(load<U32>(ptr, tail), &r,&g,&b,&a);
2016 from_8888(load<U32>(ptr, tail), &dr,&dg,&db,&da);
2030 store(ptr, px, tail);
2035 from_88(load<U16>(ptr, tail), &r, &g);
2041 from_88(load<U16>(ptr, tail), &dr, &dg);
2055 store(ptr, px, tail);
2061 a = from_short(load<U16>(ptr, tail));
2066 da = from_short(load<U16>(ptr, tail));
2078 store(ptr, px, tail);
2084 from_1616(load<U32>(ptr, tail), &r,&g);
2088 from_1616(load<U32>(ptr, tail), &dr, &dg);
2104 store(ptr, px, tail);
2109 from_16161616(load<U64>(ptr, tail), &r,&g, &b, &a);
2113 from_16161616(load<U64>(ptr, tail), &dr, &dg, &db, &da);
2128 store4(ptr,tail, R,G,B,A);
2134 from_1010102(load<U32>(ptr, tail), &r,&g,&b,&a);
2138 from_1010102(load<U32>(ptr, tail), &dr,&dg,&db,&da);
2152 store(ptr, px, tail);
2159 load4((const uint16_t*)ptr,tail, &R,&G,&B,&A);
2169 load4((const uint16_t*)ptr,tail, &R,&G,&B,&A);
2189 store4((uint16_t*)ptr,tail, to_half(r)
2203 store4(ptr,tail, R,G,B,A);
2209 U16 A = load<U16>((const uint16_t*)ptr, tail);
2218 U16 A = load<U16>((const uint16_t*)ptr, tail);
2230 store(ptr, to_half(a), tail);
2237 load2((const uint16_t*)ptr, tail, &R, &G);
2247 load2((const uint16_t*)ptr, tail, &R, &G);
2267 store2((uint16_t*)ptr, tail, to_half(r)
2273 load4(ptr,tail, &r,&g,&b,&a);
2277 load4(ptr,tail, &dr,&dg,&db,&da);
2289 store4(ptr,tail, r,g,b,a);
2294 load2(ptr, tail, &r, &g);
2300 store2(ptr, tail, r, g);
2688 c->fn(c, tail ? tail : N);
2933 size_t dx, dy, tail;
2939 using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy,
2953 if (size_t tail = xlimit - params.dx) {
2954 params.tail = tail;
2962 if (size_t tail = xlimit - dx) {
2963 start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0);
2987 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \
2991 name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y); \
2997 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)
3000 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
3006 name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y, r,g,b,a, \
3011 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
3016 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
3020 name##_k(Ctx{program}, params->dx,params->dy,params->tail, r,g,b,a, \
3025 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
3030 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y); \
3031 static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \
3036 name##_k(Ctx{program}, dx,dy,tail, x,y); \
3040 next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
3042 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)
3045 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
3048 static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \
3053 name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da); \
3055 next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
3057 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y, \
3062 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
3065 static void ABI name(size_t tail, void** program, size_t dx, size_t dy, \
3068 name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da); \
3070 next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da); \
3072 SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, \
3482 SI V load(const T* ptr, size_t tail) {
3484 switch (tail & (N-1)) {
3507 SI void store(T* ptr, size_t tail, V v) {
3508 switch (tail & (N-1)) {
3592 SI void load_8888_(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
3595 switch (tail & (N-1)) {
3610 from_8888(load<U32>(ptr, tail), r,g,b,a);
3613 SI void store_8888_(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
3621 switch (tail & (N-1)) {
3632 store(ptr, tail, cast<U32>(r | (g<<8)) << 0
3638 load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
3641 load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
3644 store_8888_(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, r,g,b,a);
3665 SI void load_565_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
3666 from_565(load<U16>(ptr, tail), r,g,b);
3668 SI void store_565_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) {
3676 store(ptr, tail, R << 11
3682 load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b);
3686 load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db);
3690 store_565_(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b);
3712 SI void load_4444_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
3713 from_4444(load<U16>(ptr, tail), r,g,b,a);
3715 SI void store_4444_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
3722 store(ptr, tail, R << 12
3729 load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
3732 load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
3735 store_4444_(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b,a);
3748 SI void load_88_(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
3751 switch (tail & (N-1)) {
3764 from_88(load<U16>(ptr, tail), r,g);
3768 SI void store_88_(uint16_t* ptr, size_t tail, U16 r, U16 g) {
3774 switch (tail & (N-1)) {
3785 store(ptr, tail, cast<U16>(r | (g<<8)) << 0);
3790 load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), tail, &r, &g);
3795 load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), tail, &dr, &dg);
3800 store_88_(ptr_at_xy<uint16_t>(ctx, dx, dy), tail, r, g);
3812 SI U16 load_8(const uint8_t* ptr, size_t tail) {
3813 return cast<U16>(load<U8>(ptr, tail));
3815 SI void store_8(uint8_t* ptr, size_t tail, U16 v) {
3816 store(ptr, tail, cast<U8>(v));
3821 a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
3825 da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
3828 store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, a);
3916 U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
3923 U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
3937 load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
3947 load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
3957 U16 mul = load_8(ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy), tail),
3958 add = load_8(ptr_at_xy<const uint8_t>(&ctx->add, dx,dy), tail);
4218 load_8888_(ptr, tail, &dr,&dg,&db,&da);
4223 store_8888_(ptr, tail, r,g,b,a);