SkRasterPipeline_opts.h - OpenGrok cross reference for /third_party/skia/src/opts/SkRasterPipeline

Lines Matching refs:tail
143     SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
147     SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
151     SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
156     SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
162     SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
169     SI void load2(const float* ptr, size_t tail, F* r, F* g) {
173     SI void store2(float* ptr, size_t tail, F r, F g) {
177     SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
183     SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
242     SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
244         if (__builtin_expect(tail,0)) {
246             if (tail > 1) { rg = vld2_lane_u16(ptr + 2, rg, 1); }
247             if (tail > 2) { rg = vld2_lane_u16(ptr + 4, rg, 2); }
254     SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
255         if (__builtin_expect(tail,0)) {
257             if (tail > 1) { vst2_lane_u16(ptr + 2, (uint16x4x2_t{{r,g}}), 1); }
258             if (tail > 2) { vst2_lane_u16(ptr + 4, (uint16x4x2_t{{r,g}}), 2); }
263     SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
265         if (__builtin_expect(tail,0)) {
267             if (tail > 1) { rgb = vld3_lane_u16(ptr + 3, rgb, 1); }
268             if (tail > 2) { rgb = vld3_lane_u16(ptr + 6, rgb, 2); }
276     SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
278         if (__builtin_expect(tail,0)) {
280             if (tail > 1) { rgba = vld4_lane_u16(ptr + 4, rgba, 1); }
281             if (tail > 2) { rgba = vld4_lane_u16(ptr + 8, rgba, 2); }
291     SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
292         if (__builtin_expect(tail,0)) {
294             if (tail > 1) { vst4_lane_u16(ptr + 4, (uint16x4x4_t{{r,g,b,a}}), 1); }
295             if (tail > 2) { vst4_lane_u16(ptr + 8, (uint16x4x4_t{{r,g,b,a}}), 2); }
300     SI void load2(const float* ptr, size_t tail, F* r, F* g) {
302         if (__builtin_expect(tail,0)) {
304             if (tail > 1) { rg = vld2q_lane_f32(ptr + 2, rg, 1); }
305             if (tail > 2) { rg = vld2q_lane_f32(ptr + 4, rg, 2); }
312     SI void store2(float* ptr, size_t tail, F r, F g) {
313         if (__builtin_expect(tail,0)) {
315             if (tail > 1) { vst2q_lane_f32(ptr + 2, (float32x4x2_t{{r,g}}), 1); }
316             if (tail > 2) { vst2q_lane_f32(ptr + 4, (float32x4x2_t{{r,g}}), 2); }
321     SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
323         if (__builtin_expect(tail,0)) {
325             if (tail > 1) { rgba = vld4q_lane_f32(ptr + 4, rgba, 1); }
326             if (tail > 2) { rgba = vld4q_lane_f32(ptr + 8, rgba, 2); }
335     SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
336         if (__builtin_expect(tail,0)) {
338             if (tail > 1) { vst4q_lane_f32(ptr + 4, (float32x4x4_t{{r,g,b,a}}), 1); }
339             if (tail > 2) { vst4q_lane_f32(ptr + 8, (float32x4x4_t{{r,g,b,a}}), 2); }
409     SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
411         if (__builtin_expect(tail,0)) {
414             if (tail > 3) {
416                 tail -= 4;
421             if (tail > 1) {
423                 tail -= 2;
427             if (tail > 0) {
440     SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
443         if (__builtin_expect(tail,0)) {
445             if (tail > 3) {
448                 tail -= 4;
452             if (tail > 1) {
455                 tail -= 2;
458             if (tail > 0) {
471     SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
473         if (__builtin_expect(tail,0)) {
480             if (tail > 1) { _1 = load_rgb(ptr +  3); }
481             if (tail > 2) { _2 = load_rgb(ptr +  6); }
482             if (tail > 3) { _3 = load_rgb(ptr +  9); }
483             if (tail > 4) { _4 = load_rgb(ptr + 12); }
484             if (tail > 5) { _5 = load_rgb(ptr + 15); }
485             if (tail > 6) { _6 = load_rgb(ptr + 18); }
512     SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
514         if (__builtin_expect(tail,0)) {
517             if (tail > 0) { _01 = _mm_loadl_pd(_01, src+0); }
518             if (tail > 1) { _01 = _mm_loadh_pd(_01, src+1); }
519             if (tail > 2) { _23 = _mm_loadl_pd(_23, src+2); }
520             if (tail > 3) { _23 = _mm_loadh_pd(_23, src+3); }
521             if (tail > 4) { _45 = _mm_loadl_pd(_45, src+4); }
522             if (tail > 5) { _45 = _mm_loadh_pd(_45, src+5); }
523             if (tail > 6) { _67 = _mm_loadl_pd(_67, src+6); }
546     SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
557         if (__builtin_expect(tail,0)) {
559             if (tail > 0) { _mm_storel_pd(dst+0, _01); }
560             if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
561             if (tail > 2) { _mm_storel_pd(dst+2, _23); }
562             if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
563             if (tail > 4) { _mm_storel_pd(dst+4, _45); }
564             if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
565             if (tail > 6) { _mm_storel_pd(dst+6, _67); }
574     SI void load2(const float* ptr, size_t tail, F* r, F* g) {
576         if (__builtin_expect(tail, 0)) {
579             if (tail > 3) {
582                 tail -= 4;
586             if (tail > 1) {
589                 tail -= 2;
592             if (tail > 0) {
607     SI void store2(float* ptr, size_t tail, F r, F g) {
613         if (__builtin_expect(tail, 0)) {
615             if (tail > 3) {
618                 tail -= 4;
622             if (tail > 1) {
625                 tail -= 2;
628             if (tail > 0) {
638     SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
641         switch (tail) {
662     SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
673         if (__builtin_expect(tail, 0)) {
674             if (tail > 0) { _mm_storeu_ps(ptr+ 0, _mm256_extractf128_ps(_04, 0)); }
675             if (tail > 1) { _mm_storeu_ps(ptr+ 4, _mm256_extractf128_ps(_15, 0)); }
676             if (tail > 2) { _mm_storeu_ps(ptr+ 8, _mm256_extractf128_ps(_26, 0)); }
677             if (tail > 3) { _mm_storeu_ps(ptr+12, _mm256_extractf128_ps(_37, 0)); }
678             if (tail > 4) { _mm_storeu_ps(ptr+16, _mm256_extractf128_ps(_04, 1)); }
679             if (tail > 5) { _mm_storeu_ps(ptr+20, _mm256_extractf128_ps(_15, 1)); }
680             if (tail > 6) { _mm_storeu_ps(ptr+24, _mm256_extractf128_ps(_26, 1)); }
747     SI void load2(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
749         if (__builtin_expect(tail,0)) {
751             if (tail > 1) {
753                 if (tail > 2) {
771     SI void store2(uint16_t* ptr, size_t tail, U16 r, U16 g) {
773         if (__builtin_expect(tail, 0)) {
774             if (tail > 1) {
776                 if (tail > 2) {
789     SI void load3(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
791         if (__builtin_expect(tail,0)) {
798             if (tail > 1) { _1 = load_rgb(ptr + 3); }
799             if (tail > 2) { _2 = load_rgb(ptr + 6); }
825     SI void load4(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
827         if (__builtin_expect(tail,0)) {
831             if (tail > 1) { _01 = _mm_loadh_pd(_01, src + 1); } // r0 g0 b0 a0 r1 g1 b1 a1
832             if (tail > 2) { _23 = _mm_loadl_pd(_23, src + 2); } // r2 g2 b2 a2 00 00 00 00
850     SI void store4(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
854         if (__builtin_expect(tail, 0)) {
857             if (tail > 1) { _mm_storeh_pd(dst + 1, _mm_unpacklo_epi32(rg, ba)); }
858             if (tail > 2) { _mm_storel_pd(dst + 2, _mm_unpackhi_epi32(rg, ba)); }
865     SI void load2(const float* ptr, size_t tail, F* r, F* g) {
867         if (__builtin_expect(tail, 0)) {
870             if (tail > 1) { _01 = _mm_loadh_pi(_01, (__m64 const*)(ptr + 2)); }
871             if (tail > 2) { _23 = _mm_loadl_pi(_23, (__m64 const*)(ptr + 4)); }
879     SI void store2(float* ptr, size_t tail, F r, F g) {
882         if (__builtin_expect(tail, 0)) {
884             if (tail > 1) { _mm_storeh_pi((__m64*)(ptr + 2), _01); }
885             if (tail > 2) { _mm_storel_pi((__m64*)(ptr + 4), _23); }
892     SI void load4(const float* ptr, size_t tail, F* r, F* g, F* b, F* a) {
894         if (__builtin_expect(tail, 0)) {
897             if (tail > 1) { _1 = _mm_loadu_ps(ptr + 4); }
898             if (tail > 2) { _2 = _mm_loadu_ps(ptr + 8); }
912     SI void store4(float* ptr, size_t tail, F r, F g, F b, F a) {
914         if (__builtin_expect(tail, 0)) {
916             if (tail > 1) { _mm_storeu_ps(ptr + 4, g); }
917             if (tail > 2) { _mm_storeu_ps(ptr + 8, b); }
1045 //    tail == 0 ~~> work on a full N pixels
1046 //    tail != 0 ~~> work on only the first tail pixels
1047 // tail is always < N.
1074         size_t dx, dy, tail;
1080     using Stage = void(ABI*)(size_t tail, void** program, size_t dx, size_t dy, F,F,F,F, F,F,F,F);
1094         if (size_t tail = xlimit - params.dx) {
1095             params.tail = tail;
1104         if (size_t tail = xlimit - dx) {
1105             start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0);
1113         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,        \
1117             name##_k(Ctx{program},params->dx,params->dy,params->tail, r,g,b,a,  \
1122         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,        \
1126         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,             \
1128         static void ABI name(size_t tail, void** program, size_t dx, size_t dy,      \
1130             name##_k(Ctx{program},dx,dy,tail, r,g,b,a, dr,dg,db,da);                 \
1132             next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                          \
1134         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,             \
1150 // These load() and store() methods are tail-aware,
1151 // but focus mainly on keeping the at-stride tail==0 case fast.
1154 SI V load(const T* src, size_t tail) {
1156     __builtin_assume(tail < N);
1157     if (__builtin_expect(tail, 0)) {
1159         switch (tail) {
1175 SI void store(T* dst, V v, size_t tail) {
1177     __builtin_assume(tail < N);
1178     if (__builtin_expect(tail, 0)) {
1179         switch (tail) {
1584     U32 dst = load<U32>(ptr, tail);
1603     store(ptr, dst, tail);
1753     auto scales = load<U8>(ptr, tail);
1765     from_565(load<U16>(ptr, tail), &cr, &cg, &cb);
1802     auto scales = load<U8>(ptr, tail);
1814     from_565(load<U16>(ptr, tail), &cr, &cg, &cb);
1828     F mul = from_byte(load<U8>(mptr, tail)),
1829       add = from_byte(load<U8>(aptr, tail));
1940     a = from_byte(load<U8>(ptr, tail));
1946     da = from_byte(load<U8>(ptr, tail));
1958     store(ptr, packed, tail);
1964     from_565(load<U16>(ptr, tail), &r,&g,&b);
1970     from_565(load<U16>(ptr, tail), &dr,&dg,&db);
1985     store(ptr, px, tail);
1990     from_4444(load<U16>(ptr, tail), &r,&g,&b,&a);
1994     from_4444(load<U16>(ptr, tail), &dr,&dg,&db,&da);
2007     store(ptr, px, tail);
2012     from_8888(load<U32>(ptr, tail), &r,&g,&b,&a);
2016     from_8888(load<U32>(ptr, tail), &dr,&dg,&db,&da);
2030     store(ptr, px, tail);
2035     from_88(load<U16>(ptr, tail), &r, &g);
2041     from_88(load<U16>(ptr, tail), &dr, &dg);
2055     store(ptr, px, tail);
2061     a = from_short(load<U16>(ptr, tail));
2066     da = from_short(load<U16>(ptr, tail));
2078     store(ptr, px, tail);
2084     from_1616(load<U32>(ptr, tail), &r,&g);
2088     from_1616(load<U32>(ptr, tail), &dr, &dg);
2104     store(ptr, px, tail);
2109     from_16161616(load<U64>(ptr, tail), &r,&g, &b, &a);
2113     from_16161616(load<U64>(ptr, tail), &dr, &dg, &db, &da);
2128     store4(ptr,tail, R,G,B,A);
2134     from_1010102(load<U32>(ptr, tail), &r,&g,&b,&a);
2138     from_1010102(load<U32>(ptr, tail), &dr,&dg,&db,&da);
2152     store(ptr, px, tail);
2159     load4((const uint16_t*)ptr,tail, &R,&G,&B,&A);
2169     load4((const uint16_t*)ptr,tail, &R,&G,&B,&A);
2189     store4((uint16_t*)ptr,tail, to_half(r)
2203     store4(ptr,tail, R,G,B,A);
2209     U16 A = load<U16>((const uint16_t*)ptr, tail);
2218     U16 A = load<U16>((const uint16_t*)ptr, tail);
2230     store(ptr, to_half(a), tail);
2237     load2((const uint16_t*)ptr, tail, &R, &G);
2247     load2((const uint16_t*)ptr, tail, &R, &G);
2267     store2((uint16_t*)ptr, tail, to_half(r)
2273     load4(ptr,tail, &r,&g,&b,&a);
2277     load4(ptr,tail, &dr,&dg,&db,&da);
2289     store4(ptr,tail, r,g,b,a);
2294     load2(ptr, tail, &r, &g);
2300     store2(ptr, tail, r, g);
2688     c->fn(c, tail ? tail : N);
2933         size_t dx, dy, tail;
2939     using Stage = void (ABI*)(size_t tail, void** program, size_t dx, size_t dy,
2953         if (size_t tail = xlimit - params.dx) {
2954             params.tail = tail;
2962         if (size_t tail = xlimit - dx) {
2963             start(tail,program,dx,dy, 0,0,0,0, 0,0,0,0);
2987         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y);          \
2991             name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y);                   \
2997         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)
3000         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,         \
3006             name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y, r,g,b,a,       \
3011         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,         \
3016         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                   \
3020             name##_k(Ctx{program}, params->dx,params->dy,params->tail, r,g,b,a,            \
3025         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                   \
3030         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y);      \
3031         static void ABI name(size_t tail, void** program, size_t dx, size_t dy,            \
3036             name##_k(Ctx{program}, dx,dy,tail, x,y);                                       \
3040             next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                \
3042         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y)
3045         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,         \
3048         static void ABI name(size_t tail, void** program, size_t dx, size_t dy,            \
3053             name##_k(Ctx{program}, dx,dy,tail, x,y, r,g,b,a, dr,dg,db,da);                 \
3055             next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                \
3057         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,         \
3062         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                   \
3065         static void ABI name(size_t tail, void** program, size_t dx, size_t dy,            \
3068             name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da);                      \
3070             next(tail,program,dx,dy, r,g,b,a, dr,dg,db,da);                                \
3072         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                   \
3482 SI V load(const T* ptr, size_t tail) {
3484     switch (tail & (N-1)) {
3507 SI void store(T* ptr, size_t tail, V v) {
3508     switch (tail & (N-1)) {
3592 SI void load_8888_(const uint32_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
3595     switch (tail & (N-1)) {
3610     from_8888(load<U32>(ptr, tail), r,g,b,a);
3613 SI void store_8888_(uint32_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
3621     switch (tail & (N-1)) {
3632     store(ptr, tail, cast<U32>(r | (g<<8)) <<  0
3638     load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
3641     load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
3644     store_8888_(ptr_at_xy<uint32_t>(ctx, dx,dy), tail, r,g,b,a);
3665 SI void load_565_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b) {
3666     from_565(load<U16>(ptr, tail), r,g,b);
3668 SI void store_565_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b) {
3676     store(ptr, tail, R << 11
3682     load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b);
3686     load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db);
3690     store_565_(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b);
3712 SI void load_4444_(const uint16_t* ptr, size_t tail, U16* r, U16* g, U16* b, U16* a) {
3713     from_4444(load<U16>(ptr, tail), r,g,b,a);
3715 SI void store_4444_(uint16_t* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
3722     store(ptr, tail, R << 12
3729     load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &r,&g,&b,&a);
3732     load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &dr,&dg,&db,&da);
3735     store_4444_(ptr_at_xy<uint16_t>(ctx, dx,dy), tail, r,g,b,a);
3748 SI void load_88_(const uint16_t* ptr, size_t tail, U16* r, U16* g) {
3751     switch (tail & (N-1)) {
3764     from_88(load<U16>(ptr, tail), r,g);
3768 SI void store_88_(uint16_t* ptr, size_t tail, U16 r, U16 g) {
3774     switch (tail & (N-1)) {
3785     store(ptr, tail, cast<U16>(r | (g<<8)) <<  0);
3790     load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), tail, &r, &g);
3795     load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), tail, &dr, &dg);
3800     store_88_(ptr_at_xy<uint16_t>(ctx, dx, dy), tail, r, g);
3812 SI U16 load_8(const uint8_t* ptr, size_t tail) {
3813     return cast<U16>(load<U8>(ptr, tail));
3815 SI void store_8(uint8_t* ptr, size_t tail, U16 v) {
3816     store(ptr, tail, cast<U8>(v));
3821     a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
3825     da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
3828     store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), tail, a);
3916     U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
3923     U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy), tail);
3937     load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
3947     load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), tail, &cr,&cg,&cb);
3957     U16 mul = load_8(ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy), tail),
3958         add = load_8(ptr_at_xy<const uint8_t>(&ctx->add, dx,dy), tail);
4218     load_8888_(ptr, tail, &dr,&dg,&db,&da);
4223     store_8888_(ptr, tail, r,g,b,a);