1diff --git a/arm/arm_init.c b/arm/arm_init.c 2index 3a89998ab..05aa2c0d9 100644 3--- a/arm/arm_init.c 4+++ b/arm/arm_init.c 5@@ -113,13 +113,23 @@ png_init_filter_functions_neon(png_structp pp, unsigned int bpp) 6 * initialization function.) 7 */ 8 pp->read_filter[PNG_FILTER_VALUE_UP-1] = png_read_filter_row_up_neon; 9- 10+#ifdef PNG_MULTY_LINE_ENABLE 11+ // OH ISSUE: png optimize 12+ pp->read_filter[PNG_FILTER_VALUE_UP_X2-1] = png_read_filter_row_up_x2_neon; 13+#endif 14 if (bpp == 3) 15 { 16 pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_neon; 17 pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_neon; 18 pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = 19 png_read_filter_row_paeth3_neon; 20+#ifdef PNG_MULTY_LINE_ENABLE 21+ // OH ISSUE: png optimize 22+ pp->read_filter[PNG_FILTER_VALUE_AVG_X2-1] = 23+ png_read_filter_row_avg3_x2_neon; 24+ pp->read_filter[PNG_FILTER_VALUE_PAETH_X2-1] = 25+ png_read_filter_row_paeth3_x2_neon; 26+#endif 27 } 28 29 else if (bpp == 4) 30@@ -128,6 +138,13 @@ png_init_filter_functions_neon(png_structp pp, unsigned int bpp) 31 pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_neon; 32 pp->read_filter[PNG_FILTER_VALUE_PAETH-1] = 33 png_read_filter_row_paeth4_neon; 34+#ifdef PNG_MULTY_LINE_ENABLE 35+ // OH ISSUE: png optimize 36+ pp->read_filter[PNG_FILTER_VALUE_AVG_X2-1] = 37+ png_read_filter_row_avg4_x2_neon; 38+ pp->read_filter[PNG_FILTER_VALUE_PAETH_X2-1] = 39+ png_read_filter_row_paeth4_x2_neon; 40+#endif 41 } 42 } 43 #endif /* PNG_ARM_NEON_OPT > 0 */ 44diff --git a/arm/filter_neon_intrinsics.c b/arm/filter_neon_intrinsics.c 45index 4466d48b2..4ff810a19 100644 46--- a/arm/filter_neon_intrinsics.c 47+++ b/arm/filter_neon_intrinsics.c 48@@ -47,6 +47,7 @@ 49 50 #if PNG_ARM_NEON_OPT > 0 51 52+#ifndef PNG_MULTY_LINE_ENABLE 53 void 54 png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row, 55 png_const_bytep prev_row) 56@@ -396,7 +397,1351 @@ 
png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row, 57 vst4_lane_u32(png_ptr(uint32_t,rp), vdest_val, 0); 58 } 59 } 60+#else 61+// OH ISSUE: png optimize 62+// By definition, row_info->rowbytes = row_width * row_info->channels, 63+// so the input rowbytes is a multiple of the pixel size (3 or 4 bytes). Therefore: 64+// for RGB, NEON processes 12 bytes at once, so any tail is 3, 6 or 9 bytes; 65+// for RGBA, NEON processes 16 or 8 bytes at once, so any tail is 4 bytes. 66+// The filter functions are internal; callers ensure row_info and row are non-empty. 67+#define STEP_RGB (12) // 3-channel RGB: NEON processes 12 bytes at once 68+#define TAIL_RGB3 (9) // a tail of 3 pixels is 9 bytes 69+#define TAIL_RGB2 (6) // a tail of 2 pixels is 6 bytes 70+#define TAIL_RGB1 (3) // a tail of 1 pixel is 3 bytes 71+#define STEP_RGBA (16) // RGBA: NEON processes 16 bytes at once 72+#define STEP_RGBA_HALF (8) // RGBA: NEON processes 8 bytes at once 73+#define TAIL_RGBA (4) // a tail of 1 pixel is 4 bytes 74+#define IND3 (3) // index 3 75+#define IND2 (2) // index 2 76+#define OFFSET3 (3) // RGB offset 3 bytes 77+#define OFFSET6 (6) // RGB offset 6 bytes 78+void png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row, 79+ png_const_bytep prev_row) 80+{ 81+ png_bytep rp = row; 82+ png_const_bytep pp = prev_row; 83+ int count = row_info->rowbytes; 84+ 85+ png_debug(1, "in png_read_filter_row_up_neon"); 86+ 87+ uint8x16_t qrp, qpp; 88+ while (count >= STEP_RGBA) { 89+ qrp = vld1q_u8(rp); 90+ qpp = vld1q_u8(pp); 91+ qrp = vaddq_u8(qrp, qpp); 92+ vst1q_u8(rp, qrp); 93+ rp += STEP_RGBA; 94+ pp += STEP_RGBA; 95+ count -= STEP_RGBA; 96+ } 97+ 98+ if (count >= STEP_RGBA_HALF) { 99+ uint8x8_t qrp1, qpp1; 100+ qrp1 = vld1_u8(rp); 101+ qpp1 = vld1_u8(pp); 102+ qrp1 = vadd_u8(qrp1, qpp1); 103+ vst1_u8(rp, qrp1); 104+ rp += STEP_RGBA_HALF; 105+ pp += STEP_RGBA_HALF; 106+ count -= STEP_RGBA_HALF; 107+ } 108+ 109+ for (int i = 0; i < count; i++) { 110+ *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); 111+ rp++; 112+ } 
113+} 114+ 115+void png_read_filter_row_up_x2_neon(png_row_infop row_info, png_bytep row, 116+ png_const_bytep prev_row) 117+{ 118+ png_bytep rp = row; 119+ png_const_bytep pp = prev_row; 120+ int count = row_info->rowbytes; 121+ png_bytep np = row + row_info->rowbytes + 1; 122+ 123+ png_debug(1, "in png_read_filter_row_up_x2_neon"); 124+ 125+ uint8x16_t qrp, qpp, qnp; 126+ while (count >= STEP_RGBA) { 127+ qrp = vld1q_u8(rp); 128+ qpp = vld1q_u8(pp); 129+ qnp = vld1q_u8(np); 130+ qrp = vaddq_u8(qrp, qpp); 131+ qnp = vaddq_u8(qnp, qrp); 132+ vst1q_u8(rp, qrp); 133+ vst1q_u8(np, qnp); 134+ rp += STEP_RGBA; 135+ pp += STEP_RGBA; 136+ np += STEP_RGBA; 137+ count -= STEP_RGBA; 138+ } 139+ 140+ if (count >= STEP_RGBA_HALF) { 141+ uint8x8_t qrp1, qpp1, qnp1; 142+ qrp1 = vld1_u8(rp); 143+ qpp1 = vld1_u8(pp); 144+ qnp1 = vld1_u8(np); 145+ qrp1 = vadd_u8(qrp1, qpp1); 146+ qnp1 = vadd_u8(qnp1, qrp1); 147+ vst1_u8(rp, qrp1); 148+ vst1_u8(np, qnp1); 149+ rp += STEP_RGBA_HALF; 150+ pp += STEP_RGBA_HALF; 151+ np += STEP_RGBA_HALF; 152+ count -= STEP_RGBA_HALF; 153+ } 154+ 155+ for (int i = 0; i < count; i++) { 156+ *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff); 157+ *np = (png_byte)(((int)(*np) + (int)(*rp++)) & 0xff); 158+ np++; 159+ } 160+} 161+ 162+void png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row, 163+ png_const_bytep prev_row) 164+{ 165+ png_bytep rp = row; 166+ png_bytep rp_stop = row + row_info->rowbytes; 167+ 168+ uint8x16_t vtmp = vld1q_u8(rp); 169+ uint8x8x2_t *vrpt = png_ptr(uint8x8x2_t, &vtmp); 170+ uint8x8x2_t vrp = *vrpt; 171+ 172+ uint8x8x4_t vdest; 173+ vdest.val[IND3] = vdup_n_u8(0); 174+ 175+ uint8x8_t vtmp1, vtmp2; 176+ uint32x2_t *temp_pointer; 177+ 178+ png_debug(1, "in png_read_filter_row_sub3_neon"); 179+ 180+ size_t tail_bytes = row_info->rowbytes % STEP_RGB; 181+ png_byte last_byte = *rp_stop; 182+ png_bytep rp_stop_new = rp_stop - tail_bytes; 183+ for (; rp < rp_stop_new;) 184+ { 185+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 
OFFSET3); 186+ vdest.val[0] = vadd_u8(vdest.val[IND3], vrp.val[0]); 187+ vtmp2 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6); 188+ vdest.val[1] = vadd_u8(vdest.val[0], vtmp1); 189+ 190+ vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1); 191+ vdest.val[IND2] = vadd_u8(vdest.val[1], vtmp2); 192+ vdest.val[IND3] = vadd_u8(vdest.val[IND2], vtmp1); 193+ 194+ vtmp = vld1q_u8(rp + STEP_RGB); 195+ vrpt = png_ptr(uint8x8x2_t, &vtmp); 196+ vrp = *vrpt; 197+ 198+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 199+ rp += OFFSET3; 200+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 201+ rp += OFFSET3; 202+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0); 203+ rp += OFFSET3; 204+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND3]), 0); 205+ rp += OFFSET3; 206+ } 207+ 208+ if (tail_bytes == TAIL_RGB1) { 209+ vdest.val[0] = vadd_u8(vdest.val[IND3], vrp.val[0]); 210+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 211+ } else if (tail_bytes == TAIL_RGB2) { 212+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 213+ vdest.val[0] = vadd_u8(vdest.val[IND3], vrp.val[0]); 214+ vdest.val[1] = vadd_u8(vdest.val[0], vtmp1); 215+ 216+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 217+ rp += OFFSET3; 218+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 219+ } else if (tail_bytes == TAIL_RGB3) { 220+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 221+ vdest.val[0] = vadd_u8(vdest.val[IND3], vrp.val[0]); 222+ vtmp2 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6); 223+ vdest.val[1] = vadd_u8(vdest.val[0], vtmp1); 224+ vdest.val[IND2] = vadd_u8(vdest.val[1], vtmp2); 225+ 226+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 227+ rp += OFFSET3; 228+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 229+ rp += OFFSET3; 230+ vst1_lane_u32(png_ptr(uint32_t, 
rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0); 231+ } 232+ *rp_stop = last_byte; 233+ 234+ PNG_UNUSED(prev_row) 235+} 236+ 237+void png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row, 238+ png_const_bytep prev_row) 239+{ 240+ png_bytep rp = row; 241+ int count = row_info->rowbytes; 242+ 243+ uint8x8x4_t vdest; 244+ vdest.val[IND3] = vdup_n_u8(0); 245+ 246+ png_debug(1, "in png_read_filter_row_sub4_neon"); 247+ 248+ uint32x2x4_t vtmp; 249+ uint8x8x4_t *vrpt; 250+ uint8x8x4_t vrp; 251+ uint32x2x4_t vdest_val; 252+ while (count >= STEP_RGBA) { 253+ uint32x2x4_t *temp_pointer; 254+ vtmp = vld4_u32(png_ptr(uint32_t, rp)); 255+ vrpt = png_ptr(uint8x8x4_t, &vtmp); 256+ vrp = *vrpt; 257+ 258+ vdest.val[0] = vadd_u8(vdest.val[IND3], vrp.val[0]); 259+ vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]); 260+ vdest.val[IND2] = vadd_u8(vdest.val[1], vrp.val[IND2]); 261+ vdest.val[IND3] = vadd_u8(vdest.val[IND2], vrp.val[IND3]); 262+ 263+ vdest_val = png_ldr(uint32x2x4_t, &vdest); 264+ vst4_lane_u32(png_ptr(uint32_t, rp), vdest_val, 0); 265+ 266+ rp += STEP_RGBA; 267+ count -= STEP_RGBA; 268+ } 269+ 270+ if (count >= STEP_RGBA_HALF) { 271+ uint32x2x2_t vtmp1 = vld2_u32(png_ptr(uint32_t, rp)); 272+ uint8x8x2_t *vrpt1 = png_ptr(uint8x8x2_t, &vtmp1); 273+ uint8x8x2_t vrp1 = *vrpt1; 274+ uint32x2x2_t *temp_pointer; 275+ uint32x2x2_t vdest_val1; 276+ 277+ vdest.val[0] = vadd_u8(vdest.val[IND3], vrp1.val[0]); 278+ vdest.val[1] = vadd_u8(vdest.val[0], vrp1.val[1]); 279+ vdest.val[IND3] = vdest.val[1]; 280+ vdest_val1 = png_ldr(uint32x2x2_t, &vdest); 281+ vst2_lane_u32(png_ptr(uint32_t, rp), vdest_val1, 0); 282+ 283+ rp += STEP_RGBA_HALF; 284+ count -= STEP_RGBA_HALF; 285+ } 286+ 287+ if (count == 0) { 288+ return; 289+ } 290+ 291+ uint32x2_t vtmp2 = vld1_u32(png_ptr(uint32_t, rp)); 292+ uint8x8_t *vrpt2 = png_ptr(uint8x8_t, &vtmp2); 293+ uint8x8_t vrp2 = *vrpt2; 294+ uint32x2_t *temp_pointer; 295+ uint32x2_t vdest_val2; 296+ 297+ vdest.val[0] = vadd_u8(vdest.val[IND3], 
vrp2); 298+ vdest_val2 = png_ldr(uint32x2_t, &vdest); 299+ vst1_lane_u32(png_ptr(uint32_t, rp), vdest_val2, 0); 300+ 301+ PNG_UNUSED(prev_row) 302+} 303+ 304+void png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row, 305+ png_const_bytep prev_row) 306+{ 307+ png_bytep rp = row; 308+ png_const_bytep pp = prev_row; 309+ png_bytep rp_stop = row + row_info->rowbytes; 310+ 311+ uint8x16_t vtmp; 312+ uint8x8x2_t *vrpt; 313+ uint8x8x2_t vrp; 314+ uint8x8x4_t vdest; 315+ vdest.val[IND3] = vdup_n_u8(0); 316+ 317+ vtmp = vld1q_u8(rp); 318+ vrpt = png_ptr(uint8x8x2_t, &vtmp); 319+ vrp = *vrpt; 320+ 321+ png_debug(1, "in png_read_filter_row_avg3_neon"); 322+ 323+ uint8x8_t vtmp1, vtmp2, vtmp3; 324+ uint8x8x2_t *vppt; 325+ uint8x8x2_t vpp; 326+ uint32x2_t *temp_pointer; 327+ 328+ size_t tail_bytes = row_info->rowbytes % STEP_RGB; 329+ png_byte last_byte = *rp_stop; 330+ png_bytep rp_stop_new = rp_stop - tail_bytes; 331+ for (; rp < rp_stop_new; pp += STEP_RGB) 332+ { 333+ vtmp = vld1q_u8(pp); 334+ vppt = png_ptr(uint8x8x2_t, &vtmp); 335+ vpp = *vppt; 336+ 337+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 338+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]); 339+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 340+ 341+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3); 342+ vtmp3 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6); 343+ vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2); 344+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 345+ 346+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6); 347+ vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1); 348+ 349+ vtmp = vld1q_u8(rp + STEP_RGB); 350+ vrpt = png_ptr(uint8x8x2_t, &vtmp); 351+ vrp = *vrpt; 352+ 353+ vdest.val[IND2] = vhadd_u8(vdest.val[1], vtmp2); 354+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp3); 355+ 356+ vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1); 357+ 358+ vdest.val[IND3] = vhadd_u8(vdest.val[IND2], vtmp2); 359+ vdest.val[IND3] = vadd_u8(vdest.val[IND3], vtmp1); 360+ 361+ vst1_lane_u32(png_ptr(uint32_t, 
rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 362+ rp += OFFSET3; 363+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 364+ rp += OFFSET3; 365+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0); 366+ rp += OFFSET3; 367+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND3]), 0); 368+ rp += OFFSET3; 369+ } 370+ 371+ vtmp = vld1q_u8(pp); 372+ vppt = png_ptr(uint8x8x2_t, &vtmp); 373+ vpp = *vppt; 374+ 375+ if (tail_bytes == TAIL_RGB1) { 376+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]); 377+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 378+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 379+ } else if (tail_bytes == TAIL_RGB2) { 380+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 381+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]); 382+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 383+ 384+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3); 385+ vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2); 386+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 387+ 388+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 389+ rp += OFFSET3; 390+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 391+ } else if (tail_bytes == TAIL_RGB3) { 392+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 393+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]); 394+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 395+ 396+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3); 397+ vtmp3 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6); 398+ vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2); 399+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 400+ 401+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6); 402+ 403+ vdest.val[IND2] = vhadd_u8(vdest.val[1], vtmp2); 404+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp3); 405+ 406+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 407+ rp += OFFSET3; 408+ 
vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 409+ rp += OFFSET3; 410+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0); 411+ } 412+ *rp_stop = last_byte; 413+} 414+ 415+void png_read_filter_row_avg3_x2_neon(png_row_infop row_info, png_bytep row, 416+ png_const_bytep prev_row) 417+{ 418+ png_bytep rp = row; 419+ png_const_bytep pp = prev_row; 420+ png_bytep rp_stop = row + row_info->rowbytes; 421+ png_bytep np = rp_stop + 1; 422+ 423+ uint8x16_t vtmp; 424+ uint8x8x2_t *vrpt; 425+ uint8x8x2_t vrp; 426+ uint8x8x4_t vdest; 427+ vdest.val[IND3] = vdup_n_u8(0); 428+ 429+ vtmp = vld1q_u8(rp); 430+ vrpt = png_ptr(uint8x8x2_t, &vtmp); 431+ vrp = *vrpt; 432+ 433+ uint8x8x2_t *vnpt; 434+ uint8x8x2_t vnp; 435+ uint8x8x4_t vdestN; 436+ vdestN.val[IND3] = vdup_n_u8(0); 437+ 438+ vtmp = vld1q_u8(np); 439+ vnpt = png_ptr(uint8x8x2_t, &vtmp); 440+ vnp = *vnpt; 441+ 442+ png_debug(1, "in png_read_filter_row_x2_avg3_neon"); 443+ 444+ uint8x8_t vtmp1, vtmp2, vtmp3; 445+ uint8x8x2_t *vppt; 446+ uint8x8x2_t vpp; 447+ uint32x2_t *temp_pointer; 448+ 449+ size_t tail_bytes = row_info->rowbytes % STEP_RGB; 450+ png_byte last_byte = *rp_stop; 451+ png_byte last_byte_next = *(rp_stop + row_info->rowbytes + 1); 452+ png_bytep rp_stop_new = rp_stop - tail_bytes; 453+ for (; rp < rp_stop_new; pp += STEP_RGB) 454+ { 455+ vtmp = vld1q_u8(pp); 456+ vppt = png_ptr(uint8x8x2_t, &vtmp); 457+ vpp = *vppt; 458+ 459+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 460+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]); 461+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 462+ 463+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3); 464+ vtmp3 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6); 465+ vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2); 466+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 467+ 468+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6); 469+ vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1); 470+ 471+ vtmp = vld1q_u8(rp + STEP_RGB); 472+ 
vrpt = png_ptr(uint8x8x2_t, &vtmp); 473+ vrp = *vrpt; 474+ 475+ vdest.val[IND2] = vhadd_u8(vdest.val[1], vtmp2); 476+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp3); 477+ 478+ vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1); 479+ 480+ vdest.val[IND3] = vhadd_u8(vdest.val[IND2], vtmp2); 481+ vdest.val[IND3] = vadd_u8(vdest.val[IND3], vtmp1); 482+ 483+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 484+ rp += OFFSET3; 485+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 486+ rp += OFFSET3; 487+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0); 488+ rp += OFFSET3; 489+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND3]), 0); 490+ rp += OFFSET3; 491+ 492+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3); 493+ vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]); 494+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 495+ 496+ vtmp3 = vext_u8(vnp.val[0], vnp.val[1], OFFSET6); 497+ vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]); 498+ vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1); 499+ 500+ vtmp1 = vext_u8(vnp.val[1], vnp.val[1], 1); 501+ 502+ vtmp = vld1q_u8(np + STEP_RGB); 503+ vnpt = png_ptr(uint8x8x2_t, &vtmp); 504+ vnp = *vnpt; 505+ 506+ vdestN.val[IND2] = vhadd_u8(vdestN.val[1], vdest.val[IND2]); 507+ vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vtmp3); 508+ 509+ vdestN.val[IND3] = vhadd_u8(vdestN.val[IND2], vdest.val[IND3]); 510+ vdestN.val[IND3] = vadd_u8(vdestN.val[IND3], vtmp1); 511+ 512+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0); 513+ np += OFFSET3; 514+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0); 515+ np += OFFSET3; 516+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND2]), 0); 517+ np += OFFSET3; 518+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND3]), 0); 519+ np += OFFSET3; 520+ } 521+ 522+ vtmp = vld1q_u8(pp); 523+ vppt 
= png_ptr(uint8x8x2_t, &vtmp); 524+ vpp = *vppt; 525+ 526+ if (tail_bytes == TAIL_RGB1) { 527+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]); 528+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 529+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 530+ 531+ vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]); 532+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 533+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0); 534+ } else if (tail_bytes == TAIL_RGB2) { 535+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 536+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]); 537+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 538+ 539+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3); 540+ vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2); 541+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 542+ 543+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 544+ rp += OFFSET3; 545+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 546+ 547+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3); 548+ vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]); 549+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 550+ 551+ vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]); 552+ vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1); 553+ 554+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0); 555+ np += OFFSET3; 556+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0); 557+ } else if (tail_bytes == TAIL_RGB3) { 558+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 559+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]); 560+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 561+ 562+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3); 563+ vtmp3 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6); 564+ vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2); 565+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 566+ 567+ vtmp2 = 
vext_u8(vpp.val[0], vpp.val[1], OFFSET6); 568+ 569+ vdest.val[IND2] = vhadd_u8(vdest.val[1], vtmp2); 570+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp3); 571+ 572+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 573+ rp += OFFSET3; 574+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 575+ rp += OFFSET3; 576+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0); 577+ 578+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3); 579+ vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]); 580+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 581+ 582+ vtmp3 = vext_u8(vnp.val[0], vnp.val[1], OFFSET6); 583+ vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]); 584+ vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1); 585+ 586+ vdestN.val[IND2] = vhadd_u8(vdestN.val[1], vdest.val[IND2]); 587+ vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vtmp3); 588+ 589+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0); 590+ np += OFFSET3; 591+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0); 592+ np += OFFSET3; 593+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND2]), 0); 594+ } 595+ *rp_stop = last_byte; 596+ *(rp_stop + row_info->rowbytes + 1) = last_byte_next; 597+} 598+ 599+void png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row, 600+ png_const_bytep prev_row) 601+{ 602+ png_bytep rp = row; 603+ png_const_bytep pp = prev_row; 604+ int count = row_info->rowbytes; 605+ 606+ uint8x8x4_t vdest; 607+ vdest.val[IND3] = vdup_n_u8(0); 608+ 609+ png_debug(1, "in png_read_filter_row_avg4_neon"); 610+ 611+ uint32x2x4_t vtmp; 612+ uint8x8x4_t *vrpt, *vppt; 613+ uint8x8x4_t vrp, vpp; 614+ uint32x2x4_t vdest_val; 615+ while (count >= STEP_RGBA) { 616+ uint32x2x4_t *temp_pointer; 617+ vtmp = vld4_u32(png_ptr(uint32_t, rp)); 618+ vrpt = png_ptr(uint8x8x4_t, &vtmp); 619+ vrp = *vrpt; 620+ vtmp = vld4_u32(png_ptrc(uint32_t, 
pp)); 621+ vppt = png_ptr(uint8x8x4_t, &vtmp); 622+ vpp = *vppt; 623+ 624+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]); 625+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 626+ vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]); 627+ vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]); 628+ vdest.val[IND2] = vhadd_u8(vdest.val[1], vpp.val[IND2]); 629+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vrp.val[IND2]); 630+ vdest.val[IND3] = vhadd_u8(vdest.val[IND2], vpp.val[IND3]); 631+ vdest.val[IND3] = vadd_u8(vdest.val[IND3], vrp.val[IND3]); 632+ 633+ vdest_val = png_ldr(uint32x2x4_t, &vdest); 634+ vst4_lane_u32(png_ptr(uint32_t, rp), vdest_val, 0); 635+ 636+ rp += STEP_RGBA; 637+ pp += STEP_RGBA; 638+ count -= STEP_RGBA; 639+ } 640+ 641+ if (count >= STEP_RGBA_HALF) { 642+ uint32x2x2_t vtmp1; 643+ uint8x8x2_t *vrpt1, *vppt1; 644+ uint8x8x2_t vrp1, vpp1; 645+ uint32x2x2_t *temp_pointer; 646+ uint32x2x2_t vdest_val1; 647+ 648+ vtmp1 = vld2_u32(png_ptr(uint32_t, rp)); 649+ vrpt1 = png_ptr(uint8x8x2_t, &vtmp1); 650+ vrp1 = *vrpt1; 651+ vtmp1 = vld2_u32(png_ptrc(uint32_t, pp)); 652+ vppt1 = png_ptr(uint8x8x2_t, &vtmp1); 653+ vpp1 = *vppt1; 654+ 655+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp1.val[0]); 656+ vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]); 657+ vdest.val[1] = vhadd_u8(vdest.val[0], vpp1.val[1]); 658+ vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]); 659+ vdest.val[IND3] = vdest.val[1]; 660+ vdest_val1 = png_ldr(uint32x2x2_t, &vdest); 661+ vst2_lane_u32(png_ptr(uint32_t, rp), vdest_val1, 0); 662+ 663+ rp += STEP_RGBA_HALF; 664+ pp += STEP_RGBA_HALF; 665+ count -= STEP_RGBA_HALF; 666+ } 667+ 668+ if (count == 0) { 669+ return; 670+ } 671+ 672+ uint32x2_t vtmp2; 673+ uint8x8_t *vrpt2, *vppt2; 674+ uint8x8_t vrp2, vpp2; 675+ uint32x2_t *temp_pointer; 676+ uint32x2_t vdest_val2; 677+ 678+ vtmp2 = vld1_u32(png_ptr(uint32_t, rp)); 679+ vrpt2 = png_ptr(uint8x8_t, &vtmp2); 680+ vrp2 = *vrpt2; 681+ vtmp2 = vld1_u32(png_ptrc(uint32_t, pp)); 682+ vppt2 = 
png_ptr(uint8x8_t, &vtmp2); 683+ vpp2 = *vppt2; 684+ 685+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp2); 686+ vdest.val[0] = vadd_u8(vdest.val[0], vrp2); 687 688+ vdest_val2 = png_ldr(uint32x2_t, &vdest); 689+ vst1_lane_u32(png_ptr(uint32_t, rp), vdest_val2, 0); 690+} 691+ 692+void png_read_filter_row_avg4_x2_neon(png_row_infop row_info, png_bytep row, 693+ png_const_bytep prev_row) 694+{ 695+ png_bytep rp = row; 696+ png_const_bytep pp = prev_row; 697+ int count = row_info->rowbytes; 698+ png_bytep np = row + count + 1; 699+ 700+ uint8x8x4_t vdest; 701+ vdest.val[IND3] = vdup_n_u8(0); 702+ 703+ png_debug(1, "in png_read_filter_row_avg4_x2_neon"); 704+ 705+ uint32x2x4_t vtmp; 706+ uint8x8x4_t *vrpt, *vppt; 707+ uint8x8x4_t vrp, vpp; 708+ uint32x2x4_t vdest_val; 709+ 710+ uint8x8x4_t *vnpt; 711+ uint8x8x4_t vnp; 712+ uint8x8x4_t vdestN; 713+ vdestN.val[IND3] = vdup_n_u8(0); 714+ 715+ while (count >= STEP_RGBA) { 716+ uint32x2x4_t *temp_pointer; 717+ vtmp = vld4_u32(png_ptr(uint32_t, rp)); 718+ vrpt = png_ptr(uint8x8x4_t, &vtmp); 719+ vrp = *vrpt; 720+ vtmp = vld4_u32(png_ptrc(uint32_t, pp)); 721+ vppt = png_ptr(uint8x8x4_t, &vtmp); 722+ vpp = *vppt; 723+ vtmp = vld4_u32(png_ptrc(uint32_t, np)); 724+ vnpt = png_ptr(uint8x8x4_t, &vtmp); 725+ vnp = *vnpt; 726+ 727+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]); 728+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 729+ vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]); 730+ vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]); 731+ vdest.val[IND2] = vhadd_u8(vdest.val[1], vpp.val[IND2]); 732+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vrp.val[IND2]); 733+ vdest.val[IND3] = vhadd_u8(vdest.val[IND2], vpp.val[IND3]); 734+ vdest.val[IND3] = vadd_u8(vdest.val[IND3], vrp.val[IND3]); 735+ 736+ vdest_val = png_ldr(uint32x2x4_t, &vdest); 737+ vst4_lane_u32(png_ptr(uint32_t, rp), vdest_val, 0); 738+ 739+ vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]); 740+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 741+ 
vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]); 742+ vdestN.val[1] = vadd_u8(vdestN.val[1], vnp.val[1]); 743+ vdestN.val[IND2] = vhadd_u8(vdestN.val[1], vdest.val[IND2]); 744+ vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vnp.val[IND2]); 745+ vdestN.val[IND3] = vhadd_u8(vdestN.val[IND2], vdest.val[IND3]); 746+ vdestN.val[IND3] = vadd_u8(vdestN.val[IND3], vnp.val[IND3]); 747+ 748+ vdest_val = png_ldr(uint32x2x4_t, &vdestN); 749+ vst4_lane_u32(png_ptr(uint32_t, np), vdest_val, 0); 750+ 751+ rp += STEP_RGBA; 752+ pp += STEP_RGBA; 753+ np += STEP_RGBA; 754+ count -= STEP_RGBA; 755+ } 756+ 757+ if (count >= STEP_RGBA_HALF) { 758+ uint32x2x2_t vtmp1; 759+ uint8x8x2_t *vrpt1, *vppt1, *vnpt1; 760+ uint8x8x2_t vrp1, vpp1, vnp1; 761+ uint32x2x2_t *temp_pointer; 762+ uint32x2x2_t vdest_val1; 763+ 764+ vtmp1 = vld2_u32(png_ptr(uint32_t, rp)); 765+ vrpt1 = png_ptr(uint8x8x2_t, &vtmp1); 766+ vrp1 = *vrpt1; 767+ vtmp1 = vld2_u32(png_ptrc(uint32_t, pp)); 768+ vppt1 = png_ptr(uint8x8x2_t, &vtmp1); 769+ vpp1 = *vppt1; 770+ vtmp1 = vld2_u32(png_ptrc(uint32_t, np)); 771+ vnpt1 = png_ptr(uint8x8x2_t, &vtmp1); 772+ vnp1 = *vnpt1; 773+ 774+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp1.val[0]); 775+ vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]); 776+ vdest.val[1] = vhadd_u8(vdest.val[0], vpp1.val[1]); 777+ vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]); 778+ vdest.val[IND3] = vdest.val[1]; 779+ vdest_val1 = png_ldr(uint32x2x2_t, &vdest); 780+ vst2_lane_u32(png_ptr(uint32_t, rp), vdest_val1, 0); 781+ 782+ vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]); 783+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp1.val[0]); 784+ vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]); 785+ vdestN.val[1] = vadd_u8(vdestN.val[1], vnp1.val[1]); 786+ vdestN.val[IND3] = vdestN.val[1]; 787+ vdest_val1 = png_ldr(uint32x2x2_t, &vdestN); 788+ vst2_lane_u32(png_ptr(uint32_t, np), vdest_val1, 0); 789+ 790+ rp += STEP_RGBA_HALF; 791+ pp += STEP_RGBA_HALF; 792+ np += STEP_RGBA_HALF; 793+ count 
-= STEP_RGBA_HALF; 794+ } 795+ 796+ if (count == 0) { 797+ return; 798+ } 799+ 800+ uint32x2_t vtmp2; 801+ uint8x8_t *vrpt2, *vppt2, *vnpt2; 802+ uint8x8_t vrp2, vpp2, vnp2; 803+ uint32x2_t *temp_pointer; 804+ uint32x2_t vdest_val2; 805+ 806+ vtmp2 = vld1_u32(png_ptr(uint32_t, rp)); 807+ vrpt2 = png_ptr(uint8x8_t, &vtmp2); 808+ vrp2 = *vrpt2; 809+ vtmp2 = vld1_u32(png_ptrc(uint32_t, pp)); 810+ vppt2 = png_ptr(uint8x8_t, &vtmp2); 811+ vpp2 = *vppt2; 812+ vtmp2 = vld1_u32(png_ptrc(uint32_t, np)); 813+ vnpt2 = png_ptr(uint8x8_t, &vtmp2); 814+ vnp2 = *vnpt2; 815+ 816+ vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp2); 817+ vdest.val[0] = vadd_u8(vdest.val[0], vrp2); 818+ 819+ vdest_val2 = png_ldr(uint32x2_t, &vdest); 820+ vst1_lane_u32(png_ptr(uint32_t, rp), vdest_val2, 0); 821+ 822+ vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]); 823+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp2); 824+ 825+ vdest_val2 = png_ldr(uint32x2_t, &vdestN); 826+ vst1_lane_u32(png_ptr(uint32_t, np), vdest_val2, 0); 827+} 828+ 829+static uint8x8_t paeth(uint8x8_t a, uint8x8_t b, uint8x8_t c) 830+{ 831+ uint8x8_t d, e; 832+ uint16x8_t p1, pa, pb, pc; 833+ 834+ p1 = vaddl_u8(a, b); /* a + b */ 835+ pc = vaddl_u8(c, c); /* c * 2 */ 836+ pa = vabdl_u8(b, c); /* pa */ 837+ pb = vabdl_u8(a, c); /* pb */ 838+ pc = vabdq_u16(p1, pc); /* pc */ 839+ 840+ p1 = vcleq_u16(pa, pb); /* pa <= pb */ 841+ pa = vcleq_u16(pa, pc); /* pa <= pc */ 842+ pb = vcleq_u16(pb, pc); /* pb <= pc */ 843+ 844+ p1 = vandq_u16(p1, pa); /* pa <= pb && pa <= pc */ 845+ 846+ d = vmovn_u16(pb); 847+ e = vmovn_u16(p1); 848+ 849+ d = vbsl_u8(d, b, c); 850+ e = vbsl_u8(e, a, d); 851+ 852+ return e; 853+} 854+ 855+void png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row, 856+ png_const_bytep prev_row) 857+{ 858+ png_bytep rp = row; 859+ png_const_bytep pp = prev_row; 860+ png_bytep rp_stop = row + row_info->rowbytes; 861+ 862+ uint8x16_t vtmp; 863+ uint8x8x2_t *vrpt; 864+ uint8x8x2_t vrp; 865+ uint8x8_t vlast = 
vdup_n_u8(0); 866+ uint8x8x4_t vdest; 867+ vdest.val[IND3] = vdup_n_u8(0); 868+ 869+ vtmp = vld1q_u8(rp); 870+ vrpt = png_ptr(uint8x8x2_t, &vtmp); 871+ vrp = *vrpt; 872+ 873+ uint8x8x2_t *vppt; 874+ uint8x8x2_t vpp; 875+ uint8x8_t vtmp1, vtmp2, vtmp3; 876+ uint32x2_t *temp_pointer; 877+ 878+ png_debug(1, "in png_read_filter_row_paeth3_neon"); 879+ 880+ size_t tail_bytes = row_info->rowbytes % STEP_RGB; 881+ png_byte last_byte = *rp_stop; 882+ png_bytep rp_stop_new = rp_stop - tail_bytes; 883+ for (; rp < rp_stop_new; pp += STEP_RGB) 884+ { 885+ vtmp = vld1q_u8(pp); 886+ vppt = png_ptr(uint8x8x2_t, &vtmp); 887+ vpp = *vppt; 888+ 889+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast); 890+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 891+ 892+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 893+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3); 894+ vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]); 895+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 896+ 897+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6); 898+ vtmp3 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6); 899+ vdest.val[IND2] = paeth(vdest.val[1], vtmp3, vtmp2); 900+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp1); 901+ 902+ vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1); 903+ vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1); 904+ 905+ vtmp = vld1q_u8(rp + STEP_RGB); 906+ vrpt = png_ptr(uint8x8x2_t, &vtmp); 907+ vrp = *vrpt; 908+ 909+ vdest.val[IND3] = paeth(vdest.val[IND2], vtmp2, vtmp3); 910+ vdest.val[IND3] = vadd_u8(vdest.val[IND3], vtmp1); 911+ 912+ vlast = vtmp2; 913+ 914+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 915+ rp += OFFSET3; 916+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 917+ rp += OFFSET3; 918+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0); 919+ rp += OFFSET3; 920+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND3]), 0); 921+ rp += OFFSET3; 922+ } 923+ 924+ vtmp 
= vld1q_u8(pp); 925+ vppt = png_ptr(uint8x8x2_t, &vtmp); 926+ vpp = *vppt; 927+ 928+ if (tail_bytes == TAIL_RGB1) { 929+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast); 930+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 931+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 932+ } else if (tail_bytes == TAIL_RGB2) { 933+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast); 934+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 935+ 936+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 937+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3); 938+ vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]); 939+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 940+ 941+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 942+ rp += OFFSET3; 943+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 944+ } else if (tail_bytes == TAIL_RGB3) { 945+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast); 946+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 947+ 948+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 949+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3); 950+ vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]); 951+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 952+ 953+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6); 954+ vtmp3 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6); 955+ vdest.val[IND2] = paeth(vdest.val[1], vtmp3, vtmp2); 956+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp1); 957+ 958+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 959+ rp += OFFSET3; 960+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 961+ rp += OFFSET3; 962+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0); 963+ } 964+ *rp_stop = last_byte; 965+} 966+ 967+void png_read_filter_row_paeth3_x2_neon(png_row_infop row_info, png_bytep row, 968+ png_const_bytep prev_row) 969+{ 970+ png_bytep rp = row; 971+ 
png_const_bytep pp = prev_row; 972+ png_bytep rp_stop = row + row_info->rowbytes; 973+ png_bytep np = rp_stop + 1; 974+ 975+ uint8x16_t vtmp; 976+ uint8x8x2_t *vrpt; 977+ uint8x8x2_t vrp; 978+ uint8x8_t vlast = vdup_n_u8(0); 979+ uint8x8x4_t vdest; 980+ vdest.val[IND3] = vdup_n_u8(0); 981+ 982+ vtmp = vld1q_u8(rp); 983+ vrpt = png_ptr(uint8x8x2_t, &vtmp); 984+ vrp = *vrpt; 985+ 986+ uint8x8x2_t *vppt; 987+ uint8x8x2_t vpp; 988+ uint8x8_t vtmp1, vtmp2, vtmp3; 989+ uint32x2_t *temp_pointer; 990+ 991+ uint8x8x2_t *vnpt; 992+ uint8x8x2_t vnp; 993+ uint8x8_t vlastN = vdup_n_u8(0); 994+ uint8x8x4_t vdestN; 995+ vdestN.val[IND3] = vdup_n_u8(0); 996+ 997+ vtmp = vld1q_u8(np); 998+ vnpt = png_ptr(uint8x8x2_t, &vtmp); 999+ vnp = *vnpt; 1000+ 1001+ png_debug(1, "in png_read_filter_row_paeth3_x2_neon"); 1002+ 1003+ size_t tail_bytes = row_info->rowbytes % STEP_RGB; 1004+ png_byte last_byte = *rp_stop; 1005+ png_byte last_byte_next = *(rp_stop + row_info->rowbytes + 1); 1006+ png_bytep rp_stop_new = rp_stop - tail_bytes; 1007+ 1008+ for (; rp < rp_stop_new; pp += STEP_RGB) 1009+ { 1010+ vtmp = vld1q_u8(pp); 1011+ vppt = png_ptr(uint8x8x2_t, &vtmp); 1012+ vpp = *vppt; 1013+ 1014+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast); 1015+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 1016+ 1017+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 1018+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3); 1019+ vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]); 1020+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 1021+ 1022+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6); 1023+ vtmp3 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6); 1024+ vdest.val[IND2] = paeth(vdest.val[1], vtmp3, vtmp2); 1025+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp1); 1026+ 1027+ vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1); 1028+ vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1); 1029+ 1030+ vtmp = vld1q_u8(rp + STEP_RGB); 1031+ vrpt = png_ptr(uint8x8x2_t, &vtmp); 1032+ vrp = *vrpt; 1033+ 1034+ 
vdest.val[IND3] = paeth(vdest.val[IND2], vtmp2, vtmp3); 1035+ vdest.val[IND3] = vadd_u8(vdest.val[IND3], vtmp1); 1036+ 1037+ vlast = vtmp2; 1038+ 1039+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 1040+ rp += OFFSET3; 1041+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 1042+ rp += OFFSET3; 1043+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0); 1044+ rp += OFFSET3; 1045+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND3]), 0); 1046+ rp += OFFSET3; 1047+ 1048+ vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN); 1049+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 1050+ 1051+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3); 1052+ vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]); 1053+ vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1); 1054+ 1055+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET6); 1056+ vdestN.val[IND2] = paeth(vdestN.val[1], vdest.val[IND2], vdest.val[1]); 1057+ vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vtmp1); 1058+ 1059+ vtmp1 = vext_u8(vnp.val[1], vnp.val[1], 1); 1060+ 1061+ vtmp = vld1q_u8(np + STEP_RGB); 1062+ vnpt = png_ptr(uint8x8x2_t, &vtmp); 1063+ vnp = *vnpt; 1064+ 1065+ vdestN.val[IND3] = paeth(vdestN.val[IND2], vdest.val[IND3], vdest.val[IND2]); 1066+ vdestN.val[IND3] = vadd_u8(vdestN.val[IND3], vtmp1); 1067+ 1068+ vlastN = vdest.val[IND3]; 1069+ 1070+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0); 1071+ np += OFFSET3; 1072+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0); 1073+ np += OFFSET3; 1074+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND2]), 0); 1075+ np += OFFSET3; 1076+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND3]), 0); 1077+ np += OFFSET3; 1078+ } 1079+ 1080+ vtmp = vld1q_u8(pp); 1081+ vppt = png_ptr(uint8x8x2_t, &vtmp); 1082+ vpp = *vppt; 1083+ 1084+ if (tail_bytes == 
TAIL_RGB1) { 1085+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast); 1086+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 1087+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 1088+ 1089+ vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN); 1090+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 1091+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0); 1092+ } else if (tail_bytes == TAIL_RGB2) { 1093+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast); 1094+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 1095+ 1096+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 1097+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3); 1098+ vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]); 1099+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 1100+ 1101+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 1102+ rp += OFFSET3; 1103+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 1104+ 1105+ vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN); 1106+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 1107+ 1108+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3); 1109+ vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]); 1110+ vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1); 1111+ 1112+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0); 1113+ np += OFFSET3; 1114+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0); 1115+ } else if (tail_bytes == TAIL_RGB3) { 1116+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast); 1117+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 1118+ 1119+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); 1120+ vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3); 1121+ vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]); 1122+ vdest.val[1] = vadd_u8(vdest.val[1], vtmp1); 1123+ 1124+ vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6); 1125+ 
vtmp3 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6); 1126+ vdest.val[IND2] = paeth(vdest.val[1], vtmp3, vtmp2); 1127+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp1); 1128+ 1129+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); 1130+ rp += OFFSET3; 1131+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0); 1132+ rp += OFFSET3; 1133+ vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0); 1134+ 1135+ vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN); 1136+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 1137+ 1138+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3); 1139+ vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]); 1140+ vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1); 1141+ 1142+ vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET6); 1143+ vdestN.val[IND2] = paeth(vdestN.val[1], vdest.val[IND2], vdest.val[1]); 1144+ vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vtmp1); 1145+ 1146+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0); 1147+ np += OFFSET3; 1148+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0); 1149+ np += OFFSET3; 1150+ vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND2]), 0); 1151+ } 1152+ *rp_stop = last_byte; 1153+ *(rp_stop + row_info->rowbytes + 1) = last_byte_next; 1154+} 1155+ 1156+void png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row, 1157+ png_const_bytep prev_row) 1158+{ 1159+ png_bytep rp = row; 1160+ int count = row_info->rowbytes; 1161+ png_const_bytep pp = prev_row; 1162+ 1163+ uint8x8_t vlast = vdup_n_u8(0); 1164+ uint8x8x4_t vdest; 1165+ vdest.val[IND3] = vdup_n_u8(0); 1166+ 1167+ png_debug(1, "in png_read_filter_row_paeth4_neon"); 1168+ 1169+ uint32x2x4_t vtmp; 1170+ uint8x8x4_t *vrpt, *vppt; 1171+ uint8x8x4_t vrp, vpp; 1172+ uint32x2x4_t vdest_val; 1173+ while (count >= STEP_RGBA) { 1174+ uint32x2x4_t *temp_pointer; 1175+ vtmp = 
vld4_u32(png_ptr(uint32_t, rp)); 1176+ vrpt = png_ptr(uint8x8x4_t, &vtmp); 1177+ vrp = *vrpt; 1178+ vtmp = vld4_u32(png_ptrc(uint32_t, pp)); 1179+ vppt = png_ptr(uint8x8x4_t, &vtmp); 1180+ vpp = *vppt; 1181+ 1182+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast); 1183+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 1184+ vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]); 1185+ vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]); 1186+ vdest.val[IND2] = paeth(vdest.val[1], vpp.val[IND2], vpp.val[1]); 1187+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vrp.val[IND2]); 1188+ vdest.val[IND3] = paeth(vdest.val[IND2], vpp.val[IND3], vpp.val[IND2]); 1189+ vdest.val[IND3] = vadd_u8(vdest.val[IND3], vrp.val[IND3]); 1190+ 1191+ vlast = vpp.val[IND3]; 1192+ 1193+ vdest_val = png_ldr(uint32x2x4_t, &vdest); 1194+ vst4_lane_u32(png_ptr(uint32_t, rp), vdest_val, 0); 1195+ 1196+ rp += STEP_RGBA; 1197+ pp += STEP_RGBA; 1198+ count -= STEP_RGBA; 1199+ } 1200+ 1201+ if (count >= STEP_RGBA_HALF) { 1202+ uint32x2x2_t vtmp1; 1203+ uint8x8x2_t *vrpt1, *vppt1; 1204+ uint8x8x2_t vrp1, vpp1; 1205+ uint32x2x2_t *temp_pointer; 1206+ uint32x2x2_t vdest_val1; 1207+ 1208+ vtmp1 = vld2_u32(png_ptr(uint32_t, rp)); 1209+ vrpt1 = png_ptr(uint8x8x2_t, &vtmp1); 1210+ vrp1 = *vrpt1; 1211+ vtmp1 = vld2_u32(png_ptrc(uint32_t, pp)); 1212+ vppt1 = png_ptr(uint8x8x2_t, &vtmp1); 1213+ vpp1 = *vppt1; 1214+ 1215+ vdest.val[0] = paeth(vdest.val[IND3], vpp1.val[0], vlast); 1216+ vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]); 1217+ vdest.val[1] = paeth(vdest.val[0], vpp1.val[1], vpp1.val[0]); 1218+ vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]); 1219+ vlast = vpp1.val[1]; 1220+ 1221+ vdest_val1 = png_ldr(uint32x2x2_t, &vdest); 1222+ vst2_lane_u32(png_ptr(uint32_t, rp), vdest_val1, 0); 1223+ vdest.val[IND3] = vdest.val[1]; 1224+ 1225+ rp += STEP_RGBA_HALF; 1226+ pp += STEP_RGBA_HALF; 1227+ count -= STEP_RGBA_HALF; 1228+ } 1229+ 1230+ if (count == 0) { 1231+ return; 1232+ } 1233+ 1234+ 
uint32x2_t vtmp2; 1235+ uint8x8_t *vrpt2, *vppt2; 1236+ uint8x8_t vrp2, vpp2; 1237+ uint32x2_t *temp_pointer; 1238+ uint32x2_t vdest_val2; 1239+ 1240+ vtmp2 = vld1_u32(png_ptr(uint32_t, rp)); 1241+ vrpt2 = png_ptr(uint8x8_t, &vtmp2); 1242+ vrp2 = *vrpt2; 1243+ vtmp2 = vld1_u32(png_ptrc(uint32_t, pp)); 1244+ vppt2 = png_ptr(uint8x8_t, &vtmp2); 1245+ vpp2 = *vppt2; 1246+ 1247+ vdest.val[0] = paeth(vdest.val[IND3], vpp2, vlast); 1248+ vdest.val[0] = vadd_u8(vdest.val[0], vrp2); 1249+ 1250+ vdest_val2 = png_ldr(uint32x2_t, &vdest); 1251+ vst1_lane_u32(png_ptr(uint32_t, rp), vdest_val2, 0); 1252+} 1253+ 1254+void png_read_filter_row_paeth4_x2_neon(png_row_infop row_info, png_bytep row, 1255+ png_const_bytep prev_row) 1256+{ 1257+ png_bytep rp = row; 1258+ int count = row_info->rowbytes; 1259+ png_const_bytep pp = prev_row; 1260+ png_bytep np = row + row_info->rowbytes + 1; 1261+ 1262+ uint8x8_t vlast = vdup_n_u8(0); 1263+ uint8x8x4_t vdest; 1264+ vdest.val[IND3] = vdup_n_u8(0); 1265+ 1266+ png_debug(1, "in png_read_filter_row_paeth4_x2_neon"); 1267+ 1268+ uint32x2x4_t vtmp; 1269+ uint8x8x4_t *vrpt, *vppt; 1270+ uint8x8x4_t vrp, vpp; 1271+ uint32x2x4_t vdest_val; 1272+ 1273+ uint8x8x4_t *vnpt; 1274+ uint8x8x4_t vnp; 1275+ uint8x8_t vlastN = vdup_n_u8(0); 1276+ uint8x8x4_t vdestN; 1277+ vdestN.val[IND3] = vdup_n_u8(0); 1278+ 1279+ while (count >= STEP_RGBA) { 1280+ uint32x2x4_t *temp_pointer; 1281+ vtmp = vld4_u32(png_ptr(uint32_t, rp)); 1282+ vrpt = png_ptr(uint8x8x4_t, &vtmp); 1283+ vrp = *vrpt; 1284+ vtmp = vld4_u32(png_ptrc(uint32_t, pp)); 1285+ vppt = png_ptr(uint8x8x4_t, &vtmp); 1286+ vpp = *vppt; 1287+ vtmp = vld4_u32(png_ptrc(uint32_t, np)); 1288+ vnpt = png_ptr(uint8x8x4_t, &vtmp); 1289+ vnp = *vnpt; 1290+ 1291+ vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast); 1292+ vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]); 1293+ vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]); 1294+ vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]); 1295+ 
vdest.val[IND2] = paeth(vdest.val[1], vpp.val[IND2], vpp.val[1]); 1296+ vdest.val[IND2] = vadd_u8(vdest.val[IND2], vrp.val[IND2]); 1297+ vdest.val[IND3] = paeth(vdest.val[IND2], vpp.val[IND3], vpp.val[IND2]); 1298+ vdest.val[IND3] = vadd_u8(vdest.val[IND3], vrp.val[IND3]); 1299+ 1300+ vlast = vpp.val[IND3]; 1301+ 1302+ vdest_val = png_ldr(uint32x2x4_t, &vdest); 1303+ vst4_lane_u32(png_ptr(uint32_t, rp), vdest_val, 0); 1304+ 1305+ vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN); 1306+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]); 1307+ vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]); 1308+ vdestN.val[1] = vadd_u8(vdestN.val[1], vnp.val[1]); 1309+ vdestN.val[IND2] = paeth(vdestN.val[1], vdest.val[IND2], vdest.val[1]); 1310+ vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vnp.val[IND2]); 1311+ vdestN.val[IND3] = paeth(vdestN.val[IND2], vdest.val[IND3], vdest.val[IND2]); 1312+ vdestN.val[IND3] = vadd_u8(vdestN.val[IND3], vnp.val[IND3]); 1313+ 1314+ vlastN = vdest.val[IND3]; 1315+ 1316+ vdest_val = png_ldr(uint32x2x4_t, &vdestN); 1317+ vst4_lane_u32(png_ptr(uint32_t, np), vdest_val, 0); 1318+ 1319+ rp += STEP_RGBA; 1320+ pp += STEP_RGBA; 1321+ np += STEP_RGBA; 1322+ count -= STEP_RGBA; 1323+ } 1324+ 1325+ if (count >= STEP_RGBA_HALF) { 1326+ uint32x2x2_t vtmp1; 1327+ uint8x8x2_t *vrpt1, *vppt1, *vnpt1; 1328+ uint8x8x2_t vrp1, vpp1, vnp1; 1329+ uint32x2x2_t *temp_pointer; 1330+ uint32x2x2_t vdest_val1; 1331+ 1332+ vtmp1 = vld2_u32(png_ptr(uint32_t, rp)); 1333+ vrpt1 = png_ptr(uint8x8x2_t, &vtmp1); 1334+ vrp1 = *vrpt1; 1335+ vtmp1 = vld2_u32(png_ptrc(uint32_t, pp)); 1336+ vppt1 = png_ptr(uint8x8x2_t, &vtmp1); 1337+ vpp1 = *vppt1; 1338+ vtmp1 = vld2_u32(png_ptrc(uint32_t, np)); 1339+ vnpt1 = png_ptr(uint8x8x2_t, &vtmp1); 1340+ vnp1 = *vnpt1; 1341+ 1342+ vdest.val[0] = paeth(vdest.val[IND3], vpp1.val[0], vlast); 1343+ vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]); 1344+ vdest.val[1] = paeth(vdest.val[0], vpp1.val[1], vpp1.val[0]); 1345+ 
vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]); 1346+ 1347+ vlast = vpp1.val[1]; 1348+ 1349+ vdest_val1 = png_ldr(uint32x2x2_t, &vdest); 1350+ vst2_lane_u32(png_ptr(uint32_t, rp), vdest_val1, 0); 1351+ 1352+ vdest.val[IND3] = vdest.val[1]; 1353+ 1354+ vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN); 1355+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp1.val[0]); 1356+ vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]); 1357+ vdestN.val[1] = vadd_u8(vdestN.val[1], vnp1.val[1]); 1358+ 1359+ vlastN = vdest.val[1]; 1360+ 1361+ vdest_val1 = png_ldr(uint32x2x2_t, &vdestN); 1362+ vst2_lane_u32(png_ptr(uint32_t, np), vdest_val1, 0); 1363+ 1364+ vdestN.val[IND3] = vdestN.val[1]; 1365+ 1366+ rp += STEP_RGBA_HALF; 1367+ pp += STEP_RGBA_HALF; 1368+ np += STEP_RGBA_HALF; 1369+ count -= STEP_RGBA_HALF; 1370+ } 1371+ 1372+ if (count == 0) { 1373+ return; 1374+ } 1375+ 1376+ uint32x2_t vtmp2; 1377+ uint8x8_t *vrpt2, *vppt2, *vnpt2; 1378+ uint8x8_t vrp2, vpp2, vnp2; 1379+ uint32x2_t *temp_pointer; 1380+ uint32x2_t vdest_val2; 1381+ 1382+ vtmp2 = vld1_u32(png_ptr(uint32_t, rp)); 1383+ vrpt2 = png_ptr(uint8x8_t, &vtmp2); 1384+ vrp2 = *vrpt2; 1385+ vtmp2 = vld1_u32(png_ptrc(uint32_t, pp)); 1386+ vppt2 = png_ptr(uint8x8_t, &vtmp2); 1387+ vpp2 = *vppt2; 1388+ vtmp2 = vld1_u32(png_ptrc(uint32_t, np)); 1389+ vnpt2 = png_ptr(uint8x8_t, &vtmp2); 1390+ vnp2 = *vnpt2; 1391+ 1392+ vdest.val[0] = paeth(vdest.val[IND3], vpp2, vlast); 1393+ vdest.val[0] = vadd_u8(vdest.val[0], vrp2); 1394+ 1395+ vdest_val2 = png_ldr(uint32x2_t, &vdest); 1396+ vst1_lane_u32(png_ptr(uint32_t, rp), vdest_val2, 0); 1397+ 1398+ vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN); 1399+ vdestN.val[0] = vadd_u8(vdestN.val[0], vnp2); 1400+ 1401+ vdest_val2 = png_ldr(uint32x2_t, &vdestN); 1402+ vst1_lane_u32(png_ptr(uint32_t, np), vdest_val2, 0); 1403+} 1404+#endif /* PNG_MULTY_LINE_ENABLE */ 1405 #endif /* PNG_ARM_NEON_OPT > 0 */ 1406 #endif /* PNG_ARM_NEON_IMPLEMENTATION == 1 
(intrinsics) */ 1407 #endif /* READ */ 1408diff --git a/pngpread.c b/pngpread.c 1409index e283627b7..bb12f61ea 100644 1410--- a/pngpread.c 1411+++ b/pngpread.c 1412@@ -264,9 +264,22 @@ png_push_read_chunk(png_structrp png_ptr, png_inforp info_ptr) 1413 png_ptr->idat_size = png_ptr->push_length; 1414 png_ptr->process_mode = PNG_READ_IDAT_MODE; 1415 png_push_have_info(png_ptr, info_ptr); 1416- png_ptr->zstream.avail_out = 1417- (uInt) PNG_ROWBYTES(png_ptr->pixel_depth, 1418- png_ptr->iwidth) + 1; 1419+#ifdef PNG_MULTY_LINE_ENABLE 1420+ // OH ISSUE: png optimize 1421+ if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 && 1422+ (png_ptr->transformations & PNG_CHECK) == 0) { 1423+ int rest = png_ptr->num_rows - png_ptr->row_number; 1424+ int row_num = rest < PNG_INFLATE_ROWS ? rest : PNG_INFLATE_ROWS; 1425+ png_ptr->zstream.avail_out = (uInt)(PNG_ROWBYTES(png_ptr->pixel_depth, 1426+ png_ptr->iwidth) + 1) * row_num; 1427+ } 1428+ else 1429+#endif 1430+ { 1431+ png_ptr->zstream.avail_out = 1432+ (uInt) PNG_ROWBYTES(png_ptr->pixel_depth, 1433+ png_ptr->iwidth) + 1; 1434+ } 1435 png_ptr->zstream.next_out = png_ptr->row_buf; 1436 return; 1437 } 1438@@ -623,6 +636,92 @@ png_push_read_IDAT(png_structrp png_ptr) 1439 } 1440 } 1441 1442+#ifdef PNG_MULTY_LINE_ENABLE 1443+// OH ISSUE: png optimize 1444+static void png_push_process_row_x2(png_structrp png_ptr, 1445+ png_row_info row_info_in) 1446+{ 1447+ png_debug(1, "in png_push_process_row_x2"); 1448+ png_row_info row_info = row_info_in; 1449+ png_read_filter_row(png_ptr, &row_info, png_ptr->row_buf + 1, 1450+ png_ptr->prev_row + 1, png_ptr->row_buf[0] + 4); 1451+ 1452+#ifdef PNG_READ_TRANSFORMS_SUPPORTED 1453+ if (png_ptr->transformations != 0) 1454+ png_do_read_transformations(png_ptr, &row_info); 1455+#endif 1456+ 1457+ if (png_ptr->transformed_pixel_depth == 0) 1458+ { 1459+ png_ptr->transformed_pixel_depth = row_info.pixel_depth; 1460+ if (row_info.pixel_depth > png_ptr->maximum_pixel_depth) 1461+ png_error(png_ptr, 
"progressive row overflow"); 1462+ } 1463+ 1464+ png_push_have_row(png_ptr, png_ptr->row_buf + 1); 1465+ png_read_push_finish_row(png_ptr); 1466+ 1467+ png_ptr->row_buf = png_ptr->row_buf + png_ptr->rowbytes + 1; 1468+ 1469+ // do it again 1470+ if (png_ptr->transformations != 0) 1471+ { 1472+ memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info.rowbytes + 1); 1473+ } 1474+ else 1475+ { 1476+ png_ptr->prev_row = png_ptr->row_buf; 1477+ } 1478+#ifdef PNG_READ_TRANSFORMS_SUPPORTED 1479+ if (png_ptr->transformations != 0) 1480+ png_do_read_transformations(png_ptr, &row_info); 1481+#endif 1482+ 1483+ png_push_have_row(png_ptr, png_ptr->row_buf + 1); 1484+ png_read_push_finish_row(png_ptr); 1485+} 1486+ 1487+static void png_push_process_multi_rows(png_structrp png_ptr, int row_num) 1488+{ 1489+ png_debug(1, "in png_push_process_multi_rows"); 1490+ uInt row_bytes = png_ptr->rowbytes + 1; 1491+ 1492+ png_row_info row_info; 1493+ row_info.width = png_ptr->iwidth; 1494+ row_info.color_type = png_ptr->color_type; 1495+ row_info.bit_depth = png_ptr->bit_depth; 1496+ row_info.channels = png_ptr->channels; 1497+ row_info.pixel_depth = png_ptr->pixel_depth; 1498+ row_info.rowbytes = png_ptr->rowbytes; 1499+ 1500+ png_bytep temp_row = png_ptr->row_buf; 1501+ png_bytep temp_prev_row = png_ptr->prev_row; 1502+ 1503+ for (int i = 0; i < row_num; i++) { 1504+ // check if the x2_filter is effective: only supports channels 3 or 4 1505+ if ((png_ptr->channels == 3 || png_ptr->channels == 4) && 1506+ i < row_num -1 && png_ptr->row_buf[0] > PNG_FILTER_VALUE_SUB && 1507+ png_ptr->row_buf[0] < PNG_FILTER_VALUE_LAST && 1508+ png_ptr->row_buf[0] == png_ptr->row_buf[row_bytes]) 1509+ { 1510+ png_push_process_row_x2(png_ptr, row_info); 1511+ png_ptr->row_buf = png_ptr->row_buf + row_bytes; 1512+ i++; 1513+ continue; 1514+ } 1515+ png_push_process_row(png_ptr); 1516+ png_ptr->row_buf = png_ptr->row_buf + row_bytes; 1517+ } 1518+ 1519+ if (png_ptr->transformations == 0 && png_ptr->interlaced == 
0) 1520+ { 1521+ png_ptr->prev_row = temp_prev_row; 1522+ memcpy(png_ptr->prev_row, png_ptr->row_buf - row_bytes, row_bytes); 1523+ } 1524+ png_ptr->row_buf = temp_row; 1525+} 1526+#endif 1527+ 1528 void /* PRIVATE */ 1529 png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer, 1530 size_t buffer_length) 1531@@ -639,6 +738,17 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer, 1532 /* TODO: WARNING: TRUNCATION ERROR: DANGER WILL ROBINSON: */ 1533 png_ptr->zstream.avail_in = (uInt)buffer_length; 1534 1535+#ifdef PNG_MULTY_LINE_ENABLE 1536+ // OH ISSUE: png optimize 1537+ int row_num = 1; 1538+ if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 && 1539+ (png_ptr->transformations & PNG_CHECK) == 0) 1540+ { 1541+ int rest = png_ptr->num_rows - png_ptr->row_number; 1542+ row_num = rest < PNG_INFLATE_ROWS ? rest : PNG_INFLATE_ROWS; 1543+ } 1544+#endif 1545+ 1546 /* Keep going until the decompressed data is all processed 1547 * or the stream marked as finished. 1548 */ 1549@@ -655,9 +765,20 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer, 1550 if (!(png_ptr->zstream.avail_out > 0)) 1551 { 1552 /* TODO: WARNING: TRUNCATION ERROR: DANGER WILL ROBINSON: */ 1553+#ifdef PNG_MULTY_LINE_ENABLE 1554+ // OH ISSUE: png optimize 1555+ if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 && 1556+ (png_ptr->transformations & PNG_CHECK) == 0) 1557+ { 1558+ int rest = png_ptr->num_rows - png_ptr->row_number; 1559+ row_num = rest < PNG_INFLATE_ROWS ? rest : PNG_INFLATE_ROWS; 1560+ } 1561+ png_ptr->zstream.avail_out = (uInt)(PNG_ROWBYTES(png_ptr->pixel_depth, 1562+ png_ptr->iwidth) + 1) * row_num; 1563+#else 1564 png_ptr->zstream.avail_out = (uInt)(PNG_ROWBYTES(png_ptr->pixel_depth, 1565 png_ptr->iwidth) + 1); 1566- 1567+#endif 1568 png_ptr->zstream.next_out = png_ptr->row_buf; 1569 } 1570 1571@@ -719,7 +840,12 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer, 1572 1573 /* Do we have a complete row? 
*/ 1574 if (png_ptr->zstream.avail_out == 0) 1575+#ifdef PNG_MULTY_LINE_ENABLE 1576+ // OH ISSUE: png optimize 1577+ png_push_process_multi_rows(png_ptr, row_num); 1578+#else 1579 png_push_process_row(png_ptr); 1580+#endif 1581 } 1582 1583 /* And check for the end of the stream. */ 1584@@ -738,6 +864,7 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer, 1585 void /* PRIVATE */ 1586 png_push_process_row(png_structrp png_ptr) 1587 { 1588+ png_debug(1, "in png_push_process_row"); 1589 /* 1.5.6: row_info moved out of png_struct to a local here. */ 1590 png_row_info row_info; 1591 1592@@ -762,8 +889,17 @@ png_push_process_row(png_structrp png_ptr) 1593 * it may not be in the future, so this was changed just to copy the 1594 * interlaced row count: 1595 */ 1596- memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info.rowbytes + 1); 1597- 1598+#ifdef PNG_MULTY_LINE_ENABLE 1599+ // OH ISSUE: png optimize 1600+ if (png_ptr->transformations == 0 && png_ptr->interlaced == 0) 1601+ { 1602+ png_ptr->prev_row = png_ptr->row_buf; 1603+ } 1604+ else 1605+#endif 1606+ { 1607+ memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info.rowbytes + 1); 1608+ } 1609 #ifdef PNG_READ_TRANSFORMS_SUPPORTED 1610 if (png_ptr->transformations != 0) 1611 png_do_read_transformations(png_ptr, &row_info); 1612diff --git a/pngpriv.h b/pngpriv.h 1613index fb521cf00..81300fbd8 100644 1614--- a/pngpriv.h 1615+++ b/pngpriv.h 1616@@ -189,6 +189,19 @@ 1617 # define PNG_ARM_NEON_IMPLEMENTATION 0 1618 #endif /* PNG_ARM_NEON_OPT > 0 */ 1619 1620+#if defined(PNG_ARM_NEON_IMPLEMENTATION) && defined(PNG_ARM_NEON) 1621+// OH ISSUE: png optimize 1622+# if PNG_ARM_NEON_IMPLEMENTATION == 1 1623+# define PNG_MULTY_LINE_ENABLE 1624+# define PNG_INFLATE_MAX_SIZE (65536) 1625+# define PNG_INFLATE_ROWS (50) 1626+# define PNG_CHECK (PNG_EXPAND | PNG_STRIP_ALPHA | PNG_RGB_TO_GRAY | PNG_ENCODE_ALPHA | \ 1627+ PNG_PACKSWAP | PNG_GRAY_TO_RGB | PNG_COMPOSE | PNG_SCALE_16_TO_8 | PNG_16_TO_8 | \ 1628+ PNG_BACKGROUND_EXPAND 
| PNG_EXPAND_16 | PNG_PACK | PNG_ADD_ALPHA | PNG_EXPAND_tRNS | \ 1629+ PNG_RGB_TO_GRAY_ERR | PNG_RGB_TO_GRAY_WARN | PNG_FILLER | PNG_USER_TRANSFORM) 1630+# endif 1631+#endif 1632+ 1633 #ifndef PNG_MIPS_MSA_OPT 1634 # if defined(__mips_msa) && (__mips_isa_rev >= 5) && defined(PNG_ALIGNED_MEMORY_SUPPORTED) 1635 # define PNG_MIPS_MSA_OPT 2 1636@@ -351,8 +364,14 @@ 1637 #endif 1638 1639 #ifndef PNG_INTERNAL_FUNCTION 1640+// OH ISSUE: png optimize 1641+# ifdef PNG_MULTY_LINE_ENABLE 1642+# define PNG_HIDE __attribute__((visibility("hidden"))) 1643+# else 1644+# define PNG_HIDE 1645+# endif 1646 # define PNG_INTERNAL_FUNCTION(type, name, args, attributes)\ 1647- PNG_LINKAGE_FUNCTION PNG_FUNCTION(type, name, args, PNG_EMPTY attributes) 1648+ PNG_LINKAGE_FUNCTION PNG_FUNCTION(type, name, args, PNG_HIDE attributes) 1649 #endif 1650 1651 #ifndef PNG_INTERNAL_CALLBACK 1652@@ -1304,6 +1323,19 @@ PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth3_neon,(png_row_infop 1653 row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); 1654 PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_neon,(png_row_infop 1655 row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY); 1656+#ifdef PNG_MULTY_LINE_ENABLE 1657+// OH ISSUE: png optimize 1658+PNG_INTERNAL_FUNCTION(void, png_read_filter_row_up_x2_neon, (png_row_infop 1659+ row_info, png_bytep row, png_const_bytep prev_row), PNG_EMPTY); 1660+PNG_INTERNAL_FUNCTION(void, png_read_filter_row_avg3_x2_neon, (png_row_infop 1661+ row_info, png_bytep row, png_const_bytep prev_row), PNG_EMPTY); 1662+PNG_INTERNAL_FUNCTION(void, png_read_filter_row_avg4_x2_neon, (png_row_infop 1663+ row_info, png_bytep row, png_const_bytep prev_row), PNG_EMPTY); 1664+PNG_INTERNAL_FUNCTION(void, png_read_filter_row_paeth3_x2_neon, (png_row_infop 1665+ row_info, png_bytep row, png_const_bytep prev_row), PNG_EMPTY); 1666+PNG_INTERNAL_FUNCTION(void, png_read_filter_row_paeth4_x2_neon, (png_row_infop 1667+ row_info, png_bytep row, png_const_bytep prev_row), 
PNG_EMPTY); 1668+#endif 1669 #endif 1670 1671 #if PNG_MIPS_MSA_OPT > 0 1672diff --git a/pngread.c b/pngread.c 1673index 8fa7d9f16..ed5a25307 100644 1674--- a/pngread.c 1675+++ b/pngread.c 1676@@ -54,7 +54,12 @@ png_create_read_struct_2,(png_const_charp user_png_ver, png_voidp error_ptr, 1677 * required (it will be zero in a write structure.) 1678 */ 1679 # ifdef PNG_SEQUENTIAL_READ_SUPPORTED 1680+#ifdef PNG_MULTY_LINE_ENABLE 1681+ // OH ISSUE: png optimize 1682+ png_ptr->IDAT_read_size = PNG_INFLATE_MAX_SIZE; 1683+#else 1684 png_ptr->IDAT_read_size = PNG_IDAT_READ_SIZE; 1685+#endif 1686 # endif 1687 1688 # ifdef PNG_BENIGN_READ_ERRORS_SUPPORTED 1689@@ -684,6 +689,224 @@ png_read_rows(png_structrp png_ptr, png_bytepp row, 1690 #endif /* SEQUENTIAL_READ */ 1691 1692 #ifdef PNG_SEQUENTIAL_READ_SUPPORTED 1693+ 1694+#ifdef PNG_MULTY_LINE_ENABLE 1695+// OH ISSUE: png optimize 1696+static void png_read_two_rows(png_structrp png_ptr, png_bytepp rows, png_uint_32 i, 1697+ png_row_info row_info) 1698+{ 1699+ png_debug1(1, "in png_read_two_rows %d", png_ptr->row_buf[0]); 1700+ png_read_filter_row(png_ptr, &row_info, png_ptr->row_buf + 1, 1701+ png_ptr->prev_row + 1, png_ptr->row_buf[0] + 4); 1702+ 1703+#ifdef PNG_MNG_FEATURES_SUPPORTED 1704+ if ((png_ptr->mng_features_permitted & PNG_FLAG_MNG_FILTER_64) != 0 && 1705+ (png_ptr->filter_type == PNG_INTRAPIXEL_DIFFERENCING)) 1706+ { 1707+ /* Intrapixel differencing */ 1708+ png_do_read_intrapixel(&row_info, png_ptr->row_buf + 1); 1709+ } 1710+#endif 1711+ 1712+#ifdef PNG_READ_TRANSFORMS_SUPPORTED 1713+ if (png_ptr->transformations 1714+# ifdef PNG_CHECK_FOR_INVALID_INDEX_SUPPORTED 1715+ || png_ptr->num_palette_max >= 0 1716+# endif 1717+ ) 1718+ png_do_read_transformations(png_ptr, &row_info); 1719+#endif 1720+ 1721+ /* The transformed pixel depth should match the depth now in row_info. 
*/ 1722+ if (png_ptr->transformed_pixel_depth == 0) 1723+ { 1724+ png_ptr->transformed_pixel_depth = row_info.pixel_depth; 1725+ if (row_info.pixel_depth > png_ptr->maximum_pixel_depth) 1726+ png_error(png_ptr, "sequential row overflow"); 1727+ } 1728+ 1729+ else if (png_ptr->transformed_pixel_depth != row_info.pixel_depth) 1730+ png_error(png_ptr, "internal sequential row size calculation error"); 1731+ 1732+ if (rows[i] != NULL) 1733+ png_combine_row(png_ptr, rows[i], -1); 1734+ 1735+ png_read_finish_row(png_ptr); 1736+ 1737+ if (png_ptr->read_row_fn != NULL) 1738+ (*(png_ptr->read_row_fn))(png_ptr, png_ptr->row_number, png_ptr->pass); 1739+ 1740+ png_ptr->row_buf = png_ptr->row_buf + row_info.rowbytes + 1; 1741+ 1742+ // do again next line 1743+ memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info.rowbytes + 1); 1744+ 1745+#ifdef PNG_MNG_FEATURES_SUPPORTED 1746+ if ((png_ptr->mng_features_permitted & PNG_FLAG_MNG_FILTER_64) != 0 && 1747+ (png_ptr->filter_type == PNG_INTRAPIXEL_DIFFERENCING)) 1748+ { 1749+ /* Intrapixel differencing */ 1750+ png_do_read_intrapixel(&row_info, png_ptr->row_buf + 1); 1751+ } 1752+#endif 1753+ 1754+#ifdef PNG_READ_TRANSFORMS_SUPPORTED 1755+ if (png_ptr->transformations 1756+# ifdef PNG_CHECK_FOR_INVALID_INDEX_SUPPORTED 1757+ || png_ptr->num_palette_max >= 0 1758+# endif 1759+ ) 1760+ png_do_read_transformations(png_ptr, &row_info); 1761+#endif 1762+ 1763+ /* The transformed pixel depth should match the depth now in row_info. 
*/ 1764+ if (png_ptr->transformed_pixel_depth == 0) 1765+ { 1766+ png_ptr->transformed_pixel_depth = row_info.pixel_depth; 1767+ if (row_info.pixel_depth > png_ptr->maximum_pixel_depth) 1768+ png_error(png_ptr, "sequential row overflow"); 1769+ } 1770+ 1771+ else if (png_ptr->transformed_pixel_depth != row_info.pixel_depth) 1772+ png_error(png_ptr, "internal sequential row size calculation error"); 1773+ 1774+ if (rows[i+1] != NULL) 1775+ png_combine_row(png_ptr, rows[i+1], -1); 1776+ 1777+ png_read_finish_row(png_ptr); 1778+ 1779+ if (png_ptr->read_row_fn != NULL) 1780+ (*(png_ptr->read_row_fn))(png_ptr, png_ptr->row_number, png_ptr->pass); 1781+ 1782+ png_ptr->row_buf = png_ptr->row_buf + row_info.rowbytes + 1; 1783+} 1784+ 1785+static void png_read_muilty_rows(png_structrp png_ptr, png_bytepp rows, 1786+ png_uint_32 row_num, png_row_info row_info_in) 1787+{ 1788+ if (png_ptr == NULL) 1789+ return; 1790+ 1791+ png_debug2(1, "in png_read_muilty_rows (row %lu, pass %d)", 1792+ (unsigned long)png_ptr->row_number, png_ptr->pass); 1793+ 1794+ if ((png_ptr->mode & PNG_HAVE_IDAT) == 0) 1795+ png_error(png_ptr, "Invalid attempt to read row data"); 1796+ 1797+ /* Fill the row with IDAT data: */ 1798+ uInt row_bytes = row_info_in.rowbytes; 1799+ png_ptr->row_buf[0]=255; /* 255 to force error if no data was found */ 1800+ png_read_IDAT_data(png_ptr, png_ptr->row_buf, (row_bytes + 1) * row_num); 1801+ png_bytep temp_row = png_ptr->row_buf; 1802+ 1803+ for (png_uint_32 i = 0; i < row_num; i++) { 1804+ png_row_info row_info = row_info_in; 1805+ // check if the x2_filter is effective: only supports channels 3 or 4 1806+ if ((row_info_in.channels == 3 || row_info_in.channels == 4) && 1807+ i < row_num -1 && png_ptr->row_buf[0] > PNG_FILTER_VALUE_SUB && 1808+ png_ptr->row_buf[0] < PNG_FILTER_VALUE_LAST && 1809+ png_ptr->row_buf[0] == png_ptr->row_buf[row_info_in.rowbytes + 1]) 1810+ { 1811+ png_read_two_rows(png_ptr, rows, i, row_info); 1812+ i++; 1813+ continue; 1814+ } 1815+ if 
(png_ptr->row_buf[0] > PNG_FILTER_VALUE_NONE) 1816+ { 1817+ if (png_ptr->row_buf[0] < PNG_FILTER_VALUE_LAST) 1818+ png_read_filter_row(png_ptr, &row_info, png_ptr->row_buf + 1, 1819+ png_ptr->prev_row + 1, png_ptr->row_buf[0]); 1820+ else 1821+ png_debug1(1, "bad adaptive filter value %d", png_ptr->row_buf[0]); 1822+ } 1823+ 1824+ memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info_in.rowbytes + 1); 1825+ 1826+#ifdef PNG_MNG_FEATURES_SUPPORTED 1827+ if ((png_ptr->mng_features_permitted & PNG_FLAG_MNG_FILTER_64) != 0 && 1828+ (png_ptr->filter_type == PNG_INTRAPIXEL_DIFFERENCING)) 1829+ { 1830+ /* Intrapixel differencing */ 1831+ png_do_read_intrapixel(&row_info, png_ptr->row_buf + 1); 1832+ } 1833+#endif 1834+ 1835+#ifdef PNG_READ_TRANSFORMS_SUPPORTED 1836+ if (png_ptr->transformations 1837+# ifdef PNG_CHECK_FOR_INVALID_INDEX_SUPPORTED 1838+ || png_ptr->num_palette_max >= 0 1839+# endif 1840+ ) 1841+ png_do_read_transformations(png_ptr, &row_info); 1842+#endif 1843+ 1844+ /* The transformed pixel depth should match the depth now in row_info. 
*/ 1845+ if (png_ptr->transformed_pixel_depth == 0) 1846+ { 1847+ png_ptr->transformed_pixel_depth = row_info.pixel_depth; 1848+ if (row_info.pixel_depth > png_ptr->maximum_pixel_depth) 1849+ png_error(png_ptr, "sequential row overflow"); 1850+ } 1851+ 1852+ else if (png_ptr->transformed_pixel_depth != row_info.pixel_depth) 1853+ png_error(png_ptr, "internal sequential row size calculation error"); 1854+ 1855+ if (rows[i] != NULL) 1856+ png_combine_row(png_ptr, rows[i], -1); 1857+ 1858+ png_read_finish_row(png_ptr); 1859+ 1860+ if (png_ptr->read_row_fn != NULL) 1861+ (*(png_ptr->read_row_fn))(png_ptr, png_ptr->row_number, png_ptr->pass); 1862+ 1863+ png_ptr->row_buf = png_ptr->row_buf + row_bytes + 1; 1864+ } 1865+ png_ptr->row_buf = temp_row; 1866+} 1867+ 1868+static void png_warn_check(png_structrp png_ptr) 1869+{ 1870+#ifdef PNG_WARNINGS_SUPPORTED 1871+ /* Check for transforms that have been set but were defined out */ 1872+#if defined(PNG_WRITE_INVERT_SUPPORTED) && !defined(PNG_READ_INVERT_SUPPORTED) 1873+ if ((png_ptr->transformations & PNG_INVERT_MONO) != 0) 1874+ png_warning(png_ptr, "PNG_READ_INVERT_SUPPORTED is not defined"); 1875+#endif 1876+ 1877+#if defined(PNG_WRITE_FILLER_SUPPORTED) && !defined(PNG_READ_FILLER_SUPPORTED) 1878+ if ((png_ptr->transformations & PNG_FILLER) != 0) 1879+ png_warning(png_ptr, "PNG_READ_FILLER_SUPPORTED is not defined"); 1880+#endif 1881+ 1882+#if defined(PNG_WRITE_PACKSWAP_SUPPORTED) && \ 1883+ !defined(PNG_READ_PACKSWAP_SUPPORTED) 1884+ if ((png_ptr->transformations & PNG_PACKSWAP) != 0) 1885+ png_warning(png_ptr, "PNG_READ_PACKSWAP_SUPPORTED is not defined"); 1886+#endif 1887+ 1888+#if defined(PNG_WRITE_PACK_SUPPORTED) && !defined(PNG_READ_PACK_SUPPORTED) 1889+ if ((png_ptr->transformations & PNG_PACK) != 0) 1890+ png_warning(png_ptr, "PNG_READ_PACK_SUPPORTED is not defined"); 1891+#endif 1892+ 1893+#if defined(PNG_WRITE_SHIFT_SUPPORTED) && !defined(PNG_READ_SHIFT_SUPPORTED) 1894+ if ((png_ptr->transformations & PNG_SHIFT) 
!= 0) 1895+ png_warning(png_ptr, "PNG_READ_SHIFT_SUPPORTED is not defined"); 1896+#endif 1897+ 1898+#if defined(PNG_WRITE_BGR_SUPPORTED) && !defined(PNG_READ_BGR_SUPPORTED) 1899+ if ((png_ptr->transformations & PNG_BGR) != 0) 1900+ png_warning(png_ptr, "PNG_READ_BGR_SUPPORTED is not defined"); 1901+#endif 1902+ 1903+#if defined(PNG_WRITE_SWAP_SUPPORTED) && !defined(PNG_READ_SWAP_SUPPORTED) 1904+ if ((png_ptr->transformations & PNG_SWAP_BYTES) != 0) 1905+ png_warning(png_ptr, "PNG_READ_SWAP_SUPPORTED is not defined"); 1906+#endif 1907+#endif /* WARNINGS */ 1908+} 1909+#endif // PNG_MULTY_LINE_ENABLE 1910+ 1911 /* Read the entire image. If the image has an alpha channel or a tRNS 1912 * chunk, and you have called png_handle_alpha()[*], you will need to 1913 * initialize the image to the current image that PNG will be overlaying. 1914@@ -745,13 +968,45 @@ png_read_image(png_structrp png_ptr, png_bytepp image) 1915 1916 image_height=png_ptr->height; 1917 1918- for (j = 0; j < pass; j++) 1919- { 1920+#ifdef PNG_MULTY_LINE_ENABLE 1921+ // OH ISSUE: png optimize 1922+ if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 && 1923+ (png_ptr->transformations & PNG_CHECK) == 0) { 1924+ if ((png_ptr->flags & PNG_FLAG_ROW_INIT) == 0) 1925+ png_read_start_row(png_ptr); 1926+ 1927+ png_warn_check(png_ptr); 1928+ png_row_info row_info; 1929+ row_info.width = png_ptr->iwidth; 1930+ row_info.color_type = png_ptr->color_type; 1931+ row_info.bit_depth = png_ptr->bit_depth; 1932+ row_info.channels = png_ptr->channels; 1933+ row_info.pixel_depth = png_ptr->pixel_depth; 1934+ row_info.rowbytes = png_ptr->rowbytes; 1935+ 1936 rp = image; 1937- for (i = 0; i < image_height; i++) 1938+ int row_num = PNG_INFLATE_ROWS; 1939+ for (i = 0; i < image_height; i += PNG_INFLATE_ROWS) 1940 { 1941- png_read_row(png_ptr, *rp, NULL); 1942- rp++; 1943+ if (image_height - i < PNG_INFLATE_ROWS) 1944+ { 1945+ row_num = image_height - i; 1946+ } 1947+ png_read_muilty_rows(png_ptr, rp, row_num, row_info); 
1948+ rp += row_num; 1949+ } 1950+ } 1951+ else 1952+#endif 1953+ { 1954+ for (j = 0; j < pass; j++) 1955+ { 1956+ rp = image; 1957+ for (i = 0; i < image_height; i++) 1958+ { 1959+ png_read_row(png_ptr, *rp, NULL); 1960+ rp++; 1961+ } 1962 } 1963 } 1964 } 1965diff --git a/pngrutil.c b/pngrutil.c 1966index 9ac8ec11f..f9c65927d 100644 1967--- a/pngrutil.c 1968+++ b/pngrutil.c 1969@@ -4134,7 +4134,12 @@ png_read_filter_row(png_structrp pp, png_row_infop row_info, png_bytep row, 1970 * PNG_FILTER_OPTIMIZATIONS to a function that overrides the generic 1971 * implementations. See png_init_filter_functions above. 1972 */ 1973+#ifdef PNG_MULTY_LINE_ENABLE 1974+ // OH ISSUE: png optimize 1975+ if (filter > PNG_FILTER_VALUE_NONE && filter < PNG_FILTER_VALUE_LAST_X2) 1976+#else 1977 if (filter > PNG_FILTER_VALUE_NONE && filter < PNG_FILTER_VALUE_LAST) 1978+#endif 1979 { 1980 if (pp->read_filter[0] == NULL) 1981 png_init_filter_functions(pp); 1982@@ -4606,7 +4611,24 @@ defined(PNG_USER_TRANSFORM_PTR_SUPPORTED) 1983 row_bytes + 48); 1984 1985 else 1986+ { 1987+#ifdef PNG_MULTY_LINE_ENABLE 1988+ // OH ISSUE: png optimize 1989+ png_uint_32 row_num = 1; 1990+ if (png_ptr->bit_depth == 8 && 1991+ (png_ptr->transformations & PNG_CHECK) == 0) 1992+ { 1993+ row_num = png_ptr->height < PNG_INFLATE_ROWS ? 
1994+ png_ptr->height : PNG_INFLATE_ROWS; 1995+ } 1996+ png_ptr->big_row_buf = (png_bytep)png_malloc( 1997+ png_ptr, row_bytes * row_num + 48); 1998+ if (png_ptr->big_row_buf == NULL) 1999+ png_error(png_ptr, "png_malloc failed"); 2000+#else 2001 png_ptr->big_row_buf = (png_bytep)png_malloc(png_ptr, row_bytes + 48); 2002+#endif 2003+ } 2004 2005 png_ptr->big_prev_row = (png_bytep)png_malloc(png_ptr, row_bytes + 48); 2006 2007diff --git a/pngstruct.h b/pngstruct.h 2008index e591d94d5..7c3846475 100644 2009--- a/pngstruct.h 2010+++ b/pngstruct.h 2011@@ -140,6 +140,14 @@ typedef const png_colorspace * PNG_RESTRICT png_const_colorspacerp; 2012 #define PNG_COLORSPACE_CANCEL(flags) (0xffff ^ (flags)) 2013 #endif /* COLORSPACE || GAMMA */ 2014 2015+#ifdef PNG_MULTY_LINE_ENABLE 2016+// OH ISSUE: png optimize 2017+#define PNG_FILTER_VALUE_UP_X2 (6) // PNG_FILTER_VALUE_UP + 4 2018+#define PNG_FILTER_VALUE_AVG_X2 (7) // PNG_FILTER_VALUE_AVG + 4 2019+#define PNG_FILTER_VALUE_PAETH_X2 (8) // PNG_FILTER_VALUE_PAETH + 4 2020+#define PNG_FILTER_VALUE_LAST_X2 (9) // PNG_FILTER_VALUE_LAST + 4 2021+#endif 2022+ 2023 struct png_struct_def 2024 { 2025 #ifdef PNG_SETJMP_SUPPORTED 2026@@ -467,8 +475,14 @@ struct png_struct_def 2027 png_bytep big_prev_row; 2028 2029 /* New member added in libpng-1.5.7 */ 2030+#ifdef PNG_MULTY_LINE_ENABLE 2031+ // OH ISSUE: png optimize 2032+ void (*read_filter[PNG_FILTER_VALUE_LAST_X2 - 1])(png_row_infop row_info, 2033+ png_bytep row, png_const_bytep prev_row); 2034+#else 2035 void (*read_filter[PNG_FILTER_VALUE_LAST-1])(png_row_infop row_info, 2036 png_bytep row, png_const_bytep prev_row); 2037+#endif 2038 2039 #ifdef PNG_READ_SUPPORTED 2040 #if defined(PNG_COLORSPACE_SUPPORTED) || defined(PNG_GAMMA_SUPPORTED) 2041diff --git a/pngtrans.c b/pngtrans.c 2042index 1100f46eb..9addf3423 100644 2043--- a/pngtrans.c 2044+++ b/pngtrans.c 2045@@ -13,6 +13,19 @@ 2046 2047 #include "pngpriv.h" 2048 2049+#ifdef PNG_MULTY_LINE_ENABLE 2050+# if defined(_MSC_VER) && 
!defined(__clang__) && defined(_M_ARM64) 2051+# include <arm64_neon.h> 2052+# else 2053+# include <arm_neon.h> 2054+# endif 2055+# define STEP_GRAY (16) 2056+# define STEP_GA (32) 2057+# define STEP_RGB (48) 2058+# define STEP_RGBA (64) 2059+# define INDEX2 (2) 2060+#endif 2061+ 2062 #if defined(PNG_READ_SUPPORTED) || defined(PNG_WRITE_SUPPORTED) 2063 2064 #if defined(PNG_READ_BGR_SUPPORTED) || defined(PNG_WRITE_BGR_SUPPORTED) 2065@@ -269,13 +282,19 @@ png_do_invert(png_row_infop row_info, png_bytep row) 2066 if (row_info->color_type == PNG_COLOR_TYPE_GRAY) 2067 { 2068 png_bytep rp = row; 2069- size_t i; 2070- size_t istop = row_info->rowbytes; 2071- 2072- for (i = 0; i < istop; i++) 2073+ png_bytep rp_stop = row + row_info->rowbytes; 2074+#ifdef PNG_MULTY_LINE_ENABLE 2075+ png_bytep rp_stop_neon = row_info->rowbytes > STEP_GRAY ? rp_stop - STEP_GRAY : row; /* never point before row on short rows */ 2076+ for (; rp < rp_stop_neon; rp += STEP_GRAY) 2077+ { 2078+ uint8x16_t gray = vld1q_u8(rp); 2079+ gray = vmvnq_u8(gray); /* intrinsic NOT: operator ~ on vectors is a compiler extension */ 2080+ vst1q_u8(rp, gray); 2081+ } 2082+#endif 2083+ for (; rp < rp_stop; rp++) 2084 { 2085 *rp = (png_byte)(~(*rp)); 2086- rp++; 2087 } 2088 } 2089 2090@@ -283,13 +302,19 @@ png_do_invert(png_row_infop row_info, png_bytep row) 2091 row_info->bit_depth == 8) 2092 { 2093 png_bytep rp = row; 2094- size_t i; 2095- size_t istop = row_info->rowbytes; 2096- 2097- for (i = 0; i < istop; i += 2) 2098+ png_bytep rp_stop = row + row_info->rowbytes; 2099+#ifdef PNG_MULTY_LINE_ENABLE 2100+ png_bytep rp_stop_neon = row_info->rowbytes > STEP_GA ? rp_stop - STEP_GA : row; /* never point before row on short rows */ 2101+ for (; rp < rp_stop_neon; rp += STEP_GA) 2102+ { 2103+ uint8x16x2_t gray_alpha = vld2q_u8(rp); 2104+ gray_alpha.val[0] = vmvnq_u8(gray_alpha.val[0]); /* invert the gray lane only */ 2105+ vst2q_u8(rp, gray_alpha); 2106+ } 2107+#endif 2108+ for (; rp < rp_stop; rp += 2) 2109 { 2110 *rp = (png_byte)(~(*rp)); 2111- rp += 2; 2112 } 2113 } 2114 2115@@ -298,14 +323,21 @@ png_do_invert(png_row_infop row_info, png_bytep row) 2116 row_info->bit_depth == 16) 2117 { 2118 png_bytep rp = row; 2119- size_t i; 2120- size_t istop = row_info->rowbytes; 2121- 2122- for (i =
0; i < istop; i += 4) 2123+ png_bytep rp_stop = row + row_info->rowbytes; 2124+#ifdef PNG_MULTY_LINE_ENABLE 2125+ png_bytep rp_stop_neon = row_info->rowbytes > STEP_RGBA ? rp_stop - STEP_RGBA : row; /* never point before row on short rows */ 2126+ for (; rp < rp_stop_neon; rp += STEP_RGBA) 2127+ { 2128+ uint8x16x4_t gray_alpha = vld4q_u8(rp); 2129+ gray_alpha.val[0] = vmvnq_u8(gray_alpha.val[0]); /* intrinsic NOT: operator ~ on vectors is a compiler extension */ 2130+ gray_alpha.val[1] = vmvnq_u8(gray_alpha.val[1]); 2131+ vst4q_u8(rp, gray_alpha); 2132+ } 2133+#endif 2134+ for (; rp < rp_stop; rp += 4) 2135 { 2136 *rp = (png_byte)(~(*rp)); 2137 *(rp + 1) = (png_byte)(~(*(rp + 1))); 2138- rp += 4; 2139 } 2140 } 2141 #endif 2142@@ -323,10 +355,19 @@ png_do_swap(png_row_infop row_info, png_bytep row) 2143 if (row_info->bit_depth == 16) 2144 { 2145 png_bytep rp = row; 2146- png_uint_32 i; 2147- png_uint_32 istop= row_info->width * row_info->channels; 2148- 2149- for (i = 0; i < istop; i++, rp += 2) 2150+ png_bytep rp_stop = row + row_info->rowbytes; 2151+#ifdef PNG_MULTY_LINE_ENABLE 2152+ png_bytep rp_stop_neon = row_info->rowbytes > STEP_GA ? rp_stop - STEP_GA : row; /* never point before row on short rows */ 2153+ for (; rp < rp_stop_neon; rp += STEP_GA) 2154+ { 2155+ uint8x16x2_t gray = vld2q_u8(rp); 2156+ uint8x16_t tmp = gray.val[0]; 2157+ gray.val[0] = gray.val[1]; 2158+ gray.val[1] = tmp; 2159+ vst2q_u8(rp, gray); 2160+ } 2161+#endif 2162+ for (; rp < rp_stop; rp += 2) 2163 { 2164 #ifdef PNG_BUILTIN_BSWAP16_SUPPORTED 2165 /* Feature added to libpng-1.6.11 for testing purposes, not 2166@@ -622,15 +663,24 @@ png_do_bgr(png_row_infop row_info, png_bytep row) 2167 2168 if ((row_info->color_type & PNG_COLOR_MASK_COLOR) != 0) 2169 { 2170- png_uint_32 row_width = row_info->width; 2171 if (row_info->bit_depth == 8) 2172 { 2173 if (row_info->color_type == PNG_COLOR_TYPE_RGB) 2174 { 2175- png_bytep rp; 2176- png_uint_32 i; 2177- 2178- for (i = 0, rp = row; i < row_width; i++, rp += 3) 2179+ png_bytep rp = row; 2180+ png_bytep rp_stop = row + row_info->rowbytes; 2181+#ifdef PNG_MULTY_LINE_ENABLE 2182+ png_bytep rp_stop_neon = row_info->rowbytes > STEP_RGB ? rp_stop - STEP_RGB : row; /* never point before row on short rows */ 2183+ for (; rp < rp_stop_neon; rp += STEP_RGB) 2184+ { 2185+
uint8x16x3_t bgr = vld3q_u8(rp); 2186+ uint8x16_t tmp = bgr.val[INDEX2]; 2187+ bgr.val[INDEX2] = bgr.val[0]; 2188+ bgr.val[0] = tmp; 2189+ vst3q_u8(rp, bgr); 2190+ } 2191+#endif 2192+ for (; rp < rp_stop; rp += 3) 2193 { 2194 png_byte save = *rp; 2195 *rp = *(rp + 2); 2196@@ -640,10 +690,20 @@ png_do_bgr(png_row_infop row_info, png_bytep row) 2197 2198 else if (row_info->color_type == PNG_COLOR_TYPE_RGB_ALPHA) 2199 { 2200- png_bytep rp; 2201- png_uint_32 i; 2202- 2203- for (i = 0, rp = row; i < row_width; i++, rp += 4) 2204+ png_bytep rp = row; 2205+ png_bytep rp_stop = row + row_info->rowbytes; 2206+#ifdef PNG_MULTY_LINE_ENABLE 2207+ png_bytep rp_stop_neon = row_info->rowbytes > STEP_RGBA ? rp_stop - STEP_RGBA : row; /* never point before row on short rows */ 2208+ for (; rp < rp_stop_neon; rp += STEP_RGBA) 2209+ { 2210+ uint8x16x4_t bgra = vld4q_u8(rp); 2211+ uint8x16_t tmp = bgra.val[INDEX2]; 2212+ bgra.val[INDEX2] = bgra.val[0]; 2213+ bgra.val[0] = tmp; 2214+ vst4q_u8(rp, bgra); 2215+ } 2216+#endif 2217+ for (; rp < rp_stop; rp += 4) 2218 { 2219 png_byte save = *rp; 2220 *rp = *(rp + 2); 2221@@ -657,10 +717,20 @@ png_do_bgr(png_row_infop row_info, png_bytep row) 2222 { 2223 if (row_info->color_type == PNG_COLOR_TYPE_RGB) 2224 { 2225- png_bytep rp; 2226- png_uint_32 i; 2227- 2228- for (i = 0, rp = row; i < row_width; i++, rp += 6) 2229+ png_bytep rp = row; 2230+ png_bytep rp_stop = row + row_info->rowbytes; 2231+#ifdef PNG_MULTY_LINE_ENABLE 2232+ png_bytep rp_stop_neon = row_info->rowbytes > STEP_RGB ? rp_stop - STEP_RGB : row; /* never point before row on short rows */ 2233+ for (; rp < rp_stop_neon; rp += STEP_RGB) 2234+ { 2235+ uint16x8x3_t bgr = vld3q_u16((unsigned short *)rp); 2236+ uint16x8_t tmp = bgr.val[INDEX2]; 2237+ bgr.val[INDEX2] = bgr.val[0]; 2238+ bgr.val[0] = tmp; 2239+ vst3q_u16((unsigned short *)rp, bgr); 2240+ } 2241+#endif 2242+ for (; rp < rp_stop; rp += 6) 2243 { 2244 png_byte save = *rp; 2245 *rp = *(rp + 4); 2246@@ -673,10 +743,20 @@ png_do_bgr(png_row_infop row_info, png_bytep row) 2247 2248 else if (row_info->color_type == PNG_COLOR_TYPE_RGB_ALPHA) 2249 { 2250- png_bytep rp; 2251- png_uint_32 i;
2252- 2253- for (i = 0, rp = row; i < row_width; i++, rp += 8) 2254+ png_bytep rp = row; 2255+ png_bytep rp_stop = row + row_info->rowbytes; 2256+#ifdef PNG_MULTY_LINE_ENABLE 2257+ png_bytep rp_stop_neon = row_info->rowbytes > STEP_RGBA ? rp_stop - STEP_RGBA : row; /* never point before row on short rows */ 2258+ for (; rp < rp_stop_neon; rp += STEP_RGBA) 2259+ { 2260+ uint16x8x4_t bgra = vld4q_u16((unsigned short *)rp); 2261+ uint16x8_t tmp = bgra.val[INDEX2]; 2262+ bgra.val[INDEX2] = bgra.val[0]; 2263+ bgra.val[0] = tmp; 2264+ vst4q_u16((unsigned short *)rp, bgra); 2265+ } 2266+#endif 2267+ for (; rp < rp_stop; rp += 8) 2268 { 2269 png_byte save = *rp; 2270 *rp = *(rp + 4); 2271