1diff --git a/arm/arm_init.c b/arm/arm_init.c
2index 3a89998ab..05aa2c0d9 100644
3--- a/arm/arm_init.c
4+++ b/arm/arm_init.c
5@@ -113,13 +113,23 @@ png_init_filter_functions_neon(png_structp pp, unsigned int bpp)
6     * initialization function.)
7     */
8    pp->read_filter[PNG_FILTER_VALUE_UP-1] = png_read_filter_row_up_neon;
9-
10+#ifdef PNG_MULTY_LINE_ENABLE
11+   // OH ISSUE: png optimize
12+   pp->read_filter[PNG_FILTER_VALUE_UP_X2-1] = png_read_filter_row_up_x2_neon;
13+#endif
14    if (bpp == 3)
15    {
16       pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_neon;
17       pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg3_neon;
18       pp->read_filter[PNG_FILTER_VALUE_PAETH-1] =
19          png_read_filter_row_paeth3_neon;
20+#ifdef PNG_MULTY_LINE_ENABLE
21+      // OH ISSUE: png optimize
22+      pp->read_filter[PNG_FILTER_VALUE_AVG_X2-1] =
23+         png_read_filter_row_avg3_x2_neon;
24+      pp->read_filter[PNG_FILTER_VALUE_PAETH_X2-1] =
25+         png_read_filter_row_paeth3_x2_neon;
26+#endif
27    }
28 
29    else if (bpp == 4)
30@@ -128,6 +138,13 @@ png_init_filter_functions_neon(png_structp pp, unsigned int bpp)
31       pp->read_filter[PNG_FILTER_VALUE_AVG-1] = png_read_filter_row_avg4_neon;
32       pp->read_filter[PNG_FILTER_VALUE_PAETH-1] =
33           png_read_filter_row_paeth4_neon;
34+#ifdef PNG_MULTY_LINE_ENABLE
35+      // OH ISSUE: png optimize
36+      pp->read_filter[PNG_FILTER_VALUE_AVG_X2-1] =
37+         png_read_filter_row_avg4_x2_neon;
38+      pp->read_filter[PNG_FILTER_VALUE_PAETH_X2-1] =
39+         png_read_filter_row_paeth4_x2_neon;
40+#endif
41    }
42 }
43 #endif /* PNG_ARM_NEON_OPT > 0 */
44diff --git a/arm/filter_neon_intrinsics.c b/arm/filter_neon_intrinsics.c
45index 4466d48b2..4ff810a19 100644
46--- a/arm/filter_neon_intrinsics.c
47+++ b/arm/filter_neon_intrinsics.c
48@@ -47,6 +47,7 @@
49 
50 #if PNG_ARM_NEON_OPT > 0
51 
52+#ifndef PNG_MULTY_LINE_ENABLE
53 void
54 png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row,
55    png_const_bytep prev_row)
56@@ -396,7 +397,1351 @@ png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row,
57       vst4_lane_u32(png_ptr(uint32_t,rp), vdest_val, 0);
58    }
59 }
60+#else
61+// OH ISSUE: png optimize
62+// By definition, row_info->rowbytes = row_width * row_info->channels,
63+// so the input rowbytes is a multiple of 3 (RGB) or 4 (RGBA). Therefore:
64+// for RGB, NEON processes 12 bytes at a time, so the tail (if any) is 3, 6 or 9 bytes;
65+// for RGBA, NEON processes 16 or 8 bytes at a time, so the tail (if any) is 4 bytes;
66+// the filter operators are internal functions; callers ensure row_info and row are non-empty.
67+#define STEP_RGB (12) // 3-channel RGB: NEON processes 12 bytes (4 pixels) at once
68+#define TAIL_RGB3 (9) // a tail of 3 RGB pixels is 9 bytes
69+#define TAIL_RGB2 (6) // a tail of 2 RGB pixels is 6 bytes
70+#define TAIL_RGB1 (3) // a tail of 1 RGB pixel is 3 bytes
71+#define STEP_RGBA (16) // RGBA: NEON processes 16 bytes (4 pixels) at once
72+#define STEP_RGBA_HALF (8) // RGBA: NEON processes 8 bytes (2 pixels) at once
73+#define TAIL_RGBA (4) // a tail of 1 RGBA pixel is 4 bytes
74+#define IND3 (3) // index 3 into the .val[] array of a NEON vector tuple
75+#define IND2 (2) // index 2 into the .val[] array of a NEON vector tuple
76+#define OFFSET3 (3) // byte offset of 3 (one RGB pixel)
77+#define OFFSET6 (6) // byte offset of 6 (two RGB pixels)
78+void png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row,
79+   png_const_bytep prev_row)
80+{ /* Adds each prev_row byte to the matching row byte, in place, for row_info->rowbytes bytes. */
81+   png_bytep rp = row;
82+   png_const_bytep pp = prev_row;
83+   int count = row_info->rowbytes; /* NOTE(review): stored in an int; confirm rowbytes always fits */
84+
85+   png_debug(1, "in png_read_filter_row_up_neon");
86+
87+   uint8x16_t qrp, qpp;
88+   while (count >= STEP_RGBA) { /* main loop: 16 bytes per iteration */
89+      qrp = vld1q_u8(rp);
90+      qpp = vld1q_u8(pp);
91+      qrp = vaddq_u8(qrp, qpp);
92+      vst1q_u8(rp, qrp);
93+      rp += STEP_RGBA;
94+      pp += STEP_RGBA;
95+      count -= STEP_RGBA;
96+   }
97+
98+   if (count >= STEP_RGBA_HALF) { /* at most one 8-byte step */
99+      uint8x8_t qrp1, qpp1;
100+      qrp1 = vld1_u8(rp);
101+      qpp1 = vld1_u8(pp);
102+      qrp1 = vadd_u8(qrp1, qpp1);
103+      vst1_u8(rp, qrp1);
104+      rp += STEP_RGBA_HALF;
105+      pp += STEP_RGBA_HALF;
106+      count -= STEP_RGBA_HALF;
107+   }
108+
109+   for (int i = 0; i < count; i++) { /* leftover bytes (fewer than 8), one at a time */
110+      *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
111+      rp++;
112+   }
113+}
114+
115+void png_read_filter_row_up_x2_neon(png_row_infop row_info, png_bytep row,
116+   png_const_bytep prev_row)
117+{ /* Two-row variant: row += prev_row, then the row at np += the updated row, both in place. */
118+   png_bytep rp = row;
119+   png_const_bytep pp = prev_row;
120+   int count = row_info->rowbytes; /* NOTE(review): stored in an int; confirm rowbytes always fits */
121+   png_bytep np = row + row_info->rowbytes + 1; /* second row; the +1 presumably skips its filter-type byte — confirm with caller */
122+
123+   png_debug(1, "in png_read_filter_row_up_x2_neon");
124+
125+   uint8x16_t qrp, qpp, qnp;
126+   while (count >= STEP_RGBA) { /* main loop: 16 bytes of each row per iteration */
127+      qrp = vld1q_u8(rp);
128+      qpp = vld1q_u8(pp);
129+      qnp = vld1q_u8(np);
130+      qrp = vaddq_u8(qrp, qpp);
131+      qnp = vaddq_u8(qnp, qrp); /* uses the already-updated first row */
132+      vst1q_u8(rp, qrp);
133+      vst1q_u8(np, qnp);
134+      rp += STEP_RGBA;
135+      pp += STEP_RGBA;
136+      np += STEP_RGBA;
137+      count -= STEP_RGBA;
138+   }
139+
140+   if (count >= STEP_RGBA_HALF) { /* at most one 8-byte step */
141+      uint8x8_t qrp1, qpp1, qnp1;
142+      qrp1 = vld1_u8(rp);
143+      qpp1 = vld1_u8(pp);
144+      qnp1 = vld1_u8(np);
145+      qrp1 = vadd_u8(qrp1, qpp1);
146+      qnp1 = vadd_u8(qnp1, qrp1);
147+      vst1_u8(rp, qrp1);
148+      vst1_u8(np, qnp1);
149+      rp += STEP_RGBA_HALF;
150+      pp += STEP_RGBA_HALF;
151+      np += STEP_RGBA_HALF;
152+      count -= STEP_RGBA_HALF;
153+   }
154+
155+   for (int i = 0; i < count; i++) { /* leftover bytes (fewer than 8), one at a time */
156+      *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
157+      *np = (png_byte)(((int)(*np) + (int)(*rp++)) & 0xff); /* reads the byte just written to *rp, then advances rp */
158+      np++;
159+   }
160+}
161+
162+void png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row,
163+   png_const_bytep prev_row)
164+{ /* In-place running sum over 3-byte pixels: each pixel gets the decoded pixel to its left added; prev_row is unused. */
165+   png_bytep rp = row;
166+   png_bytep rp_stop = row + row_info->rowbytes;
167+   /* Preload the first 16 bytes of the row. */
168+   uint8x16_t vtmp = vld1q_u8(rp);
169+   uint8x8x2_t *vrpt = png_ptr(uint8x8x2_t, &vtmp);
170+   uint8x8x2_t vrp = *vrpt;
171+
172+   uint8x8x4_t vdest;
173+   vdest.val[IND3] = vdup_n_u8(0); /* left neighbour of the first pixel is zero */
174+
175+   uint8x8_t vtmp1, vtmp2;
176+   uint32x2_t *temp_pointer; /* presumably the scratch pointer assigned inside png_ldr — macro not visible here */
177+
178+   png_debug(1, "in png_read_filter_row_sub3_neon");
179+
180+   size_t tail_bytes = row_info->rowbytes % STEP_RGB;
181+   png_byte last_byte = *rp_stop; /* saved: the last 4-byte store below can also write the byte at rp_stop */
182+   png_bytep rp_stop_new = rp_stop - tail_bytes;
183+   for (; rp < rp_stop_new;)
184+   { /* full groups of 4 pixels (12 bytes) */
185+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); /* raw pixel 1 */
186+      vdest.val[0] = vadd_u8(vdest.val[IND3], vrp.val[0]);
187+      vtmp2 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6); /* raw pixel 2 */
188+      vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);
189+
190+      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1); /* raw pixel 3 (loaded bytes 9..11) */
191+      vdest.val[IND2] = vadd_u8(vdest.val[1], vtmp2);
192+      vdest.val[IND3] = vadd_u8(vdest.val[IND2], vtmp1);
193+      /* NOTE(review): loads 16 bytes from rp + 12, which can extend past rp_stop — confirm the row buffer is padded. */
194+      vtmp = vld1q_u8(rp + STEP_RGB);
195+      vrpt = png_ptr(uint8x8x2_t, &vtmp);
196+      vrp = *vrpt;
197+      /* Each store writes a 4-byte lane while rp advances by 3, so the next store overwrites the extra byte. */
198+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
199+      rp += OFFSET3;
200+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
201+      rp += OFFSET3;
202+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
203+      rp += OFFSET3;
204+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND3]), 0);
205+      rp += OFFSET3;
206+   }
207+   /* Tail: 1, 2 or 3 leftover pixels, taken from the data already loaded in vrp. */
208+   if (tail_bytes == TAIL_RGB1) {
209+      vdest.val[0] = vadd_u8(vdest.val[IND3], vrp.val[0]);
210+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
211+   } else if (tail_bytes == TAIL_RGB2) {
212+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
213+      vdest.val[0] = vadd_u8(vdest.val[IND3], vrp.val[0]);
214+      vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);
215+
216+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
217+      rp += OFFSET3;
218+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
219+   } else if (tail_bytes == TAIL_RGB3) {
220+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
221+      vdest.val[0] = vadd_u8(vdest.val[IND3], vrp.val[0]);
222+      vtmp2 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
223+      vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);
224+      vdest.val[IND2] = vadd_u8(vdest.val[1], vtmp2);
225+
226+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
227+      rp += OFFSET3;
228+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
229+      rp += OFFSET3;
230+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
231+   }
232+   *rp_stop = last_byte; /* restore the byte that the final 4-byte store may have overwritten */
233+
234+   PNG_UNUSED(prev_row)
235+}
236+
237+void png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row,
238+   png_const_bytep prev_row)
239+{ /* In-place running sum over 4-byte pixels: each pixel gets the decoded pixel to its left added; prev_row is unused. */
240+   png_bytep rp = row;
241+   int count = row_info->rowbytes; /* NOTE(review): stored in an int; confirm rowbytes always fits */
242+
243+   uint8x8x4_t vdest;
244+   vdest.val[IND3] = vdup_n_u8(0); /* left neighbour of the first pixel is zero */
245+
246+   png_debug(1, "in png_read_filter_row_sub4_neon");
247+
248+   uint32x2x4_t vtmp;
249+   uint8x8x4_t *vrpt;
250+   uint8x8x4_t vrp;
251+   uint32x2x4_t vdest_val;
252+   while (count >= STEP_RGBA) { /* 4 pixels (16 bytes) per iteration */
253+      uint32x2x4_t *temp_pointer; /* presumably the scratch pointer assigned inside png_ldr — macro not visible here */
254+      vtmp = vld4_u32(png_ptr(uint32_t, rp)); /* NOTE(review): loads 32 bytes though only 16 are consumed — confirm padding */
255+      vrpt = png_ptr(uint8x8x4_t, &vtmp);
256+      vrp = *vrpt;
257+
258+      vdest.val[0] = vadd_u8(vdest.val[IND3], vrp.val[0]);
259+      vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]);
260+      vdest.val[IND2] = vadd_u8(vdest.val[1], vrp.val[IND2]);
261+      vdest.val[IND3] = vadd_u8(vdest.val[IND2], vrp.val[IND3]);
262+
263+      vdest_val = png_ldr(uint32x2x4_t, &vdest);
264+      vst4_lane_u32(png_ptr(uint32_t, rp), vdest_val, 0); /* lane 0 of the four vectors = the 4 decoded pixels */
265+
266+      rp += STEP_RGBA;
267+      count -= STEP_RGBA;
268+   }
269+
270+   if (count >= STEP_RGBA_HALF) { /* at most one step of 2 pixels (8 bytes) */
271+      uint32x2x2_t vtmp1 = vld2_u32(png_ptr(uint32_t, rp));
272+      uint8x8x2_t *vrpt1 = png_ptr(uint8x8x2_t, &vtmp1);
273+      uint8x8x2_t vrp1 = *vrpt1;
274+      uint32x2x2_t *temp_pointer;
275+      uint32x2x2_t vdest_val1;
276+
277+      vdest.val[0] = vadd_u8(vdest.val[IND3], vrp1.val[0]);
278+      vdest.val[1] = vadd_u8(vdest.val[0], vrp1.val[1]);
279+      vdest.val[IND3] = vdest.val[1]; /* keep the last decoded pixel as the next left neighbour */
280+      vdest_val1 = png_ldr(uint32x2x2_t, &vdest);
281+      vst2_lane_u32(png_ptr(uint32_t, rp), vdest_val1, 0);
282+
283+      rp += STEP_RGBA_HALF;
284+      count -= STEP_RGBA_HALF;
285+   }
286+   /* Nothing is left when rowbytes is a multiple of 8. */
287+   if (count == 0) {
288+      return;
289+   }
290+   /* The remaining bytes are handled as one 4-byte pixel (only lane 0 is stored). */
291+   uint32x2_t vtmp2 = vld1_u32(png_ptr(uint32_t, rp));
292+   uint8x8_t *vrpt2 = png_ptr(uint8x8_t, &vtmp2);
293+   uint8x8_t vrp2 = *vrpt2;
294+   uint32x2_t *temp_pointer;
295+   uint32x2_t vdest_val2;
296+
297+   vdest.val[0] = vadd_u8(vdest.val[IND3], vrp2);
298+   vdest_val2 = png_ldr(uint32x2_t, &vdest);
299+   vst1_lane_u32(png_ptr(uint32_t, rp), vdest_val2, 0);
300+
301+   PNG_UNUSED(prev_row)
302+}
303+
304+void png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row,
305+   png_const_bytep prev_row)
306+{ /* In place, for 3-byte pixels: adds vhadd_u8(decoded left pixel, prev_row pixel) to each raw pixel. */
307+   png_bytep rp = row;
308+   png_const_bytep pp = prev_row;
309+   png_bytep rp_stop = row + row_info->rowbytes;
310+
311+   uint8x16_t vtmp;
312+   uint8x8x2_t *vrpt;
313+   uint8x8x2_t vrp;
314+   uint8x8x4_t vdest;
315+   vdest.val[IND3] = vdup_n_u8(0); /* left neighbour of the first pixel is zero */
316+   /* Preload the first 16 bytes of the row. */
317+   vtmp = vld1q_u8(rp);
318+   vrpt = png_ptr(uint8x8x2_t, &vtmp);
319+   vrp = *vrpt;
320+
321+   png_debug(1, "in png_read_filter_row_avg3_neon");
322+
323+   uint8x8_t vtmp1, vtmp2, vtmp3;
324+   uint8x8x2_t *vppt;
325+   uint8x8x2_t vpp;
326+   uint32x2_t *temp_pointer; /* presumably the scratch pointer assigned inside png_ldr — macro not visible here */
327+
328+   size_t tail_bytes = row_info->rowbytes % STEP_RGB;
329+   png_byte last_byte = *rp_stop; /* saved: the last 4-byte store below can also write the byte at rp_stop */
330+   png_bytep rp_stop_new = rp_stop - tail_bytes;
331+   for (; rp < rp_stop_new; pp += STEP_RGB)
332+   { /* full groups of 4 pixels (12 bytes) */
333+      vtmp = vld1q_u8(pp);
334+      vppt = png_ptr(uint8x8x2_t, &vtmp);
335+      vpp = *vppt;
336+
337+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); /* raw pixel 1 */
338+      vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
339+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
340+
341+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3); /* prev_row pixel 1 */
342+      vtmp3 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6); /* raw pixel 2 */
343+      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
344+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
345+
346+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6); /* prev_row pixel 2 */
347+      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1); /* raw pixel 3 */
348+      /* NOTE(review): loads 16 bytes from rp + 12, which can extend past rp_stop — confirm the row buffer is padded. */
349+      vtmp = vld1q_u8(rp + STEP_RGB);
350+      vrpt = png_ptr(uint8x8x2_t, &vtmp);
351+      vrp = *vrpt;
352+
353+      vdest.val[IND2] = vhadd_u8(vdest.val[1], vtmp2);
354+      vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp3);
355+
356+      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1); /* prev_row pixel 3 */
357+
358+      vdest.val[IND3] = vhadd_u8(vdest.val[IND2], vtmp2);
359+      vdest.val[IND3] = vadd_u8(vdest.val[IND3], vtmp1);
360+      /* Each store writes a 4-byte lane while rp advances by 3, so the next store overwrites the extra byte. */
361+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
362+      rp += OFFSET3;
363+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
364+      rp += OFFSET3;
365+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
366+      rp += OFFSET3;
367+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND3]), 0);
368+      rp += OFFSET3;
369+   }
370+   /* Load the prev_row bytes for the tail; this load is performed even when tail_bytes is 0. */
371+   vtmp = vld1q_u8(pp);
372+   vppt = png_ptr(uint8x8x2_t, &vtmp);
373+   vpp = *vppt;
374+   /* Tail: 1, 2 or 3 leftover pixels, taken from the row data already loaded in vrp. */
375+   if (tail_bytes == TAIL_RGB1) {
376+      vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
377+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
378+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
379+   } else if (tail_bytes == TAIL_RGB2) {
380+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
381+      vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
382+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
383+
384+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
385+      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
386+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
387+
388+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
389+      rp += OFFSET3;
390+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
391+   } else if (tail_bytes == TAIL_RGB3) {
392+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
393+      vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
394+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
395+
396+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
397+      vtmp3 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
398+      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
399+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
400+
401+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6);
402+
403+      vdest.val[IND2] = vhadd_u8(vdest.val[1], vtmp2);
404+      vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp3);
405+
406+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
407+      rp += OFFSET3;
408+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
409+      rp += OFFSET3;
410+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
411+   }
412+   *rp_stop = last_byte; /* restore the byte that the final 4-byte store may have overwritten */
413+}
414+
415+void png_read_filter_row_avg3_x2_neon(png_row_infop row_info, png_bytep row,
416+   png_const_bytep prev_row)
417+{ /* Two-row variant for 3-byte pixels: decodes row against prev_row, then the row at np against the decoded row. */
418+   png_bytep rp = row;
419+   png_const_bytep pp = prev_row;
420+   png_bytep rp_stop = row + row_info->rowbytes;
421+   png_bytep np = rp_stop + 1; /* second row; the +1 presumably skips its filter-type byte — confirm with caller */
422+
423+   uint8x16_t vtmp;
424+   uint8x8x2_t *vrpt;
425+   uint8x8x2_t vrp;
426+   uint8x8x4_t vdest;
427+   vdest.val[IND3] = vdup_n_u8(0); /* left neighbour of the first pixel of the first row is zero */
428+   /* Preload the first 16 bytes of the first row. */
429+   vtmp = vld1q_u8(rp);
430+   vrpt = png_ptr(uint8x8x2_t, &vtmp);
431+   vrp = *vrpt;
432+
433+   uint8x8x2_t *vnpt;
434+   uint8x8x2_t vnp;
435+   uint8x8x4_t vdestN;
436+   vdestN.val[IND3] = vdup_n_u8(0); /* left neighbour of the first pixel of the second row is zero */
437+   /* Preload the first 16 bytes of the second row. */
438+   vtmp = vld1q_u8(np);
439+   vnpt = png_ptr(uint8x8x2_t, &vtmp);
440+   vnp = *vnpt;
441+
442+   png_debug(1, "in png_read_filter_row_x2_avg3_neon"); /* NOTE(review): message says "x2_avg3" but the function is "avg3_x2" */
443+
444+   uint8x8_t vtmp1, vtmp2, vtmp3;
445+   uint8x8x2_t *vppt;
446+   uint8x8x2_t vpp;
447+   uint32x2_t *temp_pointer; /* presumably the scratch pointer assigned inside png_ldr — macro not visible here */
448+
449+   size_t tail_bytes = row_info->rowbytes % STEP_RGB;
450+   png_byte last_byte = *rp_stop; /* saved: the last 4-byte store to the first row can also write the byte at rp_stop */
451+   png_byte last_byte_next = *(rp_stop + row_info->rowbytes + 1); /* byte at np + rowbytes, saved for the same reason */
452+   png_bytep rp_stop_new = rp_stop - tail_bytes;
453+   for (; rp < rp_stop_new; pp += STEP_RGB)
454+   { /* full groups of 4 pixels (12 bytes) in each row */
455+      vtmp = vld1q_u8(pp);
456+      vppt = png_ptr(uint8x8x2_t, &vtmp);
457+      vpp = *vppt;
458+
459+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); /* raw pixel 1 */
460+      vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
461+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
462+
463+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3); /* prev_row pixel 1 */
464+      vtmp3 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6); /* raw pixel 2 */
465+      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
466+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
467+
468+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6); /* prev_row pixel 2 */
469+      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1); /* raw pixel 3 */
470+      /* NOTE(review): loads 16 bytes from rp + 12, which can extend past rp_stop — confirm the buffer layout allows it. */
471+      vtmp = vld1q_u8(rp + STEP_RGB);
472+      vrpt = png_ptr(uint8x8x2_t, &vtmp);
473+      vrp = *vrpt;
474+
475+      vdest.val[IND2] = vhadd_u8(vdest.val[1], vtmp2);
476+      vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp3);
477+
478+      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1); /* prev_row pixel 3 */
479+
480+      vdest.val[IND3] = vhadd_u8(vdest.val[IND2], vtmp2);
481+      vdest.val[IND3] = vadd_u8(vdest.val[IND3], vtmp1);
482+      /* Each store writes a 4-byte lane while rp advances by 3, so the next store overwrites the extra byte. */
483+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
484+      rp += OFFSET3;
485+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
486+      rp += OFFSET3;
487+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
488+      rp += OFFSET3;
489+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND3]), 0);
490+      rp += OFFSET3;
491+      /* Second row: same recurrence, with the just-decoded pixels in vdest acting as the previous row. */
492+      vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3);
493+      vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]);
494+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
495+
496+      vtmp3 = vext_u8(vnp.val[0], vnp.val[1], OFFSET6);
497+      vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]);
498+      vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1);
499+
500+      vtmp1 = vext_u8(vnp.val[1], vnp.val[1], 1);
501+      /* Preload the next 16 bytes of the second row. */
502+      vtmp = vld1q_u8(np + STEP_RGB);
503+      vnpt = png_ptr(uint8x8x2_t, &vtmp);
504+      vnp = *vnpt;
505+
506+      vdestN.val[IND2] = vhadd_u8(vdestN.val[1], vdest.val[IND2]);
507+      vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vtmp3);
508+
509+      vdestN.val[IND3] = vhadd_u8(vdestN.val[IND2], vdest.val[IND3]);
510+      vdestN.val[IND3] = vadd_u8(vdestN.val[IND3], vtmp1);
511+
512+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0);
513+      np += OFFSET3;
514+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0);
515+      np += OFFSET3;
516+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND2]), 0);
517+      np += OFFSET3;
518+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND3]), 0);
519+      np += OFFSET3;
520+   }
521+   /* Load the prev_row bytes for the tail; this load is performed even when tail_bytes is 0. */
522+   vtmp = vld1q_u8(pp);
523+   vppt = png_ptr(uint8x8x2_t, &vtmp);
524+   vpp = *vppt;
525+   /* Tail: 1, 2 or 3 leftover pixels per row; the first row is finished before the second row uses it. */
526+   if (tail_bytes == TAIL_RGB1) {
527+      vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
528+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
529+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
530+
531+      vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]);
532+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
533+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0);
534+   } else if (tail_bytes == TAIL_RGB2) {
535+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
536+      vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
537+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
538+
539+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
540+      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
541+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
542+
543+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
544+      rp += OFFSET3;
545+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
546+
547+      vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3);
548+      vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]);
549+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
550+
551+      vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]);
552+      vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1);
553+
554+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0);
555+      np += OFFSET3;
556+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0);
557+   } else if (tail_bytes == TAIL_RGB3) {
558+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
559+      vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
560+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
561+
562+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
563+      vtmp3 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
564+      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
565+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
566+
567+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6);
568+
569+      vdest.val[IND2] = vhadd_u8(vdest.val[1], vtmp2);
570+      vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp3);
571+
572+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
573+      rp += OFFSET3;
574+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
575+      rp += OFFSET3;
576+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
577+
578+      vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3);
579+      vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]);
580+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
581+
582+      vtmp3 = vext_u8(vnp.val[0], vnp.val[1], OFFSET6);
583+      vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]);
584+      vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1);
585+
586+      vdestN.val[IND2] = vhadd_u8(vdestN.val[1], vdest.val[IND2]);
587+      vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vtmp3);
588+
589+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0);
590+      np += OFFSET3;
591+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0);
592+      np += OFFSET3;
593+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND2]), 0);
594+   }
595+   *rp_stop = last_byte; /* restore the byte after the first row */
596+   *(rp_stop + row_info->rowbytes + 1) = last_byte_next; /* restore the byte after the second row */
597+}
598+
599+void png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row,
600+   png_const_bytep prev_row)
601+{ /* In place, for 4-byte pixels: adds vhadd_u8(decoded left pixel, prev_row pixel) to each raw pixel. */
602+   png_bytep rp = row;
603+   png_const_bytep pp = prev_row;
604+   int count = row_info->rowbytes; /* NOTE(review): stored in an int; confirm rowbytes always fits */
605+
606+   uint8x8x4_t vdest;
607+   vdest.val[IND3] = vdup_n_u8(0); /* left neighbour of the first pixel is zero */
608+
609+   png_debug(1, "in png_read_filter_row_avg4_neon");
610+
611+   uint32x2x4_t vtmp;
612+   uint8x8x4_t *vrpt, *vppt;
613+   uint8x8x4_t vrp, vpp;
614+   uint32x2x4_t vdest_val;
615+   while (count >= STEP_RGBA) { /* 4 pixels (16 bytes) per iteration */
616+      uint32x2x4_t *temp_pointer; /* presumably the scratch pointer assigned inside png_ldr — macro not visible here */
617+      vtmp = vld4_u32(png_ptr(uint32_t, rp)); /* NOTE(review): loads 32 bytes though only 16 are consumed — confirm padding */
618+      vrpt = png_ptr(uint8x8x4_t, &vtmp);
619+      vrp = *vrpt;
620+      vtmp = vld4_u32(png_ptrc(uint32_t, pp));
621+      vppt = png_ptr(uint8x8x4_t, &vtmp);
622+      vpp = *vppt;
623+
624+      vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
625+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
626+      vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]);
627+      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
628+      vdest.val[IND2] = vhadd_u8(vdest.val[1], vpp.val[IND2]);
629+      vdest.val[IND2] = vadd_u8(vdest.val[IND2], vrp.val[IND2]);
630+      vdest.val[IND3] = vhadd_u8(vdest.val[IND2], vpp.val[IND3]);
631+      vdest.val[IND3] = vadd_u8(vdest.val[IND3], vrp.val[IND3]);
632+
633+      vdest_val = png_ldr(uint32x2x4_t, &vdest);
634+      vst4_lane_u32(png_ptr(uint32_t, rp), vdest_val, 0); /* lane 0 of the four vectors = the 4 decoded pixels */
635+
636+      rp += STEP_RGBA;
637+      pp += STEP_RGBA;
638+      count -= STEP_RGBA;
639+   }
640+
641+   if (count >= STEP_RGBA_HALF) { /* at most one step of 2 pixels (8 bytes) */
642+      uint32x2x2_t vtmp1;
643+      uint8x8x2_t *vrpt1, *vppt1;
644+      uint8x8x2_t vrp1, vpp1;
645+      uint32x2x2_t *temp_pointer;
646+      uint32x2x2_t vdest_val1;
647+
648+      vtmp1 = vld2_u32(png_ptr(uint32_t, rp));
649+      vrpt1 = png_ptr(uint8x8x2_t, &vtmp1);
650+      vrp1 = *vrpt1;
651+      vtmp1 = vld2_u32(png_ptrc(uint32_t, pp));
652+      vppt1 = png_ptr(uint8x8x2_t, &vtmp1);
653+      vpp1 = *vppt1;
654+
655+      vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp1.val[0]);
656+      vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]);
657+      vdest.val[1] = vhadd_u8(vdest.val[0], vpp1.val[1]);
658+      vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]);
659+      vdest.val[IND3] = vdest.val[1]; /* keep the last decoded pixel as the next left neighbour */
660+      vdest_val1 = png_ldr(uint32x2x2_t, &vdest);
661+      vst2_lane_u32(png_ptr(uint32_t, rp), vdest_val1, 0);
662+
663+      rp += STEP_RGBA_HALF;
664+      pp += STEP_RGBA_HALF;
665+      count -= STEP_RGBA_HALF;
666+   }
667+   /* Nothing is left when rowbytes is a multiple of 8. */
668+   if (count == 0) {
669+      return;
670+   }
671+   /* The remaining bytes are handled as one 4-byte pixel (only lane 0 is stored). */
672+   uint32x2_t vtmp2;
673+   uint8x8_t *vrpt2, *vppt2;
674+   uint8x8_t vrp2, vpp2;
675+   uint32x2_t *temp_pointer;
676+   uint32x2_t vdest_val2;
677+
678+   vtmp2 = vld1_u32(png_ptr(uint32_t, rp));
679+   vrpt2 = png_ptr(uint8x8_t, &vtmp2);
680+   vrp2 = *vrpt2;
681+   vtmp2 = vld1_u32(png_ptrc(uint32_t, pp));
682+   vppt2 = png_ptr(uint8x8_t, &vtmp2);
683+   vpp2 = *vppt2;
684+
685+   vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp2);
686+   vdest.val[0] = vadd_u8(vdest.val[0], vrp2);
687 
688+   vdest_val2 = png_ldr(uint32x2_t, &vdest);
689+   vst1_lane_u32(png_ptr(uint32_t, rp), vdest_val2, 0);
690+}
691+
692+void png_read_filter_row_avg4_x2_neon(png_row_infop row_info, png_bytep row,
693+   png_const_bytep prev_row)
694+{ /* Two-row variant for 4-byte pixels: decodes row against prev_row, then the row at np against the decoded row. */
695+   png_bytep rp = row;
696+   png_const_bytep pp = prev_row;
697+   int count = row_info->rowbytes; /* NOTE(review): stored in an int; confirm rowbytes always fits */
698+   png_bytep np = row + count + 1; /* second row; the +1 presumably skips its filter-type byte — confirm with caller */
699+
700+   uint8x8x4_t vdest;
701+   vdest.val[IND3] = vdup_n_u8(0); /* left neighbour of the first pixel of the first row is zero */
702+
703+   png_debug(1, "in png_read_filter_row_avg4_x2_neon");
704+
705+   uint32x2x4_t vtmp;
706+   uint8x8x4_t *vrpt, *vppt;
707+   uint8x8x4_t vrp, vpp;
708+   uint32x2x4_t vdest_val;
709+
710+   uint8x8x4_t *vnpt;
711+   uint8x8x4_t vnp;
712+   uint8x8x4_t vdestN;
713+   vdestN.val[IND3] = vdup_n_u8(0); /* left neighbour of the first pixel of the second row is zero */
714+
715+   while (count >= STEP_RGBA) { /* 4 pixels (16 bytes) of each row per iteration */
716+      uint32x2x4_t *temp_pointer; /* presumably the scratch pointer assigned inside png_ldr — macro not visible here */
717+      vtmp = vld4_u32(png_ptr(uint32_t, rp)); /* NOTE(review): loads 32 bytes though only 16 are consumed — confirm padding */
718+      vrpt = png_ptr(uint8x8x4_t, &vtmp);
719+      vrp = *vrpt;
720+      vtmp = vld4_u32(png_ptrc(uint32_t, pp));
721+      vppt = png_ptr(uint8x8x4_t, &vtmp);
722+      vpp = *vppt;
723+      vtmp = vld4_u32(png_ptrc(uint32_t, np));
724+      vnpt = png_ptr(uint8x8x4_t, &vtmp);
725+      vnp = *vnpt;
726+
727+      vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp.val[0]);
728+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
729+      vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]);
730+      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
731+      vdest.val[IND2] = vhadd_u8(vdest.val[1], vpp.val[IND2]);
732+      vdest.val[IND2] = vadd_u8(vdest.val[IND2], vrp.val[IND2]);
733+      vdest.val[IND3] = vhadd_u8(vdest.val[IND2], vpp.val[IND3]);
734+      vdest.val[IND3] = vadd_u8(vdest.val[IND3], vrp.val[IND3]);
735+
736+      vdest_val = png_ldr(uint32x2x4_t, &vdest);
737+      vst4_lane_u32(png_ptr(uint32_t, rp), vdest_val, 0);
738+      /* Second row: same recurrence, with the just-decoded pixels in vdest acting as the previous row. */
739+      vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]);
740+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
741+      vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]);
742+      vdestN.val[1] = vadd_u8(vdestN.val[1], vnp.val[1]);
743+      vdestN.val[IND2] = vhadd_u8(vdestN.val[1], vdest.val[IND2]);
744+      vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vnp.val[IND2]);
745+      vdestN.val[IND3] = vhadd_u8(vdestN.val[IND2], vdest.val[IND3]);
746+      vdestN.val[IND3] = vadd_u8(vdestN.val[IND3], vnp.val[IND3]);
747+
748+      vdest_val = png_ldr(uint32x2x4_t, &vdestN);
749+      vst4_lane_u32(png_ptr(uint32_t, np), vdest_val, 0);
750+
751+      rp += STEP_RGBA;
752+      pp += STEP_RGBA;
753+      np += STEP_RGBA;
754+      count -= STEP_RGBA;
755+   }
756+
757+   if (count >= STEP_RGBA_HALF) { /* at most one step of 2 pixels (8 bytes) per row */
758+      uint32x2x2_t vtmp1;
759+      uint8x8x2_t *vrpt1, *vppt1, *vnpt1;
760+      uint8x8x2_t vrp1, vpp1, vnp1;
761+      uint32x2x2_t *temp_pointer;
762+      uint32x2x2_t vdest_val1;
763+
764+      vtmp1 = vld2_u32(png_ptr(uint32_t, rp));
765+      vrpt1 = png_ptr(uint8x8x2_t, &vtmp1);
766+      vrp1 = *vrpt1;
767+      vtmp1 = vld2_u32(png_ptrc(uint32_t, pp));
768+      vppt1 = png_ptr(uint8x8x2_t, &vtmp1);
769+      vpp1 = *vppt1;
770+      vtmp1 = vld2_u32(png_ptrc(uint32_t, np));
771+      vnpt1 = png_ptr(uint8x8x2_t, &vtmp1);
772+      vnp1 = *vnpt1;
773+
774+      vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp1.val[0]);
775+      vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]);
776+      vdest.val[1] = vhadd_u8(vdest.val[0], vpp1.val[1]);
777+      vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]);
778+      vdest.val[IND3] = vdest.val[1]; /* keep the last decoded pixel as the next left neighbour */
779+      vdest_val1 = png_ldr(uint32x2x2_t, &vdest);
780+      vst2_lane_u32(png_ptr(uint32_t, rp), vdest_val1, 0);
781+
782+      vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]);
783+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp1.val[0]);
784+      vdestN.val[1] = vhadd_u8(vdestN.val[0], vdest.val[1]);
785+      vdestN.val[1] = vadd_u8(vdestN.val[1], vnp1.val[1]);
786+      vdestN.val[IND3] = vdestN.val[1]; /* same carry for the second row */
787+      vdest_val1 = png_ldr(uint32x2x2_t, &vdestN);
788+      vst2_lane_u32(png_ptr(uint32_t, np), vdest_val1, 0);
789+
790+      rp += STEP_RGBA_HALF;
791+      pp += STEP_RGBA_HALF;
792+      np += STEP_RGBA_HALF;
793+      count -= STEP_RGBA_HALF;
794+   }
795+   /* Nothing is left when rowbytes is a multiple of 8. */
796+   if (count == 0) {
797+      return;
798+   }
799+   /* The remaining bytes of each row are handled as one 4-byte pixel (only lane 0 is stored). */
800+   uint32x2_t vtmp2;
801+   uint8x8_t *vrpt2, *vppt2, *vnpt2;
802+   uint8x8_t vrp2, vpp2, vnp2;
803+   uint32x2_t *temp_pointer;
804+   uint32x2_t vdest_val2;
805+
806+   vtmp2 = vld1_u32(png_ptr(uint32_t, rp));
807+   vrpt2 = png_ptr(uint8x8_t, &vtmp2);
808+   vrp2 = *vrpt2;
809+   vtmp2 = vld1_u32(png_ptrc(uint32_t, pp));
810+   vppt2 = png_ptr(uint8x8_t, &vtmp2);
811+   vpp2 = *vppt2;
812+   vtmp2 = vld1_u32(png_ptrc(uint32_t, np));
813+   vnpt2 = png_ptr(uint8x8_t, &vtmp2);
814+   vnp2 = *vnpt2;
815+
816+   vdest.val[0] = vhadd_u8(vdest.val[IND3], vpp2);
817+   vdest.val[0] = vadd_u8(vdest.val[0], vrp2);
818+
819+   vdest_val2 = png_ldr(uint32x2_t, &vdest);
820+   vst1_lane_u32(png_ptr(uint32_t, rp), vdest_val2, 0);
821+
822+   vdestN.val[0] = vhadd_u8(vdestN.val[IND3], vdest.val[0]);
823+   vdestN.val[0] = vadd_u8(vdestN.val[0], vnp2);
824+
825+   vdest_val2 = png_ldr(uint32x2_t, &vdestN);
826+   vst1_lane_u32(png_ptr(uint32_t, np), vdest_val2, 0);
827+}
828+
829+static uint8x8_t paeth(uint8x8_t a, uint8x8_t b, uint8x8_t c)
830+{ /* Per byte lane: returns a if pa <= pb && pa <= pc, else b if pb <= pc, else c. */
831+   uint8x8_t d, e;
832+   uint16x8_t p1, pa, pb, pc;
833+   /* Distances are computed in widened 16-bit lanes so that a + b and 2 * c cannot wrap. */
834+   p1 = vaddl_u8(a, b); /* a + b */
835+   pc = vaddl_u8(c, c); /* c * 2 */
836+   pa = vabdl_u8(b, c); /* pa = |b - c| */
837+   pb = vabdl_u8(a, c); /* pb = |a - c| */
838+   pc = vabdq_u16(p1, pc); /* pc = |a + b - 2 * c| */
839+
840+   p1 = vcleq_u16(pa, pb); /* pa <= pb */
841+   pa = vcleq_u16(pa, pc); /* pa <= pc */
842+   pb = vcleq_u16(pb, pc); /* pb <= pc */
843+
844+   p1 = vandq_u16(p1, pa); /* pa <= pb && pa <= pc */
845+   /* Narrow the 16-bit comparison masks to 8-bit lane masks. */
846+   d = vmovn_u16(pb);
847+   e = vmovn_u16(p1);
848+
849+   d = vbsl_u8(d, b, c); /* b where pb <= pc, otherwise c */
850+   e = vbsl_u8(e, a, d); /* a where pa is the smallest, otherwise d */
851+
852+   return e;
853+}
854+
855+void png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row, /* Undo the Paeth filter in place on one row, using prev_row as the row above; pixels are stepped at OFFSET3-byte strides. */
856+   png_const_bytep prev_row)
857+{
858+   png_bytep rp = row;
859+   png_const_bytep pp = prev_row;
860+   png_bytep rp_stop = row + row_info->rowbytes; /* one past the last byte of the row */
861+
862+   uint8x16_t vtmp;
863+   uint8x8x2_t *vrpt;
864+   uint8x8x2_t vrp;
865+   uint8x8_t vlast = vdup_n_u8(0); /* paeth() 'c' input for the next pixel; zero before the first pixel */
866+   uint8x8x4_t vdest;
867+   vdest.val[IND3] = vdup_n_u8(0); /* paeth() 'a' input (previously decoded pixel); zero before the first pixel */
868+
869+   vtmp = vld1q_u8(rp); /* preload the first 16 bytes of the row */
870+   vrpt = png_ptr(uint8x8x2_t, &vtmp);
871+   vrp = *vrpt;
872+
873+   uint8x8x2_t *vppt;
874+   uint8x8x2_t vpp;
875+   uint8x8_t vtmp1, vtmp2, vtmp3;
876+   uint32x2_t *temp_pointer; /* not referenced directly here; presumably the scratch pointer the png_ldr macro assigns through - confirm against the macro */
877+
878+   png_debug(1, "in png_read_filter_row_paeth3_neon");
879+
880+   size_t tail_bytes = row_info->rowbytes % STEP_RGB; /* bytes left over after the whole STEP_RGB-byte groups */
881+   png_byte last_byte = *rp_stop; /* remember the byte just past the row; it is written back before returning */
882+   png_bytep rp_stop_new = rp_stop - tail_bytes; /* end of the part handled by the main loop */
883+   for (; rp < rp_stop_new; pp += STEP_RGB) /* rp is advanced inside the body, 4 x OFFSET3 per pass */
884+   {
885+      vtmp = vld1q_u8(pp); /* 16 bytes of the previous row */
886+      vppt = png_ptr(uint8x8x2_t, &vtmp);
887+      vpp = *vppt;
888+
889+      vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast); /* first pixel: left = last pixel decoded in the previous pass */
890+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
891+
892+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); /* bring the bytes starting at offset OFFSET3 down to lane 0 */
893+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
894+      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
895+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
896+
897+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
898+      vtmp3 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6);
899+      vdest.val[IND2] = paeth(vdest.val[1], vtmp3, vtmp2);
900+      vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp1);
901+
902+      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1); /* rotate the upper half by one byte; presumably lines the last pixel of the group up at lane 0 (assumes OFFSET3 == 3 - confirm) */
903+      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
904+
905+      vtmp = vld1q_u8(rp + STEP_RGB); /* preload the next 16 row bytes before the stores below modify the row */
906+      vrpt = png_ptr(uint8x8x2_t, &vtmp);
907+      vrp = *vrpt;
908+
909+      vdest.val[IND3] = paeth(vdest.val[IND2], vtmp2, vtmp3);
910+      vdest.val[IND3] = vadd_u8(vdest.val[IND3], vtmp1);
911+
912+      vlast = vtmp2; /* last previous-row pixel of this group is 'c' for the next pass */
913+
914+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); /* writes one 32-bit lane; rp then advances only OFFSET3 bytes */
915+      rp += OFFSET3;
916+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
917+      rp += OFFSET3;
918+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
919+      rp += OFFSET3;
920+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND3]), 0);
921+      rp += OFFSET3;
922+   }
923+
924+   vtmp = vld1q_u8(pp); /* previous-row bytes for the leftover pixels */
925+   vppt = png_ptr(uint8x8x2_t, &vtmp);
926+   vpp = *vppt;
927+
928+   if (tail_bytes == TAIL_RGB1) { /* this branch performs one decode-and-store step */
929+      vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast);
930+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
931+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
932+   } else if (tail_bytes == TAIL_RGB2) { /* this branch performs two decode-and-store steps */
933+      vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast);
934+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
935+
936+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
937+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
938+      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
939+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
940+
941+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
942+      rp += OFFSET3;
943+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
944+   } else if (tail_bytes == TAIL_RGB3) { /* this branch performs three decode-and-store steps */
945+      vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast);
946+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
947+
948+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
949+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
950+      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
951+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
952+
953+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
954+      vtmp3 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6);
955+      vdest.val[IND2] = paeth(vdest.val[1], vtmp3, vtmp2);
956+      vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp1);
957+
958+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
959+      rp += OFFSET3;
960+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
961+      rp += OFFSET3;
962+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
963+   }
964+   *rp_stop = last_byte; /* put back the byte saved before decoding */
965+}
966+
967+void png_read_filter_row_paeth3_x2_neon(png_row_infop row_info, png_bytep row, /* Undo the Paeth filter in place on two rows: 'row', and a second row starting rowbytes + 1 bytes after it that uses the decoded first row as its row above. */
968+   png_const_bytep prev_row)
969+{
970+   png_bytep rp = row;
971+   png_const_bytep pp = prev_row;
972+   png_bytep rp_stop = row + row_info->rowbytes; /* one past the last byte of the first row */
973+   png_bytep np = rp_stop + 1; /* second row's data; the skipped byte is presumably that row's filter-type byte - confirm with the caller */
974+
975+   uint8x16_t vtmp;
976+   uint8x8x2_t *vrpt;
977+   uint8x8x2_t vrp;
978+   uint8x8_t vlast = vdup_n_u8(0); /* first row: paeth() 'c' input, zero before the first pixel */
979+   uint8x8x4_t vdest;
980+   vdest.val[IND3] = vdup_n_u8(0); /* first row: paeth() 'a' input, zero before the first pixel */
981+
982+   vtmp = vld1q_u8(rp); /* preload the first 16 bytes of the first row */
983+   vrpt = png_ptr(uint8x8x2_t, &vtmp);
984+   vrp = *vrpt;
985+
986+   uint8x8x2_t *vppt;
987+   uint8x8x2_t vpp;
988+   uint8x8_t vtmp1, vtmp2, vtmp3;
989+   uint32x2_t *temp_pointer; /* not referenced directly here; presumably the scratch pointer the png_ldr macro assigns through - confirm against the macro */
990+
991+   uint8x8x2_t *vnpt;
992+   uint8x8x2_t vnp;
993+   uint8x8_t vlastN = vdup_n_u8(0); /* second row: paeth() 'c' input, zero before the first pixel */
994+   uint8x8x4_t vdestN;
995+   vdestN.val[IND3] = vdup_n_u8(0); /* second row: paeth() 'a' input, zero before the first pixel */
996+
997+   vtmp = vld1q_u8(np); /* preload the first 16 bytes of the second row */
998+   vnpt = png_ptr(uint8x8x2_t, &vtmp);
999+   vnp = *vnpt;
1000+
1001+   png_debug(1, "in png_read_filter_row_paeth3_x2_neon");
1002+
1003+   size_t tail_bytes = row_info->rowbytes % STEP_RGB; /* bytes left over after the whole STEP_RGB-byte groups */
1004+   png_byte last_byte = *rp_stop; /* byte just past the first row; written back before returning */
1005+   png_byte last_byte_next = *(rp_stop + row_info->rowbytes + 1); /* byte just past the second row; written back before returning */
1006+   png_bytep rp_stop_new = rp_stop - tail_bytes; /* end of the part handled by the main loop */
1007+
1008+   for (; rp < rp_stop_new; pp += STEP_RGB) /* rp and np are advanced inside the body, 4 x OFFSET3 per pass */
1009+   {
1010+      vtmp = vld1q_u8(pp); /* 16 bytes of the previous row */
1011+      vppt = png_ptr(uint8x8x2_t, &vtmp);
1012+      vpp = *vppt;
1013+
1014+      vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast); /* first row, first pixel of the group */
1015+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
1016+
1017+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3); /* bring the bytes starting at offset OFFSET3 down to lane 0 */
1018+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
1019+      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
1020+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
1021+
1022+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
1023+      vtmp3 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6);
1024+      vdest.val[IND2] = paeth(vdest.val[1], vtmp3, vtmp2);
1025+      vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp1);
1026+
1027+      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1); /* rotate the upper half by one byte; presumably lines the last pixel of the group up at lane 0 (assumes OFFSET3 == 3 - confirm) */
1028+      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
1029+
1030+      vtmp = vld1q_u8(rp + STEP_RGB); /* preload the next 16 bytes of the first row before the stores below modify it */
1031+      vrpt = png_ptr(uint8x8x2_t, &vtmp);
1032+      vrp = *vrpt;
1033+
1034+      vdest.val[IND3] = paeth(vdest.val[IND2], vtmp2, vtmp3);
1035+      vdest.val[IND3] = vadd_u8(vdest.val[IND3], vtmp1);
1036+
1037+      vlast = vtmp2; /* last previous-row pixel of this group is 'c' for the next pass */
1038+
1039+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0); /* writes one 32-bit lane; rp then advances only OFFSET3 bytes */
1040+      rp += OFFSET3;
1041+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
1042+      rp += OFFSET3;
1043+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
1044+      rp += OFFSET3;
1045+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND3]), 0);
1046+      rp += OFFSET3;
1047+
1048+      vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN); /* second row: the just-decoded first-row pixels (vdest) act as the row above */
1049+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
1050+
1051+      vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3);
1052+      vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]);
1053+      vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1);
1054+
1055+      vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET6);
1056+      vdestN.val[IND2] = paeth(vdestN.val[1], vdest.val[IND2], vdest.val[1]);
1057+      vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vtmp1);
1058+
1059+      vtmp1 = vext_u8(vnp.val[1], vnp.val[1], 1);
1060+
1061+      vtmp = vld1q_u8(np + STEP_RGB); /* preload the next 16 bytes of the second row before the stores below modify it */
1062+      vnpt = png_ptr(uint8x8x2_t, &vtmp);
1063+      vnp = *vnpt;
1064+
1065+      vdestN.val[IND3] = paeth(vdestN.val[IND2], vdest.val[IND3], vdest.val[IND2]);
1066+      vdestN.val[IND3] = vadd_u8(vdestN.val[IND3], vtmp1);
1067+
1068+      vlastN = vdest.val[IND3]; /* last decoded first-row pixel of this group is 'c' for the second row's next pass */
1069+
1070+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0);
1071+      np += OFFSET3;
1072+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0);
1073+      np += OFFSET3;
1074+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND2]), 0);
1075+      np += OFFSET3;
1076+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND3]), 0);
1077+      np += OFFSET3;
1078+   }
1079+
1080+   vtmp = vld1q_u8(pp); /* previous-row bytes for the leftover pixels */
1081+   vppt = png_ptr(uint8x8x2_t, &vtmp);
1082+   vpp = *vppt;
1083+
1084+   if (tail_bytes == TAIL_RGB1) { /* one decode-and-store step per row */
1085+      vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast);
1086+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
1087+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
1088+
1089+      vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN);
1090+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
1091+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0);
1092+   } else if (tail_bytes == TAIL_RGB2) { /* two decode-and-store steps per row */
1093+      vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast);
1094+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
1095+
1096+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
1097+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
1098+      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
1099+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
1100+
1101+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
1102+      rp += OFFSET3;
1103+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
1104+
1105+      vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN);
1106+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
1107+
1108+      vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3);
1109+      vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]);
1110+      vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1);
1111+
1112+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0);
1113+      np += OFFSET3;
1114+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0);
1115+   } else if (tail_bytes == TAIL_RGB3) { /* three decode-and-store steps per row */
1116+      vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast);
1117+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
1118+
1119+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET3);
1120+      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], OFFSET3);
1121+      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
1122+      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
1123+
1124+      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], OFFSET6);
1125+      vtmp3 = vext_u8(vpp.val[0], vpp.val[1], OFFSET6);
1126+      vdest.val[IND2] = paeth(vdest.val[1], vtmp3, vtmp2);
1127+      vdest.val[IND2] = vadd_u8(vdest.val[IND2], vtmp1);
1128+
1129+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[0]), 0);
1130+      rp += OFFSET3;
1131+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[1]), 0);
1132+      rp += OFFSET3;
1133+      vst1_lane_u32(png_ptr(uint32_t, rp), png_ldr(uint32x2_t, &vdest.val[IND2]), 0);
1134+
1135+      vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN);
1136+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
1137+
1138+      vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET3);
1139+      vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]);
1140+      vdestN.val[1] = vadd_u8(vdestN.val[1], vtmp1);
1141+
1142+      vtmp1 = vext_u8(vnp.val[0], vnp.val[1], OFFSET6);
1143+      vdestN.val[IND2] = paeth(vdestN.val[1], vdest.val[IND2], vdest.val[1]);
1144+      vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vtmp1);
1145+
1146+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[0]), 0);
1147+      np += OFFSET3;
1148+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[1]), 0);
1149+      np += OFFSET3;
1150+      vst1_lane_u32(png_ptr(uint32_t, np), png_ldr(uint32x2_t, &vdestN.val[IND2]), 0);
1151+   }
1152+   *rp_stop = last_byte; /* put back the byte saved just past the first row */
1153+   *(rp_stop + row_info->rowbytes + 1) = last_byte_next; /* put back the byte saved just past the second row */
1154+}
1155+
1156+void png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row, /* Undo the Paeth filter in place on one row, one 32-bit pixel per vector: STEP_RGBA-byte groups first, then a STEP_RGBA_HALF group, then a final single pixel. */
1157+   png_const_bytep prev_row)
1158+{
1159+   png_bytep rp = row;
1160+   int count = row_info->rowbytes; /* bytes still to decode */
1161+   png_const_bytep pp = prev_row;
1162+
1163+   uint8x8_t vlast = vdup_n_u8(0); /* paeth() 'c' input for the next pixel; zero before the first pixel */
1164+   uint8x8x4_t vdest;
1165+   vdest.val[IND3] = vdup_n_u8(0); /* paeth() 'a' input (previously decoded pixel); zero before the first pixel */
1166+
1167+   png_debug(1, "in png_read_filter_row_paeth4_neon");
1168+
1169+   uint32x2x4_t vtmp;
1170+   uint8x8x4_t *vrpt, *vppt;
1171+   uint8x8x4_t vrp, vpp;
1172+   uint32x2x4_t vdest_val;
1173+   while (count >= STEP_RGBA) { /* four pixels per pass */
1174+      uint32x2x4_t *temp_pointer; /* not referenced directly here; presumably the scratch pointer the png_ldr macro assigns through - confirm against the macro */
1175+      vtmp = vld4_u32(png_ptr(uint32_t, rp)); /* de-interleaving load: consecutive 32-bit words land in lane 0 of val[0..3] */
1176+      vrpt = png_ptr(uint8x8x4_t, &vtmp);
1177+      vrp = *vrpt;
1178+      vtmp = vld4_u32(png_ptrc(uint32_t, pp));
1179+      vppt = png_ptr(uint8x8x4_t, &vtmp);
1180+      vpp = *vppt;
1181+
1182+      vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast); /* left = last pixel decoded in the previous pass */
1183+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
1184+      vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]);
1185+      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
1186+      vdest.val[IND2] = paeth(vdest.val[1], vpp.val[IND2], vpp.val[1]);
1187+      vdest.val[IND2] = vadd_u8(vdest.val[IND2], vrp.val[IND2]);
1188+      vdest.val[IND3] = paeth(vdest.val[IND2], vpp.val[IND3], vpp.val[IND2]);
1189+      vdest.val[IND3] = vadd_u8(vdest.val[IND3], vrp.val[IND3]);
1190+
1191+      vlast = vpp.val[IND3]; /* last previous-row pixel of this group is 'c' for the next pass */
1192+
1193+      vdest_val = png_ldr(uint32x2x4_t, &vdest);
1194+      vst4_lane_u32(png_ptr(uint32_t, rp), vdest_val, 0); /* writes lane 0 of the four vectors back as consecutive words */
1195+
1196+      rp += STEP_RGBA;
1197+      pp += STEP_RGBA;
1198+      count -= STEP_RGBA;
1199+   }
1200+
1201+   if (count >= STEP_RGBA_HALF) { /* two more pixels */
1202+      uint32x2x2_t vtmp1;
1203+      uint8x8x2_t *vrpt1, *vppt1;
1204+      uint8x8x2_t vrp1, vpp1;
1205+      uint32x2x2_t *temp_pointer;
1206+      uint32x2x2_t vdest_val1;
1207+
1208+      vtmp1 = vld2_u32(png_ptr(uint32_t, rp));
1209+      vrpt1 = png_ptr(uint8x8x2_t, &vtmp1);
1210+      vrp1 = *vrpt1;
1211+      vtmp1 = vld2_u32(png_ptrc(uint32_t, pp));
1212+      vppt1 = png_ptr(uint8x8x2_t, &vtmp1);
1213+      vpp1 = *vppt1;
1214+
1215+      vdest.val[0] = paeth(vdest.val[IND3], vpp1.val[0], vlast);
1216+      vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]);
1217+      vdest.val[1] = paeth(vdest.val[0], vpp1.val[1], vpp1.val[0]);
1218+      vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]);
1219+      vlast = vpp1.val[1];
1220+
1221+      vdest_val1 = png_ldr(uint32x2x2_t, &vdest);
1222+      vst2_lane_u32(png_ptr(uint32_t, rp), vdest_val1, 0);
1223+      vdest.val[IND3] = vdest.val[1]; /* keep the last decoded pixel where the final step below reads its 'a' input */
1224+
1225+      rp += STEP_RGBA_HALF;
1226+      pp += STEP_RGBA_HALF;
1227+      count -= STEP_RGBA_HALF;
1228+   }
1229+
1230+   if (count == 0) {
1231+      return;
1232+   }
1233+
1234+   uint32x2_t vtmp2; /* from here on: one final pixel */
1235+   uint8x8_t *vrpt2, *vppt2;
1236+   uint8x8_t vrp2, vpp2;
1237+   uint32x2_t *temp_pointer;
1238+   uint32x2_t vdest_val2;
1239+
1240+   vtmp2 = vld1_u32(png_ptr(uint32_t, rp));
1241+   vrpt2 = png_ptr(uint8x8_t, &vtmp2);
1242+   vrp2 = *vrpt2;
1243+   vtmp2 = vld1_u32(png_ptrc(uint32_t, pp));
1244+   vppt2 = png_ptr(uint8x8_t, &vtmp2);
1245+   vpp2 = *vppt2;
1246+
1247+   vdest.val[0] = paeth(vdest.val[IND3], vpp2, vlast);
1248+   vdest.val[0] = vadd_u8(vdest.val[0], vrp2);
1249+
1250+   vdest_val2 = png_ldr(uint32x2_t, &vdest);
1251+   vst1_lane_u32(png_ptr(uint32_t, rp), vdest_val2, 0); /* only lane 0 (one 32-bit word) is written */
1252+}
1253+
1254+void png_read_filter_row_paeth4_x2_neon(png_row_infop row_info, png_bytep row, /* Undo the Paeth filter in place on two rows of 32-bit pixels: 'row', and a second row starting rowbytes + 1 bytes after it that uses the decoded first row as its row above. */
1255+   png_const_bytep prev_row)
1256+{
1257+   png_bytep rp = row;
1258+   int count = row_info->rowbytes; /* bytes still to decode in each row */
1259+   png_const_bytep pp = prev_row;
1260+   png_bytep np = row + row_info->rowbytes + 1; /* second row's data; the skipped byte is presumably that row's filter-type byte - confirm with the caller */
1261+
1262+   uint8x8_t vlast = vdup_n_u8(0); /* first row: paeth() 'c' input, zero before the first pixel */
1263+   uint8x8x4_t vdest;
1264+   vdest.val[IND3] = vdup_n_u8(0); /* first row: paeth() 'a' input, zero before the first pixel */
1265+
1266+   png_debug(1, "in png_read_filter_row_paeth4_x2_neon");
1267+
1268+   uint32x2x4_t vtmp;
1269+   uint8x8x4_t *vrpt, *vppt;
1270+   uint8x8x4_t vrp, vpp;
1271+   uint32x2x4_t vdest_val;
1272+
1273+   uint8x8x4_t *vnpt;
1274+   uint8x8x4_t vnp;
1275+   uint8x8_t vlastN = vdup_n_u8(0); /* second row: paeth() 'c' input, zero before the first pixel */
1276+   uint8x8x4_t vdestN;
1277+   vdestN.val[IND3] = vdup_n_u8(0); /* second row: paeth() 'a' input, zero before the first pixel */
1278+
1279+   while (count >= STEP_RGBA) { /* four pixels of each row per pass */
1280+      uint32x2x4_t *temp_pointer; /* not referenced directly here; presumably the scratch pointer the png_ldr macro assigns through - confirm against the macro */
1281+      vtmp = vld4_u32(png_ptr(uint32_t, rp)); /* de-interleaving load: consecutive 32-bit words land in lane 0 of val[0..3] */
1282+      vrpt = png_ptr(uint8x8x4_t, &vtmp);
1283+      vrp = *vrpt;
1284+      vtmp = vld4_u32(png_ptrc(uint32_t, pp));
1285+      vppt = png_ptr(uint8x8x4_t, &vtmp);
1286+      vpp = *vppt;
1287+      vtmp = vld4_u32(png_ptrc(uint32_t, np));
1288+      vnpt = png_ptr(uint8x8x4_t, &vtmp);
1289+      vnp = *vnpt;
1290+
1291+      vdest.val[0] = paeth(vdest.val[IND3], vpp.val[0], vlast); /* first row */
1292+      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
1293+      vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]);
1294+      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
1295+      vdest.val[IND2] = paeth(vdest.val[1], vpp.val[IND2], vpp.val[1]);
1296+      vdest.val[IND2] = vadd_u8(vdest.val[IND2], vrp.val[IND2]);
1297+      vdest.val[IND3] = paeth(vdest.val[IND2], vpp.val[IND3], vpp.val[IND2]);
1298+      vdest.val[IND3] = vadd_u8(vdest.val[IND3], vrp.val[IND3]);
1299+
1300+      vlast = vpp.val[IND3]; /* last previous-row pixel of this group is 'c' for the next pass */
1301+
1302+      vdest_val = png_ldr(uint32x2x4_t, &vdest);
1303+      vst4_lane_u32(png_ptr(uint32_t, rp), vdest_val, 0);
1304+
1305+      vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN); /* second row: the just-decoded first-row pixels (vdest) act as the row above */
1306+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp.val[0]);
1307+      vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]);
1308+      vdestN.val[1] = vadd_u8(vdestN.val[1], vnp.val[1]);
1309+      vdestN.val[IND2] = paeth(vdestN.val[1], vdest.val[IND2], vdest.val[1]);
1310+      vdestN.val[IND2] = vadd_u8(vdestN.val[IND2], vnp.val[IND2]);
1311+      vdestN.val[IND3] = paeth(vdestN.val[IND2], vdest.val[IND3], vdest.val[IND2]);
1312+      vdestN.val[IND3] = vadd_u8(vdestN.val[IND3], vnp.val[IND3]);
1313+
1314+      vlastN = vdest.val[IND3]; /* last decoded first-row pixel of this group is 'c' for the second row's next pass */
1315+
1316+      vdest_val = png_ldr(uint32x2x4_t, &vdestN);
1317+      vst4_lane_u32(png_ptr(uint32_t, np), vdest_val, 0);
1318+
1319+      rp += STEP_RGBA;
1320+      pp += STEP_RGBA;
1321+      np += STEP_RGBA;
1322+      count -= STEP_RGBA;
1323+   }
1324+
1325+   if (count >= STEP_RGBA_HALF) { /* two more pixels of each row */
1326+      uint32x2x2_t vtmp1;
1327+      uint8x8x2_t *vrpt1, *vppt1, *vnpt1;
1328+      uint8x8x2_t vrp1, vpp1, vnp1;
1329+      uint32x2x2_t *temp_pointer;
1330+      uint32x2x2_t vdest_val1;
1331+
1332+      vtmp1 = vld2_u32(png_ptr(uint32_t, rp));
1333+      vrpt1 = png_ptr(uint8x8x2_t, &vtmp1);
1334+      vrp1 = *vrpt1;
1335+      vtmp1 = vld2_u32(png_ptrc(uint32_t, pp));
1336+      vppt1 = png_ptr(uint8x8x2_t, &vtmp1);
1337+      vpp1 = *vppt1;
1338+      vtmp1 = vld2_u32(png_ptrc(uint32_t, np));
1339+      vnpt1 = png_ptr(uint8x8x2_t, &vtmp1);
1340+      vnp1 = *vnpt1;
1341+
1342+      vdest.val[0] = paeth(vdest.val[IND3], vpp1.val[0], vlast);
1343+      vdest.val[0] = vadd_u8(vdest.val[0], vrp1.val[0]);
1344+      vdest.val[1] = paeth(vdest.val[0], vpp1.val[1], vpp1.val[0]);
1345+      vdest.val[1] = vadd_u8(vdest.val[1], vrp1.val[1]);
1346+
1347+      vlast = vpp1.val[1];
1348+
1349+      vdest_val1 = png_ldr(uint32x2x2_t, &vdest);
1350+      vst2_lane_u32(png_ptr(uint32_t, rp), vdest_val1, 0);
1351+
1352+      vdest.val[IND3] = vdest.val[1]; /* keep the last decoded first-row pixel where the final step reads its 'a' input */
1353+
1354+      vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN);
1355+      vdestN.val[0] = vadd_u8(vdestN.val[0], vnp1.val[0]);
1356+      vdestN.val[1] = paeth(vdestN.val[0], vdest.val[1], vdest.val[0]);
1357+      vdestN.val[1] = vadd_u8(vdestN.val[1], vnp1.val[1]);
1358+
1359+      vlastN = vdest.val[1];
1360+
1361+      vdest_val1 = png_ldr(uint32x2x2_t, &vdestN);
1362+      vst2_lane_u32(png_ptr(uint32_t, np), vdest_val1, 0);
1363+
1364+      vdestN.val[IND3] = vdestN.val[1]; /* same bookkeeping for the second row */
1365+
1366+      rp += STEP_RGBA_HALF;
1367+      pp += STEP_RGBA_HALF;
1368+      np += STEP_RGBA_HALF;
1369+      count -= STEP_RGBA_HALF;
1370+   }
1371+
1372+   if (count == 0) {
1373+      return;
1374+   }
1375+
1376+   uint32x2_t vtmp2; /* from here on: one final pixel of each row */
1377+   uint8x8_t *vrpt2, *vppt2, *vnpt2;
1378+   uint8x8_t vrp2, vpp2, vnp2;
1379+   uint32x2_t *temp_pointer;
1380+   uint32x2_t vdest_val2;
1381+
1382+   vtmp2 = vld1_u32(png_ptr(uint32_t, rp));
1383+   vrpt2 = png_ptr(uint8x8_t, &vtmp2);
1384+   vrp2 = *vrpt2;
1385+   vtmp2 = vld1_u32(png_ptrc(uint32_t, pp));
1386+   vppt2 = png_ptr(uint8x8_t, &vtmp2);
1387+   vpp2 = *vppt2;
1388+   vtmp2 = vld1_u32(png_ptrc(uint32_t, np));
1389+   vnpt2 = png_ptr(uint8x8_t, &vtmp2);
1390+   vnp2 = *vnpt2;
1391+
1392+   vdest.val[0] = paeth(vdest.val[IND3], vpp2, vlast);
1393+   vdest.val[0] = vadd_u8(vdest.val[0], vrp2);
1394+
1395+   vdest_val2 = png_ldr(uint32x2_t, &vdest);
1396+   vst1_lane_u32(png_ptr(uint32_t, rp), vdest_val2, 0); /* only lane 0 (one 32-bit word) is written */
1397+
1398+   vdestN.val[0] = paeth(vdestN.val[IND3], vdest.val[0], vlastN);
1399+   vdestN.val[0] = vadd_u8(vdestN.val[0], vnp2);
1400+
1401+   vdest_val2 = png_ldr(uint32x2_t, &vdestN);
1402+   vst1_lane_u32(png_ptr(uint32_t, np), vdest_val2, 0);
1403+}
1404+#endif /* PNG_MULTY_LINE_ENABLE */
1405 #endif /* PNG_ARM_NEON_OPT > 0 */
1406 #endif /* PNG_ARM_NEON_IMPLEMENTATION == 1 (intrinsics) */
1407 #endif /* READ */
1408diff --git a/pngpread.c b/pngpread.c
1409index e283627b7..bb12f61ea 100644
1410--- a/pngpread.c
1411+++ b/pngpread.c
1412@@ -264,9 +264,22 @@ png_push_read_chunk(png_structrp png_ptr, png_inforp info_ptr)
1413       png_ptr->idat_size = png_ptr->push_length;
1414       png_ptr->process_mode = PNG_READ_IDAT_MODE;
1415       png_push_have_info(png_ptr, info_ptr);
1416-      png_ptr->zstream.avail_out =
1417-          (uInt) PNG_ROWBYTES(png_ptr->pixel_depth,
1418-          png_ptr->iwidth) + 1;
1419+#ifdef PNG_MULTY_LINE_ENABLE
1420+      // OH ISSUE: png optimize
1421+      if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 &&
1422+         (png_ptr->transformations & PNG_CHECK) == 0) {
1423+         int rest = png_ptr->num_rows - png_ptr->row_number;
1424+         int row_num = rest < PNG_INFLATE_ROWS ? rest : PNG_INFLATE_ROWS;
1425+         png_ptr->zstream.avail_out = (uInt)(PNG_ROWBYTES(png_ptr->pixel_depth,
1426+             png_ptr->iwidth) + 1) * row_num;
1427+      }
1428+      else
1429+#endif
1430+      {
1431+         png_ptr->zstream.avail_out =
1432+            (uInt) PNG_ROWBYTES(png_ptr->pixel_depth,
1433+            png_ptr->iwidth) + 1;
1434+      }
1435       png_ptr->zstream.next_out = png_ptr->row_buf;
1436       return;
1437    }
1438@@ -623,6 +636,92 @@ png_push_read_IDAT(png_structrp png_ptr)
1439    }
1440 }
1441 
1442+#ifdef PNG_MULTY_LINE_ENABLE
1443+// OH ISSUE: png optimize
1444+static void png_push_process_row_x2(png_structrp png_ptr, /* Handle the row at row_buf and the row stored right after it with a single png_read_filter_row call, then transform and deliver each of the two rows. */
1445+   png_row_info row_info_in)
1446+{
1447+   png_debug(1, "in png_push_process_row_x2");
1448+   png_row_info row_info = row_info_in; /* local copy; the calls below receive a pointer to it */
1449+   png_read_filter_row(png_ptr, &row_info, png_ptr->row_buf + 1,
1450+      png_ptr->prev_row + 1, png_ptr->row_buf[0] + 4); /* filter byte + 4: presumably maps to the matching *_X2 two-row filter value - confirm against the PNG_FILTER_VALUE_*_X2 definitions */
1451+
1452+#ifdef PNG_READ_TRANSFORMS_SUPPORTED
1453+   if (png_ptr->transformations != 0)
1454+      png_do_read_transformations(png_ptr, &row_info);
1455+#endif
1456+
1457+   if (png_ptr->transformed_pixel_depth == 0)
1458+   {
1459+      png_ptr->transformed_pixel_depth = row_info.pixel_depth; /* recorded once, the first time a row is seen */
1460+      if (row_info.pixel_depth > png_ptr->maximum_pixel_depth)
1461+         png_error(png_ptr, "progressive row overflow");
1462+   }
1463+
1464+   png_push_have_row(png_ptr, png_ptr->row_buf + 1); /* deliver the first row */
1465+   png_read_push_finish_row(png_ptr);
1466+
1467+   png_ptr->row_buf = png_ptr->row_buf + png_ptr->rowbytes + 1; /* step to the second row; row_buf is left here on return */
1468+
1469+   // second row: refresh prev_row, transform and deliver (png_read_filter_row is not called again)
1470+   if (png_ptr->transformations != 0)
1471+   {
1472+      memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info.rowbytes + 1); /* copy taken before the transformations below run on the row */
1473+   }
1474+   else
1475+   {
1476+      png_ptr->prev_row = png_ptr->row_buf; /* no transformations: point prev_row at the row instead of copying */
1477+   }
1478+#ifdef PNG_READ_TRANSFORMS_SUPPORTED
1479+   if (png_ptr->transformations != 0)
1480+      png_do_read_transformations(png_ptr, &row_info);
1481+#endif
1482+
1483+   png_push_have_row(png_ptr, png_ptr->row_buf + 1); /* deliver the second row */
1484+   png_read_push_finish_row(png_ptr);
1485+}
1486+
1487+static void png_push_process_multi_rows(png_structrp png_ptr, int row_num) /* Process row_num rows stored back to back starting at row_buf; two adjacent rows with the same eligible filter byte go through png_push_process_row_x2. */
1488+{
1489+   png_debug(1, "in png_push_process_multi_rows");
1490+   uInt row_bytes =  png_ptr->rowbytes + 1; /* distance between consecutive rows: rowbytes plus the leading filter byte */
1491+
1492+   png_row_info row_info; /* handed by value to png_push_process_row_x2 */
1493+   row_info.width = png_ptr->iwidth;
1494+   row_info.color_type = png_ptr->color_type;
1495+   row_info.bit_depth = png_ptr->bit_depth;
1496+   row_info.channels = png_ptr->channels;
1497+   row_info.pixel_depth = png_ptr->pixel_depth;
1498+   row_info.rowbytes = png_ptr->rowbytes;
1499+
1500+   png_bytep temp_row = png_ptr->row_buf; /* saved so row_buf can be restored after the loop */
1501+   png_bytep temp_prev_row = png_ptr->prev_row; /* saved so prev_row can be restored after the loop */
1502+
1503+   for (int i = 0; i < row_num; i++) {
1504+      // check if the x2_filter is effective: only supports channels 3 or 4
1505+      if ((png_ptr->channels == 3 || png_ptr->channels == 4) &&
1506+          i < row_num -1 && png_ptr->row_buf[0] > PNG_FILTER_VALUE_SUB && /* a following row must exist; filter values up to PNG_FILTER_VALUE_SUB are excluded */
1507+          png_ptr->row_buf[0] < PNG_FILTER_VALUE_LAST &&
1508+          png_ptr->row_buf[0] == png_ptr->row_buf[row_bytes]) /* this row and the next carry the same filter byte */
1509+      {
1510+         png_push_process_row_x2(png_ptr, row_info);
1511+         png_ptr->row_buf = png_ptr->row_buf + row_bytes; /* the callee already stepped onto the second row; step past it */
1512+         i++; /* two rows were consumed */
1513+         continue;
1514+      }
1515+      png_push_process_row(png_ptr);
1516+      png_ptr->row_buf = png_ptr->row_buf + row_bytes;
1517+   }
1518+
1519+   if (png_ptr->transformations == 0 && png_ptr->interlaced == 0)
1520+   {
1521+      png_ptr->prev_row = temp_prev_row; /* restore the pointer saved on entry */
1522+      memcpy(png_ptr->prev_row, png_ptr->row_buf - row_bytes, row_bytes); /* copy the last processed row into it; NOTE(review): reads before the buffer if row_num is 0 - confirm callers never pass 0 */
1523+   }
1524+   png_ptr->row_buf = temp_row; /* back to the start of the batch buffer */
1525+}
1526+#endif
1527+
1528 void /* PRIVATE */
1529 png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer,
1530     size_t buffer_length)
1531@@ -639,6 +738,17 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer,
1532    /* TODO: WARNING: TRUNCATION ERROR: DANGER WILL ROBINSON: */
1533    png_ptr->zstream.avail_in = (uInt)buffer_length;
1534 
1535+#ifdef PNG_MULTY_LINE_ENABLE
1536+   // OH ISSUE: png optimize
1537+   int row_num = 1;
1538+   if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 &&
1539+       (png_ptr->transformations & PNG_CHECK) == 0)
1540+   {
1541+      int rest = png_ptr->num_rows - png_ptr->row_number;
1542+      row_num = rest < PNG_INFLATE_ROWS ? rest : PNG_INFLATE_ROWS;
1543+   }
1544+#endif
1545+
1546    /* Keep going until the decompressed data is all processed
1547     * or the stream marked as finished.
1548     */
1549@@ -655,9 +765,20 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer,
1550       if (!(png_ptr->zstream.avail_out > 0))
1551       {
1552          /* TODO: WARNING: TRUNCATION ERROR: DANGER WILL ROBINSON: */
1553+#ifdef PNG_MULTY_LINE_ENABLE
1554+         // OH ISSUE: png optimize
1555+         if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 &&
1556+             (png_ptr->transformations & PNG_CHECK) == 0)
1557+         {
1558+            int rest = png_ptr->num_rows - png_ptr->row_number;
1559+            row_num = rest < PNG_INFLATE_ROWS ? rest : PNG_INFLATE_ROWS;
1560+         }
1561+         png_ptr->zstream.avail_out = (uInt)(PNG_ROWBYTES(png_ptr->pixel_depth,
1562+             png_ptr->iwidth) + 1) * row_num;
1563+#else
1564          png_ptr->zstream.avail_out = (uInt)(PNG_ROWBYTES(png_ptr->pixel_depth,
1565              png_ptr->iwidth) + 1);
1566-
1567+#endif
1568          png_ptr->zstream.next_out = png_ptr->row_buf;
1569       }
1570 
1571@@ -719,7 +840,12 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer,
1572 
1573          /* Do we have a complete row? */
1574          if (png_ptr->zstream.avail_out == 0)
1575+#ifdef PNG_MULTY_LINE_ENABLE
1576+            // OH ISSUE: png optimize
1577+            png_push_process_multi_rows(png_ptr, row_num);
1578+#else
1579             png_push_process_row(png_ptr);
1580+#endif
1581       }
1582 
1583       /* And check for the end of the stream. */
1584@@ -738,6 +864,7 @@ png_process_IDAT_data(png_structrp png_ptr, png_bytep buffer,
1585 void /* PRIVATE */
1586 png_push_process_row(png_structrp png_ptr)
1587 {
1588+   png_debug(1, "in png_push_process_row");
1589    /* 1.5.6: row_info moved out of png_struct to a local here. */
1590    png_row_info row_info;
1591 
1592@@ -762,8 +889,17 @@ png_push_process_row(png_structrp png_ptr)
1593     * it may not be in the future, so this was changed just to copy the
1594     * interlaced row count:
1595     */
1596-   memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info.rowbytes + 1);
1597-
1598+#ifdef PNG_MULTY_LINE_ENABLE
1599+   // OH ISSUE: png optimize
1600+   if (png_ptr->transformations == 0 && png_ptr->interlaced == 0)
1601+   {
1602+      png_ptr->prev_row = png_ptr->row_buf;
1603+   }
1604+   else
1605+#endif
1606+   {
1607+      memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info.rowbytes + 1);
1608+   }
1609 #ifdef PNG_READ_TRANSFORMS_SUPPORTED
1610    if (png_ptr->transformations != 0)
1611       png_do_read_transformations(png_ptr, &row_info);
1612diff --git a/pngpriv.h b/pngpriv.h
1613index fb521cf00..81300fbd8 100644
1614--- a/pngpriv.h
1615+++ b/pngpriv.h
1616@@ -189,6 +189,19 @@
1617 #     define PNG_ARM_NEON_IMPLEMENTATION 0
1618 #endif /* PNG_ARM_NEON_OPT > 0 */
1619 
1620+#if defined(PNG_ARM_NEON_IMPLEMENTATION) && defined(PNG_ARM_NEON)
1621+// OH ISSUE: png optimize
1622+#  if PNG_ARM_NEON_IMPLEMENTATION == 1
1623+#    define PNG_MULTY_LINE_ENABLE /* switches on the multi-row inflate/defilter code guarded by this macro */
1624+#    define PNG_INFLATE_MAX_SIZE (65536) /* assigned to IDAT_read_size in png_create_read_struct_2 when multi-row is enabled */
1625+#    define PNG_INFLATE_ROWS (50) /* upper bound on the number of rows requested from inflate in one batch */
1626+#    define PNG_CHECK (PNG_EXPAND | PNG_STRIP_ALPHA | PNG_RGB_TO_GRAY | PNG_ENCODE_ALPHA | \
1627+       PNG_PACKSWAP | PNG_GRAY_TO_RGB | PNG_COMPOSE | PNG_SCALE_16_TO_8 | PNG_16_TO_8 | \
1628+       PNG_BACKGROUND_EXPAND | PNG_EXPAND_16 | PNG_PACK | PNG_ADD_ALPHA | PNG_EXPAND_tRNS | \
1629+       PNG_RGB_TO_GRAY_ERR | PNG_RGB_TO_GRAY_WARN | PNG_FILLER | PNG_USER_TRANSFORM) /* transformation flags; the multi-row path is taken only when none of them is set */
1630+#  endif
1631+#endif
1632+
1633 #ifndef PNG_MIPS_MSA_OPT
1634 #  if defined(__mips_msa) && (__mips_isa_rev >= 5) && defined(PNG_ALIGNED_MEMORY_SUPPORTED)
1635 #     define PNG_MIPS_MSA_OPT 2
1636@@ -351,8 +364,14 @@
1637 #endif
1638 
1639 #ifndef PNG_INTERNAL_FUNCTION
1640+// OH ISSUE: png optimize
1641+#  ifdef PNG_MULTY_LINE_ENABLE
1642+#    define PNG_HIDE __attribute__((visibility("hidden")))
1643+#  else
1644+#    define PNG_HIDE
1645+#  endif
1646 #  define PNG_INTERNAL_FUNCTION(type, name, args, attributes)\
1647-      PNG_LINKAGE_FUNCTION PNG_FUNCTION(type, name, args, PNG_EMPTY attributes)
1648+      PNG_LINKAGE_FUNCTION PNG_FUNCTION(type, name, args, PNG_HIDE attributes)
1649 #endif
1650 
1651 #ifndef PNG_INTERNAL_CALLBACK
1652@@ -1304,6 +1323,19 @@ PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth3_neon,(png_row_infop
1653     row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
1654 PNG_INTERNAL_FUNCTION(void,png_read_filter_row_paeth4_neon,(png_row_infop
1655     row_info, png_bytep row, png_const_bytep prev_row),PNG_EMPTY);
1656+#ifdef PNG_MULTY_LINE_ENABLE
1657+// OH ISSUE: png optimize. Two-row ("x2") NEON filters; png_init_filter_functions_neon stores them in read_filter[PNG_FILTER_VALUE_*_X2-1].
1658+PNG_INTERNAL_FUNCTION(void, png_read_filter_row_up_x2_neon, (png_row_infop
1659+    row_info, png_bytep row, png_const_bytep prev_row), PNG_EMPTY);
1660+PNG_INTERNAL_FUNCTION(void, png_read_filter_row_avg3_x2_neon, (png_row_infop
1661+    row_info, png_bytep row, png_const_bytep prev_row), PNG_EMPTY);
1662+PNG_INTERNAL_FUNCTION(void, png_read_filter_row_avg4_x2_neon, (png_row_infop
1663+    row_info, png_bytep row, png_const_bytep prev_row), PNG_EMPTY);
1664+PNG_INTERNAL_FUNCTION(void, png_read_filter_row_paeth3_x2_neon, (png_row_infop
1665+    row_info, png_bytep row, png_const_bytep prev_row), PNG_EMPTY);
1666+PNG_INTERNAL_FUNCTION(void, png_read_filter_row_paeth4_x2_neon, (png_row_infop
1667+    row_info, png_bytep row, png_const_bytep prev_row), PNG_EMPTY);
1668+#endif
1669 #endif
1670 
1671 #if PNG_MIPS_MSA_OPT > 0
1672diff --git a/pngread.c b/pngread.c
1673index 8fa7d9f16..ed5a25307 100644
1674--- a/pngread.c
1675+++ b/pngread.c
1676@@ -54,7 +54,12 @@ png_create_read_struct_2,(png_const_charp user_png_ver, png_voidp error_ptr,
1677        * required (it will be zero in a write structure.)
1678        */
1679 #     ifdef PNG_SEQUENTIAL_READ_SUPPORTED
1680+#ifdef PNG_MULTY_LINE_ENABLE
1681+         // OH ISSUE: png optimize
1682+         png_ptr->IDAT_read_size = PNG_INFLATE_MAX_SIZE;
1683+#else
1684          png_ptr->IDAT_read_size = PNG_IDAT_READ_SIZE;
1685+#endif
1686 #     endif
1687 
1688 #     ifdef PNG_BENIGN_READ_ERRORS_SUPPORTED
1689@@ -684,6 +689,224 @@ png_read_rows(png_structrp png_ptr, png_bytepp row,
1690 #endif /* SEQUENTIAL_READ */
1691 
1692 #ifdef PNG_SEQUENTIAL_READ_SUPPORTED
1693+
1694+#ifdef PNG_MULTY_LINE_ENABLE
1695+// OH ISSUE: png optimize. Handle rows i and i+1 together: one png_read_filter_row call with the x2 filter id, then the per-row steps for each.
1696+static void png_read_two_rows(png_structrp png_ptr, png_bytepp rows, png_uint_32 i,
1697+                         png_row_info row_info)
1698+{
1699+   png_debug1(1, "in png_read_two_rows %d", png_ptr->row_buf[0]);
1700+   png_read_filter_row(png_ptr, &row_info, png_ptr->row_buf + 1,
1701+      png_ptr->prev_row + 1, png_ptr->row_buf[0] + 4); /* filter + 4 == the PNG_FILTER_VALUE_*_X2 value */
1702+
1703+#ifdef PNG_MNG_FEATURES_SUPPORTED
1704+   if ((png_ptr->mng_features_permitted & PNG_FLAG_MNG_FILTER_64) != 0 &&
1705+      (png_ptr->filter_type == PNG_INTRAPIXEL_DIFFERENCING))
1706+   {
1707+      /* Intrapixel differencing */
1708+      png_do_read_intrapixel(&row_info, png_ptr->row_buf + 1);
1709+   }
1710+#endif
1711+
1712+#ifdef PNG_READ_TRANSFORMS_SUPPORTED
1713+   if (png_ptr->transformations
1714+#       ifdef PNG_CHECK_FOR_INVALID_INDEX_SUPPORTED
1715+         || png_ptr->num_palette_max >= 0
1716+#       endif
1717+      )
1718+      png_do_read_transformations(png_ptr, &row_info);
1719+#endif
1720+
1721+   /* The transformed pixel depth should match the depth now in row_info. */
1722+   if (png_ptr->transformed_pixel_depth == 0)
1723+   {
1724+      png_ptr->transformed_pixel_depth = row_info.pixel_depth;
1725+      if (row_info.pixel_depth > png_ptr->maximum_pixel_depth)
1726+         png_error(png_ptr, "sequential row overflow");
1727+   }
1728+
1729+   else if (png_ptr->transformed_pixel_depth != row_info.pixel_depth)
1730+      png_error(png_ptr, "internal sequential row size calculation error");
1731+
1732+   if (rows[i] != NULL)
1733+      png_combine_row(png_ptr, rows[i], -1);
1734+
1735+   png_read_finish_row(png_ptr);
1736+
1737+   if (png_ptr->read_row_fn != NULL)
1738+      (*(png_ptr->read_row_fn))(png_ptr, png_ptr->row_number, png_ptr->pass);
1739+
1740+   png_ptr->row_buf = png_ptr->row_buf + row_info.rowbytes + 1; /* step over filter byte + data to row i+1 */
1741+
1742+   // row_buf now points at row i+1: keep a copy of it as prev_row, then repeat the per-row steps for it
1743+   memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info.rowbytes + 1);
1744+
1745+#ifdef PNG_MNG_FEATURES_SUPPORTED
1746+   if ((png_ptr->mng_features_permitted & PNG_FLAG_MNG_FILTER_64) != 0 &&
1747+      (png_ptr->filter_type == PNG_INTRAPIXEL_DIFFERENCING))
1748+   {
1749+      /* Intrapixel differencing */
1750+      png_do_read_intrapixel(&row_info, png_ptr->row_buf + 1);
1751+   }
1752+#endif
1753+
1754+#ifdef PNG_READ_TRANSFORMS_SUPPORTED
1755+   if (png_ptr->transformations
1756+#       ifdef PNG_CHECK_FOR_INVALID_INDEX_SUPPORTED
1757+         || png_ptr->num_palette_max >= 0
1758+#       endif
1759+      )
1760+      png_do_read_transformations(png_ptr, &row_info);
1761+#endif
1762+
1763+   /* The transformed pixel depth should match the depth now in row_info. */
1764+   if (png_ptr->transformed_pixel_depth == 0)
1765+   {
1766+      png_ptr->transformed_pixel_depth = row_info.pixel_depth;
1767+      if (row_info.pixel_depth > png_ptr->maximum_pixel_depth)
1768+         png_error(png_ptr, "sequential row overflow");
1769+   }
1770+
1771+   else if (png_ptr->transformed_pixel_depth != row_info.pixel_depth)
1772+      png_error(png_ptr, "internal sequential row size calculation error");
1773+
1774+   if (rows[i+1] != NULL)
1775+      png_combine_row(png_ptr, rows[i+1], -1);
1776+
1777+   png_read_finish_row(png_ptr);
1778+
1779+   if (png_ptr->read_row_fn != NULL)
1780+      (*(png_ptr->read_row_fn))(png_ptr, png_ptr->row_number, png_ptr->pass);
1781+
1782+   png_ptr->row_buf = png_ptr->row_buf + row_info.rowbytes + 1; /* on return row_buf has advanced past both rows */
1783+}
1784+/* Read row_num rows of IDAT data into row_buf in one call, then defilter, transform and deliver each one to rows[0..row_num-1]. */
1785+static void png_read_muilty_rows(png_structrp png_ptr, png_bytepp rows,
1786+   png_uint_32 row_num, png_row_info row_info_in)
1787+{
1788+   if (png_ptr == NULL)
1789+      return;
1790+
1791+   png_debug2(1, "in png_read_muilty_rows (row %lu, pass %d)",
1792+       (unsigned long)png_ptr->row_number, png_ptr->pass);
1793+
1794+   if ((png_ptr->mode & PNG_HAVE_IDAT) == 0)
1795+         png_error(png_ptr, "Invalid attempt to read row data");
1796+
1797+   /* Fill row_buf with row_num consecutive rows (filter byte + row data each) of IDAT data: */
1798+   uInt row_bytes =  row_info_in.rowbytes;
1799+   png_ptr->row_buf[0]=255; /* 255 to force error if no data was found */
1800+   png_read_IDAT_data(png_ptr, png_ptr->row_buf, (row_bytes + 1) * row_num);
1801+   png_bytep temp_row = png_ptr->row_buf; /* row_buf is walked through the buffer below and restored at the end */
1802+
1803+   for (png_uint_32 i = 0; i < row_num; i++) {
1804+      png_row_info row_info = row_info_in;
1805+      // Two-row (x2) path: 3 or 4 channels only, a next row exists, and both rows use the same UP/AVG/PAETH filter
1806+      if ((row_info_in.channels == 3 || row_info_in.channels == 4) &&
1807+          i < row_num -1 && png_ptr->row_buf[0] > PNG_FILTER_VALUE_SUB &&
1808+          png_ptr->row_buf[0] < PNG_FILTER_VALUE_LAST &&
1809+          png_ptr->row_buf[0] == png_ptr->row_buf[row_info_in.rowbytes + 1])
1810+      {
1811+         png_read_two_rows(png_ptr, rows, i, row_info);
1812+         i++; /* png_read_two_rows consumed rows i and i+1 and advanced row_buf past both */
1813+         continue;
1814+      }
1815+      if (png_ptr->row_buf[0] > PNG_FILTER_VALUE_NONE)
1816+      {
1817+         if (png_ptr->row_buf[0] < PNG_FILTER_VALUE_LAST)
1818+            png_read_filter_row(png_ptr, &row_info, png_ptr->row_buf + 1,
1819+               png_ptr->prev_row + 1, png_ptr->row_buf[0]);
1820+         else /* invalid filter byte (or the 255 sentinel): fail instead of passing unfiltered garbage on */
1821+            png_error(png_ptr, "bad adaptive filter value");
1822+      }
1823+
1824+      memcpy(png_ptr->prev_row, png_ptr->row_buf, row_info_in.rowbytes + 1);
1825+
1826+#ifdef PNG_MNG_FEATURES_SUPPORTED
1827+      if ((png_ptr->mng_features_permitted & PNG_FLAG_MNG_FILTER_64) != 0 &&
1828+         (png_ptr->filter_type == PNG_INTRAPIXEL_DIFFERENCING))
1829+      {
1830+         /* Intrapixel differencing */
1831+         png_do_read_intrapixel(&row_info, png_ptr->row_buf + 1);
1832+      }
1833+#endif
1834+
1835+#ifdef PNG_READ_TRANSFORMS_SUPPORTED
1836+      if (png_ptr->transformations
1837+#        ifdef PNG_CHECK_FOR_INVALID_INDEX_SUPPORTED
1838+            || png_ptr->num_palette_max >= 0
1839+#        endif
1840+         )
1841+         png_do_read_transformations(png_ptr, &row_info);
1842+#endif
1843+
1844+      /* The transformed pixel depth should match the depth now in row_info. */
1845+      if (png_ptr->transformed_pixel_depth == 0)
1846+      {
1847+         png_ptr->transformed_pixel_depth = row_info.pixel_depth;
1848+         if (row_info.pixel_depth > png_ptr->maximum_pixel_depth)
1849+            png_error(png_ptr, "sequential row overflow");
1850+      }
1851+
1852+      else if (png_ptr->transformed_pixel_depth != row_info.pixel_depth)
1853+         png_error(png_ptr, "internal sequential row size calculation error");
1854+
1855+      if (rows[i] != NULL)
1856+         png_combine_row(png_ptr, rows[i], -1);
1857+
1858+      png_read_finish_row(png_ptr);
1859+
1860+      if (png_ptr->read_row_fn != NULL)
1861+         (*(png_ptr->read_row_fn))(png_ptr, png_ptr->row_number, png_ptr->pass);
1862+
1863+      png_ptr->row_buf = png_ptr->row_buf + row_bytes + 1; /* step to the next row in the batch */
1864+   }
1865+   png_ptr->row_buf = temp_row; /* rewind to the start of the buffer for the next batch */
1866+}
1867+/* Warn once per call about transforms requested on png_ptr whose PNG_READ_*_SUPPORTED half was compiled out. */
1868+static void png_warn_check(png_structrp png_ptr)
1869+{
1870+#ifdef PNG_WARNINGS_SUPPORTED
1871+   /* Check for transforms that have been set but were defined out */
1872+#if defined(PNG_WRITE_INVERT_SUPPORTED) && !defined(PNG_READ_INVERT_SUPPORTED)
1873+   if ((png_ptr->transformations & PNG_INVERT_MONO) != 0)
1874+      png_warning(png_ptr, "PNG_READ_INVERT_SUPPORTED is not defined");
1875+#endif
1876+
1877+#if defined(PNG_WRITE_FILLER_SUPPORTED) && !defined(PNG_READ_FILLER_SUPPORTED)
1878+   if ((png_ptr->transformations & PNG_FILLER) != 0)
1879+      png_warning(png_ptr, "PNG_READ_FILLER_SUPPORTED is not defined");
1880+#endif
1881+
1882+#if defined(PNG_WRITE_PACKSWAP_SUPPORTED) && \
1883+    !defined(PNG_READ_PACKSWAP_SUPPORTED)
1884+   if ((png_ptr->transformations & PNG_PACKSWAP) != 0)
1885+      png_warning(png_ptr, "PNG_READ_PACKSWAP_SUPPORTED is not defined");
1886+#endif
1887+
1888+#if defined(PNG_WRITE_PACK_SUPPORTED) && !defined(PNG_READ_PACK_SUPPORTED)
1889+   if ((png_ptr->transformations & PNG_PACK) != 0)
1890+      png_warning(png_ptr, "PNG_READ_PACK_SUPPORTED is not defined");
1891+#endif
1892+
1893+#if defined(PNG_WRITE_SHIFT_SUPPORTED) && !defined(PNG_READ_SHIFT_SUPPORTED)
1894+   if ((png_ptr->transformations & PNG_SHIFT) != 0)
1895+      png_warning(png_ptr, "PNG_READ_SHIFT_SUPPORTED is not defined");
1896+#endif
1897+
1898+#if defined(PNG_WRITE_BGR_SUPPORTED) && !defined(PNG_READ_BGR_SUPPORTED)
1899+   if ((png_ptr->transformations & PNG_BGR) != 0)
1900+      png_warning(png_ptr, "PNG_READ_BGR_SUPPORTED is not defined");
1901+#endif
1902+
1903+#if defined(PNG_WRITE_SWAP_SUPPORTED) && !defined(PNG_READ_SWAP_SUPPORTED)
1904+   if ((png_ptr->transformations & PNG_SWAP_BYTES) != 0)
1905+      png_warning(png_ptr, "PNG_READ_SWAP_SUPPORTED is not defined");
1906+#endif
1907+#endif /* WARNINGS */
1908+}
1909+#endif // PNG_MULTY_LINE_ENABLE
1910+
1911 /* Read the entire image.  If the image has an alpha channel or a tRNS
1912  * chunk, and you have called png_handle_alpha()[*], you will need to
1913  * initialize the image to the current image that PNG will be overlaying.
1914@@ -745,13 +968,45 @@ png_read_image(png_structrp png_ptr, png_bytepp image)
1915 
1916    image_height=png_ptr->height;
1917 
1918-   for (j = 0; j < pass; j++)
1919-   {
1920+#ifdef PNG_MULTY_LINE_ENABLE
1921+   // OH ISSUE: png optimize
1922+   if (png_ptr->interlaced == 0 && png_ptr->bit_depth == 8 &&
1923+       (png_ptr->transformations & PNG_CHECK) == 0) {
1924+      if ((png_ptr->flags & PNG_FLAG_ROW_INIT) == 0)
1925+         png_read_start_row(png_ptr);
1926+
1927+      png_warn_check(png_ptr);
1928+      png_row_info row_info;
1929+      row_info.width = png_ptr->iwidth;
1930+      row_info.color_type = png_ptr->color_type;
1931+      row_info.bit_depth = png_ptr->bit_depth;
1932+      row_info.channels = png_ptr->channels;
1933+      row_info.pixel_depth = png_ptr->pixel_depth;
1934+      row_info.rowbytes = png_ptr->rowbytes;
1935+
1936       rp = image;
1937-      for (i = 0; i < image_height; i++)
1938+      int row_num = PNG_INFLATE_ROWS;
1939+      for (i = 0; i < image_height; i += PNG_INFLATE_ROWS)
1940       {
1941-         png_read_row(png_ptr, *rp, NULL);
1942-         rp++;
1943+         if (image_height - i < PNG_INFLATE_ROWS)
1944+         {
1945+            row_num = image_height - i;
1946+         }
1947+         png_read_muilty_rows(png_ptr, rp, row_num, row_info);
1948+         rp += row_num;
1949+      }
1950+   }
1951+   else
1952+#endif
1953+   {
1954+      for (j = 0; j < pass; j++)
1955+      {
1956+         rp = image;
1957+         for (i = 0; i < image_height; i++)
1958+         {
1959+            png_read_row(png_ptr, *rp, NULL);
1960+            rp++;
1961+         }
1962       }
1963    }
1964 }
1965diff --git a/pngrutil.c b/pngrutil.c
1966index 9ac8ec11f..f9c65927d 100644
1967--- a/pngrutil.c
1968+++ b/pngrutil.c
1969@@ -4134,7 +4134,12 @@ png_read_filter_row(png_structrp pp, png_row_infop row_info, png_bytep row,
1970     * PNG_FILTER_OPTIMIZATIONS to a function that overrides the generic
1971     * implementations.  See png_init_filter_functions above.
1972     */
1973+#ifdef PNG_MULTY_LINE_ENABLE
1974+   // OH ISSUE: png optimize
1975+   if (filter > PNG_FILTER_VALUE_NONE && filter < PNG_FILTER_VALUE_LAST_X2)
1976+#else
1977    if (filter > PNG_FILTER_VALUE_NONE && filter < PNG_FILTER_VALUE_LAST)
1978+#endif
1979    {
1980       if (pp->read_filter[0] == NULL)
1981          png_init_filter_functions(pp);
1982@@ -4606,7 +4611,24 @@ defined(PNG_USER_TRANSFORM_PTR_SUPPORTED)
1983              row_bytes + 48);
1984 
1985       else
1986+      {
1987+#ifdef PNG_MULTY_LINE_ENABLE
1988+         // OH ISSUE: png optimize
1989+         png_uint_32 row_num = 1;
1990+         if (png_ptr->bit_depth == 8 &&
1991+             (png_ptr->transformations & PNG_CHECK) == 0)
1992+         {
1993+            row_num = png_ptr->height < PNG_INFLATE_ROWS ?
1994+               png_ptr->height : PNG_INFLATE_ROWS;
1995+         }
1996+         png_ptr->big_row_buf = (png_bytep)png_malloc(
1997+            png_ptr, row_bytes * row_num + 48);
1998+         if (png_ptr->big_row_buf == NULL)
1999+            png_error(png_ptr, "png_malloc failed");
2000+#else
2001          png_ptr->big_row_buf = (png_bytep)png_malloc(png_ptr, row_bytes + 48);
2002+#endif
2003+      }
2004 
2005       png_ptr->big_prev_row = (png_bytep)png_malloc(png_ptr, row_bytes + 48);
2006 
2007diff --git a/pngstruct.h b/pngstruct.h
2008index e591d94d5..7c3846475 100644
2009--- a/pngstruct.h
2010+++ b/pngstruct.h
2011@@ -140,6 +140,14 @@ typedef const png_colorspace * PNG_RESTRICT png_const_colorspacerp;
2012 #define PNG_COLORSPACE_CANCEL(flags)        (0xffff ^ (flags))
2013 #endif /* COLORSPACE || GAMMA */
2014 
2015+#ifdef PNG_MULTY_LINE_ENABLE
2016+// OH ISSUE: png optimize. Internal ids of the two-row (x2) filters: the PNG filter value + 4. No SUB variant is defined, so 5 has no name here.
2017+#define PNG_FILTER_VALUE_UP_X2      (6) // PNG_FILTER_VALUE_UP + 4
2018+#define PNG_FILTER_VALUE_AVG_X2     (7) // PNG_FILTER_VALUE_AVG + 4
2019+#define PNG_FILTER_VALUE_PAETH_X2   (8) // PNG_FILTER_VALUE_PAETH + 4
2020+#define PNG_FILTER_VALUE_LAST_X2    (9) // PNG_FILTER_VALUE_LAST + 4
2021+#endif
2022+
2023 struct png_struct_def
2024 {
2025 #ifdef PNG_SETJMP_SUPPORTED
2026@@ -467,8 +475,14 @@ struct png_struct_def
2027    png_bytep big_prev_row;
2028 
2029 /* New member added in libpng-1.5.7 */
2030+#ifdef PNG_MULTY_LINE_ENABLE
2031+   // OH ISSUE: png optimize
2032+   void (*read_filter[PNG_FILTER_VALUE_LAST_X2 - 1])(png_row_infop row_info,
2033+      png_bytep row, png_const_bytep prev_row);
2034+#else
2035    void (*read_filter[PNG_FILTER_VALUE_LAST-1])(png_row_infop row_info,
2036       png_bytep row, png_const_bytep prev_row);
2037+#endif
2038 
2039 #ifdef PNG_READ_SUPPORTED
2040 #if defined(PNG_COLORSPACE_SUPPORTED) || defined(PNG_GAMMA_SUPPORTED)
2041diff --git a/pngtrans.c b/pngtrans.c
2042index 1100f46eb..9addf3423 100644
2043--- a/pngtrans.c
2044+++ b/pngtrans.c
2045@@ -13,6 +13,19 @@
2046 
2047 #include "pngpriv.h"
2048 
2049+#ifdef PNG_MULTY_LINE_ENABLE
2050+#  if defined(_MSC_VER) && !defined(__clang__) && defined(_M_ARM64)
2051+#    include <arm64_neon.h>
2052+#  else
2053+#    include <arm_neon.h>
2054+#  endif
2055+#  define STEP_GRAY (16) /* bytes handled per vld1q_u8/vst1q_u8 iteration */
2056+#  define STEP_GA (32) /* bytes handled per vld2q_u8/vst2q_u8 iteration */
2057+#  define STEP_RGB (48) /* bytes handled per vld3q_u8 or vld3q_u16 iteration */
2058+#  define STEP_RGBA (64) /* bytes handled per vld4q_u8 or vld4q_u16 iteration */
2059+#  define INDEX2 (2) /* val[] lane that png_do_bgr swaps with val[0] */
2060+#endif
2061+
2062 #if defined(PNG_READ_SUPPORTED) || defined(PNG_WRITE_SUPPORTED)
2063 
2064 #if defined(PNG_READ_BGR_SUPPORTED) || defined(PNG_WRITE_BGR_SUPPORTED)
2065@@ -269,13 +282,19 @@ png_do_invert(png_row_infop row_info, png_bytep row)
2066    if (row_info->color_type == PNG_COLOR_TYPE_GRAY)
2067    {
2068       png_bytep rp = row;
2069-      size_t i;
2070-      size_t istop = row_info->rowbytes;
2071-
2072-      for (i = 0; i < istop; i++)
2073+      png_bytep rp_stop = row + row_info->rowbytes;
2074+#ifdef PNG_MULTY_LINE_ENABLE
2075+      png_bytep rp_stop_neon = rp_stop - STEP_GRAY;
2076+      for (; rp < rp_stop_neon; rp += STEP_GRAY)
2077+      {
2078+         uint8x16_t gray = vld1q_u8(rp);
2079+         gray = ~gray;
2080+         vst1q_u8(rp, gray);
2081+      }
2082+#endif
2083+      for (; rp < rp_stop; rp++)
2084       {
2085          *rp = (png_byte)(~(*rp));
2086-         rp++;
2087       }
2088    }
2089 
2090@@ -283,13 +302,19 @@ png_do_invert(png_row_infop row_info, png_bytep row)
2091       row_info->bit_depth == 8)
2092    {
2093       png_bytep rp = row;
2094-      size_t i;
2095-      size_t istop = row_info->rowbytes;
2096-
2097-      for (i = 0; i < istop; i += 2)
2098+      png_bytep rp_stop = row + row_info->rowbytes;
2099+#ifdef PNG_MULTY_LINE_ENABLE
2100+      png_bytep rp_stop_neon = rp_stop - STEP_GA;
2101+      for (; rp < rp_stop_neon; rp += STEP_GA)
2102+      {
2103+         uint8x16x2_t gray_alpha = vld2q_u8(rp);
2104+         gray_alpha.val[0] = ~gray_alpha.val[0];
2105+         vst2q_u8(rp, gray_alpha);
2106+      }
2107+#endif
2108+      for (; rp < rp_stop; rp += 2)
2109       {
2110          *rp = (png_byte)(~(*rp));
2111-         rp += 2;
2112       }
2113    }
2114 
2115@@ -298,14 +323,21 @@ png_do_invert(png_row_infop row_info, png_bytep row)
2116       row_info->bit_depth == 16)
2117    {
2118       png_bytep rp = row;
2119-      size_t i;
2120-      size_t istop = row_info->rowbytes;
2121-
2122-      for (i = 0; i < istop; i += 4)
2123+      png_bytep rp_stop = row + row_info->rowbytes;
2124+#ifdef PNG_MULTY_LINE_ENABLE
2125+      png_bytep rp_stop_neon = rp_stop - STEP_RGBA;
2126+      for (; rp < rp_stop_neon; rp += STEP_RGBA)
2127+      {
2128+         uint8x16x4_t gray_alpha = vld4q_u8(rp);
2129+         gray_alpha.val[0] = ~gray_alpha.val[0];
2130+         gray_alpha.val[1] = ~gray_alpha.val[1];
2131+         vst4q_u8(rp, gray_alpha);
2132+      }
2133+#endif
2134+      for (; rp < rp_stop; rp += 4)
2135       {
2136          *rp = (png_byte)(~(*rp));
2137          *(rp + 1) = (png_byte)(~(*(rp + 1)));
2138-         rp += 4;
2139       }
2140    }
2141 #endif
2142@@ -323,10 +355,19 @@ png_do_swap(png_row_infop row_info, png_bytep row)
2143    if (row_info->bit_depth == 16)
2144    {
2145       png_bytep rp = row;
2146-      png_uint_32 i;
2147-      png_uint_32 istop= row_info->width * row_info->channels;
2148-
2149-      for (i = 0; i < istop; i++, rp += 2)
2150+      png_bytep rp_stop = row + row_info->rowbytes;
2151+#ifdef PNG_MULTY_LINE_ENABLE
2152+      png_bytep rp_stop_neon = rp_stop - STEP_GA;
2153+      for (; rp < rp_stop_neon; rp += STEP_GA)
2154+      {
2155+         uint8x16x2_t gray = vld2q_u8(rp);
2156+         uint8x16_t tmp = gray.val[0];
2157+         gray.val[0] = gray.val[1];
2158+         gray.val[1] = tmp;
2159+         vst2q_u8(rp, gray);
2160+      }
2161+#endif
2162+      for (; rp < rp_stop; rp += 2)
2163       {
2164 #ifdef PNG_BUILTIN_BSWAP16_SUPPORTED
2165          /* Feature added to libpng-1.6.11 for testing purposes, not
2166@@ -622,15 +663,24 @@ png_do_bgr(png_row_infop row_info, png_bytep row)
2167 
2168    if ((row_info->color_type & PNG_COLOR_MASK_COLOR) != 0)
2169    {
2170-      png_uint_32 row_width = row_info->width;
2171       if (row_info->bit_depth == 8)
2172       {
2173          if (row_info->color_type == PNG_COLOR_TYPE_RGB)
2174          {
2175-            png_bytep rp;
2176-            png_uint_32 i;
2177-
2178-            for (i = 0, rp = row; i < row_width; i++, rp += 3)
2179+            png_bytep rp = row;
2180+            png_bytep rp_stop = row + row_info->rowbytes;
2181+#ifdef PNG_MULTY_LINE_ENABLE
2182+            png_bytep rp_stop_neon = rp_stop - STEP_RGB;
2183+            for (; rp < rp_stop_neon; rp += STEP_RGB)
2184+            {
2185+               uint8x16x3_t bgr = vld3q_u8(rp);
2186+               uint8x16_t tmp = bgr.val[INDEX2];
2187+               bgr.val[INDEX2] = bgr.val[0];
2188+               bgr.val[0] = tmp;
2189+               vst3q_u8(rp, bgr);
2190+            }
2191+#endif
2192+            for (; rp < rp_stop; rp += 3)
2193             {
2194                png_byte save = *rp;
2195                *rp = *(rp + 2);
2196@@ -640,10 +690,20 @@ png_do_bgr(png_row_infop row_info, png_bytep row)
2197 
2198          else if (row_info->color_type == PNG_COLOR_TYPE_RGB_ALPHA)
2199          {
2200-            png_bytep rp;
2201-            png_uint_32 i;
2202-
2203-            for (i = 0, rp = row; i < row_width; i++, rp += 4)
2204+            png_bytep rp = row;
2205+            png_bytep rp_stop = row + row_info->rowbytes;
2206+#ifdef PNG_MULTY_LINE_ENABLE
2207+            png_bytep rp_stop_neon = rp_stop - STEP_RGBA;
2208+            for (; rp < rp_stop_neon; rp += STEP_RGBA)
2209+            {
2210+               uint8x16x4_t bgra = vld4q_u8(rp);
2211+               uint8x16_t tmp = bgra.val[INDEX2];
2212+               bgra.val[INDEX2] = bgra.val[0];
2213+               bgra.val[0] = tmp;
2214+               vst4q_u8(rp, bgra);
2215+            }
2216+#endif
2217+            for (; rp < rp_stop; rp += 4)
2218             {
2219                png_byte save = *rp;
2220                *rp = *(rp + 2);
2221@@ -657,10 +717,20 @@ png_do_bgr(png_row_infop row_info, png_bytep row)
2222       {
2223          if (row_info->color_type == PNG_COLOR_TYPE_RGB)
2224          {
2225-            png_bytep rp;
2226-            png_uint_32 i;
2227-
2228-            for (i = 0, rp = row; i < row_width; i++, rp += 6)
2229+            png_bytep rp = row;
2230+            png_bytep rp_stop = row + row_info->rowbytes;
2231+#ifdef PNG_MULTY_LINE_ENABLE
2232+            png_bytep rp_stop_neon = rp_stop - STEP_RGB;
2233+            for (; rp < rp_stop_neon; rp += STEP_RGB)
2234+            {
2235+               uint16x8x3_t bgr = vld3q_u16((unsigned short *)rp);
2236+               uint16x8_t tmp = bgr.val[INDEX2];
2237+               bgr.val[INDEX2] = bgr.val[0];
2238+               bgr.val[0] = tmp;
2239+               vst3q_u16((unsigned short *)rp, bgr);
2240+            }
2241+#endif
2242+            for (; rp < rp_stop; rp += 6)
2243             {
2244                png_byte save = *rp;
2245                *rp = *(rp + 4);
2246@@ -673,10 +743,20 @@ png_do_bgr(png_row_infop row_info, png_bytep row)
2247 
2248          else if (row_info->color_type == PNG_COLOR_TYPE_RGB_ALPHA)
2249          {
2250-            png_bytep rp;
2251-            png_uint_32 i;
2252-
2253-            for (i = 0, rp = row; i < row_width; i++, rp += 8)
2254+            png_bytep rp = row;
2255+            png_bytep rp_stop = row + row_info->rowbytes;
2256+#ifdef PNG_MULTY_LINE_ENABLE
2257+            png_bytep rp_stop_neon = rp_stop - STEP_RGBA;
2258+            for (; rp < rp_stop_neon; rp += STEP_RGBA)
2259+            {
2260+               uint16x8x4_t bgra = vld4q_u16((unsigned short *)rp);
2261+               uint16x8_t tmp = bgra.val[INDEX2];
2262+               bgra.val[INDEX2] = bgra.val[0];
2263+               bgra.val[0] = tmp;
2264+               vst4q_u16((unsigned short *)rp, bgra);
2265+            }
2266+#endif
2267+            for (; rp < rp_stop; rp += 8)
2268             {
2269                png_byte save = *rp;
2270                *rp = *(rp + 4);
2271