1 /* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
3 * For Intel x86 CPU and Microsoft Visual C++ compiler
5 * libpng 1.0.9 - January 31, 2001
6 * For conditions of distribution and use, see copyright notice in png.h
7 * Copyright (c) 1998-2001 Glenn Randers-Pehrson
8 * Copyright (c) 1998, Intel Corporation
10 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
11 * Interface to libpng contributed by Gilles Vollant, 1999
12 * Debugging and cleanup by Greg Roelofs, 2000, 2001
14 * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
15 * a sign error in the post-MMX cleanup code for each pixel_depth resulted
16 * in bad pixels at the beginning of some rows of some images, and also
17 * (due to out-of-range memory reads and writes) caused heap corruption
18 * when compiled with MSVC 6.0. The error was fixed in version 1.0.4e.
20 * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
27 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
29 static int mmx_supported=2;   /* 2 = "not yet probed"; set to 0 or 1 below */
/* ------------------------------------------------------------------------
 * Runtime MMX capability probe (MSVC inline asm).  Strategy visible here:
 *   1. Toggle EFLAGS bit 21 (ID) to discover whether CPUID exists at all.
 *   2. CPUID function 0: require the max supported function to be >= 1.
 *   3. CPUID function 1: test the MMX feature flag (EDX bit 23).
 * The result is cached in the file-static `mmx_supported` so the probe
 * runs only once per process.
 * NOTE(review): this listing is elided -- the function header, the
 * NOT_SUPPORTED label, and some opcode-emit lines are not visible here.
 * ---------------------------------------------------------------------- */
35 int mmx_supported_local = 0;
37 push ebx //CPUID will trash these
40 pushfd //Save Eflag to stack
41 pop eax //Get Eflag from stack into eax
42 mov ecx, eax //Make another copy of Eflag in ecx
43 xor eax, 0x200000 //Toggle ID bit in Eflag [i.e. bit(21)]
44 push eax //Save modified Eflag back to stack
46 popfd //Restored modified value back to Eflag reg
47 pushfd //Save Eflag to stack
48 pop eax //Get Eflag from stack
49 xor eax, ecx //Compare the new Eflag with the original Eflag
50 jz NOT_SUPPORTED //If the same, CPUID instruction is not supported,
51 //skip following instructions and jump to
54 xor eax, eax //Set eax to zero (CPUID function 0)
56 _asm _emit 0x0f //CPUID instruction (two bytes opcode)
59 cmp eax, 1 //make sure CPUID supports function 1
60 jl NOT_SUPPORTED //If eax is zero, mmx not supported
62 xor eax, eax //set eax to zero
63 inc eax //Now increment eax to 1. This instruction is
64 //faster than the instruction "mov eax, 1"
66 _asm _emit 0x0f //CPUID instruction (function 1: feature flags)
69 and edx, 0x00800000 //mask out all bits but the MMX flag (edx bit 23)
70 cmp edx, 0 //test the masked MMX feature bit
71 jz NOT_SUPPORTED //zero => MMX not supported; fall through if set
73 mov mmx_supported_local, 1 //set return value to 1
76 mov eax, mmx_supported_local //move return value to eax
77 pop edx //CPUID trashed these
82 //mmx_supported_local=0; // test code for force don't support MMX
83 //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);
// Cache the probe result so later callers can skip the asm entirely.
85 mmx_supported = mmx_supported_local;
86 return mmx_supported_local;
89 /* Combines the row recently read in with the previous row.
90 This routine takes care of alpha and transparency if requested.
91 This routine also handles the two methods of progressive display
92 of interlaced images, depending on the mask value.
93 The mask value describes which pixels are to be combined with
94 the row. The pattern always repeats every 8 pixels, so just 8
95 bits are needed. A one indicates the pixel is to be combined; a
96 zero indicates the pixel is to be skipped. This is in addition
97 to any alpha or transparency value associated with the pixel. If
98 you want all pixels to be combined, pass 0xff (255) in mask. */
100 /* Use this routine for x86 platform - uses faster MMX routine if machine
/* ------------------------------------------------------------------------
 * png_combine_row -- merge the row just read into `row`, honoring `mask`
 * (a repeating 8-pixel bit pattern; bit set = take pixel, clear = skip;
 * 0xff = take everything).  Dispatches on pixel depth: depths < 8 use
 * pure-C bit manipulation; depths 8/16/24/32/48 use MMX asm when the CPU
 * supports it, else a C fallback that steps by the Adam7 pass increment.
 * NOTE(review): this listing is elided -- loop labels, several asm lines,
 * and most braces are not visible here; comments below describe only what
 * the visible lines establish.
 * ---------------------------------------------------------------------- */
104 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
106 #ifdef PNG_USE_LOCAL_ARRAYS
107 const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
110 png_debug(1,"in png_combine_row_asm\n");
// Lazily run the CPUID probe on the first call (2 == "not yet probed").
112 if (mmx_supported == 2) {
// mask == 0xff: every pixel is taken -- straight copy of the packed row.
118 png_memcpy(row, png_ptr->row_buf + 1,
119 (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
121 /* GRR: add "else if (mask == 0)" case?
122 * or does png_combine_row() not even get called in that case? */
125 switch (png_ptr->row_info.pixel_depth)
// --- 1 bpp: C-only bit extraction/insertion, one pixel per iteration ---
131 int s_inc, s_start, s_end;
136 sp = png_ptr->row_buf + 1;
139 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
// PACKSWAP reverses the bit order within each byte, so the shift
// direction/start values differ (elided here).
140 if (png_ptr->transformations & PNG_PACKSWAP)
156 for (i = 0; i < png_ptr->width; i++)
162 value = (*sp >> shift) & 0x1;
163 *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
164 *dp |= (png_byte)(value << shift);
// --- 2 bpp: same scheme with 2-bit fields ---
188 int s_start, s_end, s_inc;
194 sp = png_ptr->row_buf + 1;
197 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
198 if (png_ptr->transformations & PNG_PACKSWAP)
214 for (i = 0; i < png_ptr->width; i++)
218 value = (*sp >> shift) & 0x3;
219 *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
220 *dp |= (png_byte)(value << shift);
// --- 4 bpp: same scheme with 4-bit fields ---
243 int s_start, s_end, s_inc;
249 sp = png_ptr->row_buf + 1;
252 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
253 if (png_ptr->transformations & PNG_PACKSWAP)
268 for (i = 0; i < png_ptr->width; i++)
272 value = (*sp >> shift) & 0xf;
273 *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
274 *dp |= (png_byte)(value << shift);
// --- 8 bpp MMX path: mm7 holds a per-byte expansion of `mask`,
// pcmpeqb turns it into a keep/skip byte mask for 8 pixels at a time ---
301 __int64 mask0=0x0102040810204080;
305 srcptr = png_ptr->row_buf + 1;
309 len = png_ptr->width &~7; //reduce to multiple of 8
310 diff = png_ptr->width & 7; //amount lost
314 movd mm7, unmask //load bit pattern
315 psubb mm6,mm6 //zero mm6
318 punpckldq mm7,mm7 //fill register with 8 masks
322 pand mm0,mm7 //nonzero if keep byte
323 pcmpeqb mm0,mm6 //zeros->1s, v versa
325 mov ecx,len //load length of line (pixels)
326 mov esi,srcptr //load source
327 mov ebx,dstptr //load dest
339 add esi,8 //inc by 8 bytes processed
341 sub ecx,8 //dec by 8 pixels processed
// Leftover (width % 8) pixels: walk mask bits via the carry flag.
351 sal edx,24 //make low byte the high byte
354 sal edx,1 //move high bit to CF
355 jnc skip8 //if CF = 0
368 else /* mmx not supported - use modified C routine */
370 register unsigned int incr1, initial_val, final_val;
371 png_size_t pixel_bytes;
373 register int disp = png_pass_inc[png_ptr->pass];
374 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
376 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
377 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
379 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
380 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
381 final_val = png_ptr->width*pixel_bytes;
382 incr1 = (disp)*pixel_bytes;
// Copy one pixel per pass-increment stride.
383 for (i = initial_val; i < final_val; i += incr1)
385 png_memcpy(dstptr, srcptr, pixel_bytes);
// --- 16 bpp MMX path: two mask quadwords (2 bytes per pixel) ---
400 __int64 mask1=0x0101020204040808,
401 mask0=0x1010202040408080;
405 srcptr = png_ptr->row_buf + 1;
409 len = (png_ptr->width)&~7;
410 diff = (png_ptr->width)&7;
413 movd mm7, unmask //load bit pattern
414 psubb mm6,mm6 //zero mm6
417 punpckldq mm7,mm7 //fill register with 8 masks
428 mov ecx,len //load length of line
429 mov esi,srcptr //load source
430 mov ebx,dstptr //load dest
451 add esi,16 //inc by 16 bytes processed
453 sub ecx,8 //dec by 8 pixels processed
463 sal edx,24 //make low byte the high byte
465 sal edx,1 //move high bit to CF
466 jnc skip16 //if CF = 0
479 else /* mmx not supported - use modified C routine */
481 register unsigned int incr1, initial_val, final_val;
482 png_size_t pixel_bytes;
484 register int disp = png_pass_inc[png_ptr->pass];
485 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
487 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
488 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
490 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
491 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
492 final_val = png_ptr->width*pixel_bytes;
493 incr1 = (disp)*pixel_bytes;
494 for (i = initial_val; i < final_val; i += incr1)
496 png_memcpy(dstptr, srcptr, pixel_bytes);
// --- 24 bpp MMX path: three mask quadwords (3 bytes per pixel) ---
512 __int64 mask2=0x0101010202020404, //24bpp
513 mask1=0x0408080810101020,
514 mask0=0x2020404040808080;
516 srcptr = png_ptr->row_buf + 1;
520 len = (png_ptr->width)&~7;
521 diff = (png_ptr->width)&7;
527 movd mm7, unmask //load bit pattern
528 psubb mm6,mm6 //zero mm6
531 punpckldq mm7,mm7 //fill register with 8 masks
545 mov ecx,len //load length of line
546 mov esi,srcptr //load source
547 mov ebx,dstptr //load dest
577 add esi,24 //inc by 24 bytes processed
579 sub ecx,8 //dec by 8 pixels processed
589 sal edx,24 //make low byte the high byte
591 sal edx,1 //move high bit to CF
592 jnc skip24 //if CF = 0
609 else /* mmx not supported - use modified C routine */
611 register unsigned int incr1, initial_val, final_val;
612 png_size_t pixel_bytes;
614 register int disp = png_pass_inc[png_ptr->pass];
615 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
617 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
618 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
620 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
621 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
622 final_val = png_ptr->width*pixel_bytes;
623 incr1 = (disp)*pixel_bytes;
624 for (i = initial_val; i < final_val; i += incr1)
626 png_memcpy(dstptr, srcptr, pixel_bytes);
// --- 32 bpp MMX path: four mask quadwords (4 bytes per pixel) ---
642 __int64 mask3=0x0101010102020202, //32bpp
643 mask2=0x0404040408080808,
644 mask1=0x1010101020202020,
645 mask0=0x4040404080808080;
647 srcptr = png_ptr->row_buf + 1;
651 len = (png_ptr->width)&~7;
652 diff = (png_ptr->width)&7;
658 movd mm7, unmask //load bit pattern
659 psubb mm6,mm6 //zero mm6
662 punpckldq mm7,mm7 //fill register with 8 masks
679 mov ecx,len //load length of line
680 mov esi,srcptr //load source
681 mov ebx,dstptr //load dest
719 add esi,32 //inc by 32 bytes processed
721 sub ecx,8 //dec by 8 pixels processed
731 sal edx,24 //make low byte the high byte
733 sal edx,1 //move high bit to CF
734 jnc skip32 //if CF = 0
748 else /* mmx _not supported - Use modified C routine */
750 register unsigned int incr1, initial_val, final_val;
751 png_size_t pixel_bytes;
753 register int disp = png_pass_inc[png_ptr->pass];
754 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
756 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
757 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
759 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
760 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
761 final_val = png_ptr->width*pixel_bytes;
762 incr1 = (disp)*pixel_bytes;
763 for (i = initial_val; i < final_val; i += incr1)
765 png_memcpy(dstptr, srcptr, pixel_bytes);
// --- 48 bpp MMX path: six mask quadwords (6 bytes per pixel) ---
781 __int64 mask5=0x0101010101010202,
782 mask4=0x0202020204040404,
783 mask3=0x0404080808080808,
784 mask2=0x1010101010102020,
785 mask1=0x2020202040404040,
786 mask0=0x4040808080808080;
790 srcptr = png_ptr->row_buf + 1;
794 len = (png_ptr->width)&~7;
795 diff = (png_ptr->width)&7;
798 movd mm7, unmask //load bit pattern
799 psubb mm6,mm6 //zero mm6
802 punpckldq mm7,mm7 //fill register with 8 masks
825 mov ecx,len //load length of line
826 mov esi,srcptr //load source
827 mov ebx,dstptr //load dest
875 add esi,48 //inc by 48 bytes processed (6 bytes * 8 pixels)
877 sub ecx,8 //dec by 8 pixels processed
887 sal edx,24 //make low byte the high byte
890 sal edx,1 //move high bit to CF
891 jnc skip48 //if CF = 0
905 else /* mmx _not supported - Use modified C routine */
907 register unsigned int incr1, initial_val, final_val;
908 png_size_t pixel_bytes;
910 register int disp = png_pass_inc[png_ptr->pass];
911 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
913 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
914 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
916 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
917 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
918 final_val = png_ptr->width*pixel_bytes;
919 incr1 = (disp)*pixel_bytes;
920 for (i = initial_val; i < final_val; i += incr1)
922 png_memcpy(dstptr, srcptr, pixel_bytes);
// --- default (e.g. 64 bpp): always the C per-pixel copy, no MMX path ---
935 png_size_t pixel_bytes;
936 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
938 register int disp = png_pass_inc[png_ptr->pass]; // get the offset
939 register unsigned int incr1, initial_val, final_val;
941 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
942 sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
944 dp = row + offset_table[png_ptr->pass]*pixel_bytes;
945 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
946 final_val = png_ptr->width*pixel_bytes;
947 incr1 = (disp)*pixel_bytes;
948 for (i = initial_val; i < final_val; i += incr1)
950 png_memcpy(dp, sptr, pixel_bytes);
956 } /* end switch (png_ptr->row_info.pixel_depth) */
957 } /* end if (non-trivial mask) */
959 } /* end png_combine_row() */
962 #if defined(PNG_READ_INTERLACING_SUPPORTED)
/* ------------------------------------------------------------------------
 * png_do_read_interlace -- expand one Adam7 sub-row in place: each source
 * pixel is replicated png_pass_inc[pass] times, working right-to-left so
 * the expansion never overwrites unread source pixels.  Depths < 8 use
 * C bit manipulation; byte-aligned depths use pass-specific MMX pixel
 * replication when available (no MMX for 48/64-bit pixels).  Updates
 * row_info->width/rowbytes to the expanded size on exit.
 * NOTE(review): this listing is elided -- loop labels, many asm lines and
 * braces are missing; comments describe only the visible lines.
 * ---------------------------------------------------------------------- */
965 png_do_read_interlace(png_structp png_ptr)
967 png_row_infop row_info = &(png_ptr->row_info);
968 png_bytep row = png_ptr->row_buf + 1;
969 int pass = png_ptr->pass;
970 png_uint_32 transformations = png_ptr->transformations;
971 #ifdef PNG_USE_LOCAL_ARRAYS
972 const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
975 png_debug(1,"in png_do_read_interlace\n");
// Lazily run the CPUID probe on the first call (2 == "not yet probed").
977 if (mmx_supported == 2) {
981 if (row != NULL && row_info != NULL)
983 png_uint_32 final_width;
// Width after replication for this pass.
985 final_width = row_info->width * png_pass_inc[pass];
987 switch (row_info->pixel_depth)
// --- 1 bpp: replicate bits, walking both row ends right-to-left ---
993 int s_start, s_end, s_inc;
998 sp = row + (png_size_t)((row_info->width - 1) >> 3);
999 dp = row + (png_size_t)((final_width - 1) >> 3);
1000 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
// PACKSWAP reverses in-byte bit order, so shifts start from the other end.
1001 if (transformations & PNG_PACKSWAP)
1003 sshift = (int)((row_info->width + 7) & 7);
1004 dshift = (int)((final_width + 7) & 7);
1012 sshift = 7 - (int)((row_info->width + 7) & 7);
1013 dshift = 7 - (int)((final_width + 7) & 7);
1019 for (i = row_info->width; i; i--)
1021 v = (png_byte)((*sp >> sshift) & 0x1);
1022 for (j = 0; j < png_pass_inc[pass]; j++)
1024 *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1025 *dp |= (png_byte)(v << dshift);
1026 if (dshift == s_end)
1034 if (sshift == s_end)
// --- 2 bpp: same scheme with 2-bit fields ---
1049 int s_start, s_end, s_inc;
1052 sp = row + (png_size_t)((row_info->width - 1) >> 2);
1053 dp = row + (png_size_t)((final_width - 1) >> 2);
1054 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1055 if (transformations & PNG_PACKSWAP)
1057 sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1058 dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1066 sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1067 dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1073 for (i = row_info->width; i; i--)
1078 v = (png_byte)((*sp >> sshift) & 0x3);
1079 for (j = 0; j < png_pass_inc[pass]; j++)
1081 *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1082 *dp |= (png_byte)(v << dshift);
1083 if (dshift == s_end)
1091 if (sshift == s_end)
// --- 4 bpp: same scheme with 4-bit fields ---
1106 int s_start, s_end, s_inc;
1109 sp = row + (png_size_t)((row_info->width - 1) >> 1);
1110 dp = row + (png_size_t)((final_width - 1) >> 1);
1111 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1112 if (transformations & PNG_PACKSWAP)
1114 sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1115 dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1123 sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1124 dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1130 for (i = row_info->width; i; i--)
1135 v = (png_byte)((*sp >> sshift) & 0xf);
1136 for (j = 0; j < png_pass_inc[pass]; j++)
1138 *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1139 *dp |= (png_byte)(v << dshift);
1140 if (dshift == s_end)
1148 if (sshift == s_end)
// --- byte-aligned depths: MMX replication, dispatched by pixel size ---
1159 default: // This is the place where the routine is modified
1161 __int64 const4 = 0x0000000000FFFFFF;   // low-3-byte mask (one 24bpp pixel)
1162 // __int64 const5 = 0x000000FFFFFF0000; // unused...
1163 __int64 const6 = 0x00000000000000FF;   // low-byte mask
1166 png_size_t pixel_bytes;
1167 int width = row_info->width;
1169 pixel_bytes = (row_info->pixel_depth >> 3);
// Start from the last pixel of source and destination (expand in place).
1171 sptr = row + (width - 1) * pixel_bytes;
1172 dp = row + (final_width - 1) * pixel_bytes;
1173 // New code by Nirav Chhatrapati - Intel Corporation
1175 // NOTE: there is NO MMX code for 48-bit and 64-bit images
1177 // use MMX routine if machine supports it
1178 if ( mmx_supported )
// 3 bytes/pixel: shift/or one pixel into an 8x-replicated byte pattern.
1180 if (pixel_bytes == 3)
1182 if (((pass == 0) || (pass == 1)) && width)
1189 sub edi, 21 // (png_pass_inc[pass] - 1)*pixel_bytes
1191 movd mm0, [esi] ; X X X X X v2 v1 v0
1192 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1193 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1194 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1195 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1196 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1197 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1198 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1199 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1200 movq mm3, mm0 ; v2 v1 v0 v2 v1 v0 v2 v1
1201 psllq mm0, 16 ; v0 v2 v1 v0 v2 v1 0 0
1202 movq mm4, mm3 ; v2 v1 v0 v2 v1 v0 v2 v1
1203 punpckhdq mm3, mm0 ; v0 v2 v1 v0 v2 v1 v0 v2
1205 psrlq mm0, 32 ; 0 0 0 0 v0 v2 v1 v0
1207 punpckldq mm0, mm4 ; v1 v0 v2 v1 v0 v2 v1 v0
1217 else if (((pass == 2) || (pass == 3)) && width)
1224 sub edi, 9 // (png_pass_inc[pass] - 1)*pixel_bytes
1226 movd mm0, [esi] ; X X X X X v2 v1 v0
1227 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1228 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1229 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1230 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1231 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1232 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1233 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1234 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1235 movq [edi+4], mm0 ; move to memory
1236 psrlq mm0, 16 ; 0 0 v2 v1 v0 v2 v1 v0
1237 movd [edi], mm0 ; move to memory
1245 else if (width) /* && ((pass == 4) || (pass == 5)) */
// Process pairs of pixels; the C tail below handles the leftovers.
1247 int width_mmx = ((width >> 1) << 1) - 8;
1250 width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
1261 movq mm0, [esi] ; X X v2 v1 v0 v5 v4 v3
1262 movq mm7, mm0 ; X X v2 v1 v0 v5 v4 v3
1263 movq mm6, mm0 ; X X v2 v1 v0 v5 v4 v3
1264 psllq mm0, 24 ; v1 v0 v5 v4 v3 0 0 0
1265 pand mm7, const4 ; 0 0 0 0 0 v5 v4 v3
1266 psrlq mm6, 24 ; 0 0 0 X X v2 v1 v0
1267 por mm0, mm7 ; v1 v0 v5 v4 v3 v5 v4 v3
1268 movq mm5, mm6 ; 0 0 0 X X v2 v1 v0
1269 psllq mm6, 8 ; 0 0 X X v2 v1 v0 0
1270 movq [edi], mm0 ; move quad to memory
1271 psrlq mm5, 16 ; 0 0 0 0 0 X X v2
1272 pand mm5, const6 ; 0 0 0 0 0 0 0 v2
1273 por mm6, mm5 ; 0 0 X X v2 v1 v0 v2
1274 movd [edi+8], mm6 ; move double to memory
1283 sptr -= width_mmx*3;
// C cleanup for the pixels the MMX loop did not cover.
1285 for (i = width; i; i--)
1290 png_memcpy(v, sptr, 3);
1291 for (j = 0; j < png_pass_inc[pass]; j++)
1293 png_memcpy(dp, v, 3);
1299 } /* end of pixel_bytes == 3 */
// 1 byte/pixel: punpck* byte-splat, 4 or 8 source pixels per iteration.
1301 else if (pixel_bytes == 1)
1303 if (((pass == 0) || (pass == 1)) && width)
1305 int width_mmx = ((width >> 2) << 2);
1317 movd mm0, [esi] ; X X X X v0 v1 v2 v3
1318 movq mm1, mm0 ; X X X X v0 v1 v2 v3
1319 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1320 movq mm2, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1321 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1322 movq mm3, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1323 punpckldq mm0, mm0 ; v3 v3 v3 v3 v3 v3 v3 v3
1324 punpckhdq mm3, mm3 ; v2 v2 v2 v2 v2 v2 v2 v2
1325 movq [edi], mm0 ; move to memory v3
1326 punpckhwd mm2, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1327 movq [edi+8], mm3 ; move to memory v2
1328 movq mm4, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1329 punpckldq mm2, mm2 ; v1 v1 v1 v1 v1 v1 v1 v1
1330 punpckhdq mm4, mm4 ; v0 v0 v0 v0 v0 v0 v0 v0
1331 movq [edi+16], mm2 ; move to memory v1
1332 movq [edi+24], mm4 ; move to memory v0
1343 for (i = width; i; i--)
1347 /* I simplified this part in version 1.0.4e
1348 * here and in several other instances where
1349 * pixel_bytes == 1 -- GR-P
1354 * png_memcpy(v, sptr, pixel_bytes);
1355 * for (j = 0; j < png_pass_inc[pass]; j++)
1357 * png_memcpy(dp, v, pixel_bytes);
1358 * dp -= pixel_bytes;
1360 * sptr -= pixel_bytes;
1362 * Replacement code is in the next three lines:
1365 for (j = 0; j < png_pass_inc[pass]; j++)
1370 else if (((pass == 2) || (pass == 3)) && width)
1372 int width_mmx = ((width >> 2) << 2);
1384 movd mm0, [esi] ; X X X X v0 v1 v2 v3
1385 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1386 movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1387 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1388 punpckhwd mm1, mm1 ; v0 v0 v0 v0 v1 v1 v1 v1
1389 movq [edi], mm0 ; move to memory v2 and v3
1391 movq [edi+8], mm1 ; move to memory v1 and v0
1401 for (i = width; i; i--)
1405 for (j = 0; j < png_pass_inc[pass]; j++)
1412 else if (width) /* && ((pass == 4) || (pass == 5))) */
1414 int width_mmx = ((width >> 3) << 3);
1426 movq mm0, [esi] ; v0 v1 v2 v3 v4 v5 v6 v7
1427 movq mm1, mm0 ; v0 v1 v2 v3 v4 v5 v6 v7
1428 punpcklbw mm0, mm0 ; v4 v4 v5 v5 v6 v6 v7 v7
1429 //movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1430 punpckhbw mm1, mm1 ;v0 v0 v1 v1 v2 v2 v3 v3
1431 movq [edi+8], mm1 ; move to memory v0 v1 v2 and v3
1433 movq [edi], mm0 ; move to memory v4 v5 v6 and v7
1444 for (i = width; i; i--)
1448 for (j = 0; j < png_pass_inc[pass]; j++)
1455 } /* end of pixel_bytes == 1 */
// 2 bytes/pixel: punpcklwd word-splat, 2 source pixels per iteration.
1457 else if (pixel_bytes == 2)
1459 if (((pass == 0) || (pass == 1)) && width)
1461 int width_mmx = ((width >> 1) << 1);
1473 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1474 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1475 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1476 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1477 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1480 movq [edi + 16], mm1
1481 movq [edi + 24], mm1
// "sign fixed": post-loop pointer rewind, corrected in libpng 1.0.4e
// (formerly added instead of subtracting, corrupting the heap).
1490 sptr -= (width_mmx*2 - 2); // sign fixed
1491 dp -= (width_mmx*16 - 2); // sign fixed
1492 for (i = width; i; i--)
1497 png_memcpy(v, sptr, 2);
1498 for (j = 0; j < png_pass_inc[pass]; j++)
1501 png_memcpy(dp, v, 2);
1505 else if (((pass == 2) || (pass == 3)) && width)
1507 int width_mmx = ((width >> 1) << 1) ;
1519 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1520 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1521 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1522 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1523 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1535 sptr -= (width_mmx*2 - 2); // sign fixed
1536 dp -= (width_mmx*8 - 2); // sign fixed
1537 for (i = width; i; i--)
1542 png_memcpy(v, sptr, 2);
1543 for (j = 0; j < png_pass_inc[pass]; j++)
1546 png_memcpy(dp, v, 2);
1550 else if (width) // pass == 4 or 5
1552 int width_mmx = ((width >> 1) << 1) ;
1564 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1565 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1575 sptr -= (width_mmx*2 - 2); // sign fixed
1576 dp -= (width_mmx*4 - 2); // sign fixed
1577 for (i = width; i; i--)
1582 png_memcpy(v, sptr, 2);
1583 for (j = 0; j < png_pass_inc[pass]; j++)
1586 png_memcpy(dp, v, 2);
1590 } /* end of pixel_bytes == 2 */
// 4 bytes/pixel: punpck*dq dword-splat, 2 source pixels per iteration.
1592 else if (pixel_bytes == 4)
1594 if (((pass == 0) || (pass == 1)) && width)
1596 int width_mmx = ((width >> 1) << 1) ;
1608 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1609 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1610 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1611 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1614 movq [edi + 16], mm0
1615 movq [edi + 24], mm0
1617 movq [edi + 40], mm1
1620 movq [edi + 56], mm1
1628 sptr -= (width_mmx*4 - 4); // sign fixed
1629 dp -= (width_mmx*32 - 4); // sign fixed
1630 for (i = width; i; i--)
1635 png_memcpy(v, sptr, 4);
1636 for (j = 0; j < png_pass_inc[pass]; j++)
1639 png_memcpy(dp, v, 4);
1643 else if (((pass == 2) || (pass == 3)) && width)
1645 int width_mmx = ((width >> 1) << 1) ;
1657 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1658 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1659 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1660 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1664 movq [edi + 24], mm1
1673 sptr -= (width_mmx*4 - 4); // sign fixed
1674 dp -= (width_mmx*16 - 4); // sign fixed
1675 for (i = width; i; i--)
1680 png_memcpy(v, sptr, 4);
1681 for (j = 0; j < png_pass_inc[pass]; j++)
1684 png_memcpy(dp, v, 4);
1688 else if (width) // pass == 4 or 5
1690 int width_mmx = ((width >> 1) << 1) ;
1702 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1703 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1704 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1705 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1716 sptr -= (width_mmx*4 - 4); // sign fixed
1717 dp -= (width_mmx*8 - 4); // sign fixed
1718 for (i = width; i; i--)
1723 png_memcpy(v, sptr, 4);
1724 for (j = 0; j < png_pass_inc[pass]; j++)
1727 png_memcpy(dp, v, 4);
1732 } /* end of pixel_bytes == 4 */
// 6 bytes/pixel (48bpp): no MMX variant -- plain C replication.
1734 else if (pixel_bytes == 6)
1736 for (i = width; i; i--)
1740 png_memcpy(v, sptr, 6);
1741 for (j = 0; j < png_pass_inc[pass]; j++)
1743 png_memcpy(dp, v, 6);
1748 } /* end of pixel_bytes == 6 */
// Any other byte-aligned size: generic C replication.
1752 for (i = width; i; i--)
1756 png_memcpy(v, sptr, pixel_bytes);
1757 for (j = 0; j < png_pass_inc[pass]; j++)
1759 png_memcpy(dp, v, pixel_bytes);
1765 } /* end of mmx_supported */
1767 else /* MMX not supported: use modified C code - takes advantage
1768 * of inlining of memcpy for a constant */
1770 if (pixel_bytes == 1)
1772 for (i = width; i; i--)
1775 for (j = 0; j < png_pass_inc[pass]; j++)
1780 else if (pixel_bytes == 3)
1782 for (i = width; i; i--)
1786 png_memcpy(v, sptr, pixel_bytes);
1787 for (j = 0; j < png_pass_inc[pass]; j++)
1789 png_memcpy(dp, v, pixel_bytes);
1792 sptr -= pixel_bytes;
1795 else if (pixel_bytes == 2)
1797 for (i = width; i; i--)
1801 png_memcpy(v, sptr, pixel_bytes);
1802 for (j = 0; j < png_pass_inc[pass]; j++)
1804 png_memcpy(dp, v, pixel_bytes);
1807 sptr -= pixel_bytes;
1810 else if (pixel_bytes == 4)
1812 for (i = width; i; i--)
1816 png_memcpy(v, sptr, pixel_bytes);
1817 for (j = 0; j < png_pass_inc[pass]; j++)
1819 png_memcpy(dp, v, pixel_bytes);
1822 sptr -= pixel_bytes;
1825 else if (pixel_bytes == 6)
1827 for (i = width; i; i--)
1831 png_memcpy(v, sptr, pixel_bytes);
1832 for (j = 0; j < png_pass_inc[pass]; j++)
1834 png_memcpy(dp, v, pixel_bytes);
1837 sptr -= pixel_bytes;
1842 for (i = width; i; i--)
1846 png_memcpy(v, sptr, pixel_bytes);
1847 for (j = 0; j < png_pass_inc[pass]; j++)
1849 png_memcpy(dp, v, pixel_bytes);
1852 sptr -= pixel_bytes;
1856 } /* end of MMX not supported */
1859 } /* end switch (row_info->pixel_depth) */
// Publish the expanded geometry to the caller.
1861 row_info->width = final_width;
1862 row_info->rowbytes = ((final_width *
1863 (png_uint_32)row_info->pixel_depth + 7) >> 3);
1868 #endif /* PNG_READ_INTERLACING_SUPPORTED */
1871 // These variables are utilized in the functions below. They are declared
1872 // globally here to ensure alignment on 8-byte boundaries.
// NOTE(review): the union/struct declaration head is elided from this
// listing; only the trailing declarators and initializers are visible.
// LBCarryMask selects each byte's lsb (carry of the per-byte average);
// HBClearMask clears each byte's bit 7 after a 1-bit right shift.
1877 } LBCarryMask = {0x0101010101010101},
1878 HBClearMask = {0x7f7f7f7f7f7f7f7f},
1879 ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
1882 // Optimized code for PNG Average filter decoder
/* ------------------------------------------------------------------------
 * png_read_filter_row_mmx_avg -- undo the PNG "Average" filter in place:
 *   Raw(x) = Avg(x) + floor((Raw(x-bpp) + Prior(x)) / 2)
 * Structure visible here: (1) first bpp bytes use Prior only (no left
 * neighbor); (2) scalar bytes up to an 8-byte alignment boundary;
 * (3) MMX main loop, 8 bytes at a time, dispatched by bpp.  The per-byte
 * /2 is done with psrlq + HBClearMask, with LBCarryMask recovering the
 * carry lost when both addends' lsbs are set.  Within each quadword the
 * left-neighbor dependency is resolved in bpp-wide "active groups",
 * propagated with psllq ShiftBpp.
 * NOTE(review): this listing is elided AND truncated -- loop labels, the
 * bpp dispatch, later bpp cases and the function tail are not visible.
 * ---------------------------------------------------------------------- */
1884 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
1885 , png_bytep prev_row)
1888 png_uint_32 FullLength;
1889 png_uint_32 MMXLength;
1893 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
1894 FullLength = row_info->rowbytes; // # of bytes to filter
1896 // Init address pointers and offset
1897 mov edi, row // edi ==> Avg(x)
1898 xor ebx, ebx // ebx ==> x
1900 mov esi, prev_row // esi ==> Prior(x)
1901 sub edx, bpp // edx ==> Raw(x-bpp)
1904 // Compute the Raw value for the first bpp bytes
1905 // Raw(x) = Avg(x) + (Prior(x)/2)
1907 mov al, [esi + ebx] // Load al with Prior(x)
1909 shr al, 1 // divide by 2
1910 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
1912 mov [edi+ebx-1], al // Write back Raw(x);
1913 // mov does not affect flags; -1 to offset inc ebx
1915 // get # of bytes to alignment
1916 mov diff, edi // take start of row
1917 add diff, ebx // add bpp
1918 add diff, 0xf // add 7 + 8 to incr past alignment boundary
1919 and diff, 0xfffffff8 // mask to alignment boundary
1920 sub diff, edi // subtract from start ==> value ebx at alignment
1923 // Compute the Raw value for the bytes upto the alignment boundary
1924 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
1928 mov cl, [esi + ebx] // load cl with Prior(x)
1929 mov al, [edx + ebx] // load al with Raw(x-bpp)
// 16-bit shr keeps the sum's 9th bit, giving the exact floor((a+b)/2).
1932 shr ax, 1 // divide by 2
1933 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
1934 cmp ebx, diff // Check if at alignment boundary
1935 mov [edi+ebx-1], al // Write back Raw(x);
1936 // mov does not affect flags; -1 to offset inc ebx
1937 jb davglp1 // Repeat until at alignment boundary
// MMXLength = largest multiple of 8 reachable from the aligned offset.
1941 sub eax, ebx // subtract alignment fix
1942 and eax, 0x00000007 // calc bytes over mult of 8
1943 sub ecx, eax // drop over bytes from original length
1946 // Now do the math for the rest of the row
// --- bpp == 3 case: three 3-byte active groups per quadword ---
1951 ActiveMask.use = 0x0000000000ffffff;
1952 ShiftBpp.use = 24; // == 3 * 8
1953 ShiftRem.use = 40; // == 64 - 24
1955 // Re-init address pointers and offset
1956 movq mm7, ActiveMask
1957 mov ebx, diff // ebx ==> x = offset to alignment boundary
1958 movq mm5, LBCarryMask
1959 mov edi, row // edi ==> Avg(x)
1960 movq mm4, HBClearMask
1961 mov esi, prev_row // esi ==> Prior(x)
1962 // PRIME the pump (load the first Raw(x-bpp) data set
1963 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
1964 // (we correct position in loop below)
1966 movq mm0, [edi + ebx] // Load mm0 with Avg(x)
1967 // Add (Prev_row/2) to Average
1969 psrlq mm2, ShiftRem // Correct position Raw(x-bpp) data
1970 movq mm1, [esi + ebx] // Load mm1 with Prior(x)
1972 pand mm3, mm1 // get lsb for each prev_row byte
1973 psrlq mm1, 1 // divide prev_row bytes by 2
1974 pand mm1, mm4 // clear invalid bit 7 of each byte
1975 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
1976 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
1977 movq mm1, mm3 // now use mm1 for getting LBCarrys
1978 pand mm1, mm2 // get LBCarrys for each byte where both
1979 // lsb's were == 1 (Only valid for active group)
1980 psrlq mm2, 1 // divide raw bytes by 2
1981 pand mm2, mm4 // clear invalid bit 7 of each byte
1982 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
1983 pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
1984 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
1986 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
1987 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 3-5
1988 movq mm2, mm0 // mov updated Raws to mm2
1989 psllq mm2, ShiftBpp // shift data to position correctly
1990 movq mm1, mm3 // now use mm1 for getting LBCarrys
1991 pand mm1, mm2 // get LBCarrys for each byte where both
1992 // lsb's were == 1 (Only valid for active group)
1993 psrlq mm2, 1 // divide raw bytes by 2
1994 pand mm2, mm4 // clear invalid bit 7 of each byte
1995 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
1996 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
1997 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2000 // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2001 psllq mm6, ShiftBpp // shift the mm6 mask to cover the last two
2003 movq mm2, mm0 // mov updated Raws to mm2
2004 psllq mm2, ShiftBpp // shift data to position correctly
2005 // Data only needs to be shifted once here to
2006 // get the correct x-bpp offset.
2007 movq mm1, mm3 // now use mm1 for getting LBCarrys
2008 pand mm1, mm2 // get LBCarrys for each byte where both
2009 // lsb's were == 1 (Only valid for active group)
2010 psrlq mm2, 1 // divide raw bytes by 2
2011 pand mm2, mm4 // clear invalid bit 7 of each byte
2012 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2013 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2015 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2018 // Now ready to write back to memory
2019 movq [edi + ebx - 8], mm0
2020 // Move updated Raw(x) to use as Raw(x-bpp) for next loop
2022 movq mm2, mm0 // mov updated Raw(x) to mm2
// --- bpp >= 4 case (6/8 bpp): only two active groups fit a quadword ---
2033 ActiveMask.use = 0xffffffffffffffff; // use shift below to clear
2034 // appropriate inactive bytes
2035 ShiftBpp.use = bpp << 3;
2036 ShiftRem.use = 64 - ShiftBpp.use;
2038 movq mm4, HBClearMask
2039 // Re-init address pointers and offset
2040 mov ebx, diff // ebx ==> x = offset to alignment boundary
2041 // Load ActiveMask and clear all bytes except for 1st active group
2042 movq mm7, ActiveMask
2043 mov edi, row // edi ==> Avg(x)
2045 mov esi, prev_row // esi ==> Prior(x)
2047 movq mm5, LBCarryMask
2048 psllq mm6, ShiftBpp // Create mask for 2nd active group
2049 // PRIME the pump (load the first Raw(x-bpp) data set
2050 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2051 // (we correct position in loop below)
2053 movq mm0, [edi + ebx]
2054 psrlq mm2, ShiftRem // shift data to position correctly
2055 movq mm1, [esi + ebx]
2056 // Add (Prev_row/2) to Average
2058 pand mm3, mm1 // get lsb for each prev_row byte
2059 psrlq mm1, 1 // divide prev_row bytes by 2
2060 pand mm1, mm4 // clear invalid bit 7 of each byte
2061 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2062 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2063 movq mm1, mm3 // now use mm1 for getting LBCarrys
2064 pand mm1, mm2 // get LBCarrys for each byte where both
2065 // lsb's were == 1 (Only valid for active group)
2066 psrlq mm2, 1 // divide raw bytes by 2
2067 pand mm2, mm4 // clear invalid bit 7 of each byte
2068 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2069 pand mm2, mm7 // Leave only Active Group 1 bytes to add to Avg
2070 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2072 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2073 movq mm2, mm0 // mov updated Raws to mm2
2074 psllq mm2, ShiftBpp // shift data to position correctly
2076 movq mm1, mm3 // now use mm1 for getting LBCarrys
2077 pand mm1, mm2 // get LBCarrys for each byte where both
2078 // lsb's were == 1 (Only valid for active group)
2079 psrlq mm2, 1 // divide raw bytes by 2
2080 pand mm2, mm4 // clear invalid bit 7 of each byte
2081 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2082 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2083 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2086 // Now ready to write back to memory
2087 movq [edi + ebx - 8], mm0
2088 // Prep Raw(x-bpp) for next loop
2089 movq mm2, mm0 // mov updated Raws to mm2
// --- bpp == 2 case: four 2-byte active groups (GRR 20000916 bugfix:
// shift constants were previously wrong for this bpp) ---
2096 ActiveMask.use = 0x000000000000ffff;
2097 ShiftBpp.use = 16; // == 2 * 8 [BUGFIX]
2098 ShiftRem.use = 48; // == 64 - 16 [BUGFIX]
2101 movq mm7, ActiveMask
2102 // Re-init address pointers and offset
2103 mov ebx, diff // ebx ==> x = offset to alignment boundary
2104 movq mm5, LBCarryMask
2105 mov edi, row // edi ==> Avg(x)
2106 movq mm4, HBClearMask
2107 mov esi, prev_row // esi ==> Prior(x)
2108 // PRIME the pump (load the first Raw(x-bpp) data set
2109 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2110 // (we correct position in loop below)
2112 movq mm0, [edi + ebx]
2113 psrlq mm2, ShiftRem // shift data to position correctly [BUGFIX]
2114 movq mm1, [esi + ebx]
2115 // Add (Prev_row/2) to Average
2117 pand mm3, mm1 // get lsb for each prev_row byte
2118 psrlq mm1, 1 // divide prev_row bytes by 2
2119 pand mm1, mm4 // clear invalid bit 7 of each byte
2121 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2122 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2123 movq mm1, mm3 // now use mm1 for getting LBCarrys
2124 pand mm1, mm2 // get LBCarrys for each byte where both
2125 // lsb's were == 1 (Only valid for active group)
2126 psrlq mm2, 1 // divide raw bytes by 2
2127 pand mm2, mm4 // clear invalid bit 7 of each byte
2128 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2129 pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
2130 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2131 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2132 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
2133 movq mm2, mm0 // mov updated Raws to mm2
2134 psllq mm2, ShiftBpp // shift data to position correctly
2135 movq mm1, mm3 // now use mm1 for getting LBCarrys
2136 pand mm1, mm2 // get LBCarrys for each byte where both
2137 // lsb's were == 1 (Only valid for active group)
2138 psrlq mm2, 1 // divide raw bytes by 2
2139 pand mm2, mm4 // clear invalid bit 7 of each byte
2140 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2141 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2142 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2144 // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2145 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
2146 movq mm2, mm0 // mov updated Raws to mm2
2147 psllq mm2, ShiftBpp // shift data to position correctly
2148 // Data only needs to be shifted once here to
2149 // get the correct x-bpp offset.
2150 movq mm1, mm3 // now use mm1 for getting LBCarrys
2151 pand mm1, mm2 // get LBCarrys for each byte where both
2152 // lsb's were == 1 (Only valid for active group)
2153 psrlq mm2, 1 // divide raw bytes by 2
2154 pand mm2, mm4 // clear invalid bit 7 of each byte
2155 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2156 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2157 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2159 // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
2160 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 6 & 7
2161 movq mm2, mm0 // mov updated Raws to mm2
2162 psllq mm2, ShiftBpp // shift data to position correctly
2163 // Data only needs to be shifted once here to
2164 // get the correct x-bpp offset.
2166 movq mm1, mm3 // now use mm1 for getting LBCarrys
2167 pand mm1, mm2 // get LBCarrys for each byte where both
2168 // lsb's were == 1 (Only valid for active group)
2169 psrlq mm2, 1 // divide raw bytes by 2
2170 pand mm2, mm4 // clear invalid bit 7 of each byte
2171 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2172 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2173 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2176 // Now ready to write back to memory
2177 movq [edi + ebx - 8], mm0
2178 // Prep Raw(x-bpp) for next loop
2179 movq mm2, mm0 // mov updated Raws to mm2
2188 // Re-init address pointers and offset
2189 mov ebx, diff // ebx ==> x = offset to alignment boundary
2190 mov edi, row // edi ==> Avg(x)
2191 cmp ebx, FullLength // Test if offset at end of array
2193 // Do Paeth decode for remaining bytes
2194 mov esi, prev_row // esi ==> Prior(x)
2196 xor ecx, ecx // zero ecx before using cl & cx in loop below
2197 sub edx, bpp // edx ==> Raw(x-bpp)
2199 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2201 mov cl, [esi + ebx] // load cl with Prior(x)
2202 mov al, [edx + ebx] // load al with Raw(x-bpp)
2205 shr ax, 1 // divide by 2
2206 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2207 cmp ebx, FullLength // Check if at end of array
2208 mov [edi+ebx-1], al // Write back Raw(x);
2209 // mov does not affect flags; -1 to offset inc ebx
2219 // Re-init address pointers and offset
2220 mov ebx, diff // ebx ==> x = offset to alignment boundary
2221 movq mm5, LBCarryMask
2222 mov edi, row // edi ==> Avg(x)
2223 movq mm4, HBClearMask
2224 mov esi, prev_row // esi ==> Prior(x)
2225 // PRIME the pump (load the first Raw(x-bpp) data set
2226 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2227 // (NO NEED to correct position in loop below)
2229 movq mm0, [edi + ebx]
2231 movq mm1, [esi + ebx]
2233 pand mm3, mm1 // get lsb for each prev_row byte
2234 psrlq mm1, 1 // divide prev_row bytes by 2
2235 pand mm3, mm2 // get LBCarrys for each byte where both
2237 psrlq mm2, 1 // divide raw bytes by 2
2238 pand mm1, mm4 // clear invalid bit 7 of each byte
2239 paddb mm0, mm3 // add LBCarrys to Avg for each byte
2240 pand mm2, mm4 // clear invalid bit 7 of each byte
2241 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2242 paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2244 movq [edi + ebx - 8], mm0
2245 movq mm2, mm0 // reuse as Raw(x-bpp)
2250 default: // bpp greater than 8
2253 movq mm5, LBCarryMask
2254 // Re-init address pointers and offset
2255 mov ebx, diff // ebx ==> x = offset to alignment boundary
2256 mov edi, row // edi ==> Avg(x)
2257 movq mm4, HBClearMask
2259 mov esi, prev_row // esi ==> Prior(x)
2260 sub edx, bpp // edx ==> Raw(x-bpp)
2262 movq mm0, [edi + ebx]
2264 movq mm1, [esi + ebx]
2265 pand mm3, mm1 // get lsb for each prev_row byte
2266 movq mm2, [edx + ebx]
2267 psrlq mm1, 1 // divide prev_row bytes by 2
2268 pand mm3, mm2 // get LBCarrys for each byte where both
2270 psrlq mm2, 1 // divide raw bytes by 2
2271 pand mm1, mm4 // clear invalid bit 7 of each byte
2272 paddb mm0, mm3 // add LBCarrys to Avg for each byte
2273 pand mm2, mm4 // clear invalid bit 7 of each byte
2274 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2276 paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2278 movq [edi + ebx - 8], mm0
2283 } // end switch ( bpp )
2286 // MMX acceleration complete now do clean-up
2287 // Check if any remaining bytes left to decode
2288 mov ebx, MMXLength // ebx ==> x = offset bytes remaining after MMX
2289 mov edi, row // edi ==> Avg(x)
2290 cmp ebx, FullLength // Test if offset at end of array
2292 // Do Paeth decode for remaining bytes
2293 mov esi, prev_row // esi ==> Prior(x)
2295 xor ecx, ecx // zero ecx before using cl & cx in loop below
2296 sub edx, bpp // edx ==> Raw(x-bpp)
2298 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2300 mov cl, [esi + ebx] // load cl with Prior(x)
2301 mov al, [edx + ebx] // load al with Raw(x-bpp)
2304 shr ax, 1 // divide by 2
2305 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2306 cmp ebx, FullLength // Check if at end of array
2307 mov [edi+ebx-1], al // Write back Raw(x);
2308 // mov does not affect flags; -1 to offset inc ebx
2311 emms // End MMX instructions; prep for possible FP instrs.
2315 // Optimized code for PNG Paeth filter decoder
//
// NOTE(review): this excerpt is missing many of the original source lines
// (local declarations, the _asm block braces, loop labels, and the jcc/jmp
// instructions that target them), so the comments below describe only the
// instructions that are visible here.
//
// Purpose: reverse the PNG "Paeth" filter in place for one row.  For each
// byte, with a = Raw(x-bpp), b = Prior(x), c = Prior(x-bpp) and predictor
// p = a + b - c, the decoder adds to the filtered byte whichever of a/b/c
// has the smallest absolute distance to p (pa/pb/pc below), preferring a,
// then b, then c on ties.  The switch on bpp (bytes per pixel) selects an
// MMX inner loop specialized for that pixel width; scalar pre/post loops
// handle the unaligned head and the leftover tail of the row.
//
// Parameters: row_info supplies pixel_depth and rowbytes; row is the
// filtered row, updated in place.  The third parameter (presumably
// prev_row, the reconstructed prior row read via esi) is on a line not
// shown here — TODO confirm against the full source.
2317 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
2320    png_uint_32 FullLength;
2321    png_uint_32 MMXLength;
2326    int patemp, pbtemp, pctemp;
2328    bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
2329    FullLength  = row_info->rowbytes; // # of bytes to filter
2332         xor ebx, ebx        // ebx ==> x offset
2334         xor edx, edx        // edx ==> x-bpp offset
2338         // Compute the Raw value for the first bpp bytes
2339         // Note: the formula works out to be always
2340         //   Paeth(x) = Raw(x) + Prior(x)      where x < bpp
2346         mov [edi + ebx - 1], al
2348         // get # of bytes to alignment
2349         mov diff, edi         // take start of row
2350         add diff, ebx         // add bpp
2352         add diff, 0xf         // add 7 + 8 to incr past alignment boundary
2353         and diff, 0xfffffff8  // mask to alignment boundary
2354         sub diff, edi         // subtract from start ==> value ebx at alignment
         // --- scalar head loop body: full Paeth predictor per byte, using
         //     patemp/pbtemp/pctemp as spill slots for |pav|/|pbv|/|pcv| ---
2359         // pav = p - a = (a + b - c) - a = b - c
2360         mov al, [esi + ebx]   // load Prior(x) into al
2361         mov cl, [esi + edx]   // load Prior(x-bpp) into cl
2362         sub eax, ecx          // subtract Prior(x-bpp)
2363         mov patemp, eax       // Save pav for later use
2365         // pbv = p - b = (a + b - c) - b = a - c
2366         mov al, [edi + edx]   // load Raw(x-bpp) into al
2367         sub eax, ecx          // subtract Prior(x-bpp)
2369         // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2370         add eax, patemp       // pcv = pav + pbv
2372         test eax, 0x80000000  // sign test; branch (not shown) skips the neg
2374         neg eax               // reverse sign of neg values
2376         mov pctemp, eax       // save pc for later use
2378         test ecx, 0x80000000
2380         neg ecx               // reverse sign of neg values
2382         mov pbtemp, ecx       // save pb for later use
2385         test eax, 0x80000000
2387         neg eax               // reverse sign of neg values
2389         mov patemp, eax       // save pa for later use
2393         // pa > pb; now test if pb <= pc
2396         // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2397         mov cl, [esi + edx]  // load Prior(x-bpp) into cl
2400         // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
2401         mov cl, [esi + ebx]   // load Prior(x) into cl
2404         // pa <= pb; now test if pa <= pc
2407         // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2408         mov cl, [esi + edx]  // load Prior(x-bpp) into cl
2411         // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
2412         mov cl, [edi + edx]  // load Raw(x-bpp) into cl
2416         // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
2417         add [edi + ebx - 1], cl
2423         sub eax, ebx          // subtract alignment fix
2424         and eax, 0x00000007   // calc bytes over mult of 8
2425         sub ecx, eax          // drop over bytes from original length
2428    // Now do the math for the rest of the row
         // --- bpp == 3: process 8 bytes per iteration as three groups
         //     (bytes 0-2, 3-5, 6-7), widening bytes to words with
         //     punpck*bw so pcmpgtw can do signed compares ---
2433         ActiveMask.use = 0x0000000000ffffff;
2434         ActiveMaskEnd.use = 0xffff000000000000;
2435         ShiftBpp.use = 24;    // == bpp(3) * 8
2436         ShiftRem.use = 40;    // == 64 - 24
2443         // PRIME the pump (load the first Raw(x-bpp) data set
2444         movq mm1, [edi+ebx-8]
2446           psrlq mm1, ShiftRem       // shift last 3 bytes to 1st 3 bytes
2447           movq mm2, [esi + ebx]     // load b=Prior(x)
2448           punpcklbw mm1, mm0        // Unpack High bytes of a
2449           movq mm3, [esi+ebx-8]     // Prep c=Prior(x-bpp) bytes
2450           punpcklbw mm2, mm0        // Unpack High bytes of b
2451           psrlq mm3, ShiftRem       // shift last 3 bytes to 1st 3 bytes
2452           // pav = p - a = (a + b - c) - a = b - c
2454           punpcklbw mm3, mm0        // Unpack High bytes of c
2455           // pbv = p - b = (a + b - c) - b = a - c
2459           // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2463           //  pa = abs(p-a) = abs(pav)
2464           //  pb = abs(p-b) = abs(pbv)
2465           //  pc = abs(p-c) = abs(pcv)
2466           pcmpgtw mm0, mm4    // Create mask pav bytes < 0
2468           pand mm0, mm4       // Only pav bytes < 0 in mm7
2469           pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
2471           pand mm7, mm5       // Only pbv bytes < 0 in mm0
2475           pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
2476           pand mm0, mm6       // Only pav bytes < 0 in mm7
2482           pcmpgtw mm7, mm5    // pa > pb?
2484           // use mm7 mask to merge pa & pb
2486           // use mm0 mask copy to merge a & b
2492           // test ((pa <= pb)? pa:pb) <= pc
2493           pcmpgtw mm7, mm6    // pab > pc?
2500           movq mm3, [esi + ebx]   // load c=Prior(x-bpp)
2501           pand mm7, ActiveMask
2502           movq mm2, mm3       // load b=Prior(x) step 1
2503           paddb mm7, [edi + ebx]  // add Paeth predictor with Raw(x)
2504           punpcklbw mm3, mm0  // Unpack High bytes of c
2505           movq [edi + ebx], mm7   // write back updated value
2506           movq mm1, mm7       // Now mm1 will be used as Raw(x-bpp)
2507           // Now do Paeth for 2nd set of bytes (3-5)
2508           psrlq mm2, ShiftBpp // load b=Prior(x) step 2
2509           punpcklbw mm1, mm0  // Unpack High bytes of a
2511           punpcklbw mm2, mm0  // Unpack High bytes of b
2512           // pbv = p - b = (a + b - c) - b = a - c
2514           // pav = p - a = (a + b - c) - a = b - c
2518           // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
2519           //       pav + pbv = pbv + pav
2523           //  pa = abs(p-a) = abs(pav)
2524           //  pb = abs(p-b) = abs(pbv)
2525           //  pc = abs(p-c) = abs(pcv)
2526           pcmpgtw mm0, mm5       // Create mask pbv bytes < 0
2527           pcmpgtw mm7, mm4       // Create mask pav bytes < 0
2528           pand mm0, mm5          // Only pbv bytes < 0 in mm0
2529           pand mm7, mm4          // Only pav bytes < 0 in mm7
2535           pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
2536           pand mm0, mm6          // Only pav bytes < 0 in mm7
2541           pcmpgtw mm7, mm5       // pa > pb?
2543           // use mm7 mask to merge pa & pb
2545           // use mm0 mask copy to merge a & b
2551           // test ((pa <= pb)? pa:pb) <= pc
2552           pcmpgtw mm7, mm6       // pab > pc?
2553           movq mm2, [esi + ebx]  // load b=Prior(x)
2560           movq mm3, mm2          // load c=Prior(x-bpp) step 1
2561           pand mm7, ActiveMask
2562           punpckhbw mm2, mm0     // Unpack High bytes of b
2563           psllq mm7, ShiftBpp    // Shift bytes to 2nd group of 3 bytes
2564           // pav = p - a = (a + b - c) - a = b - c
2566           paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2567           psllq mm3, ShiftBpp    // load c=Prior(x-bpp) step 2
2568           movq [edi + ebx], mm7  // write back updated value
2570           punpckhbw mm3, mm0     // Unpack High bytes of c
2571           psllq mm1, ShiftBpp    // Shift bytes
2572                                  // Now mm1 will be used as Raw(x-bpp)
2573           // Now do Paeth for 3rd, and final, set of bytes (6-7)
2575           punpckhbw mm1, mm0     // Unpack High bytes of a
2577           // pbv = p - b = (a + b - c) - b = a - c
2579           // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2585           //  pa = abs(p-a) = abs(pav)
2586           //  pb = abs(p-b) = abs(pbv)
2587           //  pc = abs(p-c) = abs(pcv)
2588           pcmpgtw mm0, mm4    // Create mask pav bytes < 0
2589           pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
2590           pand mm0, mm4       // Only pav bytes < 0 in mm7
2591           pand mm7, mm5       // Only pbv bytes < 0 in mm0
2597           pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
2598           pand mm0, mm6       // Only pav bytes < 0 in mm7
2603           pcmpgtw mm7, mm5    // pa > pb?
2605           // use mm0 mask copy to merge a & b
2607           // use mm7 mask to merge pa & pb
2613           // test ((pa <= pb)? pa:pb) <= pc
2614           pcmpgtw mm7, mm6    // pab > pc?
2620           // Step ebx to next set of 8 bytes and repeat loop til done
2622           pand mm1, ActiveMaskEnd
2623           paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2626           pxor mm0, mm0       // pxor does not affect flags
2627           movq [edi + ebx - 8], mm1 // write back updated value
2628                               // mm1 will be used as Raw(x-bpp) next loop
2629                               // mm3 ready to be used as Prior(x-bpp) next loop
         // --- larger even bpp (e.g. 6/4-style case): two groups of 4 bytes
         //     per 8-byte chunk; same predictor-select sequence as above ---
2639         ActiveMask.use  = 0x00000000ffffffff;
2640         ActiveMask2.use = 0xffffffff00000000;
2641         ShiftBpp.use = bpp << 3;    // == bpp * 8
2642         ShiftRem.use = 64 - ShiftBpp.use;
2648         // PRIME the pump (load the first Raw(x-bpp) data set
2649         movq mm1, [edi+ebx-8]
2652           // Must shift to position Raw(x-bpp) data
2654           // Do first set of 4 bytes
2655           movq mm3, [esi+ebx-8]     // read c=Prior(x-bpp) bytes
2656           punpcklbw mm1, mm0        // Unpack Low bytes of a
2657           movq mm2, [esi + ebx]     // load b=Prior(x)
2658           punpcklbw mm2, mm0        // Unpack Low bytes of b
2659           // Must shift to position Prior(x-bpp) data
2661           // pav = p - a = (a + b - c) - a = b - c
2663           punpcklbw mm3, mm0        // Unpack Low bytes of c
2664           // pbv = p - b = (a + b - c) - b = a - c
2668           // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2671           //  pa = abs(p-a) = abs(pav)
2672           //  pb = abs(p-b) = abs(pbv)
2673           //  pc = abs(p-c) = abs(pcv)
2674           pcmpgtw mm0, mm4    // Create mask pav bytes < 0
2676           pand mm0, mm4       // Only pav bytes < 0 in mm7
2677           pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
2679           pand mm7, mm5       // Only pbv bytes < 0 in mm0
2683           pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
2684           pand mm0, mm6       // Only pav bytes < 0 in mm7
2690           pcmpgtw mm7, mm5    // pa > pb?
2692           // use mm7 mask to merge pa & pb
2694           // use mm0 mask copy to merge a & b
2700           // test ((pa <= pb)? pa:pb) <= pc
2701           pcmpgtw mm7, mm6    // pab > pc?
2708           movq mm3, [esi + ebx - 8]  // load c=Prior(x-bpp)
2709           pand mm7, ActiveMask
2711           movq mm2, [esi + ebx]      // load b=Prior(x) step 1
2712           paddb mm7, [edi + ebx]     // add Paeth predictor with Raw(x)
2714           movq [edi + ebx], mm7      // write back updated value
2715           movq mm1, [edi+ebx-8]
2721           punpckhbw mm3, mm0         // Unpack High bytes of c
2723           // Do second set of 4 bytes
2724           punpckhbw mm2, mm0         // Unpack High bytes of b
2725           punpckhbw mm1, mm0         // Unpack High bytes of a
2726           // pav = p - a = (a + b - c) - a = b - c
2728           // pbv = p - b = (a + b - c) - b = a - c
2732           // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2735           //  pa = abs(p-a) = abs(pav)
2736           //  pb = abs(p-b) = abs(pbv)
2737           //  pc = abs(p-c) = abs(pcv)
2738           pcmpgtw mm0, mm4    // Create mask pav bytes < 0
2740           pand mm0, mm4       // Only pav bytes < 0 in mm7
2741           pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
2743           pand mm7, mm5       // Only pbv bytes < 0 in mm0
2747           pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
2748           pand mm0, mm6       // Only pav bytes < 0 in mm7
2754           pcmpgtw mm7, mm5    // pa > pb?
2756           // use mm7 mask to merge pa & pb
2758           // use mm0 mask copy to merge a & b
2764           // test ((pa <= pb)? pa:pb) <= pc
2765           pcmpgtw mm7, mm6    // pab > pc?
2772           // Step ebx to next set of 8 bytes and repeat loop til done
2775           paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2777           movq [edi + ebx - 8], mm1  // write back updated value
2778                                 // mm1 will be used as Raw(x-bpp) next loop
         // --- another even-bpp variant: two groups of 4 bytes per chunk,
         //     re-reading Raw(x-bpp) from memory between groups ---
2786         ActiveMask.use  = 0x00000000ffffffff;
2792         // PRIME the pump (load the first Raw(x-bpp) data set
2793         movq mm1, [edi+ebx-8]     // Only time should need to read
2794                                   //  a=Raw(x-bpp) bytes
2796           // Do first set of 4 bytes
2797           movq mm3, [esi+ebx-8]     // read c=Prior(x-bpp) bytes
2798           punpckhbw mm1, mm0        // Unpack Low bytes of a
2799           movq mm2, [esi + ebx]     // load b=Prior(x)
2800           punpcklbw mm2, mm0        // Unpack High bytes of b
2801           // pav = p - a = (a + b - c) - a = b - c
2803           punpckhbw mm3, mm0        // Unpack High bytes of c
2804           // pbv = p - b = (a + b - c) - b = a - c
2808           // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2811           //  pa = abs(p-a) = abs(pav)
2812           //  pb = abs(p-b) = abs(pbv)
2813           //  pc = abs(p-c) = abs(pcv)
2814           pcmpgtw mm0, mm4    // Create mask pav bytes < 0
2816           pand mm0, mm4       // Only pav bytes < 0 in mm7
2817           pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
2819           pand mm7, mm5       // Only pbv bytes < 0 in mm0
2823           pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
2824           pand mm0, mm6       // Only pav bytes < 0 in mm7
2830           pcmpgtw mm7, mm5    // pa > pb?
2832           // use mm7 mask to merge pa & pb
2834           // use mm0 mask copy to merge a & b
2840           // test ((pa <= pb)? pa:pb) <= pc
2841           pcmpgtw mm7, mm6    // pab > pc?
2848           movq mm3, [esi + ebx]   // load c=Prior(x-bpp)
2849           pand mm7, ActiveMask
2850           movq mm2, mm3           // load b=Prior(x) step 1
2851           paddb mm7, [edi + ebx]  // add Paeth predictor with Raw(x)
2852           punpcklbw mm3, mm0      // Unpack High bytes of c
2853           movq [edi + ebx], mm7   // write back updated value
2854           movq mm1, mm7           // Now mm1 will be used as Raw(x-bpp)
2855           // Do second set of 4 bytes
2856           punpckhbw mm2, mm0      // Unpack Low bytes of b
2857           punpcklbw mm1, mm0      // Unpack Low bytes of a
2858           // pav = p - a = (a + b - c) - a = b - c
2860           // pbv = p - b = (a + b - c) - b = a - c
2864           // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2867           //  pa = abs(p-a) = abs(pav)
2868           //  pb = abs(p-b) = abs(pbv)
2869           //  pc = abs(p-c) = abs(pcv)
2870           pcmpgtw mm0, mm4    // Create mask pav bytes < 0
2872           pand mm0, mm4       // Only pav bytes < 0 in mm7
2873           pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
2875           pand mm7, mm5       // Only pbv bytes < 0 in mm0
2879           pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
2880           pand mm0, mm6       // Only pav bytes < 0 in mm7
2886           pcmpgtw mm7, mm5    // pa > pb?
2888           // use mm7 mask to merge pa & pb
2890           // use mm0 mask copy to merge a & b
2896           // test ((pa <= pb)? pa:pb) <= pc
2897           pcmpgtw mm7, mm6    // pab > pc?
2904           // Step ebx to next set of 8 bytes and repeat loop til done
2907           paddb mm1, [edi + ebx - 8]  // add Paeth predictor with Raw(x)
2909           movq [edi + ebx - 8], mm1   // write back updated value
2910                                 // mm1 will be used as Raw(x-bpp) next loop
         // --- final MMX variant (two 4-byte groups, a re-read from memory) ---
2917         ActiveMask.use  = 0x00000000ffffffff;
2923         // PRIME the pump (load the first Raw(x-bpp) data set
2924         movq mm1, [edi+ebx-8]     // Only time should need to read
2925                                   //  a=Raw(x-bpp) bytes
2927           // Do first set of 4 bytes
2928           movq mm3, [esi+ebx-8]     // read c=Prior(x-bpp) bytes
2929           punpcklbw mm1, mm0        // Unpack Low bytes of a
2930           movq mm2, [esi + ebx]     // load b=Prior(x)
2931           punpcklbw mm2, mm0        // Unpack Low bytes of b
2932           // pav = p - a = (a + b - c) - a = b - c
2934           punpcklbw mm3, mm0        // Unpack Low bytes of c
2935           // pbv = p - b = (a + b - c) - b = a - c
2939           // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2942           //  pa = abs(p-a) = abs(pav)
2943           //  pb = abs(p-b) = abs(pbv)
2944           //  pc = abs(p-c) = abs(pcv)
2945           pcmpgtw mm0, mm4    // Create mask pav bytes < 0
2947           pand mm0, mm4       // Only pav bytes < 0 in mm7
2948           pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
2950           pand mm7, mm5       // Only pbv bytes < 0 in mm0
2954           pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
2955           pand mm0, mm6       // Only pav bytes < 0 in mm7
2961           pcmpgtw mm7, mm5    // pa > pb?
2963           // use mm7 mask to merge pa & pb
2965           // use mm0 mask copy to merge a & b
2971           // test ((pa <= pb)? pa:pb) <= pc
2972           pcmpgtw mm7, mm6    // pab > pc?
2979           movq mm3, [esi+ebx-8]     // read c=Prior(x-bpp) bytes
2980           pand mm7, ActiveMask
2981           movq mm2, [esi + ebx]     // load b=Prior(x)
2982           paddb mm7, [edi + ebx]    // add Paeth predictor with Raw(x)
2983           punpckhbw mm3, mm0        // Unpack High bytes of c
2984           movq [edi + ebx], mm7     // write back updated value
2985           movq mm1, [edi+ebx-8]     // read a=Raw(x-bpp) bytes
2987           // Do second set of 4 bytes
2988           punpckhbw mm2, mm0        // Unpack High bytes of b
2989           punpckhbw mm1, mm0        // Unpack High bytes of a
2990           // pav = p - a = (a + b - c) - a = b - c
2992           // pbv = p - b = (a + b - c) - b = a - c
2996           // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2999           //  pa = abs(p-a) = abs(pav)
3000           //  pb = abs(p-b) = abs(pbv)
3001           //  pc = abs(p-c) = abs(pcv)
3002           pcmpgtw mm0, mm4    // Create mask pav bytes < 0
3004           pand mm0, mm4       // Only pav bytes < 0 in mm7
3005           pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
3007           pand mm7, mm5       // Only pbv bytes < 0 in mm0
3011           pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
3012           pand mm0, mm6       // Only pav bytes < 0 in mm7
3018           pcmpgtw mm7, mm5    // pa > pb?
3020           // use mm7 mask to merge pa & pb
3022           // use mm0 mask copy to merge a & b
3028           // test ((pa <= pb)? pa:pb) <= pc
3029           pcmpgtw mm7, mm6    // pab > pc?
3036           // Step ebx to next set of 8 bytes and repeat loop til done
3039           paddb mm1, [edi + ebx - 8]  // add Paeth predictor with Raw(x)
3041           movq [edi + ebx - 8], mm1   // write back updated value
3042                                 // mm1 will be used as Raw(x-bpp) next loop
         // --- scalar fallback (bpp with no MMX specialization): full
         //     byte-at-a-time Paeth decode for the whole remaining span ---
3058         // Do Paeth decode for remaining bytes
3060         xor ecx, ecx                // zero ecx before using cl & cx in loop below
3061         sub edx, bpp                // Set edx = ebx - bpp
3064         // pav = p - a = (a + b - c) - a = b - c
3065         mov al, [esi + ebx]   // load Prior(x) into al
3066         mov cl, [esi + edx]   // load Prior(x-bpp) into cl
3067         sub eax, ecx          // subtract Prior(x-bpp)
3068         mov patemp, eax       // Save pav for later use
3070         // pbv = p - b = (a + b - c) - b = a - c
3071         mov al, [edi + edx]   // load Raw(x-bpp) into al
3072         sub eax, ecx          // subtract Prior(x-bpp)
3074         // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3075         add eax, patemp       // pcv = pav + pbv
3077         test eax, 0x80000000
3079         neg eax               // reverse sign of neg values
3081         mov pctemp, eax       // save pc for later use
3083         test ecx, 0x80000000
3085         neg ecx               // reverse sign of neg values
3087         mov pbtemp, ecx       // save pb for later use
3090         test eax, 0x80000000
3092         neg eax               // reverse sign of neg values
3094         mov patemp, eax       // save pa for later use
3098         // pa > pb; now test if pb <= pc
3101         // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3102         mov cl, [esi + edx]  // load Prior(x-bpp) into cl
3105         // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3106         mov cl, [esi + ebx]   // load Prior(x) into cl
3109         // pa <= pb; now test if pa <= pc
3112         // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3113         mov cl, [esi + edx]  // load Prior(x-bpp) into cl
3116         // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3117         mov cl, [edi + edx]  // load Raw(x-bpp) into cl
3121         // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3122         add [edi + ebx - 1], cl
3128      return;                   // No need to go further with this one
3129    } // end switch ( bpp )
3132      // MMX acceleration complete now do clean-up
3133      // Check if any remaining bytes left to decode
         // --- scalar tail loop after an MMX case: same byte-at-a-time
         //     Paeth decode for the bytes past MMXLength ---
3139      // Do Paeth decode for remaining bytes
3141      xor ecx, ecx                // zero ecx before using cl & cx in loop below
3142      sub edx, bpp                // Set edx = ebx - bpp
3145      // pav = p - a = (a + b - c) - a = b - c
3146      mov al, [esi + ebx]   // load Prior(x) into al
3147      mov cl, [esi + edx]   // load Prior(x-bpp) into cl
3148      sub eax, ecx          // subtract Prior(x-bpp)
3149      mov patemp, eax       // Save pav for later use
3151      // pbv = p - b = (a + b - c) - b = a - c
3152      mov al, [edi + edx]   // load Raw(x-bpp) into al
3153      sub eax, ecx          // subtract Prior(x-bpp)
3155      // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3156      add eax, patemp       // pcv = pav + pbv
3158      test eax, 0x80000000
3160      neg eax               // reverse sign of neg values
3162      mov pctemp, eax       // save pc for later use
3164      test ecx, 0x80000000
3166      neg ecx               // reverse sign of neg values
3168      mov pbtemp, ecx       // save pb for later use
3171      test eax, 0x80000000
3173      neg eax               // reverse sign of neg values
3175      mov patemp, eax       // save pa for later use
3179      // pa > pb; now test if pb <= pc
3182      // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3183      mov cl, [esi + edx]  // load Prior(x-bpp) into cl
3186      // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3187      mov cl, [esi + ebx]   // load Prior(x) into cl
3190      // pa <= pb; now test if pa <= pc
3193      // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3194      mov cl, [esi + edx]  // load Prior(x-bpp) into cl
3197      // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3198      mov cl, [edi + edx]  // load Raw(x-bpp) into cl
3202      // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3203      add [edi + ebx - 1], cl
3207      emms                       // End MMX instructions; prep for possible FP instrs.
3211 // Optimized code for PNG Sub filter decoder
//
// NOTE(review): this excerpt is missing many of the original source lines
// (the _asm block braces, loop labels, and branch instructions), so the
// comments below only describe the visible instructions.
//
// Purpose: reverse the PNG "Sub" filter in place for one row:
//   Raw(x) = Sub(x) + Raw(x-bpp)   (mod 256, per byte)
// — see the reference C loop quoted in the comment below (lines 3306-3310).
// The switch on bpp selects an MMX inner loop specialized for the pixel
// width; scalar loops handle the unaligned head and leftover tail.
//
// Parameters: row_info supplies pixel_depth (to compute bpp) and rowbytes;
// row is the filtered row, updated in place (esi = lp = row walks the
// already-decoded bytes, edi = rp = row + bpp walks the bytes being decoded).
3213 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
3217    png_uint_32 FullLength;
3218    png_uint_32 MMXLength;
3221    bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3222    FullLength  = row_info->rowbytes - bpp; // # of bytes to filter
3225         mov esi, edi               // lp = row
3226         add edi, bpp               // rp = row + bpp
3228         // get # of bytes to alignment
3229         mov diff, edi               // take start of row
3230         add diff, 0xf               // add 7 + 8 to incr past
3231                                     // alignment boundary
3233         and diff, 0xfffffff8        // mask to alignment boundary
3234         sub diff, edi               // subtract from start ==> value
3247         sub edx, ebx                // subtract alignment fix
3248         and edx, 0x00000007         // calc bytes over mult of 8
3249         sub ecx, edx                // drop over bytes from length
3253    // Now do the math for the rest of the row
         // --- bpp == 3: 8 bytes per iteration processed as three active
         //     groups, shifting the running Raw bytes into place and
         //     masking so each paddb only touches its own group ---
3258         ActiveMask.use  = 0x0000ffffff000000;
3259         ShiftBpp.use = 24;          // == 3 * 8
3260         ShiftRem.use  = 40;         // == 64 - 24
3263         movq mm7, ActiveMask        // Load ActiveMask for 2nd active byte group
3264         mov esi, edi                // lp = row
3265         add edi, bpp                // rp = row + bpp
3268         psllq mm6, ShiftBpp         // Move mask in mm6 to cover 3rd active
3270         // PRIME the pump (load the first Raw(x-bpp) data set
3271         movq mm1, [edi+ebx-8]
3273           psrlq mm1, ShiftRem       // Shift data for adding 1st bpp bytes
3274                                     // no need for mask; shift clears inactive bytes
3275           // Add 1st active group
3278           // Add 2nd active group
3279           movq mm1, mm0             // mov updated Raws to mm1
3280           psllq mm1, ShiftBpp       // shift data to position correctly
3281           pand mm1, mm7             // mask to use only 2nd active group
3283           // Add 3rd active group
3284           movq mm1, mm0             // mov updated Raws to mm1
3285           psllq mm1, ShiftBpp       // shift data to position correctly
3286           pand mm1, mm6             // mask to use only 3rd active group
3290           movq [edi+ebx-8], mm0     // Write updated Raws back to array
3291           // Prep for doing 1st add at top of loop
3300         // Placed here just in case this is a duplicate of the
3301         // non-MMX code for the SUB filter in png_read_filter_row below
3306         //  bpp = (row_info->pixel_depth + 7) >> 3;
3307         //  for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
3308         //     i < row_info->rowbytes; i++, rp++, lp++)
3310         //      *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
3317         mov esi, edi               // lp = row
3319         add edi, bpp               // rp = row + bpp
         // --- even bpp where one shift covers the row: two active groups,
         //     no masks needed because the shift clears inactive bytes ---
3336         ShiftBpp.use = bpp << 3;
3337         ShiftRem.use = 64 - ShiftBpp.use;
3341         mov esi, edi               // lp = row
3342         add edi, bpp               // rp = row + bpp
3343         // PRIME the pump (load the first Raw(x-bpp) data set
3344         movq mm1, [edi+ebx-8]
3346           psrlq mm1, ShiftRem       // Shift data for adding 1st bpp bytes
3347                                     // no need for mask; shift clears inactive bytes
3350           // Add 2nd active group
3351           movq mm1, mm0             // mov updated Raws to mm1
3352           psllq mm1, ShiftBpp       // shift data to position correctly
3353                                     // there is no need for any mask
3354                                     // since shift clears inactive bits/bytes
3358           movq [edi+ebx-8], mm0
3359           movq mm1, mm0             // Prep for doing 1st add at top of loop
         // --- bpp == 2: four 2-byte active groups per 8-byte chunk,
         //     masks in mm7/mm6/mm5 select groups 2, 3 and 4 ---
3367         ActiveMask.use  = 0x00000000ffff0000;
3368         ShiftBpp.use = 16;          // == 2 * 8
3369         ShiftRem.use  = 48;         // == 64 - 16
3371         movq mm7, ActiveMask        // Load ActiveMask for 2nd active byte group
3375         psllq mm6, ShiftBpp         // Move mask in mm6 to cover 3rd active
3377         mov esi, edi                // lp = row
3379         add edi, bpp                // rp = row + bpp
3380         psllq mm5, ShiftBpp         // Move mask in mm5 to cover 4th active
3382         // PRIME the pump (load the first Raw(x-bpp) data set
3383         movq mm1, [edi+ebx-8]
3385           // Add 1st active group
3386           psrlq mm1, ShiftRem       // Shift data for adding 1st bpp bytes
3387                                     // no need for mask; shift clears inactive
3391           // Add 2nd active group
3392           movq mm1, mm0             // mov updated Raws to mm1
3393           psllq mm1, ShiftBpp       // shift data to position correctly
3394           pand mm1, mm7             // mask to use only 2nd active group
3396           // Add 3rd active group
3397           movq mm1, mm0             // mov updated Raws to mm1
3398           psllq mm1, ShiftBpp       // shift data to position correctly
3399           pand mm1, mm6             // mask to use only 3rd active group
3401           // Add 4th active group
3402           movq mm1, mm0             // mov updated Raws to mm1
3403           psllq mm1, ShiftBpp       // shift data to position correctly
3404           pand mm1, mm5             // mask to use only 4th active group
3408           movq [edi+ebx-8], mm0     // Write updated Raws back to array
3409           movq mm1, mm0             // Prep for doing 1st add at top of loop
         // --- bpp == 8: Raw(x-bpp) is exactly the previous 8-byte chunk,
         //     so the loop is an unrolled load/add/store pipeline using
         //     all eight MMX registers ---
3419         mov esi, edi               // lp = row
3420         add edi, bpp               // rp = row + bpp
3422         movq mm7, [edi+ebx-8]      // PRIME the pump (load the first
3423                                    // Raw(x-bpp) data set
3424         and ecx, 0x0000003f        // calc bytes over mult of 64
3426           movq mm0, [edi+ebx]      // Load Sub(x) for 1st 8 bytes
3428           movq mm1, [edi+ebx+8]    // Load Sub(x) for 2nd 8 bytes
3429           movq [edi+ebx], mm0      // Write Raw(x) for 1st 8 bytes
3430                            // Now mm0 will be used as Raw(x-bpp) for
3431                            // the 2nd group of 8 bytes.  This will be
3432                            // repeated for each group of 8 bytes with
3433                            // the 8th group being used as the Raw(x-bpp)
3434                            // for the 1st group of the next loop.
3436           movq mm2, [edi+ebx+16]   // Load Sub(x) for 3rd 8 bytes
3437           movq [edi+ebx+8], mm1    // Write Raw(x) for 2nd 8 bytes
3439           movq mm3, [edi+ebx+24]   // Load Sub(x) for 4th 8 bytes
3440           movq [edi+ebx+16], mm2   // Write Raw(x) for 3rd 8 bytes
3442           movq mm4, [edi+ebx+32]   // Load Sub(x) for 5th 8 bytes
3443           movq [edi+ebx+24], mm3   // Write Raw(x) for 4th 8 bytes
3445           movq mm5, [edi+ebx+40]   // Load Sub(x) for 6th 8 bytes
3446           movq [edi+ebx+32], mm4   // Write Raw(x) for 5th 8 bytes
3448           movq mm6, [edi+ebx+48]   // Load Sub(x) for 7th 8 bytes
3449           movq [edi+ebx+40], mm5   // Write Raw(x) for 6th 8 bytes
3451           movq mm7, [edi+ebx+56]   // Load Sub(x) for 8th 8 bytes
3452           movq [edi+ebx+48], mm6   // Write Raw(x) for 7th 8 bytes
3456           movq [edi+ebx-8], mm7    // Write Raw(x) for 8th 8 bytes
3465           movq [edi+ebx-8], mm0    // use -8 to offset early add to ebx
3466           movq mm7, mm0            // Move calculated Raw(x) data to mm7 to
3467                                    // be the new Raw(x-bpp) for the next loop
3474    default:                // bpp greater than 8 bytes
3479         mov esi, edi               // lp = row
3480         add edi, bpp               // rp = row + bpp
3487           movq [edi+ebx-8], mm0    // mov does not affect flags; -8 to offset
3494    } // end switch ( bpp )
         // --- scalar tail: finish any bytes past MMXLength one at a time ---
3501         mov esi, edi               // lp = row
3503         add edi, bpp               // rp = row + bpp
3511         emms                       // End MMX instructions; prep for possible FP instrs.
3515 // Optimized code for PNG Up filter decoder
//
// Up filter (PNG filter type 2): Raw(x) = Filt(x) + Prior(x), modulo 256.
// This mirrors the C fallback in png_read_filter_row() below, but works
// in three stages: a 64-byte-per-iteration unrolled MMX loop, an
// 8-byte-per-iteration MMX loop, and a byte-at-a-time x86 tail loop.
// NOTE(review): the loads from [esi+ebx] paired with read-modify-write
// of [edi+ebx] suggest esi = prev_row and edi = row with ebx as the
// running byte offset -- confirm against the register setup earlier in
// this function.
3517 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
3521 len = row_info->rowbytes; // # of bytes to filter
3524 // get # of bytes to alignment
// Byte loop that advances ebx until the row pointer is aligned for the
// MMX block loop below.
3539 mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
// Compute how many trailing bytes fall outside a multiple of 64 and
// shrink the main-loop count (ecx) accordingly.
3544 sub edx, ebx // subtract alignment fix
3545 and edx, 0x0000003f // calc bytes over mult of 64
3546 sub ecx, edx // drop over bytes from length
3547 // Unrolled loop - use all MMX registers and interleave to reduce
3548 // number of branch instructions (loops) and reduce partial stalls
// Each iteration handles 64 bytes: eight load(prev)/load(row)/store(row)
// triples using mm0-mm7, interleaved so a store never immediately
// follows the load it depends on. The paddb additions sit on the
// elided lines between these loads and stores.
3552 movq mm3, [esi+ebx+8]
3554 movq mm2, [edi+ebx+8]
3557 movq mm5, [esi+ebx+16]
3558 movq [edi+ebx+8], mm2
3559 movq mm4, [edi+ebx+16]
3560 movq mm7, [esi+ebx+24]
3562 movq mm6, [edi+ebx+24]
3563 movq [edi+ebx+16], mm4
3565 movq mm1, [esi+ebx+32]
3566 movq [edi+ebx+24], mm6
3567 movq mm0, [edi+ebx+32]
3568 movq mm3, [esi+ebx+40]
3570 movq mm2, [edi+ebx+40]
3571 movq [edi+ebx+32], mm0
3573 movq mm5, [esi+ebx+48]
3574 movq [edi+ebx+40], mm2
3575 movq mm4, [edi+ebx+48]
3576 movq mm7, [esi+ebx+56]
3578 movq mm6, [edi+ebx+56]
3579 movq [edi+ebx+48], mm4
3583 movq [edi+ebx-8], mm6 // (+56)movq does not affect flags;
3584 // -8 to offset add ebx
3587 cmp edx, 0 // Test for bytes over mult of 64
3591 // 2 lines added by lcreeve@netins.net
3592 // (mail 11 Jul 98 in png-implement list)
3593 cmp edx, 8 //test for less than 8 bytes
3598 and edx, 0x00000007 // calc bytes over mult of 8
3599 sub ecx, edx // drop over bytes from length
3601 // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
3608 movq [edi+ebx-8], mm0 // movq does not affect flags; -8 to offset add ebx
3610 cmp edx, 0 // Test for bytes over mult of 8
// Fold the leftover (<8) byte count back into ecx and finish with a
// scalar x86 loop.
3614 add ecx, edx // move over byte count into counter
3615 // Loop using x86 registers to update remaining bytes
3621 mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
3624 // Conversion of filtered row completed
3625 emms // End MMX instructions; prep for possible FP instrs.
3630 // Optimized png_read_filter_row routines
//
// Dispatcher: un-filters one row in place according to `filter` (the PNG
// filter-type byte), choosing the MMX assembler implementations above
// when the CPU supports MMX and the row is big enough to be worth it,
// and falling back to plain C otherwise.
3632 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
3633 row, png_bytep prev_row, int filter)
// mmx_supported == 2 is the "not yet probed" sentinel (file-scope
// initializer near the top of this file). NOTE(review): the probe call
// itself is on an elided line -- presumably the CPUID-based check defined
// at the top of the file runs inside this branch; confirm.
3639 if (mmx_supported == 2) {
3644 png_debug(1, "in png_read_filter_row\n");
// PNG_DEBUG only: build a human-readable filter name for the trace below.
3647 case 0: sprintf(filnm, "none");
3649 case 1: sprintf(filnm, "sub-%s", "MMX");
3651 case 2: sprintf(filnm, "up-%s", "MMX");
3653 case 3: sprintf(filnm, "avg-%s", "MMX");
3655 case 4: sprintf(filnm, "Paeth-%s", "MMX");
3657 default: sprintf(filnm, "unknw");
3660 png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
3661 png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
3662 (int)((row_info->pixel_depth + 7) >> 3));
3663 png_debug1(0,"len=%8d, ", row_info->rowbytes);
3664 #endif /* PNG_DEBUG */
// Filter type 0: row is already raw; nothing to do.
3668 case PNG_FILTER_VALUE_NONE:
// Filter type 1 (Sub): Raw(x) = Filt(x) + Raw(x-bpp).
3671 case PNG_FILTER_VALUE_SUB:
// Use MMX only above these pixel-depth / row-length thresholds; for
// small rows the setup cost outweighs the SIMD win.
3674 (row_info->pixel_depth >= PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT) &&
3675 (row_info->rowbytes >= PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT))
3677 png_read_filter_row_mmx_sub(row_info, row);
// C fallback: the first bpp bytes need no change (no left neighbor);
// each later byte adds the raw byte bpp positions to its left, mod 256.
3682 png_uint_32 istop = row_info->rowbytes;
3683 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3684 png_bytep rp = row + bpp;
3687 for (i = bpp; i < istop; i++)
3689 *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
// Filter type 2 (Up): Raw(x) = Filt(x) + Prior(x).
3696 case PNG_FILTER_VALUE_UP:
3699 (row_info->pixel_depth >= PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT) &&
3700 (row_info->rowbytes >= PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT))
3702 png_read_filter_row_mmx_up(row_info, row, prev_row);
// C fallback: every byte adds the byte directly above it, mod 256.
3707 png_uint_32 istop = row_info->rowbytes;
3709 png_bytep pp = prev_row;
3711 for (i = 0; i < istop; ++i)
3713 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
// Filter type 3 (Average): Raw(x) = Filt(x) + floor((Raw(x-bpp)+Prior(x))/2).
3720 case PNG_FILTER_VALUE_AVG:
3723 (row_info->pixel_depth >= PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT) &&
3724 (row_info->rowbytes >= PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT))
3726 png_read_filter_row_mmx_avg(row_info, row, prev_row);
3732 png_bytep pp = prev_row;
3734 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3735 png_uint_32 istop = row_info->rowbytes - bpp;
// First bpp bytes have no left neighbor: add floor(Prior(x)/2) only.
3737 for (i = 0; i < bpp; i++)
3739 *rp = (png_byte)(((int)(*rp) +
3740 ((int)(*pp++) >> 1)) & 0xff);
// Remaining bytes: add floor((above + left)/2), mod 256.
3744 for (i = 0; i < istop; i++)
3746 *rp = (png_byte)(((int)(*rp) +
3747 ((int)(*pp++ + *lp++) >> 1)) & 0xff);
// Filter type 4 (Paeth): add whichever of left (a), above (b),
// upper-left (c) is closest to p = a + b - c.
3754 case PNG_FILTER_VALUE_PAETH:
3757 (row_info->pixel_depth >= PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT) &&
3758 (row_info->rowbytes >= PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT))
3760 png_read_filter_row_mmx_paeth(row_info, row, prev_row);
3766 png_bytep pp = prev_row;
3768 png_bytep cp = prev_row;
3769 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3770 png_uint_32 istop=row_info->rowbytes - bpp;
// First bpp bytes: a and c are zero, so the predictor reduces to b
// (the byte above).
3772 for (i = 0; i < bpp; i++)
3774 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3778 for (i = 0; i < istop; i++) // use leftover rp,pp
3780 int a, b, c, pa, pb, pc, p;
// Distances from the predictor p to each neighbor; NOTE(review): p and
// pc here hold intermediate differences computed on lines not shown
// (in libpng, p = b - c and pc = a - c at this point) -- so pa = |b-c|,
// pb = |a-c|, and the new pc = |a+b-2c|. Confirm against those lines.
3794 pa = p < 0 ? -p : p;
3795 pb = pc < 0 ? -pc : pc;
3796 pc = (p + pc) < 0 ? -(p + pc) : p + pc;
3800 if (pa <= pb && pa <= pc)
// Tie-breaking order is a, then b, then c, as the PNG spec requires.
3808 p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
3810 *rp = (png_byte)(((int)(*rp) + p) & 0xff);
// Unknown filter byte: warn and leave the row untouched rather than abort.
3818 png_warning(png_ptr, "Ignoring bad row filter type");
3824 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGVCRD */