Initial revision
[rrdtool.git] / libraries / libpng-1.0.9 / pnggccrd.c
1 /* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
2  *
3  * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
4  *
5  *     See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
6  *     and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
7  *     for Intel's performance analysis of the MMX vs. non-MMX code.
8  *
9  * libpng 1.0.9 - January 31, 2001
10  * For conditions of distribution and use, see copyright notice in png.h
11  * Copyright (c) 1998-2001 Glenn Randers-Pehrson
12  * Copyright (c) 1998, Intel Corporation
13  *
14  * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
15  * Interface to libpng contributed by Gilles Vollant, 1999.
16  * GNU C port by Greg Roelofs, 1999-2001.
17  *
18  * Lines 2350-4300 converted in place with intel2gas 1.3.1:
19  *
20  *   intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
21  *
22  * and then cleaned up by hand.  See http://hermes.terminal.at/intel2gas/ .
23  *
24  * NOTE:  A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
25  *        is required to assemble the newer MMX instructions such as movq.
26  *        For djgpp, see
27  *
28  *           ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
29  *
30  *        (or a later version in the same directory).  For Linux, check your
31  *        distribution's web site(s) or try these links:
32  *
33  *           http://rufus.w3.org/linux/RPM/binutils.html
34  *           http://www.debian.org/Packages/stable/devel/binutils.html
35  *           ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
36  *             binutils.tgz
37  *
38  *        For other platforms, see the main GNU site:
39  *
40  *           ftp://ftp.gnu.org/pub/gnu/binutils/
41  *
42  *        Version 2.5.2l.15 is definitely too old...
43  */
44
45 /*
46  * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
47  * =====================================
48  *
49  * 19991006:
50  *  - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
51  *
52  * 19991007:
53  *  - additional optimizations (possible or definite):
54  *     x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
55  *     - write MMX code for 48-bit case (pixel_bytes == 6)
56  *     - figure out what's up with 24-bit case (pixel_bytes == 3):
57  *        why subtract 8 from width_mmx in the pass 4/5 case?
58  *        (only width_mmx case) (near line 1606)
59  *     x [DONE] replace pixel_bytes within each block with the true
60  *        constant value (or are compilers smart enough to do that?)
61  *     - rewrite all MMX interlacing code so it's aligned with
62  *        the *beginning* of the row buffer, not the end.  This
63  *        would not only allow one to eliminate half of the memory
64  *        writes for odd passes (that is, pass == odd), it may also
65  *        eliminate some unaligned-data-access exceptions (assuming
66  *        there's a penalty for not aligning 64-bit accesses on
67  *        64-bit boundaries).  The only catch is that the "leftover"
68  *        pixel(s) at the end of the row would have to be saved,
69  *        but there are enough unused MMX registers in every case,
70  *        so this is not a problem.  A further benefit is that the
71  *        post-MMX cleanup code (C code) in at least some of the
72  *        cases could be done within the assembler block.
73  *  x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
74  *     inconsistent, and don't match the MMX Programmer's Reference
75  *     Manual conventions anyway.  They should be changed to
76  *     "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
77  *     was lowest in memory (e.g., corresponding to a left pixel)
78  *     and b7 is the byte that was highest (e.g., a right pixel).
79  *
80  * 19991016:
81  *  - Brennan's Guide notwithstanding, gcc under Linux does *not*
82  *     want globals prefixed by underscores when referencing them--
83  *     i.e., if the variable is const4, then refer to it as const4,
84  *     not _const4.  This seems to be a djgpp-specific requirement.
85  *     Also, such variables apparently *must* be declared outside
86  *     of functions; neither static nor automatic variables work if
87  *     defined within the scope of a single function, but both
88  *     static and truly global (multi-module) variables work fine.
89  *
90  * 19991023:
91  *  - fixed png_combine_row() non-MMX replication bug (odd passes only?)
92  *  - switched from string-concatenation-with-macros to cleaner method of
93  *     renaming global variables for djgpp--i.e., always use prefixes in
94  *     inlined assembler code (== strings) and conditionally rename the
95  *     variables, not the other way around.  Hence _const4, _mask8_0, etc.
96  *
97  * 19991024:
98  *  - fixed mmxsupport()/png_do_interlace() first-row bug
99  *     This one was severely weird:  even though mmxsupport() doesn't touch
100  *     ebx (where "row" pointer was stored), it nevertheless managed to zero
101  *     the register (even in static/non-fPIC code--see below), which in turn
102  *     caused png_do_interlace() to return prematurely on the first row of
103  *     interlaced images (i.e., without expanding the interlaced pixels).
104  *     Inspection of the generated assembly code didn't turn up any clues,
105  *     although it did point at a minor optimization (i.e., get rid of
106  *     mmx_supported_local variable and just use eax).  Possibly the CPUID
107  *     instruction is more destructive than it looks?  (Not yet checked.)
108  *  - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
109  *     listings...  Apparently register spillage has to do with ebx, since
110  *     it's used to index the global offset table.  Commenting it out of the
111  *     input-reg lists in png_combine_row() eliminated compiler barfage, so
112  *     ifdef'd with __PIC__ macro:  if defined, use a global for unmask
113  *
114  * 19991107:
115  *  - verified CPUID clobberage:  12-char string constant ("GenuineIntel",
116  *     "AuthenticAMD", etc.) placed in ebx:ecx:edx.  Still need to polish.
117  *
118  * 19991120:
119  *  - made "diff" variable (now "_dif") global to simplify conversion of
120  *     filtering routines (running out of regs, sigh).  "diff" is still used
121  *     in interlacing routines, however.
122  *  - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
123  *     macro determines which is used); original not yet tested.
124  *
125  * 20000213:
126  *  - when compiling with gcc, be sure to use  -fomit-frame-pointer
127  *
128  * 20000319:
129  *  - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
130  *     pass == 4 or 5, that caused visible corruption of interlaced images
131  *
132  * 20000623:
133  *  - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
134  *     many of the form "forbidden register 0 (ax) was spilled for class AREG."
135  *     This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
136  *     Chuck Wilson supplied a patch involving dummy output registers.  See
137  *     http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
138  *     for the original (anonymous) SourceForge bug report.
139  *
140  * 20000706:
141  *  - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
142  *       pnggccrd.c: In function `png_combine_row':
143  *       pnggccrd.c:525: more than 10 operands in `asm'
144  *       pnggccrd.c:669: more than 10 operands in `asm'
145  *       pnggccrd.c:828: more than 10 operands in `asm'
146  *       pnggccrd.c:994: more than 10 operands in `asm'
147  *       pnggccrd.c:1177: more than 10 operands in `asm'
148  *     They are all the same problem and can be worked around by using the
149  *     global _unmask variable unconditionally, not just in the -fPIC case.
150  *     Reportedly earlier versions of gcc also have the problem with more than
151  *     10 operands; they just don't report it.  Much strangeness ensues, etc.
152  *
153  * 20000729:
154  *  - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
155  *     MMX routine); began converting png_read_filter_row_mmx_sub()
156  *  - to finish remaining sections:
157  *     - clean up indentation and comments
158  *     - preload local variables
159  *     - add output and input regs (order of former determines numerical
160  *        mapping of latter)
161  *     - avoid all usage of ebx (including bx, bh, bl) register [20000823]
162  *     - remove "$" from addressing of Shift and Mask variables [20000823]
163  *
164  * 20000731:
165  *  - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
166  *
167  * 20000822:
168  *  - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
169  *     shared-library (-fPIC) version!  Code works just fine as part of static
170  *     library.  Damn damn damn damn damn, should have tested that sooner.
171  *     ebx is getting clobbered again (explicitly this time); need to save it
172  *     on stack or rewrite asm code to avoid using it altogether.  Blargh!
173  *
174  * 20000823:
175  *  - first section was trickiest; all remaining sections have ebx -> edx now.
176  *     (-fPIC works again.)  Also added missing underscores to various Shift*
177  *     and *Mask* globals and got rid of leading "$" signs.
178  *
179  * 20000826:
180  *  - added visual separators to help navigate microscopic printed copies
181  *     (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
182  *     on png_read_filter_row_mmx_avg()
183  *
184  * 20000828:
185  *  - finished png_read_filter_row_mmx_avg():  only Paeth left! (930 lines...)
186  *     What the hell, did png_read_filter_row_mmx_paeth(), too.  Comments not
187  *     cleaned up/shortened in either routine, but functionality is complete
188  *     and seems to be working fine.
189  *
190  * 20000829:
191  *  - ahhh, figured out last(?) bit of gcc/gas asm-fu:  if register is listed
192  *     as an input reg (with dummy output variables, etc.), then it *cannot*
193  *     also appear in the clobber list or gcc 2.95.2 will barf.  The solution
194  *     is simple enough...
195  *
196  * 20000914:
197  *  - bug in png_read_filter_row_mmx_avg():  16-bit grayscale not handled
198  *     correctly (but 48-bit RGB just fine)
199  *
200  * 20000916:
201  *  - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
202  *     - "_ShiftBpp.use = 24;"      should have been   "_ShiftBpp.use = 16;"
203  *     - "_ShiftRem.use = 40;"      should have been   "_ShiftRem.use = 48;"
204  *     - "psllq _ShiftRem, %%mm2"   should have been   "psrlq _ShiftRem, %%mm2"
205  *
206  * 20010103:
207  *  - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
208  *     and made it public
209  *
210  * 20010104:
211  *  - removed dependency on png_read_filter_row_c() (C code already duplicated
212  *     within MMX version of png_read_filter_row()) so no longer necessary to
213  *     compile it into pngrutil.o
214  *
215  * STILL TO DO:
216  *     - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
217  *     - write MMX code for 48-bit case (pixel_bytes == 6)
218  *     - figure out what's up with 24-bit case (pixel_bytes == 3):
219  *        why subtract 8 from width_mmx in the pass 4/5 case?
220  *        (only width_mmx case) (near line 1606)
221  *     - rewrite all MMX interlacing code so it's aligned with beginning
222  *        of the row buffer, not the end (see 19991007 for details)
223  *     x pick one version of mmxsupport() and get rid of the other
224  *     - add error messages to any remaining bogus default cases
225  *     - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
226  *     - add support for runtime enable/disable/query of various MMX routines
227  */
228
229 //#define PNG_DEBUG 2   // GRR
230
231 #define PNG_INTERNAL
232 #include "png.h"
233
234 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGGCCRD)
235
236 #ifdef PNG_USE_LOCAL_ARRAYS
237 static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};  /* indexed by png_ptr->pass: pixel offset where the non-MMX combine loop starts */
238 static const int FARDATA png_pass_inc[7]   = {8, 8, 4, 4, 2, 2, 1};  /* indexed by png_ptr->pass: pixel stride between successive copies in that loop */
239 static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};  /* indexed by png_ptr->pass: number of pixels copied at each step of that loop */
240 #endif /* PNG_USE_LOCAL_ARRAYS */
241
242 // The inlined asm strings always use the underscore-prefixed names.  djgpp,
243 // Win32, and Cygwin prepend an underscore themselves, so declare them without:
244 #if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__)
245 #  define _mmx_supported  mmx_supported
246 #  define _unmask         unmask
247 #  define _const4         const4
248 #  define _const6         const6
249 #  define _mask8_0        mask8_0
250 #  define _mask16_1       mask16_1
251 #  define _mask16_0       mask16_0
252 #  define _mask24_2       mask24_2
253 #  define _mask24_1       mask24_1
254 #  define _mask24_0       mask24_0
255 #  define _mask32_3       mask32_3
256 #  define _mask32_2       mask32_2
257 #  define _mask32_1       mask32_1
258 #  define _mask32_0       mask32_0
259 #  define _mask48_5       mask48_5
260 #  define _mask48_4       mask48_4
261 #  define _mask48_3       mask48_3
262 #  define _mask48_2       mask48_2
263 #  define _mask48_1       mask48_1
264 #  define _mask48_0       mask48_0
265 #  define _FullLength     FullLength
266 #  define _MMXLength      MMXLength
267 #  define _dif            dif
268 #  define _LBCarryMask    LBCarryMask
269 #  define _HBClearMask    HBClearMask
270 #  define _ActiveMask     ActiveMask
271 #  define _ActiveMask2    ActiveMask2
272 #  define _ActiveMaskEnd  ActiveMaskEnd
273 #  define _ShiftBpp       ShiftBpp
274 #  define _ShiftRem       ShiftRem
275 #  define _patemp         patemp
276 #  define _pbtemp         pbtemp
277 #  define _pctemp         pctemp
278 #endif /* __DJGPP__ || WIN32 || __CYGWIN__ */
279
280 static int _mmx_supported = 2;  // 2 == not probed yet: png_combine_row() calls png_mmx_support() while this is still 2
281
282 /* These constants are used in the inlined MMX assembly code.
283    Ignore gcc's "At top level: defined but not used" warnings. */
284
285 /* GRR 20000706:  originally _unmask was needed only when compiling with -fPIC,
286  *  since that case uses the %ebx register for indexing the Global Offset Table
287  *  and there were no other registers available.  But gcc 2.95 and later emit
288  *  "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
289  *  in the non-PIC case, so we'll just use the global unconditionally now.
290  */
291 static int _unmask;  // png_combine_row() stores ~mask here just before each MMX asm block reads it
292
293 static unsigned long long _mask8_0  = 0x0102040810204080LL;  // 8 bpp: byte k (k = 0 is lowest in memory) holds mask bit 0x80 >> k, i.e. pixel k
294
295 static unsigned long long _mask16_1 = 0x0101020204040808LL;  // 16 bpp: applied to bytes 8-15 of each group (pixels 4-7, two bytes per pixel)
296 static unsigned long long _mask16_0 = 0x1010202040408080LL;  // 16 bpp: applied to bytes 0-7 of each group (pixels 0-3, two bytes per pixel)
297
298 static unsigned long long _mask24_2 = 0x0101010202020404LL;  // _mask24_*: each mask bit repeated over 3 consecutive bytes; _0 is the lowest 8 bytes (users are outside this view)
299 static unsigned long long _mask24_1 = 0x0408080810101020LL;
300 static unsigned long long _mask24_0 = 0x2020404040808080LL;
301
302 static unsigned long long _mask32_3 = 0x0101010102020202LL;  // _mask32_*: each mask bit repeated over 4 consecutive bytes; _0 is the lowest 8 bytes (users are outside this view)
303 static unsigned long long _mask32_2 = 0x0404040408080808LL;
304 static unsigned long long _mask32_1 = 0x1010101020202020LL;
305 static unsigned long long _mask32_0 = 0x4040404080808080LL;
306
307 static unsigned long long _mask48_5 = 0x0101010101010202LL;  // _mask48_*: each mask bit repeated over 6 consecutive bytes; _0 is the lowest 8 bytes (users are outside this view)
308 static unsigned long long _mask48_4 = 0x0202020204040404LL;
309 static unsigned long long _mask48_3 = 0x0404080808080808LL;
310 static unsigned long long _mask48_2 = 0x1010101010102020LL;
311 static unsigned long long _mask48_1 = 0x2020202040404040LL;
312 static unsigned long long _mask48_0 = 0x4040808080808080LL;
313
314 static unsigned long long _const4   = 0x0000000000FFFFFFLL;  // low three bytes set (users are outside this view)
315 //static unsigned long long _const5 = 0x000000FFFFFF0000LL;     // NOT USED
316 static unsigned long long _const6   = 0x00000000000000FFLL;  // low byte set (users are outside this view)
317
318 // These are used in the row-filter routines and should/would be local
319 //  variables if not for gcc addressing limitations.
320
321 static png_uint_32  _FullLength;
322 static png_uint_32  _MMXLength;
323 static int          _dif;
324 static int          _patemp;    // temp variables for Paeth routine
325 static int          _pbtemp;
326 static int          _pctemp;
327
328
329
330
331 //===========================================================================//
332 //                                                                           //
333 //                       P N G _ C O M B I N E _ R O W                       //
334 //                                                                           //
335 //===========================================================================//
336
337 #if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW)
338
339 /* Combines the row recently read in with the previous row.
340    This routine takes care of alpha and transparency if requested.
341    This routine also handles the two methods of progressive display
342    of interlaced images, depending on the mask value.
343    The mask value describes which pixels are to be combined with
344    the row.  The pattern always repeats every 8 pixels, so just 8
345    bits are needed.  A one indicates the pixel is to be combined; a
346    zero indicates the pixel is to be skipped.  This is in addition
347    to any alpha or transparency value associated with the pixel.
348    If you want all pixels to be combined, pass 0xff (255) in mask. */
349
350 /* Use this routine for the x86 platform - it uses a faster MMX routine
351    if the machine supports MMX. */
352
353 void /* PRIVATE */
354 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
355 {
356    png_debug(1,"in png_combine_row_asm\n");
357
358    if (_mmx_supported == 2) {
359        png_mmx_support();
360    }
361
362    if (mask == 0xff)
363    {
364       png_memcpy(row, png_ptr->row_buf + 1,
365        (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
366    }
367    /* GRR:  png_combine_row() never called with mask == 0 */
368    else
369    {
370       switch (png_ptr->row_info.pixel_depth)
371       {
372          case 1:        // png_ptr->row_info.pixel_depth
373          {
374             png_bytep sp;
375             png_bytep dp;
376             int s_inc, s_start, s_end;
377             int m;
378             int shift;
379             png_uint_32 i;
380
381             sp = png_ptr->row_buf + 1;
382             dp = row;
383             m = 0x80;
384 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
385             if (png_ptr->transformations & PNG_PACKSWAP)
386             {
387                 s_start = 0;
388                 s_end = 7;
389                 s_inc = 1;
390             }
391             else
392 #endif
393             {
394                 s_start = 7;
395                 s_end = 0;
396                 s_inc = -1;
397             }
398
399             shift = s_start;
400
401             for (i = 0; i < png_ptr->width; i++)
402             {
403                if (m & mask)
404                {
405                   int value;
406
407                   value = (*sp >> shift) & 0x1;
408                   *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
409                   *dp |= (png_byte)(value << shift);
410                }
411
412                if (shift == s_end)
413                {
414                   shift = s_start;
415                   sp++;
416                   dp++;
417                }
418                else
419                   shift += s_inc;
420
421                if (m == 1)
422                   m = 0x80;
423                else
424                   m >>= 1;
425             }
426             break;
427          }
428
429          case 2:        // png_ptr->row_info.pixel_depth
430          {
431             png_bytep sp;
432             png_bytep dp;
433             int s_start, s_end, s_inc;
434             int m;
435             int shift;
436             png_uint_32 i;
437             int value;
438
439             sp = png_ptr->row_buf + 1;
440             dp = row;
441             m = 0x80;
442 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
443             if (png_ptr->transformations & PNG_PACKSWAP)
444             {
445                s_start = 0;
446                s_end = 6;
447                s_inc = 2;
448             }
449             else
450 #endif
451             {
452                s_start = 6;
453                s_end = 0;
454                s_inc = -2;
455             }
456
457             shift = s_start;
458
459             for (i = 0; i < png_ptr->width; i++)
460             {
461                if (m & mask)
462                {
463                   value = (*sp >> shift) & 0x3;
464                   *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
465                   *dp |= (png_byte)(value << shift);
466                }
467
468                if (shift == s_end)
469                {
470                   shift = s_start;
471                   sp++;
472                   dp++;
473                }
474                else
475                   shift += s_inc;
476                if (m == 1)
477                   m = 0x80;
478                else
479                   m >>= 1;
480             }
481             break;
482          }
483
484          case 4:        // png_ptr->row_info.pixel_depth
485          {
486             png_bytep sp;
487             png_bytep dp;
488             int s_start, s_end, s_inc;
489             int m;
490             int shift;
491             png_uint_32 i;
492             int value;
493
494             sp = png_ptr->row_buf + 1;
495             dp = row;
496             m = 0x80;
497 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
498             if (png_ptr->transformations & PNG_PACKSWAP)
499             {
500                s_start = 0;
501                s_end = 4;
502                s_inc = 4;
503             }
504             else
505 #endif
506             {
507                s_start = 4;
508                s_end = 0;
509                s_inc = -4;
510             }
511             shift = s_start;
512
513             for (i = 0; i < png_ptr->width; i++)
514             {
515                if (m & mask)
516                {
517                   value = (*sp >> shift) & 0xf;
518                   *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
519                   *dp |= (png_byte)(value << shift);
520                }
521
522                if (shift == s_end)
523                {
524                   shift = s_start;
525                   sp++;
526                   dp++;
527                }
528                else
529                   shift += s_inc;
530                if (m == 1)
531                   m = 0x80;
532                else
533                   m >>= 1;
534             }
535             break;
536          }
537
538          case 8:        // png_ptr->row_info.pixel_depth
539          {
540             png_bytep srcptr;
541             png_bytep dstptr;
542
543             if ( _mmx_supported  )
544             {
545                png_uint_32 len;
546                int diff;
547                int dummy_value_a;   // fix 'forbidden register spilled' error
548                int dummy_value_d;
549                int dummy_value_c;
550                int dummy_value_S;
551                int dummy_value_D;
552                _unmask = ~mask;            // global variable for -fPIC version
553                srcptr = png_ptr->row_buf + 1;
554                dstptr = row;
555                len  = png_ptr->width &~7;  // reduce to multiple of 8
556                diff = png_ptr->width & 7;  // amount lost
557
558                __asm__ __volatile__ (
559                   "movd      _unmask, %%mm7  \n\t" // load bit pattern
560                   "psubb     %%mm6, %%mm6    \n\t" // zero mm6
561                   "punpcklbw %%mm7, %%mm7    \n\t"
562                   "punpcklwd %%mm7, %%mm7    \n\t"
563                   "punpckldq %%mm7, %%mm7    \n\t" // fill reg with 8 masks
564
565                   "movq      _mask8_0, %%mm0 \n\t"
566                   "pand      %%mm7, %%mm0    \n\t" // nonzero if keep byte
567                   "pcmpeqb   %%mm6, %%mm0    \n\t" // zeros->1s, v versa
568
569 // preload        "movl      len, %%ecx      \n\t" // load length of line
570 // preload        "movl      srcptr, %%esi   \n\t" // load source
571 // preload        "movl      dstptr, %%edi   \n\t" // load dest
572
573                   "cmpl      $0, %%ecx       \n\t" // len == 0 ?
574                   "je        mainloop8end    \n\t"
575
576                 "mainloop8:                  \n\t"
577                   "movq      (%%esi), %%mm4  \n\t" // *srcptr
578                   "pand      %%mm0, %%mm4    \n\t"
579                   "movq      %%mm0, %%mm6    \n\t"
580                   "pandn     (%%edi), %%mm6  \n\t" // *dstptr
581                   "por       %%mm6, %%mm4    \n\t"
582                   "movq      %%mm4, (%%edi)  \n\t"
583                   "addl      $8, %%esi       \n\t" // inc by 8 bytes processed
584                   "addl      $8, %%edi       \n\t"
585                   "subl      $8, %%ecx       \n\t" // dec by 8 pixels processed
586                   "ja        mainloop8       \n\t"
587
588                 "mainloop8end:               \n\t"
589 // preload        "movl      diff, %%ecx     \n\t" // (diff is in eax)
590                   "movl      %%eax, %%ecx    \n\t"
591                   "cmpl      $0, %%ecx       \n\t"
592                   "jz        end8            \n\t"
593 // preload        "movl      mask, %%edx     \n\t"
594                   "sall      $24, %%edx      \n\t" // make low byte, high byte
595
596                 "secondloop8:                \n\t"
597                   "sall      %%edx           \n\t" // move high bit to CF
598                   "jnc       skip8           \n\t" // if CF = 0
599                   "movb      (%%esi), %%al   \n\t"
600                   "movb      %%al, (%%edi)   \n\t"
601
602                 "skip8:                      \n\t"
603                   "incl      %%esi           \n\t"
604                   "incl      %%edi           \n\t"
605                   "decl      %%ecx           \n\t"
606                   "jnz       secondloop8     \n\t"
607
608                 "end8:                       \n\t"
609                   "EMMS                      \n\t"  // DONE
610
611                   : "=a" (dummy_value_a),           // output regs (dummy)
612                     "=d" (dummy_value_d),
613                     "=c" (dummy_value_c),
614                     "=S" (dummy_value_S),
615                     "=D" (dummy_value_D)
616
617                   : "3" (srcptr),      // esi       // input regs
618                     "4" (dstptr),      // edi
619                     "0" (diff),        // eax
620 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
621                     "2" (len),         // ecx
622                     "1" (mask)         // edx
623
624 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
625                   : "%mm0", "%mm4", "%mm6", "%mm7"  // clobber list
626 #endif
627                );
628             }
629             else /* mmx _not supported - Use modified C routine */
630             {
631                register png_uint_32 i;
632                png_uint_32 initial_val = png_pass_start[png_ptr->pass];
633                  // png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
634                register int stride = png_pass_inc[png_ptr->pass];
635                  // png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
636                register int rep_bytes = png_pass_width[png_ptr->pass];
637                  // png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
638                register png_uint_32 final_val = png_ptr->width;
639
640                srcptr = png_ptr->row_buf + 1 + initial_val;
641                dstptr = row + initial_val;
642
643                for (i = initial_val; i < final_val; i += stride)
644                {
645                   png_memcpy(dstptr, srcptr, rep_bytes);
646                   srcptr += stride;
647                   dstptr += stride;
648                }
649             } /* end of else */
650
651             break;
652          }       // end 8 bpp
653
654          case 16:       // png_ptr->row_info.pixel_depth
655          {
656             png_bytep srcptr;
657             png_bytep dstptr;
658
659             if ( _mmx_supported )
660             {
661                png_uint_32 len;
662                int diff;
663                int dummy_value_a;   // fix 'forbidden register spilled' error
664                int dummy_value_d;
665                int dummy_value_c;
666                int dummy_value_S;
667                int dummy_value_D;
668                _unmask = ~mask;            // global variable for -fPIC version
669                srcptr = png_ptr->row_buf + 1;
670                dstptr = row;
671                len  = png_ptr->width &~7;  // reduce to multiple of 8
672                diff = png_ptr->width & 7;  // amount lost
673
674                __asm__ __volatile__ (
675                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
676                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
677                   "punpcklbw %%mm7, %%mm7     \n\t"
678                   "punpcklwd %%mm7, %%mm7     \n\t"
679                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
680
681                   "movq      _mask16_0, %%mm0 \n\t"
682                   "movq      _mask16_1, %%mm1 \n\t"
683
684                   "pand      %%mm7, %%mm0     \n\t"
685                   "pand      %%mm7, %%mm1     \n\t"
686
687                   "pcmpeqb   %%mm6, %%mm0     \n\t"
688                   "pcmpeqb   %%mm6, %%mm1     \n\t"
689
690 // preload        "movl      len, %%ecx       \n\t" // load length of line
691 // preload        "movl      srcptr, %%esi    \n\t" // load source
692 // preload        "movl      dstptr, %%edi    \n\t" // load dest
693
694                   "cmpl      $0, %%ecx        \n\t"
695                   "jz        mainloop16end    \n\t"
696
697                 "mainloop16:                  \n\t"
698                   "movq      (%%esi), %%mm4   \n\t"
699                   "pand      %%mm0, %%mm4     \n\t"
700                   "movq      %%mm0, %%mm6     \n\t"
701                   "movq      (%%edi), %%mm7   \n\t"
702                   "pandn     %%mm7, %%mm6     \n\t"
703                   "por       %%mm6, %%mm4     \n\t"
704                   "movq      %%mm4, (%%edi)   \n\t"
705
706                   "movq      8(%%esi), %%mm5  \n\t"
707                   "pand      %%mm1, %%mm5     \n\t"
708                   "movq      %%mm1, %%mm7     \n\t"
709                   "movq      8(%%edi), %%mm6  \n\t"
710                   "pandn     %%mm6, %%mm7     \n\t"
711                   "por       %%mm7, %%mm5     \n\t"
712                   "movq      %%mm5, 8(%%edi)  \n\t"
713
714                   "addl      $16, %%esi       \n\t" // inc by 16 bytes processed
715                   "addl      $16, %%edi       \n\t"
716                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
717                   "ja        mainloop16       \n\t"
718
719                 "mainloop16end:               \n\t"
720 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
721                   "movl      %%eax, %%ecx     \n\t"
722                   "cmpl      $0, %%ecx        \n\t"
723                   "jz        end16            \n\t"
724 // preload        "movl      mask, %%edx      \n\t"
725                   "sall      $24, %%edx       \n\t" // make low byte, high byte
726
727                 "secondloop16:                \n\t"
728                   "sall      %%edx            \n\t" // move high bit to CF
729                   "jnc       skip16           \n\t" // if CF = 0
730                   "movw      (%%esi), %%ax    \n\t"
731                   "movw      %%ax, (%%edi)    \n\t"
732
733                 "skip16:                      \n\t"
734                   "addl      $2, %%esi        \n\t"
735                   "addl      $2, %%edi        \n\t"
736                   "decl      %%ecx            \n\t"
737                   "jnz       secondloop16     \n\t"
738
739                 "end16:                       \n\t"
740                   "EMMS                       \n\t" // DONE
741
742                   : "=a" (dummy_value_a),           // output regs (dummy)
743                     "=c" (dummy_value_c),
744                     "=d" (dummy_value_d),
745                     "=S" (dummy_value_S),
746                     "=D" (dummy_value_D)
747
748                   : "0" (diff),        // eax       // input regs
749 // was (unmask)     " "    RESERVED    // ebx       // Global Offset Table idx
750                     "1" (len),         // ecx
751                     "2" (mask),        // edx
752                     "3" (srcptr),      // esi
753                     "4" (dstptr)       // edi
754
755 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
756                   : "%mm0", "%mm1", "%mm4"          // clobber list
757                   , "%mm5", "%mm6", "%mm7"
758 #endif
759                );
760             }
761             else /* mmx _not supported - Use modified C routine */
762             {
763                register png_uint_32 i;
764                png_uint_32 initial_val = 2 * png_pass_start[png_ptr->pass];
765                  // png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
766                register int stride = 2 * png_pass_inc[png_ptr->pass];
767                  // png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
768                register int rep_bytes = 2 * png_pass_width[png_ptr->pass];
769                  // png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
770                register png_uint_32 final_val = 2 * png_ptr->width;
771
772                srcptr = png_ptr->row_buf + 1 + initial_val;
773                dstptr = row + initial_val;
774
775                for (i = initial_val; i < final_val; i += stride)
776                {
777                   png_memcpy(dstptr, srcptr, rep_bytes);
778                   srcptr += stride;
779                   dstptr += stride;
780                }
781             } /* end of else */
782
783             break;
784          }       // end 16 bpp
785
786          case 24:       // png_ptr->row_info.pixel_depth
787          {
788             png_bytep srcptr;
789             png_bytep dstptr;
790
791             if ( _mmx_supported )
792             {
793                png_uint_32 len;
794                int diff;
795                int dummy_value_a;   // fix 'forbidden register spilled' error
796                int dummy_value_d;
797                int dummy_value_c;
798                int dummy_value_S;
799                int dummy_value_D;
800                _unmask = ~mask;            // global variable for -fPIC version
801                srcptr = png_ptr->row_buf + 1;
802                dstptr = row;
803                len  = png_ptr->width &~7;  // reduce to multiple of 8
804                diff = png_ptr->width & 7;  // amount lost
805
806                __asm__ __volatile__ (
807                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
808                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
809                   "punpcklbw %%mm7, %%mm7     \n\t"
810                   "punpcklwd %%mm7, %%mm7     \n\t"
811                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
812
813                   "movq      _mask24_0, %%mm0 \n\t"
814                   "movq      _mask24_1, %%mm1 \n\t"
815                   "movq      _mask24_2, %%mm2 \n\t"
816
817                   "pand      %%mm7, %%mm0     \n\t"
818                   "pand      %%mm7, %%mm1     \n\t"
819                   "pand      %%mm7, %%mm2     \n\t"
820
821                   "pcmpeqb   %%mm6, %%mm0     \n\t"
822                   "pcmpeqb   %%mm6, %%mm1     \n\t"
823                   "pcmpeqb   %%mm6, %%mm2     \n\t"
824
825 // preload        "movl      len, %%ecx       \n\t" // load length of line
826 // preload        "movl      srcptr, %%esi    \n\t" // load source
827 // preload        "movl      dstptr, %%edi    \n\t" // load dest
828
829                   "cmpl      $0, %%ecx        \n\t"
830                   "jz        mainloop24end    \n\t"
831
832                 "mainloop24:                  \n\t"
833                   "movq      (%%esi), %%mm4   \n\t"
834                   "pand      %%mm0, %%mm4     \n\t"
835                   "movq      %%mm0, %%mm6     \n\t"
836                   "movq      (%%edi), %%mm7   \n\t"
837                   "pandn     %%mm7, %%mm6     \n\t"
838                   "por       %%mm6, %%mm4     \n\t"
839                   "movq      %%mm4, (%%edi)   \n\t"
840
841                   "movq      8(%%esi), %%mm5  \n\t"
842                   "pand      %%mm1, %%mm5     \n\t"
843                   "movq      %%mm1, %%mm7     \n\t"
844                   "movq      8(%%edi), %%mm6  \n\t"
845                   "pandn     %%mm6, %%mm7     \n\t"
846                   "por       %%mm7, %%mm5     \n\t"
847                   "movq      %%mm5, 8(%%edi)  \n\t"
848
849                   "movq      16(%%esi), %%mm6 \n\t"
850                   "pand      %%mm2, %%mm6     \n\t"
851                   "movq      %%mm2, %%mm4     \n\t"
852                   "movq      16(%%edi), %%mm7 \n\t"
853                   "pandn     %%mm7, %%mm4     \n\t"
854                   "por       %%mm4, %%mm6     \n\t"
855                   "movq      %%mm6, 16(%%edi) \n\t"
856
857                   "addl      $24, %%esi       \n\t" // inc by 24 bytes processed
858                   "addl      $24, %%edi       \n\t"
859                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
860
861                   "ja        mainloop24       \n\t"
862
863                 "mainloop24end:               \n\t"
864 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
865                   "movl      %%eax, %%ecx     \n\t"
866                   "cmpl      $0, %%ecx        \n\t"
867                   "jz        end24            \n\t"
868 // preload        "movl      mask, %%edx      \n\t"
869                   "sall      $24, %%edx       \n\t" // make low byte, high byte
870
871                 "secondloop24:                \n\t"
872                   "sall      %%edx            \n\t" // move high bit to CF
873                   "jnc       skip24           \n\t" // if CF = 0
874                   "movw      (%%esi), %%ax    \n\t"
875                   "movw      %%ax, (%%edi)    \n\t"
876                   "xorl      %%eax, %%eax     \n\t"
877                   "movb      2(%%esi), %%al   \n\t"
878                   "movb      %%al, 2(%%edi)   \n\t"
879
880                 "skip24:                      \n\t"
881                   "addl      $3, %%esi        \n\t"
882                   "addl      $3, %%edi        \n\t"
883                   "decl      %%ecx            \n\t"
884                   "jnz       secondloop24     \n\t"
885
886                 "end24:                       \n\t"
887                   "EMMS                       \n\t" // DONE
888
889                   : "=a" (dummy_value_a),           // output regs (dummy)
890                     "=d" (dummy_value_d),
891                     "=c" (dummy_value_c),
892                     "=S" (dummy_value_S),
893                     "=D" (dummy_value_D)
894
895                   : "3" (srcptr),      // esi       // input regs
896                     "4" (dstptr),      // edi
897                     "0" (diff),        // eax
898 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
899                     "2" (len),         // ecx
900                     "1" (mask)         // edx
901
902 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
903                   : "%mm0", "%mm1", "%mm2"          // clobber list
904                   , "%mm4", "%mm5", "%mm6", "%mm7"
905 #endif
906                );
907             }
908             else /* mmx _not supported - Use modified C routine */
909             {
910                register png_uint_32 i;
911                png_uint_32 initial_val = 3 * png_pass_start[png_ptr->pass];
912                  // png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
913                register int stride = 3 * png_pass_inc[png_ptr->pass];
914                  // png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
915                register int rep_bytes = 3 * png_pass_width[png_ptr->pass];
916                  // png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
917                register png_uint_32 final_val = 3 * png_ptr->width;
918
919                srcptr = png_ptr->row_buf + 1 + initial_val;
920                dstptr = row + initial_val;
921
922                for (i = initial_val; i < final_val; i += stride)
923                {
924                   png_memcpy(dstptr, srcptr, rep_bytes);
925                   srcptr += stride;
926                   dstptr += stride;
927                }
928             } /* end of else */
929
930             break;
931          }       // end 24 bpp
932
933          case 32:       // png_ptr->row_info.pixel_depth
934          {
935             png_bytep srcptr;
936             png_bytep dstptr;
937
938             if ( _mmx_supported )
939             {
940                png_uint_32 len;
941                int diff;
942                int dummy_value_a;   // fix 'forbidden register spilled' error
943                int dummy_value_d;
944                int dummy_value_c;
945                int dummy_value_S;
946                int dummy_value_D;
947                _unmask = ~mask;            // global variable for -fPIC version
948                srcptr = png_ptr->row_buf + 1;
949                dstptr = row;
950                len  = png_ptr->width &~7;  // reduce to multiple of 8
951                diff = png_ptr->width & 7;  // amount lost
952
953                __asm__ __volatile__ (
954                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
955                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
956                   "punpcklbw %%mm7, %%mm7     \n\t"
957                   "punpcklwd %%mm7, %%mm7     \n\t"
958                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
959
960                   "movq      _mask32_0, %%mm0 \n\t"
961                   "movq      _mask32_1, %%mm1 \n\t"
962                   "movq      _mask32_2, %%mm2 \n\t"
963                   "movq      _mask32_3, %%mm3 \n\t"
964
965                   "pand      %%mm7, %%mm0     \n\t"
966                   "pand      %%mm7, %%mm1     \n\t"
967                   "pand      %%mm7, %%mm2     \n\t"
968                   "pand      %%mm7, %%mm3     \n\t"
969
970                   "pcmpeqb   %%mm6, %%mm0     \n\t"
971                   "pcmpeqb   %%mm6, %%mm1     \n\t"
972                   "pcmpeqb   %%mm6, %%mm2     \n\t"
973                   "pcmpeqb   %%mm6, %%mm3     \n\t"
974
975 // preload        "movl      len, %%ecx       \n\t" // load length of line
976 // preload        "movl      srcptr, %%esi    \n\t" // load source
977 // preload        "movl      dstptr, %%edi    \n\t" // load dest
978
979                   "cmpl      $0, %%ecx        \n\t" // lcr
980                   "jz        mainloop32end    \n\t"
981
982                 "mainloop32:                  \n\t"
983                   "movq      (%%esi), %%mm4   \n\t"
984                   "pand      %%mm0, %%mm4     \n\t"
985                   "movq      %%mm0, %%mm6     \n\t"
986                   "movq      (%%edi), %%mm7   \n\t"
987                   "pandn     %%mm7, %%mm6     \n\t"
988                   "por       %%mm6, %%mm4     \n\t"
989                   "movq      %%mm4, (%%edi)   \n\t"
990
991                   "movq      8(%%esi), %%mm5  \n\t"
992                   "pand      %%mm1, %%mm5     \n\t"
993                   "movq      %%mm1, %%mm7     \n\t"
994                   "movq      8(%%edi), %%mm6  \n\t"
995                   "pandn     %%mm6, %%mm7     \n\t"
996                   "por       %%mm7, %%mm5     \n\t"
997                   "movq      %%mm5, 8(%%edi)  \n\t"
998
999                   "movq      16(%%esi), %%mm6 \n\t"
1000                   "pand      %%mm2, %%mm6     \n\t"
1001                   "movq      %%mm2, %%mm4     \n\t"
1002                   "movq      16(%%edi), %%mm7 \n\t"
1003                   "pandn     %%mm7, %%mm4     \n\t"
1004                   "por       %%mm4, %%mm6     \n\t"
1005                   "movq      %%mm6, 16(%%edi) \n\t"
1006
1007                   "movq      24(%%esi), %%mm7 \n\t"
1008                   "pand      %%mm3, %%mm7     \n\t"
1009                   "movq      %%mm3, %%mm5     \n\t"
1010                   "movq      24(%%edi), %%mm4 \n\t"
1011                   "pandn     %%mm4, %%mm5     \n\t"
1012                   "por       %%mm5, %%mm7     \n\t"
1013                   "movq      %%mm7, 24(%%edi) \n\t"
1014
1015                   "addl      $32, %%esi       \n\t" // inc by 32 bytes processed
1016                   "addl      $32, %%edi       \n\t"
1017                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
1018                   "ja        mainloop32       \n\t"
1019
1020                 "mainloop32end:               \n\t"
1021 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
1022                   "movl      %%eax, %%ecx     \n\t"
1023                   "cmpl      $0, %%ecx        \n\t"
1024                   "jz        end32            \n\t"
1025 // preload        "movl      mask, %%edx      \n\t"
1026                   "sall      $24, %%edx       \n\t" // low byte => high byte
1027
1028                 "secondloop32:                \n\t"
1029                   "sall      %%edx            \n\t" // move high bit to CF
1030                   "jnc       skip32           \n\t" // if CF = 0
1031                   "movl      (%%esi), %%eax   \n\t"
1032                   "movl      %%eax, (%%edi)   \n\t"
1033
1034                 "skip32:                      \n\t"
1035                   "addl      $4, %%esi        \n\t"
1036                   "addl      $4, %%edi        \n\t"
1037                   "decl      %%ecx            \n\t"
1038                   "jnz       secondloop32     \n\t"
1039
1040                 "end32:                       \n\t"
1041                   "EMMS                       \n\t" // DONE
1042
1043                   : "=a" (dummy_value_a),           // output regs (dummy)
1044                     "=d" (dummy_value_d),
1045                     "=c" (dummy_value_c),
1046                     "=S" (dummy_value_S),
1047                     "=D" (dummy_value_D)
1048
1049                   : "3" (srcptr),      // esi       // input regs
1050                     "4" (dstptr),      // edi
1051                     "0" (diff),        // eax
1052 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
1053                     "2" (len),         // ecx
1054                     "1" (mask)         // edx
1055
1056 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1057                   : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
1058                   , "%mm4", "%mm5", "%mm6", "%mm7"
1059 #endif
1060                );
1061             }
1062             else /* mmx _not supported - Use modified C routine */
1063             {
1064                register png_uint_32 i;
1065                png_uint_32 initial_val = 4 * png_pass_start[png_ptr->pass];
1066                  // png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
1067                register int stride = 4 * png_pass_inc[png_ptr->pass];
1068                  // png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1069                register int rep_bytes = 4 * png_pass_width[png_ptr->pass];
1070                  // png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
1071                register png_uint_32 final_val = 4 * png_ptr->width;
1072
1073                srcptr = png_ptr->row_buf + 1 + initial_val;
1074                dstptr = row + initial_val;
1075
1076                for (i = initial_val; i < final_val; i += stride)
1077                {
1078                   png_memcpy(dstptr, srcptr, rep_bytes);
1079                   srcptr += stride;
1080                   dstptr += stride;
1081                }
1082             } /* end of else */
1083
1084             break;
1085          }       // end 32 bpp
1086
1087          case 48:       // png_ptr->row_info.pixel_depth
1088          {
1089             png_bytep srcptr;
1090             png_bytep dstptr;
1091
1092             if ( _mmx_supported )
1093             {
1094                png_uint_32 len;
1095                int diff;
1096                int dummy_value_a;   // fix 'forbidden register spilled' error
1097                int dummy_value_d;
1098                int dummy_value_c;
1099                int dummy_value_S;
1100                int dummy_value_D;
1101                _unmask = ~mask;            // global variable for -fPIC version
1102                srcptr = png_ptr->row_buf + 1;
1103                dstptr = row;
1104                len  = png_ptr->width &~7;  // reduce to multiple of 8
1105                diff = png_ptr->width & 7;  // amount lost
1106
1107                __asm__ __volatile__ (
1108                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
1109                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
1110                   "punpcklbw %%mm7, %%mm7     \n\t"
1111                   "punpcklwd %%mm7, %%mm7     \n\t"
1112                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
1113
1114                   "movq      _mask48_0, %%mm0 \n\t"
1115                   "movq      _mask48_1, %%mm1 \n\t"
1116                   "movq      _mask48_2, %%mm2 \n\t"
1117                   "movq      _mask48_3, %%mm3 \n\t"
1118                   "movq      _mask48_4, %%mm4 \n\t"
1119                   "movq      _mask48_5, %%mm5 \n\t"
1120
1121                   "pand      %%mm7, %%mm0     \n\t"
1122                   "pand      %%mm7, %%mm1     \n\t"
1123                   "pand      %%mm7, %%mm2     \n\t"
1124                   "pand      %%mm7, %%mm3     \n\t"
1125                   "pand      %%mm7, %%mm4     \n\t"
1126                   "pand      %%mm7, %%mm5     \n\t"
1127
1128                   "pcmpeqb   %%mm6, %%mm0     \n\t"
1129                   "pcmpeqb   %%mm6, %%mm1     \n\t"
1130                   "pcmpeqb   %%mm6, %%mm2     \n\t"
1131                   "pcmpeqb   %%mm6, %%mm3     \n\t"
1132                   "pcmpeqb   %%mm6, %%mm4     \n\t"
1133                   "pcmpeqb   %%mm6, %%mm5     \n\t"
1134
1135 // preload        "movl      len, %%ecx       \n\t" // load length of line
1136 // preload        "movl      srcptr, %%esi    \n\t" // load source
1137 // preload        "movl      dstptr, %%edi    \n\t" // load dest
1138
1139                   "cmpl      $0, %%ecx        \n\t"
1140                   "jz        mainloop48end    \n\t"
1141
1142                 "mainloop48:                  \n\t"
1143                   "movq      (%%esi), %%mm7   \n\t"
1144                   "pand      %%mm0, %%mm7     \n\t"
1145                   "movq      %%mm0, %%mm6     \n\t"
1146                   "pandn     (%%edi), %%mm6   \n\t"
1147                   "por       %%mm6, %%mm7     \n\t"
1148                   "movq      %%mm7, (%%edi)   \n\t"
1149
1150                   "movq      8(%%esi), %%mm6  \n\t"
1151                   "pand      %%mm1, %%mm6     \n\t"
1152                   "movq      %%mm1, %%mm7     \n\t"
1153                   "pandn     8(%%edi), %%mm7  \n\t"
1154                   "por       %%mm7, %%mm6     \n\t"
1155                   "movq      %%mm6, 8(%%edi)  \n\t"
1156
1157                   "movq      16(%%esi), %%mm6 \n\t"
1158                   "pand      %%mm2, %%mm6     \n\t"
1159                   "movq      %%mm2, %%mm7     \n\t"
1160                   "pandn     16(%%edi), %%mm7 \n\t"
1161                   "por       %%mm7, %%mm6     \n\t"
1162                   "movq      %%mm6, 16(%%edi) \n\t"
1163
1164                   "movq      24(%%esi), %%mm7 \n\t"
1165                   "pand      %%mm3, %%mm7     \n\t"
1166                   "movq      %%mm3, %%mm6     \n\t"
1167                   "pandn     24(%%edi), %%mm6 \n\t"
1168                   "por       %%mm6, %%mm7     \n\t"
1169                   "movq      %%mm7, 24(%%edi) \n\t"
1170
1171                   "movq      32(%%esi), %%mm6 \n\t"
1172                   "pand      %%mm4, %%mm6     \n\t"
1173                   "movq      %%mm4, %%mm7     \n\t"
1174                   "pandn     32(%%edi), %%mm7 \n\t"
1175                   "por       %%mm7, %%mm6     \n\t"
1176                   "movq      %%mm6, 32(%%edi) \n\t"
1177
1178                   "movq      40(%%esi), %%mm7 \n\t"
1179                   "pand      %%mm5, %%mm7     \n\t"
1180                   "movq      %%mm5, %%mm6     \n\t"
1181                   "pandn     40(%%edi), %%mm6 \n\t"
1182                   "por       %%mm6, %%mm7     \n\t"
1183                   "movq      %%mm7, 40(%%edi) \n\t"
1184
1185                   "addl      $48, %%esi       \n\t" // inc by 48 bytes processed
1186                   "addl      $48, %%edi       \n\t"
1187                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
1188
1189                   "ja        mainloop48       \n\t"
1190
1191                 "mainloop48end:               \n\t"
1192 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
1193                   "movl      %%eax, %%ecx     \n\t"
1194                   "cmpl      $0, %%ecx        \n\t"
1195                   "jz        end48            \n\t"
1196 // preload        "movl      mask, %%edx      \n\t"
1197                   "sall      $24, %%edx       \n\t" // make low byte, high byte
1198
1199                 "secondloop48:                \n\t"
1200                   "sall      %%edx            \n\t" // move high bit to CF
1201                   "jnc       skip48           \n\t" // if CF = 0
1202                   "movl      (%%esi), %%eax   \n\t"
1203                   "movl      %%eax, (%%edi)   \n\t"
1204
1205                 "skip48:                      \n\t"
1206                   "addl      $4, %%esi        \n\t"
1207                   "addl      $4, %%edi        \n\t"
1208                   "decl      %%ecx            \n\t"
1209                   "jnz       secondloop48     \n\t"
1210
1211                 "end48:                       \n\t"
1212                   "EMMS                       \n\t" // DONE
1213
1214                   : "=a" (dummy_value_a),           // output regs (dummy)
1215                     "=d" (dummy_value_d),
1216                     "=c" (dummy_value_c),
1217                     "=S" (dummy_value_S),
1218                     "=D" (dummy_value_D)
1219
1220                   : "3" (srcptr),      // esi       // input regs
1221                     "4" (dstptr),      // edi
1222                     "0" (diff),        // eax
1223 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
1224                     "2" (len),         // ecx
1225                     "1" (mask)         // edx
1226
1227 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1228                   : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
1229                   , "%mm4", "%mm5", "%mm6", "%mm7"
1230 #endif
1231                );
1232             }
1233             else /* mmx _not supported - Use modified C routine */
1234             {
1235                register png_uint_32 i;
1236                png_uint_32 initial_val = 6 * png_pass_start[png_ptr->pass];
1237                  // png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
1238                register int stride = 6 * png_pass_inc[png_ptr->pass];
1239                  // png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1240                register int rep_bytes = 6 * png_pass_width[png_ptr->pass];
1241                  // png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
1242                register png_uint_32 final_val = 6 * png_ptr->width;
1243
1244                srcptr = png_ptr->row_buf + 1 + initial_val;
1245                dstptr = row + initial_val;
1246
1247                for (i = initial_val; i < final_val; i += stride)
1248                {
1249                   png_memcpy(dstptr, srcptr, rep_bytes);
1250                   srcptr += stride;
1251                   dstptr += stride;
1252                }
1253             } /* end of else */
1254
1255             break;
1256          }       // end 48 bpp
1257
1258          case 64:       // png_ptr->row_info.pixel_depth
1259          {
1260             png_bytep srcptr;
1261             png_bytep dstptr;
1262             register png_uint_32 i;
1263             png_uint_32 initial_val = 8 * png_pass_start[png_ptr->pass];
1264               // png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
1265             register int stride = 8 * png_pass_inc[png_ptr->pass];
1266               // png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1267             register int rep_bytes = 8 * png_pass_width[png_ptr->pass];
1268               // png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
1269             register png_uint_32 final_val = 8 * png_ptr->width;
1270
1271             srcptr = png_ptr->row_buf + 1 + initial_val;
1272             dstptr = row + initial_val;
1273
1274             for (i = initial_val; i < final_val; i += stride)
1275             {
1276                png_memcpy(dstptr, srcptr, rep_bytes);
1277                srcptr += stride;
1278                dstptr += stride;
1279             }
1280             break;
1281          }       // end 64 bpp
1282
1283          default:   // png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64
1284          {
1285             // this should never happen
1286             fprintf(stderr,
1287               "libpng internal error:  png_ptr->row_info.pixel_depth = %d\n",
1288               png_ptr->row_info.pixel_depth);
1289             fflush(stderr);
1290             break;
1291          }
1292       } /* end switch (png_ptr->row_info.pixel_depth) */
1293
1294    } /* end if (non-trivial mask) */
1295
1296 } /* end png_combine_row() */
1297
1298 #endif /* PNG_HAVE_ASSEMBLER_COMBINE_ROW */
1299
1300
1301
1302
1303 //===========================================================================//
1304 //                                                                           //
1305 //                 P N G _ D O _ R E A D _ I N T E R L A C E                 //
1306 //                                                                           //
1307 //===========================================================================//
1308
1309 #if defined(PNG_READ_INTERLACING_SUPPORTED)
1310 #if defined(PNG_HAVE_ASSEMBLER_READ_INTERLACE)
1311
1312 /* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
1313  * has taken place.  [GRR: what other steps come before and/or after?]
1314  */
1315
1316 void /* PRIVATE */
1317 png_do_read_interlace(png_structp png_ptr)
1318 {
1319    png_row_infop row_info = &(png_ptr->row_info);
1320    png_bytep row = png_ptr->row_buf + 1;
1321    int pass = png_ptr->pass;
1322    png_uint_32 transformations = png_ptr->transformations;
1323
1324    png_debug(1,"in png_do_read_interlace\n");
1325
1326    if (_mmx_supported == 2) {
1327        png_mmx_support();
1328    }
1329
1330    if (row != NULL && row_info != NULL)
1331    {
1332       png_uint_32 final_width;
1333
1334       final_width = row_info->width * png_pass_inc[pass];
1335
1336       switch (row_info->pixel_depth)
1337       {
1338          case 1:
1339          {
1340             png_bytep sp, dp;
1341             int sshift, dshift;
1342             int s_start, s_end, s_inc;
1343             png_byte v;
1344             png_uint_32 i;
1345             int j;
1346
1347             sp = row + (png_size_t)((row_info->width - 1) >> 3);
1348             dp = row + (png_size_t)((final_width - 1) >> 3);
1349 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1350             if (transformations & PNG_PACKSWAP)
1351             {
1352                sshift = (int)((row_info->width + 7) & 7);
1353                dshift = (int)((final_width + 7) & 7);
1354                s_start = 7;
1355                s_end = 0;
1356                s_inc = -1;
1357             }
1358             else
1359 #endif
1360             {
1361                sshift = 7 - (int)((row_info->width + 7) & 7);
1362                dshift = 7 - (int)((final_width + 7) & 7);
1363                s_start = 0;
1364                s_end = 7;
1365                s_inc = 1;
1366             }
1367
1368             for (i = row_info->width; i; i--)
1369             {
1370                v = (png_byte)((*sp >> sshift) & 0x1);
1371                for (j = 0; j < png_pass_inc[pass]; j++)
1372                {
1373                   *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1374                   *dp |= (png_byte)(v << dshift);
1375                   if (dshift == s_end)
1376                   {
1377                      dshift = s_start;
1378                      dp--;
1379                   }
1380                   else
1381                      dshift += s_inc;
1382                }
1383                if (sshift == s_end)
1384                {
1385                   sshift = s_start;
1386                   sp--;
1387                }
1388                else
1389                   sshift += s_inc;
1390             }
1391             break;
1392          }
1393
1394          case 2:
1395          {
1396             png_bytep sp, dp;
1397             int sshift, dshift;
1398             int s_start, s_end, s_inc;
1399             png_uint_32 i;
1400
1401             sp = row + (png_size_t)((row_info->width - 1) >> 2);
1402             dp = row + (png_size_t)((final_width - 1) >> 2);
1403 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1404             if (transformations & PNG_PACKSWAP)
1405             {
1406                sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1407                dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1408                s_start = 6;
1409                s_end = 0;
1410                s_inc = -2;
1411             }
1412             else
1413 #endif
1414             {
1415                sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1416                dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1417                s_start = 0;
1418                s_end = 6;
1419                s_inc = 2;
1420             }
1421
1422             for (i = row_info->width; i; i--)
1423             {
1424                png_byte v;
1425                int j;
1426
1427                v = (png_byte)((*sp >> sshift) & 0x3);
1428                for (j = 0; j < png_pass_inc[pass]; j++)
1429                {
1430                   *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1431                   *dp |= (png_byte)(v << dshift);
1432                   if (dshift == s_end)
1433                   {
1434                      dshift = s_start;
1435                      dp--;
1436                   }
1437                   else
1438                      dshift += s_inc;
1439                }
1440                if (sshift == s_end)
1441                {
1442                   sshift = s_start;
1443                   sp--;
1444                }
1445                else
1446                   sshift += s_inc;
1447             }
1448             break;
1449          }
1450
1451          case 4:
1452          {
1453             png_bytep sp, dp;
1454             int sshift, dshift;
1455             int s_start, s_end, s_inc;
1456             png_uint_32 i;
1457
1458             sp = row + (png_size_t)((row_info->width - 1) >> 1);
1459             dp = row + (png_size_t)((final_width - 1) >> 1);
1460 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1461             if (transformations & PNG_PACKSWAP)
1462             {
1463                sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1464                dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1465                s_start = 4;
1466                s_end = 0;
1467                s_inc = -4;
1468             }
1469             else
1470 #endif
1471             {
1472                sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1473                dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1474                s_start = 0;
1475                s_end = 4;
1476                s_inc = 4;
1477             }
1478
1479             for (i = row_info->width; i; i--)
1480             {
1481                png_byte v;
1482                int j;
1483
1484                v = (png_byte)((*sp >> sshift) & 0xf);
1485                for (j = 0; j < png_pass_inc[pass]; j++)
1486                {
1487                   *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1488                   *dp |= (png_byte)(v << dshift);
1489                   if (dshift == s_end)
1490                   {
1491                      dshift = s_start;
1492                      dp--;
1493                   }
1494                   else
1495                      dshift += s_inc;
1496                }
1497                if (sshift == s_end)
1498                {
1499                   sshift = s_start;
1500                   sp--;
1501                }
1502                else
1503                   sshift += s_inc;
1504             }
1505             break;
1506          }
1507
1508          //====================================================================
1509
1510          default:  // 8-bit or larger (this is where the routine is modified)
1511          {
1512 //          static unsigned long long _const4 = 0x0000000000FFFFFFLL;  no good
1513 //          static unsigned long long const4 = 0x0000000000FFFFFFLL;   no good
1514 //          unsigned long long _const4 = 0x0000000000FFFFFFLL;         no good
1515 //          unsigned long long const4 = 0x0000000000FFFFFFLL;          no good
1516             png_bytep sptr, dp;
1517             png_uint_32 i;
1518             png_size_t pixel_bytes;
1519             int width = row_info->width;
1520
1521             pixel_bytes = (row_info->pixel_depth >> 3);
1522
1523             // point sptr at the last pixel in the pre-expanded row:
1524             sptr = row + (width - 1) * pixel_bytes;
1525
1526             // point dp at the last pixel position in the expanded row:
1527             dp = row + (final_width - 1) * pixel_bytes;
1528
1529             // New code by Nirav Chhatrapati - Intel Corporation
1530
1531             if ( _mmx_supported )
1532             {
1533                //--------------------------------------------------------------
1534                if (pixel_bytes == 3)
1535                {
1536                   if (((pass == 0) || (pass == 1)) && width)
1537                   {
1538                      int dummy_value_c;   // fix 'forbidden register spilled'
1539                      int dummy_value_S;
1540                      int dummy_value_D;
1541
1542                      __asm__ __volatile__ (
1543                         "subl $21, %%edi         \n\t"
1544                                      // (png_pass_inc[pass] - 1)*pixel_bytes
1545
1546                      ".loop3_pass0:              \n\t"
1547                         "movd (%%esi), %%mm0     \n\t" // x x x x x 2 1 0
1548                         "pand _const4, %%mm0     \n\t" // z z z z z 2 1 0
1549                         "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
1550                         "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
1551                         "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
1552                         "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
1553                         "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
1554                         "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
1555                         "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
1556                         "movq %%mm0, %%mm3       \n\t" // 2 1 0 2 1 0 2 1
1557                         "psllq $16, %%mm0        \n\t" // 0 2 1 0 2 1 z z
1558                         "movq %%mm3, %%mm4       \n\t" // 2 1 0 2 1 0 2 1
1559                         "punpckhdq %%mm0, %%mm3  \n\t" // 0 2 1 0 2 1 0 2
1560                         "movq %%mm4, 16(%%edi)   \n\t"
1561                         "psrlq $32, %%mm0        \n\t" // z z z z 0 2 1 0
1562                         "movq %%mm3, 8(%%edi)    \n\t"
1563                         "punpckldq %%mm4, %%mm0  \n\t" // 1 0 2 1 0 2 1 0
1564                         "subl $3, %%esi          \n\t"
1565                         "movq %%mm0, (%%edi)     \n\t"
1566                         "subl $24, %%edi         \n\t"
1567                         "decl %%ecx              \n\t"
1568                         "jnz .loop3_pass0        \n\t"
1569                         "EMMS                    \n\t" // DONE
1570
1571                         : "=c" (dummy_value_c),        // output regs (dummy)
1572                           "=S" (dummy_value_S),
1573                           "=D" (dummy_value_D)
1574
1575                         : "1" (sptr),      // esi      // input regs
1576                           "2" (dp),        // edi
1577                           "0" (width)      // ecx
1578 // doesn't work           "i" (0x0000000000FFFFFFLL)   // %1 (a.k.a. _const4)
1579
1580 #if 0  /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1581                         : "%mm0", "%mm1", "%mm2"       // clobber list
1582                         , "%mm3", "%mm4"
1583 #endif
1584                      );
1585                   }
1586                   else if (((pass == 2) || (pass == 3)) && width)
1587                   {
1588                      int dummy_value_c;   // fix 'forbidden register spilled'
1589                      int dummy_value_S;
1590                      int dummy_value_D;
1591
1592                      __asm__ __volatile__ (
1593                         "subl $9, %%edi          \n\t"
1594                                      // (png_pass_inc[pass] - 1)*pixel_bytes
1595
1596                      ".loop3_pass2:              \n\t"
1597                         "movd (%%esi), %%mm0     \n\t" // x x x x x 2 1 0
1598                         "pand _const4, %%mm0     \n\t" // z z z z z 2 1 0
1599                         "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
1600                         "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
1601                         "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
1602                         "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
1603                         "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
1604                         "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
1605                         "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
1606                         "movq %%mm0, 4(%%edi)    \n\t"
1607                         "psrlq $16, %%mm0        \n\t" // z z 2 1 0 2 1 0
1608                         "subl $3, %%esi          \n\t"
1609                         "movd %%mm0, (%%edi)     \n\t"
1610                         "subl $12, %%edi         \n\t"
1611                         "decl %%ecx              \n\t"
1612                         "jnz .loop3_pass2        \n\t"
1613                         "EMMS                    \n\t" // DONE
1614
1615                         : "=c" (dummy_value_c),        // output regs (dummy)
1616                           "=S" (dummy_value_S),
1617                           "=D" (dummy_value_D)
1618
1619                         : "1" (sptr),      // esi      // input regs
1620                           "2" (dp),        // edi
1621                           "0" (width)      // ecx
1622
1623 #if 0  /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
1624                         : "%mm0", "%mm1", "%mm2"       // clobber list
1625 #endif
1626                      );
1627                   }
1628                   else if (width) /* && ((pass == 4) || (pass == 5)) */
1629                   {
1630                      int width_mmx = ((width >> 1) << 1) - 8;   // GRR:  huh?
1631                      if (width_mmx < 0)
1632                          width_mmx = 0;
1633                      width -= width_mmx;        // 8 or 9 pix, 24 or 27 bytes
1634                      if (width_mmx)
1635                      {
1636                         // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1637                         // sptr points at last pixel in pre-expanded row
1638                         // dp points at last pixel position in expanded row
1639                         int dummy_value_c;  // fix 'forbidden register spilled'
1640                         int dummy_value_S;
1641                         int dummy_value_D;
1642
1643                         __asm__ __volatile__ (
1644                            "subl $3, %%esi          \n\t"
1645                            "subl $9, %%edi          \n\t"
1646                                         // (png_pass_inc[pass] + 1)*pixel_bytes
1647
1648                         ".loop3_pass4:              \n\t"
1649                            "movq (%%esi), %%mm0     \n\t" // x x 5 4 3 2 1 0
1650                            "movq %%mm0, %%mm1       \n\t" // x x 5 4 3 2 1 0
1651                            "movq %%mm0, %%mm2       \n\t" // x x 5 4 3 2 1 0
1652                            "psllq $24, %%mm0        \n\t" // 4 3 2 1 0 z z z
1653                            "pand _const4, %%mm1     \n\t" // z z z z z 2 1 0
1654                            "psrlq $24, %%mm2        \n\t" // z z z x x 5 4 3
1655                            "por %%mm1, %%mm0        \n\t" // 4 3 2 1 0 2 1 0
1656                            "movq %%mm2, %%mm3       \n\t" // z z z x x 5 4 3
1657                            "psllq $8, %%mm2         \n\t" // z z x x 5 4 3 z
1658                            "movq %%mm0, (%%edi)     \n\t"
1659                            "psrlq $16, %%mm3        \n\t" // z z z z z x x 5
1660                            "pand _const6, %%mm3     \n\t" // z z z z z z z 5
1661                            "por %%mm3, %%mm2        \n\t" // z z x x 5 4 3 5
1662                            "subl $6, %%esi          \n\t"
1663                            "movd %%mm2, 8(%%edi)    \n\t"
1664                            "subl $12, %%edi         \n\t"
1665                            "subl $2, %%ecx          \n\t"
1666                            "jnz .loop3_pass4        \n\t"
1667                            "EMMS                    \n\t" // DONE
1668
1669                            : "=c" (dummy_value_c),        // output regs (dummy)
1670                              "=S" (dummy_value_S),
1671                              "=D" (dummy_value_D)
1672
1673                            : "1" (sptr),      // esi      // input regs
1674                              "2" (dp),        // edi
1675                              "0" (width_mmx)  // ecx
1676
1677 #if 0  /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
1678                            : "%mm0", "%mm1"               // clobber list
1679                            , "%mm2", "%mm3"
1680 #endif
1681                         );
1682                      }
1683
1684                      sptr -= width_mmx*3;
1685                      dp -= width_mmx*6;
1686                      for (i = width; i; i--)
1687                      {
1688                         png_byte v[8];
1689                         int j;
1690
1691                         png_memcpy(v, sptr, 3);
1692                         for (j = 0; j < png_pass_inc[pass]; j++)
1693                         {
1694                            png_memcpy(dp, v, 3);
1695                            dp -= 3;
1696                         }
1697                         sptr -= 3;
1698                      }
1699                   }
1700                } /* end of pixel_bytes == 3 */
1701
1702                //--------------------------------------------------------------
1703                else if (pixel_bytes == 1)
1704                {
1705                   if (((pass == 0) || (pass == 1)) && width)
1706                   {
1707                      int width_mmx = ((width >> 2) << 2);
1708                      width -= width_mmx;        // 0-3 pixels => 0-3 bytes
1709                      if (width_mmx)
1710                      {
1711                         int dummy_value_c;  // fix 'forbidden register spilled'
1712                         int dummy_value_S;
1713                         int dummy_value_D;
1714
1715                         __asm__ __volatile__ (
1716                            "subl $3, %%esi          \n\t"
1717                            "subl $31, %%edi         \n\t"
1718
1719                         ".loop1_pass0:              \n\t"
1720                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
1721                            "movq %%mm0, %%mm1       \n\t" // x x x x 3 2 1 0
1722                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
1723                            "movq %%mm0, %%mm2       \n\t" // 3 3 2 2 1 1 0 0
1724                            "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
1725                            "movq %%mm0, %%mm3       \n\t" // 1 1 1 1 0 0 0 0
1726                            "punpckldq %%mm0, %%mm0  \n\t" // 0 0 0 0 0 0 0 0
1727                            "punpckhdq %%mm3, %%mm3  \n\t" // 1 1 1 1 1 1 1 1
1728                            "movq %%mm0, (%%edi)     \n\t"
1729                            "punpckhwd %%mm2, %%mm2  \n\t" // 3 3 3 3 2 2 2 2
1730                            "movq %%mm3, 8(%%edi)    \n\t"
1731                            "movq %%mm2, %%mm4       \n\t" // 3 3 3 3 2 2 2 2
1732                            "punpckldq %%mm2, %%mm2  \n\t" // 2 2 2 2 2 2 2 2
1733                            "punpckhdq %%mm4, %%mm4  \n\t" // 3 3 3 3 3 3 3 3
1734                            "movq %%mm2, 16(%%edi)   \n\t"
1735                            "subl $4, %%esi          \n\t"
1736                            "movq %%mm4, 24(%%edi)   \n\t"
1737                            "subl $32, %%edi         \n\t"
1738                            "subl $4, %%ecx          \n\t"
1739                            "jnz .loop1_pass0        \n\t"
1740                            "EMMS                    \n\t" // DONE
1741
1742                            : "=c" (dummy_value_c),        // output regs (dummy)
1743                              "=S" (dummy_value_S),
1744                              "=D" (dummy_value_D)
1745
1746                            : "1" (sptr),      // esi      // input regs
1747                              "2" (dp),        // edi
1748                              "0" (width_mmx)  // ecx
1749
1750 #if 0  /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1751                            : "%mm0", "%mm1", "%mm2"       // clobber list
1752                            , "%mm3", "%mm4"
1753 #endif
1754                         );
1755                      }
1756
1757                      sptr -= width_mmx;
1758                      dp -= width_mmx*8;
1759                      for (i = width; i; i--)
1760                      {
1761                         int j;
1762
1763                        /* I simplified this part in version 1.0.4e
1764                         * here and in several other instances where
1765                         * pixel_bytes == 1  -- GR-P
1766                         *
1767                         * Original code:
1768                         *
1769                         * png_byte v[8];
1770                         * png_memcpy(v, sptr, pixel_bytes);
1771                         * for (j = 0; j < png_pass_inc[pass]; j++)
1772                         * {
1773                         *    png_memcpy(dp, v, pixel_bytes);
1774                         *    dp -= pixel_bytes;
1775                         * }
1776                         * sptr -= pixel_bytes;
1777                         *
1778                         * Replacement code is in the next three lines:
1779                         */
1780
1781                         for (j = 0; j < png_pass_inc[pass]; j++)
1782                            *dp-- = *sptr;
1783                         --sptr;
1784                      }
1785                   }
1786                   else if (((pass == 2) || (pass == 3)) && width)
1787                   {
1788                      int width_mmx = ((width >> 2) << 2);
1789                      width -= width_mmx;        // 0-3 pixels => 0-3 bytes
1790                      if (width_mmx)
1791                      {
1792                         int dummy_value_c;  // fix 'forbidden register spilled'
1793                         int dummy_value_S;
1794                         int dummy_value_D;
1795
1796                         __asm__ __volatile__ (
1797                            "subl $3, %%esi          \n\t"
1798                            "subl $15, %%edi         \n\t"
1799
1800                         ".loop1_pass2:              \n\t"
1801                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
1802                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
1803                            "movq %%mm0, %%mm1       \n\t" // 3 3 2 2 1 1 0 0
1804                            "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
1805                            "punpckhwd %%mm1, %%mm1  \n\t" // 3 3 3 3 2 2 2 2
1806                            "movq %%mm0, (%%edi)     \n\t"
1807                            "subl $4, %%esi          \n\t"
1808                            "movq %%mm1, 8(%%edi)    \n\t"
1809                            "subl $16, %%edi         \n\t"
1810                            "subl $4, %%ecx          \n\t"
1811                            "jnz .loop1_pass2        \n\t"
1812                            "EMMS                    \n\t" // DONE
1813
1814                            : "=c" (dummy_value_c),        // output regs (dummy)
1815                              "=S" (dummy_value_S),
1816                              "=D" (dummy_value_D)
1817
1818                            : "1" (sptr),      // esi      // input regs
1819                              "2" (dp),        // edi
1820                              "0" (width_mmx)  // ecx
1821
1822 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
1823                            : "%mm0", "%mm1"               // clobber list
1824 #endif
1825                         );
1826                      }
1827
1828                      sptr -= width_mmx;
1829                      dp -= width_mmx*4;
1830                      for (i = width; i; i--)
1831                      {
1832                         int j;
1833
1834                         for (j = 0; j < png_pass_inc[pass]; j++)
1835                            *dp-- = *sptr;
1836                         --sptr;
1837                      }
1838                   }
1839                   else if (width)  /* && ((pass == 4) || (pass == 5)) */
1840                   {
1841                      int width_mmx = ((width >> 3) << 3);
1842                      width -= width_mmx;        // 0-7 pixels => 0-7 bytes
1843                      if (width_mmx)
1844                      {
1845                         int dummy_value_c;  // fix 'forbidden register spilled'
1846                         int dummy_value_S;
1847                         int dummy_value_D;
1848
1849                         __asm__ __volatile__ (
1850                            "subl $7, %%esi          \n\t"
1851                            "subl $15, %%edi         \n\t"
1852
1853                         ".loop1_pass4:              \n\t"
1854                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
1855                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
1856                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
1857                            "punpckhbw %%mm1, %%mm1  \n\t" // 7 7 6 6 5 5 4 4
1858                            "movq %%mm1, 8(%%edi)    \n\t"
1859                            "subl $8, %%esi          \n\t"
1860                            "movq %%mm0, (%%edi)     \n\t"
1861                            "subl $16, %%edi         \n\t"
1862                            "subl $8, %%ecx          \n\t"
1863                            "jnz .loop1_pass4        \n\t"
1864                            "EMMS                    \n\t" // DONE
1865
1866                            : "=c" (dummy_value_c),        // output regs (dummy)
1867                              "=S" (dummy_value_S),
1868                              "=D" (dummy_value_D)
1869
1870                            : "1" (sptr),      // esi      // input regs
1871                              "2" (dp),        // edi
1872                              "0" (width_mmx)  // ecx
1873
1874 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
1875                            : "%mm0", "%mm1"               // clobber list
1876 #endif
1877                         );
1878                      }
1879
1880                      sptr -= width_mmx;
1881                      dp -= width_mmx*2;
1882                      for (i = width; i; i--)
1883                      {
1884                         int j;
1885
1886                         for (j = 0; j < png_pass_inc[pass]; j++)
1887                            *dp-- = *sptr;
1888                         --sptr;
1889                      }
1890                   }
1891                } /* end of pixel_bytes == 1 */
1892
1893                //--------------------------------------------------------------
1894                else if (pixel_bytes == 2)
1895                {
1896                   if (((pass == 0) || (pass == 1)) && width)
1897                   {
1898                      int width_mmx = ((width >> 1) << 1);
1899                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
1900                      if (width_mmx)
1901                      {
1902                         int dummy_value_c;  // fix 'forbidden register spilled'
1903                         int dummy_value_S;
1904                         int dummy_value_D;
1905
1906                         __asm__ __volatile__ (
1907                            "subl $2, %%esi          \n\t"
1908                            "subl $30, %%edi         \n\t"
1909
1910                         ".loop2_pass0:              \n\t"
1911                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
1912                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
1913                            "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
1914                            "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
1915                            "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
1916                            "movq %%mm0, (%%edi)     \n\t"
1917                            "movq %%mm0, 8(%%edi)    \n\t"
1918                            "movq %%mm1, 16(%%edi)   \n\t"
1919                            "subl $4, %%esi          \n\t"
1920                            "movq %%mm1, 24(%%edi)   \n\t"
1921                            "subl $32, %%edi         \n\t"
1922                            "subl $2, %%ecx          \n\t"
1923                            "jnz .loop2_pass0        \n\t"
1924                            "EMMS                    \n\t" // DONE
1925
1926                            : "=c" (dummy_value_c),        // output regs (dummy)
1927                              "=S" (dummy_value_S),
1928                              "=D" (dummy_value_D)
1929
1930                            : "1" (sptr),      // esi      // input regs
1931                              "2" (dp),        // edi
1932                              "0" (width_mmx)  // ecx
1933
1934 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
1935                            : "%mm0", "%mm1"               // clobber list
1936 #endif
1937                         );
1938                      }
1939
1940                      sptr -= (width_mmx*2 - 2); // sign fixed
1941                      dp -= (width_mmx*16 - 2);  // sign fixed
1942                      for (i = width; i; i--)
1943                      {
1944                         png_byte v[8];
1945                         int j;
1946                         sptr -= 2;
1947                         png_memcpy(v, sptr, 2);
1948                         for (j = 0; j < png_pass_inc[pass]; j++)
1949                         {
1950                            dp -= 2;
1951                            png_memcpy(dp, v, 2);
1952                         }
1953                      }
1954                   }
1955                   else if (((pass == 2) || (pass == 3)) && width)
1956                   {
1957                      int width_mmx = ((width >> 1) << 1) ;
1958                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
1959                      if (width_mmx)
1960                      {
1961                         int dummy_value_c;  // fix 'forbidden register spilled'
1962                         int dummy_value_S;
1963                         int dummy_value_D;
1964
1965                         __asm__ __volatile__ (
1966                            "subl $2, %%esi          \n\t"
1967                            "subl $14, %%edi         \n\t"
1968
1969                         ".loop2_pass2:              \n\t"
1970                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
1971                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
1972                            "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
1973                            "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
1974                            "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
1975                            "movq %%mm0, (%%edi)     \n\t"
1976                            "subl $4, %%esi          \n\t"
1977                            "movq %%mm1, 8(%%edi)    \n\t"
1978                            "subl $16, %%edi         \n\t"
1979                            "subl $2, %%ecx          \n\t"
1980                            "jnz .loop2_pass2        \n\t"
1981                            "EMMS                    \n\t" // DONE
1982
1983                            : "=c" (dummy_value_c),        // output regs (dummy)
1984                              "=S" (dummy_value_S),
1985                              "=D" (dummy_value_D)
1986
1987                            : "1" (sptr),      // esi      // input regs
1988                              "2" (dp),        // edi
1989                              "0" (width_mmx)  // ecx
1990
1991 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
1992                            : "%mm0", "%mm1"               // clobber list
1993 #endif
1994                         );
1995                      }
1996
1997                      sptr -= (width_mmx*2 - 2); // sign fixed
1998                      dp -= (width_mmx*8 - 2);   // sign fixed
1999                      for (i = width; i; i--)
2000                      {
2001                         png_byte v[8];
2002                         int j;
2003                         sptr -= 2;
2004                         png_memcpy(v, sptr, 2);
2005                         for (j = 0; j < png_pass_inc[pass]; j++)
2006                         {
2007                            dp -= 2;
2008                            png_memcpy(dp, v, 2);
2009                         }
2010                      }
2011                   }
2012                   else if (width)  // pass == 4 or 5
2013                   {
2014                      int width_mmx = ((width >> 1) << 1) ;
2015                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
2016                      if (width_mmx)
2017                      {
2018                         int dummy_value_c;  // fix 'forbidden register spilled'
2019                         int dummy_value_S;
2020                         int dummy_value_D;
2021
2022                         __asm__ __volatile__ (
2023                            "subl $2, %%esi          \n\t"
2024                            "subl $6, %%edi          \n\t"
2025
2026                         ".loop2_pass4:              \n\t"
2027                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
2028                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
2029                            "subl $4, %%esi          \n\t"
2030                            "movq %%mm0, (%%edi)     \n\t"
2031                            "subl $8, %%edi          \n\t"
2032                            "subl $2, %%ecx          \n\t"
2033                            "jnz .loop2_pass4        \n\t"
2034                            "EMMS                    \n\t" // DONE
2035
2036                            : "=c" (dummy_value_c),        // output regs (dummy)
2037                              "=S" (dummy_value_S),
2038                              "=D" (dummy_value_D)
2039
2040                            : "1" (sptr),      // esi      // input regs
2041                              "2" (dp),        // edi
2042                              "0" (width_mmx)  // ecx
2043
2044 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2045                            : "%mm0"                       // clobber list
2046 #endif
2047                         );
2048                      }
2049
2050                      sptr -= (width_mmx*2 - 2); // sign fixed
2051                      dp -= (width_mmx*4 - 2);   // sign fixed
2052                      for (i = width; i; i--)
2053                      {
2054                         png_byte v[8];
2055                         int j;
2056                         sptr -= 2;
2057                         png_memcpy(v, sptr, 2);
2058                         for (j = 0; j < png_pass_inc[pass]; j++)
2059                         {
2060                            dp -= 2;
2061                            png_memcpy(dp, v, 2);
2062                         }
2063                      }
2064                   }
2065                } /* end of pixel_bytes == 2 */
2066
2067                //--------------------------------------------------------------
2068                else if (pixel_bytes == 4)
2069                {
2070                   if (((pass == 0) || (pass == 1)) && width)
2071                   {
2072                      int width_mmx = ((width >> 1) << 1);
2073                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
2074                      if (width_mmx)
2075                      {
2076                         int dummy_value_c;  // fix 'forbidden register spilled'
2077                         int dummy_value_S;
2078                         int dummy_value_D;
2079
2080                         __asm__ __volatile__ (
2081                            "subl $4, %%esi          \n\t"
2082                            "subl $60, %%edi         \n\t"
2083
2084                         ".loop4_pass0:              \n\t"
2085                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2086                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2087                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
2088                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
2089                            "movq %%mm0, (%%edi)     \n\t"
2090                            "movq %%mm0, 8(%%edi)    \n\t"
2091                            "movq %%mm0, 16(%%edi)   \n\t"
2092                            "movq %%mm0, 24(%%edi)   \n\t"
2093                            "movq %%mm1, 32(%%edi)   \n\t"
2094                            "movq %%mm1, 40(%%edi)   \n\t"
2095                            "movq %%mm1, 48(%%edi)   \n\t"
2096                            "subl $8, %%esi          \n\t"
2097                            "movq %%mm1, 56(%%edi)   \n\t"
2098                            "subl $64, %%edi         \n\t"
2099                            "subl $2, %%ecx          \n\t"
2100                            "jnz .loop4_pass0        \n\t"
2101                            "EMMS                    \n\t" // DONE
2102
2103                            : "=c" (dummy_value_c),        // output regs (dummy)
2104                              "=S" (dummy_value_S),
2105                              "=D" (dummy_value_D)
2106
2107                            : "1" (sptr),      // esi      // input regs
2108                              "2" (dp),        // edi
2109                              "0" (width_mmx)  // ecx
2110
2111 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2112                            : "%mm0", "%mm1"               // clobber list
2113 #endif
2114                         );
2115                      }
2116
2117                      sptr -= (width_mmx*4 - 4); // sign fixed
2118                      dp -= (width_mmx*32 - 4);  // sign fixed
2119                      for (i = width; i; i--)
2120                      {
2121                         png_byte v[8];
2122                         int j;
2123                         sptr -= 4;
2124                         png_memcpy(v, sptr, 4);
2125                         for (j = 0; j < png_pass_inc[pass]; j++)
2126                         {
2127                            dp -= 4;
2128                            png_memcpy(dp, v, 4);
2129                         }
2130                      }
2131                   }
2132                   else if (((pass == 2) || (pass == 3)) && width)
2133                   {
2134                      int width_mmx = ((width >> 1) << 1);
2135                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
2136                      if (width_mmx)
2137                      {
2138                         int dummy_value_c;  // fix 'forbidden register spilled'
2139                         int dummy_value_S;
2140                         int dummy_value_D;
2141
2142                         __asm__ __volatile__ (
2143                            "subl $4, %%esi          \n\t"
2144                            "subl $28, %%edi         \n\t"
2145
2146                         ".loop4_pass2:              \n\t"
2147                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2148                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2149                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
2150                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
2151                            "movq %%mm0, (%%edi)     \n\t"
2152                            "movq %%mm0, 8(%%edi)    \n\t"
2153                            "movq %%mm1, 16(%%edi)   \n\t"
2154                            "movq %%mm1, 24(%%edi)   \n\t"
2155                            "subl $8, %%esi          \n\t"
2156                            "subl $32, %%edi         \n\t"
2157                            "subl $2, %%ecx          \n\t"
2158                            "jnz .loop4_pass2        \n\t"
2159                            "EMMS                    \n\t" // DONE
2160
2161                            : "=c" (dummy_value_c),        // output regs (dummy)
2162                              "=S" (dummy_value_S),
2163                              "=D" (dummy_value_D)
2164
2165                            : "1" (sptr),      // esi      // input regs
2166                              "2" (dp),        // edi
2167                              "0" (width_mmx)  // ecx
2168
2169 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2170                            : "%mm0", "%mm1"               // clobber list
2171 #endif
2172                         );
2173                      }
2174
2175                      sptr -= (width_mmx*4 - 4); // sign fixed
2176                      dp -= (width_mmx*16 - 4);  // sign fixed
2177                      for (i = width; i; i--)
2178                      {
2179                         png_byte v[8];
2180                         int j;
2181                         sptr -= 4;
2182                         png_memcpy(v, sptr, 4);
2183                         for (j = 0; j < png_pass_inc[pass]; j++)
2184                         {
2185                            dp -= 4;
2186                            png_memcpy(dp, v, 4);
2187                         }
2188                      }
2189                   }
2190                   else if (width)  // pass == 4 or 5
2191                   {
2192                      int width_mmx = ((width >> 1) << 1) ;
2193                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
2194                      if (width_mmx)
2195                      {
2196                         int dummy_value_c;  // fix 'forbidden register spilled'
2197                         int dummy_value_S;
2198                         int dummy_value_D;
2199
2200                         __asm__ __volatile__ (
2201                            "subl $4, %%esi          \n\t"
2202                            "subl $12, %%edi         \n\t"
2203
2204                         ".loop4_pass4:              \n\t"
2205                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2206                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2207                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
2208                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
2209                            "movq %%mm0, (%%edi)     \n\t"
2210                            "subl $8, %%esi          \n\t"
2211                            "movq %%mm1, 8(%%edi)    \n\t"
2212                            "subl $16, %%edi         \n\t"
2213                            "subl $2, %%ecx          \n\t"
2214                            "jnz .loop4_pass4        \n\t"
2215                            "EMMS                    \n\t" // DONE
2216
2217                            : "=c" (dummy_value_c),        // output regs (dummy)
2218                              "=S" (dummy_value_S),
2219                              "=D" (dummy_value_D)
2220
2221                            : "1" (sptr),      // esi      // input regs
2222                              "2" (dp),        // edi
2223                              "0" (width_mmx)  // ecx
2224
2225 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2226                            : "%mm0", "%mm1"               // clobber list
2227 #endif
2228                         );
2229                      }
2230
2231                      sptr -= (width_mmx*4 - 4); // sign fixed
2232                      dp -= (width_mmx*8 - 4);   // sign fixed
2233                      for (i = width; i; i--)
2234                      {
2235                         png_byte v[8];
2236                         int j;
2237                         sptr -= 4;
2238                         png_memcpy(v, sptr, 4);
2239                         for (j = 0; j < png_pass_inc[pass]; j++)
2240                         {
2241                            dp -= 4;
2242                            png_memcpy(dp, v, 4);
2243                         }
2244                      }
2245                   }
2246                } /* end of pixel_bytes == 4 */
2247
2248                //--------------------------------------------------------------
2249                else if (pixel_bytes == 8)
2250                {
2251 // GRR TEST:  should work, but needs testing (special 64-bit version of rpng2?)
2252                   // GRR NOTE:  no need to combine passes here!
2253                   if (((pass == 0) || (pass == 1)) && width)
2254                   {
2255                      int dummy_value_c;  // fix 'forbidden register spilled'
2256                      int dummy_value_S;
2257                      int dummy_value_D;
2258
2259                      // source is 8-byte RRGGBBAA
2260                      // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
2261                      __asm__ __volatile__ (
2262                         "subl $56, %%edi         \n\t" // start of last block
2263
2264                      ".loop8_pass0:              \n\t"
2265                         "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2266                         "movq %%mm0, (%%edi)     \n\t"
2267                         "movq %%mm0, 8(%%edi)    \n\t"
2268                         "movq %%mm0, 16(%%edi)   \n\t"
2269                         "movq %%mm0, 24(%%edi)   \n\t"
2270                         "movq %%mm0, 32(%%edi)   \n\t"
2271                         "movq %%mm0, 40(%%edi)   \n\t"
2272                         "movq %%mm0, 48(%%edi)   \n\t"
2273                         "subl $8, %%esi          \n\t"
2274                         "movq %%mm0, 56(%%edi)   \n\t"
2275                         "subl $64, %%edi         \n\t"
2276                         "decl %%ecx              \n\t"
2277                         "jnz .loop8_pass0        \n\t"
2278                         "EMMS                    \n\t" // DONE
2279
2280                         : "=c" (dummy_value_c),        // output regs (dummy)
2281                           "=S" (dummy_value_S),
2282                           "=D" (dummy_value_D)
2283
2284                         : "1" (sptr),      // esi      // input regs
2285                           "2" (dp),        // edi
2286                           "0" (width)      // ecx
2287
2288 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2289                         : "%mm0"                       // clobber list
2290 #endif
2291                      );
2292                   }
2293                   else if (((pass == 2) || (pass == 3)) && width)
2294                   {
2295                      // source is 8-byte RRGGBBAA
2296                      // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
2297                      int width_mmx = ((width >> 1) << 1) ;
2298                      width -= width_mmx;
2299                      if (width_mmx)
2300                      {
2301                         int dummy_value_c;  // fix 'forbidden register spilled'
2302                         int dummy_value_S;
2303                         int dummy_value_D;
2304
2305                         __asm__ __volatile__ (
2306                            "subl $24, %%edi         \n\t" // start of last block
2307
2308                         ".loop8_pass2:              \n\t"
2309                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2310                            "movq %%mm0, (%%edi)     \n\t"
2311                            "movq %%mm0, 8(%%edi)    \n\t"
2312                            "movq %%mm0, 16(%%edi)   \n\t"
2313                            "subl $8, %%esi          \n\t"
2314                            "movq %%mm0, 24(%%edi)   \n\t"
2315                            "subl $32, %%edi         \n\t"
2316                            "decl %%ecx              \n\t"
2317                            "jnz .loop8_pass2        \n\t"
2318                            "EMMS                    \n\t" // DONE
2319
2320                            : "=c" (dummy_value_c),        // output regs (dummy)
2321                              "=S" (dummy_value_S),
2322                              "=D" (dummy_value_D)
2323
2324                            : "1" (sptr),      // esi      // input regs
2325                              "2" (dp),        // edi
2326                              "0" (width)      // ecx
2327
2328 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2329                            : "%mm0"                       // clobber list
2330 #endif
2331                         );
2332                      }
2333                   }
2334                   else if (width)  // pass == 4 or 5
2335                   {
2336                      // source is 8-byte RRGGBBAA
2337                      // dest is 16-byte RRGGBBAA RRGGBBAA
2338                      int width_mmx = ((width >> 1) << 1) ;
2339                      width -= width_mmx;
2340                      if (width_mmx)
2341                      {
2342                         int dummy_value_c;  // fix 'forbidden register spilled'
2343                         int dummy_value_S;
2344                         int dummy_value_D;
2345
2346                         __asm__ __volatile__ (
2347                            "subl $8, %%edi          \n\t" // start of last block
2348
2349                         ".loop8_pass4:              \n\t"
2350                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2351                            "movq %%mm0, (%%edi)     \n\t"
2352                            "subl $8, %%esi          \n\t"
2353                            "movq %%mm0, 8(%%edi)    \n\t"
2354                            "subl $16, %%edi         \n\t"
2355                            "decl %%ecx              \n\t"
2356                            "jnz .loop8_pass4        \n\t"
2357                            "EMMS                    \n\t" // DONE
2358
2359                            : "=c" (dummy_value_c),        // output regs (dummy)
2360                              "=S" (dummy_value_S),
2361                              "=D" (dummy_value_D)
2362
2363                            : "1" (sptr),      // esi      // input regs
2364                              "2" (dp),        // edi
2365                              "0" (width)      // ecx
2366
2367 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2368                            : "%mm0"                       // clobber list
2369 #endif
2370                         );
2371                      }
2372                   }
2373
2374                } /* end of pixel_bytes == 8 */
2375
2376                //--------------------------------------------------------------
2377                else if (pixel_bytes == 6)
2378                {
2379                   for (i = width; i; i--)
2380                   {
2381                      png_byte v[8];
2382                      int j;
2383                      png_memcpy(v, sptr, 6);
2384                      for (j = 0; j < png_pass_inc[pass]; j++)
2385                      {
2386                         png_memcpy(dp, v, 6);
2387                         dp -= 6;
2388                      }
2389                      sptr -= 6;
2390                   }
2391                } /* end of pixel_bytes == 6 */
2392
2393                //--------------------------------------------------------------
2394                else
2395                {
2396                   for (i = width; i; i--)
2397                   {
2398                      png_byte v[8];
2399                      int j;
2400                      png_memcpy(v, sptr, pixel_bytes);
2401                      for (j = 0; j < png_pass_inc[pass]; j++)
2402                      {
2403                         png_memcpy(dp, v, pixel_bytes);
2404                         dp -= pixel_bytes;
2405                      }
2406                      sptr-= pixel_bytes;
2407                   }
2408                }
2409             } // end of _mmx_supported ========================================
2410
2411             else /* MMX not supported:  use modified C code - takes advantage
2412                   *   of inlining of memcpy for a constant */
2413                  /* GRR 19991007:  does it?  or should pixel_bytes in each
2414                   *   block be replaced with immediate value (e.g., 1)? */
2415                  /* GRR 19991017:  replaced with constants in each case */
2416             {
2417                if (pixel_bytes == 1)
2418                {
2419                   for (i = width; i; i--)
2420                   {
2421                      int j;
2422                      for (j = 0; j < png_pass_inc[pass]; j++)
2423                         *dp-- = *sptr;
2424                      --sptr;
2425                   }
2426                }
2427                else if (pixel_bytes == 3)
2428                {
2429                   for (i = width; i; i--)
2430                   {
2431                      png_byte v[8];
2432                      int j;
2433                      png_memcpy(v, sptr, 3);
2434                      for (j = 0; j < png_pass_inc[pass]; j++)
2435                      {
2436                         png_memcpy(dp, v, 3);
2437                         dp -= 3;
2438                      }
2439                      sptr -= 3;
2440                   }
2441                }
2442                else if (pixel_bytes == 2)
2443                {
2444                   for (i = width; i; i--)
2445                   {
2446                      png_byte v[8];
2447                      int j;
2448                      png_memcpy(v, sptr, 2);
2449                      for (j = 0; j < png_pass_inc[pass]; j++)
2450                      {
2451                         png_memcpy(dp, v, 2);
2452                         dp -= 2;
2453                      }
2454                      sptr -= 2;
2455                   }
2456                }
2457                else if (pixel_bytes == 4)
2458                {
2459                   for (i = width; i; i--)
2460                   {
2461                      png_byte v[8];
2462                      int j;
2463                      png_memcpy(v, sptr, 4);
2464                      for (j = 0; j < png_pass_inc[pass]; j++)
2465                      {
2466                         png_memcpy(dp, v, 4);
2467                         dp -= 4;
2468                      }
2469                      sptr -= 4;
2470                   }
2471                }
2472                else if (pixel_bytes == 6)
2473                {
2474                   for (i = width; i; i--)
2475                   {
2476                      png_byte v[8];
2477                      int j;
2478                      png_memcpy(v, sptr, 6);
2479                      for (j = 0; j < png_pass_inc[pass]; j++)
2480                      {
2481                         png_memcpy(dp, v, 6);
2482                         dp -= 6;
2483                      }
2484                      sptr -= 6;
2485                   }
2486                }
2487                else if (pixel_bytes == 8)
2488                {
2489                   for (i = width; i; i--)
2490                   {
2491                      png_byte v[8];
2492                      int j;
2493                      png_memcpy(v, sptr, 8);
2494                      for (j = 0; j < png_pass_inc[pass]; j++)
2495                      {
2496                         png_memcpy(dp, v, 8);
2497                         dp -= 8;
2498                      }
2499                      sptr -= 8;
2500                   }
2501                }
2502                else     // GRR:  should never be reached
2503                {
2504                   for (i = width; i; i--)
2505                   {
2506                      png_byte v[8];
2507                      int j;
2508                      png_memcpy(v, sptr, pixel_bytes);
2509                      for (j = 0; j < png_pass_inc[pass]; j++)
2510                      {
2511                         png_memcpy(dp, v, pixel_bytes);
2512                         dp -= pixel_bytes;
2513                      }
2514                      sptr -= pixel_bytes;
2515                   }
2516                }
2517
2518             } /* end if (MMX not supported) */
2519             break;
2520          }
2521       } /* end switch (row_info->pixel_depth) */
2522
2523       row_info->width = final_width;
2524       row_info->rowbytes = ((final_width *
2525          (png_uint_32)row_info->pixel_depth + 7) >> 3);
2526    }
2527
2528 } /* end png_do_read_interlace() */
2529
2530 #endif /* PNG_HAVE_ASSEMBLER_READ_INTERLACE */
2531 #endif /* PNG_READ_INTERLACING_SUPPORTED */
2532
2533
2534
2535
2536 // 64-bit operands that the inline asm in the functions below names directly (so do
2537 // not rename them).  Declared globally here, intended to sit on 8-byte boundaries.
2538
2539 union uAll {
2540    long long use;   // 64-bit integer view; as first member it receives the initializers below
2541    double  align;   // not read in the code shown; presumably only there for alignment -- TODO confirm
2542 } _LBCarryMask = {0x0101010101010101LL},   // bit 0 of each byte: picks out per-byte lsb's
2543   _HBClearMask = {0x7f7f7f7f7f7f7f7fLL},   // clears bit 7 of each byte after a 64-bit "psrlq $1"
2544   _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;   // no initializers: zero until a filter routine assigns .use
2545
2546
2547
2548
2549 //===========================================================================//
2550 //                                                                           //
2551 //           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G           //
2552 //                                                                           //
2553 //===========================================================================//
2554
2555 // Optimized code for PNG Average filter decoder
2556
2557 static void /* PRIVATE */
2558 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
2559                             png_bytep prev_row)
2560 {
2561    int bpp;
2562    int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
2563    int dummy_value_S;
2564    int dummy_value_D;
2565
2566    bpp = (row_info->pixel_depth + 7) >> 3;  // get # bytes per pixel
2567    _FullLength  = row_info->rowbytes;       // # of bytes to filter
2568
2569    __asm__ __volatile__ (
2570       // initialize address pointers and offset
2571 #ifdef __PIC__
2572       "pushl %%ebx                 \n\t" // save index to Global Offset Table
2573 #endif
2574 //pre "movl row, %%edi             \n\t" // edi:  Avg(x)
2575       "xorl %%ebx, %%ebx           \n\t" // ebx:  x
2576       "movl %%edi, %%edx           \n\t"
2577 //pre "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
2578 //pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
2579       "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
2580
2581       "xorl %%eax,%%eax            \n\t"
2582
2583       // Compute the Raw value for the first bpp bytes
2584       //    Raw(x) = Avg(x) + (Prior(x)/2)
2585    "avg_rlp:                       \n\t"
2586       "movb (%%esi,%%ebx,),%%al    \n\t" // load al with Prior(x)
2587       "incl %%ebx                  \n\t"
2588       "shrb %%al                   \n\t" // divide by 2
2589       "addb -1(%%edi,%%ebx,),%%al  \n\t" // add Avg(x); -1 to offset inc ebx
2590 //pre "cmpl bpp, %%ebx             \n\t" // (bpp is preloaded into ecx)
2591       "cmpl %%ecx, %%ebx           \n\t"
2592       "movb %%al,-1(%%edi,%%ebx,)  \n\t" // write Raw(x); -1 to offset inc ebx
2593       "jb avg_rlp                  \n\t" // mov does not affect flags
2594
2595       // get # of bytes to alignment
2596       "movl %%edi, _dif            \n\t" // take start of row
2597       "addl %%ebx, _dif            \n\t" // add bpp
2598       "addl $0xf, _dif             \n\t" // add 7+8 to incr past alignment bdry
2599       "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
2600       "subl %%edi, _dif            \n\t" // subtract from start => value ebx at
2601       "jz avg_go                   \n\t" //  alignment
2602
2603       // fix alignment
2604       // Compute the Raw value for the bytes up to the alignment boundary
2605       //    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2606       "xorl %%ecx, %%ecx           \n\t"
2607
2608    "avg_lp1:                       \n\t"
2609       "xorl %%eax, %%eax           \n\t"
2610       "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
2611       "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
2612       "addw %%cx, %%ax             \n\t"
2613       "incl %%ebx                  \n\t"
2614       "shrw %%ax                   \n\t" // divide by 2
2615       "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
2616       "cmpl _dif, %%ebx            \n\t" // check if at alignment boundary
2617       "movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2618       "jb avg_lp1                  \n\t" // repeat until at alignment boundary
2619
2620    "avg_go:                        \n\t"
2621       "movl _FullLength, %%eax     \n\t"
2622       "movl %%eax, %%ecx           \n\t"
2623       "subl %%ebx, %%eax           \n\t" // subtract alignment fix
2624       "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
2625       "subl %%eax, %%ecx           \n\t" // drop over bytes from original length
2626       "movl %%ecx, _MMXLength      \n\t"
2627 #ifdef __PIC__
2628       "popl %%ebx                  \n\t" // restore index to Global Offset Table
2629 #endif
2630
2631       : "=c" (dummy_value_c),            // output regs (dummy)
2632         "=S" (dummy_value_S),
2633         "=D" (dummy_value_D)
2634
2635       : "0" (bpp),       // ecx          // input regs
2636         "1" (prev_row),  // esi
2637         "2" (row)        // edi
2638
2639       : "%eax", "%edx"                   // clobber list
2640 #ifndef __PIC__
2641       , "%ebx"
2642 #endif
2643       // GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)
2644       // (seems to work fine without...)
2645    );
2646
2647    // now do the math for the rest of the row
2648    switch (bpp)
2649    {
2650       case 3:
2651       {
2652          _ActiveMask.use  = 0x0000000000ffffffLL;
2653          _ShiftBpp.use = 24;    // == 3 * 8
2654          _ShiftRem.use = 40;    // == 64 - 24
2655
2656          __asm__ __volatile__ (
2657             // re-init address pointers and offset
2658             "movq _ActiveMask, %%mm7      \n\t"
2659             "movl _dif, %%ecx             \n\t" // ecx:  x = offset to
2660             "movq _LBCarryMask, %%mm5     \n\t" //  alignment boundary
2661 // preload  "movl row, %%edi              \n\t" // edi:  Avg(x)
2662             "movq _HBClearMask, %%mm4     \n\t"
2663 // preload  "movl prev_row, %%esi         \n\t" // esi:  Prior(x)
2664
2665             // prime the pump:  load the first Raw(x-bpp) data set
2666             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2667                                                 // (correct pos. in loop below)
2668          "avg_3lp:                        \n\t"
2669             "movq (%%edi,%%ecx,), %%mm0   \n\t" // load mm0 with Avg(x)
2670             "movq %%mm5, %%mm3            \n\t"
2671             "psrlq _ShiftRem, %%mm2       \n\t" // correct position Raw(x-bpp) data
2672             "movq (%%esi,%%ecx,), %%mm1   \n\t" // load mm1 with Prior(x)
2673             "movq %%mm7, %%mm6            \n\t"
2674             "pand %%mm1, %%mm3            \n\t" // get lsb for each prev_row byte
2675             "psrlq $1, %%mm1              \n\t" // divide prev_row bytes by 2
2676             "pand  %%mm4, %%mm1           \n\t" // clear invalid bit 7 of each byte
2677             "paddb %%mm1, %%mm0           \n\t" // add (Prev_row/2) to Avg for each byte
2678             // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
2679             "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting LBCarrys
2680             "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte where both
2681                                // lsb's were == 1 (only valid for active group)
2682             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
2683             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each byte
2684             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2685             "pand %%mm6, %%mm2            \n\t" // leave only Active Group 1 bytes to add to Avg
2686             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
2687                                //  byte
2688             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
2689             "psllq _ShiftBpp, %%mm6       \n\t" // shift the mm6 mask to cover bytes 3-5
2690             "movq %%mm0, %%mm2            \n\t" // mov updated Raws to mm2
2691             "psllq _ShiftBpp, %%mm2       \n\t" // shift data to pos. correctly
2692             "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting LBCarrys
2693             "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte where both
2694                                // lsb's were == 1 (only valid for active group)
2695             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
2696             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each byte
2697             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2698             "pand %%mm6, %%mm2            \n\t" // leave only Active Group 2 bytes to add to Avg
2699             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
2700                                //  byte
2701
2702             // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
2703             "psllq _ShiftBpp, %%mm6       \n\t" // shift mm6 mask to cover last two
2704                                  // bytes
2705             "movq %%mm0, %%mm2            \n\t" // mov updated Raws to mm2
2706             "psllq _ShiftBpp, %%mm2       \n\t" // shift data to pos. correctly
2707                               // Data only needs to be shifted once here to
2708                               // get the correct x-bpp offset.
2709             "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting LBCarrys
2710             "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte where both
2711                               // lsb's were == 1 (only valid for active group)
2712             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
2713             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each byte
2714             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2715             "pand %%mm6, %%mm2            \n\t" // leave only Active Group 2 bytes to add to Avg
2716             "addl $8, %%ecx               \n\t"
2717             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
2718                                                 // byte
2719             // now ready to write back to memory
2720             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2721             // move updated Raw(x) to use as Raw(x-bpp) for next loop
2722             "cmpl _MMXLength, %%ecx       \n\t"
2723             "movq %%mm0, %%mm2            \n\t" // mov updated Raw(x) to mm2
2724             "jb avg_3lp                   \n\t"
2725
2726             : "=S" (dummy_value_S),             // output regs (dummy)
2727               "=D" (dummy_value_D)
2728
2729             : "0" (prev_row),  // esi           // input regs
2730               "1" (row)        // edi
2731
2732             : "%ecx"                            // clobber list
2733 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2734             , "%mm0", "%mm1", "%mm2", "%mm3"
2735             , "%mm4", "%mm5", "%mm6", "%mm7"
2736 #endif
2737          );
2738       }
2739       break;  // end 3 bpp
2740
2741       case 6:
2742       case 4:
2743       //case 7:   // who wrote this?  PNG doesn't support 5 or 7 bytes/pixel
2744       //case 5:   // GRR BOGUS
2745       {
2746          _ActiveMask.use  = 0xffffffffffffffffLL; // use shift below to clear
2747                                                   // appropriate inactive bytes
2748          _ShiftBpp.use = bpp << 3;
2749          _ShiftRem.use = 64 - _ShiftBpp.use;
2750
2751          __asm__ __volatile__ (
2752             "movq _HBClearMask, %%mm4    \n\t"
2753
2754             // re-init address pointers and offset
2755             "movl _dif, %%ecx            \n\t" // ecx:  x = offset to alignment boundary
2756
2757             // load _ActiveMask and clear all bytes except for 1st active group
2758             "movq _ActiveMask, %%mm7     \n\t"
2759 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
2760             "psrlq _ShiftRem, %%mm7      \n\t"
2761 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
2762             "movq %%mm7, %%mm6           \n\t"
2763             "movq _LBCarryMask, %%mm5    \n\t"
2764             "psllq _ShiftBpp, %%mm6      \n\t" // create mask for 2nd active group
2765
2766             // prime the pump:  load the first Raw(x-bpp) data set
2767             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2768                                           // (we correct pos. in loop below)
2769          "avg_4lp:                       \n\t"
2770             "movq (%%edi,%%ecx,), %%mm0  \n\t"
2771             "psrlq _ShiftRem, %%mm2      \n\t" // shift data to pos. correctly
2772             "movq (%%esi,%%ecx,), %%mm1  \n\t"
2773             // add (Prev_row/2) to average
2774             "movq %%mm5, %%mm3           \n\t"
2775             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
2776             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
2777             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each byte
2778             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for each byte
2779             // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
2780             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting LBCarrys
2781             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte where both
2782                               // lsb's were == 1 (only valid for active group)
2783             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
2784             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each byte
2785             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2786             "pand %%mm7, %%mm2           \n\t" // leave only Active Group 1 bytes to add to Avg
2787             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
2788                               // byte
2789             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
2790             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
2791             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
2792             "addl $8, %%ecx              \n\t"
2793             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting LBCarrys
2794             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte where both
2795                               // lsb's were == 1 (only valid for active group)
2796             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
2797             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each byte
2798             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2799             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2 bytes to add to Avg
2800             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
2801                               // byte
2802             "cmpl _MMXLength, %%ecx      \n\t"
2803             // now ready to write back to memory
2804             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2805             // prep Raw(x-bpp) for next loop
2806             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
2807             "jb avg_4lp                  \n\t"
2808
2809             : "=S" (dummy_value_S),            // output regs (dummy)
2810               "=D" (dummy_value_D)
2811
2812             : "0" (prev_row),  // esi          // input regs
2813               "1" (row)        // edi
2814
2815             : "%ecx"                           // clobber list
2816 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2817             , "%mm0", "%mm1", "%mm2", "%mm3"
2818             , "%mm4", "%mm5", "%mm6", "%mm7"
2819 #endif
2820          );
2821       }
2822       break;  // end 4,6 bpp
2823
2824       case 2:
2825       {
2826          _ActiveMask.use  = 0x000000000000ffffLL;
2827          _ShiftBpp.use = 16;   // == 2 * 8
2828          _ShiftRem.use = 48;   // == 64 - 16
2829
2830          __asm__ __volatile__ (
2831             // load _ActiveMask
2832             "movq _ActiveMask, %%mm7     \n\t"
2833             // re-init address pointers and offset
2834             "movl _dif, %%ecx            \n\t" // ecx:  x = offset to alignment boundary
2835             "movq _LBCarryMask, %%mm5    \n\t"
2836 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
2837             "movq _HBClearMask, %%mm4    \n\t"
2838 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
2839
2840             // prime the pump:  load the first Raw(x-bpp) data set
2841             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2842                               // (we correct pos. in loop below)
2843          "avg_2lp:                       \n\t"
2844             "movq (%%edi,%%ecx,), %%mm0  \n\t"
2845             "psrlq _ShiftRem, %%mm2      \n\t" // shift data to pos. correctly (GRR BUGFIX:  was psllq)
2846             "movq (%%esi,%%ecx,), %%mm1  \n\t" // load Prior(x)
2847             // add (Prev_row/2) to average
2848             "movq %%mm5, %%mm3           \n\t"
2849             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
2850             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
2851             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each byte
2852             "movq %%mm7, %%mm6           \n\t"
2853             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for each byte
2854
2855             // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
2856             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting LBCarrys
2857             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte where both
2858                                                // lsb's were == 1 (only valid for active group)
2859             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
2860             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each byte
2861             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2862             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 1 bytes to add to Avg
2863             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
2864
2865             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
2866             "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover bytes 2 & 3
2867             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
2868             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
2869             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting LBCarrys
2870             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte where both
2871                                                // lsb's were == 1 (only valid for active group)
2872             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
2873             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each byte
2874             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2875             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2 bytes to add to Avg
2876             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
2877
2878             // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
2879             "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover bytes 4 & 5
2880             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
2881             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
2882             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting LBCarrys
2883             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte where both
2884                                                // lsb's were == 1 (only valid for active group)
2885             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
2886             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each byte
2887             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2888             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 3 bytes to add to Avg
2889             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
2890
2891             // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
2892             "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover bytes 6 & 7
2893             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
2894             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
2895             "addl $8, %%ecx              \n\t"
2896             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting LBCarrys
2897             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte where both
2898                                                // lsb's were == 1 (only valid for active group)
2899             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
2900             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each byte
2901             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2902             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 4 bytes to add to Avg
2903             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
2904
2905             "cmpl _MMXLength, %%ecx      \n\t"
2906             // now ready to write back to memory
2907             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2908             // prep Raw(x-bpp) for next loop
2909             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
2910             "jb avg_2lp                  \n\t"
2911
2912             : "=S" (dummy_value_S),            // output regs (dummy)
2913               "=D" (dummy_value_D)
2914
2915             : "0" (prev_row),  // esi          // input regs
2916               "1" (row)        // edi
2917
2918             : "%ecx"                           // clobber list
2919 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2920             , "%mm0", "%mm1", "%mm2", "%mm3"
2921             , "%mm4", "%mm5", "%mm6", "%mm7"
2922 #endif
2923          );
2924       }
2925       break;  // end 2 bpp
2926
2927       case 1:
2928       {
2929          __asm__ __volatile__ (
2930             // re-init address pointers and offset
2931 #ifdef __PIC__
2932             "pushl %%ebx                 \n\t" // save Global Offset Table index
2933 #endif
2934             "movl _dif, %%ebx            \n\t" // ebx:  x = offset to alignment boundary
2935 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
2936             "cmpl _FullLength, %%ebx     \n\t" // test if offset at end of array
2937             "jnb avg_1end                \n\t"
2938             // do Avg decode for remaining bytes
2939 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
2940             "movl %%edi, %%edx           \n\t"
2941 // preload  "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
2942             "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
2943             "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx
2944                                                //  in loop below
2945          "avg_1lp:                       \n\t"
2946             // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2947             "xorl %%eax, %%eax           \n\t"
2948             "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
2949             "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
2950             "addw %%cx, %%ax             \n\t"
2951             "incl %%ebx                  \n\t"
2952             "shrw %%ax                   \n\t" // divide by 2
2953             "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
2954             "cmpl _FullLength, %%ebx     \n\t" // check if at end of array
2955             "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x);
2956                          // mov does not affect flags; -1 to offset inc ebx
2957             "jb avg_1lp                  \n\t"
2958
2959          "avg_1end:                      \n\t"
2960 #ifdef __PIC__
2961             "popl %%ebx                  \n\t" // Global Offset Table index
2962 #endif
2963
2964             : "=c" (dummy_value_c),            // output regs (dummy)
2965               "=S" (dummy_value_S),
2966               "=D" (dummy_value_D)
2967
2968             : "0" (bpp),       // ecx          // input regs
2969               "1" (prev_row),  // esi
2970               "2" (row)        // edi
2971
2972             : "%eax", "%edx"                   // clobber list
2973 #ifndef __PIC__
2974             , "%ebx"
2975 #endif
2976          );
2977       }
2978       return;  // end 1 bpp
2979
2980       case 8:
2981       {
2982          __asm__ __volatile__ (
2983             // re-init address pointers and offset
2984             "movl _dif, %%ecx            \n\t" // ecx:  x == offset to alignment
2985             "movq _LBCarryMask, %%mm5    \n\t" //            boundary
2986 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
2987             "movq _HBClearMask, %%mm4    \n\t"
2988 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
2989
2990             // prime the pump:  load the first Raw(x-bpp) data set
2991             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2992                                       // (NO NEED to correct pos. in loop below)
2993
2994          "avg_8lp:                       \n\t"
2995             "movq (%%edi,%%ecx,), %%mm0  \n\t"
2996             "movq %%mm5, %%mm3           \n\t"
2997             "movq (%%esi,%%ecx,), %%mm1  \n\t"
2998             "addl $8, %%ecx              \n\t"
2999             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3000             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3001             "pand %%mm2, %%mm3           \n\t" // get LBCarrys for each byte
3002                                                //  where both lsb's were == 1
3003             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3004             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7, each byte
3005             "paddb %%mm3, %%mm0          \n\t" // add LBCarrys to Avg, each byte
3006             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7, each byte
3007             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg, each
3008             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) to Avg for each
3009             "cmpl _MMXLength, %%ecx      \n\t"
3010             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3011             "movq %%mm0, %%mm2           \n\t" // reuse as Raw(x-bpp)
3012             "jb avg_8lp                  \n\t"
3013
3014             : "=S" (dummy_value_S),            // output regs (dummy)
3015               "=D" (dummy_value_D)
3016
3017             : "0" (prev_row),  // esi          // input regs
3018               "1" (row)        // edi
3019
3020             : "%ecx"                           // clobber list
3021 #if 0  /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
3022             , "%mm0", "%mm1", "%mm2"
3023             , "%mm3", "%mm4", "%mm5"
3024 #endif
3025          );
3026       }
3027       break;  // end 8 bpp
3028
3029       default:                  // bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8)
3030       {
3031
3032          // GRR:  PRINT ERROR HERE:  SHOULD NEVER BE REACHED
3033          fprintf(stderr,
3034            "libpng:  internal logic error (png_read_filter_row_mmx_avg())\n");
3035
3036 #if 0
3037         __asm__ __volatile__ (
3038             "movq _LBCarryMask, %%mm5    \n\t"
3039             // re-init address pointers and offset
3040             "movl _dif, %%ebx            \n\t" // ebx:  x = offset to alignment boundary
3041             "movl row, %%edi             \n\t" // edi:  Avg(x)
3042             "movq _HBClearMask, %%mm4    \n\t"
3043             "movl %%edi, %%edx           \n\t"
3044             "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
3045             "subl bpp, %%edx             \n\t" // edx:  Raw(x-bpp)
3046          "avg_Alp:                       \n\t"
3047             "movq (%%edi,%%ebx,), %%mm0  \n\t"
3048             "movq %%mm5, %%mm3           \n\t"
3049             "movq (%%esi,%%ebx,), %%mm1  \n\t"
3050             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3051             "movq (%%edx,%%ebx,), %%mm2  \n\t"
3052             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3053             "pand %%mm2, %%mm3           \n\t" // get LBCarrys for each byte where both
3054                                 // lsb's were == 1
3055             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3056             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each byte
3057             "paddb %%mm3, %%mm0          \n\t" // add LBCarrys to Avg for each byte
3058             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each byte
3059             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for each byte
3060             "addl $8, %%ebx              \n\t"
3061             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) to Avg for each byte
3062             "cmpl _MMXLength, %%ebx      \n\t"
3063             "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
3064             "jb avg_Alp                  \n\t"
3065
3066             : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
3067
3068             : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
3069
3070             : "%ebx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
3071          );
3072 #endif /* 0 - NEVER REACHED */
3073       }
3074       break;
3075
3076    } // end switch (bpp)
3077
3078    __asm__ __volatile__ (
3079       // MMX acceleration complete; now do clean-up
3080       // check if any remaining bytes left to decode
3081 #ifdef __PIC__
3082       "pushl %%ebx                 \n\t" // save index to Global Offset Table
3083 #endif
3084       "movl _MMXLength, %%ebx      \n\t" // ebx:  x == offset bytes after MMX
3085 //pre "movl row, %%edi             \n\t" // edi:  Avg(x)
3086       "cmpl _FullLength, %%ebx     \n\t" // test if offset at end of array
3087       "jnb avg_end                 \n\t"
3088
3089       // do Avg decode for remaining bytes
3090 //pre "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
3091       "movl %%edi, %%edx           \n\t"
3092 //pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
3093       "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
3094       "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
3095
3096    "avg_lp2:                       \n\t"
3097       // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3098       "xorl %%eax, %%eax           \n\t"
3099       "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
3100       "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
3101       "addw %%cx, %%ax             \n\t"
3102       "incl %%ebx                  \n\t"
3103       "shrw %%ax                   \n\t" // divide by 2
3104       "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
3105       "cmpl _FullLength, %%ebx     \n\t" // check if at end of array
3106       "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x) [mov does not
3107       "jb avg_lp2                  \n\t" //  affect flags; -1 to offset inc ebx]
3108
3109    "avg_end:                       \n\t"
3110       "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
3111 #ifdef __PIC__
3112       "popl %%ebx                  \n\t" // restore index to Global Offset Table
3113 #endif
3114
3115       : "=c" (dummy_value_c),            // output regs (dummy)
3116         "=S" (dummy_value_S),
3117         "=D" (dummy_value_D)
3118
3119       : "0" (bpp),       // ecx          // input regs
3120         "1" (prev_row),  // esi
3121         "2" (row)        // edi
3122
3123       : "%eax", "%edx"                   // clobber list
3124 #ifndef __PIC__
3125       , "%ebx"
3126 #endif
3127    );
3128
3129 } /* end png_read_filter_row_mmx_avg() */
3130
3131
3132
3133
3134 //===========================================================================//
3135 //                                                                           //
3136 //         P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H         //
3137 //                                                                           //
3138 //===========================================================================//
3139
3140 // Optimized code for PNG Paeth filter decoder
3141
3142 static void /* PRIVATE */
3143 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
3144                               png_bytep prev_row)
3145 {
3146    int bpp;
3147    int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
3148    int dummy_value_S;
3149    int dummy_value_D;
3150
3151    bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3152    _FullLength  = row_info->rowbytes; // # of bytes to filter
3153
3154    __asm__ __volatile__ (
3155 #ifdef __PIC__
3156       "pushl %%ebx                 \n\t" // save index to Global Offset Table
3157 #endif
3158       "xorl %%ebx, %%ebx           \n\t" // ebx:  x offset
3159 //pre "movl row, %%edi             \n\t"
3160       "xorl %%edx, %%edx           \n\t" // edx:  x-bpp offset
3161 //pre "movl prev_row, %%esi        \n\t"
3162       "xorl %%eax, %%eax           \n\t"
3163
3164       // Compute the Raw value for the first bpp bytes
3165       // Note: the formula works out to be always
3166       //   Paeth(x) = Raw(x) + Prior(x)      where x < bpp
3167    "paeth_rlp:                     \n\t"
3168       "movb (%%edi,%%ebx,), %%al   \n\t"
3169       "addb (%%esi,%%ebx,), %%al   \n\t"
3170       "incl %%ebx                  \n\t"
3171 //pre "cmpl bpp, %%ebx             \n\t" (bpp is preloaded into ecx)
3172       "cmpl %%ecx, %%ebx           \n\t"
3173       "movb %%al, -1(%%edi,%%ebx,) \n\t"
3174       "jb paeth_rlp                \n\t"
3175       // get # of bytes to alignment
3176       "movl %%edi, _dif            \n\t" // take start of row
3177       "addl %%ebx, _dif            \n\t" // add bpp
3178       "xorl %%ecx, %%ecx           \n\t"
3179       "addl $0xf, _dif             \n\t" // add 7 + 8 to incr past alignment boundary
3180       "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
3181       "subl %%edi, _dif            \n\t" // subtract from start ==> value ebx at alignment
3182       "jz paeth_go                 \n\t"
3183       // fix alignment
3184
3185    "paeth_lp1:                     \n\t"
3186       "xorl %%eax, %%eax           \n\t"
3187       // pav = p - a = (a + b - c) - a = b - c
3188       "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
3189       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
3190       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
3191       "movl %%eax, _patemp         \n\t" // Save pav for later use
3192       "xorl %%eax, %%eax           \n\t"
3193       // pbv = p - b = (a + b - c) - b = a - c
3194       "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
3195       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
3196       "movl %%eax, %%ecx           \n\t"
3197       // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3198       "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
3199       // pc = abs(pcv)
3200       "testl $0x80000000, %%eax    \n\t"
3201       "jz paeth_pca                \n\t"
3202       "negl %%eax                  \n\t" // reverse sign of neg values
3203
3204    "paeth_pca:                     \n\t"
3205       "movl %%eax, _pctemp         \n\t" // save pc for later use
3206       // pb = abs(pbv)
3207       "testl $0x80000000, %%ecx    \n\t"
3208       "jz paeth_pba                \n\t"
3209       "negl %%ecx                  \n\t" // reverse sign of neg values
3210
3211    "paeth_pba:                     \n\t"
3212       "movl %%ecx, _pbtemp         \n\t" // save pb for later use
3213       // pa = abs(pav)
3214       "movl _patemp, %%eax         \n\t"
3215       "testl $0x80000000, %%eax    \n\t"
3216       "jz paeth_paa                \n\t"
3217       "negl %%eax                  \n\t" // reverse sign of neg values
3218
3219    "paeth_paa:                     \n\t"
3220       "movl %%eax, _patemp         \n\t" // save pa for later use
3221       // test if pa <= pb
3222       "cmpl %%ecx, %%eax           \n\t"
3223       "jna paeth_abb               \n\t"
3224       // pa > pb; now test if pb <= pc
3225       "cmpl _pctemp, %%ecx         \n\t"
3226       "jna paeth_bbc               \n\t"
3227       // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3228       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
3229       "jmp paeth_paeth             \n\t"
3230
3231    "paeth_bbc:                     \n\t"
3232       // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3233       "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
3234       "jmp paeth_paeth             \n\t"
3235
3236    "paeth_abb:                     \n\t"
3237       // pa <= pb; now test if pa <= pc
3238       "cmpl _pctemp, %%eax         \n\t"
3239       "jna paeth_abc               \n\t"
3240       // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3241       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
3242       "jmp paeth_paeth             \n\t"
3243
3244    "paeth_abc:                     \n\t"
3245       // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3246       "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
3247
3248    "paeth_paeth:                   \n\t"
3249       "incl %%ebx                  \n\t"
3250       "incl %%edx                  \n\t"
3251       // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3252       "addb %%cl, -1(%%edi,%%ebx,) \n\t"
3253       "cmpl _dif, %%ebx            \n\t"
3254       "jb paeth_lp1                \n\t"
3255
3256    "paeth_go:                      \n\t"
3257       "movl _FullLength, %%ecx     \n\t"
3258       "movl %%ecx, %%eax           \n\t"
3259       "subl %%ebx, %%eax           \n\t" // subtract alignment fix
3260       "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
3261       "subl %%eax, %%ecx           \n\t" // drop over bytes from original length
3262       "movl %%ecx, _MMXLength      \n\t"
3263 #ifdef __PIC__
3264       "popl %%ebx                  \n\t" // restore index to Global Offset Table
3265 #endif
3266
3267       : "=c" (dummy_value_c),            // output regs (dummy)
3268         "=S" (dummy_value_S),
3269         "=D" (dummy_value_D)
3270
3271       : "0" (bpp),       // ecx          // input regs
3272         "1" (prev_row),  // esi
3273         "2" (row)        // edi
3274
3275       : "%eax", "%edx"                   // clobber list
3276 #ifndef __PIC__
3277       , "%ebx"
3278 #endif
3279    );
3280
3281    // now do the math for the rest of the row
3282    switch (bpp)
3283    {
3284       case 3:
3285       {
3286          _ActiveMask.use = 0x0000000000ffffffLL;
3287          _ActiveMaskEnd.use = 0xffff000000000000LL;
3288          _ShiftBpp.use = 24;    // == bpp(3) * 8
3289          _ShiftRem.use = 40;    // == 64 - 24
3290
3291          __asm__ __volatile__ (
3292             "movl _dif, %%ecx            \n\t"
3293 // preload  "movl row, %%edi             \n\t"
3294 // preload  "movl prev_row, %%esi        \n\t"
3295             "pxor %%mm0, %%mm0           \n\t"
3296             // prime the pump:  load the first Raw(x-bpp) data set
3297             "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3298          "paeth_3lp:                     \n\t"
3299             "psrlq _ShiftRem, %%mm1      \n\t" // shift last 3 bytes to 1st 3 bytes
3300             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
3301             "punpcklbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
3302             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
3303             "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
3304             "psrlq _ShiftRem, %%mm3      \n\t" // shift last 3 bytes to 1st 3 bytes
3305             // pav = p - a = (a + b - c) - a = b - c
3306             "movq %%mm2, %%mm4           \n\t"
3307             "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3308             // pbv = p - b = (a + b - c) - b = a - c
3309             "movq %%mm1, %%mm5           \n\t"
3310             "psubw %%mm3, %%mm4          \n\t"
3311             "pxor %%mm7, %%mm7           \n\t"
3312             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3313             "movq %%mm4, %%mm6           \n\t"
3314             "psubw %%mm3, %%mm5          \n\t"
3315
3316             // pa = abs(p-a) = abs(pav)
3317             // pb = abs(p-b) = abs(pbv)
3318             // pc = abs(p-c) = abs(pcv)
3319             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3320             "paddw %%mm5, %%mm6          \n\t"
3321             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
3322             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3323             "psubw %%mm0, %%mm4          \n\t"
3324             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
3325             "psubw %%mm0, %%mm4          \n\t"
3326             "psubw %%mm7, %%mm5          \n\t"
3327             "pxor %%mm0, %%mm0           \n\t"
3328             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3329             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
3330             "psubw %%mm7, %%mm5          \n\t"
3331             "psubw %%mm0, %%mm6          \n\t"
3332             //  test pa <= pb
3333             "movq %%mm4, %%mm7           \n\t"
3334             "psubw %%mm0, %%mm6          \n\t"
3335             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3336             "movq %%mm7, %%mm0           \n\t"
3337             // use mm7 mask to merge pa & pb
3338             "pand %%mm7, %%mm5           \n\t"
3339             // use mm0 mask copy to merge a & b
3340             "pand %%mm0, %%mm2           \n\t"
3341             "pandn %%mm4, %%mm7          \n\t"
3342             "pandn %%mm1, %%mm0          \n\t"
3343             "paddw %%mm5, %%mm7          \n\t"
3344             "paddw %%mm2, %%mm0          \n\t"
3345             //  test  ((pa <= pb)? pa:pb) <= pc
3346             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3347             "pxor %%mm1, %%mm1           \n\t"
3348             "pand %%mm7, %%mm3           \n\t"
3349             "pandn %%mm0, %%mm7          \n\t"
3350             "paddw %%mm3, %%mm7          \n\t"
3351             "pxor %%mm0, %%mm0           \n\t"
3352             "packuswb %%mm1, %%mm7       \n\t"
3353             "movq (%%esi,%%ecx,), %%mm3  \n\t" // load c=Prior(x-bpp)
3354             "pand _ActiveMask, %%mm7     \n\t"
3355             "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
3356             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3357             "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3358             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
3359             "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as Raw(x-bpp)
3360             // now do Paeth for 2nd set of bytes (3-5)
3361             "psrlq _ShiftBpp, %%mm2      \n\t" // load b=Prior(x) step 2
3362             "punpcklbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
3363             "pxor %%mm7, %%mm7           \n\t"
3364             "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
3365             // pbv = p - b = (a + b - c) - b = a - c
3366             "movq %%mm1, %%mm5           \n\t"
3367             // pav = p - a = (a + b - c) - a = b - c
3368             "movq %%mm2, %%mm4           \n\t"
3369             "psubw %%mm3, %%mm5          \n\t"
3370             "psubw %%mm3, %%mm4          \n\t"
3371             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
3372             //       pav + pbv = pbv + pav
3373             "movq %%mm5, %%mm6           \n\t"
3374             "paddw %%mm4, %%mm6          \n\t"
3375
3376             // pa = abs(p-a) = abs(pav)
3377             // pb = abs(p-b) = abs(pbv)
3378             // pc = abs(p-c) = abs(pcv)
3379             "pcmpgtw %%mm5, %%mm0        \n\t" // create mask pbv bytes < 0
3380             "pcmpgtw %%mm4, %%mm7        \n\t" // create mask pav bytes < 0
3381             "pand %%mm5, %%mm0           \n\t" // only pbv bytes < 0 in mm0
3382             "pand %%mm4, %%mm7           \n\t" // only pav bytes < 0 in mm7
3383             "psubw %%mm0, %%mm5          \n\t"
3384             "psubw %%mm7, %%mm4          \n\t"
3385             "psubw %%mm0, %%mm5          \n\t"
3386             "psubw %%mm7, %%mm4          \n\t"
3387             "pxor %%mm0, %%mm0           \n\t"
3388             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3389             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3390             "psubw %%mm0, %%mm6          \n\t"
3391             //  test pa <= pb
3392             "movq %%mm4, %%mm7           \n\t"
3393             "psubw %%mm0, %%mm6          \n\t"
3394             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3395             "movq %%mm7, %%mm0           \n\t"
3396             // use mm7 mask to merge pa & pb
3397             "pand %%mm7, %%mm5           \n\t"
3398             // use mm0 mask copy to merge a & b
3399             "pand %%mm0, %%mm2           \n\t"
3400             "pandn %%mm4, %%mm7          \n\t"
3401             "pandn %%mm1, %%mm0          \n\t"
3402             "paddw %%mm5, %%mm7          \n\t"
3403             "paddw %%mm2, %%mm0          \n\t"
3404             //  test  ((pa <= pb)? pa:pb) <= pc
3405             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3406             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
3407             "pand %%mm7, %%mm3           \n\t"
3408             "pandn %%mm0, %%mm7          \n\t"
3409             "pxor %%mm1, %%mm1           \n\t"
3410             "paddw %%mm3, %%mm7          \n\t"
3411             "pxor %%mm0, %%mm0           \n\t"
3412             "packuswb %%mm1, %%mm7       \n\t"
3413             "movq %%mm2, %%mm3           \n\t" // load c=Prior(x-bpp) step 1
3414             "pand _ActiveMask, %%mm7     \n\t"
3415             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
3416             "psllq _ShiftBpp, %%mm7      \n\t" // shift bytes to 2nd group of 3 bytes
3417              // pav = p - a = (a + b - c) - a = b - c
3418             "movq %%mm2, %%mm4           \n\t"
3419             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3420             "psllq _ShiftBpp, %%mm3      \n\t" // load c=Prior(x-bpp) step 2
3421             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
3422             "movq %%mm7, %%mm1           \n\t"
3423             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3424             "psllq _ShiftBpp, %%mm1      \n\t" // shift bytes
3425                                     // now mm1 will be used as Raw(x-bpp)
3426             // now do Paeth for 3rd, and final, set of bytes (6-7)
3427             "pxor %%mm7, %%mm7           \n\t"
3428             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
3429             "psubw %%mm3, %%mm4          \n\t"
3430             // pbv = p - b = (a + b - c) - b = a - c
3431             "movq %%mm1, %%mm5           \n\t"
3432             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3433             "movq %%mm4, %%mm6           \n\t"
3434             "psubw %%mm3, %%mm5          \n\t"
3435             "pxor %%mm0, %%mm0           \n\t"
3436             "paddw %%mm5, %%mm6          \n\t"
3437
3438             // pa = abs(p-a) = abs(pav)
3439             // pb = abs(p-b) = abs(pbv)
3440             // pc = abs(p-c) = abs(pcv)
3441             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3442             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3443             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3444             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3445             "psubw %%mm0, %%mm4          \n\t"
3446             "psubw %%mm7, %%mm5          \n\t"
3447             "psubw %%mm0, %%mm4          \n\t"
3448             "psubw %%mm7, %%mm5          \n\t"
3449             "pxor %%mm0, %%mm0           \n\t"
3450             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3451             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3452             "psubw %%mm0, %%mm6          \n\t"
3453             //  test pa <= pb
3454             "movq %%mm4, %%mm7           \n\t"
3455             "psubw %%mm0, %%mm6          \n\t"
3456             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3457             "movq %%mm7, %%mm0           \n\t"
3458             // use mm0 mask copy to merge a & b
3459             "pand %%mm0, %%mm2           \n\t"
3460             // use mm7 mask to merge pa & pb
3461             "pand %%mm7, %%mm5           \n\t"
3462             "pandn %%mm1, %%mm0          \n\t"
3463             "pandn %%mm4, %%mm7          \n\t"
3464             "paddw %%mm2, %%mm0          \n\t"
3465             "paddw %%mm5, %%mm7          \n\t"
3466             //  test  ((pa <= pb)? pa:pb) <= pc
3467             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3468             "pand %%mm7, %%mm3           \n\t"
3469             "pandn %%mm0, %%mm7          \n\t"
3470             "paddw %%mm3, %%mm7          \n\t"
3471             "pxor %%mm1, %%mm1           \n\t"
3472             "packuswb %%mm7, %%mm1       \n\t"
3473             // step ecx to next set of 8 bytes and repeat loop til done
3474             "addl $8, %%ecx              \n\t"
3475             "pand _ActiveMaskEnd, %%mm1  \n\t"
3476             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3477
3478             "cmpl _MMXLength, %%ecx      \n\t"
3479             "pxor %%mm0, %%mm0           \n\t" // pxor does not affect flags
3480             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3481                                  // mm1 will be used as Raw(x-bpp) next loop
3482                            // mm3 ready to be used as Prior(x-bpp) next loop
3483             "jb paeth_3lp                \n\t"
3484
3485             : "=S" (dummy_value_S),             // output regs (dummy)
3486               "=D" (dummy_value_D)
3487
3488             : "0" (prev_row),  // esi           // input regs
3489               "1" (row)        // edi
3490
3491             : "%ecx"                            // clobber list
3492 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3493             , "%mm0", "%mm1", "%mm2", "%mm3"
3494             , "%mm4", "%mm5", "%mm6", "%mm7"
3495 #endif
3496          );
3497       }
3498       break;  // end 3 bpp
3499
3500       case 6:
3501       //case 7:   // GRR BOGUS
3502       //case 5:   // GRR BOGUS
3503       {
3504          _ActiveMask.use  = 0x00000000ffffffffLL;
3505          _ActiveMask2.use = 0xffffffff00000000LL;
3506          _ShiftBpp.use = bpp << 3;    // == bpp * 8
3507          _ShiftRem.use = 64 - _ShiftBpp.use;
3508
3509          __asm__ __volatile__ (
3510             "movl _dif, %%ecx            \n\t"
3511 // preload  "movl row, %%edi             \n\t"
3512 // preload  "movl prev_row, %%esi        \n\t"
3513             // prime the pump:  load the first Raw(x-bpp) data set
3514             "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3515             "pxor %%mm0, %%mm0           \n\t"
3516
3517          "paeth_6lp:                     \n\t"
3518             // must shift to position Raw(x-bpp) data
3519             "psrlq _ShiftRem, %%mm1      \n\t"
3520             // do first set of 4 bytes
3521             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3522             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
3523             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
3524             "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
3525             // must shift to position Prior(x-bpp) data
3526             "psrlq _ShiftRem, %%mm3      \n\t"
3527             // pav = p - a = (a + b - c) - a = b - c
3528             "movq %%mm2, %%mm4           \n\t"
3529             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
3530             // pbv = p - b = (a + b - c) - b = a - c
3531             "movq %%mm1, %%mm5           \n\t"
3532             "psubw %%mm3, %%mm4          \n\t"
3533             "pxor %%mm7, %%mm7           \n\t"
3534             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3535             "movq %%mm4, %%mm6           \n\t"
3536             "psubw %%mm3, %%mm5          \n\t"
3537             // pa = abs(p-a) = abs(pav)
3538             // pb = abs(p-b) = abs(pbv)
3539             // pc = abs(p-c) = abs(pcv)
3540             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3541             "paddw %%mm5, %%mm6          \n\t"
3542             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3543             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3544             "psubw %%mm0, %%mm4          \n\t"
3545             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3546             "psubw %%mm0, %%mm4          \n\t"
3547             "psubw %%mm7, %%mm5          \n\t"
3548             "pxor %%mm0, %%mm0           \n\t"
3549             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3550             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3551             "psubw %%mm7, %%mm5          \n\t"
3552             "psubw %%mm0, %%mm6          \n\t"
3553             //  test pa <= pb
3554             "movq %%mm4, %%mm7           \n\t"
3555             "psubw %%mm0, %%mm6          \n\t"
3556             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3557             "movq %%mm7, %%mm0           \n\t"
3558             // use mm7 mask to merge pa & pb
3559             "pand %%mm7, %%mm5           \n\t"
3560             // use mm0 mask copy to merge a & b
3561             "pand %%mm0, %%mm2           \n\t"
3562             "pandn %%mm4, %%mm7          \n\t"
3563             "pandn %%mm1, %%mm0          \n\t"
3564             "paddw %%mm5, %%mm7          \n\t"
3565             "paddw %%mm2, %%mm0          \n\t"
3566             //  test  ((pa <= pb)? pa:pb) <= pc
3567             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3568             "pxor %%mm1, %%mm1           \n\t"
3569             "pand %%mm7, %%mm3           \n\t"
3570             "pandn %%mm0, %%mm7          \n\t"
3571             "paddw %%mm3, %%mm7          \n\t"
3572             "pxor %%mm0, %%mm0           \n\t"
3573             "packuswb %%mm1, %%mm7       \n\t"
3574             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3575             "pand _ActiveMask, %%mm7     \n\t"
3576             "psrlq _ShiftRem, %%mm3      \n\t"
3577             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x) step 1
3578             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor and Raw(x)
3579             "movq %%mm2, %%mm6           \n\t"
3580             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
3581             "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3582             "psllq _ShiftBpp, %%mm6      \n\t"
3583             "movq %%mm7, %%mm5           \n\t"
3584             "psrlq _ShiftRem, %%mm1      \n\t"
3585             "por %%mm6, %%mm3            \n\t"
3586             "psllq _ShiftBpp, %%mm5      \n\t"
3587             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3588             "por %%mm5, %%mm1            \n\t"
3589             // do second set of 4 bytes
3590             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
3591             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
3592             // pav = p - a = (a + b - c) - a = b - c
3593             "movq %%mm2, %%mm4           \n\t"
3594             // pbv = p - b = (a + b - c) - b = a - c
3595             "movq %%mm1, %%mm5           \n\t"
3596             "psubw %%mm3, %%mm4          \n\t"
3597             "pxor %%mm7, %%mm7           \n\t"
3598             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3599             "movq %%mm4, %%mm6           \n\t"
3600             "psubw %%mm3, %%mm5          \n\t"
3601             // pa = abs(p-a) = abs(pav)
3602             // pb = abs(p-b) = abs(pbv)
3603             // pc = abs(p-c) = abs(pcv)
3604             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3605             "paddw %%mm5, %%mm6          \n\t"
3606             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3607             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3608             "psubw %%mm0, %%mm4          \n\t"
3609             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3610             "psubw %%mm0, %%mm4          \n\t"
3611             "psubw %%mm7, %%mm5          \n\t"
3612             "pxor %%mm0, %%mm0           \n\t"
3613             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3614             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3615             "psubw %%mm7, %%mm5          \n\t"
3616             "psubw %%mm0, %%mm6          \n\t"
3617             //  test pa <= pb
3618             "movq %%mm4, %%mm7           \n\t"
3619             "psubw %%mm0, %%mm6          \n\t"
3620             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3621             "movq %%mm7, %%mm0           \n\t"
3622             // use mm7 mask to merge pa & pb
3623             "pand %%mm7, %%mm5           \n\t"
3624             // use mm0 mask copy to merge a & b
3625             "pand %%mm0, %%mm2           \n\t"
3626             "pandn %%mm4, %%mm7          \n\t"
3627             "pandn %%mm1, %%mm0          \n\t"
3628             "paddw %%mm5, %%mm7          \n\t"
3629             "paddw %%mm2, %%mm0          \n\t"
3630             //  test  ((pa <= pb)? pa:pb) <= pc
3631             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3632             "pxor %%mm1, %%mm1           \n\t"
3633             "pand %%mm7, %%mm3           \n\t"
3634             "pandn %%mm0, %%mm7          \n\t"
3635             "pxor %%mm1, %%mm1           \n\t"
3636             "paddw %%mm3, %%mm7          \n\t"
3637             "pxor %%mm0, %%mm0           \n\t"
3638             // step ecx to next set of 8 bytes and repeat loop til done
3639             "addl $8, %%ecx              \n\t"
3640             "packuswb %%mm7, %%mm1       \n\t"
3641             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3642             "cmpl _MMXLength, %%ecx      \n\t"
3643             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3644                                 // mm1 will be used as Raw(x-bpp) next loop
3645             "jb paeth_6lp                \n\t"
3646
3647             : "=S" (dummy_value_S),             // output regs (dummy)
3648               "=D" (dummy_value_D)
3649
3650             : "0" (prev_row),  // esi           // input regs
3651               "1" (row)        // edi
3652
3653             : "%ecx"                            // clobber list
3654 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3655             , "%mm0", "%mm1", "%mm2", "%mm3"
3656             , "%mm4", "%mm5", "%mm6", "%mm7"
3657 #endif
3658          );
3659       }
3660       break;  // end 6 bpp
3661
3662       case 4:
3663       {
3664          _ActiveMask.use  = 0x00000000ffffffffLL;
3665
3666          __asm__ __volatile__ (
3667             "movl _dif, %%ecx            \n\t"
3668 // preload  "movl row, %%edi             \n\t"
3669 // preload  "movl prev_row, %%esi        \n\t"
3670             "pxor %%mm0, %%mm0           \n\t"
3671             // prime the pump:  load the first Raw(x-bpp) data set
3672             "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
3673                                      //  a=Raw(x-bpp) bytes
3674          "paeth_4lp:                     \n\t"
3675             // do first set of 4 bytes
3676             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3677             "punpckhbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
3678             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
3679             "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
3680             // pav = p - a = (a + b - c) - a = b - c
3681             "movq %%mm2, %%mm4           \n\t"
3682             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3683             // pbv = p - b = (a + b - c) - b = a - c
3684             "movq %%mm1, %%mm5           \n\t"
3685             "psubw %%mm3, %%mm4          \n\t"
3686             "pxor %%mm7, %%mm7           \n\t"
3687             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3688             "movq %%mm4, %%mm6           \n\t"
3689             "psubw %%mm3, %%mm5          \n\t"
3690             // pa = abs(p-a) = abs(pav)
3691             // pb = abs(p-b) = abs(pbv)
3692             // pc = abs(p-c) = abs(pcv)
3693             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3694             "paddw %%mm5, %%mm6          \n\t"
3695             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3696             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3697             "psubw %%mm0, %%mm4          \n\t"
3698             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3699             "psubw %%mm0, %%mm4          \n\t"
3700             "psubw %%mm7, %%mm5          \n\t"
3701             "pxor %%mm0, %%mm0           \n\t"
3702             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3703             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3704             "psubw %%mm7, %%mm5          \n\t"
3705             "psubw %%mm0, %%mm6          \n\t"
3706             //  test pa <= pb
3707             "movq %%mm4, %%mm7           \n\t"
3708             "psubw %%mm0, %%mm6          \n\t"
3709             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3710             "movq %%mm7, %%mm0           \n\t"
3711             // use mm7 mask to merge pa & pb
3712             "pand %%mm7, %%mm5           \n\t"
3713             // use mm0 mask copy to merge a & b
3714             "pand %%mm0, %%mm2           \n\t"
3715             "pandn %%mm4, %%mm7          \n\t"
3716             "pandn %%mm1, %%mm0          \n\t"
3717             "paddw %%mm5, %%mm7          \n\t"
3718             "paddw %%mm2, %%mm0          \n\t"
3719             //  test  ((pa <= pb)? pa:pb) <= pc
3720             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3721             "pxor %%mm1, %%mm1           \n\t"
3722             "pand %%mm7, %%mm3           \n\t"
3723             "pandn %%mm0, %%mm7          \n\t"
3724             "paddw %%mm3, %%mm7          \n\t"
3725             "pxor %%mm0, %%mm0           \n\t"
3726             "packuswb %%mm1, %%mm7       \n\t"
3727             "movq (%%esi,%%ecx,), %%mm3  \n\t" // load c=Prior(x-bpp)
3728             "pand _ActiveMask, %%mm7     \n\t"
3729             "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
3730             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3731             "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3732             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
3733             "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as Raw(x-bpp)
3734             // do second set of 4 bytes
3735             "punpckhbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
3736             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
3737             // pav = p - a = (a + b - c) - a = b - c
3738             "movq %%mm2, %%mm4           \n\t"
3739             // pbv = p - b = (a + b - c) - b = a - c
3740             "movq %%mm1, %%mm5           \n\t"
3741             "psubw %%mm3, %%mm4          \n\t"
3742             "pxor %%mm7, %%mm7           \n\t"
3743             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3744             "movq %%mm4, %%mm6           \n\t"
3745             "psubw %%mm3, %%mm5          \n\t"
3746             // pa = abs(p-a) = abs(pav)
3747             // pb = abs(p-b) = abs(pbv)
3748             // pc = abs(p-c) = abs(pcv)
3749             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3750             "paddw %%mm5, %%mm6          \n\t"
3751             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3752             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3753             "psubw %%mm0, %%mm4          \n\t"
3754             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3755             "psubw %%mm0, %%mm4          \n\t"
3756             "psubw %%mm7, %%mm5          \n\t"
3757             "pxor %%mm0, %%mm0           \n\t"
3758             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3759             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3760             "psubw %%mm7, %%mm5          \n\t"
3761             "psubw %%mm0, %%mm6          \n\t"
3762             //  test pa <= pb
3763             "movq %%mm4, %%mm7           \n\t"
3764             "psubw %%mm0, %%mm6          \n\t"
3765             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3766             "movq %%mm7, %%mm0           \n\t"
3767             // use mm7 mask to merge pa & pb
3768             "pand %%mm7, %%mm5           \n\t"
3769             // use mm0 mask copy to merge a & b
3770             "pand %%mm0, %%mm2           \n\t"
3771             "pandn %%mm4, %%mm7          \n\t"
3772             "pandn %%mm1, %%mm0          \n\t"
3773             "paddw %%mm5, %%mm7          \n\t"
3774             "paddw %%mm2, %%mm0          \n\t"
3775             //  test  ((pa <= pb)? pa:pb) <= pc
3776             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3777             "pxor %%mm1, %%mm1           \n\t"
3778             "pand %%mm7, %%mm3           \n\t"
3779             "pandn %%mm0, %%mm7          \n\t"
3780             "pxor %%mm1, %%mm1           \n\t"
3781             "paddw %%mm3, %%mm7          \n\t"
3782             "pxor %%mm0, %%mm0           \n\t"
3783             // step ecx to next set of 8 bytes and repeat loop til done
3784             "addl $8, %%ecx              \n\t"
3785             "packuswb %%mm7, %%mm1       \n\t"
3786             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add predictor with Raw(x)
3787             "cmpl _MMXLength, %%ecx      \n\t"
3788             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3789                                 // mm1 will be used as Raw(x-bpp) next loop
3790             "jb paeth_4lp                \n\t"
3791
3792             : "=S" (dummy_value_S),             // output regs (dummy)
3793               "=D" (dummy_value_D)
3794
3795             : "0" (prev_row),  // esi           // input regs
3796               "1" (row)        // edi
3797
3798             : "%ecx"                            // clobber list
3799 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3800             , "%mm0", "%mm1", "%mm2", "%mm3"
3801             , "%mm4", "%mm5", "%mm6", "%mm7"
3802 #endif
3803          );
3804       }
3805       break;  // end 4 bpp
3806
3807       case 8:                          // bpp == 8
3808       {
3809          _ActiveMask.use  = 0x00000000ffffffffLL;
3810
3811          __asm__ __volatile__ (
3812             "movl _dif, %%ecx            \n\t"
3813 // preload  "movl row, %%edi             \n\t"
3814 // preload  "movl prev_row, %%esi        \n\t"
3815             "pxor %%mm0, %%mm0           \n\t"
3816             // prime the pump:  load the first Raw(x-bpp) data set
3817             "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
3818                                        //  a=Raw(x-bpp) bytes
3819          "paeth_8lp:                     \n\t"
3820             // do first set of 4 bytes
3821             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3822             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
3823             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
3824             "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
3825             // pav = p - a = (a + b - c) - a = b - c
3826             "movq %%mm2, %%mm4           \n\t"
3827             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
3828             // pbv = p - b = (a + b - c) - b = a - c
3829             "movq %%mm1, %%mm5           \n\t"
3830             "psubw %%mm3, %%mm4          \n\t"
3831             "pxor %%mm7, %%mm7           \n\t"
3832             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3833             "movq %%mm4, %%mm6           \n\t"
3834             "psubw %%mm3, %%mm5          \n\t"
3835             // pa = abs(p-a) = abs(pav)
3836             // pb = abs(p-b) = abs(pbv)
3837             // pc = abs(p-c) = abs(pcv)
3838             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3839             "paddw %%mm5, %%mm6          \n\t"
3840             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3841             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3842             "psubw %%mm0, %%mm4          \n\t"
3843             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3844             "psubw %%mm0, %%mm4          \n\t"
3845             "psubw %%mm7, %%mm5          \n\t"
3846             "pxor %%mm0, %%mm0           \n\t"
3847             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3848             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3849             "psubw %%mm7, %%mm5          \n\t"
3850             "psubw %%mm0, %%mm6          \n\t"
3851             //  test pa <= pb
3852             "movq %%mm4, %%mm7           \n\t"
3853             "psubw %%mm0, %%mm6          \n\t"
3854             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3855             "movq %%mm7, %%mm0           \n\t"
3856             // use mm7 mask to merge pa & pb
3857             "pand %%mm7, %%mm5           \n\t"
3858             // use mm0 mask copy to merge a & b
3859             "pand %%mm0, %%mm2           \n\t"
3860             "pandn %%mm4, %%mm7          \n\t"
3861             "pandn %%mm1, %%mm0          \n\t"
3862             "paddw %%mm5, %%mm7          \n\t"
3863             "paddw %%mm2, %%mm0          \n\t"
3864             //  test  ((pa <= pb)? pa:pb) <= pc
3865             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3866             "pxor %%mm1, %%mm1           \n\t"
3867             "pand %%mm7, %%mm3           \n\t"
3868             "pandn %%mm0, %%mm7          \n\t"
3869             "paddw %%mm3, %%mm7          \n\t"
3870             "pxor %%mm0, %%mm0           \n\t"
3871             "packuswb %%mm1, %%mm7       \n\t"
3872             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3873             "pand _ActiveMask, %%mm7     \n\t"
3874             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
3875             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3876             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3877             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
3878             "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
3879
3880             // do second set of 4 bytes
3881             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
3882             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
3883             // pav = p - a = (a + b - c) - a = b - c
3884             "movq %%mm2, %%mm4           \n\t"
3885             // pbv = p - b = (a + b - c) - b = a - c
3886             "movq %%mm1, %%mm5           \n\t"
3887             "psubw %%mm3, %%mm4          \n\t"
3888             "pxor %%mm7, %%mm7           \n\t"
3889             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3890             "movq %%mm4, %%mm6           \n\t"
3891             "psubw %%mm3, %%mm5          \n\t"
3892             // pa = abs(p-a) = abs(pav)
3893             // pb = abs(p-b) = abs(pbv)
3894             // pc = abs(p-c) = abs(pcv)
3895             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3896             "paddw %%mm5, %%mm6          \n\t"
3897             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3898             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3899             "psubw %%mm0, %%mm4          \n\t"
3900             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3901             "psubw %%mm0, %%mm4          \n\t"
3902             "psubw %%mm7, %%mm5          \n\t"
3903             "pxor %%mm0, %%mm0           \n\t"
3904             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3905             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3906             "psubw %%mm7, %%mm5          \n\t"
3907             "psubw %%mm0, %%mm6          \n\t"
3908             //  test pa <= pb
3909             "movq %%mm4, %%mm7           \n\t"
3910             "psubw %%mm0, %%mm6          \n\t"
3911             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3912             "movq %%mm7, %%mm0           \n\t"
3913             // use mm7 mask to merge pa & pb
3914             "pand %%mm7, %%mm5           \n\t"
3915             // use mm0 mask copy to merge a & b
3916             "pand %%mm0, %%mm2           \n\t"
3917             "pandn %%mm4, %%mm7          \n\t"
3918             "pandn %%mm1, %%mm0          \n\t"
3919             "paddw %%mm5, %%mm7          \n\t"
3920             "paddw %%mm2, %%mm0          \n\t"
3921             //  test  ((pa <= pb)? pa:pb) <= pc
3922             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3923             "pxor %%mm1, %%mm1           \n\t"
3924             "pand %%mm7, %%mm3           \n\t"
3925             "pandn %%mm0, %%mm7          \n\t"
3926             "pxor %%mm1, %%mm1           \n\t"
3927             "paddw %%mm3, %%mm7          \n\t"
3928             "pxor %%mm0, %%mm0           \n\t"
3929             // step ecx to next set of 8 bytes and repeat loop til done
3930             "addl $8, %%ecx              \n\t"
3931             "packuswb %%mm7, %%mm1       \n\t"
3932             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3933             "cmpl _MMXLength, %%ecx      \n\t"
3934             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3935                             // mm1 will be used as Raw(x-bpp) next loop
3936             "jb paeth_8lp                \n\t"
3937
3938             : "=S" (dummy_value_S),             // output regs (dummy)
3939               "=D" (dummy_value_D)
3940
3941             : "0" (prev_row),  // esi           // input regs
3942               "1" (row)        // edi
3943
3944             : "%ecx"                            // clobber list
3945 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3946             , "%mm0", "%mm1", "%mm2", "%mm3"
3947             , "%mm4", "%mm5", "%mm6", "%mm7"
3948 #endif
3949          );
3950       }
3951       break;  // end 8 bpp
3952
3953       case 1:                // bpp = 1
3954       case 2:                // bpp = 2
3955       default:               // bpp > 8
3956       {
3957          __asm__ __volatile__ (
3958 #ifdef __PIC__
3959             "pushl %%ebx                 \n\t" // save Global Offset Table index
3960 #endif
3961             "movl _dif, %%ebx            \n\t"
3962             "cmpl _FullLength, %%ebx     \n\t"
3963             "jnb paeth_dend              \n\t"
3964
3965 // preload  "movl row, %%edi             \n\t"
3966 // preload  "movl prev_row, %%esi        \n\t"
3967             // do Paeth decode for remaining bytes
3968             "movl %%ebx, %%edx           \n\t"
3969 // preload  "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
3970             "subl %%ecx, %%edx           \n\t" // edx = ebx - bpp
3971             "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx
3972
3973          "paeth_dlp:                     \n\t"
3974             "xorl %%eax, %%eax           \n\t"
3975             // pav = p - a = (a + b - c) - a = b - c
3976             "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
3977             "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
3978             "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
3979             "movl %%eax, _patemp         \n\t" // Save pav for later use
3980             "xorl %%eax, %%eax           \n\t"
3981             // pbv = p - b = (a + b - c) - b = a - c
3982             "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
3983             "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
3984             "movl %%eax, %%ecx           \n\t"
3985             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3986             "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
3987             // pc = abs(pcv)
3988             "testl $0x80000000, %%eax    \n\t"
3989             "jz paeth_dpca               \n\t"
3990             "negl %%eax                  \n\t" // reverse sign of neg values
3991
3992          "paeth_dpca:                    \n\t"
3993             "movl %%eax, _pctemp         \n\t" // save pc for later use
3994             // pb = abs(pbv)
3995             "testl $0x80000000, %%ecx    \n\t"
3996             "jz paeth_dpba               \n\t"
3997             "negl %%ecx                  \n\t" // reverse sign of neg values
3998
3999          "paeth_dpba:                    \n\t"
4000             "movl %%ecx, _pbtemp         \n\t" // save pb for later use
4001             // pa = abs(pav)
4002             "movl _patemp, %%eax         \n\t"
4003             "testl $0x80000000, %%eax    \n\t"
4004             "jz paeth_dpaa               \n\t"
4005             "negl %%eax                  \n\t" // reverse sign of neg values
4006
4007          "paeth_dpaa:                    \n\t"
4008             "movl %%eax, _patemp         \n\t" // save pa for later use
4009             // test if pa <= pb
4010             "cmpl %%ecx, %%eax           \n\t"
4011             "jna paeth_dabb              \n\t"
4012             // pa > pb; now test if pb <= pc
4013             "cmpl _pctemp, %%ecx         \n\t"
4014             "jna paeth_dbbc              \n\t"
4015             // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4016             "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4017             "jmp paeth_dpaeth            \n\t"
4018
4019          "paeth_dbbc:                    \n\t"
4020             // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4021             "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
4022             "jmp paeth_dpaeth            \n\t"
4023
4024          "paeth_dabb:                    \n\t"
4025             // pa <= pb; now test if pa <= pc
4026             "cmpl _pctemp, %%eax         \n\t"
4027             "jna paeth_dabc              \n\t"
4028             // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4029             "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4030             "jmp paeth_dpaeth            \n\t"
4031
4032          "paeth_dabc:                    \n\t"
4033             // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4034             "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
4035
4036          "paeth_dpaeth:                  \n\t"
4037             "incl %%ebx                  \n\t"
4038             "incl %%edx                  \n\t"
4039             // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4040             "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4041             "cmpl _FullLength, %%ebx     \n\t"
4042             "jb paeth_dlp                \n\t"
4043
4044          "paeth_dend:                    \n\t"
4045 #ifdef __PIC__
4046             "popl %%ebx                  \n\t" // index to Global Offset Table
4047 #endif
4048
4049             : "=c" (dummy_value_c),            // output regs (dummy)
4050               "=S" (dummy_value_S),
4051               "=D" (dummy_value_D)
4052
4053             : "0" (bpp),       // ecx          // input regs
4054               "1" (prev_row),  // esi
4055               "2" (row)        // edi
4056
4057             : "%eax", "%edx"                   // clobber list
4058 #ifndef __PIC__
4059             , "%ebx"
4060 #endif
4061          );
4062       }
4063       return;                   // No need to go further with this one
4064
4065    } // end switch (bpp)
4066
4067    __asm__ __volatile__ (
4068       // MMX acceleration complete; now do clean-up
4069       // check if any remaining bytes left to decode
4070 #ifdef __PIC__
4071       "pushl %%ebx                 \n\t" // save index to Global Offset Table
4072 #endif
4073       "movl _MMXLength, %%ebx      \n\t"
4074       "cmpl _FullLength, %%ebx     \n\t"
4075       "jnb paeth_end               \n\t"
4076 //pre "movl row, %%edi             \n\t"
4077 //pre "movl prev_row, %%esi        \n\t"
4078       // do Paeth decode for remaining bytes
4079       "movl %%ebx, %%edx           \n\t"
4080 //pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
4081       "subl %%ecx, %%edx           \n\t" // edx = ebx - bpp
4082       "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
4083
4084    "paeth_lp2:                     \n\t"
4085       "xorl %%eax, %%eax           \n\t"
4086       // pav = p - a = (a + b - c) - a = b - c
4087       "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
4088       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4089       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
4090       "movl %%eax, _patemp         \n\t" // Save pav for later use
4091       "xorl %%eax, %%eax           \n\t"
4092       // pbv = p - b = (a + b - c) - b = a - c
4093       "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
4094       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
4095       "movl %%eax, %%ecx           \n\t"
4096       // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4097       "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
4098       // pc = abs(pcv)
4099       "testl $0x80000000, %%eax    \n\t"
4100       "jz paeth_pca2               \n\t"
4101       "negl %%eax                  \n\t" // reverse sign of neg values
4102
4103    "paeth_pca2:                    \n\t"
4104       "movl %%eax, _pctemp         \n\t" // save pc for later use
4105       // pb = abs(pbv)
4106       "testl $0x80000000, %%ecx    \n\t"
4107       "jz paeth_pba2               \n\t"
4108       "negl %%ecx                  \n\t" // reverse sign of neg values
4109
4110    "paeth_pba2:                    \n\t"
4111       "movl %%ecx, _pbtemp         \n\t" // save pb for later use
4112       // pa = abs(pav)
4113       "movl _patemp, %%eax         \n\t"
4114       "testl $0x80000000, %%eax    \n\t"
4115       "jz paeth_paa2               \n\t"
4116       "negl %%eax                  \n\t" // reverse sign of neg values
4117
4118    "paeth_paa2:                    \n\t"
4119       "movl %%eax, _patemp         \n\t" // save pa for later use
4120       // test if pa <= pb
4121       "cmpl %%ecx, %%eax           \n\t"
4122       "jna paeth_abb2              \n\t"
4123       // pa > pb; now test if pb <= pc
4124       "cmpl _pctemp, %%ecx         \n\t"
4125       "jna paeth_bbc2              \n\t"
4126       // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4127       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4128       "jmp paeth_paeth2            \n\t"
4129
4130    "paeth_bbc2:                    \n\t"
4131       // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4132       "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
4133       "jmp paeth_paeth2            \n\t"
4134
4135    "paeth_abb2:                    \n\t"
4136       // pa <= pb; now test if pa <= pc
4137       "cmpl _pctemp, %%eax         \n\t"
4138       "jna paeth_abc2              \n\t"
4139       // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4140       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4141       "jmp paeth_paeth2            \n\t"
4142
4143    "paeth_abc2:                    \n\t"
4144       // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4145       "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
4146
4147    "paeth_paeth2:                  \n\t"
4148       "incl %%ebx                  \n\t"
4149       "incl %%edx                  \n\t"
4150       // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4151       "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4152       "cmpl _FullLength, %%ebx     \n\t"
4153       "jb paeth_lp2                \n\t"
4154
4155    "paeth_end:                     \n\t"
4156       "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
4157 #ifdef __PIC__
4158       "popl %%ebx                  \n\t" // restore index to Global Offset Table
4159 #endif
4160
4161       : "=c" (dummy_value_c),            // output regs (dummy)
4162         "=S" (dummy_value_S),
4163         "=D" (dummy_value_D)
4164
4165       : "0" (bpp),       // ecx          // input regs
4166         "1" (prev_row),  // esi
4167         "2" (row)        // edi
4168
4169       : "%eax", "%edx"                   // clobber list (no input regs!)
4170 #ifndef __PIC__
4171       , "%ebx"
4172 #endif
4173    );
4174
4175 } /* end png_read_filter_row_mmx_paeth() */
4176
4177
4178
4179
4180 //===========================================================================//
4181 //                                                                           //
4182 //           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B           //
4183 //                                                                           //
4184 //===========================================================================//
4185
4186 // Optimized code for PNG Sub filter decoder
4187
4188 static void /* PRIVATE */
4189 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
4190 {
4191    int bpp;
4192    int dummy_value_a;
4193    int dummy_value_D;
4194
4195    bpp = (row_info->pixel_depth + 7) >> 3;   // calc number of bytes per pixel
4196    _FullLength = row_info->rowbytes - bpp;   // number of bytes to filter
4197
4198    __asm__ __volatile__ (
4199 //pre "movl row, %%edi             \n\t"
4200       "movl %%edi, %%esi           \n\t" // lp = row
4201 //pre "movl bpp, %%eax             \n\t"
4202       "addl %%eax, %%edi           \n\t" // rp = row + bpp
4203 //irr "xorl %%eax, %%eax           \n\t"
4204       // get # of bytes to alignment
4205       "movl %%edi, _dif            \n\t" // take start of row
4206       "addl $0xf, _dif             \n\t" // add 7 + 8 to incr past
4207                                          //  alignment boundary
4208       "xorl %%ecx, %%ecx           \n\t"
4209       "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
4210       "subl %%edi, _dif            \n\t" // subtract from start ==> value
4211       "jz sub_go                   \n\t" //  ecx at alignment
4212
4213    "sub_lp1:                       \n\t" // fix alignment
4214       "movb (%%esi,%%ecx,), %%al   \n\t"
4215       "addb %%al, (%%edi,%%ecx,)   \n\t"
4216       "incl %%ecx                  \n\t"
4217       "cmpl _dif, %%ecx            \n\t"
4218       "jb sub_lp1                  \n\t"
4219
4220    "sub_go:                        \n\t"
4221       "movl _FullLength, %%eax     \n\t"
4222       "movl %%eax, %%edx           \n\t"
4223       "subl %%ecx, %%edx           \n\t" // subtract alignment fix
4224       "andl $0x00000007, %%edx     \n\t" // calc bytes over mult of 8
4225       "subl %%edx, %%eax           \n\t" // drop over bytes from length
4226       "movl %%eax, _MMXLength      \n\t"
4227
4228       : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4229         "=D" (dummy_value_D)    // 1
4230
4231       : "0" (bpp),              // eax    // input regs
4232         "1" (row)               // edi
4233
4234       : "%ebx", "%ecx", "%edx"            // clobber list
4235       , "%esi"
4236
4237 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4238       , "%mm0", "%mm1", "%mm2", "%mm3"
4239       , "%mm4", "%mm5", "%mm6", "%mm7"
4240 #endif
4241    );
4242
4243    // now do the math for the rest of the row
4244    switch (bpp)
4245    {
4246       case 3:
4247       {
4248          _ActiveMask.use  = 0x0000ffffff000000LL;
4249          _ShiftBpp.use = 24;       // == 3 * 8
4250          _ShiftRem.use  = 40;      // == 64 - 24
4251
4252          __asm__ __volatile__ (
4253 // preload  "movl row, %%edi              \n\t"
4254             "movq _ActiveMask, %%mm7       \n\t" // load _ActiveMask for 2nd
4255                                                 //  active byte group
4256             "movl %%edi, %%esi            \n\t" // lp = row
4257 // preload  "movl bpp, %%eax              \n\t"
4258             "addl %%eax, %%edi            \n\t" // rp = row + bpp
4259             "movq %%mm7, %%mm6            \n\t"
4260             "movl _dif, %%edx             \n\t"
4261             "psllq _ShiftBpp, %%mm6       \n\t" // move mask in mm6 to cover
4262                                                 //  3rd active byte group
4263             // prime the pump:  load the first Raw(x-bpp) data set
4264             "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4265
4266          "sub_3lp:                        \n\t" // shift data for adding first
4267             "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
4268                                                 //  shift clears inactive bytes)
4269             // add 1st active group
4270             "movq (%%edi,%%edx,), %%mm0   \n\t"
4271             "paddb %%mm1, %%mm0           \n\t"
4272
4273             // add 2nd active group
4274             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4275             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4276             "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
4277             "paddb %%mm1, %%mm0           \n\t"
4278
4279             // add 3rd active group
4280             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4281             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4282             "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
4283             "addl $8, %%edx               \n\t"
4284             "paddb %%mm1, %%mm0           \n\t"
4285
4286             "cmpl _MMXLength, %%edx       \n\t"
4287             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4288             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
4289             "jb sub_3lp                   \n\t"
4290
4291             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4292               "=D" (dummy_value_D)    // 1
4293
4294             : "0" (bpp),              // eax    // input regs
4295               "1" (row)               // edi
4296
4297             : "%edx", "%esi"                    // clobber list
4298 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4299             , "%mm0", "%mm1", "%mm6", "%mm7"
4300 #endif
4301          );
4302       }
4303       break;
4304
4305       case 1:
4306       {
4307          __asm__ __volatile__ (
4308             "movl _dif, %%edx            \n\t"
4309 // preload  "movl row, %%edi             \n\t"
4310             "cmpl _FullLength, %%edx     \n\t"
4311             "jnb sub_1end                \n\t"
4312             "movl %%edi, %%esi           \n\t" // lp = row
4313             "xorl %%eax, %%eax           \n\t"
4314 // preload  "movl bpp, %%eax             \n\t"
4315             "addl %%eax, %%edi           \n\t" // rp = row + bpp
4316
4317          "sub_1lp:                       \n\t"
4318             "movb (%%esi,%%edx,), %%al   \n\t"
4319             "addb %%al, (%%edi,%%edx,)   \n\t"
4320             "incl %%edx                  \n\t"
4321             "cmpl _FullLength, %%edx     \n\t"
4322             "jb sub_1lp                  \n\t"
4323
4324          "sub_1end:                      \n\t"
4325
4326             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4327               "=D" (dummy_value_D)    // 1
4328
4329             : "0" (bpp),              // eax    // input regs
4330               "1" (row)               // edi
4331
4332             : "%edx", "%esi"                    // clobber list
4333          );
4334       }
4335       return;
4336
4337       case 6:
4338       case 4:
4339       //case 7:   // GRR BOGUS
4340       //case 5:   // GRR BOGUS
4341       {
4342          _ShiftBpp.use = bpp << 3;
4343          _ShiftRem.use = 64 - _ShiftBpp.use;
4344
4345          __asm__ __volatile__ (
4346 // preload  "movl row, %%edi              \n\t"
4347             "movl _dif, %%edx             \n\t"
4348             "movl %%edi, %%esi            \n\t" // lp = row
4349 // preload  "movl bpp, %%eax              \n\t"
4350             "addl %%eax, %%edi            \n\t" // rp = row + bpp
4351
4352             // prime the pump:  load the first Raw(x-bpp) data set
4353             "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4354
4355          "sub_4lp:                        \n\t" // shift data for adding first
4356             "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
4357                                                 //  shift clears inactive bytes)
4358             "movq (%%edi,%%edx,), %%mm0   \n\t"
4359             "paddb %%mm1, %%mm0           \n\t"
4360
4361             // add 2nd active group
4362             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4363             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4364             "addl $8, %%edx               \n\t"
4365             "paddb %%mm1, %%mm0           \n\t"
4366
4367             "cmpl _MMXLength, %%edx       \n\t"
4368             "movq %%mm0, -8(%%edi,%%edx,) \n\t"
4369             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
4370             "jb sub_4lp                   \n\t"
4371
4372             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4373               "=D" (dummy_value_D)    // 1
4374
4375             : "0" (bpp),              // eax    // input regs
4376               "1" (row)               // edi
4377
4378             : "%edx", "%esi"                    // clobber list
4379 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4380             , "%mm0", "%mm1"
4381 #endif
4382          );
4383       }
4384       break;
4385
4386       case 2:
4387       {
4388          _ActiveMask.use = 0x00000000ffff0000LL;
4389          _ShiftBpp.use = 16;       // == 2 * 8
4390          _ShiftRem.use = 48;       // == 64 - 16
4391
4392          __asm__ __volatile__ (
4393             "movq _ActiveMask, %%mm7      \n\t" // load _ActiveMask for 2nd
4394                                                 //  active byte group
4395             "movl _dif, %%edx             \n\t"
4396             "movq %%mm7, %%mm6            \n\t"
4397 // preload  "movl row, %%edi              \n\t"
4398             "psllq _ShiftBpp, %%mm6       \n\t" // move mask in mm6 to cover
4399                                                 //  3rd active byte group
4400             "movl %%edi, %%esi            \n\t" // lp = row
4401             "movq %%mm6, %%mm5            \n\t"
4402 // preload  "movl bpp, %%eax              \n\t"
4403             "addl %%eax, %%edi            \n\t" // rp = row + bpp
4404             "psllq _ShiftBpp, %%mm5       \n\t" // move mask in mm5 to cover
4405                                                 //  4th active byte group
4406             // prime the pump:  load the first Raw(x-bpp) data set
4407             "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4408
4409          "sub_2lp:                        \n\t" // shift data for adding first
4410             "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
4411                                                 //  shift clears inactive bytes)
4412             // add 1st active group
4413             "movq (%%edi,%%edx,), %%mm0   \n\t"
4414             "paddb %%mm1, %%mm0           \n\t"
4415
4416             // add 2nd active group
4417             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4418             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4419             "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
4420             "paddb %%mm1, %%mm0           \n\t"
4421
4422             // add 3rd active group
4423             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4424             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4425             "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
4426             "paddb %%mm1, %%mm0           \n\t"
4427
4428             // add 4th active group
4429             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4430             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4431             "pand %%mm5, %%mm1            \n\t" // mask to use 4th active group
4432             "addl $8, %%edx               \n\t"
4433             "paddb %%mm1, %%mm0           \n\t"
4434             "cmpl _MMXLength, %%edx       \n\t"
4435             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4436             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
4437             "jb sub_2lp                   \n\t"
4438
4439             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4440               "=D" (dummy_value_D)    // 1
4441
4442             : "0" (bpp),              // eax    // input regs
4443               "1" (row)               // edi
4444
4445             : "%edx", "%esi"                    // clobber list
4446 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4447             , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
4448 #endif
4449          );
4450       }
4451       break;
4452
4453       case 8:
4454       {
4455          __asm__ __volatile__ (
4456 // preload  "movl row, %%edi              \n\t"
4457             "movl _dif, %%edx             \n\t"
4458             "movl %%edi, %%esi            \n\t" // lp = row
4459 // preload  "movl bpp, %%eax              \n\t"
4460             "addl %%eax, %%edi            \n\t" // rp = row + bpp
4461             "movl _MMXLength, %%ecx       \n\t"
4462
4463             // prime the pump:  load the first Raw(x-bpp) data set
4464             "movq -8(%%edi,%%edx,), %%mm7 \n\t"
4465             "andl $0x0000003f, %%ecx      \n\t" // calc bytes over mult of 64
4466
4467          "sub_8lp:                        \n\t"
4468             "movq (%%edi,%%edx,), %%mm0   \n\t" // load Sub(x) for 1st 8 bytes
4469             "paddb %%mm7, %%mm0           \n\t"
4470             "movq 8(%%edi,%%edx,), %%mm1  \n\t" // load Sub(x) for 2nd 8 bytes
4471             "movq %%mm0, (%%edi,%%edx,)   \n\t" // write Raw(x) for 1st 8 bytes
4472
4473             // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
4474             // This will be repeated for each group of 8 bytes with the 8th
4475             // group being used as the Raw(x-bpp) for the 1st group of the
4476             // next loop.
4477
4478             "paddb %%mm0, %%mm1           \n\t"
4479             "movq 16(%%edi,%%edx,), %%mm2 \n\t" // load Sub(x) for 3rd 8 bytes
4480             "movq %%mm1, 8(%%edi,%%edx,)  \n\t" // write Raw(x) for 2nd 8 bytes
4481             "paddb %%mm1, %%mm2           \n\t"
4482             "movq 24(%%edi,%%edx,), %%mm3 \n\t" // load Sub(x) for 4th 8 bytes
4483             "movq %%mm2, 16(%%edi,%%edx,) \n\t" // write Raw(x) for 3rd 8 bytes
4484             "paddb %%mm2, %%mm3           \n\t"
4485             "movq 32(%%edi,%%edx,), %%mm4 \n\t" // load Sub(x) for 5th 8 bytes
4486             "movq %%mm3, 24(%%edi,%%edx,) \n\t" // write Raw(x) for 4th 8 bytes
4487             "paddb %%mm3, %%mm4           \n\t"
4488             "movq 40(%%edi,%%edx,), %%mm5 \n\t" // load Sub(x) for 6th 8 bytes
4489             "movq %%mm4, 32(%%edi,%%edx,) \n\t" // write Raw(x) for 5th 8 bytes
4490             "paddb %%mm4, %%mm5           \n\t"
4491             "movq 48(%%edi,%%edx,), %%mm6 \n\t" // load Sub(x) for 7th 8 bytes
4492             "movq %%mm5, 40(%%edi,%%edx,) \n\t" // write Raw(x) for 6th 8 bytes
4493             "paddb %%mm5, %%mm6           \n\t"
4494             "movq 56(%%edi,%%edx,), %%mm7 \n\t" // load Sub(x) for 8th 8 bytes
4495             "movq %%mm6, 48(%%edi,%%edx,) \n\t" // write Raw(x) for 7th 8 bytes
4496             "addl $64, %%edx              \n\t"
4497             "paddb %%mm6, %%mm7           \n\t"
4498             "cmpl %%ecx, %%edx            \n\t"
4499             "movq %%mm7, -8(%%edi,%%edx,) \n\t" // write Raw(x) for 8th 8 bytes
4500             "jb sub_8lp                   \n\t"
4501
4502             "cmpl _MMXLength, %%edx       \n\t"
4503             "jnb sub_8lt8                 \n\t"
4504
4505          "sub_8lpA:                       \n\t"
4506             "movq (%%edi,%%edx,), %%mm0   \n\t"
4507             "addl $8, %%edx               \n\t"
4508             "paddb %%mm7, %%mm0           \n\t"
4509             "cmpl _MMXLength, %%edx       \n\t"
4510             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // -8 to offset early addl edx
4511             "movq %%mm0, %%mm7            \n\t" // move calculated Raw(x) data
4512                                                 //  to mm1 to be new Raw(x-bpp)
4513                                                 //  for next loop
4514             "jb sub_8lpA                  \n\t"
4515
4516          "sub_8lt8:                       \n\t"
4517
4518             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4519               "=D" (dummy_value_D)    // 1
4520
4521             : "0" (bpp),              // eax    // input regs
4522               "1" (row)               // edi
4523
4524             : "%ecx", "%edx", "%esi"            // clobber list
4525 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4526             , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
4527 #endif
4528          );
4529       }
4530       break;
4531
4532       default:                // bpp greater than 8 bytes       GRR BOGUS
4533       {
4534          __asm__ __volatile__ (
4535             "movl _dif, %%edx             \n\t"
4536 // preload  "movl row, %%edi              \n\t"
4537             "movl %%edi, %%esi            \n\t" // lp = row
4538 // preload  "movl bpp, %%eax              \n\t"
4539             "addl %%eax, %%edi            \n\t" // rp = row + bpp
4540
4541          "sub_Alp:                        \n\t"
4542             "movq (%%edi,%%edx,), %%mm0   \n\t"
4543             "movq (%%esi,%%edx,), %%mm1   \n\t"
4544             "addl $8, %%edx               \n\t"
4545             "paddb %%mm1, %%mm0           \n\t"
4546             "cmpl _MMXLength, %%edx       \n\t"
4547             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // mov does not affect flags;
4548                                                 //  -8 to offset addl edx
4549             "jb sub_Alp                   \n\t"
4550
4551             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4552               "=D" (dummy_value_D)    // 1
4553
4554             : "0" (bpp),              // eax    // input regs
4555               "1" (row)               // edi
4556
4557             : "%edx", "%esi"                    // clobber list
4558 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4559             , "%mm0", "%mm1"
4560 #endif
4561          );
4562       }
4563       break;
4564
4565    } // end switch (bpp)
4566
4567    __asm__ __volatile__ (
4568       "movl _MMXLength, %%edx       \n\t"
4569 //pre "movl row, %%edi              \n\t"
4570       "cmpl _FullLength, %%edx      \n\t"
4571       "jnb sub_end                  \n\t"
4572
4573       "movl %%edi, %%esi            \n\t" // lp = row
4574 //pre "movl bpp, %%eax              \n\t"
4575       "addl %%eax, %%edi            \n\t" // rp = row + bpp
4576       "xorl %%eax, %%eax            \n\t"
4577
4578    "sub_lp2:                        \n\t"
4579       "movb (%%esi,%%edx,), %%al    \n\t"
4580       "addb %%al, (%%edi,%%edx,)    \n\t"
4581       "incl %%edx                   \n\t"
4582       "cmpl _FullLength, %%edx      \n\t"
4583       "jb sub_lp2                   \n\t"
4584
4585    "sub_end:                        \n\t"
4586       "EMMS                         \n\t" // end MMX instructions
4587
4588       : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4589         "=D" (dummy_value_D)    // 1
4590
4591       : "0" (bpp),              // eax    // input regs
4592         "1" (row)               // edi
4593
4594       : "%edx", "%esi"                    // clobber list
4595    );
4596
4597 } // end of png_read_filter_row_mmx_sub()
4598
4599
4600
4601
4602 //===========================================================================//
4603 //                                                                           //
4604 //            P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P            //
4605 //                                                                           //
4606 //===========================================================================//
4607
4608 // Optimized code for PNG Up filter decoder
4609
4610 static void /* PRIVATE */
4611 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
4612                            png_bytep prev_row)
4613 {
4614    png_uint_32 len;
4615    int dummy_value_d;   // fix 'forbidden register 3 (dx) was spilled' error
4616    int dummy_value_S;
4617    int dummy_value_D;
4618
4619    len = row_info->rowbytes;              // number of bytes to filter
4620
4621    __asm__ __volatile__ (
4622 //pre "movl row, %%edi              \n\t"
4623       // get # of bytes to alignment
4624       "movl %%edi, %%ecx            \n\t"
4625       "xorl %%ebx, %%ebx            \n\t"
4626       "addl $0x7, %%ecx             \n\t"
4627       "xorl %%eax, %%eax            \n\t"
4628       "andl $0xfffffff8, %%ecx      \n\t"
4629 //pre "movl prev_row, %%esi         \n\t"
4630       "subl %%edi, %%ecx            \n\t"
4631       "jz up_go                     \n\t"
4632
4633    "up_lp1:                         \n\t" // fix alignment
4634       "movb (%%edi,%%ebx,), %%al    \n\t"
4635       "addb (%%esi,%%ebx,), %%al    \n\t"
4636       "incl %%ebx                   \n\t"
4637       "cmpl %%ecx, %%ebx            \n\t"
4638       "movb %%al, -1(%%edi,%%ebx,)  \n\t" // mov does not affect flags; -1 to
4639       "jb up_lp1                    \n\t" //  offset incl ebx
4640
4641    "up_go:                          \n\t"
4642 //pre "movl len, %%edx              \n\t"
4643       "movl %%edx, %%ecx            \n\t"
4644       "subl %%ebx, %%edx            \n\t" // subtract alignment fix
4645       "andl $0x0000003f, %%edx      \n\t" // calc bytes over mult of 64
4646       "subl %%edx, %%ecx            \n\t" // drop over bytes from length
4647
4648       // unrolled loop - use all MMX registers and interleave to reduce
4649       // number of branch instructions (loops) and reduce partial stalls
4650    "up_loop:                        \n\t"
4651       "movq (%%esi,%%ebx,), %%mm1   \n\t"
4652       "movq (%%edi,%%ebx,), %%mm0   \n\t"
4653       "movq 8(%%esi,%%ebx,), %%mm3  \n\t"
4654       "paddb %%mm1, %%mm0           \n\t"
4655       "movq 8(%%edi,%%ebx,), %%mm2  \n\t"
4656       "movq %%mm0, (%%edi,%%ebx,)   \n\t"
4657       "paddb %%mm3, %%mm2           \n\t"
4658       "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
4659       "movq %%mm2, 8(%%edi,%%ebx,)  \n\t"
4660       "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
4661       "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
4662       "paddb %%mm5, %%mm4           \n\t"
4663       "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
4664       "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
4665       "paddb %%mm7, %%mm6           \n\t"
4666       "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
4667       "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
4668       "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
4669       "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
4670       "paddb %%mm1, %%mm0           \n\t"
4671       "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
4672       "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
4673       "paddb %%mm3, %%mm2           \n\t"
4674       "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
4675       "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
4676       "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
4677       "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
4678       "paddb %%mm5, %%mm4           \n\t"
4679       "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
4680       "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
4681       "addl $64, %%ebx              \n\t"
4682       "paddb %%mm7, %%mm6           \n\t"
4683       "cmpl %%ecx, %%ebx            \n\t"
4684       "movq %%mm6, -8(%%edi,%%ebx,) \n\t" // (+56)movq does not affect flags;
4685       "jb up_loop                   \n\t" //  -8 to offset addl ebx
4686
4687       "cmpl $0, %%edx               \n\t" // test for bytes over mult of 64
4688       "jz up_end                    \n\t"
4689
4690       "cmpl $8, %%edx               \n\t" // test for less than 8 bytes
4691       "jb up_lt8                    \n\t" //  [added by lcreeve@netins.net]
4692
4693       "addl %%edx, %%ecx            \n\t"
4694       "andl $0x00000007, %%edx      \n\t" // calc bytes over mult of 8
4695       "subl %%edx, %%ecx            \n\t" // drop over bytes from length
4696       "jz up_lt8                    \n\t"
4697
4698    "up_lpA:                         \n\t" // use MMX regs to update 8 bytes sim.
4699       "movq (%%esi,%%ebx,), %%mm1   \n\t"
4700       "movq (%%edi,%%ebx,), %%mm0   \n\t"
4701       "addl $8, %%ebx               \n\t"
4702       "paddb %%mm1, %%mm0           \n\t"
4703       "cmpl %%ecx, %%ebx            \n\t"
4704       "movq %%mm0, -8(%%edi,%%ebx,) \n\t" // movq does not affect flags; -8 to
4705       "jb up_lpA                    \n\t" //  offset add ebx
4706       "cmpl $0, %%edx               \n\t" // test for bytes over mult of 8
4707       "jz up_end                    \n\t"
4708
4709    "up_lt8:                         \n\t"
4710       "xorl %%eax, %%eax            \n\t"
4711       "addl %%edx, %%ecx            \n\t" // move over byte count into counter
4712
4713    "up_lp2:                         \n\t" // use x86 regs for remaining bytes
4714       "movb (%%edi,%%ebx,), %%al    \n\t"
4715       "addb (%%esi,%%ebx,), %%al    \n\t"
4716       "incl %%ebx                   \n\t"
4717       "cmpl %%ecx, %%ebx            \n\t"
4718       "movb %%al, -1(%%edi,%%ebx,)  \n\t" // mov does not affect flags; -1 to
4719       "jb up_lp2                    \n\t" //  offset inc ebx
4720
4721    "up_end:                         \n\t"
4722       "EMMS                         \n\t" // conversion of filtered row complete
4723
4724       : "=d" (dummy_value_d),   // 0      // output regs (dummy)
4725         "=S" (dummy_value_S),   // 1
4726         "=D" (dummy_value_D)    // 2
4727
4728       : "0" (len),              // edx    // input regs
4729         "1" (prev_row),         // esi
4730         "2" (row)               // edi
4731
4732       : "%eax", "%ebx", "%ecx"            // clobber list (no input regs!)
4733
4734 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4735       , "%mm0", "%mm1", "%mm2", "%mm3"
4736       , "%mm4", "%mm5", "%mm6", "%mm7"
4737 #endif
4738    );
4739
4740 } // end of png_read_filter_row_mmx_up()
4741
4742
4743
4744
4745 //===========================================================================//
4746 //                                                                           //
4747 //                   P N G _ R E A D _ F I L T E R _ R O W                   //
4748 //                                                                           //
4749 //===========================================================================//
4750
4751 #if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW)
4752
4753 // Optimized png_read_filter_row routines
4754
4755 void /* PRIVATE */
4756 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
4757    row, png_bytep prev_row, int filter)
4758 {
4759 #ifdef PNG_DEBUG
4760    char filnm[10];
4761 #endif
4762
4763 /* GRR:  these are superseded by png_ptr->asm_flags: */
4764 #define UseMMX_sub    1   // GRR:  converted 20000730
4765 #define UseMMX_up     1   // GRR:  converted 20000729
4766 #define UseMMX_avg    1   // GRR:  converted 20000828 (+ 16-bit bugfix 20000916)
4767 #define UseMMX_paeth  1   // GRR:  converted 20000828
4768
4769    if (_mmx_supported == 2) {
4770        png_mmx_support();
4771    }
4772
4773 #ifdef PNG_DEBUG
4774    png_debug(1, "in png_read_filter_row\n");
4775    switch (filter)
4776    {
4777       case 0: sprintf(filnm, "none");
4778          break;
4779       case 1: sprintf(filnm, "sub-%s", "MMX");
4780          break;
4781       case 2: sprintf(filnm, "up-%s", "MMX");
4782          break;
4783       case 3: sprintf(filnm, "avg-%s", "MMX");
4784          break;
4785       case 4: sprintf(filnm, "Paeth-%s", "MMX");
4786          break;
4787       default: sprintf(filnm, "unknw");
4788          break;
4789    }
4790    png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
4791    png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
4792    png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
4793       (int)((row_info->pixel_depth + 7) >> 3));
4794    png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
4795 #endif /* PNG_DEBUG */
4796
4797    switch (filter)
4798    {
4799       case PNG_FILTER_VALUE_NONE:
4800          break;
4801
4802       case PNG_FILTER_VALUE_SUB:
4803          if (
4804              (row_info->pixel_depth >= PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT) &&
4805              (row_info->rowbytes >= PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT))
4806          {
4807             png_read_filter_row_mmx_sub(row_info, row);
4808          }
4809          else
4810          {
4811             png_uint_32 i;
4812             png_uint_32 istop = row_info->rowbytes;
4813             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
4814             png_bytep rp = row + bpp;
4815             png_bytep lp = row;
4816
4817             for (i = bpp; i < istop; i++)
4818             {
4819                *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
4820                rp++;
4821             }
4822          }  //end !UseMMX_sub
4823          break;
4824
4825       case PNG_FILTER_VALUE_UP:
4826          if (
4827              (row_info->pixel_depth >= PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT) &&
4828              (row_info->rowbytes >= PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT))
4829          {
4830             png_read_filter_row_mmx_up(row_info, row, prev_row);
4831          }
4832          else
4833          {
4834             png_uint_32 i;
4835             png_uint_32 istop = row_info->rowbytes;
4836             png_bytep rp = row;
4837             png_bytep pp = prev_row;
4838
4839             for (i = 0; i < istop; ++i)
4840             {
4841                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
4842                rp++;
4843             }
4844          }  //end !UseMMX_up
4845          break;
4846
4847       case PNG_FILTER_VALUE_AVG:
4848          if (
4849              (row_info->pixel_depth >= PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT) &&
4850              (row_info->rowbytes >= PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT))
4851          {
4852             png_read_filter_row_mmx_avg(row_info, row, prev_row);
4853          }
4854          else
4855          {
4856             png_uint_32 i;
4857             png_bytep rp = row;
4858             png_bytep pp = prev_row;
4859             png_bytep lp = row;
4860             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
4861             png_uint_32 istop = row_info->rowbytes - bpp;
4862
4863             for (i = 0; i < bpp; i++)
4864             {
4865                *rp = (png_byte)(((int)(*rp) +
4866                   ((int)(*pp++) >> 1)) & 0xff);
4867                rp++;
4868             }
4869
4870             for (i = 0; i < istop; i++)
4871             {
4872                *rp = (png_byte)(((int)(*rp) +
4873                   ((int)(*pp++ + *lp++) >> 1)) & 0xff);
4874                rp++;
4875             }
4876          }  //end !UseMMX_avg
4877          break;
4878
4879       case PNG_FILTER_VALUE_PAETH:
4880          if (
4881              (row_info->pixel_depth >= PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT) &&
4882              (row_info->rowbytes >= PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT))
4883          {
4884             png_read_filter_row_mmx_paeth(row_info, row, prev_row);
4885          }
4886          else
4887          {
4888             png_uint_32 i;
4889             png_bytep rp = row;
4890             png_bytep pp = prev_row;
4891             png_bytep lp = row;
4892             png_bytep cp = prev_row;
4893             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
4894             png_uint_32 istop = row_info->rowbytes - bpp;
4895
4896             for (i = 0; i < bpp; i++)
4897             {
4898                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
4899                rp++;
4900             }
4901
4902             for (i = 0; i < istop; i++)   /* use leftover rp,pp */
4903             {
4904                int a, b, c, pa, pb, pc, p;
4905
4906                a = *lp++;
4907                b = *pp++;
4908                c = *cp++;
4909
4910                p = b - c;
4911                pc = a - c;
4912
4913 #ifdef PNG_USE_ABS
4914                pa = abs(p);
4915                pb = abs(pc);
4916                pc = abs(p + pc);
4917 #else
4918                pa = p < 0 ? -p : p;
4919                pb = pc < 0 ? -pc : pc;
4920                pc = (p + pc) < 0 ? -(p + pc) : p + pc;
4921 #endif
4922
4923                /*
4924                   if (pa <= pb && pa <= pc)
4925                      p = a;
4926                   else if (pb <= pc)
4927                      p = b;
4928                   else
4929                      p = c;
4930                 */
4931
4932                p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
4933
4934                *rp = (png_byte)(((int)(*rp) + p) & 0xff);
4935                rp++;
4936             }
4937          }  //end !UseMMX_paeth
4938          break;
4939
4940       default:
4941          png_warning(png_ptr, "Ignoring bad row-filter type");
4942          *row=0;
4943          break;
4944    }
4945 }
4946
4947 #endif /* PNG_HAVE_ASSEMBLER_READ_FILTER_ROW */
4948
4949
4950
4951
4952 //===========================================================================//
4953 //                                                                           //
4954 //                      P N G _ M M X _ S U P P O R T                        //
4955 //                                                                           //
4956 //===========================================================================//
4957
4958 // GRR NOTES:  (1) the following code assumes 386 or better (pushfl/popfl)
4959 //             (2) all instructions compile with gcc 2.7.2.3 and later
4960 //             (3) the function is moved down here to prevent gcc from
4961 //                  inlining it in multiple places and then barfing be-
4962 //                  cause the ".NOT_SUPPORTED" label is multiply defined
4963 //             [is there a way to signal that a *single* function should
4964 //              not be inlined?  is there a way to modify the label for
4965 //              each inlined instance, e.g., by appending _1, _2, etc.?
4966 //              maybe if don't use leading "." in label name? (nope...sigh)]
4967
4968 // GRR TO DO:  make sure PNGAPI doesn't do/require anything screwy here
4969 //             [looks OK for everybody except possibly Cygwin (__cdecl)]
4970
4971 int PNGAPI
4972 png_mmx_support(void)
4973 {
4974     __asm__ __volatile__ (
4975         "pushl %%ebx          \n\t"  // ebx gets clobbered by CPUID instruction
4976         "pushl %%ecx          \n\t"  // so does ecx...
4977         "pushl %%edx          \n\t"  // ...and edx (but ecx & edx safe on Linux)
4978 //      ".byte  0x66          \n\t"  // convert 16-bit pushf to 32-bit pushfd
4979 //      "pushf                \n\t"  // 16-bit pushf
4980         "pushfl               \n\t"  // save Eflag to stack
4981         "popl %%eax           \n\t"  // get Eflag from stack into eax
4982         "movl %%eax, %%ecx    \n\t"  // make another copy of Eflag in ecx
4983         "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
4984         "pushl %%eax          \n\t"  // save modified Eflag back to stack
4985 //      ".byte  0x66          \n\t"  // convert 16-bit popf to 32-bit popfd
4986 //      "popf                 \n\t"  // 16-bit popf
4987         "popfl                \n\t"  // restore modified value to Eflag reg
4988         "pushfl               \n\t"  // save Eflag to stack
4989         "popl %%eax           \n\t"  // get Eflag from stack
4990         "xorl %%ecx, %%eax    \n\t"  // compare new Eflag with original Eflag
4991         "jz .NOT_SUPPORTED    \n\t"  // if same, CPUID instr. is not supported
4992
4993         "xorl %%eax, %%eax    \n\t"  // set eax to zero
4994 //      ".byte  0x0f, 0xa2    \n\t"  // CPUID instruction (two-byte opcode)
4995         "cpuid                \n\t"  // get the CPU identification info
4996         "cmpl $1, %%eax       \n\t"  // make sure eax return non-zero value
4997         "jl .NOT_SUPPORTED    \n\t"  // if eax is zero, MMX is not supported
4998
4999         "xorl %%eax, %%eax    \n\t"  // set eax to zero and...
5000         "incl %%eax           \n\t"  // ...increment eax to 1.  This pair is
5001                                      // faster than the instruction "mov eax, 1"
5002         "cpuid                \n\t"  // get the CPU identification info again
5003         "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
5004         "cmpl $0, %%edx       \n\t"  // 0 = MMX not supported
5005         "jz .NOT_SUPPORTED    \n\t"  // non-zero = yes, MMX IS supported
5006
5007         "movl $1, %%eax       \n\t"  // set return value to 1
5008         "movl %%eax, _mmx_supported \n\t" // save in global static variable, too
5009         "popl %%edx           \n\t"  // restore edx
5010         "popl %%ecx           \n\t"  // restore ecx
5011         "popl %%ebx           \n\t"  // restore ebx ("row" in png_do_interlace)
5012         "ret                  \n\t"  // DONE:  have MMX support
5013
5014     ".NOT_SUPPORTED:          \n\t"  // target label for jump instructions
5015         "movl $0, %%eax       \n\t"  // set return value to 0
5016         "movl %%eax, _mmx_supported \n\t" // save in global static variable, too
5017         "popl %%edx           \n\t"  // restore edx
5018         "popl %%ecx           \n\t"  // restore ecx
5019         "popl %%ebx           \n\t"  // restore ebx ("row" in png_do_interlace)
5020 //      "ret                  \n\t"  // DONE:  no MMX support
5021                                      // (fall through to standard C "ret")
5022
5023         :                            // output list (none)
5024
5025         :                            // any variables used on input (none)
5026
5027         : "%eax"                     // clobber list
5028 //      , "%ebx", "%ecx", "%edx"     // GRR:  we handle these manually
5029 //      , "memory"   // if write to a variable gcc thought was in a reg
5030 //      , "cc"       // "condition codes" (flag bits)
5031     );
5032
5033     // return %%eax;
5034 }
5035
5036 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGGCCRD */